main.py
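"""Download comic chapters from jcomic.net with Playwright.

The script opens a comic's episode list page, collects every chapter
link, then screenshots each <img> element chapter by chapter into
downloads/<title>/<chapter>/NNN.png.
"""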

import os
import time
from playwright.sync_api import sync_playwright

# Route traffic through a local HTTP proxy when True.
use_proxy = True
base_url = 'https://jcomic.net'
href_url = '/eps/'
# URL-encoded comic title (decodes to "百合フェチLIFE").
comico_url = '%E7%99%BE%E5%90%88%E3%83%95%E3%82%A7%E3%83%81LIFE'
target_url = base_url + href_url + comico_url
# Scroll step in percent; smaller steps give lazy-loaded images more time to load.
scroll_speed = 5

def scroll_to_percentage(page):
    # Scroll the page down in steps so lazy-loaded images are fetched.
    for percentage in range(5, 101, scroll_speed):
        # Compute the target offset as a percentage of the full page height.
        height = page.evaluate("() => document.body.scrollHeight")
        scroll_position = height * (percentage / 100)
        # Jump to that offset and give the page a moment to load.
        page.evaluate(f"window.scrollTo(0, {scroll_position})")
        time.sleep(0.5)

def get_imgs(folder_path, chapter_data):
    # Iterate over the chapter data and download the images one chapter at a time.
    with sync_playwright() as p:
        for chapter_name, url in chapter_data.items():
            # Create one folder per chapter.
            chapter_folder = os.path.join(folder_path, chapter_name)
            if not os.path.exists(chapter_folder):
                os.makedirs(chapter_folder)
            if use_proxy:
                browser = p.chromium.launch(headless=True, proxy={"server": "http://127.0.0.1:7890"})
            else:
                browser = p.chromium.launch(headless=True)
            page = browser.new_page()
            page.goto(url)
            time.sleep(1)
            print(f'Scrolling {chapter_name}')
            scroll_to_percentage(page)
            print(f'Finished scrolling {chapter_name}')
            # Locate the container that holds the page images.
            parent_locator = page.locator('body > div.container > div.row.col-lg-12.col-md-12.col-xs-12')
            # Count the matching image elements.
            total_images = parent_locator.locator('img').count()
            print(f'{chapter_name}: {total_images} images in total')
            n = 1
            # Screenshot each image element in order and save it.
            for page_num in range(1, total_images + 1):
                img_locator = f'body > div.container > div.row.col-lg-12.col-md-12.col-xs-12 > img:nth-child({page_num})'
                img_path = os.path.join(chapter_folder, f'{n:03d}.png')
                try:
                    # Check the src first; only screenshot images that actually loaded.
                    src_urls = page.query_selector_all(img_locator)
                    for src_url in src_urls:
                        src = src_url.get_attribute('src')
                        if src:
                            page.locator(img_locator).screenshot(path=img_path)
                            n += 1
                except Exception:
                    continue
            print(f'{chapter_name} saved')
            browser.close()

def save_urls(folder_path, chapter_data):
    # Record every chapter URL so the source pages can be revisited later.
    with open(os.path.join(folder_path, 'urls.txt'), 'w', encoding='utf-8') as file:
        for chapter_name, url in chapter_data.items():
            file.write(f"{chapter_name}: {url}\n")

def new_folder(page_title):
    # Resolve the directory this script lives in.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    download_dir = os.path.join(script_dir, 'downloads')
    if not os.path.exists(download_dir):
        os.makedirs(download_dir)
    if page_title:
        # Build the target folder path from the page title.
        folder_path = os.path.join(download_dir, page_title)
        # Create the folder if it does not exist yet.
        if not os.path.exists(folder_path):
            os.makedirs(folder_path)
        return folder_path
  78. def get_chapter_data():
  79. result = {}
  80. page_title = ''
  81. with sync_playwright() as p:
  82. if use_proxy:
  83. browser = p.chromium.launch(
  84. headless=True, proxy={"server": "http://127.0.0.1:7890"})
  85. else:
  86. browser = p.chromium.launch(headless=True)
  87. page = browser.new_page()
  88. # 打开目标页面
  89. page.goto(target_url)
  90. # 等待页面加载完成
  91. page.wait_for_selector('body > div.container > div:nth-child(3) > div:nth-child(2)')
  92. # 获取页面标题
  93. page_title = page.title()
  94. if page_title:
  95. page_title = page_title.replace('JComic - ', '')
  96. page_title = page_title.replace(' ', '')
  97. # 获取指定选择器下的所有元素
  98. elements = page.query_selector_all('body > div.container > div:nth-child(3) > div:nth-child(2) a')
  99. # 提取每个元素的 URL 和文本
  100. for element in elements:
  101. url = element.get_attribute('href')
  102. text = element.inner_text()
  103. result[text] = base_url + url
  104. # 关闭浏览器
  105. browser.close()
  106. return page_title, result
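
# get_chapter_data() returns the cleaned page title plus a chapter map,
# shaped roughly like this (illustrative values only; the actual keys and
# URLs come from the chapter links on the page):
#   ('百合フェチLIFE', {'第1話': 'https://jcomic.net/...', ...})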

def main():
    # 1. Fetch the page title and the chapter data.
    page_title, chapter_data = get_chapter_data()
    if not page_title:
        print('Failed to get the page title')
        exit(1)
    # 2. Create a folder named after the title to hold the images.
    folder_path = new_folder(page_title)
    # 3. Save the chapter URLs into that folder.
    save_urls(folder_path, chapter_data)
    # 4. Iterate over the chapter data and download the images.
    get_imgs(folder_path, chapter_data)

if __name__ == '__main__':
    main()
    print('done!')
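
# Typical setup (assumes Playwright's Chromium build is installed and,
# if use_proxy is True, a local HTTP proxy is listening on 127.0.0.1:7890):
#   pip install playwright
#   playwright install chromium
#   python main.py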