main.py 5.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145
  1. # -*- coding: utf-8 -*-
  2. # https://www.zhuimh.com/comic/419025
  3. # 获取 zhuimh
  4. import time
  5. import os
  6. import psycopg2
  7. from playwright.sync_api import sync_playwright
  8. class Zhuimh:
  9. def __init__(self):
  10. self.comico_id = 419025
  11. self.base_url = 'https://www.zhuimh.com'
  12. self.href_url = '/comic/'
  13. self.target_url = self.base_url + self.href_url + str(self.comico_id)
  14. def window_scroll(self, page):
  15. # 获取页面高度
  16. page_height = page.evaluate('() => document.body.scrollHeight')
  17. # 获取视口高度
  18. viewport_height = page.evaluate('() => window.innerHeight')
  19. # 计算需要滚动的距离
  20. scroll_distance = page_height - viewport_height
  21. # 模拟鼠标滚轮操作,滚动到页面底部
  22. page.mouse.wheel(0, scroll_distance)
  23. def get_chapter_img(self, chapter_name_list, chapter_url_list, path):
  24. for chapter_name, chapter_url in zip(chapter_name_list, chapter_url_list):
  25. print(f'章节名: {chapter_name}, 章节url: {chapter_url}')
  26. with sync_playwright() as playwright:
  27. browser = playwright.chromium.launch(headless=False) # headless=False 可以开启浏览器界面,便于调试
  28. page = browser.new_page()
  29. page.goto(chapter_url)
  30. self.window_scroll(page)
  31. # for _ in range(5):
  32. # page.evaluate('''() => {window.scrollTo(0, document.body.scrollHeight);}''')
  33. #
  34. # time.sleep(0.2)
  35. #
  36. # page.wait_for_timeout(1000)
  37. page.wait_for_timeout(3)
  38. element = page.query_selector('body > div.chpater-images')
  39. links = element.query_selector_all('img')
  40. time.sleep(5)
  41. chapter_file = os.path.join(path, chapter_name)
  42. if not os.path.exists(chapter_file):
  43. os.mkdir(chapter_file)
  44. #图片后缀都是webp
  45. img_suffix = '.webp'
  46. img_count = 1
  47. for link in links:
  48. # 获取每个 <a> 标签的 href 属性
  49. img_src = link.get_attribute('src')
  50. if 'blob:' in img_src:
  51. # 这里开始保存图片
  52. # 检测一下图片是否有下载过, 如果有就跳过
  53. img_name = str(img_count).zfill(4)+img_suffix
  54. img_path = os.path.join(chapter_file, img_name)
  55. if not os.path.exists(img_path):
  56. # 使用fetch API获取blob数据
  57. self.save_blob_as_file(page, img_src, img_path)
  58. img_count += 1
  59. else:
  60. img_count += 1
  61. def save_blob_as_file(self, page, blob_url, file_path):
  62. # 使用 playwright 的 evaluate 方法来获取 blob 数据
  63. buffer = page.evaluate(f"""
  64. () => {
  65. const response = fetch('{blob_url}');
  66. const blob = response.blob();
  67. const reader = new FileReader();
  68. reader.readAsArrayBuffer(blob);
  69. return new Promise((resolve) => {
  70. reader.onloadend = () => resolve(reader.result);
  71. });
  72. }
  73. """)
  74. # 将 ArrayBuffer 转换为 Node.js 的 Buffer 对象
  75. with open(file_path, 'wb') as file:
  76. file.write(buffer)
  77. def get_chapter(self):
  78. with sync_playwright() as playwright:
  79. browser = playwright.chromium.launch(headless=True) # headless=False 可以开启浏览器界面,便于调试
  80. page = browser.new_page()
  81. page.goto(self.target_url)
  82. title = page.title()
  83. target_name = title.split('漫画免费')[0]
  84. current_path = os.path.dirname(os.path.abspath(__file__))
  85. path = os.path.join(current_path, 'zhuimh', target_name)
  86. if not os.path.exists(path):
  87. os.makedirs(path)
  88. element = page.query_selector('body > div.tbox.tabs > div.tabs_block > ul')
  89. chapter_name_list = []
  90. chapter_url_list = []
  91. if element:
  92. # 执行你需要的操作,例如获取元素的文本内容
  93. text = element.text_content()
  94. for line in text.split('\n'):
  95. if line.strip():
  96. chapter_name_list.append(line.strip())
  97. links = element.query_selector_all('a')
  98. for link in links:
  99. # 获取每个 <a> 标签的 href 属性
  100. href = link.get_attribute('href')
  101. if href:
  102. chapter_url_list.append(self.base_url + href)
  103. else:
  104. print('没有找到 href 属性')
  105. else:
  106. print('元素未找到')
  107. exit(0)
  108. return chapter_name_list, chapter_url_list, path
  109. def main(self):
  110. chapter_name_list, chapter_url_list, path = self.get_chapter()
  111. self.get_chapter_img(chapter_name_list, chapter_url_list, path)
  112. if __name__ == '__main__':
  113. zhuimh = Zhuimh()
  114. zhuimh.main()