# -*- coding: utf-8 -*-
# main.py
# https://www.zhuimh.com/comic/419025
# Fetch zhuimh: collect a comic's chapter list and each chapter's image links.
import os
import time

import psycopg2
from playwright.sync_api import sync_playwright
  8. class Zhuimh:
  9. def __init__(self):
  10. self.comico_id = 419025
  11. self.base_url = 'https://www.zhuimh.com'
  12. self.href_url = '/comic/'
  13. self.target_url = self.base_url + self.href_url + str(self.comico_id)
  14. def window_scroll(self, page):
  15. # 获取页面高度
  16. page_height = page.evaluate('() => document.body.scrollHeight')
  17. # 获取视口高度
  18. viewport_height = page.evaluate('() => window.innerHeight')
  19. # 计算需要滚动的距离
  20. scroll_distance = page_height - viewport_height
  21. # 模拟鼠标滚轮操作,滚动到页面底部
  22. page.mouse.wheel(0, scroll_distance)
  23. def get_chapter_img(self, chapter_name_list, chapter_url_list):
  24. for chapter_name, chapter_url in zip(chapter_name_list, chapter_url_list):
  25. print(f'章节名: {chapter_name}, 章节url: {chapter_url}')
  26. with sync_playwright() as playwright:
  27. browser = playwright.chromium.launch(headless=True) # headless=False 可以开启浏览器界面,便于调试
  28. page = browser.new_page()
  29. page.goto(chapter_url)
  30. time.sleep(1)
  31. self.window_scroll(page)
  32. # for _ in range(5):
  33. # page.evaluate('''() => {window.scrollTo(0, document.body.scrollHeight);}''')
  34. #
  35. # time.sleep(0.2)
  36. #
  37. # page.wait_for_timeout(1000)
  38. time.sleep(1)
  39. element = page.query_selector('body > div.chpater-images')
  40. links = element.query_selector_all('img')
  41. chpater_img_links = []
  42. for link in links:
  43. # 获取每个 <a> 标签的 href 属性
  44. img_src = link.get_attribute('src')
  45. if 'blob:' in img_src:
  46. chpater_img_links.append(img_src)
  47. print(chpater_img_links)
  48. def get_chapter(self):
  49. with sync_playwright() as playwright:
  50. browser = playwright.chromium.launch(headless=True) # headless=False 可以开启浏览器界面,便于调试
  51. page = browser.new_page()
  52. page.goto(self.target_url)
  53. title = page.title()
  54. target_name = title.split('漫画免费')[0]
  55. current_path = os.path.dirname(os.path.abspath(__file__))
  56. path = os.path.join(current_path, 'zhuimh', target_name)
  57. if not os.path.exists(path):
  58. os.makedirs(path)
  59. element = page.query_selector('body > div.tbox.tabs > div.tabs_block > ul')
  60. chapter_name_list = []
  61. chapter_url_list = []
  62. if element:
  63. # 执行你需要的操作,例如获取元素的文本内容
  64. text = element.text_content()
  65. for line in text.split('\n'):
  66. if line.strip():
  67. chapter_name_list.append(line.strip())
  68. links = element.query_selector_all('a')
  69. for link in links:
  70. # 获取每个 <a> 标签的 href 属性
  71. href = link.get_attribute('href')
  72. if href:
  73. chapter_url_list.append(self.base_url + href)
  74. else:
  75. print('没有找到 href 属性')
  76. else:
  77. print('元素未找到')
  78. exit(0)
  79. return chapter_name_list, chapter_url_list
  80. def main(self):
  81. chapter_name_list, chapter_url_list = self.get_chapter()
  82. self.get_chapter_img(chapter_name_list, chapter_url_list)
  83. if __name__ == '__main__':
  84. zhuimh = Zhuimh()
  85. zhuimh.main()