|
@@ -0,0 +1,114 @@
|
|
|
|
|
+# -*- coding: utf-8 -*-
|
|
|
|
|
+# https://www.zhuimh.com/comic/419025
|
|
|
|
|
+# Scrape the chapter list and per-chapter image links of a comic on zhuimh.com
|
|
|
|
|
+import time
|
|
|
|
|
+import os
|
|
|
|
|
+import psycopg2
|
|
|
|
|
+
|
|
|
|
|
+from playwright.sync_api import sync_playwright
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
class Zhuimh:
    """Scraper for a single comic hosted on zhuimh.com.

    Drives headless Chromium through Playwright's sync API to read the comic's
    chapter list and then visit each chapter page, printing the ``blob:`` image
    sources found there.
    """

    def __init__(self, comico_id=419025):
        """Build the landing-page URL of the comic to scrape.

        Args:
            comico_id: Id used in the site's ``/comic/<id>`` URL. Defaults to
                the id this scraper was originally written for.
        """
        self.comico_id = comico_id
        self.base_url = 'https://www.zhuimh.com'
        self.href_url = '/comic/'
        self.target_url = self.base_url + self.href_url + str(self.comico_id)

    def window_scroll(self, page):
        """Wheel-scroll ``page`` by (document height - viewport height).

        Args:
            page: Playwright page; only ``evaluate`` and ``mouse.wheel`` are
                used.
        """
        page_height = page.evaluate('() => document.body.scrollHeight')
        viewport_height = page.evaluate('() => window.innerHeight')

        # Simulate a mouse-wheel gesture covering the distance to the bottom.
        page.mouse.wheel(0, page_height - viewport_height)

    @staticmethod
    def _collect_blob_links(page):
        """Return the ``blob:`` image sources inside the chapter image box.

        Returns an empty list (after printing a notice) when the container is
        missing; images without a ``src`` attribute are skipped.
        """
        # NOTE: 'chpater' is spelled this way in the selector on purpose; it
        # is the class name the original code queried.
        container = page.query_selector('body > div.chpater-images')
        if container is None:
            print('元素未找到')
            return []

        sources = (
            img.get_attribute('src')
            for img in container.query_selector_all('img')
        )
        return [src for src in sources if src and 'blob:' in src]

    def _extract_chapters(self, list_element):
        """Return ``(names, urls)`` for every anchor with an ``href``.

        Name and URL are taken from the same ``<a>`` element so both lists
        always have the same length and order. Anchors without ``href`` are
        reported and skipped entirely.
        """
        # Local import keeps this block self-contained within the file.
        from urllib.parse import urljoin

        chapter_name_list = []
        chapter_url_list = []
        for link in list_element.query_selector_all('a'):
            href = link.get_attribute('href')
            if not href:
                print('没有找到 href 属性')
                continue
            # Collapse internal whitespace/newlines of the anchor text.
            name = ' '.join((link.text_content() or '').split())
            chapter_name_list.append(name)
            # urljoin handles root-relative, relative and absolute hrefs.
            chapter_url_list.append(urljoin(self.base_url, href))
        return chapter_name_list, chapter_url_list

    def get_chapter_img(self, chapter_name_list, chapter_url_list):
        """Visit every chapter page and print its ``blob:`` image links.

        Args:
            chapter_name_list: Chapter names, parallel to ``chapter_url_list``.
            chapter_url_list: Absolute chapter URLs.
        """
        chapters = list(zip(chapter_name_list, chapter_url_list))
        if not chapters:
            return

        with sync_playwright() as playwright:
            # headless=False opens a visible browser window, handy for debugging.
            # One browser is shared by all chapters instead of one per chapter.
            browser = playwright.chromium.launch(headless=True)
            try:
                for chapter_name, chapter_url in chapters:
                    print(f'章节名: {chapter_name}, 章节url: {chapter_url}')
                    # browser.new_page() gives each chapter its own context.
                    page = browser.new_page()
                    try:
                        page.goto(chapter_url)
                        # wait_for_timeout keeps Playwright's event loop
                        # running, unlike time.sleep.
                        page.wait_for_timeout(1000)
                        self.window_scroll(page)
                        page.wait_for_timeout(1000)
                        print(self._collect_blob_links(page))
                    finally:
                        page.close()
            finally:
                browser.close()

    def get_chapter(self):
        """Load the comic page and return its chapter names and URLs.

        Also creates ``<this file's dir>/zhuimh/<comic title>`` if missing.

        Returns:
            Tuple ``(chapter_name_list, chapter_url_list)`` of equal length.

        Raises:
            SystemExit: With status 1 when the chapter list element is absent.
        """
        with sync_playwright() as playwright:
            # headless=False opens a visible browser window, handy for debugging.
            browser = playwright.chromium.launch(headless=True)
            try:
                page = browser.new_page()
                page.goto(self.target_url)

                # The comic name is the part of the title before this marker.
                target_name = page.title().split('漫画免费')[0]
                current_path = os.path.dirname(os.path.abspath(__file__))
                os.makedirs(
                    os.path.join(current_path, 'zhuimh', target_name),
                    exist_ok=True,
                )

                element = page.query_selector(
                    'body > div.tbox.tabs > div.tabs_block > ul'
                )
                if element is None:
                    print('元素未找到')
                    # Non-zero status: nothing can be scraped without the list.
                    raise SystemExit(1)

                return self._extract_chapters(element)
            finally:
                browser.close()

    def main(self):
        """Fetch the chapter list, then print the image links per chapter."""
        chapter_name_list, chapter_url_list = self.get_chapter()
        self.get_chapter_img(chapter_name_list, chapter_url_list)
|
|
|
|
|
+
|
|
|
|
|
+
|
|
|
|
|
if __name__ == '__main__':
    # Script entry point: build the scraper and run the full pipeline.
    Zhuimh().main()
|