jack il y a 1 an
Parent
commit
bbbb72d02e
1 fichiers modifiés avec 114 ajouts et 0 suppressions
  1. 114 0
      manual/zhuimh/main.py

+ 114 - 0
manual/zhuimh/main.py

@@ -0,0 +1,114 @@
+# -*- coding: utf-8 -*-
+# https://www.zhuimh.com/comic/419025
+# 获取 zhuimh
+import time
+import os
+import psycopg2
+
+from playwright.sync_api import sync_playwright
+
+
class Zhuimh:
    """Scraper for a single comic hosted on zhuimh.com.

    Uses Playwright's synchronous API to read the comic's chapter list and
    then collect the ``blob:`` image URLs found on each chapter page.
    """

    def __init__(self, comico_id=419025):
        """Build the comic URL from its numeric id.

        Args:
            comico_id: Comic id appended to ``/comic/``. Defaults to the id
                that used to be hard-coded.
        """
        self.comico_id = comico_id
        self.base_url = 'https://www.zhuimh.com'
        self.href_url = '/comic/'
        self.target_url = self.base_url + self.href_url + str(self.comico_id)

    def window_scroll(self, page):
        """Wheel-scroll ``page`` by document height minus viewport height.

        Starting from the top of the page, that distance reaches the bottom.

        Args:
            page: Playwright page to scroll.
        """
        # Full document height and the height of the visible area.
        page_height = page.evaluate('() => document.body.scrollHeight')
        viewport_height = page.evaluate('() => window.innerHeight')

        # Whatever does not fit in the viewport is the distance to scroll.
        scroll_distance = page_height - viewport_height

        # Simulate a mouse-wheel movement of that size.
        page.mouse.wheel(0, scroll_distance)

    def _blob_image_links(self, page):
        """Return the ``blob:`` image sources inside the chapter container.

        Args:
            page: Playwright page showing one chapter.

        Returns:
            A list of ``src`` values containing ``'blob:'``; empty when the
            image container is not present on the page.
        """
        # Selector kept verbatim, including the 'chpater' spelling.
        # NOTE(review): presumably this matches the site's markup - confirm.
        container = page.query_selector('body > div.chpater-images')
        if container is None:
            print('元素未找到')
            return []

        sources = (
            img.get_attribute('src')
            for img in container.query_selector_all('img')
        )
        # get_attribute() yields None for an <img> without src, so check
        # truthiness before the substring test.
        return [src for src in sources if src and 'blob:' in src]

    def get_chapter_img(self, chapter_name_list, chapter_url_list):
        """Visit each chapter page and print its ``blob:`` image links.

        Args:
            chapter_name_list: Chapter names, parallel to
                ``chapter_url_list``.
            chapter_url_list: Absolute chapter URLs.

        Returns:
            A list of ``(chapter_name, image_links)`` tuples, one per visited
            chapter, in input order.
        """
        chapters = list(zip(chapter_name_list, chapter_url_list))
        if not chapters:
            # Nothing to visit, so do not start a browser at all.
            return []

        results = []
        # One Playwright session and one browser serve every chapter.
        with sync_playwright() as playwright:
            # headless=False shows the browser window, handy for debugging.
            browser = playwright.chromium.launch(headless=True)
            try:
                for chapter_name, chapter_url in chapters:
                    print(f'章节名: {chapter_name}, 章节url: {chapter_url}')
                    page = browser.new_page()
                    try:
                        page.goto(chapter_url)
                        # Playwright's docs recommend wait_for_timeout over
                        # time.sleep when using the sync API.
                        page.wait_for_timeout(1000)
                        self.window_scroll(page)
                        # Pause again after scrolling before reading <img>.
                        page.wait_for_timeout(1000)
                        chpater_img_links = self._blob_image_links(page)
                    finally:
                        page.close()

                    print(chpater_img_links)
                    results.append((chapter_name, chpater_img_links))
            finally:
                browser.close()

        return results

    def get_chapter(self):
        """Open the comic page, create its directory and list its chapters.

        The directory ``zhuimh/<comic name>`` is created next to this file;
        the comic name is the page title up to the text '漫画免费'.

        Returns:
            ``(chapter_name_list, chapter_url_list)``: two lists of equal
            length where index ``i`` of each describes the same link.

        Raises:
            SystemExit: With status 1 when the chapter list is not found.
        """
        with sync_playwright() as playwright:
            # headless=False shows the browser window, handy for debugging.
            browser = playwright.chromium.launch(headless=True)
            try:
                page = browser.new_page()
                page.goto(self.target_url)

                title = page.title()
                target_name = title.split('漫画免费')[0]
                current_path = os.path.dirname(os.path.abspath(__file__))

                path = os.path.join(current_path, 'zhuimh', target_name)
                # exist_ok avoids the check-then-create race.
                os.makedirs(path, exist_ok=True)

                element = page.query_selector(
                    'body > div.tbox.tabs > div.tabs_block > ul'
                )
                if element is None:
                    print('元素未找到')
                    # Non-zero status so callers can tell the run failed.
                    raise SystemExit(1)

                chapter_name_list = []
                chapter_url_list = []
                for link in element.query_selector_all('a'):
                    href = link.get_attribute('href')
                    if not href:
                        print('没有找到 href 属性')
                        continue
                    # Name and URL come from the same <a>, so the two lists
                    # cannot drift out of step.
                    name = ' '.join((link.text_content() or '').split())
                    chapter_name_list.append(name or href)
                    chapter_url_list.append(self.base_url + href)

                return chapter_name_list, chapter_url_list
            finally:
                browser.close()

    def main(self):
        """List the comic's chapters, then collect each chapter's images."""
        chapter_name_list, chapter_url_list = self.get_chapter()
        self.get_chapter_img(chapter_name_list, chapter_url_list)
+
+
if __name__ == '__main__':
    # Script entry point: build the scraper with its defaults and run it.
    Zhuimh().main()