
update ++

jack 1 year ago
Current commit
623fc8f7bd
2 files changed, 119 insertions and 0 deletions
  1. .gitignore (+62, -0)
  2. demo_playwright_003.py (+57, -0)

+ 62 - 0
.gitignore

@@ -0,0 +1,62 @@
+.DS_Store
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+.idea/*
+config.json
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other info into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+

+ 57 - 0
demo_playwright_003.py

@@ -0,0 +1,57 @@
+# -*- coding: utf-8 -*-
+# download file
+import os
+import re
+import time
+import httpx
+from pathlib import Path
+from playwright.sync_api import sync_playwright
+
+# make sure the directory for saving images exists
+save_dir = Path("downloaded_images")
+save_dir.mkdir(parents=True, exist_ok=True)
+
+base_url = "https://e-hentai.org/g/"
+href_url = "3055404/7ce423edd8"  # change this for each gallery
+change_page = '?p='
+
+with sync_playwright() as playwright:
+    browser = playwright.webkit.launch(headless=True)  # launch the browser; headless=True means headless mode
+    page = browser.new_page()  # create a new page
+
+    # navigate to the gallery page
+    page.goto(base_url + href_url)
+    time.sleep(0.5)
+
+    # get the page title
+    title = page.title()
+    img_file = Path(f"{save_dir}/{title}")
+
+    all_url_data = {}
+    # TODO: loop over every gallery page here and collect all image links
+    # get the page's HTML content
+    content = page.content()
+
+    # print(content)
+
+    view_img_list = re.findall('no-repeat"><a href="(.*?)">', content)  # viewer-page links on this gallery page
+
+    max_page_num = re.findall('onclick="return false">(.*?)</a></td>', content)  # total number of gallery pages
+
+    if max_page_num:
+        max_page_num = int(max_page_num[0])
+
+    print(max_page_num)
+
+    # # visit each inner image link
+    # for n, img_url in enumerate(view_img_list):
+    #     page.goto(img_url)
+    #     img_content = page.content()
+    #     b_img = re.findall('<img id="img" src="(.*?)">', img_content)
+    #     if b_img:
+    #         all_url_data.update({str(n).zfill(4): b_img[0]})
+    #
+    # print(all_url_data)
+
+    # close the browser
+    browser.close()
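
The committed script stops after printing the page count; the TODO loop, the commented-out viewer-page visits, and the actual download (the reason httpx is imported) are still missing. Below is a minimal sketch of how those remaining steps could fit together. It reuses the regexes and the '?p=' paging parameter from the demo itself, so treat those selectors, the 0-based page index, and the hard-coded .jpg extension as unverified assumptions about the site, not confirmed markup.

# -*- coding: utf-8 -*-
# sketch: page through the gallery, resolve each viewer page, download with httpx
import re
import httpx
from pathlib import Path
from playwright.sync_api import sync_playwright

base_url = "https://e-hentai.org/g/"
href_url = "3055404/7ce423edd8"
change_page = "?p="
save_dir = Path("downloaded_images")

with sync_playwright() as playwright:
    browser = playwright.webkit.launch(headless=True)
    page = browser.new_page()

    page.goto(base_url + href_url)
    img_dir = save_dir / page.title()
    img_dir.mkdir(parents=True, exist_ok=True)

    # count the gallery pages (same regex as the demo; assumed markup)
    content = page.content()
    max_page_num = re.findall('onclick="return false">(.*?)</a></td>', content)
    max_page_num = int(max_page_num[0]) if max_page_num else 1

    # collect viewer-page links from every gallery page (assumes ?p=0 is the first page)
    view_img_list = []
    for p in range(max_page_num):
        page.goto(f"{base_url}{href_url}{change_page}{p}")
        view_img_list += re.findall('no-repeat"><a href="(.*?)">', page.content())

    # open each viewer page, pull the real image URL, and fetch it with httpx
    for n, img_url in enumerate(view_img_list):
        page.goto(img_url)
        b_img = re.findall('<img id="img" src="(.*?)">', page.content())
        if b_img:
            data = httpx.get(b_img[0], timeout=30).content
            (img_dir / f"{str(n).zfill(4)}.jpg").write_bytes(data)  # extension guessed

    browser.close()

In practice the inner <img id="img"> lookup and the gallery-title directory name are the pieces most likely to need adjustment; the sketch only shows how the pieces the demo already hints at would connect.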