
update ++

jack 1 year ago
Current commit
623fc8f7bd
2 files changed, 119 insertions and 0 deletions
  1. .gitignore (+62, -0)
  2. demo_playwright_003.py (+57, -0)

+ 62 - 0
.gitignore

@@ -0,0 +1,62 @@
+.DS_Store
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+env/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg
+.idea/*
+config.json
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other info into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+

+ 57 - 0
demo_playwright_003.py

@@ -0,0 +1,57 @@
+# -*- coding: utf-8 -*-
+# download file
+import os
+import re
+import time
+import httpx
+from pathlib import Path
+from playwright.sync_api import sync_playwright
+
+# make sure the directory for saving images exists
+save_dir = Path("downloaded_images")
+save_dir.mkdir(parents=True, exist_ok=True)
+
+base_url = "https://e-hentai.org/g/"
+href_url = "3055404/7ce423edd8"  # change this for each gallery
+change_page = '?p='
+
+with sync_playwright() as playwright:
+    browser = playwright.webkit.launch(headless=True)  # launch the browser; headless=True means headless mode
+    page = browser.new_page()  # create a new page
+
+    # navigate to the gallery page
+    page.goto(base_url + href_url)
+    time.sleep(0.5)
+
+    # get the page title
+    title = page.title()
+    img_file = Path(f"{save_dir}/{title}")
+
+    all_url_data = {}
+    # TODO: loop over every gallery page here and collect all image links
+    # get the page's HTML content
+    content = page.content()
+
+    # print(content)
+
+    view_img_list = re.findall('no-repeat"><a href="(.*?)">', content)  # viewer-page links on this gallery page
+
+    max_page_num = re.findall('onclick="return false">(.*?)</a></td>', content)  # total number of gallery pages
+
+    if max_page_num:
+        max_page_num = int(max_page_num[0])
+
+    print(max_page_num)
+
+    # # visit each inner image link
+    # for n, img_url in enumerate(view_img_list):
+    #     page.goto(img_url)
+    #     img_content = page.content()
+    #     b_img = re.findall('<img id="img" src="(.*?)">', img_content)
+    #     if b_img:
+    #         all_url_data.update({str(n).zfill(4): b_img[0]})
+    #
+    # print(all_url_data)
+
+    # close the browser
+    browser.close()
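
The committed script stops after printing the page count; the TODO loop, the commented-out viewer-page visits, and the actual download (the reason httpx is imported) are still missing. Below is a minimal sketch of how those remaining steps could fit together. It reuses the regexes and the '?p=' paging parameter from the demo itself, so treat those selectors, the 0-based page index, and the hard-coded .jpg extension as unverified assumptions about the site, not confirmed markup.

# -*- coding: utf-8 -*-
# sketch: page through the gallery, resolve each viewer page, download with httpx
import re
import httpx
from pathlib import Path
from playwright.sync_api import sync_playwright

base_url = "https://e-hentai.org/g/"
href_url = "3055404/7ce423edd8"
change_page = "?p="
save_dir = Path("downloaded_images")

with sync_playwright() as playwright:
    browser = playwright.webkit.launch(headless=True)
    page = browser.new_page()

    page.goto(base_url + href_url)
    img_dir = save_dir / page.title()
    img_dir.mkdir(parents=True, exist_ok=True)

    # count the gallery pages (same regex as the demo; assumed markup)
    content = page.content()
    max_page_num = re.findall('onclick="return false">(.*?)</a></td>', content)
    max_page_num = int(max_page_num[0]) if max_page_num else 1

    # collect viewer-page links from every gallery page (assumes ?p=0 is the first page)
    view_img_list = []
    for p in range(max_page_num):
        page.goto(f"{base_url}{href_url}{change_page}{p}")
        view_img_list += re.findall('no-repeat"><a href="(.*?)">', page.content())

    # open each viewer page, pull the real image URL, and fetch it with httpx
    for n, img_url in enumerate(view_img_list):
        page.goto(img_url)
        b_img = re.findall('<img id="img" src="(.*?)">', page.content())
        if b_img:
            data = httpx.get(b_img[0], timeout=30).content
            (img_dir / f"{str(n).zfill(4)}.jpg").write_bytes(data)  # extension guessed

    browser.close()

In practice the inner <img id="img"> lookup and the gallery-title directory name are the pieces most likely to need adjustment; the sketch only shows how the pieces the demo already hints at would connect.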