|
|
@@ -0,0 +1,57 @@
|
|
|
+# -*- coding: utf-8 -*-
|
|
|
+# download file
|
|
|
+import os
|
|
|
+import re
|
|
|
+import time
|
|
|
+import httpx
|
|
|
+from pathlib import Path
|
|
|
+from playwright.sync_api import sync_playwright
|
|
|
+
|
|
|
# Root folder that downloaded galleries are stored under.
save_dir = Path("downloaded_images")

base_url = "https://e-hentai.org/g/"
href_url = "3055404/7ce423edd8"  # change this for every gallery
change_page = '?p='  # presumably the paging query prefix; not used yet

# Compiled once at import time; the pattern texts are what the script has
# always matched against the gallery HTML.
VIEW_LINK_RE = re.compile(r'no-repeat"><a href="(.*?)">')
PAGER_LABEL_RE = re.compile(r'onclick="return false">(.*?)</a></td>')
# Characters that are path separators or are rejected by common filesystems.
UNSAFE_PATH_CHARS_RE = re.compile(r'[\\/:*?"<>|\x00-\x1f]')


def extract_view_links(html: str) -> list:
    """Return every viewer-page link found in *html*, in document order."""
    return VIEW_LINK_RE.findall(html)


def extract_max_page(html: str) -> int:
    """Return the largest numeric pager label in *html*.

    The pager pattern can match several cells, and a label is not guaranteed
    to be a number (e.g. an arrow entity), so only decimal labels are
    considered and the maximum is taken instead of the first match.
    Returns 1 when no numeric label is present.
    """
    page_numbers = [
        int(label)
        for label in PAGER_LABEL_RE.findall(html)
        if label.isdecimal()
    ]
    return max(page_numbers, default=1)


def safe_dir_name(title: str) -> str:
    """Turn a page title into a name usable as a single directory component.

    Unsafe characters are replaced with ``_`` and leading/trailing spaces and
    dots are stripped; an empty result falls back to ``"untitled"``.
    """
    cleaned = UNSAFE_PATH_CHARS_RE.sub("_", title).strip(" .")
    return cleaned or "untitled"


def main():
    """Open the gallery page, parse it and print the highest page number."""
    # Make sure the directory for saved images exists.
    save_dir.mkdir(parents=True, exist_ok=True)

    with sync_playwright() as playwright:
        # headless=True runs the browser without a visible window.
        browser = playwright.webkit.launch(headless=True)
        try:
            page = browser.new_page()

            # Navigate to the gallery and give it a short moment to settle.
            page.goto(base_url + href_url)
            time.sleep(0.5)

            # The page title names the per-gallery folder; it is sanitised
            # because titles may contain path separators.
            title = page.title()
            img_file = save_dir / safe_dir_name(title)

            all_url_data = {}
            # TODO: loop over every gallery page and collect all image links.
            content = page.content()

            view_img_list = extract_view_links(content)
            max_page_num = extract_max_page(content)

            print(max_page_num)

            # TODO: visit each viewer link and record the full-size image URL:
            #     for n, img_url in enumerate(view_img_list):
            #         page.goto(img_url)
            #         img_content = page.content()
            #         b_img = re.findall('<img id="img" src="(.*?)">', img_content)
            #         if b_img:
            #             all_url_data[str(n).zfill(4)] = b_img[0]
        finally:
            # Close the browser even when navigation or parsing fails.
            browser.close()


if __name__ == "__main__":
    main()
|