| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657 |
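# -*- coding: utf-8 -*-
# Download images from a gallery page (so far this only collects the
# per-image links and the page count; the download step is still a TODO).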
import os
import re
import time
from pathlib import Path
from typing import List, Optional

import httpx
from playwright.sync_api import sync_playwright
# Directory under which downloaded images are meant to be stored.
save_dir = Path("downloaded_images")

base_url = "https://e-hentai.org/g/"
href_url = "3055404/7ce423edd8"  # change this for every run
change_page = '?p='  # presumably the paging query prefix; not used yet

# Patterns applied to the gallery page HTML.
_VIEW_LINK_RE = re.compile('no-repeat"><a href="(.*?)">')
_PAGE_LABEL_RE = re.compile('onclick="return false">(.*?)</a></td>')
# Characters that are path separators or are rejected in file names on
# common filesystems.
_UNSAFE_NAME_RE = re.compile(r'[\\/:*?"<>|]')


def extract_view_links(content: str) -> List[str]:
    """Return every href captured by the view-link pattern in *content*.

    The pattern matches an ``<a href="...">`` that directly follows a
    ``no-repeat">`` fragment; presumably these are the per-image viewer
    pages of the gallery (confirm against the live markup).
    """
    return _VIEW_LINK_RE.findall(content)


def parse_max_page(content: str) -> Optional[int]:
    """Return the largest numeric pagination label in *content*.

    The pattern captures the label of every link of the form
    ``onclick="return false">LABEL</a></td>``.  Labels that are not plain
    decimal numbers are ignored, so a non-numeric label can no longer
    raise ``ValueError``, and the maximum is taken over all matches
    instead of blindly using the first one.

    Returns:
        The highest page number found, or ``None`` when no numeric label
        is present.
    """
    labels = (label.strip() for label in _PAGE_LABEL_RE.findall(content))
    numbers = [int(label) for label in labels if label.isdecimal()]
    return max(numbers) if numbers else None


def safe_dir_name(title: str) -> str:
    """Turn a page title into a single, filesystem-safe path component.

    Path separators and other characters commonly rejected in file names
    are replaced with ``_``; an empty result falls back to ``"untitled"``.
    """
    cleaned = _UNSAFE_NAME_RE.sub("_", title).strip()
    return cleaned or "untitled"


def main() -> None:
    """Open the gallery page in headless WebKit and print its page count.

    Creates ``save_dir``, loads ``base_url + href_url``, reads the page
    title and HTML, extracts the view links and the page count, and prints
    the page count.  The browser is closed even if a step fails.
    """
    # Make sure the directory for saved images exists.
    save_dir.mkdir(parents=True, exist_ok=True)

    with sync_playwright() as playwright:
        # headless=True runs the browser without a visible window.
        browser = playwright.webkit.launch(headless=True)
        try:
            page = browser.new_page()
            # Navigate to the gallery page.
            page.goto(base_url + href_url)
            time.sleep(0.5)

            # The page title names the per-gallery folder.  It is sanitised
            # so that a '/' or ':' in the title cannot produce a nested or
            # invalid path.
            title = page.title()
            img_file = save_dir / safe_dir_name(title)
            all_url_data = {}

            # TODO: loop over every gallery page and collect all image links.
            content = page.content()
            view_img_list = extract_view_links(content)
            max_page_mun = parse_max_page(content)
            print(max_page_mun)

            # Pending step (not enabled yet): visit each view link and
            # record the src of its <img id="img"> element.  Note that the
            # search must run on img_content, the page just fetched.
            # for n, img_url in enumerate(view_img_list):
            #     page.goto(img_url)
            #     img_content = page.content()
            #     b_img = re.findall('<img id="img" src="(.*?)">', img_content)
            #     if b_img:
            #         all_url_data.update({str(n).zfill(4): b_img[0]})
            #
            # print(all_url_data)
        finally:
            # Always release the browser, even when navigation fails.
            browser.close()


if __name__ == "__main__":
    main()
|