# -*- coding: utf-8 -*-
"""Open ``base_url`` in a Playwright WebKit browser and print the page HTML.

The script creates the image download directory, loads the page in a visible
(non-headless) browser window, reads the page title and HTML, and prints the
HTML as UTF-8 encoded bytes. Collecting the per-page image links is still a
TODO; the draft for it is kept below as commented-out code.
"""
# download file
import os
import re
import time
from pathlib import Path

import httpx
from playwright.sync_api import sync_playwright

# NOTE(review): Python reads PYTHONIOENCODING at interpreter start-up, so this
# assignment presumably only reaches child processes and does not change this
# process's own stdout encoding -- TODO confirm intent.
os.environ["PYTHONIOENCODING"] = "utf-8"

# Make sure the directory that downloaded images are saved into exists.
save_dir = Path("downloaded_images")
save_dir.mkdir(parents=True, exist_ok=True)

base_url = "https://www.t66y.com/htm_data/2409/16/6500489.html"

with sync_playwright() as playwright:
    # Launch the browser; headless=True would run it without a visible window.
    browser = playwright.webkit.launch(headless=False)
    try:
        page = browser.new_page()  # Create a new page (tab).

        # Navigate to the target page.
        page.goto(base_url)
        time.sleep(0.5)

        # Read the page title and derive the per-page image directory from it.
        title = page.title()
        # Page titles routinely contain characters that are path separators or
        # are illegal in file names on common filesystems; replace them so the
        # title is always a single, valid path component.
        safe_title = re.sub(r'[\\/:*?"<>|]', "_", title)
        img_file = save_dir / safe_title

        all_url_data = {}
        # TODO: loop here to collect all image links from every page.

        # Grab the page's HTML content.
        content = page.content()
        time.sleep(5)
        # Printing bytes yields an ASCII-only repr, which avoids console
        # encoding errors on non-UTF-8 terminals.
        print(content.encode('utf-8'))
        # print(content)

        # Draft of the image-link collection (not yet enabled):
        # view_img_list = re.findall('no-repeat">', content)
        # all_view_img_list = [i for i in view_img_list]
        # max_page_mun = re.findall('onclick="return false">(\\d*?)', content)
        # if max_page_mun:
        #     max_page_mun = int(max(max_page_mun))
        # for view_page in range(1, max_page_mun):
        #     page.goto(base_url + href_url + change_page + str(view_page))
        #     content = page.content()
        #     view_img_list = re.findall('no-repeat">', content)
        #     for i in view_img_list:
        #         all_view_img_list.append(i)
        # print(all_view_img_list)
        # # Visit each inner image link.
        # for n, img_url in enumerate(view_img_list):
        #     page.goto(img_url)
        #     img_content = page.content()
        #     b_img = re.findall('', content)
        #     if b_img:
        #         all_url_data.update({str(n).zfill(4): b_img[0]})
        #
        # print(all_url_data)
    finally:
        # Close the browser even if navigation or scraping raised.
        browser.close()