# -*- coding: utf-8 -*-
# download file
import os
import re
import time
import httpx
from pathlib import Path
from playwright.sync_api import sync_playwright
import os
# NOTE: PYTHONIOENCODING is consulted when an interpreter starts, so assigning
# it here cannot change the encoding of this process's already-open stdout; it
# is only inherited by child processes started after this point.
os.environ["PYTHONIOENCODING"] = "utf-8"

# Make sure the directory that downloaded images are saved into exists.
save_dir = Path("downloaded_images")
save_dir.mkdir(parents=True, exist_ok=True)

# Page whose HTML is fetched below.
base_url = "https://www.t66y.com/htm_data/2409/16/6500489.html"

with sync_playwright() as playwright:
    # Launch WebKit with a visible window (headless=True would run it hidden).
    browser = playwright.webkit.launch(headless=False)
    try:
        page = browser.new_page()

        # Navigate to the page and give it a brief moment before reading it.
        page.goto(base_url)
        time.sleep(0.5)

        # Derive the per-page output directory from the page title. Titles can
        # contain path separators or characters that are illegal in file names
        # (notably on Windows), so those are replaced before building the path;
        # an empty result falls back to a fixed name.
        title = page.title()
        safe_title = (
            re.sub(r'[\\/:*?"<>|\x00-\x1f]+', "_", title).strip(" .")
            or "untitled"
        )
        img_file = save_dir / safe_title

        # Will map a zero-padded image index to its image URL once the
        # planned collection loop sketched below is implemented.
        all_url_data = {}

        # TODO: loop over every page here and collect all image links.
        # Grab the rendered HTML of the current page.
        content = page.content()
        # NOTE(review): this pause happens after the HTML snapshot was taken,
        # so it does not influence `content`; move it above page.content() if
        # the intent was to let the page finish rendering first.
        time.sleep(5)
        # Printed as a bytes repr, which is ASCII-only and therefore safe on
        # consoles that cannot encode the page's characters.
        print(content.encode('utf-8'))
        # print(content)

        # Planned (disabled) implementation, kept for reference.
        # NOTE(review): the nesting below was reconstructed and may not match
        # the author's intent; `href_url` and `change_page` are not defined
        # anywhere in this file.
        # view_img_list = re.findall('no-repeat">', content)
        # all_view_img_list = [i for i in view_img_list]
        # max_page_mun = re.findall('onclick="return false">(\\d*?)', content)
        # if max_page_mun:
        #     max_page_mun = int(max(max_page_mun))
        #     for view_page in range(1, max_page_mun):
        #         page.goto(base_url + href_url + change_page + str(view_page))
        #         content = page.content()
        #         view_img_list = re.findall('no-repeat">', content)
        #         for i in view_img_list:
        #             all_view_img_list.append(i)
        # print(all_view_img_list)
        # # Visit each inner image link.
        # for n, img_url in enumerate(view_img_list):
        #     page.goto(img_url)
        #     img_content = page.content()
        #     # NOTE(review): the regex pattern of this call was split across
        #     # two lines and is incomplete in the original; it must be
        #     # re-supplied before enabling this code.
        #     b_img = re.findall('...', content)
        #     if b_img:
        #         all_url_data.update({str(n).zfill(4): b_img[0]})
        #
        # print(all_url_data)
    finally:
        # Close the browser even when navigation or scraping raised.
        browser.close()