Jack 1 gadu atpakaļ
revīzija
31a9c31a57
1 mainītis faili ar 66 papildinājumiem un 0 dzēšanām
  1. 66 0
      cl_img.py

+ 66 - 0
cl_img.py

@@ -0,0 +1,66 @@
+# -*- coding: utf-8 -*-
+# download file
+import os
+import re
+import time
+import httpx
+from pathlib import Path
+from playwright.sync_api import sync_playwright
+import os
+
+os.environ["PYTHONIOENCODING"] = "utf-8"  # NOTE(review): presumably read only at interpreter startup, so this likely affects child processes only — confirm intent
+
+# Ensure the directory for saved images exists
+save_dir = Path("downloaded_images")
+save_dir.mkdir(parents=True, exist_ok=True)
+
+base_url = "https://www.t66y.com/htm_data/2409/16/6500489.html"  # the single page this script opens
+
+with sync_playwright() as playwright:
+    browser = playwright.webkit.launch(headless=False)  # Launch the WebKit browser; headless=True would mean headless mode
+    page = browser.new_page()  # Create a new page
+
+    # Navigate to the web page
+    page.goto(base_url)
+    time.sleep(0.5)  # fixed short pause after navigation
+
+    # Get the page title
+    title = page.title()
+    img_file = Path(f"{save_dir}/{title}")  # NOTE(review): not used by any active code below; title is not sanitized for use as a path
+
+    all_url_data = {}  # only referenced by the commented-out code below
+    # TODO: loop here to collect all image links from every page
+    # Get the page's HTML content
+    content = page.content()
+    time.sleep(5)  # NOTE(review): this pause runs after `content` was captured, so it does not change what is printed
+    print(content.encode('utf-8'))  # prints the bytes repr of the UTF-8-encoded HTML
+
+    # print(content)
+
+    # view_img_list = re.findall('no-repeat"><a href="(.*?)">', content)
+    # all_view_img_list = [i for i in view_img_list]
+
+    # max_page_mun = re.findall('onclick="return false">(\\d*?)</a></td>', content)
+    # if max_page_mun:
+    #     max_page_mun = int(max(max_page_mun))
+
+    # for view_page in range(1, max_page_mun):
+    #     page.goto(base_url + href_url + change_page + str(view_page))
+    #     content = page.content()
+    #     view_img_list = re.findall('no-repeat"><a href="(.*?)">', content)
+    #     for i in view_img_list:
+    #         all_view_img_list.append(i)
+
+    # print(all_view_img_list)
+    # # Visit the inner image links
+    # for n, img_url in enumerate(view_img_list):
+    #     page.goto(img_url)
+    #     img_content = page.content()
+    #     b_img = re.findall('<img id="img" src="(.*?)">', content)
+    #     if b_img:
+    #         all_url_data.update({str(n).zfill(4): b_img[0]})
+    # NOTE(review): the disabled loop above searches `content`, not `img_content` — check before re-enabling
+    # print(all_url_data)
+
+    # Close the browser
+    browser.close()