jack 1 ano atrás
pai
commit
e74853c1cc
2 arquivos alterados com 42 adições e 19 exclusões
  1. +8 -8
      flaticon/flaticon.py
  2. +34 -11
      kaizty/kaizty.py

+ 8 - 8
flaticon/flaticon.py

@@ -52,14 +52,6 @@ def open_browser(target_urls):
 
             img_sequence_num = 1
             for page_count in range(1, 999):
-                # 检查一下当前页面是不是 404
-                try:
-                    page.wait_for_selector(not_find_page_selector, state="attached", timeout=2000)
-                    print(f'总页数是 {page_count - 1} 在 url: {goto_url}')
-                    break
-                except:
-                    pass
-
                 try:
                     goto_url = target_url + pages.format(page_count)
                     page.goto(goto_url, timeout=5000)
@@ -67,6 +59,14 @@ def open_browser(target_urls):
                     print(e)
                     print(f'页面加载失败:url:{goto_url}')
 
+                try:
+                    # 检查一下当前页面是不是 404
+                    page.wait_for_selector(not_find_page_selector, state="attached", timeout=2000)
+                    print(f'总页数是 {page_count - 1} 在 url: {goto_url}')
+                    break
+                except:
+                    pass
+
                 if page_count == 1:
                     # 获取title
                     page.wait_for_selector(title_selector, state="attached", timeout=10000)

+ 34 - 11
kaizty/kaizty.py

@@ -14,8 +14,10 @@ import httpx
 from playwright.sync_api import sync_playwright
 
 target = 'kaizty'
-step = 1  # 1 = 获取img链接, 2 = 下载图片, 3 = 1 + 2, 4 = 调试
-local_proxy = 1
+
+# 1 = 获取img链接, 2 = 下载图片, 3 = 1 + 2, 4 = 调试
+step = 2
+local_proxy = 0
 title_selector = '#pack-view__inner > section.pack-view__header > h1'  # 获取标题选择器
 img_selector = '#pack-view__inner > section.search-result > ul > li:nth-child({}) > div > a > img'  # 获取图片的url
 img_count_selector = '#pack-view__inner > section.pack-view__header > p'  # 获取图片总数选择器
@@ -53,17 +55,23 @@ def open_browser(target_urls):
 
             img_sequence_num = 1
 
-            for page_count in range(30, 31):
+            for page_count in range(1, 999):
                 try:
                     goto_url = target_url + pages.format(page_count)
-                    page.goto(goto_url, timeout=5000)
-                    page.wait_for_load_state('domcontentloaded')
+                    page.goto(goto_url, timeout=20000)
+                    page.wait_for_selector('body > div.housing > div.housing-coveringap > div.thrcol.refill.afsite > div.thr-ot.hid > div > div.c-content > div:nth-child(3) > div')
                 except Exception as e:
                     print(e)
                     print(f'页面加载失败:url:{goto_url}')
 
                 page_source = page.content()
 
+                if "EMPTY" in page_source:
+                    print('没有下一页了, 跳出循环')
+                    break
+
+                print(f'开始获取第 {page_count} 页')
+
                 title = page.title()
 
                 img_list = re.findall('<meta property="og:image" content="(.*?)"', page_source)
@@ -129,7 +137,22 @@ def download_img(load_data, target_file_path):
         while retry:
             try:
                 resp = httpx.get(img_url, headers={
-                    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36"
+                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+                    "Accept-Encoding": "gzip, deflate, br, zstd",
+                    "Accept-Language": "zh-CN,zh;q=0.9",
+                    "Cache-Control": "max-age=0",
+                    "Cookie": "asgfp2=77542c163334cb6fe4f6c38c671acfdd; _ga=GA1.1.1971075315.1723678888; _ga_WF05TQ75CR=GS1.1.1726202265.4.1.1726202301.24.0.0; asgfp2=77542c163334cb6fe4f6c38c671acfdd; sp-chjeuHenj=Po",
+                    "Priority": "u=0, i",
+                    "Referer": "https://www.kaizty.com/photos/bFh6Njdrc01HM0FxeEhrVFVXM2xlUT09.html?page=9",
+                    "Sec-CH-UA": "\"Google Chrome\";v=\"131\", \"Chromium\";v=\"131\", \"Not_A Brand\";v=\"24\"",
+                    "Sec-CH-UA-Mobile": "?0",
+                    "Sec-CH-UA-Platform": "\"Windows\"",
+                    "Sec-Fetch-Dest": "document",
+                    "Sec-Fetch-Mode": "navigate",
+                    "Sec-Fetch-Site": "same-origin",
+                    "Sec-Fetch-User": "?1",
+                    "Upgrade-Insecure-Requests": "1",
+                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
                 })
                 with open(img_file_path, 'wb') as f:
                     f.write(resp.content)
@@ -154,13 +177,13 @@ def save_data(data_item):
     for k, v in data_item.items():
         for data in v:
             # 检查img_url是否重复
-            cursor.execute("SELECT img_url FROM flaticon WHERE img_url = %s", (data['url'],))
+            cursor.execute(f"SELECT img_url FROM {target} WHERE img_url = %s", (data['url'],))
             if cursor.fetchone() is None:
                 # 插入数据
-                cursor.execute("""
-                    INSERT INTO flaticon (name, target_site, file_title, set_name, serial, download_state, image_suffix, img_url)
+                cursor.execute(("""
+                    INSERT INTO {target} (name, target_site, file_title, set_name, serial, download_state, image_suffix, img_url)
                     VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
-                """, (
+                """).format(target=target), (
                     None,
                     target,
                     data['file_title'],
@@ -252,7 +275,7 @@ def check_psql():
             serial INT,
             download_state BOOLEAN,
             image_suffix VARCHAR(50),
-            img_url VARCHAR(255)
+            img_url TEXT
         );
         """)
         print(f"表 '{target}' 创建成功。")