jack 1 рік тому
батько
коміт
8a79545a91
3 змінені файли: 49 додано та 42 видалено
  1. 11 0
      flaticon/flaticon.py
  2. 37 40
      kaizty/kaizty.py
  3. 1 2
      kaizty/target_link.txt

+ 11 - 0
flaticon/flaticon.py

@@ -329,5 +329,16 @@ if __name__ == "__main__":
         #  保存 img 链接
         all_data = open_browser(targets)
         save_data(all_data)
+
+        # 开始读取数据
+        load_data = load_data()
+
+        # 开始下载 img
+        target_file_path = check_local_downloads_dir()
+        download_img(load_data, target_file_path)
+        print('下载完成, 程序退出')
+    elif step == 4:
+        #  调试
+        pass
     else:
         pass

+ 37 - 40
kaizty/kaizty.py

@@ -14,8 +14,8 @@ import httpx
 from playwright.sync_api import sync_playwright
 
 target = 'kaizty'
-step = 1  # 1 = 获取img链接, 2 = 下载图片, 3 = 1 + 2
-local_proxy = 0
+step = 4  # 1 = 获取img链接, 2 = 下载图片, 3 = 1 + 2, 4 = 调试
+local_proxy = 1
 title_selector = '#pack-view__inner > section.pack-view__header > h1'  # 获取标题选择器
 img_selector = '#pack-view__inner > section.search-result > ul > li:nth-child({}) > div > a > img'  # 获取图片的url
 img_count_selector = '#pack-view__inner > section.pack-view__header > p'  # 获取图片总数选择器
@@ -36,7 +36,7 @@ def open_browser(target_urls):
     all_data = {}
 
     for target_url in target_urls:
-        pages = '/{}'
+        pages = '?page={}'
         urls = []
         title = ''  # 存放当前页面的title
         with sync_playwright() as playwright:
@@ -52,7 +52,8 @@ def open_browser(target_urls):
             page = context.new_page()
 
             img_sequence_num = 1
-            for page_count in range(1, 999):
+
+            for page_count in range(1, 2):
                 # 检查一下当前页面是不是 404
                 try:
                     page.wait_for_selector(not_find_page_selector, state="attached", timeout=2000)
@@ -68,46 +69,31 @@ def open_browser(target_urls):
                     print(e)
                     print(f'页面加载失败:url:{goto_url}')
 
-                if page_count == 1:
-                    # 获取title
-                    page.wait_for_selector(title_selector, state="attached", timeout=10000)
-                    title = page.query_selector(title_selector).inner_text()
-
-                    img_count = page.query_selector(img_count_selector).inner_text()
-                    img_count = int(img_count.split(' ')[0])
-
-                    invalid_chars = ['<', '>', ':', '"', '/', '\\', '|', '?', '*', '.', '  ', 'Icon Pack ']
-                    for char in invalid_chars:
-                        title = title.replace(char, '')
-
-                for i in range(1, img_count + 1):
-                    # 选择所有的<a>标签
-                    elements = page.query_selector_all(img_selector.format(i))
-
-                    # 遍历所有<a>标签,提取href属性
-                    for element in elements:
-                        src = element.get_attribute('src')
-                        if src:
-                            src = src.replace('/128/', '/512/')
-                            suffix = src.split('.')[-1]
-                            sequence = str(img_sequence_num).zfill(3)
-                            urls.append({
-                                'url': src,
-                                'file_title': title,
-                                'serial': sequence,
-                                'img': f'{title}_{sequence}',
-                                'suffix': suffix
-                            })
-                            img_sequence_num += 1
-                            break
-
-            print(f'所有图片URL已获取。总共图片 {len(urls)}')
+                page.wait_for_load_state('domcontentloaded')
 
-            page.close()
-            browser.close()
+                title = page.title()
+                page_source = page.content()
+                img_list = re.findall('<meta property="og:image" content="(.*?)"', page_source)
+
+                title = clean_string(title)
+
+                for img_url in img_list:
+                    suffix = img_url.split('.')[-1]
+                    sequence = str(img_sequence_num).zfill(3)
+                    urls.append({
+                        'url': img_url,
+                        'file_title': title,
+                        'serial': sequence,
+                        'img': f'{title}_{sequence}',
+                        'suffix': suffix
+                    })
+                    img_sequence_num += 1
 
             all_data[title] = urls
 
+            page.close()
+            browser.close()
+
     # 获取所有 url 数据之后, 存数据库
     return all_data
 
@@ -340,5 +326,16 @@ if __name__ == "__main__":
         #  保存 img 链接
         all_data = open_browser(targets)
         save_data(all_data)
+
+        # 开始读取数据
+        load_data = load_data()
+
+        # 开始下载 img
+        target_file_path = check_local_downloads_dir()
+        download_img(load_data, target_file_path)
+        print('下载完成, 程序退出')
+    elif step == 4:
+        #  调试
+        all_data = open_browser(targets)
     else:
         pass

+ 1 - 2
kaizty/target_link.txt

@@ -1,2 +1 @@
-https://www.flaticon.com/packs/editorial-design-24
-https://www.flaticon.com/packs/space-347
+https://www.kaizty.com//photos/L2lBQ200aE0vOVNmUGcydzhhT296Zz09.html