jack 1 year ago
parent commit 2a13e82cff
3 changed files with 135 additions and 120 deletions
  1. flaticon/main.py (+132 -67)
  2. flaticon/target_link.txt (+3 -0)
  3. flaticon/test.py (+0 -53)

+ 132 - 67
flaticon/main.py

@@ -2,20 +2,47 @@
 import sys
 import os
 import time
+import random
 
 sys.path.append(os.path.join(os.path.abspath(__file__).split('ResourceCollection')[0] + 'ResourceCollection'))
 import httpx
 from playwright.sync_api import sync_playwright
 
-target_url = 'https://www.flaticon.com/packs/summer-watermelon-17517790'
-
 title_selector = '#pack-view__inner > section.pack-view__header > h1'
 selector = '#pack-view__inner > section.search-result > ul > li:nth-child({}) > div > a > img'
 img_selector = '#detail > div > div.row.detail__top.mg-none > section > div > div > div.row.row--vertical-center.mg-none.full-height.detail__icon__inner > div > div > img'
 img_count_selector = '#pack-view__inner > section.pack-view__header > p'
+not_find_page_selector = '#viewport > div.errorpage.e404 > h1'
+
+
+def main(target_urls):
+    all_data = {}
+
+    for target_url in target_urls:
+        urls, file_path, title = open_browser(target_url)
+        all_data[title] = [urls, file_path, title]
+
+    for data in all_data:
+        urls = all_data[data][0]
+        file_path = all_data[data][1]
+        title = all_data[data][2]
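+        # retry until download_img reports that every file exists on disk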
+        while True:
+            if download_img(urls, file_path):
+                print(f'All images have been downloaded: {title}')
+                break
+            else:
+                print(f'Some images failed to download, retrying: {title}')
+
+        print('\n\n')
 
+    print('All Done')
 
-def main():
+
+def open_browser(target_url):
+    pages = '/{}'
+    urls = []
+    file_path = ''  # folder the downloaded images are saved into
+    title = ''  # title of the current pack page
     with sync_playwright() as playwright:
         browser = playwright.webkit.launch(
             headless=True,
@@ -24,75 +51,113 @@ def main():
         context = browser.new_context(viewport={'width': 1280, 'height': 700})
         page = context.new_page()
 
-        page.goto(target_url)
-
-        # get the title
-        page.wait_for_selector(title_selector, state="attached", timeout=10000)
-        title = page.query_selector(title_selector).inner_text()
-
-        img_count = page.query_selector(img_count_selector).inner_text()
-        img_count = int(img_count.split(' ')[0])
-
-        invalid_chars = ['<', '>', ':', '"', '/', '\\', '|', '?', '*', '.', '  ', 'Icon Pack ']
-        for char in invalid_chars:
-            title = title.replace(char, '')
-
-        img_name = title.replace(' ', '_')
-
-        current_path = os.getcwd()
-
-        download_file_path = os.path.join(current_path, 'download')
-        if not os.path.exists(download_file_path):
-            os.mkdir(download_file_path)
-
-        file_path = os.path.join(download_file_path, title)
-        if not os.path.exists(file_path):
-            os.mkdir(file_path)
-
-        # first level of URLs
-        urls = []
-        for i in range(1, img_count + 1):
-            # select the <img> element inside the i-th result item
-            elements = page.query_selector_all(selector.format(i))
+        img_sequence_num = 1
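+        # pack pages are addressed as <pack-url>/1, /2, ...; loop until a 404 page appears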
+        for page_count in range(1, 999):
+            try:
+                goto_url = target_url + pages.format(page_count)
+                page.goto(goto_url, timeout=5000)
+            except Exception as e:
+                print(e)
+                print(f'Page load failed: {goto_url}')
 
-            # extract the src attribute from each match
-            for element in elements:
-                src = element.get_attribute('src')
-                if src:
-                    src = src.replace('/128/', '/512/')
-                    sequence = str(i).zfill(2)
-                    urls.append({
-                        'url': src,
-                        'img': f'{img_name}_{sequence}.png'
-                    })
-        print('All image URLs have been obtained')
+            # check whether the current page is a 404
+            try:
+                page.wait_for_selector(not_find_page_selector, state="attached", timeout=2000)
+                print(f'Total pages: {page_count - 1} (hit 404 at {goto_url})')
+                break
+            except Exception:
+                pass
+
+            if page_count == 1:
+                # get the pack title
+                page.wait_for_selector(title_selector, state="attached", timeout=10000)
+                title = page.query_selector(title_selector).inner_text()
+
+                img_count = page.query_selector(img_count_selector).inner_text()
+                img_count = int(img_count.split(' ')[0])
+
+                invalid_chars = ['<', '>', ':', '"', '/', '\\', '|', '?', '*', '.', '  ', 'Icon Pack ']
+                for char in invalid_chars:
+                    title = title.replace(char, '')
+
+                img_name = title.replace(' ', '_')
+
+                current_path = os.getcwd()
+
+                download_file_path = os.path.join(current_path, 'download')
+                if not os.path.exists(download_file_path):
+                    os.mkdir(download_file_path)
+
+                file_path = os.path.join(download_file_path, title)
+                if not os.path.exists(file_path):
+                    os.mkdir(file_path)
+
+            for i in range(1, img_count + 1):
+                # select the <img> element inside the i-th result item
+                elements = page.query_selector_all(selector.format(i))
+
+                # pull the src attribute and swap the 128px path for the 512px one
+                for element in elements:
+                    src = element.get_attribute('src')
+                    if src:
+                        src = src.replace('/128/', '/512/')
+                        sequence = str(img_sequence_num).zfill(3)
+                        urls.append({
+                            'url': src,
+                            'img': f'{img_name}_{sequence}.png'
+                        })
+                        img_sequence_num += 1
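+                        # only the first matching <img> per item is needed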
+                        break
+
+        print(f'All image URLs have been obtained. Total images: {len(urls)}')
 
         page.close()
         browser.close()
 
-        print('Downloading images')
-        for url in urls:
-            # if the png already exists it was downloaded before, so skip it
-            target_img_url = url['url']
-            img_png_name = url['img']
-            target_img_name = os.path.join(file_path, img_png_name)
-            if os.path.exists(target_img_name):
-                print(f'Image {img_png_name} already exists')
-                continue
-
-            try:
-                resp = httpx.get(target_img_url, headers={
-                    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36"
-                })
-                with open(target_img_name, 'wb') as f:
-                    f.write(resp.content)
-                print(f'Downloaded: {img_png_name}')
-                time.sleep(1)
-            except Exception as e:
-                print(e)
-
-        print(f'{title} : download complete')
+        return urls, file_path, title
+
+
+def download_img(urls, file_path):
+    all_done = True
+    print('Downloading images')
+    for url in urls:
+        # if the png already exists it was downloaded before, so skip it
+        target_img_url = url['url']
+        img_png_name = url['img']
+        target_img_name = os.path.join(file_path, img_png_name)
+        if os.path.exists(target_img_name):
+            print(f'Image {img_png_name} already exists, skipping')
+            continue
+
+        try:
+            resp = httpx.get(target_img_url, headers={
+                "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36"
+            })
+            with open(target_img_name, 'wb') as f:
+                f.write(resp.content)
+            print(f'Downloaded: {img_png_name}')
+            time.sleep(random.uniform(1, 2))
+        except Exception as e:
+            print(f'\nFailed to download image: {target_img_url}. err: {e}\n')
+            time.sleep(random.uniform(3, 5))
+            all_done = False
+
+    return all_done
 
 
 if __name__ == "__main__":
-    main()
+    txt_file_name = 'target_link.txt'
+    if not os.path.exists(txt_file_name):
+        with open(txt_file_name, 'w') as file:
+            file.write('')
+        print('Please add target links to target_link.txt')
+        exit(0)
+    else:
+        with open(txt_file_name, 'r') as f:
+            targets = [line.strip() for line in f if line.strip()]
+            if not targets:
+                print('No target link found in target_link.txt')
+                exit(0)
+
+    print(f'Target links: {targets}')
+    main(targets)

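The rewritten main() drives download_img in an unbounded retry loop, leaning on the os.path.exists check so that each pass only re-attempts files that previously failed. A minimal standalone sketch of that retry-until-complete pattern, with a hypothetical fetch_one(url, path) standing in for the real httpx download and a bounded round count for safety:

import os
import time

def download_until_complete(items, fetch_one, max_rounds=10):
    # items: list of (local_path, url) pairs; fetch_one is assumed to
    # write the file to local_path or raise on failure
    for _ in range(max_rounds):
        all_done = True
        for path, url in items:
            if os.path.exists(path):
                continue  # finished in an earlier round
            try:
                fetch_one(url, path)
            except Exception:
                all_done = False  # leave it for the next round
        if all_done:
            return True
        time.sleep(3)  # back off before re-attempting failures
    return False
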
+ 3 - 0
flaticon/target_link.txt

@@ -0,0 +1,3 @@
+https://www.flaticon.com/packs/farming-158
+https://www.flaticon.com/packs/space-347
+https://www.flaticon.com/packs/summer-370

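target_link.txt is consumed one URL per line by the __main__ block above, with blank lines stripped. If anything other than pack links ever ends up in the file, a small filter along these lines could guard main() (the packs prefix check is an assumption about valid targets, not something the diff enforces):

def load_targets(path='target_link.txt'):
    # keep only non-empty lines that look like flaticon pack URLs
    with open(path) as f:
        lines = [line.strip() for line in f if line.strip()]
    return [u for u in lines if u.startswith('https://www.flaticon.com/packs/')]
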
+ 0 - 53
flaticon/test.py

@@ -1,53 +0,0 @@
-import httpx
-import re
-import time
-
-url = 'https://www.flaticon.com/packs/vegetable-17858464'
-
-headers = {
-    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
-    "accept-encoding": "gzip, deflate, br, zstd",
-    "accept-language": "zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6",
-    "cache-control": "max-age=0",
-    "priority": "u=0, i",
-    "referer": "https://www.flaticon.com/icons",
-    "sec-ch-ua": "\"Chromium\";v=\"128\", \"Not;A=Brand\";v=\"24\", \"Microsoft Edge\";v=\"128\"",
-    "sec-ch-ua-mobile": "?0",
-    "sec-ch-ua-platform": "\"macOS\"",
-    "sec-fetch-dest": "document",
-    "sec-fetch-mode": "navigate",
-    "sec-fetch-site": "same-origin",
-    "sec-fetch-user": "?1",
-    "upgrade-insecure-requests": "1",
-    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 Edg/128.0.0.0"
-}
-
-resp = httpx.get(url, headers=headers)
-resp.encoding = 'utf-8'
-text = resp.text
-
-all_img_url_list = []
-img_count = re.findall('<meta property=\'og:title\' content="(.*?) ', text)
-img_url_list = re.findall('data-png="(.*?)"', text)
-for u in img_url_list:
-    if u == '{{png512}}':
-        continue
-    all_img_url_list.append(u)
-
-if int(img_count[0]) > 50:
-    for page in range(2, 999):
-        if 'Oopsies... Seems like you got lost! - Flaticon' in text:
-            break
-        next_url = url + '/' + str(page)
-        resp = httpx.get(next_url, headers=headers)
-        resp.encoding = 'utf-8'
-        text = resp.text
-        next_page_img_url = re.findall('data-png="(.*?)"', text)
-        for next_img in next_page_img_url:
-            if next_img == '{{png512}}':
-                continue
-            all_img_url_list.append(next_img)
-        time.sleep(2)
-
-print(img_url_list)
-print(len(img_url_list))