@@ -2,20 +2,47 @@
 import sys
 import os
 import time
+import random

 sys.path.append(os.path.join(os.path.abspath(__file__).split('ResourceCollection')[0] + 'ResourceCollection'))
 import httpx
 from playwright.sync_api import sync_playwright

-target_url = 'https://www.flaticon.com/packs/summer-watermelon-17517790'
-
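+# CSS selectors for the pack page: `selector` carries a `{}` placeholder for
+# the i-th icon in the grid, and `not_find_page_selector` matches the heading
+# of Flaticon's 404 page, which marks the end of pagination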
 title_selector = '#pack-view__inner > section.pack-view__header > h1'
 selector = '#pack-view__inner > section.search-result > ul > li:nth-child({}) > div > a > img'
 img_selector = '#detail > div > div.row.detail__top.mg-none > section > div > div > div.row.row--vertical-center.mg-none.full-height.detail__icon__inner > div > div > img'
 img_count_selector = '#pack-view__inner > section.pack-view__header > p'
+not_find_page_selector = '#viewport > div.errorpage.e404 > h1'
+
+
+def main(target_urls):
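+    # Phase 1: collect every image URL per pack; phase 2: download each pack,
+    # retrying until all of its images exist on disk.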
+    all_data = {}
+
+    for target_url in target_urls:
+        urls, file_path, title = open_browser(target_url)
+        all_data[title] = [urls, file_path, title]
+
+    for data in all_data:
+        urls = all_data[data][0]
+        file_path = all_data[data][1]
+        title = all_data[data][2]
+        while True:
+            if download_img(urls, file_path):
+                print(f'All images have been downloaded: {title}')
+                break
+            else:
+                print(f'Some images failed to download; retrying: {title}')
+
+        print('\n\n')

+    print('All Done')

-def main():
+
+def open_browser(target_url):
+    pages = '/{}'
+    urls = []
+    file_path = ''  # folder where the downloaded images are stored
+    title = ''  # title of the current pack page
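+    # everything below runs in a headless WebKit session; the collected URLs
+    # are returned so downloads can happen after the browser is closed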
     with sync_playwright() as playwright:
         browser = playwright.webkit.launch(
             headless=True,
@@ -24,75 +51,113 @@ def main():
         context = browser.new_context(viewport={'width': 1280, 'height': 700})
         page = context.new_page()

-        page.goto(target_url)
-
-        # fetch the pack title
-        page.wait_for_selector(title_selector, state="attached", timeout=10000)
-        title = page.query_selector(title_selector).inner_text()
-
-        img_count = page.query_selector(img_count_selector).inner_text()
-        img_count = int(img_count.split(' ')[0])
-
-        invalid_chars = ['<', '>', ':', '"', '/', '\\', '|', '?', '*', '.', ' ', 'Icon Pack ']
-        for char in invalid_chars:
-            title = title.replace(char, '')
-
-        img_name = title.replace(' ', '_')
-
-        current_path = os.getcwd()
-
-        download_file_path = os.path.join(current_path, 'download')
-        if not os.path.exists(download_file_path):
-            os.mkdir(download_file_path)
-
-        file_path = os.path.join(download_file_path, title)
-        if not os.path.exists(file_path):
-            os.mkdir(file_path)
-
-        # first level of URLs
-        urls = []
-        for i in range(1, img_count + 1):
-            # select the <img> element for each icon
-            elements = page.query_selector_all(selector.format(i))
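+        # pack pages live at <pack-url>/1, /2, ...; walk them in order until
+        # the 404 marker appears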
+        img_sequence_num = 1
+        for page_count in range(1, 999):
+            try:
+                goto_url = target_url + pages.format(page_count)
+                page.goto(goto_url, timeout=5000)
+            except Exception as e:
+                print(e)
+                print(f'Page load failed: {goto_url}')

-            # iterate over the elements and extract the src attribute
-            for element in elements:
-                src = element.get_attribute('src')
-                if src:
-                    src = src.replace('/128/', '/512/')
-                    sequence = str(i).zfill(2)
-                    urls.append({
-                        'url': src,
-                        'img': f'{img_name}_{sequence}.png'
-                    })
-        print('All image URLs have been obtained')
+            # check whether the current page is a 404
+            try:
+                page.wait_for_selector(not_find_page_selector, state="attached", timeout=2000)
+                print(f'Total pages: {page_count - 1} (404 at {goto_url})')
+                break
+            except Exception:
+                # no 404 marker, the page exists: keep scraping
+                pass
+
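+            # first page only: read the pack title and icon count, then
+            # create the download folder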
+            if page_count == 1:
+                # fetch the pack title
+                page.wait_for_selector(title_selector, state="attached", timeout=10000)
+                title = page.query_selector(title_selector).inner_text()
+
+                img_count = page.query_selector(img_count_selector).inner_text()
+                img_count = int(img_count.split(' ')[0])
+
+                # strip the 'Icon Pack' suffix first: once spaces have been
+                # removed further down the list, it can no longer match
+                invalid_chars = ['Icon Pack', '<', '>', ':', '"', '/', '\\', '|', '?', '*', '.', ' ']
+                for char in invalid_chars:
+                    title = title.replace(char, '')
+
+                img_name = title.replace(' ', '_')
+
+                current_path = os.getcwd()
+
+                download_file_path = os.path.join(current_path, 'download')
+                if not os.path.exists(download_file_path):
+                    os.mkdir(download_file_path)
+
+                file_path = os.path.join(download_file_path, title)
+                if not os.path.exists(file_path):
+                    os.mkdir(file_path)
+
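+            # every page is probed with the same nth-child selector; indices
+            # that do not exist on this page simply match no elements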
+            for i in range(1, img_count + 1):
+                # select the <img> element for the i-th icon slot
+                elements = page.query_selector_all(selector.format(i))
+
+                # take the first match, upgrade the thumbnail URL from the
+                # 128px to the 512px variant, and queue it for download
+                for element in elements:
+                    src = element.get_attribute('src')
+                    if src:
+                        src = src.replace('/128/', '/512/')
+                        sequence = str(img_sequence_num).zfill(3)
+                        urls.append({
+                            'url': src,
+                            'img': f'{img_name}_{sequence}.png'
+                        })
+                        img_sequence_num += 1
+                        break
+
+        print(f'All image URLs have been collected. Total: {len(urls)}')

         page.close()
         browser.close()

-        print('Downloading images')
-        for url in urls:
-            # if the .png file already exists it was downloaded earlier, skip it
-            target_img_url = url['url']
-            img_png_name = url['img']
-            target_img_name = os.path.join(file_path, img_png_name)
-            if os.path.exists(target_img_name):
-                print(f'Image {img_png_name} already exists')
-                continue
-
-            try:
-                resp = httpx.get(target_img_url, headers={
-                    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36"
-                })
-                with open(target_img_name, 'wb') as f:
-                    f.write(resp.content)
-                print(f'Downloaded: {img_png_name}')
-                time.sleep(1)
-            except Exception as e:
-                print(e)
-
-        print(f'{title}: download finished')
+    return urls, file_path, title
+
+
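+# Download every image in `urls` into `file_path`. Returns True only when
+# every file now exists on disk, so main() can keep retrying failed batches.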
+def download_img(urls, file_path):
+    all_done = True
+    print('Downloading images')
+    for url in urls:
+        # if the .png file already exists it was downloaded earlier, skip it
+        target_img_url = url['url']
+        img_png_name = url['img']
+        target_img_name = os.path.join(file_path, img_png_name)
+        if os.path.exists(target_img_name):
+            print(f'Image {img_png_name} already exists, skipping')
+            continue
+
+        try:
+            resp = httpx.get(target_img_url, headers={
+                "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36"
+            })
+            resp.raise_for_status()  # treat HTTP errors as failed downloads
+            with open(target_img_name, 'wb') as f:
+                f.write(resp.content)
+            print(f'Downloaded: {img_png_name}')
+            time.sleep(random.uniform(1, 2))
+        except Exception as e:
+            print(f'\nFailed to download image: {target_img_url}. err: {e}\n')
+            time.sleep(random.uniform(3, 5))
+            all_done = False
+
+    return all_done


 if __name__ == "__main__":
-    main()
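+    # read one pack URL per line from target_link.txt; create the file on the
+    # first run so the user knows where to put the links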
+    txt_file_name = 'target_link.txt'
+    if not os.path.exists(txt_file_name):
+        with open(txt_file_name, 'w') as file:
+            file.write('')
+        print('Please add the target links to target_link.txt')
+        sys.exit(0)
+    else:
+        with open(txt_file_name, 'r') as f:
+            # ignore blank lines so an empty string cannot slip in as a target
+            targets = [target.strip() for target in f.readlines() if target.strip()]
+        if not targets:
+            print('No target link found in target_link.txt')
+            sys.exit(0)
+
+        print(f'Target links: {targets}')
+        main(targets)