|
|
@@ -0,0 +1,333 @@
|
|
|
+# -*- coding: utf-8 -*-
|
|
|
+# 共两个步骤, 1, 将目标图片的信息拉到数据库(标题, 所有img的url), 2, 从数据库中读取对应目标站点的所有未下载过的img的url, 下载到本地
|
|
|
+# 需要安装psql, 并且 CREATE DATABASE collect; 运行会自动建表
|
|
|
+import sys
|
|
|
+import os
|
|
|
+import time
|
|
|
+import random
|
|
|
+
|
|
|
+import psycopg2
|
|
|
+
|
|
|
+sys.path.append(os.path.join(os.path.abspath(__file__).split('ResourceCollection')[0] + 'ResourceCollection'))
|
|
|
+import httpx
|
|
|
+from playwright.sync_api import sync_playwright
|
|
|
+
|
|
|
target = 'flaticon'  # target site name; also used as the DB table name
step = 2  # 1 = collect img URLs, 2 = download images, 3 = 1 + 2
local_proxy = 0  # non-zero: route the browser through http://127.0.0.1:7890
title_selector = '#pack-view__inner > section.pack-view__header > h1'  # selector for the pack title
img_selector = '#pack-view__inner > section.search-result > ul > li:nth-child({}) > div > a > img'  # selector for the n-th image (format slot is the 1-based index)
img_count_selector = '#pack-view__inner > section.pack-view__header > p'  # selector for the total image count
not_find_page_selector = '#viewport > div.errorpage.e404 > h1'  # when this matches, the previous page was the last one

# Repository root, derived from this file's absolute path.
project_root = os.path.join(os.path.abspath(__file__).split('ResourceCollection')[0] + 'ResourceCollection')

# PostgreSQL connection parameters (database "collect" must already exist).
psql_params = {
    "host": "home.erhe.link",
    "port": 55434,
    "user": "psql",
    "password": "psql",
    "dbname": "collect"
}
|
|
|
+
|
|
|
+
|
|
|
def open_browser(target_urls):
    """Scrape every page of each target pack URL and collect image metadata.

    For each target URL, pages /1, /2, ... are visited until a 404 page is
    detected.  On page 1 the pack title and total image count are read; each
    image's <img src> is upgraded from the 128px to the 512px variant and
    recorded with a zero-padded serial number.

    Args:
        target_urls: iterable of pack base URLs (without the /N page suffix).

    Returns:
        dict mapping sanitized pack title -> list of dicts with keys
        'url', 'file_title', 'serial', 'img', 'suffix'.
    """
    all_data = {}

    for target_url in target_urls:
        pages = '/{}'
        urls = []
        title = ''             # sanitized title of the current pack
        goto_url = target_url  # pre-set so the 404 log below can never hit NameError
        with sync_playwright() as playwright:
            if local_proxy:
                browser = playwright.chromium.launch(
                    headless=True,
                    proxy={"server": "http://127.0.0.1:7890"}
                )
            else:
                browser = playwright.chromium.launch(headless=True)

            context = browser.new_context(viewport={'width': 1280, 'height': 700})
            page = context.new_page()

            img_sequence_num = 1
            img_count = 0  # set on page 1; 0 keeps the item loop inert if page 1 failed
            for page_count in range(1, 999):
                # A 404 page means the previous page was the last one.
                try:
                    page.wait_for_selector(not_find_page_selector, state="attached", timeout=2000)
                    print(f'总页数是 {page_count - 1} 在 url: {goto_url}')
                    break
                except Exception:
                    # Selector not attached within the timeout -> not a 404, keep going.
                    pass

                try:
                    goto_url = target_url + pages.format(page_count)
                    page.goto(goto_url, timeout=5000)
                except Exception as e:
                    print(e)
                    print(f'页面加载失败:url:{goto_url}')

                if page_count == 1:
                    # Title and total image count only need to be read once.
                    page.wait_for_selector(title_selector, state="attached", timeout=10000)
                    title = page.query_selector(title_selector).inner_text()

                    img_count = page.query_selector(img_count_selector).inner_text()
                    img_count = int(img_count.split(' ')[0])

                    # Strip characters unsafe in file names, plus noise words.
                    invalid_chars = ['<', '>', ':', '"', '/', '\\', '|', '?', '*', '.', ' ', 'Icon Pack ']
                    for char in invalid_chars:
                        title = title.replace(char, '')

                for i in range(1, img_count + 1):
                    # Select the <img> element(s) in the i-th list slot.
                    elements = page.query_selector_all(img_selector.format(i))

                    # Record the first element that carries a src attribute.
                    for element in elements:
                        src = element.get_attribute('src')
                        if src:
                            src = src.replace('/128/', '/512/')  # request the 512px variant
                            suffix = src.split('.')[-1]
                            sequence = str(img_sequence_num).zfill(3)
                            urls.append({
                                'url': src,
                                'file_title': title,
                                'serial': sequence,
                                'img': f'{title}_{sequence}',
                                'suffix': suffix
                            })
                            img_sequence_num += 1
                            break

            print(f'所有图片URL已获取。总共图片 {len(urls)}')

            page.close()
            browser.close()

        all_data[title] = urls

    # Caller persists the collected URL data to the database.
    return all_data
|
|
|
+
|
|
|
+
|
|
|
def download_img(load_data, target_file_path):
    """Download every pending image and mark it as downloaded in the DB.

    Args:
        load_data: list of row dicts from load_data() (keys: id, name,
            target_site, file_title, set_name, serial, image_suffix, img_url).
        target_file_path: directory holding one sub-directory per pack.
    """
    # Open a connection so the download state can be written back per image.
    conn = psycopg2.connect(**psql_params)
    cursor = conn.cursor()

    print('正在下载图片')
    for data in load_data:
        row_id = data['id']  # renamed from `id` to avoid shadowing the builtin
        file_title = data['file_title'].replace(' ', '_')
        serial = str(data['serial']).zfill(3)
        image_suffix = data['image_suffix']
        img_url = data['img_url']

        # Create the per-pack directory on first use.
        title_file_path = os.path.join(target_file_path, file_title)
        if not os.path.exists(title_file_path):
            os.mkdir(title_file_path)

        img_name = f'{file_title}_{serial}.{image_suffix}'            # image file name
        img_file_path = os.path.join(str(title_file_path), img_name)  # full image path

        if os.path.exists(img_file_path):
            # Already on disk: just flag the row as downloaded and skip.
            query = f"UPDATE {target} SET download_state = %s WHERE id = %s"
            cursor.execute(query, (True, row_id))
            conn.commit()
            print(f'图片 {img_file_path} 已存在。继续!')
            continue

        retry = 8
        while retry:
            try:
                resp = httpx.get(img_url, headers={
                    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36"
                })
                # Don't save an HTTP error page as an image; a 4xx/5xx raises
                # and falls into the retry branch below.
                resp.raise_for_status()
                with open(img_file_path, 'wb') as f:
                    f.write(resp.content)

                # Flag the row as downloaded only after the file was written.
                query = f"UPDATE {target} SET download_state = %s WHERE id = %s"
                cursor.execute(query, (True, row_id))
                conn.commit()

                print(f'已下载:{img_name}')
                time.sleep(random.uniform(1, 2))
                break
            except Exception as e:
                print(f'下载图片失败:{img_name}。错误:{e} 重试: {retry}')
                retry -= 1
                time.sleep(random.uniform(3, 5))

    # Release DB resources once every row has been processed.
    cursor.close()
    conn.close()
|
|
|
+
|
|
|
+
|
|
|
def save_data(data_item):
    """Insert collected image URLs into the target table, skipping duplicates.

    Args:
        data_item: dict mapping pack title -> list of url dicts as produced
            by open_browser().
    """
    conn = psycopg2.connect(**psql_params)
    cursor = conn.cursor()
    for k, v in data_item.items():
        for data in v:
            # Skip rows whose img_url is already stored.  Use the module-level
            # `target` for the table name, consistent with the other queries
            # (previously hardcoded to 'flaticon').
            cursor.execute(f"SELECT img_url FROM {target} WHERE img_url = %s", (data['url'],))
            if cursor.fetchone() is None:
                cursor.execute(f"""
                    INSERT INTO {target} (name, target_site, file_title, set_name, serial, download_state, image_suffix, img_url)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
                """, (
                    None,                 # name: not collected by the scraper
                    target,
                    data['file_title'],
                    None,                 # set_name: not collected by the scraper
                    data['serial'],
                    False,                # download_state: pending
                    data['suffix'],
                    data['url']
                ))
                conn.commit()
                print(f"数据 {data['url']} 插入成功")
            else:
                print(f"数据 {data['url']} 已存在,未插入")

    # Close the database connection.
    cursor.close()
    conn.close()
|
|
|
+
|
|
|
+
|
|
|
def load_data():
    """Fetch all rows whose images have not been downloaded yet, ordered by id.

    Returns:
        list of dicts, one per pending row (keys: id, name, target_site,
        file_title, set_name, serial, download_state, image_suffix, img_url).
        Exits the process when nothing is pending.
    """
    conn = psycopg2.connect(**psql_params)
    cursor = conn.cursor()

    # Name the columns explicitly so the row -> dict mapping below cannot
    # silently break if the table's physical column order ever changes
    # (the original used SELECT * with positional indexing).
    columns = ('id', 'name', 'target_site', 'file_title', 'set_name',
               'serial', 'download_state', 'image_suffix', 'img_url')
    query = (f"SELECT {', '.join(columns)} FROM {target} "
             f"WHERE download_state = %s ORDER BY id ASC")

    load_data_list = []

    try:
        cursor.execute(query, (False,))
        for row in cursor.fetchall():
            load_data_list.append(dict(zip(columns, row)))
    except psycopg2.Error as e:
        print(f"Database error: {e}")
    finally:
        # Close the database connection.
        cursor.close()
        conn.close()

    if load_data_list:
        return load_data_list
    else:
        print("没有需要下载的数据。")
        # sys.exit instead of the site-module exit(), which may be absent.
        sys.exit(0)
|
|
|
+
|
|
|
+
|
|
|
def check_psql():
    """Verify the database is reachable and create the target table if absent.

    Exits the process with status 1 when the database cannot be reached.
    """
    try:
        conn = psycopg2.connect(**psql_params)
    except Exception as e:
        print(f"无法连接到数据库:{e}")
        # sys.exit instead of the site-module exit(), which may be absent.
        sys.exit(1)

    cur = conn.cursor()

    # Does the table already exist in the public schema?
    cur.execute("SELECT EXISTS(SELECT FROM pg_catalog.pg_tables WHERE schemaname = 'public' AND tablename = %s)",
                (target,))
    exist = cur.fetchone()[0]

    if not exist:
        # Create the table on first run.  `target` is a module constant, not
        # user input, so f-string interpolation of the table name is safe here.
        cur.execute(f"""
            CREATE TABLE {target} (
                id SERIAL PRIMARY KEY,
                name VARCHAR(255),
                target_site VARCHAR(255),
                file_title VARCHAR(255),
                set_name VARCHAR(255),
                serial INT,
                download_state BOOLEAN,
                image_suffix VARCHAR(50),
                img_url VARCHAR(255)
            );
        """)
        print(f"表 '{target}' 创建成功。")

    # Commit the DDL (a no-op when the table already existed).
    conn.commit()

    cur.close()
    conn.close()
|
|
|
+
|
|
|
+
|
|
|
def check_local_downloads_dir():
    """Ensure downloads/<target> exists under the project root.

    Returns:
        Path of the per-target download directory.
    """
    download_file_path = os.path.join(str(project_root), 'downloads')
    target_file_path = os.path.join(download_file_path, target)
    # makedirs creates both levels in one call and is a no-op when present,
    # replacing the two exists()/mkdir() pairs.
    os.makedirs(target_file_path, exist_ok=True)
    return target_file_path
|
|
|
+
|
|
|
+
|
|
|
if __name__ == "__main__":
    # Make sure the database is reachable and the table exists.
    check_psql()

    txt_file_name = 'target_link.txt'
    if not os.path.exists(txt_file_name):
        # Create an empty template file and ask the user to fill it in.
        with open(txt_file_name, 'w') as file:
            file.write('')
        print('需要在 target_link.txt 中填写目标链接')
        sys.exit(0)
    else:
        with open(txt_file_name, 'r') as f:
            targets = [line.strip() for line in f.readlines()]
        if not targets:
            print('在 target_link.txt 中未找到目标链接')
            sys.exit(0)

    print(f'目标链接是:{targets}')

    if step == 1:
        # Step 1: collect image URLs and store them in the database.
        all_data = open_browser(targets)
        save_data(all_data)
    elif step == 2:
        # Step 2: download every pending image recorded in the database.
        # Bind to a new name so the load_data() function is not shadowed.
        pending_rows = load_data()

        target_file_path = check_local_downloads_dir()
        download_img(pending_rows, target_file_path)
        print('下载完成, 程序退出')
    elif step == 3:
        # Step 3 = step 1 followed by step 2 (the original only ran step 1,
        # contradicting the "3 = 1 + 2" contract documented at the top).
        all_data = open_browser(targets)
        save_data(all_data)

        pending_rows = load_data()
        target_file_path = check_local_downloads_dir()
        download_img(pending_rows, target_file_path)
        print('下载完成, 程序退出')
    else:
        pass
|