# -*- coding: utf-8 -*-
# Two steps: 1) pull the target page's info (title, every img URL) into the
# database; 2) read all not-yet-downloaded img URLs for the target site from
# the database and download them locally.
# Requires PostgreSQL and `CREATE DATABASE collect;`. The table itself is
# created automatically on first run.
import socket
import sys
import os
import time
import random
from concurrent.futures import ThreadPoolExecutor

import psycopg2

sys.path.append(os.path.join(os.path.abspath(__file__).split('ResourceCollection')[0] + 'ResourceCollection'))
import httpx
from playwright.sync_api import sync_playwright

target = 'flaticon'
step = 4  # 1 = collect img URLs, 2 = download images, 3 = 1 + 2, 4 = debug
local_proxy = 0
thread_count = 8
title_selector = '#pack-view__inner > section.pack-view__header > h1'  # selector for the pack title
img_selector = '#pack-view__inner > section.search-result > ul > li:nth-child({}) > div > a > img'  # selector for each image
img_count_selector = '#pack-view__inner > section.pack-view__header > p'  # selector for the total image count
not_found_page_selector = '#viewport > div.errorpage.e404 > h1'  # present on the 404 page, i.e. one past the last page
project_root = os.path.join(os.path.abspath(__file__).split('ResourceCollection')[0] + 'ResourceCollection')

# Get the LAN IP; if we are not on the LAN, connect to the database over the
# public address instead. (Connecting a UDP socket sends no packets; it only
# makes the OS pick the outbound interface.)
s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
s.connect(('10.255.255.255', 1))
IP = s.getsockname()[0]
s.close()
if '192.168.100' not in IP:
    psql_params = {
        "host": "home.erhe.link",
        "port": 55434,
        "user": "psql",
        "password": "psql",
        "dbname": "collect"
    }
else:
    psql_params = {
        "host": "192.168.100.146",
        "port": 5434,
        "user": "psql",
        "password": "psql",
        "dbname": "collect"
    }


def open_browser(target_urls):
    # all_data = {}
    link_count = 1
    for target_url in target_urls:
        print(f'\nCollecting {target_url}, link {link_count} of {len(target_urls)}')
        link_count += 1
        pages = '/{}'
        urls = []
        title = ''  # title of the current pack
        total_page_count = 0  # total number of pages in the pack
        img_count = 0  # total image count, read from the first page
        with sync_playwright() as playwright:
            if local_proxy:
                browser = playwright.chromium.launch(
                    headless=True,
                    proxy={"server": "http://127.0.0.1:7890"}
                )
            else:
                browser = playwright.chromium.launch(headless=True)
            context = browser.new_context(viewport={'width': 1280, 'height': 700})
            page = context.new_page()
            img_sequence_num = 1
            for page_count in range(1, 999):
                try:
                    goto_url = target_url + pages.format(page_count)
                    page.goto(goto_url, timeout=8000)
                except Exception as e:
                    pass
                    # print(e)
                    # print(f'Page failed to load: {goto_url}')
                if page_count == 1:
                    # Grab the title and image count, then strip characters
                    # that are invalid in file names. 'Icon Pack ' must be
                    # removed before single spaces, otherwise it never matches.
                    page.wait_for_selector(title_selector, state="attached", timeout=10000)
                    title = page.query_selector(title_selector).inner_text()
                    img_count = page.query_selector(img_count_selector).inner_text()
                    img_count = int(img_count.split(' ')[0])
                    invalid_chars = ['Icon Pack ', '<', '>', ':', '"', '/', '\\', '|', '?', '*', '.', ' ']
                    for char in invalid_chars:
                        title = title.replace(char, '')
                else:
                    try:
                        # If the 404 selector appears, the previous page was the last one.
                        page.wait_for_selector(not_found_page_selector, state="attached", timeout=2000)
                        total_page_count = page_count - 1
                        break
                    except Exception:
                        pass
                for i in range(1, img_count + 1):
                    # Select the i-th image tag (empty list when absent on this page).
                    elements = page.query_selector_all(img_selector.format(i))
                    # Take the first match and extract its src attribute.
                    for element in elements:
                        src = element.get_attribute('src')
                        if src:
                            src = src.replace('/128/', '/512/')  # swap the thumbnail for the 512px variant
                            suffix = src.split('.')[-1]
                            sequence = str(img_sequence_num).zfill(3)
                            urls.append({
                                'url': src,
                                'file_title': title,
                                'serial': sequence,
                                'img': f'{title}_{sequence}',
                                'suffix': suffix
                            })
                            img_sequence_num += 1
                        break
            print(f'All image URLs collected: {total_page_count} pages, {len(urls)} images. Writing to database...')
            page.close()
            browser.close()
        # all_data[title] = urls  # alternative: save everything only after all crawling is done
        save_data({title: urls})
        print(f'{title} saved')
    # return all_data


def download_img(rows, target_file_path):
    print('Downloading images')
    with ThreadPoolExecutor(max_workers=thread_count) as executor:
        executor.map(single_img_download,
                     [(index, data, rows, target_file_path) for index, data in enumerate(rows)])
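
# Note: executor.map() above submits all tasks but never consumes the results,
# so an exception inside a worker is stored on its future and silently dropped.
# A minimal alternative sketch that surfaces worker errors (hypothetical, not
# wired into the script; uses submit/as_completed from concurrent.futures):
#
#     from concurrent.futures import as_completed
#     with ThreadPoolExecutor(max_workers=thread_count) as executor:
#         futures = [executor.submit(single_img_download, (i, d, rows, target_file_path))
#                    for i, d in enumerate(rows)]
#         for future in as_completed(futures):
#             future.result()  # re-raises any exception raised in the worker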


def single_img_download(args):
    index, data, rows, target_file_path = args
    # Open a connection so the download state can be written back.
    conn = psycopg2.connect(**psql_params)
    cursor = conn.cursor()
    row_id = data['id']
    name = data['name']
    target_site = data['target_site']
    file_title = data['file_title'].replace(' ', '_')
    set_name = data['set_name']
    serial = str(data['serial']).zfill(3)
    image_suffix = data['image_suffix']
    img_url = data['img_url']
    # Make sure the per-pack folder exists; exist_ok avoids a race between threads.
    title_file_path = os.path.join(target_file_path, file_title)
    os.makedirs(title_file_path, exist_ok=True)
    img_name = f'{file_title}_{serial}.{image_suffix}'  # image file name
    img_file_path = os.path.join(str(title_file_path), img_name)  # full image path
    if os.path.exists(img_file_path):
        # The image already exists locally: mark it downloaded in the database and skip it.
        query = f"UPDATE {target} SET download_state = %s WHERE id = %s"
        cursor.execute(query, (True, row_id))
        conn.commit()
        conn.close()
        print(f'Image {img_file_path} already exists. Skipping.')
        return
    retry = 8
    while retry:
        try:
            resp = httpx.get(img_url, headers={
                "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 "
                              "(KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36"
            })
            with open(img_file_path, 'wb') as f:
                f.write(resp.content)
            # Download succeeded: mark the row as downloaded.
            query = f"UPDATE {target} SET download_state = %s WHERE id = %s"
            cursor.execute(query, (True, row_id))
            conn.commit()
            # Report progress.
            rate = index / len(rows) * 100
            print(f'Downloaded {img_name}, item {index + 1} of {len(rows)}, {rate:.2f}% done')
            time.sleep(random.uniform(1, 2))
            break
        except Exception as e:
            print(f'Failed to download {img_name}. Error: {e}. Retries left: {retry}')
            retry -= 1
            time.sleep(random.uniform(3, 5))
    conn.close()


def save_data(data_item):
    conn = psycopg2.connect(**psql_params)
    cursor = conn.cursor()
    for k, v in data_item.items():
        for data in v:
            # Skip rows whose img_url is already in the table.
            cursor.execute("SELECT img_url FROM flaticon WHERE img_url = %s", (data['url'],))
            if cursor.fetchone() is None:
                cursor.execute("""
                    INSERT INTO flaticon (name, target_site, file_title, set_name, serial,
                                          download_state, image_suffix, img_url)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
                """, (
                    None, target, data['file_title'], None,
                    data['serial'], False, data['suffix'], data['url']
                ))
                conn.commit()
                # print(f"Row {data['url']} saved")
            else:
                print(f"Row {data['url']} already exists, skipping")
    cursor.close()
    conn.close()


def load_data():
    conn = psycopg2.connect(**psql_params)
    cursor = conn.cursor()
    # Select every row whose download_state is false.
    query = f"SELECT * FROM {target} WHERE download_state = %s ORDER BY id ASC"
    load_data_list = []
    try:
        cursor.execute(query, (False,))
        rows = cursor.fetchall()
        # Map the positional columns onto dicts; the order must match the
        # CREATE TABLE statement in check_psql() below.
        for row in rows:
            load_data_list.append({
                'id': row[0],
                'name': row[1],
                'target_site': row[2],
                'file_title': row[3],
                'set_name': row[4],
                'serial': row[5],
                'download_state': row[6],
                'image_suffix': row[7],
                'img_url': row[8]
            })
    except psycopg2.Error as e:
        print(f"Database error: {e}")
    finally:
        cursor.close()
        conn.close()
    if load_data_list:
        return load_data_list
    print("Nothing left to download.")
    exit(0)
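
# Note: save_data() deduplicates with a SELECT followed by an INSERT, which can
# race if two writers run concurrently. A sketch of an atomic alternative,
# assuming a UNIQUE constraint on img_url (check_psql() below does NOT create
# one, so this is hypothetical):
#
#     cursor.execute("""
#         INSERT INTO flaticon (name, target_site, file_title, set_name, serial,
#                               download_state, image_suffix, img_url)
#         VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
#         ON CONFLICT (img_url) DO NOTHING
#     """, (None, target, data['file_title'], None, data['serial'], False,
#           data['suffix'], data['url']))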


def check_psql():
    # Connect and create the target table if it does not exist yet.
    try:
        conn = psycopg2.connect(**psql_params)
    except Exception as e:
        print(f"Cannot connect to the database: {e}")
        exit(1)
    cur = conn.cursor()
    cur.execute(
        "SELECT EXISTS(SELECT FROM pg_catalog.pg_tables WHERE schemaname = 'public' AND tablename = %s)",
        (target,))
    exist = cur.fetchone()[0]
    if not exist:
        cur.execute(f"""
            CREATE TABLE {target}
            (
                id SERIAL PRIMARY KEY,
                name VARCHAR(255),
                target_site VARCHAR(255),
                file_title VARCHAR(255),
                set_name VARCHAR(255),
                serial INT,
                download_state BOOLEAN,
                image_suffix VARCHAR(50),
                img_url VARCHAR(255)
            );
        """)
        print(f"Table '{target}' created.")
    conn.commit()
    cur.close()
    conn.close()


def check_local_downloads_dir():
    # Make sure downloads/<target>/ exists under the project root.
    download_file_path = os.path.join(str(project_root), 'downloads')
    if not os.path.exists(download_file_path):
        os.mkdir(download_file_path)
    target_file_path = os.path.join(download_file_path, target)
    if not os.path.exists(target_file_path):
        os.mkdir(target_file_path)
    return target_file_path


def check_target_url_txt():
    txt_file_name = 'target_link.txt'
    if not os.path.exists(txt_file_name):
        with open(txt_file_name, 'w') as file:
            file.write('')
        print('Put the target links into target_link.txt')
        exit(0)
    with open(txt_file_name, 'r') as f:
        # Strip whitespace and drop blank lines.
        targets = [line.strip() for line in f if line.strip()]
    if not targets:
        print('No target links found in target_link.txt')
        exit(0)
    return targets


if __name__ == "__main__":
    # Make sure the database and table exist.
    check_psql()
    if step == 1:
        targets = check_target_url_txt()
        open_browser(targets)
    elif step == 2:
        # Read the pending rows, then download the images.
        pending = load_data()
        target_file_path = check_local_downloads_dir()
        download_img(pending, target_file_path)
        print('Download finished, exiting')
    elif step == 3:
        # Collect the img URLs first, then download everything.
        targets = check_target_url_txt()
        open_browser(targets)
        pending = load_data()
        target_file_path = check_local_downloads_dir()
        download_img(pending, target_file_path)
        print('Download finished, exiting')
    elif step == 4:
        # Debug
        pass
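
# Example workflow (assuming PostgreSQL is reachable with the credentials above
# and the Playwright Chromium browser is installed, e.g. `playwright install chromium`):
#   1. Put one pack URL per line into target_link.txt (hypothetical example:
#      https://www.flaticon.com/packs/some-pack).
#   2. Set step = 1 and run the script to collect the image URLs into the database.
#   3. Set step = 2 and run again to download every row still marked as undownloaded.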