|
|
@@ -0,0 +1,333 @@
|
|
|
+# -*- coding: utf-8 -*-
|
|
|
+# 共两个步骤, 1, 将目标图片的信息拉到数据库(标题, 所有img的url), 2, 从数据库中读取对应目标站点的所有未下载过的img的url, 下载到本地
|
|
|
+# 需要安装psql, 并且 CREATE DATABASE collect; 运行会自动建表
|
|
|
+import sys
|
|
|
+import os
|
|
|
+import time
|
|
|
+import random
|
|
|
+
|
|
|
+import psycopg2
|
|
|
+
|
|
|
+sys.path.append(os.path.join(os.path.abspath(__file__).split('ResourceCollection')[0] + 'ResourceCollection'))
|
|
|
+import httpx
|
|
|
+from playwright.sync_api import sync_playwright
|
|
|
+
|
|
|
target = 'flaticon'  # target site name; also used as the DB table name
step = 2  # 1 = collect img URLs, 2 = download images, 3 = 1 + 2
local_proxy = 0  # non-zero: route the browser through http://127.0.0.1:7890
title_selector = '#pack-view__inner > section.pack-view__header > h1'  # selector for the pack title
img_selector = '#pack-view__inner > section.search-result > ul > li:nth-child({}) > div > a > img'  # selector for the n-th image (format slot is the 1-based index)
img_count_selector = '#pack-view__inner > section.pack-view__header > p'  # selector for the total image count
not_find_page_selector = '#viewport > div.errorpage.e404 > h1'  # when this matches, the previous page was the last one

# Repository root, derived from this file's absolute path.
project_root = os.path.join(os.path.abspath(__file__).split('ResourceCollection')[0] + 'ResourceCollection')

# PostgreSQL connection parameters (database "collect" must already exist).
psql_params = {
    "host": "home.erhe.link",
    "port": 55434,
    "user": "psql",
    "password": "psql",
    "dbname": "collect"
}
|
|
|
+
|
|
|
+
|
|
|
def open_browser(target_urls):
    """Scrape every page of each target pack URL and collect image metadata.

    For each target URL, pages /1, /2, ... are visited until a 404 page is
    detected.  On page 1 the pack title and total image count are read; each
    image's <img src> is upgraded from the 128px to the 512px variant and
    recorded with a zero-padded serial number.

    Args:
        target_urls: iterable of pack base URLs (without the /N page suffix).

    Returns:
        dict mapping sanitized pack title -> list of dicts with keys
        'url', 'file_title', 'serial', 'img', 'suffix'.
    """
    all_data = {}

    for target_url in target_urls:
        pages = '/{}'
        urls = []
        title = ''             # sanitized title of the current pack
        goto_url = target_url  # pre-set so the 404 log below can never hit NameError
        with sync_playwright() as playwright:
            if local_proxy:
                browser = playwright.chromium.launch(
                    headless=True,
                    proxy={"server": "http://127.0.0.1:7890"}
                )
            else:
                browser = playwright.chromium.launch(headless=True)

            context = browser.new_context(viewport={'width': 1280, 'height': 700})
            page = context.new_page()

            img_sequence_num = 1
            img_count = 0  # set on page 1; 0 keeps the item loop inert if page 1 failed
            for page_count in range(1, 999):
                # A 404 page means the previous page was the last one.
                try:
                    page.wait_for_selector(not_find_page_selector, state="attached", timeout=2000)
                    print(f'总页数是 {page_count - 1} 在 url: {goto_url}')
                    break
                except Exception:
                    # Selector not attached within the timeout -> not a 404, keep going.
                    pass

                try:
                    goto_url = target_url + pages.format(page_count)
                    page.goto(goto_url, timeout=5000)
                except Exception as e:
                    print(e)
                    print(f'页面加载失败:url:{goto_url}')

                if page_count == 1:
                    # Title and total image count only need to be read once.
                    page.wait_for_selector(title_selector, state="attached", timeout=10000)
                    title = page.query_selector(title_selector).inner_text()

                    img_count = page.query_selector(img_count_selector).inner_text()
                    img_count = int(img_count.split(' ')[0])

                    # Strip characters unsafe in file names, plus noise words.
                    invalid_chars = ['<', '>', ':', '"', '/', '\\', '|', '?', '*', '.', ' ', 'Icon Pack ']
                    for char in invalid_chars:
                        title = title.replace(char, '')

                for i in range(1, img_count + 1):
                    # Select the <img> element(s) in the i-th list slot.
                    elements = page.query_selector_all(img_selector.format(i))

                    # Record the first element that carries a src attribute.
                    for element in elements:
                        src = element.get_attribute('src')
                        if src:
                            src = src.replace('/128/', '/512/')  # request the 512px variant
                            suffix = src.split('.')[-1]
                            sequence = str(img_sequence_num).zfill(3)
                            urls.append({
                                'url': src,
                                'file_title': title,
                                'serial': sequence,
                                'img': f'{title}_{sequence}',
                                'suffix': suffix
                            })
                            img_sequence_num += 1
                            break

            print(f'所有图片URL已获取。总共图片 {len(urls)}')

            page.close()
            browser.close()

        all_data[title] = urls

    # Caller persists the collected URL data to the database.
    return all_data
|
|
|
+
|
|
|
+
|
|
|
def download_img(load_data, target_file_path):
    """Download every pending image and mark it as downloaded in the DB.

    Args:
        load_data: list of row dicts from load_data() (keys: id, name,
            target_site, file_title, set_name, serial, image_suffix, img_url).
        target_file_path: directory holding one sub-directory per pack.
    """
    # Open a connection so the download state can be written back per image.
    conn = psycopg2.connect(**psql_params)
    cursor = conn.cursor()

    print('正在下载图片')
    for data in load_data:
        row_id = data['id']  # renamed from `id` to avoid shadowing the builtin
        file_title = data['file_title'].replace(' ', '_')
        serial = str(data['serial']).zfill(3)
        image_suffix = data['image_suffix']
        img_url = data['img_url']

        # Create the per-pack directory on first use.
        title_file_path = os.path.join(target_file_path, file_title)
        if not os.path.exists(title_file_path):
            os.mkdir(title_file_path)

        img_name = f'{file_title}_{serial}.{image_suffix}'            # image file name
        img_file_path = os.path.join(str(title_file_path), img_name)  # full image path

        if os.path.exists(img_file_path):
            # Already on disk: just flag the row as downloaded and skip.
            query = f"UPDATE {target} SET download_state = %s WHERE id = %s"
            cursor.execute(query, (True, row_id))
            conn.commit()
            print(f'图片 {img_file_path} 已存在。继续!')
            continue

        retry = 8
        while retry:
            try:
                resp = httpx.get(img_url, headers={
                    "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36"
                })
                # Don't save an HTTP error page as an image; a 4xx/5xx raises
                # and falls into the retry branch below.
                resp.raise_for_status()
                with open(img_file_path, 'wb') as f:
                    f.write(resp.content)

                # Flag the row as downloaded only after the file was written.
                query = f"UPDATE {target} SET download_state = %s WHERE id = %s"
                cursor.execute(query, (True, row_id))
                conn.commit()

                print(f'已下载:{img_name}')
                time.sleep(random.uniform(1, 2))
                break
            except Exception as e:
                print(f'下载图片失败:{img_name}。错误:{e} 重试: {retry}')
                retry -= 1
                time.sleep(random.uniform(3, 5))

    # Release DB resources once every row has been processed.
    cursor.close()
    conn.close()
|
|
|
+
|
|
|
+
|
|
|
def save_data(data_item):
    """Insert collected image URLs into the target table, skipping duplicates.

    Args:
        data_item: dict mapping pack title -> list of url dicts as produced
            by open_browser().
    """
    conn = psycopg2.connect(**psql_params)
    cursor = conn.cursor()
    for k, v in data_item.items():
        for data in v:
            # Skip rows whose img_url is already stored.  Use the module-level
            # `target` for the table name, consistent with the other queries
            # (previously hardcoded to 'flaticon').
            cursor.execute(f"SELECT img_url FROM {target} WHERE img_url = %s", (data['url'],))
            if cursor.fetchone() is None:
                cursor.execute(f"""
                    INSERT INTO {target} (name, target_site, file_title, set_name, serial, download_state, image_suffix, img_url)
                    VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
                """, (
                    None,                 # name: not collected by the scraper
                    target,
                    data['file_title'],
                    None,                 # set_name: not collected by the scraper
                    data['serial'],
                    False,                # download_state: pending
                    data['suffix'],
                    data['url']
                ))
                conn.commit()
                print(f"数据 {data['url']} 插入成功")
            else:
                print(f"数据 {data['url']} 已存在,未插入")

    # Close the database connection.
    cursor.close()
    conn.close()
|
|
|
+
|
|
|
+
|
|
|
def load_data():
    """Fetch all rows whose images have not been downloaded yet, ordered by id.

    Returns:
        list of dicts, one per pending row (keys: id, name, target_site,
        file_title, set_name, serial, download_state, image_suffix, img_url).
        Exits the process when nothing is pending.
    """
    conn = psycopg2.connect(**psql_params)
    cursor = conn.cursor()

    # Name the columns explicitly so the row -> dict mapping below cannot
    # silently break if the table's physical column order ever changes
    # (the original used SELECT * with positional indexing).
    columns = ('id', 'name', 'target_site', 'file_title', 'set_name',
               'serial', 'download_state', 'image_suffix', 'img_url')
    query = (f"SELECT {', '.join(columns)} FROM {target} "
             f"WHERE download_state = %s ORDER BY id ASC")

    load_data_list = []

    try:
        cursor.execute(query, (False,))
        for row in cursor.fetchall():
            load_data_list.append(dict(zip(columns, row)))
    except psycopg2.Error as e:
        print(f"Database error: {e}")
    finally:
        # Close the database connection.
        cursor.close()
        conn.close()

    if load_data_list:
        return load_data_list
    else:
        print("没有需要下载的数据。")
        # sys.exit instead of the site-module exit(), which may be absent.
        sys.exit(0)
|
|
|
+
|
|
|
+
|
|
|
def check_psql():
    """Verify the database is reachable and create the target table if absent.

    Exits the process with status 1 when the database cannot be reached.
    """
    try:
        conn = psycopg2.connect(**psql_params)
    except Exception as e:
        print(f"无法连接到数据库:{e}")
        # sys.exit instead of the site-module exit(), which may be absent.
        sys.exit(1)

    cur = conn.cursor()

    # Does the table already exist in the public schema?
    cur.execute("SELECT EXISTS(SELECT FROM pg_catalog.pg_tables WHERE schemaname = 'public' AND tablename = %s)",
                (target,))
    exist = cur.fetchone()[0]

    if not exist:
        # Create the table on first run.  `target` is a module constant, not
        # user input, so f-string interpolation of the table name is safe here.
        cur.execute(f"""
            CREATE TABLE {target} (
                id SERIAL PRIMARY KEY,
                name VARCHAR(255),
                target_site VARCHAR(255),
                file_title VARCHAR(255),
                set_name VARCHAR(255),
                serial INT,
                download_state BOOLEAN,
                image_suffix VARCHAR(50),
                img_url VARCHAR(255)
            );
        """)
        print(f"表 '{target}' 创建成功。")

    # Commit the DDL (a no-op when the table already existed).
    conn.commit()

    cur.close()
    conn.close()
|
|
|
+
|
|
|
+
|
|
|
def check_local_downloads_dir():
    """Ensure downloads/<target> exists under the project root.

    Returns:
        Path of the per-target download directory.
    """
    download_file_path = os.path.join(str(project_root), 'downloads')
    target_file_path = os.path.join(download_file_path, target)
    # makedirs creates both levels in one call and is a no-op when present,
    # replacing the two exists()/mkdir() pairs.
    os.makedirs(target_file_path, exist_ok=True)
    return target_file_path
|
|
|
+
|
|
|
+
|
|
|
if __name__ == "__main__":
    # Make sure the database is reachable and the table exists.
    check_psql()

    txt_file_name = 'target_link.txt'
    if not os.path.exists(txt_file_name):
        # Create an empty template file and ask the user to fill it in.
        with open(txt_file_name, 'w') as file:
            file.write('')
        print('需要在 target_link.txt 中填写目标链接')
        sys.exit(0)
    else:
        with open(txt_file_name, 'r') as f:
            targets = [line.strip() for line in f.readlines()]
        if not targets:
            print('在 target_link.txt 中未找到目标链接')
            sys.exit(0)

    print(f'目标链接是:{targets}')

    if step == 1:
        # Step 1: collect image URLs and store them in the database.
        all_data = open_browser(targets)
        save_data(all_data)
    elif step == 2:
        # Step 2: download every pending image recorded in the database.
        # Bind to a new name so the load_data() function is not shadowed.
        pending_rows = load_data()

        target_file_path = check_local_downloads_dir()
        download_img(pending_rows, target_file_path)
        print('下载完成, 程序退出')
    elif step == 3:
        # Step 3 = step 1 followed by step 2 (the original only ran step 1,
        # contradicting the "3 = 1 + 2" contract documented at the top).
        all_data = open_browser(targets)
        save_data(all_data)

        pending_rows = load_data()
        target_file_path = check_local_downloads_dir()
        download_img(pending_rows, target_file_path)
        print('下载完成, 程序退出')
    else:
        pass
|