| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341 |
- # -*- coding: utf-8 -*-
- # 共两个步骤, 1, 将目标图片的信息拉到数据库(标题, 所有img的url), 2, 从数据库中读取对应目标站点的所有未下载过的img的url, 下载到本地
- # 需要安装psql, 并且 CREATE DATABASE collect; 运行会自动建表
- import sys
- import os
- import time
- import random
- import re
- import psycopg2
- sys.path.append(os.path.join(os.path.abspath(__file__).split('ResourceCollection')[0] + 'ResourceCollection'))
- import httpx
- from playwright.sync_api import sync_playwright
# Name of the target site; also used as the PostgreSQL table name.
target = 'kaizty'
# 1 = collect img URLs, 2 = download images, 3 = 1 + 2, 4 = debug
step = 4
# Non-zero: route the headless browser through the local proxy (127.0.0.1:7890).
local_proxy = 1
# CSS selector for the page title.
title_selector = '#pack-view__inner > section.pack-view__header > h1'
# CSS selector template for each image's URL ({} is the 1-based item index).
img_selector = '#pack-view__inner > section.search-result > ul > li:nth-child({}) > div > a > img'
# CSS selector for the total image count.
img_count_selector = '#pack-view__inner > section.pack-view__header > p'
# When the next page cannot be fetched, this selector marks the last page.
not_find_page_selector = 'body > div.page-navigation > a.next'
# Repository root, derived from this file's absolute path.
project_root = os.path.join(os.path.abspath(__file__).split('ResourceCollection')[0] + 'ResourceCollection')
# PostgreSQL connection parameters (database `collect` must already exist).
psql_params = {
    "host": "home.erhe.link",
    "port": 55434,
    "user": "psql",
    "password": "psql",
    "dbname": "collect"
}
def open_browser(target_urls):
    """Scrape each URL in *target_urls* with headless Chromium and collect
    image metadata.

    For every target, the paginated gallery is walked and image URLs are
    pulled from the ``og:image`` meta tags of each page's HTML source.

    Args:
        target_urls: iterable of gallery base URLs (pagination is appended
            as ``?page=N``).

    Returns:
        dict mapping each cleaned page title to a list of dicts with keys
        ``url``, ``file_title``, ``serial``, ``img`` and ``suffix``.
    """
    all_data = {}
    for target_url in target_urls:
        pages = '?page={}'
        urls = []
        title = ''  # title of the page currently being scraped
        with sync_playwright() as playwright:
            if local_proxy:
                browser = playwright.chromium.launch(
                    headless=True,
                    proxy={"server": "http://127.0.0.1:7890"}
                )
            else:
                browser = playwright.chromium.launch(headless=True)
            context = browser.new_context(viewport={'width': 1280, 'height': 700})
            page = context.new_page()
            img_sequence_num = 1
            # BUGFIX: goto_url was unbound when the last-page check fired on
            # the very first iteration (it is only assigned further below).
            goto_url = target_url
            for page_count in range(1, 2):
                # Check whether the current page is already the last one / 404.
                try:
                    page.wait_for_selector(not_find_page_selector, state="attached", timeout=2000)
                    print(f'总页数是 {page_count - 1} 在 url: {goto_url}')
                    break
                except Exception:
                    # BUGFIX: was a bare `except:`; selector absent within the
                    # timeout simply means there are more pages to walk.
                    pass
                try:
                    goto_url = target_url + pages.format(page_count)
                    page.goto(goto_url, timeout=5000)
                except Exception as e:
                    print(e)
                    print(f'页面加载失败:url:{goto_url}')
                page.wait_for_load_state('domcontentloaded')
                title = page.title()
                page_source = page.content()
                # Each image on these pages is exposed via an og:image meta tag.
                img_list = re.findall('<meta property="og:image" content="(.*?)"', page_source)
                title = clean_string(title)
                for img_url in img_list:
                    suffix = img_url.split('.')[-1]
                    sequence = str(img_sequence_num).zfill(3)
                    urls.append({
                        'url': img_url,
                        'file_title': title,
                        'serial': sequence,
                        'img': f'{title}_{sequence}',
                        'suffix': suffix
                    })
                    img_sequence_num += 1
            all_data[title] = urls
            page.close()
            browser.close()
    # All URL data collected; the caller persists it to the database.
    return all_data
def download_img(load_data, target_file_path):
    """Download every pending image in *load_data* into *target_file_path*
    and flip its ``download_state`` to True in the database.

    Args:
        load_data: list of row dicts as produced by ``load_data()``.
        target_file_path: directory that holds one sub-folder per set title.
    """
    # Connect to the DB so the download state can be written back per row.
    conn = psycopg2.connect(**psql_params)
    cursor = conn.cursor()
    print('正在下载图片')
    try:
        for data in load_data:
            # NOTE: the original also unpacked `name`, `target_site` and
            # `set_name`, never using them — and `target_site` carried a
            # trailing comma that made it a 1-tuple. Removed.
            row_id = data['id']  # renamed from `id` (shadowed the builtin)
            file_title = data['file_title'].replace(' ', '_')
            serial = str(data['serial']).zfill(3)
            image_suffix = data['image_suffix']
            img_url = data['img_url']
            # Create one folder per set; exist_ok avoids a TOCTOU race.
            title_file_path = os.path.join(target_file_path, file_title)
            os.makedirs(title_file_path, exist_ok=True)
            img_name = f'{file_title}_{serial}.{image_suffix}'  # image file name
            img_file_path = os.path.join(str(title_file_path), img_name)  # full path
            if os.path.exists(img_file_path):
                # Already on disk: just mark it as downloaded in the DB.
                query = f"UPDATE {target} SET download_state = %s WHERE id = %s"
                cursor.execute(query, (True, row_id))
                conn.commit()
                print(f'图片 {img_file_path} 已存在。继续!')
                continue
            retry = 8
            while retry:
                try:
                    resp = httpx.get(img_url, headers={
                        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36"
                    })
                    # BUGFIX: without this, an HTTP error page was written to
                    # disk and the row was marked as downloaded anyway.
                    resp.raise_for_status()
                    with open(img_file_path, 'wb') as f:
                        f.write(resp.content)
                    # Download succeeded: mark the row as downloaded.
                    query = f"UPDATE {target} SET download_state = %s WHERE id = %s"
                    cursor.execute(query, (True, row_id))
                    conn.commit()
                    print(f'已下载:{img_name}')
                    time.sleep(random.uniform(1, 2))
                    break
                except Exception as e:
                    print(f'下载图片失败:{img_name}。错误:{e} 重试: {retry}')
                    retry -= 1
                    time.sleep(random.uniform(3, 5))
    finally:
        # BUGFIX: cursor and connection were never closed.
        cursor.close()
        conn.close()
def save_data(data_item):
    """Persist scraped image metadata into the ``{target}`` table.

    BUGFIX: this function previously wrote to a hard-coded ``flaticon``
    table, while ``check_psql`` creates and ``load_data``/``download_img``
    read the ``{target}`` table — so saved data was never picked up by the
    download step. The table name now follows the module-level ``target``
    constant (a trusted module constant, not user input).

    Args:
        data_item: dict of ``title -> list of image dicts`` as produced by
            ``open_browser()``.
    """
    conn = psycopg2.connect(**psql_params)
    cursor = conn.cursor()
    try:
        for k, v in data_item.items():
            for data in v:
                # Skip rows whose img_url is already stored (de-duplication).
                cursor.execute(f"SELECT img_url FROM {target} WHERE img_url = %s", (data['url'],))
                if cursor.fetchone() is None:
                    # Insert the new row; values are parameterized.
                    cursor.execute(f"""
                        INSERT INTO {target} (name, target_site, file_title, set_name, serial, download_state, image_suffix, img_url)
                        VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
                    """, (
                        None,
                        target,
                        data['file_title'],
                        None,
                        data['serial'],
                        False,
                        data['suffix'],
                        data['url']
                    ))
                    conn.commit()
                    print(f"数据 {data['url']} 插入成功")
                else:
                    print(f"数据 {data['url']} 已存在,未插入")
    finally:
        # Always release the DB resources, even if an insert raises.
        cursor.close()
        conn.close()
def load_data():
    """Read all rows with ``download_state = false`` from the target table.

    Returns:
        list of row dicts (keys: id, name, target_site, file_title,
        set_name, serial, download_state, image_suffix, img_url).
        Exits the process with status 0 when there is nothing to download
        (preserved behavior — callers rely on it).
    """
    conn = psycopg2.connect(**psql_params)
    cursor = conn.cursor()
    # BUGFIX: was `SELECT *` with positional row[0..8] indexing, which
    # silently mis-maps fields if the column order ever changes. Name the
    # columns explicitly in the exact order they are unpacked below.
    columns = ('id', 'name', 'target_site', 'file_title', 'set_name',
               'serial', 'download_state', 'image_suffix', 'img_url')
    query = (f"SELECT {', '.join(columns)} FROM {target} "
             "WHERE download_state = %s order by id asc")
    load_data_list = []
    try:
        cursor.execute(query, (False,))
        rows = cursor.fetchall()
        # Zip each row with the column names so the mapping cannot drift.
        for row in rows:
            load_data_list.append(dict(zip(columns, row)))
    except psycopg2.Error as e:
        print(f"Database error: {e}")
    finally:
        # Always close the DB connection.
        cursor.close()
        conn.close()
    if load_data_list:
        return load_data_list
    else:
        print("没有需要下载的数据。")
        exit(0)
def check_psql():
    """Ensure the database is reachable and the target table exists.

    Connects using ``psql_params``; if the ``{target}`` table is missing
    from the public schema, it is created. Exits with status 1 when the
    database cannot be reached.
    """
    try:
        conn = psycopg2.connect(**psql_params)
    except Exception as e:
        print(f"无法连接到数据库:{e}")
        exit(1)
    with conn.cursor() as cur:
        # Probe the catalog for the table instead of trying a SELECT on it.
        cur.execute(
            "SELECT EXISTS(SELECT FROM pg_catalog.pg_tables WHERE schemaname = 'public' AND tablename = %s)",
            (target,)
        )
        table_present = cur.fetchone()[0]
        if not table_present:
            # Table missing: create it with the schema the rest of the
            # script expects (see save_data / load_data).
            ddl = f"""
            CREATE TABLE {target} (
                id SERIAL PRIMARY KEY,
                name VARCHAR(255),
                target_site VARCHAR(255),
                file_title VARCHAR(255),
                set_name VARCHAR(255),
                serial INT,
                download_state BOOLEAN,
                image_suffix VARCHAR(50),
                img_url VARCHAR(255)
            );
            """
            cur.execute(ddl)
            print(f"表 '{target}' 创建成功。")
        conn.commit()
    conn.close()
def check_local_downloads_dir():
    """Ensure ``<project_root>/downloads/<target>`` exists and return its path.

    Returns:
        str: absolute path of the per-target download directory.
    """
    download_file_path = os.path.join(str(project_root), 'downloads')
    target_file_path = os.path.join(download_file_path, target)
    # BUGFIX: exists()+mkdir() was racy and failed if a parent was missing;
    # makedirs with exist_ok creates the whole chain idempotently.
    os.makedirs(target_file_path, exist_ok=True)
    return target_file_path
def clean_string(string):
    """Sanitize a scraped page title into a filesystem-safe name.

    Steps: drop the site prefix, cut at the first '|', keep only CJK
    ideographs, spaces, ASCII letters and digits, turn spaces into
    underscores, and strip trailing underscores.

    Args:
        string: raw page title.

    Returns:
        str: cleaned title suitable for folder/file names.
    """
    string = string.replace('Kaizty Photos: ', '')
    string = string.split('|')[0]
    # Whitelist: CJK range, space, ASCII alphanumerics.
    string = re.sub(r'[^\u4e00-\u9fff a-zA-Z0-9]', '', string)
    string = string.replace(' ', '_')
    # BUGFIX: the original removed only ONE trailing underscore, so a title
    # ending in multiple spaces still ended up as 'title_'. Strip them all.
    return string.rstrip('_')
- if __name__ == "__main__":
- # 检查数据库
- check_psql()
- txt_file_name = 'target_link.txt'
- if not os.path.exists(txt_file_name):
- with open(txt_file_name, 'w') as file:
- file.write('')
- print('需要在 target_link.txt 中填写目标链接')
- exit(0)
- else:
- with open('target_link.txt', 'r') as f:
- targets = [target.strip() for target in f.readlines()]
- if not targets:
- print('在 target_link.txt 中未找到目标链接')
- exit(0)
- print(f'目标链接是:{targets}')
- if step == 1:
- all_data = open_browser(targets)
- save_data(all_data)
- elif step == 2:
- # 开始读取数据
- load_data = load_data()
- # 开始下载 img
- target_file_path = check_local_downloads_dir()
- download_img(load_data, target_file_path)
- print('下载完成, 程序退出')
- elif step == 3:
- # 保存 img 链接
- all_data = open_browser(targets)
- save_data(all_data)
- # 开始读取数据
- load_data = load_data()
- # 开始下载 img
- target_file_path = check_local_downloads_dir()
- download_img(load_data, target_file_path)
- print('下载完成, 程序退出')
- elif step == 4:
- # 调试
- all_data = open_browser(targets)
- else:
- pass
|