|
|
@@ -1,11 +1,12 @@
|
|
|
# -*- coding: utf-8 -*-
|
|
|
# 共两个步骤, 1, 将目标图片的信息拉到数据库(标题, 所有img的url), 2, 从数据库中读取对应目标站点的所有未下载过的img的url, 下载到本地
|
|
|
# 需要安装psql, 并且 CREATE DATABASE collect; 运行会自动建表
|
|
|
+import socket
|
|
|
import sys
|
|
|
import os
|
|
|
import time
|
|
|
import random
|
|
|
-
|
|
|
+from concurrent.futures import ThreadPoolExecutor
|
|
|
import psycopg2
|
|
|
|
|
|
sys.path.append(os.path.join(os.path.abspath(__file__).split('ResourceCollection')[0] + 'ResourceCollection'))
|
|
|
@@ -13,8 +14,7 @@ import httpx
|
|
|
from playwright.sync_api import sync_playwright
|
|
|
|
|
|
target = 'flaticon'
|
|
|
-step = 2 # 1 = 获取img链接, 2 = 下载图片, 3 = 1 + 2
|
|
|
-remote_databases = 1
|
|
|
+step = 4 # 1 = 获取img链接, 2 = 下载图片, 3 = 1 + 2, 4 = 调试
|
|
|
local_proxy = 0
|
|
|
title_selector = '#pack-view__inner > section.pack-view__header > h1' # 获取标题选择器
|
|
|
img_selector = '#pack-view__inner > section.search-result > ul > li:nth-child({}) > div > a > img' # 获取图片的url
|
|
|
@@ -23,7 +23,12 @@ not_find_page_selector = '#viewport > div.errorpage.e404 > h1' # 当无法获
|
|
|
|
|
|
project_root = os.path.join(os.path.abspath(__file__).split('ResourceCollection')[0] + 'ResourceCollection')
|
|
|
|
|
|
-if remote_databases:
|
|
|
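+# Determine this machine's local IP address; if it does not contain '192.168.100' (i.e. we are presumably outside the 192.168.100.x LAN), connect to the database through the public host instead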
|
|
|
+s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
|
|
|
+s.connect(('10.255.255.255', 1))
|
|
|
+IP = s.getsockname()[0]
|
|
|
+s.close()
|
|
|
+if '192.168.100' not in IP:
|
|
|
psql_params = {
|
|
|
"host": "home.erhe.link",
|
|
|
"port": 55434,
|
|
|
@@ -130,11 +135,13 @@ def open_browser(target_urls):
|
|
|
|
|
|
def download_img(load_data, target_file_path):
|
|
|
print('正在下载图片')
|
|
|
- for index, data in enumerate(load_data): # 循环内是单张图片
|
|
|
- multitasking_download(index, data, load_data, target_file_path)
|
|
|
+ with ThreadPoolExecutor(max_workers=4) as executor:
|
|
|
+ executor.map(single_img_download,
|
|
|
+ [(index, data, load_data, target_file_path) for index, data in enumerate(load_data)])
|
|
|
|
|
|
|
|
|
-def multitasking_download(index, data, load_data, target_file_path):
|
|
|
+def single_img_download(args):
|
|
|
+ index, data, load_data, target_file_path = args
|
|
|
# 连接数据库, 准备反写下载状态
|
|
|
conn = psycopg2.connect(**psql_params)
|
|
|
cursor = conn.cursor()
|