# -*- coding: utf-8 -*- # 共两个步骤, 1, 将目标图片的信息拉到数据库(标题, 所有img的url), 2, 从数据库中读取对应目标站点的所有未下载过的img的url, 下载到本地 # 需要安装psql, 并且 CREATE DATABASE collect; 运行会自动建表 import sys import os import time import random import re import psycopg2 sys.path.append(os.path.join(os.path.abspath(__file__).split('ResourceCollection')[0] + 'ResourceCollection')) import httpx from playwright.sync_api import sync_playwright target = 'kaizty' step = 4 # 1 = 获取img链接, 2 = 下载图片, 3 = 1 + 2, 4 = 调试 local_proxy = 1 title_selector = '#pack-view__inner > section.pack-view__header > h1' # 获取标题选择器 img_selector = '#pack-view__inner > section.search-result > ul > li:nth-child({}) > div > a > img' # 获取图片的url img_count_selector = '#pack-view__inner > section.pack-view__header > p' # 获取图片总数选择器 not_find_page_selector = 'body > div.page-navigation > a.next' # 当无法获取下一页时, 此选择器为最后一页 project_root = os.path.join(os.path.abspath(__file__).split('ResourceCollection')[0] + 'ResourceCollection') psql_params = { "host": "home.erhe.link", "port": 55434, "user": "psql", "password": "psql", "dbname": "collect" } def open_browser(target_urls): all_data = {} for target_url in target_urls: pages = '?page={}' urls = [] title = '' # 存放当前页面的title with sync_playwright() as playwright: if local_proxy: browser = playwright.chromium.launch( headless=True, proxy={"server": "http://127.0.0.1:7890"} ) else: browser = playwright.chromium.launch(headless=True) context = browser.new_context(viewport={'width': 1280, 'height': 700}) page = context.new_page() img_sequence_num = 1 for page_count in range(1, 2): # 检查一下当前页面是不是 404 try: page.wait_for_selector(not_find_page_selector, state="attached", timeout=2000) print(f'总页数是 {page_count - 1} 在 url: {goto_url}') break except: pass try: goto_url = target_url + pages.format(page_count) page.goto(goto_url, timeout=5000) except Exception as e: print(e) print(f'页面加载失败:url:{goto_url}') page.wait_for_load_state('domcontentloaded') title = page.title() page_source = page.content() img_list = re.findall('