import sys
import os
import time
import random
import httpx
from playwright.sync_api import sync_playwright

target_base_name = 'flaticon'
target_base_url = 'https://www.flaticon.com'
title_selector = '#pack-view__inner > section.pack-view__header > h1'
selector = '#pack-view__inner > section.search-result > ul > li:nth-child({}) > div > a > img'
img_selector = '#detail > div > div.row.detail__top.mg-none > section > div > div > div.row.row--vertical-center.mg-none.full-height.detail__icon__inner > div > div > img'
img_count_selector = '#pack-view__inner > section.pack-view__header > p'
not_find_page_selector = '#viewport > div.errorpage.e404 > h1'
# characters stripped from the pack title so it can be used as a file name;
# spaces are kept here and turned into underscores after cleaning
clean_string_seed = ['<', '>', ':', '"', '/', '\\', '|', '?', '*', '.', 'Icon Pack ']
all_data = {}

show_browser = False  # set True to watch the browser while scraping
img_set_url = ''      # URL of the flaticon pack to scrape, without a trailing page number

# ---------------------------------- open_browser -------------------------------------------
pages = '/{}'   # page-number suffix appended to img_set_url
urls = []
file_path = ''  # folder where downloaded images are stored
title = ''      # title of the current pack

with sync_playwright() as playwright:
    if show_browser:
        browser = playwright.webkit.launch(headless=False)
    else:
        browser = playwright.webkit.launch(headless=True)
    context = browser.new_context(viewport={'width': 1280, 'height': 700})
    page = context.new_page()
    img_sequence_num = 1
    for page_count in range(1, 999):
        try:
            goto_url = img_set_url + pages.format(page_count)
            page.goto(goto_url, timeout=5000)
        except Exception as e:
            print(e)
            print(f'Page load failed: {goto_url}')
        # check whether the current page is a 404
        try:
            page.wait_for_selector(not_find_page_selector, state="attached", timeout=2000)
            print(f'Total pages: {page_count - 1}')
            break
        except Exception:
            pass
        if page_count == 1:
            # get the pack title and clean it for use as a file-name prefix
            page.wait_for_selector(title_selector, state="attached", timeout=10000)
            title = page.query_selector(title_selector).inner_text()
            for char in clean_string_seed:
                title = title.replace(char, '')
            title = title.replace(' ', '_')
        # get the total number of images in the current pack
        img_count = page.query_selector(img_count_selector).inner_text()
        img_count = int(img_count.split(' ')[0])
        for i in range(1, img_count + 1):
            # select all tags matching the i-th list item
            elements = page.query_selector_all(selector.format(i))
            # iterate over the tags and extract the src attribute
            for element in elements:
                src = element.get_attribute('src')
                if src:
                    # swap the 128px thumbnail for the 512px version
                    src = src.replace('/128/', '/512/')
                    suffix = src.split('.')[-1]
                    sequence = str(img_sequence_num).zfill(3)
                    urls.append({
                        'url': src,
                        'img': f'{title}_{sequence}',
                        'suffix': suffix
                    })
                    img_sequence_num += 1
                break
    print(f'All image URLs have been obtained. Total: {len(urls)}')
    page.close()
    browser.close()

# -------------------------------------------------------------- process data --------------------------------------------------------------
save_line = []
if urls:
    n = 1
    for data in urls:
        save_line += [[0, 0, {
            'serial': n,
            'url': data['url'],
            'name': data['img'],
            'image_suffix': data['suffix']
        }]]
        n += 1
    all_data.update({
        'file_title': title,
        'line_ids': save_line,
        'image_count': len(urls)
    })
print('done!')
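
# ---------------------------------- download (sketch) ------------------------------------------
# The section above only collects URLs; httpx is imported but not used here, so the actual
# download step presumably lives elsewhere. Below is a minimal sketch of how the collected
# `urls` records could be fetched with httpx into `file_path`. The function name, timeout,
# and sleep interval are assumptions for illustration, not part of the original code.
def download_images(url_records, folder):
    os.makedirs(folder, exist_ok=True)
    with httpx.Client(timeout=10.0, follow_redirects=True) as client:
        for record in url_records:
            target = os.path.join(folder, f"{record['img']}.{record['suffix']}")
            try:
                response = client.get(record['url'])
                response.raise_for_status()
            except httpx.HTTPError as e:
                print(f"Download failed for {record['url']}: {e}")
                continue
            with open(target, 'wb') as f:
                f.write(response.content)
            # small random pause to avoid hammering the server
            time.sleep(random.uniform(0.5, 1.5))

# Example usage (assumes file_path has been set to a writable directory):
# download_images(urls, file_path)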