import sys
import os
import time
import random
import httpx
from playwright.sync_api import sync_playwright

target_base_name = 'flaticon'
target_base_url = 'https://www.flaticon.com'
title_selector = '#pack-view__inner > section.pack-view__header > h1'
selector = '#pack-view__inner > section.search-result > ul > li:nth-child({}) > div > a > img'
img_selector = '#detail > div > div.row.detail__top.mg-none > section > div > div > div.row.row--vertical-center.mg-none.full-height.detail__icon__inner > div > div > img'
img_count_selector = '#pack-view__inner > section.pack-view__header > p'
not_find_page_selector = '#viewport > div.errorpage.e404 > h1'
# Strip the "Icon Pack " label and characters that are illegal in file names;
# remaining spaces are converted to underscores below.
clean_string_seed = ['Icon Pack ', '<', '>', ':', '"', '/', '\\', '|', '?', '*', '.']
all_data = {}

# ---------------------------------- open_browser ----------------------------------
pages = '/{}'
urls = []
file_path = ''  # folder where the downloaded images will be stored
title = ''      # title of the current pack page

with sync_playwright() as playwright:
    # `self` (show_browser, img_set_url, update) comes from the enclosing class
    # method this snippet was lifted from; it is not defined in this excerpt.
    if self.show_browser:
        browser = playwright.webkit.launch(headless=False)
    else:
        browser = playwright.webkit.launch(headless=True)
    context = browser.new_context(viewport={'width': 1280, 'height': 700})
    page = context.new_page()
    img_sequence_num = 1
    for page_count in range(1, 999):
        goto_url = self.img_set_url + pages.format(page_count)
        try:
            page.goto(goto_url, timeout=5000)
        except Exception as e:
            print(e)
            print(f'Page load failed: url is {goto_url}')
        # Check whether the current page is a 404; if so, the previous page was the last one.
        try:
            page.wait_for_selector(not_find_page_selector, state="attached", timeout=2000)
            print(f'Total pages: {page_count - 1}')
            break
        except Exception:
            pass
        if page_count == 1:
            # Grab the pack title and sanitise it for use in file names
            page.wait_for_selector(title_selector, state="attached", timeout=10000)
            title = page.query_selector(title_selector).inner_text()
            for char in clean_string_seed:
                title = title.replace(char, '')
            title = title.replace(' ', '_')
        # Total number of images in the pack (header text such as "512 icons")
        img_count = page.query_selector(img_count_selector).inner_text()
        img_count = int(img_count.split(' ')[0])
        for i in range(1, img_count + 1):
            # Select the <img> element of the i-th icon in the grid
            elements = page.query_selector_all(selector.format(i))
            # Read its src attribute and swap the 128px thumbnail for the 512px version
            for element in elements:
                src = element.get_attribute('src')
                if src:
                    src = src.replace('/128/', '/512/')
                    suffix = src.split('.')[-1]
                    sequence = str(img_sequence_num).zfill(3)
                    urls.append({
                        'url': src,
                        'img': f'{title}_{sequence}',
                        'suffix': suffix
                    })
                    img_sequence_num += 1
                break
    print(f'All image URLs have been obtained. Total images: {len(urls)}')
    page.close()
    browser.close()

# ---------------------------------- process data ----------------------------------
save_line = []
if urls:
    n = 1
    for data in urls:
        # (0, 0, vals) triples are one2many "create" commands (Odoo-style line records)
        save_line += [[0, 0, {
            'serial': n,
            'url': data['url'],
            'name': data['img'],
            'image_suffix': data['suffix']
        }]]
        n += 1
    self.update({
        'file_title': title,
        'line_ids': save_line,
        'image_count': len(urls)
    })
print('done!')
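
httpx is imported at the top of the script but never used in this excerpt, so the actual download step presumably lives elsewhere. As a rough sketch of that missing step (the `download_images` helper and its arguments are assumptions, not part of the original code), the collected `urls` entries could be fetched like this:

```python
import os
import httpx

def download_images(urls, file_path):
    """Download each collected icon; file names follow the
    '<title>_<sequence>.<suffix>' pattern built by the scraper."""
    os.makedirs(file_path, exist_ok=True)
    with httpx.Client(timeout=30, follow_redirects=True) as client:
        for item in urls:
            target = os.path.join(file_path, f"{item['img']}.{item['suffix']}")
            response = client.get(item['url'])
            response.raise_for_status()
            with open(target, 'wb') as fh:
                fh.write(response.content)
```

The `(0, 0, vals)` commands and the `self.update()` call in the final block follow the Odoo-style convention for writing one2many line records, which suggests this snippet was lifted from a larger model method rather than being a standalone script.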