| 1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798 |
- # -*- coding: utf-8 -*-
- import sys
- import os
- import time
- sys.path.append(os.path.join(os.path.abspath(__file__).split('ResourceCollection')[0] + 'ResourceCollection'))
- import httpx
- from playwright.sync_api import sync_playwright
# Pack page that main() opens in the browser.
target_url = 'https://www.flaticon.com/packs/summer-watermelon-17517790'

# Heading element whose text becomes the download folder / file-name stem.
title_selector = '#pack-view__inner > section.pack-view__header > h1'

# Thumbnail <img> of the i-th result entry; main() fills the placeholder
# with a 1-based index via str.format().
selector = (
    '#pack-view__inner > section.search-result > ul'
    ' > li:nth-child({}) > div > a > img'
)

# <img> selector for a detail view. Not referenced by the code visible in
# this file -- NOTE(review): confirm whether it is still needed.
img_selector = (
    '#detail > div > div.row.detail__top.mg-none > section > div > div'
    ' > div.row.row--vertical-center.mg-none.full-height.detail__icon__inner'
    ' > div > div > img'
)

# Header paragraph; main() parses its first space-separated token as the
# number of icons in the pack.
img_count_selector = '#pack-view__inner > section.pack-view__header > p'
def _sanitize_title(raw_title):
    """Return the pack heading reduced to a string usable as a directory name.

    The 'Icon Pack' label is removed first, then every character in the
    forbidden set (including spaces and dots) is dropped.
    """
    # Order matters: once spaces are stripped the label collapses to
    # 'IconPack', so it has to be removed before the per-character pass.
    cleaned = raw_title.replace('Icon Pack', '')
    for char in ('<', '>', ':', '"', '/', '\\', '|', '?', '*', '.', ' '):
        cleaned = cleaned.replace(char, '')
    return cleaned


def _collect_image_urls(page, img_count, img_name):
    """Read the thumbnail ``src`` of each result entry and build download jobs.

    Returns a list of ``{'url': ..., 'img': ...}`` dicts, one per <img> that
    has a non-empty ``src``. The '/128/' path segment of each URL is rewritten
    to '/512/' (presumably the larger rendition -- confirm against the site).
    """
    urls = []
    for i in range(1, img_count + 1):
        # The selector targets the <img> inside the i-th <li>; its src
        # attribute (not an href) is what gets downloaded.
        for element in page.query_selector_all(selector.format(i)):
            src = element.get_attribute('src')
            if not src:
                continue
            urls.append({
                'url': src.replace('/128/', '/512/'),
                'img': f'{img_name}_{str(i).zfill(2)}.png',
            })
    return urls


def _scrape_pack():
    """Open the pack page in a headless browser and return ``(title, urls)``.

    Raises:
        RuntimeError: if the title or icon-count element is missing.
        ValueError: if the icon-count text does not start with an integer.
    """
    with sync_playwright() as playwright:
        browser = playwright.webkit.launch(
            headless=True,
            proxy={"server": "http://127.0.0.1:7890"}
        )
        # try/finally so the browser process is released even when the page
        # fails to load or a selector is missing.
        try:
            context = browser.new_context(viewport={'width': 1280, 'height': 700})
            page = context.new_page()
            page.goto(target_url)

            # Wait for the heading before reading anything from the page.
            page.wait_for_selector(title_selector, state="attached", timeout=10000)
            title_element = page.query_selector(title_selector)
            count_element = page.query_selector(img_count_selector)
            if title_element is None or count_element is None:
                raise RuntimeError(
                    f'pack title or icon count not found on {target_url}'
                )

            title = _sanitize_title(title_element.inner_text())
            # The count text is expected to start with the number of icons.
            img_count = int(count_element.inner_text().split(' ')[0])

            # The sanitized title contains no spaces, so it is used as the
            # file-name stem unchanged.
            urls = _collect_image_urls(page, img_count, title)
            print('已获取所有图片url')
        finally:
            browser.close()
    return title, urls


def _download_images(urls, file_path):
    """Fetch every job in *urls* into *file_path*, skipping files that exist.

    Failures are printed and the loop moves on to the next image; nothing is
    written to disk for a failed request.
    """
    headers = {
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36"
    }
    for url in urls:
        target_img_url = url['url']
        img_png_name = url['img']
        target_img_name = os.path.join(file_path, img_png_name)
        # An existing file means the image was downloaded on a previous run.
        if os.path.exists(target_img_name):
            print(f'图片 {img_png_name} 已存在')
            continue
        try:
            resp = httpx.get(target_img_url, headers=headers)
            # Without this check an error page would be stored as a .png and
            # then treated as "already downloaded" on every later run.
            resp.raise_for_status()
            with open(target_img_name, 'wb') as f:
                f.write(resp.content)
        except Exception as e:
            # Best effort: report the problem and continue with the rest.
            print(e)
            continue
        print(f'已下载: {img_png_name}')
        # Pause between successful downloads to avoid hammering the server.
        time.sleep(1)


def main():
    """Download all icons of the pack at ``target_url``.

    Images are stored under ``<cwd>/download/<sanitized pack title>/`` as
    ``<title>_<NN>.png``. Progress messages are printed to stdout.
    """
    title, urls = _scrape_pack()

    file_path = os.path.join(os.getcwd(), 'download', title)
    # Creates both 'download' and the pack folder; no error if they exist.
    os.makedirs(file_path, exist_ok=True)

    print('正在下载图片')
    _download_images(urls, file_path)
    print(f'{title} : 已下载完成')


if __name__ == "__main__":
    main()
|