main.py 3.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899
# -*- coding: utf-8 -*-
import os
import re
import sys
import time

# Put the directory path up to and including 'ResourceCollection' on sys.path
# (presumably the project root); kept ahead of the third-party imports as in
# the original ordering.
sys.path.append(os.path.join(os.path.abspath(__file__).split('ResourceCollection')[0] + 'ResourceCollection'))

import httpx
from playwright.sync_api import sync_playwright
  8. target_url = 'https://www.flaticon.com/packs/summer-watermelon-17517790'
  9. title_selector = '#pack-view__inner > section.pack-view__header > h1'
  10. selector = '#pack-view__inner > section.search-result > ul > li:nth-child({}) > div > a > img'
  11. img_selector = '#detail > div > div.row.detail__top.mg-none > section > div > div > div.row.row--vertical-center.mg-none.full-height.detail__icon__inner > div > div > img'
  12. img_count_selector = '#pack-view__inner > section.pack-view__header > p'
  13. def main():
  14. with sync_playwright() as playwright:
  15. browser = playwright.webkit.launch(
  16. headless=True,
  17. proxy={"server": "http://127.0.0.1:7897"}
  18. )
  19. context = browser.new_context(viewport={'width': 1280, 'height': 700})
  20. page = context.new_page()
  21. page.goto(target_url)
  22. # 获取title
  23. page.wait_for_selector(title_selector, state="attached", timeout=10000)
  24. title = page.query_selector(title_selector).inner_text()
  25. img_count = page.query_selector(img_count_selector).inner_text()
  26. img_count = int(img_count.split(' ')[0])
  27. invalid_chars = ['<', '>', ':', '"', '/', '\\', '|', '?', '*', '.', ' ', 'Icon Pack ']
  28. for char in invalid_chars:
  29. title = title.replace(char, '')
  30. img_name = title.replace(' ', '_')
  31. current_path = os.getcwd()
  32. download_file_path = os.path.join(current_path, 'download')
  33. if not os.path.exists(download_file_path):
  34. os.mkdir(download_file_path)
  35. file_path = os.path.join(download_file_path, title)
  36. if not os.path.exists(file_path):
  37. os.mkdir(file_path)
  38. # 第一层 url
  39. urls = []
  40. for i in range(1, img_count + 1):
  41. # 选择所有的<a>标签
  42. elements = page.query_selector_all(selector.format(i))
  43. # 遍历所有<a>标签,提取href属性
  44. for element in elements:
  45. src = element.get_attribute('src')
  46. if src:
  47. src = src.replace('/128/', '/512/')
  48. sequence = str(i).zfill(2)
  49. urls.append({
  50. 'url': src,
  51. 'img': f'{img_name}_{sequence}.png'
  52. })
  53. print('已获取所有图片url')
  54. page.close()
  55. browser.close()
  56. print('正在下载图片')
  57. for url in urls:
  58. # 如果png文件存在, 即已经下载过, 直接跳过
  59. target_img_url = url['url']
  60. img_png_name = url['img']
  61. target_img_name = os.path.join(file_path, img_png_name)
  62. if os.path.exists(target_img_name):
  63. print(f'图片 {img_png_name} 已存在')
  64. continue
  65. try:
  66. resp = httpx.get(target_img_url, headers={
  67. "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36"
  68. })
  69. with open(target_img_name, 'wb') as f:
  70. f.write(resp.content)
  71. print(f'已下载: {img_png_name}')
  72. time.sleep(1)
  73. except Exception as e:
  74. print(e)
  75. print(f'{title} : 已下载完成')
  76. if __name__ == "__main__":
  77. main()