main.py

# -*- coding: utf-8 -*-
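"""Scrape icon-pack pages with Playwright, collect the 512px image URL of every
icon, and download them with httpx into ./download/<pack title>/. Pack URLs are
read from target_link.txt, one per line."""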
import sys
import os
import time
import random

# Make the project root (ResourceCollection) importable regardless of the working directory.
sys.path.append(os.path.abspath(__file__).split('ResourceCollection')[0] + 'ResourceCollection')

import httpx
from playwright.sync_api import sync_playwright

# CSS selectors for the pack listing page and the icon detail page
# (img_selector is defined but not referenced elsewhere in this script).
title_selector = '#pack-view__inner > section.pack-view__header > h1'
selector = '#pack-view__inner > section.search-result > ul > li:nth-child({}) > div > a > img'
img_selector = '#detail > div > div.row.detail__top.mg-none > section > div > div > div.row.row--vertical-center.mg-none.full-height.detail__icon__inner > div > div > img'
img_count_selector = '#pack-view__inner > section.pack-view__header > p'
not_find_page_selector = '#viewport > div.errorpage.e404 > h1'


def main(target_urls):
    all_data = {}
    # First pass: open every pack page and collect its image URLs, download folder and title.
    for target_url in target_urls:
        urls, file_path, title = open_browser(target_url)
        all_data[title] = [urls, file_path, title]
    # Second pass: download each pack, retrying until every image is on disk.
    for data in all_data:
        urls = all_data[data][0]
        file_path = all_data[data][1]
        title = all_data[data][2]
        while True:
            if download_img(urls, file_path):
                print(f'All images have been downloaded: {title}')
                break
            else:
                print(f'Some images have not been downloaded, continue downloading {title}')
        print('\n\n')
    print('All Done')


def open_browser(target_url):
    pages = '/{}'  # page-number suffix appended to the pack URL
    urls = []
    file_path = ''  # folder the images will be saved into
    title = ''      # title of the current pack page
    with sync_playwright() as playwright:
        browser = playwright.webkit.launch(
            headless=True,
            proxy={"server": "http://127.0.0.1:7890"}
        )
        context = browser.new_context(viewport={'width': 1280, 'height': 700})
        page = context.new_page()
        img_sequence_num = 1
        for page_count in range(1, 999):
            try:
                goto_url = target_url + pages.format(page_count)
                page.goto(goto_url, timeout=5000)
            except Exception as e:
                print(e)
                print(f'Page load failed: url is : {goto_url}')
            # Check whether the current page is a 404; if so, the previous page was the last one.
            try:
                page.wait_for_selector(not_find_page_selector, state="attached", timeout=2000)
                print(f'Total page is {page_count - 1} in url: {goto_url}')
                break
            except Exception:
                pass
            if page_count == 1:
                # Grab the pack title and the advertised icon count from the first page.
                page.wait_for_selector(title_selector, state="attached", timeout=10000)
                title = page.query_selector(title_selector).inner_text()
                img_count = page.query_selector(img_count_selector).inner_text()
                img_count = int(img_count.split(' ')[0])
                # Strip the 'Icon Pack ' prefix first, then characters that are invalid in
                # folder names (the prefix must go before spaces are removed, or it never matches).
                invalid_chars = ['Icon Pack ', '<', '>', ':', '"', '/', '\\', '|', '?', '*', '.', ' ']
                for char in invalid_chars:
                    title = title.replace(char, '')
                img_name = title.replace(' ', '_')
                current_path = os.getcwd()
                download_file_path = os.path.join(current_path, 'download')
                if not os.path.exists(download_file_path):
                    os.mkdir(download_file_path)
                file_path = os.path.join(download_file_path, title)
                if not os.path.exists(file_path):
                    os.mkdir(file_path)
            for i in range(1, img_count + 1):
                # Select the <img> element of the i-th icon on the current page.
                elements = page.query_selector_all(selector.format(i))
                # Take the src of the first match, switch it to the 512px variant and record it.
                for element in elements:
                    src = element.get_attribute('src')
                    if src:
                        src = src.replace('/128/', '/512/')
                        sequence = str(img_sequence_num).zfill(3)
                        urls.append({
                            'url': src,
                            'img': f'{img_name}_{sequence}.png'
                        })
                        img_sequence_num += 1
                        break
        print(f'All image URLs have been obtained. Total img {len(urls)}')
        page.close()
        browser.close()
    return urls, file_path, title


def download_img(urls, file_path):
    all_done = True
    print('Downloading pictures')
    for url in urls:
        target_img_url = url['url']
        img_png_name = url['img']
        target_img_name = os.path.join(file_path, img_png_name)
        # Skip the image if the PNG file already exists, i.e. it was downloaded earlier.
        if os.path.exists(target_img_name):
            print(f'The image {img_png_name} already exists. continue!')
            continue
        try:
            resp = httpx.get(target_img_url, headers={
                "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36"
            })
            resp.raise_for_status()  # treat non-2xx responses as failures so they get retried
            with open(target_img_name, 'wb') as f:
                f.write(resp.content)
            print(f'Downloaded: {img_png_name}')
            time.sleep(random.uniform(1, 2))
        except Exception as e:
            print(f'\nFailed to download image: {target_img_url}. err: {e}\n')
            time.sleep(random.uniform(3, 5))
            all_done = False
    return all_done


if __name__ == "__main__":
    txt_file_name = 'target_link.txt'
    if not os.path.exists(txt_file_name):
        # Create an empty template file and ask the user to fill it in.
        with open(txt_file_name, 'w') as file:
            file.write('')
        print('Need to fill in the target link in target_link.txt')
        sys.exit(0)
    else:
        with open(txt_file_name, 'r') as f:
            targets = [target.strip() for target in f.readlines() if target.strip()]
        if not targets:
            print('No target link found in target_link.txt')
            sys.exit(0)
        print(f'target link is : {targets}')
        main(targets)
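
# Usage sketch (the URLs below are placeholders, not taken from this project):
# put one icon-pack URL per line in target_link.txt; the scraper appends
# '/1', '/2', ... to each URL to walk the pack's pages.
#
#   https://example.com/icon-packs/some-pack
#   https://example.com/icon-packs/another-pack
#
# Note: open_browser launches WebKit through a local proxy at http://127.0.0.1:7890;
# adjust or remove the proxy setting if you do not run one.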