2
0

flaticon 3.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109
  import os
  import random
  import sys
  import time

  import httpx
  from playwright.sync_api import TimeoutError as PlaywrightTimeoutError
  from playwright.sync_api import sync_playwright
  # Name and root URL of the site this scraper targets.
  target_base_name = 'flaticon'
  target_base_url = 'https://www.flaticon.com'

  # CSS selectors used against the icon-pack pages.
  # Heading that holds the pack title.
  title_selector = '#pack-view__inner > section.pack-view__header > h1'
  # Thumbnail <img> of one search-result entry; `{}` is filled with the
  # 1-based <li> position via str.format().
  selector = '#pack-view__inner > section.search-result > ul > li:nth-child({}) > div > a > img'
  # <img> on an icon detail page.
  img_selector = '#detail > div > div.row.detail__top.mg-none > section > div > div > div.row.row--vertical-center.mg-none.full-height.detail__icon__inner > div > div > img'
  # Header paragraph whose first space-separated token is parsed as the image count.
  img_count_selector = '#pack-view__inner > section.pack-view__header > p'
  # Heading that is only present on the 404 error page.
  not_find_page_selector = '#viewport > div.errorpage.e404 > h1'

  # Substrings removed from the pack title, applied one after another in list
  # order. The first entries are characters that are illegal in file names.
  # NOTE(review): ' ' is removed before 'Icon Pack ' is tried, so by then the
  # title contains no spaces and 'Icon Pack ' can never match — confirm the
  # intended order before changing it, since it affects the generated names.
  clean_string_seed = ['<', '>', ':', '"', '/', '\\', '|', '?', '*', '.', ' ', 'Icon Pack ']
  all_data = {}
  # ---------------------------------- open_browser -------------------------------------------
  # Walk the numbered pages of the icon pack (<img_set_url>/1, /2, ...) until
  # the 404 page shows up, collecting one record per thumbnail into `urls`:
  #     {'url': <512px image URL>, 'img': '<title>_<NNN>', 'suffix': <extension>}
  # NOTE(review): the pasted source lost its indentation; the nesting below is
  # a reconstruction. In particular, the image-count read is placed under the
  # first-page branch and the `break` under `if src:` — confirm against the
  # original file.
  pages = '/{}'
  urls = []
  file_path = ''  # folder where the images are stored
  title = ''  # title of the current page
  with sync_playwright() as playwright:
      # Show a visible browser window only when the record asks for it.
      browser = playwright.webkit.launch(headless=not self.show_browser)
      context = browser.new_context(viewport={'width': 1280, 'height': 700})
      page = context.new_page()
      img_sequence_num = 1
      for page_count in range(1, 999):
          # Built outside the try so the failure message below can always
          # reference it.
          goto_url = self.img_set_url + pages.format(page_count)
          try:
              page.goto(goto_url, timeout=5000)
          except Exception as e:
              # Best effort: a slow or failed navigation is reported, and the
              # checks below still run on whatever the page currently shows.
              print(e)
              print(f'Page load failed: url is : {goto_url}')

          # Check whether the current page is the 404 page, i.e. we walked
          # past the last page of the pack.
          try:
              page.wait_for_selector(not_find_page_selector, state="attached", timeout=2000)
          except PlaywrightTimeoutError:
              # No 404 marker appeared within the timeout: a regular page.
              pass
          else:
              print(f'Total page is {page_count - 1}')
              break

          if page_count == 1:
              # Read the pack title and turn it into a file-name-safe string.
              page.wait_for_selector(title_selector, state="attached", timeout=10000)
              title = page.query_selector(title_selector).inner_text()
              for char in clean_string_seed:
                  title = title.replace(char, '')
              title = title.replace(' ', '_')
              # Total number of images in the pack: the first
              # space-separated token of the header paragraph.
              img_count = page.query_selector(img_count_selector).inner_text()
              img_count = int(img_count.split(' ')[0])

          for i in range(1, img_count + 1):
              # Thumbnail <img> element(s) of the i-th list entry; empty when
              # this page has fewer than i entries.
              elements = page.query_selector_all(selector.format(i))
              for element in elements:
                  src = element.get_attribute('src')
                  if src:
                      # Swap the 128px thumbnail path for the 512px rendition.
                      src = src.replace('/128/', '/512/')
                      suffix = src.split('.')[-1]
                      sequence = str(img_sequence_num).zfill(3)
                      urls.append({
                          'url': src,
                          'img': f'{title}_{sequence}',
                          'suffix': suffix
                      })
                      img_sequence_num += 1
                      # One image per list entry is enough.
                      break
      print(f'All image URLs have been obtained. Total img {len(urls)}')
      page.close()
      browser.close()
  # -------------------------------------------------------------- process data --------------------------------------------------------------
  # Turn every collected image record into a `[0, 0, values]` triple, numbered
  # from 1 in collection order.
  # NOTE(review): the triples look like ORM "create line" commands for the
  # `line_ids` field — confirm against the model definition.
  save_line = [
      [0, 0, {
          'serial': serial,
          'url': data['url'],
          'name': data['img'],
          'image_suffix': data['suffix']
      }]
      for serial, data in enumerate(urls, start=1)
  ]
  # NOTE(review): the pasted source lost its indentation; the write-back is
  # assumed to happen only when at least one image was found — confirm.
  if urls:
      self.update({
          'file_title': title,
          'line_ids': save_line,
          'image_count': len(urls)
      })
  print('done!')