# step0_get_flaticon_all_url.py
# -*- coding: utf-8 -*-
"""Step 0: collect sticker-pack URLs from flaticon.com listing pages.

Requests the listing pages ``https://www.flaticon.com/stickers-packs/<n>``
for n = 151..200, extracts every ``data-pin-url="..."`` attribute value from
the returned HTML and writes the values, one per line, to
``url_file_0_flaticon_urls.txt``.
"""
import random
import re
import sys
import time

# Listing page URL; ``{}`` is replaced by the page number.
LISTING_URL = 'https://www.flaticon.com/stickers-packs/{}'

# File that receives the collected URLs, one per line.
OUTPUT_FILE = 'url_file_0_flaticon_urls.txt'

# Sent with every request.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
}

# Non-greedy capture of the value of each data-pin-url attribute.
PIN_URL_RE = re.compile(r'data-pin-url="(.*?)"')

# Attribute values containing this marker are discarded.
# NOTE(review): presumably an unrendered template placeholder -- confirm.
TEMPLATE_MARKER = '{{link}}'


def extract_urls(page):
    """Return the ``data-pin-url`` values found in *page*, in document order.

    Args:
        page: HTML text of a listing page.

    Returns:
        A list of attribute values, excluding any value that contains the
        literal ``{{link}}``. Duplicates are kept.
    """
    return [u for u in PIN_URL_RE.findall(page) if TEMPLATE_MARKER not in u]


def collect_urls(first_page=151, last_page=200):
    """Fetch listing pages *first_page*..*last_page* and gather their URLs.

    A page that cannot be fetched (transport error or a response rejected by
    ``raise_for_status()``) is reported on stderr and skipped, so one bad
    request does not discard the URLs already collected.

    Args:
        first_page: Number of the first listing page to request.
        last_page: Number of the last listing page to request (inclusive).

    Returns:
        A list with the URLs of all successfully fetched pages, in page order.
    """
    # Imported here rather than at module level so that extract_urls() can be
    # imported and used on machines where httpx is not installed.
    import httpx

    all_urls = []
    # One client for the whole run: connections are reused between requests.
    with httpx.Client(headers=HEADERS) as client:
        for i in range(first_page, last_page + 1):
            url = LISTING_URL.format(i)
            try:
                resp = client.get(url)
                resp.raise_for_status()
            except httpx.HTTPError as exc:
                print('skipping {}: {}'.format(url, exc), file=sys.stderr)
            else:
                # Force UTF-8 decoding regardless of what the server declares.
                resp.encoding = 'utf-8'
                all_urls.extend(extract_urls(resp.text))
            # Random 1-2 s pause between page requests.
            time.sleep(random.uniform(1, 2))
    return all_urls


def write_urls(urls, path=OUTPUT_FILE):
    """Write *urls* to *path*, one per line, replacing any existing file."""
    with open(path, 'w', encoding='utf-8') as file:
        file.writelines(url + '\n' for url in urls)


def main():
    """Collect the URLs of pages 151-200 and save them to OUTPUT_FILE."""
    write_urls(collect_urls())


if __name__ == '__main__':
    main()