# test01.py (2.1 KB)
  1. # -*- coding: UTF-8 -*-
  2. import re
  3. import urllib.parse
  4. import httpx
  5. url = "https://www.fantasyfactory.xyz"
  6. headers = {
  7. 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
  8. }
  9. response = httpx.get(url, headers=headers)
  10. re_inter_urls = re.findall('<td class="fb-n"><a href="(.*?)"', response.text)
  11. replacements = {
  12. ".": "_",
  13. "/": "",
  14. }
  15. all_jpg = {}
  16. for u in re_inter_urls:
  17. inter_url = url + u
  18. inter_response = httpx.get(inter_url, headers=headers)
  19. re_inter_response = re.findall('<td class="fb-n"><a href="(.*?)"', inter_response.text)
  20. for j in re_inter_response:
  21. if j != re_inter_response[0]:
  22. if "jpg" in j:
  23. for old, new in replacements.items():
  24. u = u.replace(old, new)
  25. print(url + j)
  26. if u not in all_jpg:
  27. all_jpg[u] = [url + j]
  28. else:
  29. all_jpg[u].append(url + j)
  30. else:
  31. jj = urllib.parse.unquote(j)
  32. inter_inter_response = url + jj
  33. inter_inter_jpg_response = httpx.get(inter_inter_response, headers=headers)
  34. re_inter_inter_jpg_response = re.findall('<td class="fb-n"><a href="(.*?)"',
  35. inter_inter_jpg_response.text)
  36. for inter_jpg in re_inter_inter_jpg_response:
  37. if inter_jpg != re_inter_inter_jpg_response[0]:
  38. if "jpg" in inter_jpg:
  39. jj_file = jj.split('/')[2]
  40. for old, new in replacements.items():
  41. jj_file = jj_file.replace(old, new)
  42. print(url + inter_jpg)
  43. if jj_file not in all_jpg:
  44. all_jpg[jj_file] = [url + inter_jpg]
  45. else:
  46. all_jpg[jj_file].append(url + inter_jpg)
  47. with open('test.txt', 'w') as f:
  48. f.write(str(all_jpg))