# kaizty_spider.py — crawls a kaizty.com photo gallery and saves its images.
import os.path
import re
import random
import time
import httpx

# Gallery-specific slug segment of the listing URL (opaque id + .html?).
url_key = 'UHh0dkRPOWwyV2R2V0ZFU3hMRFZaZz09.html?'
# Path component for photo listing pages.
url_photos = '/photos/'
# Site root.
base_url = 'https://www.kaizty.com/'
# Query-string template for pagination; filled with the page number.
url_page = 'page={}'
# Desktop Edge User-Agent so requests look like a normal browser.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0'
}
  13. def get_pages():
  14. title = ''
  15. all_img_list = []
  16. error_times = 0
  17. max_error_times = 2
  18. page = 1
  19. while True:
  20. if error_times >= max_error_times:
  21. break
  22. print('正在获取第 {} 页数据'.format(page))
  23. url = base_url + url_photos + url_key + url_page.format(page)
  24. page += 1
  25. response = httpx.get(url, headers=headers)
  26. response.encoding = 'utf-8'
  27. html = response.text
  28. target_block = re.findall('<\!\[endif\]--><title>(.*?)<meta property="og:locale"', html)
  29. if not target_block:
  30. continue
  31. target_block = target_block[0]
  32. if not title:
  33. re_title = re.findall('(.*?)\| Page', target_block)
  34. if not re_title:
  35. print('获取 title 失败')
  36. error_times += 1
  37. continue
  38. re_title = re_title[0]
  39. title = re.sub(r'[<>:"/\\|?*]', '', re_title)
  40. title = title.replace(' ', '')
  41. img_list = re.findall('<meta itemprop="image" content="(.*?)"', target_block)
  42. if not img_list:
  43. print('获取图片链接失败, 第{}页'.format(page))
  44. error_times += 1
  45. continue
  46. all_img_list += img_list
  47. # time.sleep(random.uniform(2, 3))
  48. return all_img_list, title
  49. def get_imgs(all_img_list, title):
  50. print('\n\n开始保存图片')
  51. current_directory = os.getcwd()
  52. if not os.path.exists(title):
  53. os.mkdir(title)
  54. img_dir = os.path.join(current_directory, title)
  55. files = [f for f in os.listdir(img_dir) if os.path.isfile(os.path.join(img_dir, f))]
  56. now_last_num = 1
  57. if files:
  58. now_last_num = int(files[-1].split('.')[0])
  59. for n in range(now_last_num, len(all_img_list)):
  60. img = httpx.get(all_img_list[n], headers=headers)
  61. if not img.status_code == 200:
  62. print('请求图片错误, 程序退出')
  63. raise Exception(f'状态码 {img.status_code}')
  64. file_name = f"{n:04d}" + "." + all_img_list[n].split(".")[-1]
  65. print('正在保存图片: {}'.format(file_name))
  66. with open(title + "/" + file_name, "wb") as f:
  67. f.write(img.content)
  68. # time.sleep(random.uniform(8, 10))
if __name__ == '__main__':
    # First pass: collect every image URL plus the gallery title.
    all_img_list, title = get_pages()
    # Retry-forever download loop: any exception raised by get_imgs (network
    # error, non-200 status) is printed, then we back off 30-40 s and try
    # again; get_imgs resumes from files already on disk.
    while True:
        try:
            get_imgs(all_img_list, title)
        except Exception as e:
            print(e)
            time.sleep(random.uniform(30, 40))
            continue
        else:
            print("程序执行完成,退出循环")
            break
    print("done")