kaizty_spider.py

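"""Scrape photo galleries from kaizty.com.

For each key in url_keys, walk the gallery page by page to collect image URLs,
then download them into a directory named after the gallery title, backing off
and retrying from the last saved file whenever a download fails.
"""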
import os
import random
import re
import time

import httpx  # third-party: pip install httpx
url_keys = [
    'L2lBQ200aE0vOVNmUGcydzhhT296Zz09',
    'RFFRQXFIZEhNeDNaV2txWjRlMk5xdz09',
]
url_photos = '/photos/'
base_url = 'https://www.kaizty.com/'
url_page = 'page={}'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0'
}
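
# With the '.html?' suffix appended in __main__, the page URL is assembled by
# plain concatenation, so a request for page 1 of the first key looks like this
# (note the doubled slash from base_url + url_photos, which most servers tolerate):
#   https://www.kaizty.com//photos/L2lBQ200aE0vOVNmUGcydzhhT296Zz09.html?page=1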

def get_pages(url_key):
    """Walk a gallery page by page, returning (image URLs, cleaned gallery title)."""
    title = ''
    all_img_list = []
    error_times = 0
    max_error_times = 2
    page = 1
    while True:
        if error_times >= max_error_times:
            break
        print('Fetching data for page {}'.format(page))
        url = base_url + url_photos + url_key + url_page.format(page)
        page += 1
        response = httpx.get(url, headers=headers)
        response.encoding = 'utf-8'
        html = response.text
        target_block = re.findall(r'<!\[endif\]--><title>(.*?)<meta property="og:locale"', html)
        if not target_block:
            # Count a parse miss as an error so the loop terminates once the
            # gallery runs out of pages.
            error_times += 1
            continue
        target_block = target_block[0]
        if not title:
            re_title = re.findall(r'(.*?)\| Page', target_block)
            if not re_title:
                print('Failed to get the title')
                error_times += 1
                continue
            re_title = re_title[0]
            # Strip characters that are invalid in file names.
            title = re.sub(r'[<>:"/\\|?*]', '', re_title)
            title = title.replace(' ', '')
        img_list = re.findall('<meta itemprop="image" content="(.*?)"', target_block)
        if not img_list:
            print('Failed to get image links on page {}'.format(page - 1))
            error_times += 1
            continue
        all_img_list += img_list
        time.sleep(random.uniform(2, 3))  # polite delay between page requests
    return all_img_list, title
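
# A hypothetical illustration (invented snippet, not real site HTML) of what the
# image regex above extracts — re.findall() with one group returns the matched
# content attributes as a list of strings:
#   >>> sample = '<meta itemprop="image" content="https://example.com/a.jpg">'
#   >>> re.findall('<meta itemprop="image" content="(.*?)"', sample)
#   ['https://example.com/a.jpg']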

def get_imgs(all_img_list, title):
    """Download every collected image into a directory named after the title."""
    print('\n\nStarting to save images')
    current_directory = os.getcwd()
    if not os.path.exists(title):
        os.mkdir(title)
    img_dir = os.path.join(current_directory, title)
    files = [f for f in os.listdir(img_dir) if os.path.isfile(os.path.join(img_dir, f))]
    # Resume from the highest-numbered file on disk (re-fetching it in case it
    # was only partially written); os.listdir() order is arbitrary, so sort first.
    now_last_num = 0
    if files:
        now_last_num = int(sorted(files)[-1].split('.')[0])
    for n in range(now_last_num, len(all_img_list)):
        img = httpx.get(all_img_list[n], headers=headers)
        if img.status_code != 200:
            print('Image request error, exiting')
            raise Exception(f'status code {img.status_code}')
        file_name = f'{n:04d}.' + all_img_list[n].split('.')[-1]
        print('Saving image: {}'.format(file_name))
        with open(os.path.join(img_dir, file_name), 'wb') as f:
            f.write(img.content)
        time.sleep(random.uniform(5, 8))  # polite delay between image requests
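
# For very large files, the whole-body read above could be swapped for httpx's
# streaming API so each image is written in chunks instead of buffered in memory.
# A minimal sketch (img_url and file_path stand in for the values used above):
#
#   with httpx.stream('GET', img_url, headers=headers) as resp:
#       with open(file_path, 'wb') as f:
#           for chunk in resp.iter_bytes():
#               f.write(chunk)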

if __name__ == '__main__':
    for url_key in url_keys:
        url_key = url_key + '.html?'
        all_img_list, title = get_pages(url_key)
        while True:
            try:
                get_imgs(all_img_list, title)
            except Exception as e:
                # Back off and retry; get_imgs() resumes from the last saved file.
                print(e)
                time.sleep(random.uniform(30, 40))
                continue
            else:
                print('Images saved, leaving retry loop')
                break
    print('done')
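
# Usage: `python kaizty_spider.py` with no arguments. Each gallery is saved to a
# directory named after its cleaned title; re-running the script resumes from the
# highest-numbered file already present in that directory.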