"""Download every image of one paginated photo album into a local folder.

The script first walks the album pages to collect the image URLs and the
album title, then saves the images into a directory named after the title,
resuming from the highest-numbered file already present.
"""
import os.path
import re
import random
import time

import httpx

url_key = 'UHh0dkRPOWwyV2R2V0ZFU3hMRFZaZz09.html?'
url_photos = '/photos/'
base_url = 'https://www.kaizty.com/'
url_page = 'page={}'
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0'
    )
}


def get_pages():
    """Collect the image URLs and the album title from consecutive pages.

    Pages are requested starting at 1 until ``max_error_times`` pages have
    failed to yield usable data (missing content block, missing title or no
    image links). Exceptions raised by ``httpx.get`` are not caught here and
    propagate to the caller.

    Returns:
        A ``(all_img_list, title)`` tuple: the list of image URLs in page
        order and the title with filesystem-unsafe characters and spaces
        removed. ``title`` is ``''`` if no page ever produced one.
    """
    title = ''
    all_img_list = []
    error_times = 0
    max_error_times = 2
    page = 0

    while error_times < max_error_times:
        page += 1
        print('正在获取第 {} 页数据'.format(page))
        url = base_url + url_photos + url_key + url_page.format(page)
        response = httpx.get(url, headers=headers)
        response.encoding = 'utf-8'
        html = response.text

        target_block = re.findall(
            r'<\!\[endif\]-->(.*?)<meta property="og:locale"', html)
        if not target_block:
            # Count this as a failure; otherwise the loop would never end
            # once the pages stop containing the expected block.
            print('未找到目标内容块, 第{}页'.format(page))
            error_times += 1
            continue
        target_block = target_block[0]

        if not title:
            re_title = re.findall(r'(.*?)\| Page', target_block)
            if not re_title:
                print('获取 title 失败')
                error_times += 1
                continue
            # Drop characters that are not allowed in Windows file names,
            # then drop spaces, so the title can be used as a directory name.
            title = re.sub(r'[<>:"/\\|?*]', '', re_title[0])
            title = title.replace(' ', '')

        img_list = re.findall(
            '<meta itemprop="image" content="(.*?)"', target_block)
        if not img_list:
            print('获取图片链接失败, 第{}页'.format(page))
            error_times += 1
            continue
        all_img_list += img_list
        # Optional throttle between page requests:
        # time.sleep(random.uniform(2, 3))

    return all_img_list, title


def _resume_index(img_dir):
    """Return the highest numeric file stem in *img_dir*, or 0 if none."""
    saved = []
    for name in os.listdir(img_dir):
        stem = name.split('.')[0]
        # Ignore entries that are not regular files or whose name does not
        # start with a plain number (they were not written by this script).
        if stem.isdecimal() and os.path.isfile(os.path.join(img_dir, name)):
            saved.append(int(stem))
    return max(saved, default=0)


def get_imgs(all_img_list, title):
    """Download the images in *all_img_list* into a directory named *title*.

    The directory is created under the current working directory if needed.
    Files are named ``<index:04d>.<suffix>`` where ``suffix`` is the text
    after the last ``.`` of the URL. Downloading restarts at the
    highest-numbered file already present, so that file (which may be
    incomplete) is fetched again.

    Args:
        all_img_list: Image URLs; the position in the list is the file number.
        title: Name of the target directory.

    Raises:
        ValueError: If *title* is empty.
        RuntimeError: If an image request returns a status other than 200.
    """
    if not title:
        raise ValueError('title 不能为空')

    print('\n\n开始保存图片')
    img_dir = os.path.join(os.getcwd(), title)
    os.makedirs(img_dir, exist_ok=True)

    start = _resume_index(img_dir)
    for n, img_url in enumerate(all_img_list[start:], start=start):
        img = httpx.get(img_url, headers=headers)
        if img.status_code != 200:
            print('请求图片错误, 程序退出')
            raise RuntimeError(f'状态码 {img.status_code}')

        file_name = f"{n:04d}" + "." + img_url.split(".")[-1]
        print('正在保存图片: {}'.format(file_name))
        with open(os.path.join(img_dir, file_name), "wb") as f:
            f.write(img.content)
        # Optional throttle between image requests:
        # time.sleep(random.uniform(8, 10))


if __name__ == '__main__':
    all_img_list, title = get_pages()
    if not all_img_list or not title:
        # Nothing to download; retrying get_imgs would fail forever.
        print('未获取到任何图片, 程序退出')
    else:
        # Retry until every image is saved; get_imgs resumes from the last
        # saved file, so each attempt continues where the previous one failed.
        while True:
            try:
                get_imgs(all_img_list, title)
            except Exception as e:
                print(e)
                time.sleep(random.uniform(30, 40))
            else:
                print("程序执行完成,退出循环")
                break
    print("done")