| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105 |
import os.path
import random
import re
import time
from urllib.parse import urlsplit

import httpx
# --- Target gallery -------------------------------------------------------
# Site root; note that it already ends with a slash.
base_url = 'https://www.kaizty.com/'
# Path segment under which galleries live; note that it starts with a slash.
url_photos = '/photos/'
# Gallery identifier followed by '.html?' so a query string can be appended.
url_key = 'UHh0dkRPOWwyV2R2V0ZFU3hMRFZaZz09.html?'
# Query-string template; the placeholder receives the page number.
url_page = 'page={}'

# --- HTTP request settings ------------------------------------------------
# Request headers sent with every httpx call in this file.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0'
    ),
}
def get_pages():
    """Crawl the gallery page by page and collect every image URL.

    Pages are requested in ascending order starting at page 1. From each
    response the text between the ``<title>`` tag and the ``og:locale`` meta
    tag is extracted; the gallery title is parsed from it once, and the
    ``<meta itemprop="image">`` URLs are collected from it on every page.

    Every page that yields nothing usable counts as one error, and the crawl
    stops as soon as ``max_error_times`` errors have accumulated. There is no
    other stop condition, so the crawl always ends through failed pages.

    Returns:
        tuple[list[str], str]: the image URLs in page order, and the gallery
        title with spaces and file-name-hostile characters removed. The
        title is ``''`` when it could not be parsed from any page.
    """
    title = ''
    all_img_list = []
    error_times = 0
    max_error_times = 2
    page = 1

    # base_url ends with '/' and url_photos starts with '/'; drop one of them
    # so the request path does not contain '//'.
    gallery_url = base_url.rstrip('/') + url_photos + url_key

    # Raw strings: '\!', '\[' and '\|' are not valid escapes in a normal
    # string literal and trigger warnings on current Python versions.
    block_pattern = re.compile(
        r'<\!\[endif\]--><title>(.*?)<meta property="og:locale"')
    title_pattern = re.compile(r'(.*?)\| Page')
    image_pattern = re.compile(r'<meta itemprop="image" content="(.*?)"')

    while error_times < max_error_times:
        # Remember the page being fetched so messages report the right one
        # even though the counter is advanced immediately.
        current_page = page
        page += 1

        print('正在获取第 {} 页数据'.format(current_page))
        response = httpx.get(gallery_url + url_page.format(current_page),
                             headers=headers)
        response.encoding = 'utf-8'
        html = response.text

        block_match = block_pattern.search(html)
        if block_match is None:
            # A page without the expected block must count as an error;
            # otherwise the loop would never terminate once the site stops
            # returning gallery pages.
            print('获取图片链接失败, 第{}页'.format(current_page))
            error_times += 1
            continue
        target_block = block_match.group(1)

        if not title:
            title_match = title_pattern.search(target_block)
            if title_match is None:
                print('获取 title 失败')
                error_times += 1
                continue
            # The title is later used as a directory name: strip characters
            # that Windows does not allow in file names, then spaces.
            title = re.sub(r'[<>:"/\\|?*]', '', title_match.group(1))
            title = title.replace(' ', '')

        img_list = image_pattern.findall(target_block)
        if not img_list:
            print('获取图片链接失败, 第{}页'.format(current_page))
            error_times += 1
            continue
        all_img_list += img_list
        # NOTE: requests are sent back to back; add a time.sleep(...) here
        # if the server starts rate-limiting.

    return all_img_list, title
def get_imgs(all_img_list, title):
    """Download every image in ``all_img_list`` into a directory named ``title``.

    The directory is created inside the current working directory when it
    does not exist yet. Image ``n`` (0-based position in ``all_img_list``) is
    stored as ``<n zero-padded to 4 digits><extension>``.

    The function is resumable: files already present are skipped, except for
    the highest-numbered one, which is downloaded again because a previous
    run may have been interrupted while writing it. Files whose name does
    not start with a number are ignored.

    Args:
        all_img_list: Image URLs, in the order they should be numbered.
        title: Name of the target directory (relative to the working
            directory). Must not be empty.

    Raises:
        ValueError: If ``title`` is empty.
        RuntimeError: If the server answers an image request with a status
            code other than 200.
    """
    print('\n\n开始保存图片')
    if not title:
        # An empty title would resolve to the working directory itself.
        raise ValueError('title 为空, 无法创建保存目录')

    img_dir = os.path.join(os.getcwd(), title)
    os.makedirs(img_dir, exist_ok=True)

    # Indexes that already have a file on disk. os.listdir() returns entries
    # in arbitrary order, so the resume point is derived from the numbers
    # themselves rather than from the last list element.
    saved_indexes = set()
    for entry in os.listdir(img_dir):
        stem = entry.split('.')[0]
        if stem.isdecimal() and os.path.isfile(os.path.join(img_dir, entry)):
            saved_indexes.add(int(stem))
    last_saved = max(saved_indexes, default=None)

    # Start at index 0 so the first image is not skipped.
    for n, img_url in enumerate(all_img_list):
        if n in saved_indexes and n != last_saved:
            continue

        img = httpx.get(img_url, headers=headers)
        if img.status_code != 200:
            print('请求图片错误, 程序退出')
            raise RuntimeError(f'状态码 {img.status_code}')

        # Take the extension from the URL path only, so a query string or a
        # dot in the host name cannot end up in the file name.
        extension = os.path.splitext(urlsplit(img_url).path)[1]
        file_name = f"{n:04d}{extension}"
        print('正在保存图片: {}'.format(file_name))
        with open(os.path.join(img_dir, file_name), "wb") as f:
            f.write(img.content)
        # NOTE: downloads are issued back to back; add a time.sleep(...)
        # here if the server starts rate-limiting.
if __name__ == '__main__':
    # Step 1: collect all image URLs and the gallery title.
    all_img_list, title = get_pages()

    # Without URLs or a title there is nothing to download, and the retry
    # loop below would fail the same way on every attempt. Exit instead.
    if not all_img_list or not title:
        raise SystemExit('未获取到图片链接或标题, 程序退出')

    # Step 2: download, retrying after a randomised pause whenever
    # get_imgs() raises. get_imgs() is called again with the same arguments
    # each time. The loop has no retry limit; KeyboardInterrupt is not an
    # Exception subclass, so Ctrl+C still stops the script.
    while True:
        try:
            get_imgs(all_img_list, title)
        except Exception as e:
            print(e)
            time.sleep(random.uniform(30, 40))
        else:
            print("程序执行完成,退出循环")
            break
    print("done")
|