@@ -1,109 +1,291 @@
-import os.path
-import re
-import random
 import time
+import asyncio
+import random
+import re
+import json
+import os
+import concurrent.futures
 import httpx

-url_keys = [
-    'L2lBQ200aE0vOVNmUGcydzhhT296Zz09',
-    'RFFRQXFIZEhNeDNaV2txWjRlMk5xdz09'
-]
-url_photos = '/photos/'
-base_url = 'https://www.kaizty.com/'
-url_page = 'page={}'
+max_workers = 2
+proxies = "http://127.0.0.1:7890"
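+# Return True if <key>.json already exists anywhere under downloads/, i.e. this key was already scraped.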
+def check_urls_json_exists(key):
+    downloads_path = os.path.join(os.getcwd(), "downloads")
+    for root, dirs, files in os.walk(downloads_path):
+        if f"{key}.json" in files:
+            json_path = os.path.basename(root)
+            print(f'The JSON file already exists in {json_path}')
+            return True
+    return False

-headers = {
-    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0'
-}

+def check_and_load_keys():
+    # Read the keys from keys.txt
+    keys = []
+    keys_file = os.path.join(os.getcwd(), "keys.txt")
+    if not os.path.exists(keys_file):
+        print("keys.txt does not exist\nCreating a new keys.txt file.")
+        with open(keys_file, "w", encoding="utf-8") as f:
+            f.write("")
+        exit(0)

-def get_pages(url_key):
-    title = ''
-    all_img_list = []
+    with open(keys_file, "r", encoding="utf-8") as f:
+        keys = [line.strip() for line in f.readlines() if line.strip()]
+    if keys:
+        return keys
+    else:
+        print("keys.txt is empty\nPlease fill in the keys.")
+        exit(0)

-    error_times = 0
-    max_error_times = 2
-    page = 1

-    while True:
-        if error_times >= max_error_times:
-            break
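+# Fetch a single page with the shared AsyncClient; returns the body text, or None on an HTTP error.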
+async def fetch_page(client, url):
+    try:
+        response = await client.get(url)
+        response.raise_for_status()  # Check whether the request succeeded
+        return response.text
+    except httpx.HTTPError as e:
+        print(f"Request failed: {e}")
+        return None

-        print('Fetching data for page {}'.format(page))
-        url = base_url + url_photos + url_key + url_page.format(page)
-        page += 1
-
-        response = httpx.get(url, headers=headers)
-        response.encoding = 'utf-8'
-        html = response.text
-        target_block = re.findall('<\!\[endif\]--><title>(.*?)<meta property="og:locale"', html)
-        if not target_block:
-            continue
-        target_block = target_block[0]
-        if not title:
-            re_title = re.findall('(.*?)\| Page', target_block)
-            if not re_title:
-                print('Failed to get the title')
-                error_times += 1
-                continue
-            re_title = re_title[0]
-            title = re.sub(r'[<>:"/\\|?*]', '', re_title)
-            title = title.replace(' ', '')
-
-        img_list = re.findall('<meta itemprop="image" content="(.*?)"', target_block)
-        if not img_list:
-            print('Failed to get image links, page {}'.format(page))
-            error_times += 1
-            continue
-        all_img_list += img_list
-        time.sleep(random.uniform(2, 3))
+def extract_image_links(content):
+    # Extract the image links with a regular expression
+    pattern = r'<meta itemprop="image" content="(.*?)">'
+    image_links = re.findall(pattern, content)
+    return image_links
+
+
+def clean_folder_name(title):
+    # Clean the title so it only contains characters that are legal in Windows folder names
+    invalid_chars = r'[<>:"/\\|?*\x00-\x1F]'
+    title = re.sub(invalid_chars, '_', title)  # Replace illegal characters with underscores
+    title = title.replace(" ", "")  # Remove spaces
+    title = title.replace("_", "")  # Remove underscores
+    return title.strip()
+

-    return all_img_list, title
+async def get_urls(key):
+    # Check whether this key has already been scraped
+    is_exists = check_urls_json_exists(key)
+    if is_exists:
+        print(f"{key}.json already exists, skipping this key.")
+        return
+
+    base_url = f"https://www.kaizty.com/photos/{key}.html?page="
+    data = {}
+    folder_name = "default_folder"  # Default folder name
+    async with httpx.AsyncClient(proxies=proxies) as client:
+        n = 1
+        for page in range(1, 30):
+            url = base_url + str(page)
+            print(f"Crawling page: {url}")
+            content = await fetch_page(client, url)
+            if content is None:
+                print(f"Could not fetch the page content: {url}")
+                continue

-def get_imgs(all_img_list, title):
-    print('\n\nStarting to save images')
+            # Check whether the page content is empty
+            if "EMPTY" in content:
+                print("Page content is empty, stopping the crawl.")
+                break

-    current_directory = os.getcwd()
+            # Get the title (only on the first page)
+            if page == 1:
+                title_pattern = r'<title>(.*?)</title>'
+                title_match = re.search(title_pattern, content)
+                if title_match:
+                    title = title_match.group(1)
+                    folder_name = clean_folder_name(title)
+                    print(f"Page title: {title}")
+                    print(f"Cleaned folder name: {folder_name}")
+                else:
+                    print("Could not get the page title, using the default folder name.")

-    if not os.path.exists(title):
-        os.mkdir(title)
+            # Extract the image links
+            image_links = extract_image_links(content)
+            if image_links:
+                print(f"Found image links on page {url}:")
+                for link in image_links:
+                    print(link)
+                    prefix = str(n).zfill(3)
+                    suffix = link.split('.')[-1]
+                    img_name = f'{prefix}.{suffix}'
+                    data[img_name] = link
+                    n += 1
+            else:
+                print(f"No image links found on page {url}.")

-    img_dir = os.path.join(current_directory, title)
-    files = [f for f in os.listdir(img_dir) if os.path.isfile(os.path.join(img_dir, f))]
+    # Create the folder and save the data
+    downloads_path = os.path.join(os.getcwd(), "downloads")
+    if not os.path.exists(downloads_path):
+        os.makedirs(downloads_path)
+        print("Created the downloads folder.")

-    now_last_num = 1
-    if files:
-        now_last_num = int(files[-1].split('.')[0])
+    folder_path = os.path.join(downloads_path, folder_name)
+    if not os.path.exists(folder_path):
+        os.makedirs(folder_path)
+        print(f"Created folder: {folder_path}")

-    for n in range(now_last_num, len(all_img_list)):
-        img = httpx.get(all_img_list[n], headers=headers)
+    data_file_path = os.path.join(folder_path, f"{key}.json")
+    with open(data_file_path, "w", encoding="utf-8") as f:
+        json.dump(data, f, ensure_ascii=False, indent=4)
+    print(f"Data saved to {data_file_path}")

-        if not img.status_code == 200:
-            print('Image request failed, exiting')
-            raise Exception(f'Status code {img.status_code}')
+    return [folder_name, data_file_path]

-        file_name = f"{n:04d}" + "." + all_img_list[n].split(".")[-1]
-        print('Saving image: {}'.format(file_name))
-        with open(title + "/" + file_name, "wb") as f:
-            f.write(img.content)
-        time.sleep(random.uniform(5, 8))

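+# Walk downloads/ for saved JSON manifests and collect [local path, URL] pairs for images not yet on disk.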
+def load_imgs_url_and_patn():
+    result = []
+    downloads_path = os.path.join(os.getcwd(), "downloads")
+    for root, dirs, files in os.walk(downloads_path):
+        for file in files:
+            if file.endswith(".json"):
+                json_path = os.path.join(root, file)
+                with open(json_path, "r", encoding="utf-8") as f:
+                    data = json.load(f)
+                for img_name, img_url in data.items():
+                    img_path = os.path.join(root, img_name)
+                    if not os.path.exists(img_path):
+                        result.append([img_path, img_url])
+    return result

-if __name__ == '__main__':
-    for url_key in url_keys:
-        url_key = url_key + '.html?'
-        all_img_list, title = get_pages(url_key)

-        while True:
-            try:
-                get_imgs(all_img_list, title)
-            except Exception as e:
-                print(e)
-                time.sleep(random.uniform(30, 40))
-                continue
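+# Download one image through the shared client; on an HTTP status error it rotates the Clash proxy, and on 429 it honors Retry-After before retrying.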
+def save_img(client, img_path, img_url, max_retries=999):
+    retries = 0
+    headers = {
+        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+        "Accept-Encoding": "gzip, deflate, br, zstd",
+        "Accept-Language": "zh-CN,zh;q=0.9",
+        "Priority": "u=0, i",
+        "Sec-CH-UA": '"Chromium";v="134", "Not:A-Brand";v="24", "Google Chrome";v="134"',
+        "Sec-CH-UA-Mobile": "?1",
+        "Sec-CH-UA-Platform": '"Android"',
+        "Sec-Fetch-Dest": "document",
+        "Sec-Fetch-Mode": "navigate",
+        "Sec-Fetch-Site": "none",
+        "Sec-Fetch-User": "?1",
+        "Upgrade-Insecure-Requests": "1",
+        "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Mobile Safari/537.36"
+    }
+
+    while retries < max_retries:
+        try:
+            # Download the image with the shared client and the headers above
+            response = client.get(img_url, headers=headers, timeout=10)
+            response.raise_for_status()  # Check whether the request succeeded
+
+            # Save the image to the given path
+            os.makedirs(os.path.dirname(img_path), exist_ok=True)
+            with open(img_path, "wb") as f:
+                f.write(response.content)
+            print(f"Image downloaded and saved to {img_path}")
+            time.sleep(random.uniform(1, 1.5))
+            return  # Exit after a successful download
+        except httpx.HTTPStatusError as e:
+            switch_to_random_proxy()
+            if e.response.status_code == 429:
+                # On a 429 error, read the Retry-After wait time
+                retry_after = int(e.response.headers.get('Retry-After', 3))
+                print(f"Got a 429 error, waiting {retry_after} seconds before retrying...")
+                time.sleep(retry_after)
+                retries += 1
             else:
-                print("Images saved, exiting the loop")
+                print(f"Failed to download image: {os.path.basename(img_path)}, status code: {e.response.status_code}")
                 break
+        except Exception as e:
+            print(f"Error while saving the image: {e}")
+            break
+    if retries == max_retries:
+        print(f"Image download failed, reached the maximum number of retries: {img_path}")
+
+def switch_to_random_proxy(clash_api_url="http://127.0.0.1:9090", group_name="GLOBAL"):
+    """
+    Switch the proxy group to a random node (excluding the current node and DIRECT/REJECT).
+
+    :param clash_api_url: Clash RESTful API address, defaults to "http://127.0.0.1:9090"
+    :param group_name: proxy group name, defaults to "GLOBAL"
+    """
+    try:
+        # Get all nodes in the proxy group
+        response = httpx.get(f"{clash_api_url}/proxies")
+        response.raise_for_status()
+        proxies = response.json()
+
+        if group_name not in proxies['proxies']:
+            print(f"Proxy group '{group_name}' does not exist")
+            return
+
+        group_info = proxies['proxies'][group_name]
+        if group_info['type'] != 'Selector':
+            print(f"'{group_name}' is not a Selector proxy group")
+            return
+
+        # Get the node currently in use
+        current_node = group_info['now']
+        print(f"Current node: {current_node}")
+
+        # Get all selectable nodes (excluding DIRECT and REJECT)
+        nodes = [node for node in group_info['all'] if node not in ["DIRECT", "REJECT"]]
+        if not nodes:
+            print("No proxy nodes available")
+            return
+
+        # Randomly pick a node other than the current one
+        available_nodes = [node for node in nodes if node != current_node]
+        if not available_nodes:
+            print("No other proxy nodes available")
+            return
+
+        random_node = random.choice(available_nodes)
+        print(f"Switching to random node: {random_node}")
+
+        # Switch the node
+        switch_url = f"{clash_api_url}/proxies/{group_name}"
+        response = httpx.put(switch_url, json={"name": random_node})
+        if response.status_code == 204:
+            print(f"Switched to node: {random_node}")
+        else:
+            print(f"Failed to switch node: {response.status_code}")
+
+    except httpx.HTTPError as e:
+        print(f"Request failed: {e}")
+
+def main():
+    keys = check_and_load_keys()
+
+    # Get the current path and create the downloads folder if it does not exist
+    downloads_path = os.path.join(os.getcwd(), "downloads")
+    if not os.path.exists(downloads_path):
+        os.makedirs(downloads_path)
+        print("Created the downloads folder.")
+
+    for key in keys:
+        # Run the async crawler for this key
+        result = asyncio.run(get_urls(key))
+        if result:
+            folder_name = result[0]
+            data_file_path = result[1]
+            print(f"Finished, folder name: {folder_name}, data saved to: {data_file_path}")
+
+    print('Fetched the URL data for all keys, starting the image downloads')
+    time.sleep(0.1)
+
+    all_data = load_imgs_url_and_patn()
+
+    # Create one global httpx.Client instance
+    with httpx.Client(proxies=proxies) as client:
+        # Download the images concurrently with a thread pool
+        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+            futures = []
+            for img_path, img_url in all_data:
+                futures.append(executor.submit(save_img, client, img_path, img_url))
+
+            # Wait for all threads to finish
+            for future in concurrent.futures.as_completed(futures):
+                future.result()  # Surface any exception from the worker
+
+            print("All images downloaded!")
+

-print("done")
+if __name__ == "__main__":
+    main()