toor 3 months ago
parent
commit
f0504173c4
4 changed files with 121 additions and 185 deletions
  1. 1step.py (+19 −128)
  2. 2step.py (+100 −0)
  3. keys.txt (+2 −1)
  4. random_proxy.py (+0 −56)

+ 19 - 128
kaizty_spider.py → 1step.py

@@ -1,14 +1,10 @@
-import time
 import asyncio
-import random
 import re
 import json
 import os
-import concurrent.futures
 import httpx
 
-max_workers = 2
-proxies="http://127.0.0.1:7890"
+
 def check_urls_json_exists(key):
     downloads_path = os.path.join(os.getcwd(), "downloads")
     for root, dirs, files in os.walk(downloads_path):
@@ -32,7 +28,7 @@ def check_and_load_keys():
     with open(keys_file, "r", encoding="utf-8") as f:
         keys = [line.strip() for line in f.readlines()]
     if keys:
-        return keys
+        return list(set(keys))
     else:
         print("keys.txt 文件为空\n请填写key。")
         exit(0)
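
Note on the dedup added above: list(set(keys)) removes duplicate keys but does not preserve the order of keys.txt. If crawl order matters, an order-preserving variant is a one-line change (a sketch, not part of this commit):

    keys = list(dict.fromkeys(keys))  # dict keeps insertion order in Python 3.7+
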
@@ -74,15 +70,21 @@ async def get_urls(key):
     base_url = f"https://www.kaizty.com/photos/{key}.html?page="
     data = {}
     folder_name = "default_folder"  # 默认文件夹名
-    async with httpx.AsyncClient(proxies=proxies) as client:
+    async with httpx.AsyncClient(proxy="http://127.0.0.1:7890") as client:
         n = 1
+        retry_count = 5
         for page in range(1, 30):
             url = base_url + str(page)
             print(f"正在爬取页面: {url}")
             content = await fetch_page(client, url)
             if content is None:
                 print(f"无法获取页面内容: {url}")
-                continue
+                if retry_count > 0:
+                    retry_count -= 1
+                    continue
+                else:
+                    print(f"{key} crawl failed, skipping")
+                    break
 
             # Check whether the page content is empty
             if "EMPTY" in content:
@@ -115,6 +117,10 @@ async def get_urls(key):
             else:
                 print(f"页面 {url} 中未找到图片链接。")
 
+    # Save only if data is non-empty; otherwise skip this key
+    if not data:
+        return {}
+
    # Create the folder and save the data
     downloads_path = os.path.join(os.getcwd(), "downloads")
     if not os.path.exists(downloads_path):
@@ -150,107 +156,7 @@ def load_imgs_url_and_patn():
     return result
 
 
-def save_img(client, img_path, img_url, max_retries=999):
-    retries = 0
-    headers = {
-        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
-        "Accept-Encoding": "gzip, deflate, br, zstd",
-        "Accept-Language": "zh-CN,zh;q=0.9",
-        "Priority": "u=0, i",
-        "Sec-CH-UA": '"Chromium";v="134", "Not:A-Brand";v="24", "Google Chrome";v="134"',
-        "Sec-CH-UA-Mobile": "?1",
-        "Sec-CH-UA-Platform": '"Android"',
-        "Sec-Fetch-Dest": "document",
-        "Sec-Fetch-Mode": "navigate",
-        "Sec-Fetch-Site": "none",
-        "Sec-Fetch-User": "?1",
-        "Upgrade-Insecure-Requests": "1",
-        "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Mobile Safari/537.36"
-    }
-
-    while retries < max_retries:
-        try:
-            # Download the image with the shared client, using the custom headers
-            response = client.get(img_url, headers=headers, timeout=10)
-            response.raise_for_status()  # check whether the request succeeded
-
-            # Save the image to the target path
-            os.makedirs(os.path.dirname(img_path), exist_ok=True)
-            with open(img_path, "wb") as f:
-                f.write(response.content)
-            print(f"Image downloaded and saved to {img_path}")
-            time.sleep(random.uniform(1, 1.5))
-            return  # exit once the download succeeds
-        except httpx.HTTPStatusError as e:
-            switch_to_random_proxy()
-            if e.response.status_code == 429:
-                # On a 429, read the Retry-After header
-                retry_after = int(e.response.headers.get('Retry-After', 3))
-                print(f"Got 429, waiting {retry_after} seconds before retrying...")
-                time.sleep(retry_after)
-                retries += 1
-            else:
-                print(f"Image download failed: {img_path.split('/')[-1]}, status code: {e.response.status_code}")
-                break
-        except Exception as e:
-            print(f"Error while saving image: {e}")
-            break
-    if retries == max_retries:
-        print(f"Image download failed after the maximum number of retries: {img_path}")
-
-def switch_to_random_proxy(clash_api_url="http://127.0.0.1:9090", group_name="GLOBAL"):
-    """
-    Switch the proxy group to a random node (excluding the current node and DIRECT/REJECT).
-
-    :param clash_api_url: Clash RESTful API address, defaults to "http://127.0.0.1:9090"
-    :param group_name: proxy group name, defaults to "GLOBAL"
-    """
-    try:
-        # Fetch every node in the proxy group
-        response = httpx.get(f"{clash_api_url}/proxies")
-        response.raise_for_status()
-        proxies = response.json()
-
-        if group_name not in proxies['proxies']:
-            print(f"Proxy group '{group_name}' does not exist")
-            return
-
-        group_info = proxies['proxies'][group_name]
-        if group_info['type'] != 'Selector':
-            print(f"'{group_name}' is not a Selector-type proxy group")
-            return
-
-        # Get the node currently in use
-        current_node = group_info['now']
-        print(f"Current node: {current_node}")
-
-        # Collect the candidate nodes (excluding DIRECT and REJECT)
-        nodes = [node for node in group_info['all'] if node not in ["DIRECT", "REJECT"]]
-        if not nodes:
-            print("No proxy nodes available")
-            return
-
-        # Pick a random node other than the current one
-        available_nodes = [node for node in nodes if node != current_node]
-        if not available_nodes:
-            print("No other proxy nodes available")
-            return
-
-        random_node = random.choice(available_nodes)
-        print(f"Switching to random node: {random_node}")
-
-        # Switch the node
-        switch_url = f"{clash_api_url}/proxies/{group_name}"
-        response = httpx.put(switch_url, json={"name": random_node})
-        if response.status_code == 204:
-            print(f"Successfully switched to node: {random_node}")
-        else:
-            print(f"Failed to switch node: {response.status_code}")
-
-    except httpx.exceptions.RequestException as e:
-        print(f"Request failed: {e}")
-
-def main():
+def start_get_urls():
     keys = check_and_load_keys()
 
    # Get the current path here and create a downloads folder if it does not exist
@@ -266,26 +172,11 @@ def main():
             folder_name = result[0]
             data_file_path = result[1]
             print(f"处理完成,文件夹名称: {folder_name}, 数据保存路径: {data_file_path}")
+        else:
+            print(f"没有获取到数据,跳过")
 
-    print('Fetched URL data for all keys, starting image downloads')
-    time.sleep(0.1)
-
-    all_data = load_imgs_url_and_patn()
-
-    # Create one global httpx.Client instance
-    with httpx.Client(proxies=proxies) as client:
-        # Download images concurrently with a thread pool
-        with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
-            futures = []
-            for img_path, img_url in all_data:
-                futures.append(executor.submit(save_img, client, img_path, img_url))
-
-            # Wait for all threads to finish
-            for future in concurrent.futures.as_completed(futures):
-                future.result()  # surface exceptions, if any
-
-    print("All images downloaded!")
+    print('Fetched URL data for all keys')
 
 
 if __name__ == "__main__":
-    main()
+    start_get_urls()
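
Note on the client construction change above: recent httpx releases deprecated the proxies= argument on clients in favor of a single proxy= URL (with mounts= for per-scheme routing), which is presumably what this commit adapts to. A minimal sketch of the new form, assuming the local Clash port 7890 used throughout the repo:

    import httpx

    async def fetch(url: str) -> str:
        # proxy= takes a single proxy URL; use mounts= to route per scheme
        async with httpx.AsyncClient(proxy="http://127.0.0.1:7890", timeout=10) as client:
            resp = await client.get(url)
            resp.raise_for_status()
            return resp.text

One subtlety in the new retry logic: when a page fails, retry_count is decremented and the loop continues with the next page, so a failed page is skipped rather than re-fetched; a true per-page retry would need an inner loop.
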

+ 100 - 0
2step.py

@@ -0,0 +1,100 @@
+import asyncio
+import time
+import json
+import os
+import httpx
+
+
+async def download_image(session, img_path, img_url, retry_count=3):
+    for attempt in range(retry_count):
+        try:
+            # Request and download the image
+            response = await session.get(img_url)
+            response.raise_for_status()  # check whether the request succeeded
+            # Make sure the image folder exists
+            os.makedirs(os.path.dirname(img_path), exist_ok=True)
+            # Write the image content to the file
+            with open(img_path, 'wb') as f:
+                f.write(response.content)
+            # print(f"Image downloaded: {img_path}")
+            return True
+        except httpx.HTTPStatusError as e:
+            if e.response.status_code == 429:
+                wait_time = 2 ** attempt  # exponential backoff
+                # print(f"429 Too Many Requests, waiting {wait_time} s before retrying...")
+                await asyncio.sleep(wait_time)
+            else:
+                # print(f"Image download failed: {img_url}, error: {e}")
+                return False
+        except Exception as e:
+            # print(f"Image download failed: {img_url}, error: {e}")
+            await asyncio.sleep(1)  # simple retry interval
+    # print(f"Image download failed, max retries reached: {img_url}")
+    return False
+
+# Download all images asynchronously
+
+
+async def download_all_images(ready_to_download_list, max_concurrent_downloads=5):
+    async with httpx.AsyncClient() as session:
+        tasks = []
+        semaphore = asyncio.Semaphore(max_concurrent_downloads)  # cap concurrent downloads
+
+        async def bounded_download(item):
+            async with semaphore:
+                return await download_image(session, item['img_path'], item['img_url'])
+
+        for item in ready_to_download_list:
+            task = asyncio.create_task(bounded_download(item))
+            tasks.append(task)
+        # Wait for all tasks to complete
+        await asyncio.gather(*tasks)
+
+# Load the list of images that still need to be downloaded
+
+
+def load_ready_to_download_list():
+    result = []
+    # Get the project root directory
+    project_root = os.path.dirname(os.path.abspath(__file__))
+    downloads_path = os.path.join(project_root, 'downloads')
+
+    all_path = []
+    for root, dirs, files in os.walk(downloads_path):
+        for dir in dirs:
+            all_path.append(os.path.join(root, dir))
+
+    for path in all_path:
+        json_files = [f for f in os.listdir(path) if f.endswith('.json')]
+        if len(json_files) != 1:
+            continue
+        json_file = json_files[0]
+        json_path = os.path.join(path, json_file)
+        with open(json_path, 'r', encoding='utf-8') as f:
+            img_list = json.load(f)
+        for k, v in img_list.items():
+            img_path = os.path.join(path, k)
+            if os.path.exists(img_path):
+                continue
+            result.append({
+                'img_path': img_path,
+                'img_url': v
+            })
+
+    return result
+
+# Main entry point
+
+
+async def start_download():
+    for retry in range(3):
+        ready_to_download_list = load_ready_to_download_list()
+        print(f"准备下载图片共: {len(ready_to_download_list)} 张")
+        if not ready_to_download_list:
+            print("已全部下载完成或没有需要下载的图片")
+            return
+        await download_all_images(ready_to_download_list)
+        time.sleep(2)  # 间隔2秒后重新检查
+
+if __name__ == "__main__":
+    asyncio.run(start_download())
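
The new downloader caps concurrency with an asyncio.Semaphore and, on 429 responses, backs off exponentially (1 s, 2 s, 4 s). The deleted synchronous save_img honored the server's Retry-After header instead; the two strategies combine naturally. A hypothetical helper (not in this commit) that the 429 branch of download_image could call via await asyncio.sleep(backoff_delay(e.response, attempt)):

    import httpx

    def backoff_delay(response: httpx.Response, attempt: int) -> float:
        # Prefer the server's Retry-After hint when present,
        # otherwise fall back to exponential backoff.
        retry_after = response.headers.get("Retry-After")
        if retry_after is not None:
            try:
                return float(retry_after)
            except ValueError:
                pass  # Retry-After may also be an HTTP date; not handled in this sketch
        return float(2 ** attempt)
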

+ 2 - 1
keys.txt

@@ -1 +1,2 @@
-Y0VRSUQ2NFgvdkVTNVNPOHJJUW9Idz09
+eGd5ZHZLVGJ0c0s3ZkN6MVc0YWQ1QT09
+THFuTUVqUFF4c3hJV2pmTkF2RkplQT09

+ 0 - 56
random_proxy.py

@@ -1,56 +0,0 @@
-import httpx
-import random
-
-def switch_to_random_proxy(clash_api_url="http://127.0.0.1:9090", group_name="GLOBAL"):
-    """
-    Switch the proxy group to a random node (excluding the current node and DIRECT/REJECT).
-
-    :param clash_api_url: Clash RESTful API address, defaults to "http://127.0.0.1:9090"
-    :param group_name: proxy group name, defaults to "GLOBAL"
-    """
-    try:
-        # Fetch every node in the proxy group
-        response = httpx.get(f"{clash_api_url}/proxies")
-        response.raise_for_status()
-        proxies = response.json()
-
-        if group_name not in proxies['proxies']:
-            print(f"Proxy group '{group_name}' does not exist")
-            return
-
-        group_info = proxies['proxies'][group_name]
-        if group_info['type'] != 'Selector':
-            print(f"'{group_name}' is not a Selector-type proxy group")
-            return
-
-        # Get the node currently in use
-        current_node = group_info['now']
-        print(f"Current node: {current_node}")
-
-        # Collect the candidate nodes (excluding DIRECT and REJECT)
-        nodes = [node for node in group_info['all'] if node not in ["DIRECT", "REJECT"]]
-        if not nodes:
-            print("No proxy nodes available")
-            return
-
-        # Pick a random node other than the current one
-        available_nodes = [node for node in nodes if node != current_node]
-        if not available_nodes:
-            print("No other proxy nodes available")
-            return
-
-        random_node = random.choice(available_nodes)
-        print(f"Switching to random node: {random_node}")
-
-        # Switch the node
-        switch_url = f"{clash_api_url}/proxies/{group_name}"
-        response = httpx.put(switch_url, json={"name": random_node})
-        if response.status_code == 204:
-            print(f"Successfully switched to node: {random_node}")
-        else:
-            print(f"Failed to switch node: {response.status_code}")
-
-    except httpx.exceptions.RequestException as e:
-        print(f"Request failed: {e}")
-
-switch_to_random_proxy()
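
For reference, the deleted helper drove Clash's RESTful API directly: GET /proxies lists every group and node, and PUT /proxies/{group} with {"name": node} selects a node (Clash answers 204 No Content on success). It also carried a latent bug: httpx has no exceptions module (that is the requests API), so the except httpx.exceptions.RequestException clause would itself raise AttributeError on any network failure; the matching httpx base class is httpx.RequestError. A corrected minimal sketch of the same logic:

    import random
    import httpx

    def switch_to_random_proxy(api_url="http://127.0.0.1:9090", group="GLOBAL"):
        try:
            resp = httpx.get(f"{api_url}/proxies")
            resp.raise_for_status()
            groups = resp.json()["proxies"]
            if group not in groups:
                return
            info = groups[group]
            # Skip the pseudo-nodes and the node already selected
            candidates = [n for n in info["all"]
                          if n not in ("DIRECT", "REJECT", info["now"])]
            if not candidates:
                return
            httpx.put(f"{api_url}/proxies/{group}", json={"name": random.choice(candidates)})
        except httpx.RequestError as e:  # correct base class for httpx network errors
            print(f"Request failed: {e}")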