@@ -1,109 +1,291 @@
-import os.path
-import re
-import random
 import time
+import asyncio
+import random
+import re
+import json
+import os
+import concurrent.futures
 import httpx

-url_keys = [
-    'L2lBQ200aE0vOVNmUGcydzhhT296Zz09',
-    'RFFRQXFIZEhNeDNaV2txWjRlMk5xdz09'
-]
-url_photos = '/photos/'
-base_url = 'https://www.kaizty.com/'
-url_page = 'page={}'
+max_workers = 2
+proxies = "http://127.0.0.1:7890"
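+# Return True if <key>.json already exists anywhere under downloads/, i.e. this key was already scraped.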
+def check_urls_json_exists(key):
+    downloads_path = os.path.join(os.getcwd(), "downloads")
+    for root, dirs, files in os.walk(downloads_path):
+        if f"{key}.json" in files:
+            json_path = os.path.basename(root)
+            print(f'The JSON file already exists in {json_path}')
+            return True
+    return False

-headers = {
-    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0'
-}

+def check_and_load_keys():
+    # Read the keys from keys.txt
+    keys = []
+    keys_file = os.path.join(os.getcwd(), "keys.txt")
+    if not os.path.exists(keys_file):
+        print("keys.txt does not exist\nCreating a new keys.txt file.")
+        with open(keys_file, "w", encoding="utf-8") as f:
+            f.write("")
+        exit(0)

-def get_pages(url_key):
-    title = ''
-    all_img_list = []
+    with open(keys_file, "r", encoding="utf-8") as f:
+        keys = [line.strip() for line in f.readlines() if line.strip()]
+    if keys:
+        return keys
+    else:
+        print("keys.txt is empty\nPlease fill in the keys.")
+        exit(0)

-    error_times = 0
-    max_error_times = 2
-    page = 1

-    while True:
-        if error_times >= max_error_times:
-            break
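+# Fetch a single page with the shared AsyncClient; returns the body text, or None on an HTTP error.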
+async def fetch_page(client, url):
+    try:
+        response = await client.get(url)
+        response.raise_for_status()  # Check whether the request succeeded
+        return response.text
+    except httpx.HTTPError as e:
+        print(f"Request failed: {e}")
+        return None

-        print('Fetching data for page {}'.format(page))
-        url = base_url + url_photos + url_key + url_page.format(page)
-        page += 1
-
-        response = httpx.get(url, headers=headers)
-        response.encoding = 'utf-8'
-        html = response.text
-        target_block = re.findall('<\!\[endif\]--><title>(.*?)<meta property="og:locale"', html)
-        if not target_block:
-            continue
-        target_block = target_block[0]
-        if not title:
-            re_title = re.findall('(.*?)\| Page', target_block)
-            if not re_title:
-                print('Failed to get the title')
-                error_times += 1
-                continue
-            re_title = re_title[0]
-            title = re.sub(r'[<>:"/\\|?*]', '', re_title)
-            title = title.replace(' ', '')
-
-        img_list = re.findall('<meta itemprop="image" content="(.*?)"', target_block)
-        if not img_list:
-            print('Failed to get image links, page {}'.format(page))
-            error_times += 1
-            continue
-        all_img_list += img_list
-        time.sleep(random.uniform(2, 3))
+def extract_image_links(content):
+    # Extract the image links with a regular expression
+    pattern = r'<meta itemprop="image" content="(.*?)">'
+    image_links = re.findall(pattern, content)
+    return image_links
+
+
+def clean_folder_name(title):
+    # Clean the title so it only contains characters that are legal in Windows folder names
+    invalid_chars = r'[<>:"/\\|?*\x00-\x1F]'
+    title = re.sub(invalid_chars, '_', title)  # Replace illegal characters with underscores
+    title = title.replace(" ", "")  # Remove spaces
+    title = title.replace("_", "")  # Remove underscores
+    return title.strip()
+

-    return all_img_list, title
+async def get_urls(key):
+    # Check whether this key has already been scraped
+    is_exists = check_urls_json_exists(key)
+    if is_exists:
+        print(f"{key}.json already exists, skipping this key.")
+        return
+
+    base_url = f"https://www.kaizty.com/photos/{key}.html?page="
+    data = {}
+    folder_name = "default_folder"  # Default folder name
+    async with httpx.AsyncClient(proxies=proxies) as client:
+        n = 1
+        for page in range(1, 30):
+            url = base_url + str(page)
+            print(f"Crawling page: {url}")
+            content = await fetch_page(client, url)
+            if content is None:
+                print(f"Could not fetch the page content: {url}")
+                continue

-def get_imgs(all_img_list, title):
-    print('\n\nStarting to save images')
+            # Check whether the page content is empty
+            if "EMPTY" in content:
+                print("Page content is empty, stopping the crawl.")
+                break

-    current_directory = os.getcwd()
+            # Get the title (only on the first page)
+            if page == 1:
+                title_pattern = r'<title>(.*?)</title>'
+                title_match = re.search(title_pattern, content)
+                if title_match:
+                    title = title_match.group(1)
+                    folder_name = clean_folder_name(title)
+                    print(f"Page title: {title}")
+                    print(f"Cleaned folder name: {folder_name}")
+                else:
+                    print("Could not get the page title, using the default folder name.")

-    if not os.path.exists(title):
-        os.mkdir(title)
+            # Extract the image links
+            image_links = extract_image_links(content)
+            if image_links:
+                print(f"Found image links on page {url}:")
+                for link in image_links:
+                    print(link)
+                    prefix = str(n).zfill(3)
+                    suffix = link.split('.')[-1]
+                    img_name = f'{prefix}.{suffix}'
+                    data[img_name] = link
+                    n += 1
+            else:
+                print(f"No image links found on page {url}.")

-    img_dir = os.path.join(current_directory, title)
-    files = [f for f in os.listdir(img_dir) if os.path.isfile(os.path.join(img_dir, f))]
+    # Create the folder and save the data
+    downloads_path = os.path.join(os.getcwd(), "downloads")
+    if not os.path.exists(downloads_path):
+        os.makedirs(downloads_path)
+        print("Created the downloads folder.")

-    now_last_num = 1
-    if files:
-        now_last_num = int(files[-1].split('.')[0])
+    folder_path = os.path.join(downloads_path, folder_name)
+    if not os.path.exists(folder_path):
+        os.makedirs(folder_path)
+        print(f"Created folder: {folder_path}")

-    for n in range(now_last_num, len(all_img_list)):
-        img = httpx.get(all_img_list[n], headers=headers)
+    data_file_path = os.path.join(folder_path, f"{key}.json")
+    with open(data_file_path, "w", encoding="utf-8") as f:
+        json.dump(data, f, ensure_ascii=False, indent=4)
+    print(f"Data saved to {data_file_path}")

-        if not img.status_code == 200:
-            print('Image request failed, exiting')
-            raise Exception(f'Status code {img.status_code}')
+    return [folder_name, data_file_path]

-        file_name = f"{n:04d}" + "." + all_img_list[n].split(".")[-1]
-        print('Saving image: {}'.format(file_name))
-        with open(title + "/" + file_name, "wb") as f:
-            f.write(img.content)
-        time.sleep(random.uniform(5, 8))

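+# Walk downloads/ for saved JSON manifests and collect [local path, URL] pairs for images not yet on disk.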
+def load_imgs_url_and_patn():
+    result = []
+    downloads_path = os.path.join(os.getcwd(), "downloads")
+    for root, dirs, files in os.walk(downloads_path):
+        for file in files:
+            if file.endswith(".json"):
+                json_path = os.path.join(root, file)
+                with open(json_path, "r", encoding="utf-8") as f:
+                    data = json.load(f)
+                for img_name, img_url in data.items():
+                    img_path = os.path.join(root, img_name)
+                    if not os.path.exists(img_path):
+                        result.append([img_path, img_url])
+    return result

-if __name__ == '__main__':
-    for url_key in url_keys:
-        url_key = url_key + '.html?'
-        all_img_list, title = get_pages(url_key)

-        while True:
-            try:
-                get_imgs(all_img_list, title)
-            except Exception as e:
-                print(e)
-                time.sleep(random.uniform(30, 40))
-                continue
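+# Download one image through the shared client; on an HTTP status error it rotates the Clash proxy, and on 429 it honors Retry-After before retrying.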
+def save_img(client, img_path, img_url, max_retries=999):
+    retries = 0
+    headers = {
+        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+        "Accept-Encoding": "gzip, deflate, br, zstd",
+        "Accept-Language": "zh-CN,zh;q=0.9",
+        "Priority": "u=0, i",
+        "Sec-CH-UA": '"Chromium";v="134", "Not:A-Brand";v="24", "Google Chrome";v="134"',
+        "Sec-CH-UA-Mobile": "?1",
+        "Sec-CH-UA-Platform": '"Android"',
+        "Sec-Fetch-Dest": "document",
+        "Sec-Fetch-Mode": "navigate",
+        "Sec-Fetch-Site": "none",
+        "Sec-Fetch-User": "?1",
+        "Upgrade-Insecure-Requests": "1",
+        "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Mobile Safari/537.36"
+    }
+
+    while retries < max_retries:
+        try:
+            # Download the image with the shared client and the headers above
+            response = client.get(img_url, headers=headers, timeout=10)
+            response.raise_for_status()  # Check whether the request succeeded
+
+            # Save the image to the given path
+            os.makedirs(os.path.dirname(img_path), exist_ok=True)
+            with open(img_path, "wb") as f:
+                f.write(response.content)
+            print(f"Image downloaded and saved to {img_path}")
+            time.sleep(random.uniform(1, 1.5))
+            return  # Exit after a successful download
+        except httpx.HTTPStatusError as e:
+            switch_to_random_proxy()
+            if e.response.status_code == 429:
+                # On a 429 error, read the Retry-After wait time
+                retry_after = int(e.response.headers.get('Retry-After', 3))
+                print(f"Got a 429 error, waiting {retry_after} seconds before retrying...")
+                time.sleep(retry_after)
+                retries += 1
             else:
-                print("Images saved, exiting the loop")
+                print(f"Failed to download image: {os.path.basename(img_path)}, status code: {e.response.status_code}")
                 break
+        except Exception as e:
+            print(f"Error while saving the image: {e}")
+            break
+    if retries == max_retries:
+        print(f"Image download failed, reached the maximum number of retries: {img_path}")
+
+def switch_to_random_proxy(clash_api_url="http://127.0.0.1:9090", group_name="GLOBAL"):
+    """
+    Switch the proxy group to a random node (excluding the current node and DIRECT/REJECT).
+
+    :param clash_api_url: Clash RESTful API address, defaults to "http://127.0.0.1:9090"
+    :param group_name: proxy group name, defaults to "GLOBAL"
+    """
+    try:
+        # Get all nodes in the proxy group
+        response = httpx.get(f"{clash_api_url}/proxies")
+        response.raise_for_status()
+        proxies = response.json()
+
+        if group_name not in proxies['proxies']:
+            print(f"Proxy group '{group_name}' does not exist")
+            return
+
+        group_info = proxies['proxies'][group_name]
+        if group_info['type'] != 'Selector':
+            print(f"'{group_name}' is not a Selector proxy group")
+            return
+
+        # Get the node currently in use
+        current_node = group_info['now']
+        print(f"Current node: {current_node}")
+
+        # Get all selectable nodes (excluding DIRECT and REJECT)
+        nodes = [node for node in group_info['all'] if node not in ["DIRECT", "REJECT"]]
+        if not nodes:
+            print("No proxy nodes available")
+            return
+
+        # Randomly pick a node other than the current one
+        available_nodes = [node for node in nodes if node != current_node]
+        if not available_nodes:
+            print("No other proxy nodes available")
+            return
+
+        random_node = random.choice(available_nodes)
+        print(f"Switching to random node: {random_node}")
+
+        # Switch the node
+        switch_url = f"{clash_api_url}/proxies/{group_name}"
+        response = httpx.put(switch_url, json={"name": random_node})
+        if response.status_code == 204:
+            print(f"Switched to node: {random_node}")
+        else:
+            print(f"Failed to switch node: {response.status_code}")
+
+    except httpx.HTTPError as e:
+        print(f"Request failed: {e}")
+
+def main():
+    keys = check_and_load_keys()
+
+    # Get the current path and create the downloads folder if it does not exist
+    downloads_path = os.path.join(os.getcwd(), "downloads")
+    if not os.path.exists(downloads_path):
+        os.makedirs(downloads_path)
+        print("Created the downloads folder.")
+
+    for key in keys:
+        # Run the async crawler for this key
+        result = asyncio.run(get_urls(key))
+        if result:
+            folder_name = result[0]
+            data_file_path = result[1]
+            print(f"Finished, folder name: {folder_name}, data saved to: {data_file_path}")
+
+    print('Fetched the URL data for all keys, starting the image downloads')
+    time.sleep(0.1)
+
+    all_data = load_imgs_url_and_patn()
+
+    # Create one global httpx.Client instance
+    with httpx.Client(proxies=proxies) as client:
+        # Download the images concurrently with a thread pool
+        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+            futures = []
+            for img_path, img_url in all_data:
+                futures.append(executor.submit(save_img, client, img_path, img_url))
+
+            # Wait for all threads to finish
+            for future in concurrent.futures.as_completed(futures):
+                future.result()  # Surface any exception from the worker
+
+            print("All images downloaded!")
+

-print("done")
+if __name__ == "__main__":
+    main()