2step.py

import asyncio
import json
import os
import httpx
async def download_image(session, img_path, img_url, retry_count=3):
    for attempt in range(retry_count):
        try:
            # Request the image
            response = await session.get(img_url)
            response.raise_for_status()  # Raise if the request failed
            # Make sure the target folder exists
            os.makedirs(os.path.dirname(img_path), exist_ok=True)
            # Write the image content to the file
            with open(img_path, 'wb') as f:
                f.write(response.content)
            # print(f"Image downloaded: {img_path}")
            return True
        except httpx.HTTPStatusError as e:
            if e.response.status_code == 429:
                wait_time = 2 ** attempt  # Exponential backoff
                # print(f"429 Too Many Requests, retrying in {wait_time} seconds...")
                await asyncio.sleep(wait_time)
            else:
                # print(f"Failed to download image: {img_url}, error: {e}")
                return False
        except Exception as e:
            # print(f"Failed to download image: {img_url}, error: {e}")
            await asyncio.sleep(1)  # Simple delay before retrying
    # print(f"Image download failed after the maximum number of retries: {img_url}")
    return False
# Download all images asynchronously
async def download_all_images(ready_to_download_list, max_concurrent_downloads=5):
    async with httpx.AsyncClient() as session:
        tasks = []
        semaphore = asyncio.Semaphore(max_concurrent_downloads)  # Limit concurrency

        async def bounded_download(item):
            async with semaphore:
                return await download_image(session, item['img_path'], item['img_url'])

        for item in ready_to_download_list:
            task = asyncio.create_task(bounded_download(item))
            tasks.append(task)
        # Wait for all tasks to finish
        await asyncio.gather(*tasks)
# Load the list of images that still need to be downloaded
def load_ready_to_download_list():
    result = []
    # Resolve the project root directory
    project_root = os.path.dirname(os.path.abspath(__file__))
    downloads_path = os.path.join(project_root, 'downloads')
    all_path = []
    for root, dirs, files in os.walk(downloads_path):
        for dir_name in dirs:
            all_path.append(os.path.join(root, dir_name))
    for path in all_path:
        json_files = [f for f in os.listdir(path) if f.endswith('.json')]
        if len(json_files) != 1:
            continue
        json_file = json_files[0]
        json_path = os.path.join(path, json_file)
        with open(json_path, 'r', encoding='utf-8') as f:
            img_list = json.load(f)
        for k, v in img_list.items():
            img_path = os.path.join(path, k)
            if os.path.exists(img_path):
                continue
            result.append({
                'img_path': img_path,
                'img_url': v
            })
    return result
# Main entry point
async def start_download():
    for retry in range(3):
        ready_to_download_list = load_ready_to_download_list()
        print(f"Images queued for download: {len(ready_to_download_list)}")
        if not ready_to_download_list:
            print("All images already downloaded, or nothing to download")
            return
        await download_all_images(ready_to_download_list)
        await asyncio.sleep(2)  # Wait 2 seconds before re-checking
if __name__ == "__main__":
    asyncio.run(start_download())
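
For reference, load_ready_to_download_list() assumes that each subfolder of downloads/ contains exactly one .json manifest mapping local filenames to their source URLs, and it only queues files that are not yet on disk. The sketch below builds a hypothetical manifest in that shape so the script has something to pick up; the folder name, file names, and URLs are illustrative assumptions, not taken from the original project.

import json
import os

# Hypothetical layout: downloads/example_album/manifest.json
# (any single .json file per folder works; the name is an assumption)
sample_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'downloads', 'example_album')
os.makedirs(sample_dir, exist_ok=True)

manifest = {
    '001.jpg': 'https://example.com/images/001.jpg',  # local filename -> source URL (illustrative)
    '002.jpg': 'https://example.com/images/002.jpg',
}

with open(os.path.join(sample_dir, 'manifest.json'), 'w', encoding='utf-8') as f:
    json.dump(manifest, f, ensure_ascii=False, indent=2)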