jack committed 3 months ago
parent · commit 7bfeb90f5a
6 files changed, 484 additions and 18 deletions

  1. failed_downloads.json   + 6    - 0
  2. main.py                 + 46   - 16
  3. static/script.js        + 30   - 2
  4. step1.py                + 193  - 0
  5. step2.py                + 180  - 0
  6. utils.py                + 29   - 0

+ 6 - 0
failed_downloads.json

@@ -0,0 +1,6 @@
+[
+  {
+    "img_path": "data/downloads/[Pixiv]玲殿下(81002566)2025.09.23-E-HentaiGalleries/0016",
+    "img_url": "https://e-hentai.org/s/e3f2a8c9a8/3550066-16"
+  }
+]
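
Editor's note: this file is the retry list that step2.py (added below) persists via save_failed(); each entry pairs the extension-less target path with the image page URL that could not be fetched. A minimal sketch for inspecting it, assuming the file sits in the project root where FAILED_RECORD points:

    import json
    from pathlib import Path

    # Load the persisted retry list and show what is still pending.
    failed = json.loads(Path("failed_downloads.json").read_text(encoding="utf-8"))
    for item in failed:
        print(item["img_path"], "<-", item["img_url"])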

+ 46 - 16
main.py

@@ -4,9 +4,32 @@ from fastapi.templating import Jinja2Templates
 from fastapi.responses import JSONResponse
 import uvicorn
 import os
+from pydantic import BaseModel
+from utils import *
 
 app = FastAPI(title="Download Tool", version="1.0.0")
 
+# On application startup, make sure the data folder and targets.txt exist
+@app.on_event("startup")
+async def startup_event():
+    # Create the data directory if it is missing
+    data_dir = "data"
+    if not os.path.exists(data_dir):
+        os.makedirs(data_dir)
+        print(f"Created directory: {data_dir}")
+
+    # Create the targets.txt file if it is missing
+    targets_file = os.path.join(data_dir, "targets.txt")
+    if not os.path.exists(targets_file):
+        with open(targets_file, 'w', encoding='utf-8') as f:
+            f.write("# Add target URLs here, one per line\n")
+            f.write("# Examples:\n")
+            f.write("# https://example.com/file1.zip\n")
+            f.write("# https://example.com/image.jpg\n")
+        print(f"Created file: {targets_file}")
+    else:
+        print(f"File already exists: {targets_file}")
+
 # Mount static files and templates
 app.mount("/static", StaticFiles(directory="static"), name="static")
 templates = Jinja2Templates(directory="templates")
@@ -22,22 +45,6 @@ async def load_urls():
     try:
         file_path = "data/targets.txt"
         
-        # Check whether the data directory exists
-        if not os.path.exists('data'):
-            os.mkdir('data')
-            return JSONResponse({
-                "success": False,
-                "message": f"File {file_path} does not exist",
-                "urls": []
-            })
-        # Check whether the URL target file exists
-        if not os.path.exists(file_path):
-            return JSONResponse({
-                "success": False,
-                "message": f"File {file_path} does not exist",
-                "urls": []
-            })
-
         # Read the file contents
         with open(file_path, 'r', encoding='utf-8') as f:
             urls = [line.strip() for line in f.readlines() if line.strip()]
@@ -45,6 +52,13 @@ async def load_urls():
         # Filter out empty lines and comment lines (starting with #)
         urls = [url for url in urls if url and not url.startswith('#')]
         
+        if not urls:
+            return JSONResponse({
+                "success": True,
+                "message": "targets.txt is empty; please add URLs to data/targets.txt",
+                "urls": []
+            })
+        
         return JSONResponse({
             "success": True,
             "message": f"Successfully read {len(urls)} URLs",
@@ -67,5 +81,21 @@ async def clear_output():
         "output": ""
     })
 
+class ProxyRequest(BaseModel):
+    ip: str
+    port: str
+
+@app.post("/download_urls")
+async def download_urls(req: ProxyRequest):
+    proxy = f"http://{req.ip}:{req.port}"
+    msg = await run_step1(proxy)
+    return JSONResponse({"success": True, "message": msg})
+
+@app.post("/download_images")
+async def download_images(req: ProxyRequest):
+    proxy = f"http://{req.ip}:{req.port}"
+    msg = await run_step2(proxy)
+    return JSONResponse({"success": True, "message": msg})
+
 if __name__ == "__main__":
     uvicorn.run("main:app", host="0.0.0.0", port=8000, reload=True)

+ 30 - 2
static/script.js

@@ -19,12 +19,12 @@ class DownloadTool {
         
         // Download-URLs button
         this.downloadUrlBtn.addEventListener('click', () => {
-            this.showOutput('Download URL clicked', 'success');
+            this.downloadUrls();
         });
         
         // Download-images button
         this.downloadImageBtn.addEventListener('click', () => {
-            this.showOutput('Download IMG clicked', 'success');
+            this.downloadImages();
         });
         
         // Clear-output button
@@ -71,6 +71,34 @@ class DownloadTool {
         }
     }
     
+    async downloadUrls() {
+        const ip = document.getElementById('ip').value;
+        const port = document.getElementById('port').value;
+    
+        this.showOutput('Fetching gallery links...', 'info');
+        const res = await fetch('/download_urls', {
+            method: 'POST',
+            headers: { 'Content-Type': 'application/json' },
+            body: JSON.stringify({ ip, port })
+        });
+        const data = await res.json();
+        this.showOutput(data.message, data.success ? 'success' : 'error');
+    }
+
+    async downloadImages() {
+        const ip = document.getElementById('ip').value;
+        const port = document.getElementById('port').value;
+    
+        this.showOutput('Downloading images...', 'info');
+        const res = await fetch('/download_images', {
+            method: 'POST',
+            headers: { 'Content-Type': 'application/json' },
+            body: JSON.stringify({ ip, port })
+        });
+        const data = await res.json();
+        this.showOutput(data.message, data.success ? 'success' : 'error');
+    }
+
     showOutput(message, type = '') {
         this.output.textContent = message;
         this.output.className = 'output-area';

+ 193 - 0
step1.py

@@ -0,0 +1,193 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+异步批量抓取 E-H 画廊图片链接,按专辑保存 json
+python eh_crawler.py
+"""
+from __future__ import annotations
+
+import asyncio
+import json
+import logging
+import re
+import sys
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
+
+import aiofiles
+import httpx
+from bs4 import BeautifulSoup
+from tqdm.asyncio import tqdm_asyncio
+from aiopath import AsyncPath
+
+# -------------------- Configurable constants --------------------
+CONCURRENCY = 20                 # concurrent pages
+MAX_PAGE = 100                   # max pages per gallery
+RETRY_PER_PAGE = 5               # retries per page
+TIMEOUT = httpx.Timeout(10.0)    # request timeout
+IMG_SELECTOR = "#gdt"            # thumbnail area that holds the image links
+FAILED_RECORD = "failed_keys.json"
+LOG_LEVEL = logging.INFO
+# ----------------------------------------------------
+
+logging.basicConfig(
+    level=LOG_LEVEL,
+    format="[%(asctime)s] [%(levelname)s] %(message)s",
+    handlers=[
+        logging.StreamHandler(sys.stdout),
+        logging.FileHandler("crawl.log", encoding="utf-8"),
+    ],
+)
+log = logging.getLogger("eh_crawler")
+
+# Pre-compiled regex
+ILLEGAL_CHARS = re.compile(r'[<>:"/\\|?*\x00-\x1F]')
+
+
+# -------------------- Helpers --------------------
+def clean_folder_name(title: str) -> str:
+    """Sanitize a gallery title into a safe folder name"""
+    return ILLEGAL_CHARS.sub("_", title).replace(" ", "").replace("_", "").strip() or "gallery"
+
+
+def load_targets() -> List[str]:
+    """读取 targets.txt"""
+    tgt = Path("data/targets.txt")
+    if not tgt.exists():
+        log.error("targets.txt 不存在,已自动创建,请先填写 URL")
+        tgt.touch()
+        sys.exit(0)
+    lines = [ln.strip() for ln in tgt.read_text(encoding="utf-8").splitlines() if ln.strip()]
+    if not lines:
+        log.error("targets.txt 为空,请先填写 URL")
+        sys.exit(0)
+    return list(set(lines))  # 去重
+
+
+def load_failed() -> List[str]:
+    if Path(FAILED_RECORD).exists():
+        try:
+            return json.loads(Path(FAILED_RECORD).read_text(encoding="utf-8"))
+        except Exception as exc:
+            log.warning(f"加载失败记录失败 -> {exc}")
+    return []
+
+
+def save_failed(keys: List[str]) -> None:
+    Path(FAILED_RECORD).write_text(json.dumps(keys, ensure_ascii=False, indent=2), encoding="utf-8")
+
+
+# -------------------- Crawler core --------------------
+async def fetch_page(client: httpx.AsyncClient, url: str) -> Optional[str]:
+    """Fetch the HTML of a single page"""
+    for attempt in range(1, RETRY_PER_PAGE + 1):
+        try:
+            resp = await client.get(url)
+            resp.raise_for_status()
+            return resp.text
+        except httpx.HTTPError as exc:
+            log.error(f"[{attempt}/{RETRY_PER_PAGE}] 请求失败 {url} -> {exc}")
+            await asyncio.sleep(2 ** attempt)
+    return None
+
+
+async def crawl_single_gallery(
+    client: httpx.AsyncClient, sem: asyncio.Semaphore, gallery_url: str
+) -> bool:
+    """抓取单个画廊,成功返回 True"""
+    async with sem:
+        base_url = gallery_url.rstrip("/")
+        key = base_url.split("/")[-1]  # 用最后一截当 key
+        json_name = f"{key}.json"
+
+        folder_path: Optional[AsyncPath] = None
+        json_data: Dict[str, str] = {}
+        img_count = 1
+        last_page = False
+
+        for page in range(MAX_PAGE):
+            if last_page:
+                break
+            url = f"{base_url}?p={page}"
+            html = await fetch_page(client, url)
+            if html is None:
+                continue
+
+            soup = BeautifulSoup(html, "lxml")
+            title = soup.title.string if soup.title else "gallery"
+            clean_title = clean_folder_name(title)
+            folder_path = AsyncPath("data/downloads") / clean_title
+            await folder_path.mkdir(parents=True, exist_ok=True)
+
+            # If the JSON already exists, skip the whole gallery
+            json_path = folder_path / json_name
+            if await json_path.exists():
+                log.info(f"{json_name} already exists, skipping")
+                return True
+
+            log.info(f"Current page: {page + 1}  {url}")
+
+            selected = soup.select_one(IMG_SELECTOR)
+            if not selected:
+                log.warning(f"未找到选择器 {IMG_SELECTOR}")
+                continue
+
+            links = re.findall(r'<a href="(.*?)"', selected.prettify())
+            if not links:
+                log.info("本页无图片入口,视为最后一页")
+                last_page = True
+                continue
+
+            for img_entry in links:
+                if img_entry in json_data.values():
+                    last_page = True
+                    break
+                json_data[f"{img_count:04d}"] = img_entry
+                img_count += 1
+
+        if json_data:
+            await json_path.write_text(
+                json.dumps(json_data, ensure_ascii=False, indent=2), encoding="utf-8"
+            )
+            log.info(f"保存成功 -> {json_path}  ({len(json_data)} 张)")
+            return True
+        else:
+            log.warning(f"{key} 未解析到任何图片链接")
+            return False
+
+
+# -------------------- Main flow --------------------
+async def main(proxy: str | None = None) -> None:
+    targets = load_targets()
+    failed = load_failed()
+    if failed:
+        log.info(f"Retrying previously failed galleries first: {len(failed)}")
+    all_urls = list(set(targets + failed))
+
+    log.info(f"Using proxy: {proxy}")
+    limits = httpx.Limits(max_keepalive_connections=20, max_connections=50)
+    async with httpx.AsyncClient(
+        limits=limits, timeout=TIMEOUT, proxies=proxy, verify=True
+    ) as client:
+        sem = asyncio.Semaphore(CONCURRENCY)
+        results = await tqdm_asyncio.gather(
+            *[crawl_single_gallery(client, sem, u) for u in all_urls],
+            desc="Galleries",
+            total=len(all_urls),
+        )
+
+    # Persist failures
+    new_failed = [u for u, ok in zip(all_urls, results) if not ok]
+    if new_failed:
+        save_failed(new_failed)
+        log.warning(f"{len(new_failed)} galleries still failed this round; written to {FAILED_RECORD}")
+    else:
+        Path(FAILED_RECORD).unlink(missing_ok=True)
+        log.info("All galleries crawled!")
+
+
+if __name__ == "__main__":
+    try:
+        asyncio.run(main())
+    except KeyboardInterrupt:
+        log.info("用户中断,抓取结束")

+ 180 - 0
step2.py

@@ -0,0 +1,180 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+异步批量下载 EH 画廊真实图片
+python download_images.py
+"""
+from __future__ import annotations
+
+import asyncio
+import json
+import logging
+import re
+import sys
+from pathlib import Path
+from typing import Dict, List, Optional
+
+import aiofiles
+import httpx
+from aiopath import AsyncPath
+from tqdm.asyncio import tqdm_asyncio
+
+# -------------------- Configurable constants --------------------
+CONCURRENCY = 20                 # concurrent downloads
+RETRY_PER_IMG = 3                # retries per image
+TIMEOUT = httpx.Timeout(15.0)    # request timeout
+FAILED_RECORD = "failed_downloads.json"
+LOG_LEVEL = logging.INFO
+# ----------------------------------------------------
+
+logging.basicConfig(
+    level=LOG_LEVEL,
+    format="[%(asctime)s] [%(levelname)s] %(message)s",
+    handlers=[
+        logging.StreamHandler(sys.stdout),
+        logging.FileHandler("download.log", encoding="utf-8"),
+    ],
+)
+log = logging.getLogger("img_downloader")
+
+# Pre-compiled regexes
+IMG_URL_RE = re.compile(r'<img id="img" src="(.*?)"', re.S)
+EXT_RE = re.compile(r"\.(jpg|jpeg|png|gif|webp)$", re.I)
+
+
+# -------------------- Helpers --------------------
+def load_failed() -> List[Dict[str, str]]:
+    if Path(FAILED_RECORD).exists():
+        try:
+            return json.loads(Path(FAILED_RECORD).read_text(encoding="utf-8"))
+        except Exception as exc:
+            log.warning(f"Failed to load the failure record -> {exc}")
+    return []
+
+
+def save_failed(failed: List[Dict[str, str]]) -> None:
+    Path(FAILED_RECORD).write_text(json.dumps(failed, ensure_ascii=False, indent=2), encoding="utf-8")
+
+
+# -------------------- Download core --------------------
+async def download_one(
+    client: httpx.AsyncClient, sem: asyncio.Semaphore, item: Dict[str, str]
+) -> bool:
+    """Download a single image; return True on success"""
+    img_path, img_url = Path(item["img_path"]), item["img_url"]
+
+    await sem.acquire()
+    try:
+        for attempt in range(1, RETRY_PER_IMG + 1):
+            try:
+                # 1. Fetch the image detail page
+                resp = await client.get(img_url)
+                resp.raise_for_status()
+                real_url_match = IMG_URL_RE.search(resp.text)
+                if not real_url_match:
+                    log.warning(f"Could not parse the real image URL: {img_url}")
+                    return False          # <- no await happens here
+                real_url = real_url_match.group(1)
+
+                # 2. Download the real image (streamed)
+                ext_match = EXT_RE.search(real_url)
+                ext = ext_match.group(1).lower() if ext_match else "jpg"
+                final_path = img_path.with_suffix(f".{ext}")
+
+                if await AsyncPath(final_path).exists():
+                    log.info(f"已存在,跳过: {final_path.name}")
+                    return True
+
+                async with client.stream("GET", real_url) as img_resp:
+                    img_resp.raise_for_status()
+                    await AsyncPath(final_path).parent.mkdir(parents=True, exist_ok=True)
+                    async with aiofiles.open(final_path, "wb") as fp:
+                        async for chunk in img_resp.aiter_bytes(chunk_size=65536):
+                            await fp.write(chunk)
+
+                log.info(f"[OK] {final_path.name}")
+                return True
+
+            except httpx.HTTPStatusError as exc:
+                if exc.response.status_code == 429:
+                    wait = 2 ** (attempt - 1)
+                    log.warning(f"[429] 等待 {wait}s 后重试({attempt}/{RETRY_PER_IMG})")
+                    await asyncio.sleep(wait)
+                else:
+                    log.error(f"[HTTP {exc.response.status_code}] {img_url}")
+                    break
+            except Exception as exc:
+                log.error(f"[ERROR] {img_url} -> {exc} ({attempt}/{RETRY_PER_IMG})")
+                await asyncio.sleep(1)
+
+        return False
+    finally:
+        sem.release()
+
+
+# -------------------- Scan for pending downloads --------------------
+async def scan_tasks() -> List[Dict[str, str]]:
+    """Scan every JSON under downloads/ and return the list of images still to download"""
+    result = []
+    root = AsyncPath("data/downloads")
+    if not await root.exists():
+        return result
+
+    async for json_path in root.rglob("*.json"):
+        folder = json_path.parent
+        try:
+            data: Dict[str, str] = json.loads(await json_path.read_text(encoding="utf-8"))
+        except Exception as exc:
+            log.warning(f"读取 json 失败 {json_path} -> {exc}")
+            continue
+
+        for img_name, img_url in data.items():
+            img_path = folder / img_name  # no file extension yet
+            # asynchronously check whether the file already exists under any extension
+            exists = False
+            for ext in (".jpg", ".jpeg", ".png", ".gif", ".webp"):
+                if await img_path.with_suffix(ext).exists():
+                    exists = True
+                    break
+            if not exists:
+                result.append({"img_path": str(img_path), "img_url": img_url})
+
+    return result
+
+
+# -------------------- Main flow --------------------
+async def main(proxy: str | None = None) -> None:
+    # 1. Retry previous failures first
+    failed_tasks = load_failed()
+    if failed_tasks:
+        log.info(f"Retrying previously failed tasks first: {len(failed_tasks)} images")
+
+    tasks = failed_tasks + await scan_tasks()
+    if not tasks:
+        log.info("没有需要下载的图片,收工!")
+        return
+
+    limits = httpx.Limits(max_keepalive_connections=20, max_connections=50)
+    async with httpx.AsyncClient(limits=limits, timeout=TIMEOUT, proxies=proxy, verify=True) as client:
+        sem = asyncio.Semaphore(CONCURRENCY)
+        results = await tqdm_asyncio.gather(
+            *[download_one(client, sem, t) for t in tasks],
+            desc="Downloading",
+            total=len(tasks),
+        )
+
+    # 统计 & 持久化新失败
+    failed_again = [t for t, ok in zip(tasks, results) if not ok]
+    if failed_again:
+        save_failed(failed_again)
+        log.warning(f"本轮仍有 {len(failed_again)} 张下载失败,已写入 {FAILED_RECORD}")
+    else:
+        Path(FAILED_RECORD).unlink(missing_ok=True)
+        log.info("全部下载完成!")
+
+
+if __name__ == "__main__":
+    try:
+        asyncio.run(main())
+    except KeyboardInterrupt:
+        log.info("用户中断,下载结束")

+ 29 - 0
utils.py

@@ -0,0 +1,29 @@
+# utils.py
+import asyncio
+from pathlib import Path
+from typing import List
+
+from aiopath import AsyncPath
+import logging
+
+# Wrap the main logic of step1.py and step2.py as callable functions
+from step1 import main as step1_main
+from step2 import main as step2_main
+
+log = logging.getLogger("utils")
+
+async def run_step1(proxy: str | None = None) -> str:
+    try:
+        await step1_main(proxy)
+        return "画廊链接抓取完成!"
+    except Exception as e:
+        log.exception("step1 执行失败")
+        return f"抓取失败:{e}"
+
+async def run_step2(proxy: str | None = None) -> str:
+    try:
+        await step2_main(proxy)
+        return "图片下载完成!"
+    except Exception as e:
+        log.exception("step2 执行失败")
+        return f"下载失败:{e}"