#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Asynchronously crawl E-H gallery image links in bulk and save them as one
JSON file per gallery.

Usage:
    python eh_crawler.py

NOTE(review): the paste header names this file ``step1.py``; confirm which
script name the usage line should show.
"""
from __future__ import annotations

import asyncio
import json
import logging
import os
import re
import sys
from pathlib import Path
from typing import Dict, List, Optional

import httpx
from bs4 import BeautifulSoup
from tqdm.asyncio import tqdm_asyncio
  20. # -------------------- 可配置常量 --------------------
  21. from config import config
  22. CONCURRENCY = config.concurrency
  23. MAX_PAGE = config.max_page
  24. RETRY_PER_PAGE = config.retry_per_page
  25. TIMEOUT = httpx.Timeout(config.timeout)
  26. IMG_SELECTOR = "#gdt" # 图片入口区域
  27. FAILED_RECORD = "data/failed_keys.json"
  28. LOG_LEVEL = getattr(logging, config.log_level.upper())
  29. # ----------------------------------------------------
  30. # 确保数据目录存在
  31. if not os.path.exists("data"):
  32. os.mkdir("data")
  33. # 使用统一的日志配置
  34. from logger import get_logger
  35. from realtime_logger import realtime_logger
  36. log = get_logger("step1", "crawl.log")
  37. # 预编译正则
  38. ILLEGAL_CHARS = re.compile(r'[<>:"/\\|?*\x00-\x1F]')
  39. # -------------------- 工具函数 --------------------
  40. def clean_folder_name(title: str) -> str:
  41. """清洗文件夹名"""
  42. return ILLEGAL_CHARS.sub("_", title).replace(" ", "").replace("_", "").strip() or "gallery"
  43. def load_targets() -> List[str]:
  44. """读取 targets.txt"""
  45. tgt = Path("data/targets.txt")
  46. with open(tgt, 'r', encoding='utf-8') as f:
  47. urls = [line.strip() for line in f.readlines() if line.strip()]
  48. lines = []
  49. for ln in tgt.read_text(encoding="utf-8").splitlines():
  50. url = ln.strip()
  51. if url and not url.startswith('#'):
  52. lines.append(url)
  53. if not lines:
  54. log.error("targets.txt 为空,请先填写 URL")
  55. return
  56. return list(set(lines)) # 去重
  57. def load_failed() -> List[str]:
  58. if Path(FAILED_RECORD).exists():
  59. try:
  60. return json.loads(Path(FAILED_RECORD).read_text(encoding="utf-8"))
  61. except Exception as exc:
  62. log.warning(f"加载失败记录失败 -> {exc}")
  63. return []
  64. def save_failed(keys: List[str]) -> None:
  65. Path(FAILED_RECORD).write_text(json.dumps(keys, ensure_ascii=False, indent=2), encoding="utf-8")
# -------------------- Crawler core --------------------
  67. async def fetch_page(client: httpx.AsyncClient, url: str) -> Optional[str]:
  68. """获取单页 HTML"""
  69. for attempt in range(1, RETRY_PER_PAGE + 1):
  70. try:
  71. resp = await client.get(url)
  72. resp.raise_for_status()
  73. return resp.text
  74. except httpx.HTTPError as exc:
  75. log.error(f"[{attempt}/{RETRY_PER_PAGE}] 请求失败 {url} -> {exc}")
  76. await asyncio.sleep(2 ** attempt)
  77. return None
  78. async def crawl_single_gallery(
  79. client: httpx.AsyncClient, sem: asyncio.Semaphore, gallery_url: str
  80. ) -> bool:
  81. """抓取单个画廊,成功返回 True"""
  82. async with sem:
  83. base_url = gallery_url.rstrip("/")
  84. key = base_url.split("/")[-1] # 用最后一截当 key
  85. json_name = f"{key}.json"
  86. folder_path: Optional[Path] = None
  87. json_data: Dict[str, str] = {}
  88. img_count = 1
  89. last_page = False
  90. for page in range(MAX_PAGE):
  91. if last_page:
  92. break
  93. url = f"{base_url}?p={page}"
  94. html = await fetch_page(client, url)
  95. if html is None:
  96. continue
  97. soup = BeautifulSoup(html, "lxml")
  98. title = soup.title.string if soup.title else "gallery"
  99. clean_title = clean_folder_name(title)
  100. folder_path = Path("data/downloads") / clean_title
  101. folder_path.mkdir(parents=True, exist_ok=True)
  102. # 如果 json 已存在则跳过整个画廊
  103. json_path = folder_path / json_name
  104. if json_path.exists():
  105. log.info(f"{json_name} 已存在,跳过")
  106. return True
  107. log.info(f"当前页码:{page + 1} {url}")
  108. selected = soup.select_one(IMG_SELECTOR)
  109. if not selected:
  110. log.warning(f"未找到选择器 {IMG_SELECTOR}")
  111. continue
  112. links = re.findall(r'<a href="(.*?)"', selected.prettify())
  113. if not links:
  114. log.info("本页无图片入口,视为最后一页")
  115. last_page = True
  116. continue
  117. for img_entry in links:
  118. if img_entry in json_data.values():
  119. last_page = True
  120. break
  121. json_data[f"{img_count:04d}"] = img_entry
  122. img_count += 1
  123. if json_data:
  124. json_path.write_text(
  125. json.dumps(json_data, ensure_ascii=False, indent=2), encoding="utf-8"
  126. )
  127. log.info(f"保存成功 -> {json_path} ({len(json_data)} 张)")
  128. # 发送实时日志
  129. try:
  130. realtime_logger.broadcast_log_sync(f"画廊 {key} 抓取完成,共 {len(json_data)} 张图片", "SUCCESS", "step1")
  131. except Exception as e:
  132. log.warning(f"发送实时日志失败: {e}")
  133. return True
  134. else:
  135. log.warning(f"{key} 未解析到任何图片链接")
  136. # 发送实时日志
  137. try:
  138. realtime_logger.broadcast_log_sync(f"画廊 {key} 未解析到任何图片链接", "WARNING", "step1")
  139. except Exception as e:
  140. log.warning(f"发送实时日志失败: {e}")
  141. return False
# -------------------- Main flow --------------------
  143. async def main(proxy: str | None = None) -> None:
  144. targets = load_targets()
  145. failed = load_failed()
  146. if failed:
  147. log.info(f"优先重试上次失败画廊: {len(failed)} 个")
  148. all_urls = list(set(targets + failed))
  149. print(proxy)
  150. limits = httpx.Limits(max_keepalive_connections=20, max_connections=50)
  151. async with httpx.AsyncClient(
  152. limits=limits, timeout=TIMEOUT, proxies=proxy, verify=True
  153. ) as client:
  154. sem = asyncio.Semaphore(CONCURRENCY)
  155. results = await tqdm_asyncio.gather(
  156. *[crawl_single_gallery(client, sem, u) for u in all_urls],
  157. desc="Galleries",
  158. total=len(all_urls),
  159. )
  160. # 失败持久化
  161. new_failed = [u for u, ok in zip(all_urls, results) if not ok]
  162. if new_failed:
  163. save_failed(new_failed)
  164. log.warning(f"本轮仍有 {len(new_failed)} 个画廊失败,已写入 {FAILED_RECORD}")
  165. else:
  166. Path(FAILED_RECORD).unlink(missing_ok=True)
  167. log.info("全部画廊抓取完成!")
  168. if __name__ == "__main__":
  169. try:
  170. asyncio.run(main())
  171. except KeyboardInterrupt:
  172. log.info("用户中断,抓取结束")