kaizty_spider.py

import time
import asyncio
import random
import re
import json
import os
import concurrent.futures
import httpx

max_workers = 2
proxies = "http://127.0.0.1:7890"
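# A note on the two constants above: "proxies" is assumed to point at a local
# Clash (or compatible) HTTP proxy endpoint; switch_to_random_proxy() further
# down drives the Clash RESTful API on port 9090. max_workers caps the size of
# the download thread pool created in main(). Adjust both for your own setup.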

def check_urls_json_exists(key):
    # Return True if a {key}.json file already exists anywhere under downloads/
    downloads_path = os.path.join(os.getcwd(), "downloads")
    for root, dirs, files in os.walk(downloads_path):
        if f"{key}.json" in files:
            json_path = os.path.basename(root)
            print(f"The JSON file already exists in {json_path}")
            return True
    return False

def check_and_load_keys():
    # Read the keys from the keys.txt file
    keys = []
    keys_file = os.path.join(os.getcwd(), "keys.txt")
    if not os.path.exists(keys_file):
        print("keys.txt does not exist.\nCreating a new keys.txt file.")
        with open(keys_file, "w", encoding="utf-8") as f:
            f.write("")
        exit(0)
    with open(keys_file, "r", encoding="utf-8") as f:
        keys = [line.strip() for line in f if line.strip()]
    if keys:
        return keys
    else:
        print("keys.txt is empty.\nPlease fill in the keys.")
        exit(0)

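# keys.txt is read one key per line; blank lines are ignored. The example
# values below are hypothetical placeholders, only to illustrate the format:
#
#   some-gallery-key-1
#   some-gallery-key-2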

async def fetch_page(client, url):
    try:
        response = await client.get(url)
        response.raise_for_status()  # Raise if the request failed
        return response.text
    except httpx.HTTPError as e:
        print(f"Request failed: {e}")
        return None

def extract_image_links(content):
    # Extract the image links with a regular expression
    pattern = r'<meta itemprop="image" content="(.*?)">'
    image_links = re.findall(pattern, content)
    return image_links

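# Illustrative fragment of the markup the regex above is meant to match
# (hypothetical URL); re.findall() returns the captured content attribute,
# i.e. the image URL:
#
#   <meta itemprop="image" content="https://example.com/img/001.jpg">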

def clean_folder_name(title):
    # Sanitize the title so it is a legal Windows folder name
    invalid_chars = r'[<>:"/\\|?*\x00-\x1F]'
    title = re.sub(invalid_chars, '_', title)  # Replace illegal characters with underscores
    title = title.replace(" ", "")  # Remove spaces
    title = title.replace("_", "")  # Remove underscores
    return title.strip()

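# Example with a hypothetical title: clean_folder_name("My: Photo Set?") first
# becomes "My_ Photo Set_", then spaces and underscores are stripped, giving
# "MyPhotoSet".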

async def get_urls(key):
    # Check whether this key has already been crawled
    is_exists = check_urls_json_exists(key)
    if is_exists:
        print(f"{key}.json already exists, skipping crawl.")
        return
    base_url = f"https://www.kaizty.com/photos/{key}.html?page="
    data = {}
    folder_name = "default_folder"  # Default folder name
    async with httpx.AsyncClient(proxies=proxies) as client:
        n = 1
        for page in range(1, 30):
            url = base_url + str(page)
            print(f"Crawling page: {url}")
            content = await fetch_page(client, url)
            if content is None:
                print(f"Could not fetch page content: {url}")
                continue
            # Check whether the page reports no content
            if "EMPTY" in content:
                print("Page is empty, stopping the crawl.")
                break
            # Grab the title (first page only)
            if page == 1:
                title_pattern = r'<title>(.*?)</title>'
                title_match = re.search(title_pattern, content)
                if title_match:
                    title = title_match.group(1)
                    folder_name = clean_folder_name(title)
                    print(f"Page title: {title}")
                    print(f"Sanitized folder name: {folder_name}")
                else:
                    print("Could not read the page title, using the default folder name.")
            # Extract the image links
            image_links = extract_image_links(content)
            if image_links:
                print(f"Image links found on page {url}:")
                for link in image_links:
                    print(link)
                    prefix = str(n).zfill(3)
                    suffix = link.split('.')[-1]
                    img_name = f'{prefix}.{suffix}'
                    data[img_name] = link
                    n += 1
            else:
                print(f"No image links found on page {url}.")
    # Create the folder and save the data
    downloads_path = os.path.join(os.getcwd(), "downloads")
    if not os.path.exists(downloads_path):
        os.makedirs(downloads_path)
        print("Created the downloads folder.")
    folder_path = os.path.join(downloads_path, folder_name)
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
        print(f"Created folder: {folder_path}")
    data_file_path = os.path.join(folder_path, f"{key}.json")
    with open(data_file_path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=4)
    print(f"Data saved to {data_file_path}")
    return [folder_name, data_file_path]

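# On success get_urls() leaves a layout like the following under downloads/
# (folder name here is hypothetical, derived from the cleaned page title):
#
#   downloads/
#       MyPhotoSet/
#           <key>.json    # {"001.jpg": "<image url>", "002.jpg": "<image url>", ...}
#
# save_img() later drops the numbered image files into the same folder.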

def load_imgs_url_and_path():
    # Collect the URL and target path of every image that is not on disk yet
    result = []
    downloads_path = os.path.join(os.getcwd(), "downloads")
    for root, dirs, files in os.walk(downloads_path):
        for file in files:
            if file.endswith(".json"):
                json_path = os.path.join(root, file)
                with open(json_path, "r", encoding="utf-8") as f:
                    data = json.load(f)
                for img_name, img_url in data.items():
                    img_path = os.path.join(root, img_name)
                    if not os.path.exists(img_path):
                        result.append([img_path, img_url])
    return result

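# The return value is a list of [local_path, url] pairs, e.g. (hypothetical):
#   [["downloads/MyPhotoSet/001.jpg", "https://example.com/img/001.jpg"], ...]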

def save_img(client, img_path, img_url, max_retries=999):
    retries = 0
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "Accept-Encoding": "gzip, deflate, br, zstd",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Priority": "u=0, i",
        "Sec-CH-UA": '"Chromium";v="134", "Not:A-Brand";v="24", "Google Chrome";v="134"',
        "Sec-CH-UA-Mobile": "?1",
        "Sec-CH-UA-Platform": '"Android"',
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/134.0.0.0 Mobile Safari/537.36"
    }
    while retries < max_retries:
        try:
            # Download the image with the shared client and the headers above
            response = client.get(img_url, headers=headers, timeout=10)
            response.raise_for_status()  # Raise if the request failed
            # Save the image to the target path
            os.makedirs(os.path.dirname(img_path), exist_ok=True)
            with open(img_path, "wb") as f:
                f.write(response.content)
            print(f"Image downloaded and saved to {img_path}")
            time.sleep(random.uniform(1, 1.5))
            return  # Done, leave the function after a successful download
        except httpx.HTTPStatusError as e:
            switch_to_random_proxy()
            if e.response.status_code == 429:
                # On 429, honor the Retry-After header before retrying
                retry_after = int(e.response.headers.get('Retry-After', 3))
                print(f"Got a 429 response, retrying in {retry_after} seconds...")
                time.sleep(retry_after)
                retries += 1
            else:
                print(f"Failed to download image: {os.path.basename(img_path)}, status code: {e.response.status_code}")
                break
        except Exception as e:
            print(f"Error while saving the image: {e}")
            break
    if retries == max_retries:
        print(f"Image download failed, maximum number of retries reached: {img_path}")

def switch_to_random_proxy(clash_api_url="http://127.0.0.1:9090", group_name="GLOBAL"):
    """
    Randomly switch to another node in the proxy group (excluding the current node and DIRECT/REJECT).
    :param clash_api_url: Clash RESTful API address, defaults to "http://127.0.0.1:9090"
    :param group_name: Proxy group name, defaults to "GLOBAL"
    """
    try:
        # Fetch every node in the proxy group
        response = httpx.get(f"{clash_api_url}/proxies")
        response.raise_for_status()
        proxy_info = response.json()
        if group_name not in proxy_info['proxies']:
            print(f"Proxy group '{group_name}' does not exist")
            return
        group_info = proxy_info['proxies'][group_name]
        if group_info['type'] != 'Selector':
            print(f"'{group_name}' is not a Selector proxy group")
            return
        # The node currently in use
        current_node = group_info['now']
        print(f"Current node: {current_node}")
        # All selectable nodes (excluding DIRECT and REJECT)
        nodes = [node for node in group_info['all'] if node not in ["DIRECT", "REJECT"]]
        if not nodes:
            print("No proxy nodes available")
            return
        # Pick a random node other than the current one
        available_nodes = [node for node in nodes if node != current_node]
        if not available_nodes:
            print("No other proxy nodes available")
            return
        random_node = random.choice(available_nodes)
        print(f"Switching to random node: {random_node}")
        # Switch to the chosen node
        switch_url = f"{clash_api_url}/proxies/{group_name}"
        response = httpx.put(switch_url, json={"name": random_node})
        if response.status_code == 204:
            print(f"Successfully switched to node: {random_node}")
        else:
            print(f"Failed to switch node: {response.status_code}")
    except httpx.HTTPError as e:
        print(f"Request failed: {e}")

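# For reference, the shape of the Clash API data the function above relies on
# (field names are the ones actually read; node names are hypothetical):
#
#   GET  /proxies         -> {"proxies": {"GLOBAL": {"type": "Selector",
#                                                    "now": "NodeA",
#                                                    "all": ["DIRECT", "NodeA", "NodeB"]}}}
#   PUT  /proxies/GLOBAL  with body {"name": "NodeB"} -> 204 on success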

def main():
    keys = check_and_load_keys()
    # Make sure the downloads folder exists in the current working directory
    downloads_path = os.path.join(os.getcwd(), "downloads")
    if not os.path.exists(downloads_path):
        os.makedirs(downloads_path)
        print("Created the downloads folder.")
    for key in keys:
        # Run the async crawler for this key
        result = asyncio.run(get_urls(key))
        if result:
            folder_name = result[0]
            data_file_path = result[1]
            print(f"Done, folder name: {folder_name}, data saved to: {data_file_path}")
    print('URL data collected for all keys, starting image downloads')
    time.sleep(0.1)
    all_data = load_imgs_url_and_path()
    # Create one shared httpx.Client instance
    with httpx.Client(proxies=proxies) as client:
        # Download the images concurrently with a thread pool, honoring the
        # module-level max_workers setting
        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = []
            for img_path, img_url in all_data:
                futures.append(executor.submit(save_img, client, img_path, img_url))
            # Wait for every worker to finish
            for future in concurrent.futures.as_completed(futures):
                future.result()  # Re-raise any exception from the worker
    print("All images downloaded!")


if __name__ == "__main__":
    main()