main.py 8.6 KB

'''
https://jcomic.net/
'''
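# Usage sketch (not part of the original listing): put one series URL per line
# into a urls.txt file next to this script, e.g. a hypothetical entry of the
# form https://jcomic.net/eps/<series-name>; the script then downloads every
# chapter's images into downloads/<series-name>/ as 0001.png, 0002.png, ...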
import os
import time
import re
from urllib.parse import unquote

import httpx
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor, as_completed

comico_urls = []
urls_txt = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'urls.txt')
# Create an empty urls.txt if it does not exist yet
if not os.path.exists(urls_txt):
    with open(urls_txt, 'w') as f:
        f.write('')
# Read the list of comic URLs
with open(urls_txt, 'r', encoding='utf-8') as f:
    lines = f.readlines()
for line in lines:
    comico_urls.append(line.strip())  # drop the trailing newline
if not comico_urls:
    exit(0)
else:
    print(f'Preparing to download {comico_urls}')

# Whether to route requests through the proxy configured in main()
use_proxy = 0


def save_img(client, folder_path, img_links):
    def download_image(index, img_url):
        try:
            # Build the file name, e.g. 0001.png, 0002.png
            file_name = f"{str(index).zfill(4)}.png"
            file_path = os.path.join(folder_path, file_name)
            # Skip images that were already downloaded
            if os.path.exists(file_path):
                print(f"File already exists, skipping download: {file_path}")
                return
            # Fetch the image content
            response = client.get(img_url)
            if response.status_code != 200:
                raise Exception(f"Failed to download image {img_url}, status code: {response.status_code}")
            # Write the image to disk
            with open(file_path, 'wb') as file:
                file.write(response.content)
            print(f"Image saved: {file_path}")
        except Exception as e:
            print(f"Error downloading image {img_url}: {e}")

    # Download the images concurrently with a thread pool
    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = []
        for index, img_url in img_links.items():
            futures.append(executor.submit(download_image, index, img_url))
        # Wait for every task; any exception is re-raised here
        for future in as_completed(futures):
            future.result()


def get_imgs(client, folder_path, chapter_data):
    img_links = {}
    # Number pages continuously across chapters so that keys (and the
    # resulting file names) do not collide when a comic has several chapters
    page = 1
    for chapter_name, url in chapter_data.items():
        try:
            # Fetch the chapter page
            response = client.get(url)
            if response.status_code != 200:
                raise Exception(f"Failed to access {url}, status code: {response.status_code}")
            # Parse the HTML
            soup = BeautifulSoup(response.text, 'html.parser')
            # Locate the element that wraps the page images
            parent_element = soup.select_one(
                'body > div.container > div.row.col-lg-12.col-md-12.col-xs-12')
            if not parent_element:
                raise Exception(f"Image container not found for {chapter_name}")
            # Collect every image element
            img_elements = parent_element.select('img')
            total_images = len(img_elements)
            print(f'{chapter_name}: {total_images} images')
            # Record the image URLs
            for img in img_elements:
                img_url = img.get('src')
                if img_url:
                    img_links[str(page).zfill(4)] = img_url
                    page += 1
        except Exception as e:
            print(f"Error collecting image links: {e}")
            raise  # re-raise so the retry logic in main() kicks in
    return img_links
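# Illustrative shape of the mapping returned above (placeholder values, not
# real links from the site):
# {'0001': 'https://.../p1.jpg', '0002': 'https://.../p2.jpg', ...}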


def new_file_name(file_name):
    """
    Remove characters that are illegal in Windows folder names.
    :param file_name: original file name
    :return: a sanitized file name
    """
    # Characters the Windows file system does not allow
    illegal_chars = r'[\\/:*?"<>|]'
    # Strip the illegal characters outright
    safe_name = re.sub(illegal_chars, '', file_name)
    # Trim leading/trailing whitespace, if any
    safe_name = safe_name.strip()
    return safe_name
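# For example, new_file_name(' A/B: C? ') returns 'AB C': the slash, colon and
# question mark are removed, then the surrounding spaces are stripped.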


def save_urls(folder_path, img_links):
    # Where the link list will be written
    save_path = os.path.join(folder_path, 'img_links.txt')
    # Write one image URL per line
    with open(save_path, 'w', encoding='utf-8') as file:
        for num, link in img_links.items():
            file.write(link + '\n')
    print(f"Image links saved to: {save_path}")


def new_folder(page_title):
    # Directory that contains this script
    script_dir = os.path.dirname(os.path.abspath(__file__))
    download_dir = os.path.join(script_dir, 'downloads')
    # Make sure the downloads directory exists
    if not os.path.exists(download_dir):
        os.makedirs(download_dir)
    if page_title:
        # Build the target folder path
        folder_path = os.path.join(download_dir, page_title)
        # Create the folder if it does not exist yet
        if not os.path.exists(folder_path):
            os.makedirs(folder_path)
        return folder_path


def get_chapter_data(client, target_url, base_url):
    result = {}
    try:
        response = client.get(target_url)
        if response.status_code != 200:
            raise Exception(f"Failed to access {target_url}, status code: {response.status_code}")
        soup = BeautifulSoup(response.text, 'html.parser')
        # All chapter links under the given selector
        elements = soup.select(
            'body > div.container > div:nth-child(3) > div:nth-child(2) a')
        if elements:
            # Extract the URL and title text of every chapter link
            for element in elements:
                url = element.get('href')
                text = element.get_text()
                result[text] = base_url + url
        else:
            # Comics with only a single chapter have no chapter list
            result['Chapter 1'] = target_url.replace('eps', 'page')
    except Exception as e:
        print(f"Error fetching chapter data: {e}")
        raise  # re-raise so the retry logic in main() kicks in
    return result
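# Illustrative shape of the mapping returned above (placeholder values): the
# chapter title text from the page keyed to its absolute URL, e.g.
# {'Chapter 1': 'https://jcomic.net/...', 'Chapter 2': 'https://jcomic.net/...'}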


def main():
    proxy_url = 'http://127.0.0.1:7890'
    base_url = 'https://jcomic.net'
    # Custom request headers
    custom_headers = {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "accept-language": "zh-CN,zh;q=0.9",
        "cache-control": "max-age=0",
        "cookie": "_gid=GA1.2.724162267.1736817775; _ga_QL6YSDRWEV=GS1.1.1736837388.3.0.1736837388.0.0.0; _ga=GA1.2.1324234734.1736817774; _gat=1",
        "priority": "u=0, i",
        "sec-ch-ua": '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"macOS"',
        "sec-fetch-dest": "document",
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "same-origin",
        "sec-fetch-user": "?1",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
    }
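    # Note: these header and cookie values look like they were captured from a
    # browser session; if the site rejects requests, refreshing them from a
    # current session is presumably required.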
    for comico_url in comico_urls:
        # Derive the folder name from the last path segment of the URL
        file_name = new_file_name(unquote(comico_url.split('/')[-1]))
        target_url = comico_url
        print(file_name)
        # Maximum number of retries
        max_retries = 999
        retry_count = 0
        while retry_count < max_retries:
            try:
                # Create an httpx.Client with the custom headers
                # (newer httpx releases may expect `proxy=` instead of `proxies=`)
                with httpx.Client(proxies=proxy_url if use_proxy else None, headers=custom_headers) as client:
                    # 1. Fetch the chapter list for this comic
                    chapter_data = get_chapter_data(client, target_url, base_url)
                    # 2. Create a folder named after the title to hold the images
                    folder_path = new_folder(file_name)
                    # 3. Walk the chapters and collect the image links
                    img_links = get_imgs(client, folder_path, chapter_data)
                    # 4. Optionally save the links into the new folder
                    # save_urls(folder_path, img_links)
                    # 5. Download every image in img_links into folder_path as 0001.png, 0002.png, ...
                    save_img(client, folder_path, img_links)
                # Everything succeeded, stop retrying
                print('done!')
                break
            except Exception as e:
                retry_count += 1
                print(f"Error: {e}, starting retry attempt {retry_count}...")
                if retry_count >= max_retries:
                    print("Maximum number of retries reached, aborting.")
                    break
                delay = 5
                print(f"Waiting {delay} seconds before retrying...")
                time.sleep(delay)


if __name__ == '__main__':
    main()