|
|
@@ -1,19 +1,39 @@
|
|
|
+'''
|
|
|
+https://jcomic.net/
|
|
|
+'''
|
|
|
import os
|
|
|
import time
|
|
|
+import re
|
|
|
import random
|
|
|
+from urllib.parse import unquote
|
|
|
import httpx
|
|
|
from bs4 import BeautifulSoup
|
|
|
-
|
|
|
-comico_urls = [
|
|
|
- '[PIXIV] LotteryFate (18900473)(AI)',
|
|
|
-]
|
|
|
+from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
|
+
|
|
|
# Comic page URLs to download, one per line, read from a ``urls.txt`` file
# that lives next to this script.
urls_txt = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'urls.txt')

# Create an empty urls.txt on first run so there is a file to fill in.
if not os.path.exists(urls_txt):
    with open(urls_txt, 'w', encoding='utf-8'):
        pass

# Load the URL list.
# - 'utf-8-sig' reads plain UTF-8 unchanged but also drops the byte-order mark
#   that some Windows editors prepend, which would otherwise stick to the
#   first URL.
# - Blank lines are skipped: an empty entry would be treated as a URL later
#   on and fail on every retry.
with open(urls_txt, 'r', encoding='utf-8-sig') as f:
    comico_urls = [line.strip() for line in f if line.strip()]

if not comico_urls:
    # Nothing to download. SystemExit is raised directly because the bare
    # exit() helper is injected by the ``site`` module and is not guaranteed
    # to exist (e.g. under ``python -S`` or in frozen builds).
    raise SystemExit(0)

print(f'准备下载 {comico_urls}')
|
|
|
|
|
|
# 是否使用代理
|
|
|
-use_proxy = 1
|
|
|
+use_proxy = 0
|
|
|
|
|
|
|
|
|
def save_img(client, folder_path, img_links):
|
|
|
- for index, img_url in enumerate(img_links, start=1):
|
|
|
+ def download_image(index, img_url):
|
|
|
try:
|
|
|
# 生成文件名,例如 0001.png, 0002.png
|
|
|
file_name = f"{str(index).zfill(4)}.png"
|
|
|
@@ -22,13 +42,12 @@ def save_img(client, folder_path, img_links):
|
|
|
# 检查文件是否已经存在
|
|
|
if os.path.exists(file_path):
|
|
|
print(f"文件已存在,跳过下载: {file_path}")
|
|
|
- continue
|
|
|
+ return
|
|
|
|
|
|
# 发送请求获取图片内容
|
|
|
response = client.get(img_url)
|
|
|
if response.status_code != 200:
|
|
|
- raise Exception(
|
|
|
- f"无法下载图片 {img_url},状态码: {response.status_code}")
|
|
|
+ raise Exception(f"无法下载图片 {img_url},状态码: {response.status_code}")
|
|
|
|
|
|
# 保存图片到本地
|
|
|
with open(file_path, 'wb') as file:
|
|
|
@@ -36,15 +55,21 @@ def save_img(client, folder_path, img_links):
|
|
|
|
|
|
print(f"图片已保存: {file_path}")
|
|
|
except Exception as e:
|
|
|
- raise Exception(f"下载图片 {img_url} 时出错: {e}")
|
|
|
+ print(f"下载图片 {img_url} 时出错: {e}")
|
|
|
+
|
|
|
+ # 使用 ThreadPoolExecutor 进行多线程下载
|
|
|
+ with ThreadPoolExecutor(max_workers=10) as executor:
|
|
|
+ futures = []
|
|
|
+ for index, img_url in img_links.items():
|
|
|
+ futures.append(executor.submit(download_image, index, img_url))
|
|
|
|
|
|
- # random_sleep = random.uniform(2, 3)
|
|
|
- # print(f"随机休眠 {random_sleep} 秒")
|
|
|
- # time.sleep(random_sleep)
|
|
|
+ # 等待所有任务完成
|
|
|
+ for future in as_completed(futures):
|
|
|
+ future.result() # 获取任务结果,如果有异常会在这里抛出
|
|
|
|
|
|
|
|
|
def get_imgs(client, folder_path, chapter_data):
|
|
|
- img_links = []
|
|
|
+ img_links = {}
|
|
|
for chapter_name, url in chapter_data.items():
|
|
|
try:
|
|
|
# 发送请求获取页面内容
|
|
|
@@ -67,23 +92,43 @@ def get_imgs(client, folder_path, chapter_data):
|
|
|
print(f'{chapter_name} 共 {total_images} 张图片')
|
|
|
|
|
|
# 输出图片的 URL
|
|
|
+ page = 1
|
|
|
for img in img_elements:
|
|
|
img_url = img.get('src')
|
|
|
if img_url:
|
|
|
- img_links.append(img_url)
|
|
|
+ img_links[str(page).zfill(4)] = img_url
|
|
|
+ page += 1
|
|
|
except Exception as e:
|
|
|
print(f"获取图片时出错: {e}")
|
|
|
raise # 抛出异常,触发重试逻辑
|
|
|
return img_links
|
|
|
|
|
|
|
|
|
def new_file_name(file_name):
    """
    Turn an arbitrary title into a name that is valid as a Windows folder name.

    Characters Windows forbids in file names (``\\ / : * ? " < > |`` and the
    control characters 0x00-0x1F) are removed outright rather than replaced.
    Leading whitespace and trailing whitespace/dots are then stripped, since
    Windows does not allow a name to end with a space or a period.

    :param file_name: original (already URL-decoded) name
    :return: the sanitized name; this is an empty string when nothing legal
             is left, so callers that need a non-empty name must check for it
    """
    # Reserved punctuation plus ASCII control characters.
    illegal_chars = r'[\\/:*?"<>|\x00-\x1f]'
    safe_name = re.sub(illegal_chars, '', file_name)

    # Strip surrounding whitespace first, then any run of trailing dots and
    # whitespace (e.g. "title. . " -> "title").
    safe_name = safe_name.strip()
    safe_name = re.sub(r'[\s.]+$', '', safe_name)

    return safe_name
|
|
|
+
|
|
|
+
|
|
|
def save_urls(folder_path, img_links):
    """
    Write the collected image URLs to ``img_links.txt`` inside *folder_path*.

    One URL is written per line, in iteration order. Any existing file is
    overwritten.

    :param folder_path: directory that receives ``img_links.txt``
    :param img_links: either a dict whose values are the image URLs (its keys
                      are ignored here) or a plain iterable of URL strings
    """
    # Path of the output file.
    save_path = os.path.join(folder_path, 'img_links.txt')

    # Only the URLs are needed; accept both the dict form and a plain list.
    links = img_links.values() if isinstance(img_links, dict) else img_links

    # Write every link on its own line.
    with open(save_path, 'w', encoding='utf-8') as file:
        file.writelines(link + '\n' for link in links)

    print(f"图片链接已保存到: {save_path}")
|
|
|
@@ -107,7 +152,7 @@ def new_folder(page_title):
|
|
|
return folder_path
|
|
|
|
|
|
|
|
|
-def get_chapter_data(client, target_url):
|
|
|
+def get_chapter_data(client, target_url, base_url):
|
|
|
result = {}
|
|
|
page_title = ''
|
|
|
|
|
|
@@ -121,12 +166,16 @@ def get_chapter_data(client, target_url):
|
|
|
# 获取指定选择器下的所有元素
|
|
|
elements = soup.select(
|
|
|
'body > div.container > div:nth-child(3) > div:nth-child(2) a')
|
|
|
+ if elements:
|
|
|
+ # 提取每个元素的 URL 和文本
|
|
|
+ for element in elements:
|
|
|
+ url = element.get('href')
|
|
|
+ text = element.get_text()
|
|
|
+ result[text] = base_url + url
|
|
|
+ else:
|
|
|
+ # 这里是只有第一话的情况
|
|
|
+ result['第1话'] = target_url.replace('eps', 'page')
|
|
|
|
|
|
- # 提取每个元素的 URL 和文本
|
|
|
- for element in elements:
|
|
|
- url = element.get('href')
|
|
|
- text = element.get_text()
|
|
|
- result[text] = base_url + url
|
|
|
except Exception as e:
|
|
|
print(f"获取章节数据时出错: {e}")
|
|
|
raise # 抛出异常,触发重试逻辑
|
|
|
@@ -137,7 +186,6 @@ def get_chapter_data(client, target_url):
|
|
|
def main():
|
|
|
proxy_url = 'http://127.0.0.1:7890'
|
|
|
base_url = 'https://jcomic.net'
|
|
|
- herf_url = '/eps/'
|
|
|
# 自定义请求头
|
|
|
custom_headers = {
|
|
|
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
|
|
|
@@ -157,8 +205,11 @@ def main():
|
|
|
}
|
|
|
|
|
|
for comico_url in comico_urls:
|
|
|
- target_url = base_url + herf_url + comico_url
|
|
|
- print(target_url)
|
|
|
+ # 处理url,并获取文件名
|
|
|
+ file_name = new_file_name(unquote(comico_url.split('/')[-1]))
|
|
|
+
|
|
|
+ target_url = comico_url
|
|
|
+ print(file_name)
|
|
|
# 最大重试次数
|
|
|
max_retries = 999
|
|
|
retry_count = 0
|
|
|
@@ -168,18 +219,16 @@ def main():
|
|
|
# 创建 httpx.Client 实例,并设置自定义请求头
|
|
|
with httpx.Client(proxies=proxy_url if use_proxy else None, headers=custom_headers) as client:
|
|
|
# 1, 获取页面章节数据
|
|
|
- chapter_data = get_chapter_data(client, target_url)
|
|
|
- print(chapter_data)
|
|
|
+ chapter_data = get_chapter_data(client, target_url, base_url)
|
|
|
|
|
|
# 2, 在当前文件夹下创建一个文件夹,用来保存图片, 文件名称是 title
|
|
|
- folder_path = new_folder(comico_url)
|
|
|
+ folder_path = new_folder(file_name)
|
|
|
|
|
|
# 3, 遍历章节数据,获取img的链接
|
|
|
img_links = get_imgs(client, folder_path, chapter_data)
|
|
|
- print(img_links)
|
|
|
|
|
|
# 4, 保存url到新建的文件夹中
|
|
|
- save_urls(folder_path, img_links)
|
|
|
+ #save_urls(folder_path, img_links)
|
|
|
|
|
|
# 5,遍历 img_links ,将图片保存到 folder_path中, 保存的文件名类似 0001.png
|
|
|
save_img(client, folder_path, img_links)
|
|
|
@@ -195,11 +244,10 @@ def main():
|
|
|
print("已达到最大重试次数,程序终止。")
|
|
|
break
|
|
|
|
|
|
- # 固定延迟 10 分钟(600 秒)
|
|
|
- delay = 30
|
|
|
+ delay = 5
|
|
|
print(f"等待 {delay} 秒后重试...")
|
|
|
time.sleep(delay)
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
|
|
|
- main()
|
|
|
+ main()
|