| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205 |
import os
import random
import time
from urllib.parse import urljoin

import httpx
from bs4 import BeautifulSoup
# Whether requests go through the local proxy (any truthy value enables it).
use_proxy = 1

# Gallery titles to download; main() appends each one to the site's /eps/ path.
comico_urls = [
    '[PIXIV] LotteryFate (18900473)(AI)',
]
def save_img(client, folder_path, img_links):
    """Download every image in *img_links* into *folder_path*.

    Files are named after their 1-based position in *img_links*, zero-padded
    to four digits with a fixed ``.png`` suffix (``0001.png``, ``0002.png``,
    ...). A file that already exists is skipped, so a failed run can be
    resumed without re-downloading.

    Args:
        client: Object with a ``get(url)`` method whose result exposes
            ``status_code`` and ``content`` (e.g. an ``httpx.Client``).
        folder_path: Existing directory that receives the images.
        img_links: Iterable of image URLs, in the order they should be
            numbered.

    Raises:
        Exception: If a request fails, returns a non-200 status, or the file
            cannot be written. The underlying error is attached as
            ``__cause__``.
    """
    for index, img_url in enumerate(img_links, start=1):
        temp_path = None
        try:
            file_path = os.path.join(folder_path, f"{index:04d}.png")
            if os.path.exists(file_path):
                print(f"文件已存在,跳过下载: {file_path}")
                continue

            response = client.get(img_url)
            if response.status_code != 200:
                raise Exception(
                    f"无法下载图片 {img_url},状态码: {response.status_code}")

            # Stage the bytes in a side file and rename it into place only
            # once fully written. Otherwise an interrupted write would leave
            # a truncated NNNN.png that the exists-check above skips on
            # every later run.
            temp_path = file_path + '.part'
            with open(temp_path, 'wb') as file:
                file.write(response.content)
            os.replace(temp_path, file_path)
            print(f"图片已保存: {file_path}")
        except Exception as e:
            # Best-effort cleanup of the staging file; the download error is
            # the one worth reporting.
            if temp_path is not None:
                try:
                    os.remove(temp_path)
                except OSError:
                    pass
            raise Exception(f"下载图片 {img_url} 时出错: {e}") from e
def get_imgs(client, folder_path, chapter_data):
    """Collect the image URLs found on every chapter page.

    Args:
        client: Object with a ``get(url)`` method whose result exposes
            ``status_code`` and ``text``.
        folder_path: Accepted for call-site symmetry; not used here.
        chapter_data: Mapping of chapter name to chapter page URL.

    Returns:
        A flat list of the non-empty ``src`` values of all ``<img>`` tags
        inside each page's image container, in chapter order.

    Raises:
        Exception: If a page returns a non-200 status or has no image
            container. Any error is logged and re-raised so the caller's
            retry logic can react.
    """
    collected = []
    for chapter_name, url in chapter_data.items():
        try:
            page = client.get(url)
            if page.status_code != 200:
                raise Exception(f"无法访问 {url},状态码: {page.status_code}")

            # The images all sit under one wrapper element on the page.
            container = BeautifulSoup(page.text, 'html.parser').select_one(
                'body > div.container > div.row.col-lg-12.col-md-12.col-xs-12')
            if not container:
                raise Exception(f"{chapter_name} 未找到图片容器")

            images = container.select('img')
            print(f'{chapter_name} 共 {len(images)} 张图片')

            # Keep only tags that actually carry a src attribute.
            sources = (tag.get('src') for tag in images)
            collected.extend(src for src in sources if src)
        except Exception as e:
            print(f"获取图片时出错: {e}")
            raise  # propagate so the caller can retry
    return collected
def save_urls(folder_path, img_links):
    """Write the image URLs to ``img_links.txt`` in *folder_path*, one per line.

    Args:
        folder_path: Existing directory in which the list file is created
            (an existing file is overwritten).
        img_links: Iterable of URL strings.
    """
    save_path = os.path.join(folder_path, 'img_links.txt')
    with open(save_path, 'w', encoding='utf-8') as file:
        file.writelines(link + '\n' for link in img_links)
    print(f"图片链接已保存到: {save_path}")
def new_folder(page_title):
    """Create and return the download directory for *page_title*.

    The directory is ``<script dir>/downloads/<page_title>``; missing parent
    directories are created and an already existing directory is reused.

    Args:
        page_title: Name of the sub-folder. When empty or ``None`` the
            ``downloads`` root itself is created and returned, so the
            function always yields a usable path.

    Returns:
        The absolute path of the directory.
    """
    # Anchor on the script's own location rather than the working directory.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    download_dir = os.path.join(script_dir, 'downloads')

    folder_path = (
        os.path.join(download_dir, page_title) if page_title else download_dir
    )
    # exist_ok avoids the check-then-create race and covers the parents too.
    os.makedirs(folder_path, exist_ok=True)
    return folder_path
def get_chapter_data(client, target_url):
    """Fetch a gallery page and map each chapter link's text to its URL.

    Args:
        client: Object with a ``get(url)`` method whose result exposes
            ``status_code`` and ``text``.
        target_url: Absolute URL of the gallery page.

    Returns:
        Dict of ``{link text: absolute chapter URL}`` for every anchor found
        in the chapter list. Anchors without an ``href`` are skipped; links
        sharing the same text overwrite one another.

    Raises:
        Exception: If the page returns a non-200 status. Any error is logged
            and re-raised so the caller's retry logic can react.
    """
    result = {}
    try:
        response = client.get(target_url)
        if response.status_code != 200:
            raise Exception(f"无法访问 {target_url},状态码: {response.status_code}")

        soup = BeautifulSoup(response.text, 'html.parser')
        elements = soup.select(
            'body > div.container > div:nth-child(3) > div:nth-child(2) a')

        for element in elements:
            href = element.get('href')
            if not href:
                continue
            # Resolve against the page URL: the site root is not available in
            # this scope, and urljoin also copes with relative or already
            # absolute hrefs.
            result[element.get_text()] = urljoin(target_url, href)
    except Exception as e:
        print(f"获取章节数据时出错: {e}")
        raise  # propagate so the caller can retry
    return result
def main():
    """Download every gallery listed in ``comico_urls``.

    For each title the gallery URL is built and the whole pipeline (chapter
    list -> image URLs -> ``img_links.txt`` -> image files) runs inside one
    ``httpx.Client``. Any exception restarts the pipeline for that title
    after a fixed pause, up to ``max_retries`` attempts; when attempts run
    out the loop moves on to the next title.
    """
    proxy_url = 'http://127.0.0.1:7890'
    base_url = 'https://jcomic.net'
    herf_url = '/eps/'

    # Browser-like request headers sent with every request.
    custom_headers = {
        "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "accept-language": "zh-CN,zh;q=0.9",
        "cache-control": "max-age=0",
        "cookie": "_gid=GA1.2.724162267.1736817775; _ga_QL6YSDRWEV=GS1.1.1736837388.3.0.1736837388.0.0.0; _ga=GA1.2.1324234734.1736817774; _gat=1",
        "priority": "u=0, i",
        "sec-ch-ua": '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"macOS"',
        "sec-fetch-dest": "document",
        "sec-fetch-mode": "navigate",
        "sec-fetch-site": "same-origin",
        "sec-fetch-user": "?1",
        "upgrade-insecure-requests": "1",
        "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
    }

    max_retries = 999  # attempts allowed per title
    delay = 30         # fixed pause between attempts, in seconds

    for comico_url in comico_urls:
        target_url = ''.join((base_url, herf_url, comico_url))
        print(target_url)

        for attempt in range(1, max_retries + 1):
            try:
                # NOTE(review): newer httpx releases renamed the `proxies`
                # keyword to `proxy` -- confirm against the installed version.
                with httpx.Client(
                    proxies=proxy_url if use_proxy else None,
                    headers=custom_headers,
                ) as client:
                    # 1. Chapter names and their page URLs.
                    chapter_data = get_chapter_data(client, target_url)
                    print(chapter_data)
                    # 2. Output folder named after the title.
                    folder_path = new_folder(comico_url)
                    # 3. Image URLs from every chapter page.
                    img_links = get_imgs(client, folder_path, chapter_data)
                    print(img_links)
                    # 4. Keep the URL list next to the images.
                    save_urls(folder_path, img_links)
                    # 5. Download the images as 0001.png, 0002.png, ...
                    save_img(client, folder_path, img_links)
                    # Everything succeeded: stop retrying this title.
                    print('done!')
                    break
            except Exception as e:
                print(f"发生错误: {e},正在进行第 {attempt} 次重试...")
                if attempt >= max_retries:
                    print("已达到最大重试次数,程序终止。")
                    break
                print(f"等待 {delay} 秒后重试...")
                time.sleep(delay)


if __name__ == '__main__':
    main()
|