@@ -1,84 +1,84 @@
 import os
 import time
-from playwright.sync_api import sync_playwright
+import httpx
+from bs4 import BeautifulSoup
 
-
-use_proxy = 1
+use_proxy = 0
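+# NOTE: use_proxy is no longer read after the httpx rewrite; to restore the old
+# proxy behaviour, pass the proxy address to the httpx.Client() calls below
+# (proxies= on httpx < 0.26, proxy= on newer versions).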
 
 base_url = 'https://jcomic.net'
 herf_url = '/eps/'
-comico_url = '%E7%99%BE%E5%90%88%E3%83%95%E3%82%A7%E3%83%81LIFE'
+# comico_url = '%E7%99%BE%E5%90%88%E3%83%95%E3%82%A7%E3%83%81LIFE'
+comico_url = '神崎咲良ハーレム化計画'
 target_url = base_url + herf_url + comico_url
-scroll_speed = 5
 
-def scroll_to_percentage(page):
-    # Scroll the browser page
-    percentage_list = [i for i in range(5, 101, scroll_speed)]
-    for percentage in percentage_list:
-        # Compute the height at the given percentage of the page
-        height = page.evaluate("() => document.body.scrollHeight")
-        scroll_position = height * (percentage / 100)
-        # Jump to that percentage position
-        page.evaluate(f"window.scrollTo({0}, {scroll_position})")
-        time.sleep(0.5)
+def save_img(folder_path, img_links):
+    with httpx.Client() as client:
+        for index, img_url in enumerate(img_links, start=1):
+            try:
+                # Request the image content
+                response = client.get(img_url)
+                if response.status_code != 200:
+                    print(f"Failed to download image {img_url}, status code: {response.status_code}")
+                    continue
+
+                # Build the file name, e.g. 0001.png, 0002.png
+                file_name = f"{str(index).zfill(4)}.png"
+                file_path = os.path.join(folder_path, file_name)
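+                # NOTE (assumption): .png is hard-coded; if the site serves mixed
+                # formats, derive the extension from the URL or the Content-Type
+                # header instead.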
+                # Save the image locally
+                with open(file_path, 'wb') as file:
+                    file.write(response.content)
+                print(f"Image saved: {file_path}")
+            except Exception as e:
+                print(f"Error downloading image {img_url}: {e}")
+            time.sleep(1)
 
 def get_imgs(folder_path, chapter_data):
-    with sync_playwright() as p:
+    img_links = []
+    with httpx.Client() as client:
         for chapter_name, url in chapter_data.items():
             # Create the chapter folder
             chapter_folder = os.path.join(folder_path, chapter_name)
             if not os.path.exists(chapter_folder):
                 os.makedirs(chapter_folder)
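+            # NOTE: these per-chapter folders are created, but in this revision
+            # save_img() writes every image into the top-level folder_path with a
+            # single global counter; adjust if per-chapter output is wanted.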
-            # Walk the chapter data and download the images in turn
-
-            if use_proxy:
-                browser = p.chromium.launch(headless=True, proxy={"server": "http://127.0.0.1:7890"})
-            else:
-                browser = p.chromium.launch(headless=True)
-            page = browser.new_page()
-            page.goto(url)
-
-            time.sleep(1)
-            print(f'Scrolling {chapter_name}')
-            scroll_to_percentage(page)
-            print(f'Finished scrolling {chapter_name}')
+            # Request the chapter page
+            response = client.get(url)
+            if response.status_code != 200:
+                print(f"Failed to fetch {url}, status code: {response.status_code}")
+                continue
+
+            # Parse the HTML
+            soup = BeautifulSoup(response.text, 'html.parser')
             # Get the parent element of the images
-            parent_locator = page.locator('body > div.container > div.row.col-lg-12.col-md-12.col-xs-12')
-
-            # Count the matching image elements
-            total_images = parent_locator.locator('img').count()
+            parent_element = soup.select_one('body > div.container > div.row.col-lg-12.col-md-12.col-xs-12')
+            if not parent_element:
+                print(f"{chapter_name}: image container not found")
+                continue
+
+            # Collect all image elements
+            img_elements = parent_element.select('img')
+            total_images = len(img_elements)
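+            # NOTE: this hard-coded CSS path mirrors the old Playwright locator
+            # and will break if the site layout changes.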
             print(f'{chapter_name}: {total_images} images in total')
-            n = 1
-            # Walk the image elements and screenshot each one
-            for page_num in range(1, total_images + 1):
-                img_locator = f'body > div.container > div.row.col-lg-12.col-md-12.col-xs-12 > img:nth-child({page_num})'
-                img_path = os.path.join(
-                    chapter_folder, f'{str(n).zfill(3)}.png')
-                try:
-                    # Don't screenshot yet; read the image src first
-                    src_urls = page.query_selector_all(img_locator)
-                    for src_url in src_urls:
-                        src = src_url.get_attribute('src')
-                        if src:
-                            page.locator(img_locator).screenshot(path=img_path)
-                            n += 1
-                except Exception:
-                    continue
-            print(f'{chapter_name} saved')
-
-            browser.close()
+            # Collect the image URLs
+            for img in img_elements:
+                img_url = img.get('src')
+                if img_url:
+                    img_links.append(img_url)
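+            # NOTE (assumption): src values are used verbatim; if the site serves
+            # relative paths or lazy-loads via data-src, normalize them first
+            # (e.g. urllib.parse.urljoin(base_url, src)).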
+    return img_links
 
-def save_urls(folder_path, chapter_data):
-    with open(os.path.join(folder_path, 'urls.txt'), 'w') as file:
-        for chapter_name, url in chapter_data.items():
-            file.write(f"{chapter_name}: {url}\n")
+def save_urls(folder_path, img_links):
+    # Define the output file path
+    save_path = os.path.join(folder_path, 'img_links.txt')
+
+    # Write the image links to the file
+    with open(save_path, 'w', encoding='utf-8') as file:
+        for link in img_links:
+            file.write(link + '\n')
+
+    print(f"Image links saved to: {save_path}")
 
 def new_folder(page_title):
     # Get the directory of the current script
@@ -97,61 +97,43 @@ def new_folder(page_title):
     return folder_path
 
-
 def get_chapter_data():
     result = {}
     page_title = ''
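+    # NOTE: page_title is no longer populated in this revision; main() names
+    # the download folder after comico_url instead.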
-    with sync_playwright() as p:
-        if use_proxy:
-            browser = p.chromium.launch(
-                headless=True, proxy={"server": "http://127.0.0.1:7890"})
-        else:
-            browser = p.chromium.launch(headless=True)
-        page = browser.new_page()
-
-        # Open the target page
-        page.goto(target_url)
-
-        # Wait for the page to finish loading
-        page.wait_for_selector('body > div.container > div:nth-child(3) > div:nth-child(2)')
-
-        # Get the page title
-        page_title = page.title()
-        if page_title:
-            page_title = page_title.replace('JComic - ', '')
-            page_title = page_title.replace(' ', '')
-        # Get all elements under the given selector
-        elements = page.query_selector_all('body > div.container > div:nth-child(3) > div:nth-child(2) a')
-        # Extract each element's URL and text
-        for element in elements:
-            url = element.get_attribute('href')
-            text = element.inner_text()
-            result[text] = base_url + url
-        # Close the browser
-        browser.close()
-    return page_title, result
+    # Use httpx to send the request
+    with httpx.Client() as client:
+        response = client.get(target_url)
+        if response.status_code == 200:
+            soup = BeautifulSoup(response.text, 'html.parser')
+
+            # Get all elements under the given selector
+            elements = soup.select('body > div.container > div:nth-child(3) > div:nth-child(2) a')
+
+            # Extract each element's URL and text
+            for element in elements:
+                url = element.get('href')
+                text = element.get_text()
+                result[text] = base_url + url
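+                # NOTE: chapters that share the same link text overwrite each
+                # other in result; key by URL or index if that matters.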
+    return result
 
 def main():
-    # 1. Get the page title and the chapter data
-    page_title, chapter_data = get_chapter_data()
-    if not page_title:
-        print('Failed to get the page title')
-        exit(0)
+    # 1. Get the chapter data
+    chapter_data = get_chapter_data()
 
     # 2. Create a folder under the current directory to hold the images; its name is the title
-    folder_path = new_folder(page_title)
+    folder_path = new_folder(comico_url)
 
-    # 3. Save the URLs into the new folder
-    save_urls(folder_path, chapter_data)
+    # 3. Walk the chapter data and collect the image links
+    img_links = get_imgs(folder_path, chapter_data)
 
-    # 4. Walk the chapter data and download the images in turn
-    get_imgs(folder_path, chapter_data)
+    # 4. Save the URLs into the new folder
+    save_urls(folder_path, img_links)
 
+    # 5. Walk img_links and save each image into folder_path, with names like 0001.png
+    save_img(folder_path, img_links)
 
 if __name__ == '__main__':
     main()
-print('done!')
+    print('done!')