import os
import time

from playwright.sync_api import sync_playwright

# Toggle routing all browser traffic through the local proxy (1 = on, 0 = off).
use_proxy = 1
base_url = 'https://jcomic.net'
herf_url = '/eps/'
comico_url = '%E7%99%BE%E5%90%88%E3%83%95%E3%82%A7%E3%83%81LIFE'
target_url = base_url + herf_url + comico_url
# Step size, in percent of page height, for the lazy-load scroll below.
scroll_speed = 5


def scroll_to_percentage(page):
    """Scroll `page` from 5% to 100% of its height in `scroll_speed`-percent
    steps so that lazily-loaded images are triggered to load.

    Args:
        page: a Playwright Page already navigated to the target URL.
    """
    percentage_list = [i for i in range(5, 101, scroll_speed)]
    for percentage in percentage_list:
        # Re-read the height on every step: it grows as images load in.
        height = page.evaluate("() => document.body.scrollHeight")
        scroll_position = height * (percentage / 100)
        # Jump to the computed percentage position.
        page.evaluate(f"window.scrollTo(0, {scroll_position})")
        time.sleep(0.5)


def _launch_browser(p):
    """Launch headless Chromium, behind the local proxy when `use_proxy` is set."""
    if use_proxy:
        return p.chromium.launch(
            headless=True, proxy={"server": "http://127.0.0.1:7890"})
    return p.chromium.launch(headless=True)


def get_imgs(folder_path, chapter_data):
    """Download every page image of every chapter.

    For each (chapter_name, url) pair a sub-folder of `folder_path` is
    created and each image element is saved as an element screenshot
    named 001.png, 002.png, ...

    Args:
        folder_path: destination directory for all chapters.
        chapter_data: mapping of chapter name -> chapter URL.
    """
    with sync_playwright() as p:
        # Launch one browser and reuse it for every chapter instead of
        # re-launching Chromium per chapter as before.
        browser = _launch_browser(p)
        for chapter_name, url in chapter_data.items():
            # Create the per-chapter folder.
            chapter_folder = os.path.join(folder_path, chapter_name)
            if not os.path.exists(chapter_folder):
                os.makedirs(chapter_folder)
            page = browser.new_page()
            page.goto(url)
            time.sleep(1)
            print(f'滚动 {chapter_name}')
            scroll_to_percentage(page)
            print(f'滚动 {chapter_name}完成')
            # Container element holding all of the chapter's page images.
            parent_locator = page.locator(
                'body > div.container > div.row.col-lg-12.col-md-12.col-xs-12')
            total_images = parent_locator.locator('img').count()
            print(f'{chapter_name} 共 {total_images} 张图片')
            n = 1
            # Walk the image elements in document order and screenshot each.
            for page_num in range(1, total_images + 1):
                img_locator = (
                    'body > div.container > '
                    'div.row.col-lg-12.col-md-12.col-xs-12 > '
                    f'img:nth-child({page_num})')
                img_path = os.path.join(
                    chapter_folder, f'{str(n).zfill(3)}.png')
                try:
                    # Check for a src attribute first; placeholder <img>
                    # elements without one are skipped (no screenshot,
                    # no counter increment).
                    src_urls = page.query_selector_all(img_locator)
                    for src_url in src_urls:
                        src = src_url.get_attribute('src')
                        if src:
                            page.locator(img_locator).screenshot(path=img_path)
                            n += 1
                except Exception:
                    # Best-effort download: one broken image must not
                    # abort the rest of the chapter.
                    continue
            print(f'{chapter_name} 保存完成')
            page.close()
        browser.close()


def save_urls(folder_path, chapter_data):
    """Write one "name: url" line per chapter into urls.txt in `folder_path`."""
    # Explicit UTF-8 so non-ASCII chapter names survive on every platform
    # (the original relied on the locale default encoding).
    with open(os.path.join(folder_path, 'urls.txt'), 'w',
              encoding='utf-8') as file:
        for chapter_name, url in chapter_data.items():
            file.write(f"{chapter_name}: {url}\n")


def new_folder(page_title):
    """Create downloads/<page_title> next to this script and return its path.

    Returns None when `page_title` is empty (caller guards against this).
    """
    # Directory containing this script.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    download_dir = os.path.join(script_dir, 'downloads')
    # BUG FIX: the original tested script_dir (the script's own directory,
    # which always exists) so this branch never created downloads/.
    if not os.path.exists(download_dir):
        os.makedirs(download_dir)
    if page_title:
        # Build and, if needed, create the per-title target folder.
        folder_path = os.path.join(download_dir, page_title)
        if not os.path.exists(folder_path):
            os.makedirs(folder_path)
        return folder_path
    return None


def get_chapter_data():
    """Open the chapter index page and scrape it.

    Returns:
        (page_title, result): the cleaned page title ('' on failure) and a
        mapping of chapter name -> absolute chapter URL.
    """
    result = {}
    page_title = ''
    with sync_playwright() as p:
        browser = _launch_browser(p)
        page = browser.new_page()
        # Open the index page and wait for the chapter-list container.
        page.goto(target_url)
        page.wait_for_selector(
            'body > div.container > div:nth-child(3) > div:nth-child(2)')
        page_title = page.title()
        if page_title:
            # Strip the site prefix and spaces to get a folder-friendly name.
            page_title = page_title.replace('JComic - ', '')
            page_title = page_title.replace(' ', '')
        # Every <a> under the chapter-list container is one chapter link.
        elements = page.query_selector_all(
            'body > div.container > div:nth-child(3) > div:nth-child(2) a')
        for element in elements:
            url = element.get_attribute('href')
            text = element.inner_text()
            result[text] = base_url + url
        browser.close()
    return page_title, result


def main():
    # 1. Scrape the page title and the chapter name -> URL mapping.
    page_title, chapter_data = get_chapter_data()
    if not page_title:
        print('获取页面标题失败')
        # BUG FIX: exit with a non-zero status on failure (original used 0).
        exit(1)
    # 2. Create downloads/<title> under the script directory.
    folder_path = new_folder(page_title)
    # 3. Save the chapter URLs alongside the images.
    save_urls(folder_path, chapter_data)
    # 4. Download every chapter's images.
    get_imgs(folder_path, chapter_data)


if __name__ == '__main__':
    main()
    print('done!')