# -*- coding: utf-8 -*-
"""Download the chapter images of a comic hosted on zhuimh.com.

Example comic page: https://www.zhuimh.com/comic/419025
"""
import base64
import os
import time
from urllib.parse import urljoin

import psycopg2
from playwright.sync_api import sync_playwright


class Zhuimh:
    """Scraper that saves every chapter of one zhuimh.com comic to disk."""

    # CSS selector of the <ul> holding the chapter links on the comic page.
    CHAPTER_LIST_SELECTOR = 'body > div.tbox.tabs > div.tabs_block > ul'
    # CSS selector of the image container on a chapter page (the spelling
    # "chpater" is kept exactly as the original selector had it).
    CHAPTER_IMAGES_SELECTOR = 'body > div.chpater-images'
    # All saved images get this suffix.
    IMG_SUFFIX = '.webp'
    # Time given to the page to populate its images after scrolling. The
    # original code waited 3 ms and then slept 5 s; this is the same total.
    IMAGE_LOAD_WAIT_MS = 5000

    def __init__(self, comico_id=419025):
        """Build the comic URL.

        Args:
            comico_id: Numeric id of the comic in the site's ``/comic/<id>``
                URL. Defaults to the id the script was written for.
        """
        self.comico_id = comico_id
        self.base_url = 'https://www.zhuimh.com'
        self.href_url = '/comic/'
        self.target_url = self.base_url + self.href_url + str(self.comico_id)

    def window_scroll(self, page):
        """Scroll ``page`` to the bottom with a single mouse-wheel event.

        Args:
            page: Playwright page to scroll.
        """
        page_height = page.evaluate('() => document.body.scrollHeight')
        viewport_height = page.evaluate('() => window.innerHeight')
        # Distance between the bottom of the viewport and the bottom of the
        # document; clamped so a short page never produces an upward scroll.
        scroll_distance = max(0, page_height - viewport_height)
        page.mouse.wheel(0, scroll_distance)

    def get_chapter_img(self, chapter_name_list, chapter_url_list, path):
        """Download the images of every chapter into ``path``.

        One sub-directory per chapter is created under ``path``. A single
        browser is reused for all chapters and closed when done.

        Args:
            chapter_name_list: Chapter names, used as directory names.
            chapter_url_list: Chapter URLs, paired by position with the names.
            path: Directory that receives the chapter sub-directories.
        """
        with sync_playwright() as playwright:
            # headless=False opens a visible browser window, handy for
            # debugging.
            browser = playwright.chromium.launch(headless=False)
            try:
                for chapter_name, chapter_url in zip(chapter_name_list,
                                                     chapter_url_list):
                    print(f'章节名: {chapter_name}, 章节url: {chapter_url}')
                    page = browser.new_page()
                    try:
                        self._download_chapter(
                            page, chapter_name, chapter_url, path)
                    finally:
                        page.close()
            finally:
                browser.close()

    def _download_chapter(self, page, chapter_name, chapter_url, path):
        """Open one chapter in ``page`` and save its blob-backed images."""
        page.goto(chapter_url)
        self.window_scroll(page)
        # Wait before collecting the <img> elements so that images added
        # after the scroll are part of the query result.
        page.wait_for_timeout(self.IMAGE_LOAD_WAIT_MS)

        container = page.query_selector(self.CHAPTER_IMAGES_SELECTOR)
        if container is None:
            print(f'未找到图片容器: {chapter_url}')
            return

        chapter_file = os.path.join(path, chapter_name)
        os.makedirs(chapter_file, exist_ok=True)

        # Only images whose src is a blob: URL are saved, and only those
        # advance the counter that names the files (0001.webp, 0002.webp...).
        img_count = 1
        for image in container.query_selector_all('img'):
            img_src = image.get_attribute('src')
            if not img_src or 'blob:' not in img_src:
                continue
            img_name = str(img_count).zfill(4) + self.IMG_SUFFIX
            img_path = os.path.join(chapter_file, img_name)
            # Skip images that were already downloaded by an earlier run.
            if not os.path.exists(img_path):
                self.save_blob_as_file(page, img_src, img_path)
            img_count += 1

    def save_blob_as_file(self, page, blob_url, file_path):
        """Read a ``blob:`` URL inside ``page`` and write its bytes to disk.

        The blob only exists in the browser, so it is fetched there, encoded
        as a base64 data URL (a plain string crosses the browser/Python
        boundary unchanged) and decoded here.

        Args:
            page: Playwright page that owns the blob URL.
            blob_url: The ``blob:`` URL to read.
            file_path: Destination file; written in binary mode.
        """
        # The URL is passed as an argument instead of being formatted into
        # the script, so no quoting/escaping of the JavaScript is needed.
        data_url = page.evaluate(
            """async (blobUrl) => {
                const response = await fetch(blobUrl);
                const blob = await response.blob();
                return await new Promise((resolve, reject) => {
                    const reader = new FileReader();
                    reader.onloadend = () => resolve(reader.result);
                    reader.onerror = () => reject(reader.error);
                    reader.readAsDataURL(blob);
                });
            }""",
            blob_url,
        )

        # A data URL looks like "data:<mime>;base64,<payload>".
        _, _, encoded = (data_url or '').partition(',')
        if not encoded:
            # Nothing is written, so the image is retried on the next run.
            print(f'图片数据为空, 跳过: {file_path}')
            return

        with open(file_path, 'wb') as file:
            file.write(base64.b64decode(encoded))

    def get_chapter(self):
        """Collect the chapter names and URLs from the comic page.

        Also creates the output directory ``zhuimh/<comic name>`` next to
        this file.

        Returns:
            A ``(chapter_name_list, chapter_url_list, path)`` tuple; the two
            lists have the same length and are paired by position.

        Raises:
            SystemExit: With status 0 when the chapter list element is not
                found on the page.
        """
        chapter_name_list = []
        chapter_url_list = []
        with sync_playwright() as playwright:
            # Set headless=False to open a visible browser window for
            # debugging.
            browser = playwright.chromium.launch(headless=True)
            try:
                page = browser.new_page()
                page.goto(self.target_url)

                # The comic name is the part of the page title that precedes
                # this marker text.
                target_name = page.title().split('漫画免费')[0]
                current_path = os.path.dirname(os.path.abspath(__file__))
                path = os.path.join(current_path, 'zhuimh', target_name)
                os.makedirs(path, exist_ok=True)

                element = page.query_selector(self.CHAPTER_LIST_SELECTOR)
                if element is None:
                    print('元素未找到')
                    raise SystemExit(0)

                # Name and URL are read from the same <a>, so a link without
                # an href can never shift the pairing of the two lists.
                for link in element.query_selector_all('a'):
                    href = link.get_attribute('href')
                    if not href:
                        print('没有找到 href 属性')
                        continue
                    text = link.text_content() or ''
                    name = ' '.join(
                        line.strip()
                        for line in text.split('\n')
                        if line.strip()
                    )
                    if not name:
                        # Fall back to the last URL segment so the chapter
                        # still gets its own directory.
                        name = href.rstrip('/').rsplit('/', 1)[-1]
                    chapter_name_list.append(name)
                    chapter_url_list.append(urljoin(self.base_url, href))
            finally:
                browser.close()
        return chapter_name_list, chapter_url_list, path

    def main(self):
        """List the chapters, then download the images of each one."""
        chapter_name_list, chapter_url_list, path = self.get_chapter()
        self.get_chapter_img(chapter_name_list, chapter_url_list, path)


if __name__ == '__main__':
    zhuimh = Zhuimh()
    zhuimh.main()