| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145 |
- # -*- coding: utf-8 -*-
- # https://www.zhuimh.com/comic/419025
- # 获取 zhuimh
- import time
- import os
- import psycopg2
- from playwright.sync_api import sync_playwright
class Zhuimh:
    """Download the chapter images of one comic hosted on zhuimh.com.

    The comic's landing page is read to collect chapter names and URLs, then
    every chapter page is opened in Chromium and each ``<img>`` whose ``src``
    is a ``blob:`` URL is written to disk as a numbered ``.webp`` file.
    """

    # Milliseconds to wait after scrolling a chapter page before its images
    # are collected. Presumably the site inserts images lazily; tune as needed.
    IMAGE_LOAD_WAIT_MS = 5000

    def __init__(self, comico_id=419025):
        """Build the landing-page URL for the comic.

        Args:
            comico_id: Numeric comic identifier used in the ``/comic/<id>``
                URL. Defaults to the id that was previously hard-coded.
        """
        self.comico_id = comico_id
        self.base_url = 'https://www.zhuimh.com'
        self.href_url = '/comic/'
        self.target_url = self.base_url + self.href_url + str(self.comico_id)

    def window_scroll(self, page):
        """Scroll ``page`` to the bottom with a single mouse-wheel event.

        Args:
            page: Playwright page to scroll.
        """
        page_height = page.evaluate('() => document.body.scrollHeight')
        viewport_height = page.evaluate('() => window.innerHeight')
        # A page shorter than the viewport would yield a negative distance
        # (an upward scroll); clamp so that case is a no-op.
        scroll_distance = max(0, page_height - viewport_height)
        page.mouse.wheel(0, scroll_distance)

    def get_chapter_img(self, chapter_name_list, chapter_url_list, path):
        """Download the images of every chapter into ``path/<chapter name>``.

        Args:
            chapter_name_list: Chapter names, used as directory names.
            chapter_url_list: Chapter URLs, paired with the names by position.
            path: Directory of the comic that receives the chapter folders.
        """
        with sync_playwright() as playwright:
            # One browser serves all chapters; only the page is per chapter.
            # headless=False shows the browser window, which helps debugging.
            browser = playwright.chromium.launch(headless=False)
            try:
                for chapter_name, chapter_url in zip(chapter_name_list,
                                                     chapter_url_list):
                    print(f'章节名: {chapter_name}, 章节url: {chapter_url}')
                    chapter_file = os.path.join(path, chapter_name)
                    page = browser.new_page()
                    try:
                        self._save_chapter_images(page, chapter_url,
                                                  chapter_file)
                    finally:
                        page.close()
            finally:
                browser.close()

    def _save_chapter_images(self, page, chapter_url, chapter_file):
        """Open one chapter page and save its blob images to ``chapter_file``."""
        page.goto(chapter_url)
        self.window_scroll(page)
        # wait_for_timeout takes milliseconds. Waiting happens BEFORE the
        # <img> elements are collected so images added after the scroll are
        # included, and it does not block Playwright the way time.sleep does.
        page.wait_for_timeout(self.IMAGE_LOAD_WAIT_MS)

        # NOTE(review): 'chpater-images' looks misspelled but is kept as is;
        # it may be the class name the site really uses. Confirm on the page.
        container = page.query_selector('body > div.chpater-images')
        if container is None:
            print(f'未找到章节图片容器, 跳过: {chapter_url}')
            return

        os.makedirs(chapter_file, exist_ok=True)
        # Every image is stored with a .webp suffix.
        img_suffix = '.webp'
        img_count = 0
        for link in container.query_selector_all('img'):
            img_src = link.get_attribute('src')
            # Skip <img> tags without a src and those that are not blob URLs.
            if not img_src or 'blob:' not in img_src:
                continue
            img_count += 1
            img_name = str(img_count).zfill(4) + img_suffix
            img_path = os.path.join(chapter_file, img_name)
            # Files that already exist are not downloaded again.
            if not os.path.exists(img_path):
                self.save_blob_as_file(page, img_src, img_path)

    def save_blob_as_file(self, page, blob_url, file_path):
        """Fetch ``blob_url`` inside ``page`` and write its bytes to a file.

        The blob only exists inside the browser page, so it is fetched there,
        encoded as base64 (``page.evaluate`` can only return serializable
        values, not an ArrayBuffer) and decoded again on the Python side.

        Args:
            page: Playwright page that owns the blob URL.
            blob_url: The ``blob:`` URL to read.
            file_path: Destination file; overwritten if it exists.
        """
        import base64

        # The URL is passed as an argument instead of being formatted into
        # the script, so no quoting or brace escaping is needed.
        script = """
        async (url) => {
            const response = await fetch(url);
            const blob = await response.blob();
            return await new Promise((resolve, reject) => {
                const reader = new FileReader();
                reader.onerror = () => reject(reader.error);
                reader.onloadend = () => {
                    // Strip the "data:<mime>;base64," prefix of the data URL.
                    resolve(String(reader.result).split(',')[1] || '');
                };
                reader.readAsDataURL(blob);
            });
        }
        """
        encoded = page.evaluate(script, blob_url)
        with open(file_path, 'wb') as file:
            file.write(base64.b64decode(encoded))

    def get_chapter(self):
        """Collect chapter names and URLs from the comic's landing page.

        The output directory ``<script dir>/zhuimh/<comic name>`` is created,
        where the comic name is the part of the page title before '漫画免费'.
        If the chapter list element is missing, a message is printed and the
        process exits with status 0.

        Returns:
            A tuple ``(chapter_name_list, chapter_url_list, path)``. Names
            and URLs are taken from the same ``<a>`` tag, so they always
            line up by position.
        """
        with sync_playwright() as playwright:
            # Pass headless=False to watch the browser while debugging.
            browser = playwright.chromium.launch(headless=True)
            try:
                page = browser.new_page()
                page.goto(self.target_url)
                target_name = page.title().split('漫画免费')[0]
                current_path = os.path.dirname(os.path.abspath(__file__))
                path = os.path.join(current_path, 'zhuimh', target_name)
                os.makedirs(path, exist_ok=True)

                element = page.query_selector(
                    'body > div.tbox.tabs > div.tabs_block > ul')
                if element is None:
                    print('元素未找到')
                    raise SystemExit(0)

                chapter_name_list = []
                chapter_url_list = []
                for link in element.query_selector_all('a'):
                    href = link.get_attribute('href')
                    if not href:
                        print('没有找到 href 属性')
                        continue
                    # Collapse whitespace so the name is a single line.
                    name = ' '.join((link.text_content() or '').split())
                    if not name:
                        # No link text: fall back to the last URL segment so
                        # the chapter still gets a folder of its own.
                        name = href.rstrip('/').rsplit('/', 1)[-1]
                    chapter_name_list.append(name)
                    chapter_url_list.append(self.base_url + href)
                return chapter_name_list, chapter_url_list, path
            finally:
                browser.close()

    def main(self):
        """Fetch the chapter list, then download every chapter's images."""
        chapter_name_list, chapter_url_list, path = self.get_chapter()
        self.get_chapter_img(chapter_name_list, chapter_url_list, path)
if __name__ == '__main__':
    # Script entry point: build the downloader and run the full pipeline.
    Zhuimh().main()
|