# -*- coding: utf-8 -*-
# https://www.zhuimh.com/comic/419025
# 获取 zhuimh
import base64
import os
import time

import psycopg2
from playwright.sync_api import sync_playwright
class Zhuimh:
    """Download every chapter image of one zhuimh.com comic with Playwright.

    Images are written to ``<script dir>/zhuimh/<comic title>/<chapter name>/``
    as sequentially numbered ``.webp`` files (``0001.webp``, ``0002.webp``, ...).
    """

    # Milliseconds to wait after scrolling so the reader page can finish
    # creating its blob-backed <img> elements before they are collected.
    _IMAGE_LOAD_WAIT_MS = 5000

    # Runs inside the page: fetch a blob: URL and resolve with its content as a
    # base64 "data:" URL. A string is returned because page.evaluate() can only
    # hand JSON-serializable values back to Python (an ArrayBuffer cannot be
    # written to a file). This is deliberately a plain string, not an f-string,
    # so the JavaScript braces are not parsed as replacement fields.
    _BLOB_TO_DATA_URL_JS = """
        async (url) => {
            const response = await fetch(url);
            const blob = await response.blob();
            return await new Promise((resolve, reject) => {
                const reader = new FileReader();
                reader.onloadend = () => resolve(reader.result);
                reader.onerror = () => reject(reader.error);
                reader.readAsDataURL(blob);
            });
        }
    """

    def __init__(self):
        """Set the comic id and derive the comic's landing-page URL."""
        self.comico_id = 419025
        self.base_url = 'https://www.zhuimh.com'
        self.href_url = '/comic/'
        self.target_url = self.base_url + self.href_url + str(self.comico_id)

    def window_scroll(self, page):
        """Scroll *page* to the bottom with a single mouse-wheel event.

        Args:
            page: Playwright page to scroll.
        """
        page_height = page.evaluate('() => document.body.scrollHeight')
        viewport_height = page.evaluate('() => window.innerHeight')
        # Distance from the top of the first screen to the top of the last one.
        scroll_distance = page_height - viewport_height
        page.mouse.wheel(0, scroll_distance)

    def get_chapter_img(self, chapter_name_list, chapter_url_list, path):
        """Download the images of every chapter into per-chapter directories.

        Names and URLs are paired positionally; iteration stops at the end of
        the shorter list. One browser is launched for the whole run and each
        chapter is opened in its own page.

        Args:
            chapter_name_list: Chapter names, used as directory names.
            chapter_url_list: Absolute chapter URLs, parallel to the names.
            path: Directory under which the chapter directories are created.
        """
        with sync_playwright() as playwright:
            # headless=False opens a visible browser window, handy for debugging.
            browser = playwright.chromium.launch(headless=False)
            try:
                for chapter_name, chapter_url in zip(chapter_name_list, chapter_url_list):
                    print(f'章节名: {chapter_name}, 章节url: {chapter_url}')
                    self._download_chapter(browser, chapter_name, chapter_url, path)
            finally:
                browser.close()

    def _download_chapter(self, browser, chapter_name, chapter_url, path):
        """Open one chapter page and save each blob-backed image it shows."""
        page = browser.new_page()
        try:
            page.goto(chapter_url)
            self.window_scroll(page)
            # Wait before querying so images inserted after the scroll are
            # part of the collected element list.
            page.wait_for_timeout(self._IMAGE_LOAD_WAIT_MS)

            container = page.query_selector('body > div.chpater-images')
            if container is None:
                print(f'Image container not found, skipping: {chapter_url}')
                return

            chapter_file = os.path.join(path, chapter_name)
            os.makedirs(chapter_file, exist_ok=True)

            # The original author notes that every image on the site is webp.
            img_suffix = '.webp'
            img_count = 1
            for link in container.query_selector_all('img'):
                img_src = link.get_attribute('src')
                # Only blob-backed images are saved; an <img> without a src
                # or with any other kind of URL is ignored and not numbered.
                if not img_src or 'blob:' not in img_src:
                    continue
                img_name = str(img_count).zfill(4) + img_suffix
                img_path = os.path.join(chapter_file, img_name)
                # Skip files that already exist so an interrupted run can resume.
                if not os.path.exists(img_path):
                    self.save_blob_as_file(page, img_src, img_path)
                img_count += 1
        finally:
            page.close()

    def save_blob_as_file(self, page, blob_url, file_path):
        """Write the content behind a ``blob:`` URL of *page* to *file_path*.

        Args:
            page: Playwright page that owns the blob URL.
            blob_url: The ``blob:`` URL taken from an image's ``src``.
            file_path: Destination file; it is overwritten if it exists.
        """
        # The URL is passed as an argument instead of being spliced into the
        # script text, so quotes in the URL cannot break the JavaScript.
        data_url = page.evaluate(self._BLOB_TO_DATA_URL_JS, blob_url)
        # Drop the "data:<mime>;base64," prefix; what follows is the payload.
        _, _, encoded = data_url.partition(',')
        with open(file_path, 'wb') as file:
            file.write(base64.b64decode(encoded))

    def get_chapter(self):
        """Read the comic page and collect its chapter names and URLs.

        Also creates the comic's output directory.

        Returns:
            A ``(chapter_name_list, chapter_url_list, path)`` tuple: two
            parallel lists plus the comic's output directory.

        Raises:
            SystemExit: With code 0 when the chapter list element is missing.
        """
        chapter_name_list = []
        chapter_url_list = []
        with sync_playwright() as playwright:
            # headless=False would open a visible browser window for debugging.
            browser = playwright.chromium.launch(headless=True)
            try:
                page = browser.new_page()
                page.goto(self.target_url)

                # The comic name is whatever precedes '漫画免费' in the title.
                target_name = page.title().split('漫画免费')[0]
                current_path = os.path.dirname(os.path.abspath(__file__))
                path = os.path.join(current_path, 'zhuimh', target_name)
                os.makedirs(path, exist_ok=True)

                element = page.query_selector(
                    'body > div.tbox.tabs > div.tabs_block > ul'
                )
                if element is None:
                    print('元素未找到')
                    raise SystemExit(0)

                for link in element.query_selector_all('a'):
                    href = link.get_attribute('href')
                    if not href:
                        print('没有找到 href 属性')
                        continue
                    # Take the name from the same anchor as the URL so the two
                    # lists cannot drift apart, whatever the list's markup.
                    name = ' '.join((link.text_content() or '').split())
                    if not name:
                        # Fall back to the last URL segment so chapters never
                        # share a directory.
                        name = os.path.basename(href.rstrip('/'))
                    chapter_name_list.append(name)
                    chapter_url_list.append(self.base_url + href)
            finally:
                browser.close()
        return chapter_name_list, chapter_url_list, path

    def main(self):
        """Collect the chapter list, then download every chapter's images."""
        chapter_name_list, chapter_url_list, path = self.get_chapter()
        self.get_chapter_img(chapter_name_list, chapter_url_list, path)
if __name__ == '__main__':
    # Script entry point: build the scraper and run the full download.
    Zhuimh().main()