|
|
@@ -27,17 +27,15 @@ class Zhuimh:
|
|
|
# 模拟鼠标滚轮操作,滚动到页面底部
|
|
|
page.mouse.wheel(0, scroll_distance)
|
|
|
|
|
|
- def get_chapter_img(self, chapter_name_list, chapter_url_list):
|
|
|
+ def get_chapter_img(self, chapter_name_list, chapter_url_list, path):
|
|
|
for chapter_name, chapter_url in zip(chapter_name_list, chapter_url_list):
|
|
|
print(f'章节名: {chapter_name}, 章节url: {chapter_url}')
|
|
|
with sync_playwright() as playwright:
|
|
|
- browser = playwright.chromium.launch(headless=True) # headless=False 可以开启浏览器界面,便于调试
|
|
|
+ browser = playwright.chromium.launch(headless=False) # headless=False 可以开启浏览器界面,便于调试
|
|
|
page = browser.new_page()
|
|
|
|
|
|
page.goto(chapter_url)
|
|
|
|
|
|
- time.sleep(1)
|
|
|
-
|
|
|
self.window_scroll(page)
|
|
|
|
|
|
# for _ in range(5):
|
|
|
@@ -47,21 +45,54 @@ class Zhuimh:
|
|
|
#
|
|
|
# page.wait_for_timeout(1000)
|
|
|
|
|
|
- time.sleep(1)
|
|
|
+ page.wait_for_timeout(3)
|
|
|
|
|
|
element = page.query_selector('body > div.chpater-images')
|
|
|
|
|
|
links = element.query_selector_all('img')
|
|
|
|
|
|
- chpater_img_links = []
|
|
|
+ time.sleep(5)
|
|
|
+
|
|
|
+ chapter_file = os.path.join(path, chapter_name)
|
|
|
+ if not os.path.exists(chapter_file):
|
|
|
+ os.mkdir(chapter_file)
|
|
|
+
|
|
|
+ #图片后缀都是webp
|
|
|
+ img_suffix = '.webp'
|
|
|
+
|
|
|
+ img_count = 1
|
|
|
|
|
|
for link in links:
|
|
|
# Read the src attribute of each <img> element
|
|
|
img_src = link.get_attribute('src')
|
|
|
if 'blob:' in img_src:
|
|
|
- chpater_img_links.append(img_src)
|
|
|
-
|
|
|
- print(chpater_img_links)
|
|
|
+ # 这里开始保存图片
|
|
|
+ # 检测一下图片是否有下载过, 如果有就跳过
|
|
|
+ img_name = str(img_count).zfill(4)+img_suffix
|
|
|
+ img_path = os.path.join(chapter_file, img_name)
|
|
|
+ if not os.path.exists(img_path):
|
|
|
+ # 使用fetch API获取blob数据
|
|
|
+ self.save_blob_as_file(page, img_src, img_path)
|
|
|
+ img_count += 1
|
|
|
+ else:
|
|
|
+ img_count += 1
|
|
|
+
|
|
|
def save_blob_as_file(self, page, blob_url, file_path):
    """Fetch a ``blob:`` URL inside the page and write its bytes to a file.

    The blob is read in the browser context through ``page.evaluate`` and
    handed back to Python as a base64 data URL, which is then decoded and
    written to ``file_path``.

    Args:
        page: Page object whose ``evaluate(expression, arg)`` runs the
            JavaScript that reads the blob.
        blob_url: The ``blob:`` URL to download.
        file_path: Destination path; opened in binary write mode.

    Raises:
        ValueError: If the page does not hand back a base64 data URL.
    """
    import base64

    # Deliberately a plain (non-f) string: the JavaScript braces would be
    # parsed as f-string replacement fields. The URL travels as the evaluate
    # argument instead of being spliced into the script text, so quotes in
    # the URL cannot break the script.
    data_url = page.evaluate(
        """
        async (url) => {
            const response = await fetch(url);
            const blob = await response.blob();
            return await new Promise((resolve, reject) => {
                const reader = new FileReader();
                reader.onloadend = () => resolve(reader.result);
                reader.onerror = () => reject(reader.error);
                // A data URL is a plain string, so it survives the trip
                // back to Python; an ArrayBuffer does not arrive as bytes.
                reader.readAsDataURL(blob);
            });
        }
        """,
        blob_url,
    )

    marker = ';base64,'
    # Fail before touching the disk: an empty or garbage file left behind
    # would look like a finished download to any caller that checks whether
    # the file already exists.
    if not isinstance(data_url, str) or marker not in data_url:
        raise ValueError(f'no base64 image data returned for {blob_url}')

    payload = data_url.split(marker, 1)[1]
    with open(file_path, 'wb') as file:
        file.write(base64.b64decode(payload))
|
|
|
|
|
|
def get_chapter(self):
|
|
|
with sync_playwright() as playwright:
|
|
|
@@ -102,11 +133,11 @@ class Zhuimh:
|
|
|
print('元素未找到')
|
|
|
exit(0)
|
|
|
|
|
|
- return chapter_name_list, chapter_url_list
|
|
|
+ return chapter_name_list, chapter_url_list, path
|
|
|
|
|
|
def main(self):
|
|
|
- chapter_name_list, chapter_url_list = self.get_chapter()
|
|
|
- self.get_chapter_img(chapter_name_list, chapter_url_list)
|
|
|
+ chapter_name_list, chapter_url_list, path = self.get_chapter()
|
|
|
+ self.get_chapter_img(chapter_name_list, chapter_url_list, path)
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
|