Jack committed 1 year ago
3010657a5d
4 changed files with 513 additions and 0 deletions
  1. .gitignore  +4 -0
  2. zcymh/zcymh.py  +261 -0
  3. zhuimh/merge_images.py  +45 -0
  4. zhuimh/zhuimh.py  +203 -0

+ 4 - 0
.gitignore

@@ -0,0 +1,4 @@
+.DS_Store
+__pycache__/
+*.pyc
+.idea

+ 261 - 0
zcymh/zcymh.py

@@ -0,0 +1,261 @@
+# -*- coding: utf-8 -*-
+
+import platform
+import time
+import random
+from datetime import datetime
+import re
+import os
+from pymongo import MongoClient
+
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.common.by import By
+import httpx
+
+
+def browser_opt():
+    # Configure Chrome options before launching the browser.
+    os_name = platform.system()
+    chrome_options = Options()
+    chrome_options.add_argument('--no-sandbox')
+    chrome_options.add_argument('--disable-setuid-sandbox')
+    chrome_options.add_argument('--disable-gpu')
+    chrome_options.add_argument('--headless')  # headless mode
+    # chrome_options.add_argument('--incognito')  # incognito (private) mode
+    # chrome_options.binary_location = r"C:\Program Files (x86)\Google\Chrome\Application\chrome.exe"  # manually point to the browser binary
+
+    if os_name == 'Linux':
+        chrome_options.binary_location = '/usr/bin/chromium'  # on Linux the chromium binary path must be set explicitly
+
+    browser = webdriver.Chrome(options=chrome_options)
+
+    return browser
+
+
+def browser_open(browser, url):
+    # Navigate to the url, pausing briefly so the page can settle.
+    browser.get(url)
+    time.sleep(random.uniform(1, 2))
+    return browser
+
+
+def browser_get_page_source(browser):
+    # Return the current page's HTML source.
+    return browser.page_source
+
+
+def browser_find_by_selector(browser, selector):
+    # Wait for an element matching the CSS selector and return its text, or None on failure.
+    try:
+        WebDriverWait(browser, 10).until(EC.presence_of_element_located((By.CSS_SELECTOR, selector)))
+        element = browser.find_element(By.CSS_SELECTOR, selector)
+        return element.text
+    except Exception as e:
+        print(e)
+        return None
+
+
+def browser_screenshot(browser):
+    # Get the current page title
+    title = browser.title
+    # Timestamp for a unique filename
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    # Build the filename
+    filename = f"{title.replace(' ', '')}_{timestamp}.png"
+    # Save the screenshot
+    browser.save_screenshot(filename)
+    print(f"Saved screenshot: {filename}")
+
+
+def browser_close(browser):
+    browser.quit()  # quit() shuts down the whole chromedriver session, not just the current window
+
+
+def sanitize_filename(string):
+    # Strip characters that Windows does not allow in filenames.
+    forbidden_chars = re.compile(r'[<>:"/\\|?*]')
+    sanitized_filename = forbidden_chars.sub('', string)
+
+    # Replace spaces with underscores.
+    sanitized_filename = sanitized_filename.replace(' ', '_')
+
+    # Make sure the name does not start with a dot.
+    if sanitized_filename.startswith('.'):
+        sanitized_filename = '_' + sanitized_filename[1:]
+
+    # Collapse runs of consecutive dots.
+    while '..' in sanitized_filename:
+        sanitized_filename = sanitized_filename.replace('..', '.')
+
+    # Fall back to a timestamped name if everything was stripped.
+    if not sanitized_filename:
+        sanitized_filename = 'noname' + '_' + str(int(time.time()))
+
+    return sanitized_filename
+
+
+def task1():
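+    # Task 1: scrape the chapter list and every page-image URL for one comic,
+    # then store one document per image in MongoDB (uses the module-level
+    # url, mongodb_link and db_name set in __main__).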
+    browser = browser_opt()
+    print('Opening browser')
+    browser = browser_open(browser, url)
+    print(f'Heading to url: {url}')
+
+    page_source = browser_get_page_source(browser)
+
+    # Get the comic title; it doubles as the folder and collection name.
+    book_name = re.findall('meta property="og:novel:book_name" content="(.*?)"', page_source)
+    if book_name:
+        book_name = book_name[0]
+
+        book_name = sanitize_filename(book_name)
+    else:
+        print("获取漫画名称失败")
+        exit(0)
+
+    # Collect the URL of every chapter.
+    all_set = []
+
+    host = 'https://zcymh.com'
+
+    start_tag = '<ol class="chapter-list col-4 text" id="j_chapter_list">'
+    end_tag = '</ol>'
+    start_index = page_source.find(start_tag)
+    end_index = page_source.find(end_tag, start_index)
+    if start_index != -1 and end_index != -1:
+        target_element = page_source[start_index + len(start_tag):end_index]
+        pattern = r'<a title="(.*?)" href="(.*?)" target="_self">'
+        matches = re.findall(pattern, target_element)
+        set_num = 1
+        for match in matches:
+            title = sanitize_filename(match[0])
+            set_url = host + match[1]
+            # reading-order index, chapter title, chapter url
+            all_set.append([str(set_num).zfill(4), title, set_url])
+            set_num += 1
+
+    # Visit each chapter URL and collect every page-image URL into one flat list.
+    all_data_list = []
+    for set_data in all_set:
+        browser = browser_open(browser, set_data[2])
+
+        page_source = browser_get_page_source(browser)
+        page_list = re.findall('<img src="(.*?)" width', page_source)
+        print(f'Fetching {set_data[1]}')
+        page_num = 1
+        for page in page_list:
+            # One row per image, ready for the DB (or a CSV).
+            all_data_list.append({
+                'comico_serial': set_data[0],
+                'set_name': set_data[1],
+                'page_num': page_num,
+                'set_url': set_data[2],
+                'img_url': page,
+                'is_download': 0,
+            })
+            page_num += 1
+
+    # Persist everything to MongoDB, one collection per comic.
+    conn = MongoClient(mongodb_link)
+    db = conn[db_name]
+    collection = db[book_name]
+
+    for data in all_data_list:
+        data_exists = collection.find_one({"img_url": data['img_url']})
+        if data_exists is None:
+            try:
+                result = collection.insert_one(data)
+                print(f"Inserted {result.inserted_id}: {data['comico_serial']}\t{data['set_name']}\t{data['page_num']}")
+            except Exception as e:
+                print(f"数据插入失败,错误信息: {e}")
+        else:
+            print(f'Already exists: {data}')
+
+    comico_path = os.path.join(os.getcwd(), 'comico')
+    if not os.path.exists(comico_path):
+        os.makedirs(comico_path)
+
+    # After all rows are written, create this comic's folder.
+    file_path = os.path.join(comico_path, book_name)
+    if not os.path.exists(file_path):
+        os.mkdir(file_path)
+
+    browser_close(browser)
+
+
+def task2():
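+    # Task 2: read the documents task1 stored and download every image whose
+    # is_download flag is still 0 (uses the module-level load_book_name,
+    # headers, mongodb_link and db_name set in __main__).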
+    file_path = os.path.join(os.getcwd(), 'comico', load_book_name)
+
+    if not os.path.exists(file_path):
+        os.mkdir(file_path)
+
+    client = MongoClient(mongodb_link)
+
+    db = client[db_name]
+
+    collection = db[load_book_name]
+
+    # To reset is_download and re-download everything, uncomment:
+    # for document in collection.find():
+    #     collection.update_one({'_id': document['_id']}, {'$set': {'is_download': 0}})
+
+    # Read all documents in the collection.
+    try:
+        for document in collection.find():
+            if document['is_download'] == 0:
+                # Download this image.
+                try:
+                    resp = httpx.get(document['img_url'], headers=headers)
+                    if resp.status_code != 200:
+                        err = f'Image request failed, status code: {resp.status_code}'
+                        raise Exception(err)
+
+                    set_file_name = document['comico_serial'] + '_' + sanitize_filename(document['set_name'])
+
+                    if not os.path.exists(os.path.join(file_path, set_file_name)):
+                        os.makedirs(os.path.join(file_path, set_file_name))
+
+                    img_name = str(document['page_num']).zfill(4)
+
+                    suffix = document['img_url'].split('.')[-1]
+
+                    img_path = os.path.join(file_path, set_file_name, img_name + '.' + suffix)
+
+                    with open(img_path, 'wb') as f:
+                        f.write(resp.content)
+
+                    # Mark the document as downloaded (is_download = 1).
+                    collection.update_one({'_id': document['_id']}, {'$set': {'is_download': 1}})
+                    print(f"已更新文档: {load_book_name}\t{document['comico_serial']}\t{document['set_name']}\t{document['page_num']}")
+                except Exception as e:
+                    print(f"处理文档时发生错误:{e}")
+            else:
+                print("已下载,跳过")
+    except Exception as e:
+        print(f"读取集合时发生错误:{e}")
+
+    # Close the database connection.
+    client.close()
+
+
+if __name__ == "__main__":
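+    # choose: 1 = run task1 (scrape metadata into MongoDB), 2 = run task2 (download images).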
+    choose = 2
+
+    mongodb_link = 'mongodb://root:aaaAAA111!!!@192.168.31.177:38000/'
+    db_name = 'comico'
+
+    if choose == 1:
+        comico_id = '384'
+        url = 'https://zcymh.com/manben/{}/'.format(comico_id)
+        host = 'https://zcymh.com'
+        task1()
+    elif choose == 2:
+        load_book_name = '诚如神之所说'
+        headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0'}
+        task2()

+ 45 - 0
zhuimh/merge_images.py

@@ -0,0 +1,45 @@
+# -*- coding: utf-8 -*-
+import os
+from PIL import Image
+
+current_dir_path = os.path.dirname(os.path.abspath(__file__))
+# Starting directory: one sub-folder per comic, each holding chapter folders of page PNGs.
+start_dir = os.path.join(current_dir_path, 'downloads')
+
+
+# Walk the downloads folder and merge each chapter's pages vertically into a single <chapter>.png.
+for comic_dir in os.listdir(start_dir):
+    comic_path = os.path.join(start_dir, comic_dir)
+    if not os.path.isdir(comic_path):
+        continue
+    for chapter_dir in os.listdir(comic_path):
+        chapter_path = os.path.join(comic_path, chapter_dir)
+        if not os.path.isdir(chapter_path):
+            continue
+        print(chapter_path)
+
+        # Collect the page images in reading order (filenames are zero-padded).
+        images = [os.path.join(chapter_path, f)
+                  for f in sorted(os.listdir(chapter_path))
+                  if f.lower().endswith('.png')]
+        if not images:
+            print(f'No images in {chapter_path}, skipping')
+            continue
+
+        # Stack all pages on one canvas: widest page wide, summed heights tall.
+        pages = [Image.open(p) for p in images]
+        canvas = Image.new('RGB', (max(p.width for p in pages), sum(p.height for p in pages)))
+        y = 0
+        for p in pages:
+            canvas.paste(p, (0, y))
+            y += p.height
+
+        canvas.save(f'{chapter_path}.png')
+

+ 203 - 0
zhuimh/zhuimh.py

@@ -0,0 +1,203 @@
+# -*- coding: utf-8 -*-
+import os
+import time
+import httpx
+import asyncio
+import re
+import sqlite3
+from playwright.sync_api import sync_playwright
+
+comico_id = '419025'
+base_url = 'https://www.zhuimh.com'
+target_href_url = 'https://www.zhuimh.com/comic/'
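+# Scroll step for lazy-loading, in percent of the page height per tick.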
+scroll_speed = 2
+current_dir_path = os.path.dirname(os.path.abspath(__file__))
+
+download_folder = os.path.join(current_dir_path, 'downloads')
+if not os.path.exists(download_folder):
+    os.mkdir(download_folder)
+
+db_path = os.path.join(download_folder, 'zhuimh.db')
+txt_path = os.path.join(download_folder, 'target_comico_name.txt')
+
+def create_db(title):
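+    # One table per comic, named after its title; state = 0 marks a chapter as not yet downloaded.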
+    conn = sqlite3.connect(db_path)
+    cursor = conn.cursor()
+    cursor.execute(
+        f'CREATE TABLE IF NOT EXISTS "{title}" (id INTEGER PRIMARY KEY AUTOINCREMENT, chapter_name TEXT, url TEXT, state BOOLEAN DEFAULT 0)'
+    )
+    conn.commit()
+    cursor.close()
+    conn.close()
+
+
+def write_to_db(title, chapter_name, url):
+    conn = sqlite3.connect(db_path)
+    cursor = conn.cursor()
+
+    # Check whether this chapter_name is already recorded
+    cursor.execute(
+        f'SELECT EXISTS(SELECT 1 FROM "{title}" WHERE chapter_name = ?)', (chapter_name,))
+    exists = cursor.fetchone()[0]
+
+    if not exists:
+        # Insert a new record only if it is not
+        cursor.execute(f'INSERT INTO "{title}" (chapter_name, url) VALUES (?, ?)', (chapter_name, url))
+        conn.commit()
+
+    cursor.close()
+    conn.close()
+
+
+async def async_get_chapter_list():
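+    # Fetch the comic's landing page, extract its title and chapter links, and store them in sqlite.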
+    async with httpx.AsyncClient() as client:
+        chapters_data = {}
+        response = await client.get(target_href_url + comico_id)
+        if response.status_code == 200:
+            text = response.text
+            title = re.findall(r'<h4>(.*?)</h4>', text)
+            title = title[0] if title else comico_id
+            print(title)
+            # Write the title to a txt file first; the download steps below read it back to use as the table name.
+            with open(txt_path, 'w', encoding='utf-8') as f:
+                print('Writing current target name')
+                f.write(title)
+
+            chapters = re.findall(r'<li><a href="(.*?)">(.*?)</a></li>', text)
+            for chapter in chapters:
+                chapters_data[chapter[1]] = base_url + chapter[0]
+
+            # Create the sqlite table and store the chapter list.
+            create_db(title)
+
+            for chapter_name, url in chapters_data.items():
+                write_to_db(title, chapter_name, url)
+    print('Chapter data ready')
+
+
+async def get_chapter_list():
+    await async_get_chapter_list()
+
+
+def load_db(title):
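+    # Return every chapter row that has not been downloaded yet (state = 0).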
+    conn = sqlite3.connect(db_path)
+    cursor = conn.cursor()
+    cursor.execute(f'SELECT * FROM "{title}" WHERE state = 0 ORDER BY id ASC')
+    rows = cursor.fetchall()
+    cursor.close()
+    conn.close()
+    return rows
+
+
+def change_db_data_state(data_id, t_name):
+    # Mark one chapter row (by id) as downloaded.
+    conn = sqlite3.connect(db_path)
+    cursor = conn.cursor()
+    cursor.execute(f'UPDATE "{t_name}" SET state = 1 WHERE id = ?', (data_id,))
+    conn.commit()
+    cursor.close()
+    conn.close()
+
+
+def scroll_to_percentage(page):
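+    # Scroll down the page in small percentage steps so lazy-loaded images are triggered.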
+    percentage_list = [i for i in range(5, 101, scroll_speed)]
+    for percentage in percentage_list:
+        # Compute the absolute offset for this percentage of the page height
+        height = page.evaluate("() => document.body.scrollHeight")
+        scroll_position = height * (percentage / 100)
+        # Jump to that position and give the images a moment to load
+        page.evaluate(f"window.scrollTo(0, {scroll_position})")
+        time.sleep(0.5)
+
+
+def request_chapter_data(title, data_id, chapter_name, chapter_url):
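+    # Render one chapter in Playwright, scroll to force lazy-loading, then screenshot
+    # each page image; returns False so the caller can retry with a fresh browser.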
+    chapter_folder = os.path.join(current_dir_path, 'downloads', title, chapter_name)
+    with sync_playwright() as playwright:
+        try:
+            browser = playwright.chromium.launch(headless=True)
+            page = browser.new_page()
+            page.goto(chapter_url)
+            page.wait_for_load_state('networkidle')
+        except Exception as e:
+            print(e)
+            return False
+
+        # Scroll down, back to the top, then down again so every image loads
+        print('Start scrolling page')
+        scroll_to_percentage(page)
+        page.evaluate("window.scrollTo({top: 0, behavior: 'smooth'})")
+        scroll_to_percentage(page)
+        print('Scrolling done')
+        time.sleep(2)
+
+        # Verify every image actually loaded; if any lazy-load placeholder remains, bail out so the caller retries.
+        html_content = page.content()
+        check_list = re.findall('img class="lazy-read" src="(.*?)"', html_content)
+        for l in check_list:
+            if 'lazy-read.gif' in l:
+                return False
+
+        # Create the chapter folder.
+        if not os.path.exists(chapter_folder):
+            os.makedirs(chapter_folder)
+
+        # Count the matched image elements.
+        total_images = page.locator('.lazy-read').count()
+
+        for page_num in range(1, total_images + 1):
+            img_locator = f'body > div.chpater-images > img:nth-child({page_num})'
+            img_path = os.path.join(chapter_folder, f'{str(page_num).zfill(3)}.png')
+            page.locator(img_locator).screenshot(path=img_path)
+            print(f'Downloaded {img_path}')
+
+        # After the whole chapter is downloaded, set its state flag to 1.
+        print(f'{chapter_name} download complete\n\n')
+        change_db_data_state(data_id, title)
+
+        browser.close()
+        return True
+
+
+def main():
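+    # Step 1: refresh the chapter list in sqlite.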
+    asyncio.run(get_chapter_list())
+
+    # Step 2: crawl each chapter's images.
+    # Read back the current target title; it doubles as the folder and table name.
+    with open(txt_path, 'r', encoding='utf-8') as f:
+        title = f.read()
+
+    folder_name = os.path.join(download_folder, title)
+    if not os.path.exists(folder_name):
+        os.mkdir(folder_name)
+
+    for retry in range(999):
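+        # Reload the pending chapters each pass; a failed chapter breaks out and is retried.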
+        load_data = load_db(title)
+
+        if not load_data:
+            print('The database is empty or everything is downloaded!')
+            exit(0)
+
+        for data in load_data:
+            data_id = data[0]
+            chapter_name = data[1]
+            chapter_url = data[2]
+            print(f'Fetching images: {title}  {chapter_name}')
+            ok = request_chapter_data(title, data_id, chapter_name, chapter_url)
+            if not ok:
+                print(f'Image loading failed: {title}  {chapter_name}, retrying\n\n')
+                time.sleep(5)
+                break
+
+
+if __name__ == "__main__":
+    main()