zhuimh.py

# -*- coding: utf-8 -*-
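"""Download a comic from zhuimh.com.

The script fetches the chapter list for `comico_id`, stores it in a local
SQLite database, then drives headless Chromium via Playwright to scroll each
chapter page (forcing lazy images to load) and screenshots every page image
into downloads/<title>/<chapter>/.

Usage: set comico_id below, then run `python zhuimh.py`
(requires httpx and playwright with Chromium installed).
"""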
import os
import time
import httpx
import asyncio
import re
import sqlite3
from playwright.sync_api import sync_playwright

comico_id = '419025'  # id of the target comic on zhuimh.com
base_url = 'https://www.zhuimh.com'
target_href_url = 'https://www.zhuimh.com/comic/'
scroll_speed = 2  # scrolling step, in percent of page height
current_dir_path = os.path.dirname(os.path.abspath(__file__))
download_folder = os.path.join(current_dir_path, 'downloads')
if not os.path.exists(download_folder):
    os.mkdir(download_folder)
db_path = os.path.join(download_folder, 'zhuimh.db')
txt_path = os.path.join(download_folder, 'target_comico_name.txt')


def create_db(title):
    # One table per comic; the title is used as the table name. SQLite cannot
    # bind identifiers as parameters, so the title is interpolated directly;
    # quoting it keeps titles with spaces or non-ASCII characters valid.
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    cursor.execute(
        f'CREATE TABLE IF NOT EXISTS "{title}" '
        '(id INTEGER PRIMARY KEY AUTOINCREMENT, chapter_name TEXT, url TEXT, state BOOLEAN DEFAULT 0)'
    )
    conn.commit()
    cursor.close()
    conn.close()


def write_to_db(title, chapter_name, url):
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    # Check whether this chapter_name already exists
    cursor.execute(
        f'SELECT EXISTS(SELECT 1 FROM "{title}" WHERE chapter_name = ?)', (chapter_name,))
    exists = cursor.fetchone()[0]
    if not exists:
        # Insert a new record only if it does not
        cursor.execute(f'INSERT INTO "{title}" (chapter_name, url) VALUES (?, ?)', (chapter_name, url))
        conn.commit()
    cursor.close()
    conn.close()
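
# Each stored row is (id, chapter_name, url, state). A minimal sketch of
# checking progress for a stored title (illustrative only, not called anywhere):
#
#   conn = sqlite3.connect(db_path)
#   done = conn.execute(f'SELECT COUNT(*) FROM "{title}" WHERE state = 1').fetchone()[0]
#   conn.close()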


async def async_get_chapter_list():
    async with httpx.AsyncClient() as client:
        chapters_data = {}
        response = await client.get(target_href_url + comico_id)
        if response.status_code == 200:
            text = response.text
            title = re.findall(r'<h4>(.*?)</h4>', text)
            title = title[0] if title else comico_id
            print(title)
            # Save the title to a txt file first; it is read back later and
            # used as the table name when querying the database
            with open(txt_path, 'w', encoding='utf-8') as f:
                print('Writing current target title')
                f.write(title)
            chapters = re.findall(r'<li><a href="(.*?)">(.*?)</a></li>', text)
            for chapter in chapters:
                chapters_data[chapter[1]] = base_url + chapter[0]
            # Create the SQLite table and store the chapter list
            create_db(title)
            for chapter_name, url in chapters_data.items():
                write_to_db(title, chapter_name, url)
            print('Data OK')


async def get_chapter_list():
    await async_get_chapter_list()


def load_db(title):
    # Return every chapter that has not been downloaded yet (state = 0)
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    cursor.execute(f'SELECT * FROM "{title}" WHERE state = 0 ORDER BY id ASC')
    rows = cursor.fetchall()
    cursor.close()
    conn.close()
    return rows
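
# Rows come back positionally as (id, chapter_name, url, state); main() below
# relies on this ordering when it unpacks data[0], data[1], data[2].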


def change_db_data_state(data_id, t_name):
    # Mark one chapter row as downloaded (state = 1)
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    sql = f'UPDATE "{t_name}" SET state = 1 WHERE id = ?'
    cursor.execute(sql, (data_id,))
    conn.commit()
    cursor.close()
    conn.close()


def scroll_to_percentage(page):
    # Walk down the page in `scroll_speed`-percent steps so that lazily
    # loaded images are brought into view
    for percentage in range(5, 101, scroll_speed):
        # Compute the offset at the given percentage of the page height
        height = page.evaluate("() => document.body.scrollHeight")
        scroll_position = height * (percentage / 100)
        # Jump to that position
        page.evaluate(f"window.scrollTo(0, {scroll_position})")
        time.sleep(0.5)
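
# request_chapter_data() below runs this pass twice (down, back to the top,
# then down again), presumably to catch lazy images the first pass scrolled
# past before they finished loading.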


def request_chapter_data(title, data_id, chapter_name, chapter_url):
    chapter_folder = os.path.join(current_dir_path, 'downloads', title, chapter_name)
    with sync_playwright() as playwright:
        try:
            browser = playwright.chromium.launch(headless=True)
            page = browser.new_page()
            page.goto(chapter_url)
            page.wait_for_load_state('networkidle')
        except Exception as e:
            print(e)
            return False
        # Scroll the page to trigger lazy loading
        print('Start scrolling the page')
        scroll_to_percentage(page)
        page.evaluate("window.scrollTo({top: 0, behavior: 'smooth'})")
        scroll_to_percentage(page)
        print('Scrolling finished')
        time.sleep(2)
        # Check that every image finished loading; if any placeholder is still
        # present, bail out so the caller can reopen the browser and retry
        html_content = page.content()
        check_list = re.findall(r'img class="lazy-read" src="(.*?)"', html_content)
        for src in check_list:
            if 'lazy-read.gif' in src:
                return False
        # Create the chapter folder
        if not os.path.exists(chapter_folder):
            os.makedirs(chapter_folder)
        # Count the matched image elements
        total_images = page.locator('.lazy-read').count()
        for page_num in range(1, total_images + 1):
            # ('chpater-images' spelling is intentional; it matches the site's markup)
            img_locator = f'body > div.chpater-images > img:nth-child({page_num})'
            img_path = os.path.join(chapter_folder, f'{page_num:03d}.png')
            page.locator(img_locator).screenshot(path=img_path)
            print(f'Downloaded {img_path}')
        # After the whole chapter is saved, flip its state field to 1
        print(f'{chapter_name} download finished\n\n')
        change_db_data_state(data_id, title)
        browser.close()
        return True
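
# Design note: pages are saved by screenshotting each <img> element rather
# than downloading its src URL. That avoids re-fetching images outside the
# browser (and any referer checks that might entail), at the cost of
# re-encoding every page as a PNG screenshot.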


def main():
    asyncio.run(get_chapter_list())
    # Crawl every chapter; first read back the current target title
    title = ''
    with open(txt_path, 'r', encoding='utf-8') as f:
        title = f.read()
    folder_name = os.path.join(download_folder, title)
    if not os.path.exists(folder_name):
        os.mkdir(folder_name)
    for _ in range(999):
        load_data = load_db(title)
        if not load_data:
            print('The database has no data or all done!')
            return
        for data in load_data:
            data_id = data[0]
            chapter_name = data[1]
            chapter_url = data[2]
            print(f'Fetching images: {title} {chapter_name}')
            ok = request_chapter_data(title, data_id, chapter_name, chapter_url)
            if not ok:
                # On failure, wait and restart the outer retry loop
                print(f'Image loading failed: {title} {chapter_name}, retrying\n\n')
                time.sleep(5)
                break


if __name__ == "__main__":
    main()