# main.py - scrapes the chapter list and chapter pages of one comic from dumanwu.com
import os
import re
import sqlite3
import sys
import time

import httpx
from playwright.sync_api import sync_playwright
  7. current_dir_path = os.path.dirname(os.path.abspath(__file__))
  8. comico_key = 'OMzNzNS'
  9. base_url = 'https://www.dumanwu.com'
  10. target_url = base_url + '/' + comico_key
  11. download_folder = os.path.join(current_dir_path, 'downloads')
  12. if not os.path.exists(download_folder):
  13. os.mkdir(download_folder)
  14. def write_db(title, db_path, chapter_folder_name, chapter_url):
  15. conn = sqlite3.connect(db_path)
  16. cursor = conn.cursor()
  17. cursor.execute(
  18. f'CREATE TABLE IF NOT EXISTS {title} (id INTEGER PRIMARY KEY AUTOINCREMENT, chapter_name TEXT, url TEXT, state BOOLEAN DEFAULT 0)'
  19. )
  20. conn.commit()
  21. # 检查chapter_name是否已存在
  22. cursor.execute(
  23. f'SELECT EXISTS(SELECT 1 FROM {title} WHERE chapter_name = ?)', (chapter_folder_name,))
  24. exists = cursor.fetchone()[0]
  25. if not exists:
  26. # 如果不存在,则插入新记录
  27. cursor.execute(f'INSERT INTO {title} (chapter_name, url) VALUES (?, ?)', (chapter_folder_name, chapter_url))
  28. conn.commit()
  29. cursor.close()
  30. conn.close()
  31. def load_db(title, db_path):
  32. conn = sqlite3.connect(db_path)
  33. cursor = conn.cursor()
  34. cursor.execute(f'SELECT * FROM {title} WHERE state = 0 ORDER BY id ASC')
  35. rows = cursor.fetchall()
  36. cursor.close()
  37. conn.close()
  38. return rows
  39. def fetch_page_title(target_url):
  40. with httpx.Client(verify=False) as client: # 设置不验证证书
  41. response = client.get(target_url)
  42. if response.status_code != 200:
  43. print(f'Error: {response.status_code}')
  44. exit(0)
  45. title = re.findall(r'<p></p><h1 class="name_mh">(.*?)</h1><p></p>', response.text)
  46. if title:
  47. return title[0]
  48. else:
  49. print("Title not found")
  50. exit(0)
  51. def fetch_chapter_data():
  52. with sync_playwright() as playwright:
  53. browser = playwright.chromium.launch(
  54. headless=True,
  55. args=['--ignore-certificate-errors']
  56. )
  57. page = browser.new_page()
  58. page.goto(target_url)
  59. time.sleep(1)
  60. button_selector = 'body > div > div > div.forminfo > div.chapterList > div.chapterlistload > div > button'
  61. for i in range(3):
  62. try:
  63. page.click(button_selector)
  64. break
  65. except Exception as e:
  66. pass
  67. page.wait_for_timeout(1000)
  68. source = page.content()
  69. ul_list = re.findall('<ul>(.*?)</ul>', source, re.DOTALL)
  70. if len(ul_list) > 0:
  71. ul_list = ul_list[0]
  72. else:
  73. return False
  74. chapter_url_list = re.findall('<a href="(.*?)">', ul_list)
  75. chapter_name_list = re.findall('<li>(.*?)</li>', ul_list)
  76. chapter_url_list = chapter_url_list[::-1]
  77. chapter_name_list = chapter_name_list[::-1]
  78. result = {}
  79. chapter_count = 1
  80. for chapter_name, chapter_url in zip(chapter_name_list, chapter_url_list):
  81. chapter_count_str = str(chapter_count).zfill(4)
  82. chapter_url = base_url + chapter_url
  83. result[chapter_count_str] = (chapter_name, chapter_url)
  84. chapter_count += 1
  85. browser.close()
  86. return result
  87. def fetch_images(data, chapter_folder_name):
  88. data_id = data[0]
  89. chapter_url = data[2]
  90. with sync_playwright() as playwright:
  91. browser = playwright.chromium.launch(
  92. headless=False,
  93. args=['--ignore-certificate-errors']
  94. )
  95. page = browser.new_page()
  96. page.goto(chapter_url)
  97. time.sleep(1)
  98. html_content = page.content() # 获取渲染后的整个页面HTML
  99. img_list = re.findall('<div class="main_img"><div class="chapter-img-box">([\S\s]*?)</a></div>', html_content)
  100. img_list = img_list[0]
  101. urls = re.findall('<img (src="|data-src=")(.*?)"', img_list)
  102. for url in urls:
  103. page.goto(url)
  104. browser.close()
  105. def main():
  106. print(target_url)
  107. # ------------------------------ step1 ------------------------------
  108. title = fetch_page_title(target_url)
  109. comico_folder = os.path.join(download_folder, title)
  110. if not os.path.exists(comico_folder):
  111. os.mkdir(comico_folder)
  112. # 创建 chapter db, 保存 chapter 数据
  113. db_path = os.path.join(comico_folder, 'comico.db')
  114. # 获取章节的 title, url
  115. chapter_data = fetch_chapter_data()
  116. for k, v in chapter_data.items():
  117. chapter_url = v[1]
  118. write_db(title, db_path, k + '_' + v[0], chapter_url)
  119. # ------------------------------ step2 ------------------------------
  120. all_data = load_db(title, db_path)
  121. for data in all_data:
  122. chapter_folder_name = os.path.join(comico_folder, data[1])
  123. if not os.path.exists(chapter_folder_name):
  124. os.mkdir(chapter_folder_name)
  125. fetch_images(data, chapter_folder_name)
  126. time.sleep(999)
  127. if __name__ == '__main__':
  128. main()