# main.py

import asyncio
import os
import random
import re
import sqlite3
import tkinter as tk
from contextlib import closing
from tkinter import messagebox

import httpx
  9. class GETCLDATA:
  10. def __init__(self, proxy=None): # 添加 proxy 参数
  11. self.base_url = 'https://t66y.com/'
  12. self.target_url_dict = {
  13. 'cavalry': 'thread0806.php?fid=15&search=&page={}',
  14. 'infantry': 'thread0806.php?fid=2&search=&page={}',
  15. }
  16. self.headers = {
  17. 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0'
  18. }
  19. self.all_data = {}
  20. self.proxy = proxy # 保存代理设置
  21. async def fetch_page(self, client, url, tag, page):
  22. sleep_time = random.uniform(3, 5)
  23. await asyncio.sleep(sleep_time)
  24. try:
  25. response = await client.get(url, headers=self.headers)
  26. if response.status_code != 200:
  27. print('连接失败')
  28. return None
  29. response.encoding = 'utf-8'
  30. return response.text
  31. except Exception as e:
  32. print(e)
  33. return None
  34. def parse_html(self, html, tag):
  35. target_list = re.findall(r'<h3>(.*?)</h3>', html)
  36. if not target_list:
  37. print(f'未找到任何h3标签内容,tag: {tag}')
  38. return
  39. for i in target_list:
  40. if '隨時更新' in i or '免翻地址' in i or '发布原创' in i or '版規' in i or 'VR' in i or 'vr' in i:
  41. continue
  42. href_url_list = re.findall(r'<a href="(.*?)"', i)
  43. cl_id_list = re.findall(r'id="(.*?)">', i)
  44. title_list = re.findall(r'target="_blank" id=".*?">(.*?)</a>', i)
  45. for herf_url, cl_id, title in zip(href_url_list, cl_id_list, title_list):
  46. if not self.all_data.setdefault(tag):
  47. self.all_data[tag] = [
  48. [cl_id, self.base_url + herf_url, herf_url, title]]
  49. else:
  50. self.all_data[tag].append(
  51. [cl_id, self.base_url + herf_url, herf_url, title])
  52. async def get_data(self):
  53. if self.proxy: # 使用传入的代理
  54. async with httpx.AsyncClient(proxies=self.proxy) as client:
  55. tasks = []
  56. for tag, target_url in self.target_url_dict.items():
  57. for page in range(1, 100):
  58. url = self.base_url + target_url.format(page)
  59. task = asyncio.create_task(
  60. self.fetch_page(client, url, tag, page))
  61. tasks.append(task)
  62. htmls = await asyncio.gather(*tasks)
  63. for html, (tag, page) in zip(htmls, [(tag, page) for tag in self.target_url_dict for page in range(1, 100)]):
  64. if html:
  65. self.parse_html(html, tag)
  66. else:
  67. async with httpx.AsyncClient() as client:
  68. tasks = []
  69. for tag, target_url in self.target_url_dict.items():
  70. for page in range(1, 100):
  71. url = self.base_url + target_url.format(page)
  72. task = asyncio.create_task(
  73. self.fetch_page(client, url, tag, page))
  74. tasks.append(task)
  75. htmls = await asyncio.gather(*tasks)
  76. for html, (tag, page) in zip(htmls, [(tag, page) for tag in self.target_url_dict for page in range(1, 100)]):
  77. if html:
  78. self.parse_html(html, tag)
  79. def save_to_db(self, tag):
  80. conn = sqlite3.connect('cl.db')
  81. c = conn.cursor()
  82. c.execute('''
  83. CREATE TABLE IF NOT EXISTS 'CL' (
  84. id INTEGER PRIMARY KEY AUTOINCREMENT,
  85. cl_id TEXT NOT NULL,
  86. full_url TEXT NOT NULL,
  87. href_url TEXT NOT NULL,
  88. title TEXT NOT NULL,
  89. tag TEXT NOT NULL,
  90. UNIQUE(cl_id)
  91. )
  92. ''')
  93. conn.commit()
  94. skip_counter = 0
  95. save_line_count = 0
  96. for data in self.all_data.get(tag, []):
  97. cl_id, full_url, href_url, title = data
  98. c.execute(f'SELECT cl_id FROM "CL" WHERE cl_id=?', (cl_id,))
  99. if not c.fetchone():
  100. c.execute(f'INSERT INTO "CL" (cl_id, full_url, href_url, title, tag) VALUES (?, ?, ?, ?, ?)',
  101. (cl_id, full_url, href_url, title, tag))
  102. conn.commit()
  103. save_line_count += 1
  104. else:
  105. skip_counter += 1
  106. if skip_counter >= 10:
  107. break
  108. c.close()
  109. conn.close()
  110. return save_line_count
  111. async def main(self):
  112. await self.get_data()
  113. if not self.all_data:
  114. print('无法获取数据')
  115. return 0
  116. save_line_count = 0
  117. for tag in self.all_data:
  118. save_line_count += self.save_to_db(tag)
  119. print(f'保存成功,共保存{save_line_count}条数据')
  120. return save_line_count
  121. class LOADCLDATA:
  122. def __init__(self, db_name='cl.db'):
  123. self.db_name = db_name
  124. self.conn = None
  125. self.cursor = None
  126. def connect(self):
  127. if not os.path.exists(self.db_name):
  128. c = GETCLDATA()
  129. asyncio.run(c.main())
  130. self.conn = sqlite3.connect(self.db_name)
  131. self.cursor = self.conn.cursor()
  132. def fetch_all_data(self):
  133. self.cursor.execute("SELECT * FROM CL")
  134. rows = self.cursor.fetchall()
  135. print(f'\n\n数据库共有{len(rows)}条数据')
  136. return rows
  137. def filter_by_title(self, filter_list):
  138. if not filter_list:
  139. print("filter_list 为空,未进行匹配。")
  140. return []
  141. like_conditions = " OR ".join(["title LIKE ?"] * len(filter_list))
  142. query = f"SELECT * FROM CL WHERE {like_conditions}"
  143. params = [f'%{keyword}%' for keyword in filter_list]
  144. self.cursor.execute(query, params)
  145. matched_rows = self.cursor.fetchall()
  146. return matched_rows
  147. def close(self):
  148. if self.conn:
  149. self.conn.close()
  150. class ClApp:
  151. def __init__(self, root):
  152. self.root = root
  153. self.root.title("CL")
  154. screen_width = self.root.winfo_screenwidth()
  155. screen_height = self.root.winfo_screenheight()
  156. window_width = 800
  157. window_height = 650
  158. x = (screen_width - window_width) // 2 + 100
  159. y = (screen_height - window_height) // 2 + 50
  160. self.root.geometry(f"{window_width}x{window_height}+{x}+{y}")
  161. self.top_frame = tk.Frame(self.root)
  162. self.top_frame.pack(pady=10)
  163. self.update_button = tk.Button(
  164. self.top_frame, text="更新数据库", command=self.update_database)
  165. self.update_button.pack(side=tk.LEFT, padx=5)
  166. self.search_button = tk.Button(
  167. self.top_frame, text="搜索数据库", command=self.search_database)
  168. self.search_button.pack(side=tk.LEFT, padx=5)
  169. self.search_entry = tk.Entry(self.top_frame, width=30)
  170. self.search_entry.pack(side=tk.LEFT, padx=5)
  171. self.proxy_frame = tk.Frame(self.root)
  172. self.proxy_frame.pack(pady=5)
  173. self.proxy_var = tk.BooleanVar(value=True)
  174. self.proxy_checkbox = tk.Checkbutton(
  175. self.proxy_frame, text="是否使用代理", variable=self.proxy_var)
  176. self.proxy_checkbox.pack(side=tk.LEFT, padx=5)
  177. self.proxy_entry = tk.Entry(self.proxy_frame, width=30)
  178. self.proxy_entry.insert(0, "http://127.0.0.1:7890")
  179. self.proxy_entry.pack(side=tk.LEFT, padx=5)
  180. self.output_text = tk.Text(
  181. self.root, height=35, width=100, state="disabled")
  182. self.output_text.pack(pady=10)
  183. self.clear_button = tk.Button(
  184. self.root, text="清空输出", command=self.clear_output)
  185. self.clear_button.pack(pady=10)
  186. def update_database(self):
  187. self.output_text.config(state="normal")
  188. proxy = self.proxy_entry.get() if self.proxy_var.get() else None
  189. get_cl_data = GETCLDATA(proxy=proxy)
  190. asyncio.run(get_cl_data.main())
  191. self.output_text.config(state="disabled")
  192. messagebox.showinfo("提示", "数据库已更新完成")
  193. def search_database(self):
  194. keyword = self.search_entry.get()
  195. self.output_text.config(state="normal")
  196. self.output_text.delete(1.0, tk.END) # 清空输出框
  197. self.output_text.insert(tk.END, f"搜索关键词: {keyword}\n\n")
  198. self.output_text.config(state="disabled")
  199. # 调用 LOADCLDATA 类进行搜索
  200. load_cl_data = LOADCLDATA()
  201. load_cl_data.connect()
  202. results = load_cl_data.filter_by_title([keyword])
  203. if results:
  204. self.output_text.config(state="normal")
  205. output_result = ""
  206. for row in results:
  207. output_result += f"{row[4]}\n{row[2]}\n\n"
  208. self.output_text.insert(tk.END, output_result)
  209. self.output_text.config(state="disabled")
  210. messagebox.showinfo("搜索完成", f"共搜索到 {len(results)} 条数据")
  211. else:
  212. messagebox.showinfo("搜索完成", "没有匹配到任何结果")
  213. load_cl_data.close()
  214. def clear_output(self):
  215. self.output_text.config(state="normal")
  216. self.output_text.delete(1.0, tk.END)
  217. self.output_text.config(state="disabled")
  218. root = tk.Tk()
  219. app = ClApp(root)
  220. root.mainloop()