# cl_backup.py

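"""Scrape thread listings from t66y.com into a local SQLite database (cl.db) and
query the stored titles by keyword."""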
import asyncio
import os
import random
import re
import sqlite3
import httpx
# opt = 1: update the database; opt = 2: read the database and print entries matching the keyword filter
opt = 1
# proxy IP (leave empty to connect directly)
proxy = 'http://127.0.0.1:7890'
class GETCLDATA:
    def __init__(self):
        self.base_url = 'https://t66y.com/'
        self.target_url_dict = {
            'cavalry': 'thread0806.php?fid=15&search=&page={}',
            'infantry': 'thread0806.php?fid=2&search=&page={}',
        }
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0'
        }
        self.all_data = {}
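    # self.all_data maps each board tag ('cavalry' / 'infantry') to a list of
    # [cl_id, full_url, href_url, title] records collected by parse_html().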
    async def fetch_page(self, client, url, tag, page):
        # print(f'Fetching {tag} page {page}')
        # Random pause between requests so they are not fired back-to-back
        sleep_time = random.uniform(3, 5)
        # print(f'Sleeping for {sleep_time:.1f}s')
        await asyncio.sleep(sleep_time)
        try:
            response = await client.get(url, headers=self.headers)
            if response.status_code != 200:
                print(f'Request failed: HTTP {response.status_code} for {url}')
                return None
            response.encoding = 'utf-8'
            return response.text
        except Exception as e:
            print(e)
            return None
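    # parse_html() pulls thread rows out of the raw listing page with regular
    # expressions: each thread sits inside an <h3>...</h3> block that contains an
    # <a href="..." id="...">title</a> link. Rows whose titles contain the keywords
    # below are skipped; they appear to be pinned/announcement threads (board rules,
    # mirror addresses, VR section, etc.) rather than ordinary posts.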
    def parse_html(self, html, tag):
        target_list = re.findall(r'<h3>(.*?)</h3>', html)
        if not target_list:
            print(f'No <h3> blocks found, tag: {tag}')
            return
        for i in target_list:
            if '隨時更新' in i or '免翻地址' in i or '发布原创' in i or '版規' in i or 'VR' in i or 'vr' in i:
                continue
            href_url_list = re.findall(r'<a href="(.*?)"', i)
            cl_id_list = re.findall(r'id="(.*?)">', i)
            title_list = re.findall(r'target="_blank" id=".*?">(.*?)</a>', i)
            for href_url, cl_id, title in zip(href_url_list, cl_id_list, title_list):
                self.all_data.setdefault(tag, []).append(
                    [cl_id, self.base_url + href_url, href_url, title])
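    # get_data() queues one fetch task per page (pages 1-99) for every board in
    # target_url_dict and runs them all concurrently with asyncio.gather(), which
    # returns results in task-creation order, so each html can be matched back to
    # its (tag, page) pair.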
    async def get_data(self):
        # Route requests through the configured proxy if one is set, otherwise connect directly
        client_kwargs = {'proxy': proxy} if proxy else {}
        async with httpx.AsyncClient(**client_kwargs) as client:
            tasks = []
            tag_page_pairs = []
            for tag, target_url in self.target_url_dict.items():
                for page in range(1, 100):
                    url = self.base_url + target_url.format(page)
                    tasks.append(asyncio.create_task(
                        self.fetch_page(client, url, tag, page)))
                    tag_page_pairs.append((tag, page))
            htmls = await asyncio.gather(*tasks)
            for html, (tag, page) in zip(htmls, tag_page_pairs):
                if html:
                    self.parse_html(html, tag)
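    # save_to_db() creates the CL table on first use and inserts only rows whose
    # cl_id is not already stored (cl_id is UNIQUE). Once 10 rows for the tag have
    # been skipped as duplicates, the loop stops early, presumably because the
    # remaining rows are older entries that are already in the database.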
    def save_to_db(self, tag):
        conn = sqlite3.connect('cl.db')
        c = conn.cursor()
        c.execute('''
            CREATE TABLE IF NOT EXISTS "CL" (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                cl_id TEXT NOT NULL,
                full_url TEXT NOT NULL,
                href_url TEXT NOT NULL,
                title TEXT NOT NULL,
                tag TEXT NOT NULL,
                UNIQUE(cl_id)
            )
        ''')
        conn.commit()
        skip_counter = 0
        save_line_count = 0
        for data in self.all_data.get(tag, []):
            cl_id, full_url, href_url, title = data
            c.execute('SELECT cl_id FROM "CL" WHERE cl_id=?', (cl_id,))
            if not c.fetchone():
                c.execute('INSERT INTO "CL" (cl_id, full_url, href_url, title, tag) VALUES (?, ?, ?, ?, ?)',
                          (cl_id, full_url, href_url, title, tag))
                conn.commit()
                save_line_count += 1
            else:
                skip_counter += 1
                # print(f'Row already exists, skipping. Skip count: {skip_counter}')
                if skip_counter >= 10:
                    break
        c.close()
        conn.close()
        return save_line_count
    async def main(self):
        await self.get_data()
        if not self.all_data:
            print('Failed to fetch any data')
            return
        save_line_count = 0
        for tag in self.all_data:
            save_line_count += self.save_to_db(tag)
        print(f'Save complete: {save_line_count} new rows written')
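# LOADCLDATA is the read side: it opens cl.db (building it first via GETCLDATA if
# the file does not exist yet), can dump every stored row, and can fuzzy-match
# titles against a keyword list using SQL LIKE.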
class LOADCLDATA:
    def __init__(self, db_name='cl.db'):
        self.db_name = db_name
        self.conn = None
        self.cursor = None
    def connect(self):
        """Connect to the SQLite database, scraping it first if it does not exist yet."""
        if not os.path.exists(self.db_name):
            c = GETCLDATA()
            asyncio.run(c.main())
        self.conn = sqlite3.connect(self.db_name)
        self.cursor = self.conn.cursor()
    def fetch_all_data(self):
        """Return every row in the CL table."""
        self.cursor.execute("SELECT * FROM CL")
        rows = self.cursor.fetchall()
        print(f'\n\nThe database holds {len(rows)} rows in total')
        return rows
    def filter_by_title(self, filter_list):
        """
        Fuzzy-match the title column against every keyword in filter_list and print the matches.
        :param filter_list: list of keywords to match
        """
        if not filter_list:
            print('filter_list is empty; nothing to match.')
            return
        # Build the WHERE clause: one LIKE condition per keyword, joined with OR
        like_conditions = " OR ".join(["title LIKE ?"] * len(filter_list))
        query = f"SELECT * FROM CL WHERE {like_conditions}"
        # Wrap each keyword in % wildcards for fuzzy matching
        params = [f'%{keyword}%' for keyword in filter_list]
        self.cursor.execute(query, params)
        matched_rows = self.cursor.fetchall()
        if matched_rows:
            print("\nMatched rows:")
            for row in matched_rows:
                print(
                    f"ID: {row[0]}, Tag: {row[5]}, Full_URL: {row[2]}, Title: {row[4]}")
            print(f"Total matched rows: {len(matched_rows)}")
        else:
            print("No rows matched.")
    def close(self):
        """Close the database connection."""
        if self.conn:
            self.conn.close()
if __name__ == '__main__':
    if opt == 1:
        cl = GETCLDATA()
        asyncio.run(cl.main())
        print('done')
    elif opt == 2:
        filter_list = ['']
        cl = LOADCLDATA()
        cl.connect()
        cl.filter_by_title(filter_list)
        cl.fetch_all_data()
        cl.close()
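# Typical usage: set opt = 1 and run this file to (re)build cl.db, then set opt = 2
# and put keywords into filter_list to print the stored threads whose titles match.
# Note that filter_list = [''] matches every row, because title LIKE '%%' is true
# for any title.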