ai_news.py
# -*- coding: utf-8 -*-
import os
import re
import json
import uuid
import httpx
import asyncio
import datetime
from bs4 import BeautifulSoup
from ollama import Client as oClient
from openai import OpenAI
from playwright.async_api import async_playwright
from matrix_client.client import MatrixClient
from matrix_client.api import MatrixHttpApi

key_list = ['web3']
text_batch = 1


class FREEAI(object):
    def call_ai(self, message):
        # Call a hosted OpenAI-compatible endpoint and return the reply text.
        try:
            client = OpenAI(
                api_key="sk-rM32T5VuyyCFyZGyEe006aEdFe6e4301A7627f7a3973Df17",
                base_url="https://knox.chat/v1",
            )
            completion = client.chat.completions.create(
                model="deepseek-ai/DeepSeek-R1-Distill-Llama-70B-free",
                messages=[{"role": "user", "content": message}],
                temperature=0.3,
            )
            return completion.choices[0].message.content
        except Exception as e:
            print(e)
            return None
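

# A hedged usage sketch for FREEAI (not called anywhere; it assumes the
# endpoint and key hardcoded above are still valid, which this sketch
# cannot verify):
def _demo_free_ai():
    reply = FREEAI().call_ai('Summarize in one sentence: AI news aggregation.')
    print(reply)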


class OllamaChat(object):
    def __init__(self, host='http://192.168.31.28:11434'):
        self.host = host

    def call_ollama(self, role, text, prompt_words, model='llava:13b', temperature=0.4):
        # Query a model served by Ollama.
        message = text + '\n\n' + prompt_words
        print(f'use model: {model}')
        try:
            response = oClient(host=self.host).chat(
                model=model,
                messages=[
                    {'role': 'system', 'content': role},
                    {'role': 'user', 'content': message}
                ],
                options={"temperature": temperature},
                stream=False)
            return response['message']['content']
        except Exception as e:
            print(f"\nError: {e}")
            return None
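

# A minimal usage sketch for OllamaChat (the host below is a hypothetical
# local default; any reachable Ollama server works; not called anywhere):
def _demo_ollama_chat():
    chat = OllamaChat(host='http://127.0.0.1:11434')
    answer = chat.call_ollama('You are a helpful assistant',
                              'Ollama serves large language models locally.',
                              'Summarize the text above in one sentence.')
    print(answer)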


class MatrixBot:
    def __init__(self, user, password, key):
        self.base_url = "https://matrix.erhe.top"
        self.user = user
        self.password = password
        self.client = MatrixClient(self.base_url)
        self.token = self.login()
        self.to = key

    def login(self):
        self.token = self.client.login(username=self.user, password=self.password)
        return self.token

    def send_message(self, message):
        if self.token:
            try:
                api = MatrixHttpApi(self.base_url, token=self.token)
                api.send_message(self.to, message)
            except Exception as e:
                print(e)
                api = MatrixHttpApi(self.base_url, token=self.token)
                api.send_message(self.to, str(e))
        else:
            print("Bot is not logged in. Please login first.")


class AINEWS:
    def create_config_if_not_exists(self):
        # Create config.json next to this script with an example entry,
        # unless it already exists.
        current_dir = os.path.dirname(os.path.abspath(__file__))
        config_path = os.path.join(current_dir, 'config.json')
        if not os.path.exists(config_path):
            default_config = {
                "example": {
                    "use_browser": 0,
                    "use_ai": 0,
                    "ai_host": 'http://127.0.0.1:11434 (leave empty if unused)',
                    "message_bot_key": 'Matrix room key (leave empty if unused)',
                    "target_url_list": ['target website URL'],
                    "role": "The AI's role, e.g.: you are a chatbot",
                    "prompt_words": "Prompt, e.g.: summarize this for me, reply in Chinese"
                }
            }
            with open(config_path, 'w', encoding='utf-8') as f:
                json.dump(default_config, f, indent=4)
            print(f"Created {config_path} with default configuration.")

    def mkdir_save_data(self):
        # Create save_data/<timestamp> next to this script and return its path.
        current_file_path = os.path.dirname(__file__)
        save_file_path = os.path.join(current_file_path, 'save_data')
        if not os.path.exists(save_file_path):
            os.makedirs(save_file_path)
        # Inside save_data, create a subfolder named after the current datetime
        datetime_file_name = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
        datetime_file_path = os.path.join(save_file_path, datetime_file_name)
        if not os.path.exists(datetime_file_path):
            os.makedirs(datetime_file_path)
        return datetime_file_path

    def save_to_txt(self, url_to_text, datetime_file_path):
        # Save the scraped page text to a txt file.
        file_path = os.path.join(datetime_file_path, 'all_page_data.txt')
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(str(url_to_text))

    # region Read the configuration file
    def load_config(self, key):
        # Read config.json from the script directory; if it is missing,
        # create the example config and exit so the user can fill it in.
        current_dir = os.path.dirname(os.path.abspath(__file__))
        config_path = os.path.join(current_dir, 'config.json')
        if not os.path.exists(config_path):
            self.create_config_if_not_exists()
            exit(0)
        with open(config_path, 'r', encoding='utf-8') as f:
            config = json.load(f)
        return config[key]
    # endregion
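
    # region Example: expected config.json layout
    # A hedged sketch of a filled-in entry for the 'web3' key used below
    # (every value here is a hypothetical placeholder):
    #
    # {
    #     "web3": {
    #         "use_browser": 1,
    #         "use_ai": 1,
    #         "ai_host": "http://127.0.0.1:11434",
    #         "message_bot_key": "!roomid:matrix.erhe.top",
    #         "target_url_list": ["https://example.com/ai-news"],
    #         "role": "You are a news summarization assistant",
    #         "prompt_words": "Summarize the key points; reply in Chinese"
    #     }
    # }
    # endregion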

    # region Fetch page content with httpx
    async def get_htmls(self, urls):
        # Fetch the HTML of each URL concurrently and return {url: text}.
        async with httpx.AsyncClient() as client:
            async def get_html(url):
                try:
                    print(f'Opening: {url}')
                    # Send a GET request for the page content
                    response = await client.get(url)
                    response.raise_for_status()  # ensure the request succeeded
                    # Parse the HTML with BeautifulSoup
                    soup = BeautifulSoup(response.text, 'html.parser')
                    # Extract the plain text
                    text = soup.get_text(separator=' ', strip=True)
                    # Collapse redundant whitespace
                    cleaned_text = re.sub(r'\s+', ' ', text).strip()
                    return url, cleaned_text
                except Exception as e:
                    print(f"Error fetching {url}: {e}")
                    return url, ""
            # Fetch all sites concurrently with asyncio.gather
            tasks = [get_html(url) for url in urls]
            results = await asyncio.gather(*tasks)
            # Store the results in a dict
            url_to_text = {url: text for url, text in results}
            return url_to_text
    # endregion
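
    # region Example: fetching pages without a browser
    # A hedged usage sketch (the URL is a placeholder; not called anywhere):
    @staticmethod
    def _demo_get_htmls():
        texts = asyncio.run(AINEWS().get_htmls(['https://example.com']))
        for url, text in texts.items():
            print(url, text[:80])
    # endregion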

    # region Fetch HTML content with Playwright
    async def get_htmls_with_browser(self, urls, datetime_file_path):
        # Render each URL in a headless browser and return {url: text}.
        url_to_text = {}
        async with async_playwright() as p:
            # Launch the browser
            browser = await p.chromium.launch(headless=True)
            # Create a browser context
            context = await browser.new_context()

            async def get_html(url):
                try:
                    print(f'Opening: {url}')
                    # Open a new page in the context
                    page = await context.new_page()
                    # Navigate to the target URL
                    await page.goto(url)
                    # Suppress dialogs
                    await self.disable_dialogs(page)
                    # Block image loading and hide images
                    await self.disable_images(page)
                    # Scroll the page to load dynamic content
                    await self.scroll_to_percentage(page)
                    # Take a screenshot while we are here
                    await self.screenshot(page, datetime_file_path)
                    # Grab the rendered HTML
                    html = await page.content()
                    # Parse the HTML with BeautifulSoup
                    soup = BeautifulSoup(html, 'html.parser')
                    # Extract the plain text
                    text = soup.get_text(separator=' ', strip=True)
                    # Collapse redundant whitespace
                    cleaned_text = re.sub(r'\s+', ' ', text).strip()
                    # Close the page
                    await page.close()
                    return url, cleaned_text
                except Exception as e:
                    print(f"Error fetching {url}: {e}")
                    return url, ""

            # Fetch all sites concurrently with asyncio.gather
            tasks = [get_html(url) for url in urls]
            results = await asyncio.gather(*tasks)
            # Store the results in a dict
            url_to_text = {url: text for url, text in results}
            # Close the context and browser
            await context.close()
            await browser.close()
            return url_to_text
    # endregion
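
    # region Example: fetching pages with a headless browser
    # A hedged usage sketch (placeholder URL; screenshots land in a fresh
    # save_data/<timestamp> folder; not called anywhere):
    @staticmethod
    def _demo_get_htmls_with_browser():
        ainews = AINEWS()
        save_dir = ainews.mkdir_save_data()
        texts = asyncio.run(ainews.get_htmls_with_browser(['https://example.com'], save_dir))
        print(list(texts))
    # endregion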

    # region Scroll the page
    @staticmethod
    async def scroll_to_percentage(page):
        # Print the page title, then scroll down in small steps so that
        # lazy-loaded content has a chance to appear.
        title = await page.title()
        print(f'Scrolling page: {title}')
        percentage_list = [i for i in range(5, 101, 2)]
        for percentage in percentage_list:
            # Compute the target height for this percentage of the page
            height = await page.evaluate("() => document.body.scrollHeight")
            scroll_position = height * (percentage / 100)
            # Jump to the computed position
            await page.evaluate(f"window.scrollTo({{top: {scroll_position}, behavior: 'smooth'}})")
            await asyncio.sleep(0.5)  # async sleep
        await page.evaluate("window.scrollTo({top: 0, behavior: 'smooth'})")
    # endregion
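
    # region Example: alternative scrolling via mouse wheel
    # A hedged alternative sketch: synthetic wheel events also trigger
    # lazy-loaders that listen for wheel input (step count and distance
    # are arbitrary; not called anywhere):
    @staticmethod
    async def _demo_scroll_with_wheel(page, steps=20):
        for _ in range(steps):
            await page.mouse.wheel(0, 600)  # scroll down 600 px per step
            await asyncio.sleep(0.3)
    # endregion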

    # region Page screenshot
    @staticmethod
    async def screenshot(page, datetime_file_path):
        # Take a full-page screenshot while we are here.
        # Use the page title as the file name
        title = await page.title()
        # Strip characters that are illegal in file names
        cleaned_title = re.sub(r'[\\/:*?"<>|]', '', title)
        # Fall back to a default name if the title is empty
        if not cleaned_title:
            cleaned_title = "untitled"
        # Build the screenshot path
        screenshot_path = os.path.join(datetime_file_path, f"{cleaned_title}_{uuid.uuid4().hex[:6]}.png")
        # Capture the whole page
        await page.screenshot(path=screenshot_path, full_page=True)
        print(f"Screenshot saved to: {screenshot_path}")
    # endregion

    # region Stop the page from displaying images
    async def disable_images(self, page):
        # Run JavaScript in the page to block image loading and hide images.
        await page.evaluate('''() => {
            function disableImages() {
                // Stop all images from loading
                document.querySelectorAll('img').forEach(img => {
                    img.src = ''; // clear the src attribute
                    img.removeAttribute('srcset'); // drop srcset if present
                });
                // Hide all images
                document.querySelectorAll('img').forEach(img => {
                    img.style.display = 'none';
                });
            }
            disableImages(); // run it
        }''')
    # endregion
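
    # region Example: blocking images at the network layer
    # A hedged alternative sketch: abort image requests before they are
    # fetched, via Playwright's routing API (saves bandwidth compared to
    # hiding images after download; not called anywhere):
    @staticmethod
    async def _demo_block_images(context):
        async def handler(route):
            if route.request.resource_type == 'image':
                await route.abort()
            else:
                await route.continue_()
        await context.route('**/*', handler)
    # endregion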

    # region Override JavaScript dialog methods so they are no-ops
    async def disable_dialogs(self, page):
        # Override the page's dialog functions so they do nothing.
        await page.evaluate('''() => {
            window.alert = () => {};
            window.confirm = () => true; // confirm defaults to true
            window.prompt = () => null; // prompt defaults to null
        }''')
    # endregion
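
    # region Example: dismissing dialogs via Playwright's event API
    # A hedged alternative sketch: let Playwright dismiss native dialogs
    # through its own event hook instead of patching the page's globals
    # (not called anywhere):
    @staticmethod
    def _demo_auto_dismiss_dialogs(page):
        # Register a handler that dismisses every alert/confirm/prompt.
        async def _dismiss(dialog):
            await dialog.dismiss()
        page.on("dialog", _dismiss)
    # endregion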

    # region Process the data with AI
    def process_data(self, result_text, prompt_words, role, ai_host):
        # Summarize the scraped data and return a list of messages to send.
        process_send = []
        ollama_chat = OllamaChat(ai_host)
        if text_batch:
            # Summarize each page separately
            for k, v in result_text.items():
                response_context = FREEAI().call_ai(f'{v}\n\n{prompt_words}')
                # response_context = ollama_chat.call_ollama(role, v, prompt_words)
                if response_context:
                    message = f'{k}\n{response_context}\n'
                    process_send.append(message)
        else:
            # Summarize all pages in a single request
            t = ''
            for k, v in result_text.items():
                t += f'{k}\n{v}\n'
            response_context = ollama_chat.call_ollama(role, t, prompt_words)
            if response_context:
                process_send.append(response_context)
        return process_send
    # endregion
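
    # region Example: processing a tiny result set
    # A hedged sketch of process_data's input and output shapes (all values
    # below are made up; not called anywhere):
    @staticmethod
    def _demo_process_data():
        fake_pages = {'https://example.com': 'Example page text about AI news.'}
        messages = AINEWS().process_data(fake_pages, 'Summarize in Chinese',
                                         'You are a news assistant',
                                         'http://127.0.0.1:11434')
        for m in messages:
            print(m)
    # endregion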

    # region Main function
    def main(self, config, datetime_file_path):
        target_url_list = config['target_url_list']
        prompt_words = config['prompt_words']
        role = config['role']
        use_browser = config['use_browser']
        ai_host = config['ai_host']
        message_bot_key = config['message_bot_key']
        use_ai = config['use_ai']
        # Fetch the HTML content of every page
        if use_browser:
            result_text = asyncio.run(self.get_htmls_with_browser(target_url_list, datetime_file_path))
        else:
            result_text = asyncio.run(self.get_htmls(target_url_list))
        # Save the text
        if result_text:
            print(f'Fetched data from {len(result_text)} URLs')
            self.save_to_txt(result_text, datetime_file_path)
        else:
            print('No data, exiting')
            exit(0)
        print(f'{role}\n{prompt_words}')
        if use_ai:
            # Build the messages to send
            process_send = self.process_data(result_text, prompt_words, role, ai_host)
            # Create the message bot
            bot = MatrixBot('message-bot', 'aaaAAA111!!!', message_bot_key)
            # Send the messages
            for process_text in process_send:
                bot.send_message(process_text)
    # endregion


if __name__ == "__main__":
    ainews = AINEWS()
    ainews.create_config_if_not_exists()
    datetime_file_path = ainews.mkdir_save_data()
    for key in key_list:
        config = ainews.load_config(key)
        target_url_list = config['target_url_list']
        print(f'Keyword {key} has {len(target_url_list)} URLs')
        ainews.main(config, datetime_file_path)
    print('done!')