ai_news.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394
  1. # -*- coding: utf-8 -*-
  2. import os
  3. import re
  4. import json
  5. import uuid
  6. import httpx
  7. import asyncio
  8. import datetime
  9. from bs4 import BeautifulSoup
  10. from ollama import Client as oClient
  11. from playwright.async_api import async_playwright
  12. from matrix_client.client import MatrixClient
  13. from matrix_client.api import MatrixHttpApi
  14. key_list = ['web3']
  15. text_batch = 0
  16. class OllamaChat(object):
  17. def __init__(self, host='http://192.168.31.28:11434'):
  18. self.host = host
  19. def call_ollama(self, role, text, prompt_words, model='llava:13b', temperature=0.4):
  20. # 使用 ollama 里面的模型
  21. message = text + '\n\n' + prompt_words
  22. print(f'use model: {model}')
  23. try:
  24. response_iter = oClient(host=self.host).chat(model=model,
  25. messages=[
  26. {'role': 'system', 'content': role},
  27. {'role': 'user', 'content': message}
  28. ],
  29. options={"temperature": temperature},
  30. stream=False)
  31. return response_iter['message']['content']
  32. except Exception as e:
  33. print(f"\n发生错误: {e}")
  34. return None
  35. class MatrixBot:
  36. def __init__(self, user, password, key):
  37. self.base_url = "https://matrix.erhe.top"
  38. self.user = user
  39. self.password = password
  40. self.client = MatrixClient("https://matrix.erhe.top")
  41. self.token = self.login()
  42. self.to = key
  43. def login(self):
  44. self.token = self.client.login(username=self.user, password=self.password)
  45. return self.token
  46. def send_message(self, message):
  47. if self.token:
  48. try:
  49. api = MatrixHttpApi(self.base_url, token=self.token)
  50. api.send_message(self.to, message)
  51. except Exception as e:
  52. print(e)
  53. api = MatrixHttpApi(self.base_url, token=self.token)
  54. api.send_message(self.to, str(e))
  55. else:
  56. print("Bot is not logged in. Please login first.")
  57. class AINEWS:
  58. def create_config_if_not_exists(self):
  59. # 如果当前路径无 config 则新建 config.json, 并写入一个配置例子
  60. current_dir = os.path.dirname(os.path.abspath(__file__)) # 获取当前文件的目录路径
  61. # 构建 config.json 文件的完整路径
  62. config_path = os.path.join(current_dir, 'config.json')
  63. # 检查 config.json 文件是否存在
  64. if not os.path.exists(config_path):
  65. # 如果不存在,创建并写入默认的 JSON 数据
  66. default_config = {
  67. "example": {
  68. "use_browser": 0,
  69. "ai_host": 'http://127.0.0.1:11434(不需要此功能留空)',
  70. "message_bot_key": '填入matrix的key(不需要此功能留空)',
  71. "target_url_list": ['目标网站'],
  72. "role": "AI的角色, 例如: 你是一个聊天机器人",
  73. "prompt_words": "提示词: 帮我总结, 用中文回复"
  74. }
  75. }
  76. # 写入 JSON 数据到 config.json 文件
  77. with open(config_path, 'w', encoding='utf-8') as f:
  78. json.dump(default_config, f, indent=4)
  79. print(f"Created {config_path} with default configuration.")
  80. def mkdir_save_data(self):
  81. # 获取当前脚本所在路径
  82. current_file_path = os.path.dirname(__file__)
  83. # 拼接 save_data 文件夹路径
  84. save_file_path = os.path.join(current_file_path, 'save_data')
  85. # 如果 save_data 文件夹不存在,则创建
  86. if not os.path.exists(save_file_path):
  87. os.makedirs(save_file_path)
  88. # 在 save_data 文件夹中,创建一个以当前日期时间命名的子文件夹
  89. datetime_file_name = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
  90. datetime_file_path = os.path.join(save_file_path, datetime_file_name)
  91. if not os.path.exists(datetime_file_path):
  92. os.makedirs(datetime_file_path)
  93. return datetime_file_path
  94. def save_to_txt(self, url_to_text, datetime_file_path):
  95. # 将爬取的新闻 保存到 txt 文件中
  96. file = os.path.join(datetime_file_path, 'all_page_data.txt')
  97. with open(file, 'w', encoding='utf-8') as file:
  98. file.write(str(url_to_text))
  99. # region 读取配置文件
  100. def load_config(self, key):
  101. # 读取配置文件 config.json
  102. # 如果当前路径无 config 则新建 config.json, 并写入一个配置例子
  103. current_dir = os.path.dirname(os.path.abspath(__file__)) # 获取当前文件的目录路径
  104. # 构建 config.json 文件的完整路径
  105. config_path = os.path.join(current_dir, 'config.json')
  106. config = {}
  107. if os.path.exists('config.json'):
  108. with open('config.json', 'r', encoding='utf-8') as f:
  109. config = json.load(f)
  110. if not config:
  111. # 检查 config.json 文件是否存在
  112. if not os.path.exists(config_path):
  113. # 如果不存在,创建并写入默认的 JSON 数据
  114. default_config = {
  115. "example": {
  116. "use_browser": 0,
  117. "ai_host": 'http://127.0.0.1:11434(不需要此功能留空)',
  118. "message_bot_key": '填入matrix的key(不需要此功能留空)',
  119. "target_url_list": ['目标网站'],
  120. "role": "AI的角色, 例如: 你是一个聊天机器人",
  121. "prompt_words": "提示词: 帮我总结, 用中文回复"
  122. }
  123. }
  124. # 写入 JSON 数据到 config.json 文件
  125. with open(config_path, 'w', encoding='utf-8') as f:
  126. json.dump(default_config, f, indent=4)
  127. print(f"Created {config_path} with default configuration.")
  128. exit(0)
  129. k = config[key]
  130. return k
  131. # return k['target_url_list'], k['prompt_words'], k['role'], k['use_browser'], k['ai_host'], k['message_bot_key']
  132. # endregion
  133. # region 使用httpx获取网页内容
  134. async def get_htmls(self, urls):
  135. # 获取 HTML
  136. async with httpx.AsyncClient() as client:
  137. async def get_html(url):
  138. try:
  139. print(f'正在打开: {url}')
  140. # 发送 GET 请求获取页面内容
  141. response = await client.get(url)
  142. response.raise_for_status() # 确保请求成功
  143. # 使用 BeautifulSoup 解析 HTML 内容
  144. soup = BeautifulSoup(response.text, 'html.parser')
  145. # 提取纯文本内容
  146. text = soup.get_text(separator=' ', strip=True)
  147. # 去除多余的空白字符
  148. cleaned_text = re.sub(r'\s+', ' ', text).strip()
  149. return url, cleaned_text
  150. except Exception as e:
  151. print(f"Error fetching {url}: {e}")
  152. return url, ""
  153. # 使用 asyncio.gather 同时获取所有网站的 HTML
  154. tasks = [get_html(url) for url in urls]
  155. results = await asyncio.gather(*tasks)
  156. # 将结果存储在字典中
  157. url_to_text = {url: text for url, text in results}
  158. return url_to_text
  159. # endregion
  160. # region 使用Playwright获取HTML内容
  161. async def get_htmls_with_browser(self, urls, datetime_file_path):
  162. # 使用 Playwright 获取 HTML 内容
  163. url_to_text = {}
  164. async with async_playwright() as p:
  165. # 启动浏览器
  166. browser = await p.chromium.launch(headless=True)
  167. # 创建浏览器上下文
  168. context = await browser.new_context()
  169. async def get_html(url):
  170. try:
  171. print(f'正在打开: {url}')
  172. # 在上下文中打开新页面
  173. page = await context.new_page()
  174. # 导航到指定网址
  175. await page.goto(url)
  176. # 禁止弹框
  177. await self.disable_dialogs(page)
  178. # 调用 disable_images 方法阻止图片加载并隐藏图片
  179. await self.disable_images(page)
  180. # 滚动页面以加载动态内容
  181. await self.scroll_to_percentage(page)
  182. # 顺手截图
  183. await self.screenshot(page, datetime_file_path)
  184. # 获取渲染后的 HTML
  185. html = await page.content()
  186. # 使用 BeautifulSoup 解析 HTML 内容
  187. soup = BeautifulSoup(html, 'html.parser')
  188. # 提取纯文本内容
  189. text = soup.get_text(separator=' ', strip=True)
  190. # 去除多余的空白字符
  191. cleaned_text = re.sub(r'\s+', ' ', text).strip()
  192. # 关闭页面
  193. await page.close()
  194. return url, cleaned_text
  195. except Exception as e:
  196. print(f"Error fetching {url}: {e}")
  197. return url, ""
  198. # 使用 asyncio.gather 同时获取所有网站的 HTML
  199. tasks = [get_html(url) for url in urls]
  200. results = await asyncio.gather(*tasks)
  201. # 将结果存储在字典中
  202. url_to_text = {url: text for url, text in results}
  203. # 关闭上下文和浏览器
  204. await context.close()
  205. await browser.close()
  206. return url_to_text
  207. # endregion
  208. # region 滚动页面
  209. @staticmethod
  210. async def scroll_to_percentage(page):
  211. # 获取页面标题并打印
  212. title = await page.title()
  213. print(f'正在滚动浏览器页面: {title}')
  214. percentage_list = [i for i in range(5, 101, 2)]
  215. for percentage in percentage_list:
  216. # 计算页面的指定百分比高度
  217. height = await page.evaluate("() => document.body.scrollHeight")
  218. scroll_position = height * (percentage / 100)
  219. # 跳转到指定的百分比位置
  220. await page.evaluate(f"window.scrollTo({{top: {scroll_position}, behavior: 'smooth'}})")
  221. await asyncio.sleep(0.5) # 使用异步 sleep
  222. await page.evaluate("window.scrollTo({top: 0, behavior: 'smooth'})")
  223. # endregion
  224. # region 网页截图
  225. @staticmethod
  226. async def screenshot(page, datetime_file_path):
  227. # 顺手截图
  228. # 获取网页的 title
  229. title = await page.title()
  230. # 替换不合法的字符
  231. cleaned_title = re.sub(r'[\\/:*?"<>|]', '', title)
  232. # 如果 title 为空,使用默认名称
  233. if not cleaned_title:
  234. cleaned_title = "untitled"
  235. # 拼接截图文件路径
  236. screenshot_path = os.path.join(datetime_file_path, f"{cleaned_title}_{uuid.uuid4().hex[:6]}.png")
  237. # 进行整页截图
  238. await page.screenshot(path=screenshot_path, full_page=True)
  239. print(f"截图已保存到: {screenshot_path}")
  240. # endregion
  241. # region 禁止网页显示图片
  242. async def disable_images(self, page):
  243. # 调用 JavaScript 函数阻止图片加载并隐藏图片
  244. await page.evaluate('''() => {
  245. function disableImages() {
  246. // 阻止所有图片加载
  247. document.querySelectorAll('img').forEach(img => {
  248. img.src = ''; // 清空 src 属性
  249. img.removeAttribute('srcset'); // 移除 srcset 属性(如果有)
  250. });
  251. // 隐藏所有图片
  252. document.querySelectorAll('img').forEach(img => {
  253. img.style.display = 'none';
  254. });
  255. }
  256. disableImages(); // 调用函数
  257. }''')
  258. # endregion
  259. # region 覆盖JavaScript的弹框方法,使其无效
  260. async def disable_dialogs(self, page):
  261. # 覆盖 JavaScript 的弹框方法,使其无效
  262. await page.evaluate('''() => {
  263. window.alert = () => {};
  264. window.confirm = () => true; // confirm 默认返回 true
  265. window.prompt = () => null; // prompt 默认返回 null
  266. }''')
  267. # endregion
  268. # region AI处理数据
  269. def process_data(self, result_text, prompt_words, role, ai_host):
  270. # 整理获取的数据, 返回准备发送的数据
  271. process_send = []
  272. O = OllamaChat(ai_host)
  273. if text_batch:
  274. for k, v in result_text.items():
  275. response_context = O.call_ollama(role, v, prompt_words)
  276. if response_context:
  277. message = f'{k}\n{response_context}\n'
  278. process_send.append(message)
  279. else:
  280. t = ''
  281. for k, v in result_text.items():
  282. t += f'{k}\n{v}\n'
  283. response_context = O.call_ollama(role, t, prompt_words)
  284. if response_context:
  285. process_send.append(response_context)
  286. return process_send
  287. # endregion
  288. # region 主函数
  289. def main(self, config):
  290. target_url_list = config['target_url_list']
  291. prompt_words = config['prompt_words']
  292. role = config['role']
  293. use_browser = config['use_browser']
  294. ai_host = config['ai_host']
  295. message_bot_key = config['message_bot_key']
  296. use_ai = config['use_ai']
  297. # 获取所有的网页html内容
  298. if use_browser:
  299. result_text = asyncio.run(self.get_htmls_with_browser(target_url_list, datetime_file_path))
  300. else:
  301. result_text = asyncio.run(self.get_htmls(target_url_list))
  302. # 保存文本
  303. if result_text:
  304. print(f'共获取 {len(result_text)} 个网址的数据')
  305. self.save_to_txt(result_text, datetime_file_path)
  306. else:
  307. print('无数据, 程序退出')
  308. exit(0)
  309. if use_ai:
  310. # 处理发送 text 数据
  311. process_send = self.process_data(result_text, prompt_words, role, ai_host)
  312. # 创建消息bot实例
  313. bot = MatrixBot('message-bot', 'aaaAAA111!!!', message_bot_key)
  314. # 发送消息
  315. for process_text in process_send:
  316. bot.send_message(process_text)
  317. # endregion
  318. if __name__ == "__main__":
  319. ainews = AINEWS()
  320. ainews.create_config_if_not_exists()
  321. datetime_file_path = ainews.mkdir_save_data()
  322. for key in key_list:
  323. config = ainews.load_config(key)
  324. target_url_list = config['target_url_list']
  325. print(f'关键词 {key} 共有 {len(target_url_list)} 个网址')
  326. ainews.main(config)
  327. print('done!')