# -*- coding: utf-8 -*-
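"""Scrape a configurable list of news sites, save the extracted text (plus
screenshots when a real browser is used), optionally summarize the text with
an Ollama-hosted model, and push the result to a Matrix room."""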
import os
import re
import json
import uuid
import httpx
import asyncio
import datetime
from bs4 import BeautifulSoup
from ollama import Client as oClient
from playwright.async_api import async_playwright
from matrix_client.client import MatrixClient
from matrix_client.api import MatrixHttpApi
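
# Module-level switches:
#   key_list   -- the config.json sections to process (one scrape run per key)
#   text_batch -- if truthy, summarize each URL separately; if falsy,
#                 concatenate all page text into a single model call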
key_list = ['web3']
text_batch = 0


class OllamaChat:
    def __init__(self, host='http://192.168.31.28:11434'):
        self.host = host

    def call_ollama(self, role, text, prompt_words, model='llava:13b', temperature=0.4):
        # Call a model served by ollama
        message = text + '\n\n' + prompt_words
        print(f'use model: {model}')
        try:
            response = oClient(host=self.host).chat(
                model=model,
                messages=[
                    {'role': 'system', 'content': role},
                    {'role': 'user', 'content': message}
                ],
                options={"temperature": temperature},
                stream=False)
            return response['message']['content']
        except Exception as e:
            print(f"\nError occurred: {e}")
            return None
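
# A minimal usage sketch (the host and prompt below are hypothetical):
#   chat = OllamaChat(host='http://127.0.0.1:11434')
#   summary = chat.call_ollama('You are a news editor.', page_text, 'Summarize this page.')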


class MatrixBot:
    def __init__(self, user, password, key):
        self.base_url = "https://matrix.erhe.top"
        self.user = user
        self.password = password
        self.client = MatrixClient(self.base_url)
        self.token = self.login()
        self.to = key

    def login(self):
        self.token = self.client.login(username=self.user, password=self.password)
        return self.token

    def send_message(self, message):
        if self.token:
            try:
                api = MatrixHttpApi(self.base_url, token=self.token)
                api.send_message(self.to, message)
            except Exception as e:
                print(e)
                api = MatrixHttpApi(self.base_url, token=self.token)
                api.send_message(self.to, str(e))
        else:
            print("Bot is not logged in. Please login first.")


class AINEWS:
    def create_config_if_not_exists(self):
        # If there is no config in the current path, create config.json with an example entry
        current_dir = os.path.dirname(os.path.abspath(__file__))  # directory of this file
        # Build the full path of config.json
        config_path = os.path.join(current_dir, 'config.json')
        # Check whether config.json exists
        if not os.path.exists(config_path):
            # If not, create it and write the default JSON data
            default_config = {
                "example": {
                    "use_browser": 0,
                    "use_ai": 0,
                    "ai_host": 'http://127.0.0.1:11434 (leave empty if unused)',
                    "message_bot_key": 'the Matrix room key (leave empty if unused)',
                    "target_url_list": ['target site'],
                    "role": "the AI's role, e.g.: you are a chatbot",
                    "prompt_words": "the prompt, e.g.: summarize this for me, reply in Chinese"
                }
            }
            # Write the JSON data to config.json
            with open(config_path, 'w', encoding='utf-8') as f:
                json.dump(default_config, f, indent=4)
            print(f"Created {config_path} with default configuration.")

    def mkdir_save_data(self):
        # Directory containing this script
        current_file_path = os.path.dirname(__file__)
        # Path of the save_data folder
        save_file_path = os.path.join(current_file_path, 'save_data')
        # Create save_data if it does not exist
        if not os.path.exists(save_file_path):
            os.makedirs(save_file_path)
        # Inside save_data, create a subfolder named after the current date and time
        datetime_file_name = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
        datetime_file_path = os.path.join(save_file_path, datetime_file_name)
        if not os.path.exists(datetime_file_path):
            os.makedirs(datetime_file_path)
        return datetime_file_path

    def save_to_txt(self, url_to_text, datetime_file_path):
        # Save the scraped text of every page into one txt file
        file_path = os.path.join(datetime_file_path, 'all_page_data.txt')
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(str(url_to_text))

    def load_config(self, key):
        # Read config.json from the script's directory, matching where
        # create_config_if_not_exists writes it
        config = {}
        config_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'config.json')
        if os.path.exists(config_path):
            with open(config_path, 'r', encoding='utf-8') as f:
                config = json.load(f)
        if not config:
            print('config.json does not exist!')
            exit(0)
        return config[key]

    async def get_htmls(self, urls):
        # Fetch the HTML of every URL concurrently with httpx (no JS execution)
        async with httpx.AsyncClient() as client:
            async def get_html(url):
                try:
                    print(f'Opening: {url}')
                    # Send a GET request for the page content
                    response = await client.get(url)
                    response.raise_for_status()  # make sure the request succeeded
                    # Parse the HTML with BeautifulSoup
                    soup = BeautifulSoup(response.text, 'html.parser')
                    # Extract the plain text
                    text = soup.get_text(separator=' ', strip=True)
                    # Collapse redundant whitespace
                    cleaned_text = re.sub(r'\s+', ' ', text).strip()
                    return url, cleaned_text
                except Exception as e:
                    print(f"Error fetching {url}: {e}")
                    return url, ""

            # Fetch all sites concurrently with asyncio.gather
            tasks = [get_html(url) for url in urls]
            results = await asyncio.gather(*tasks)
            # Store the results in a dict
            url_to_text = {url: text for url, text in results}
            return url_to_text
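
    # get_htmls() cannot see content rendered by JavaScript; setting
    # "use_browser" in config.json switches to the Playwright variant below.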

    async def get_htmls_with_browser(self, urls, datetime_file_path):
        # Fetch the HTML content with Playwright, rendering JS first
        url_to_text = {}
        async with async_playwright() as p:
            # Launch the browser
            browser = await p.chromium.launch(headless=True)
            # Create a browser context
            context = await browser.new_context()

            async def get_html(url):
                try:
                    print(f'Opening: {url}')
                    # Open a new page in the context
                    page = await context.new_page()
                    # Navigate to the target URL
                    await page.goto(url)
                    # Suppress dialog boxes
                    await self.disable_dialogs(page)
                    # Block image loading and hide images
                    await self.disable_images(page)
                    # Scroll the page to load dynamic content
                    await self.scroll_to_percentage(page)
                    # Take a screenshot while we are at it
                    await self.screenshot(page, datetime_file_path)
                    # Get the rendered HTML
                    html = await page.content()
                    # Parse the HTML with BeautifulSoup
                    soup = BeautifulSoup(html, 'html.parser')
                    # Extract the plain text
                    text = soup.get_text(separator=' ', strip=True)
                    # Collapse redundant whitespace
                    cleaned_text = re.sub(r'\s+', ' ', text).strip()
                    # Close the page
                    await page.close()
                    return url, cleaned_text
                except Exception as e:
                    print(f"Error fetching {url}: {e}")
                    return url, ""

            # Fetch all sites concurrently with asyncio.gather
            tasks = [get_html(url) for url in urls]
            results = await asyncio.gather(*tasks)
            # Store the results in a dict
            url_to_text = {url: text for url, text in results}
            # Close the context and the browser
            await context.close()
            await browser.close()
        return url_to_text

    @staticmethod
    async def scroll_to_percentage(page):
        # Print the page title, then scroll down in small steps
        title = await page.title()
        print(f'Scrolling browser page: {title}')
        percentage_list = list(range(5, 101, 2))
        for percentage in percentage_list:
            # Compute the given percentage of the page height
            height = await page.evaluate("() => document.body.scrollHeight")
            scroll_position = height * (percentage / 100)
            # Scroll to that position
            await page.evaluate(f"window.scrollTo({{top: {scroll_position}, behavior: 'smooth'}})")
            await asyncio.sleep(0.5)  # async sleep, so other tasks keep running
        await page.evaluate("window.scrollTo({top: 0, behavior: 'smooth'})")

    @staticmethod
    async def screenshot(page, datetime_file_path):
        # Take a screenshot in passing
        # Get the page title
        title = await page.title()
        # Strip characters that are illegal in file names
        cleaned_title = re.sub(r'[\\/:*?"<>|]', '', title)
        # Fall back to a default name if the title is empty
        if not cleaned_title:
            cleaned_title = "untitled"
        # Build the screenshot file path
        screenshot_path = os.path.join(datetime_file_path, f"{cleaned_title}_{uuid.uuid4().hex[:6]}.png")
        # Take a full-page screenshot
        await page.screenshot(path=screenshot_path, full_page=True)
        print(f"Screenshot saved to: {screenshot_path}")

    async def disable_images(self, page):
        # Run a JavaScript snippet that blocks image loading and hides all images
        await page.evaluate('''() => {
            function disableImages() {
                // Block every image from loading
                document.querySelectorAll('img').forEach(img => {
                    img.src = '';                   // clear the src attribute
                    img.removeAttribute('srcset');  // drop srcset (if present)
                });
                // Hide every image
                document.querySelectorAll('img').forEach(img => {
                    img.style.display = 'none';
                });
            }
            disableImages();  // run it
        }''')

    async def disable_dialogs(self, page):
        # Override the JavaScript dialog functions so they do nothing
        await page.evaluate('''() => {
            window.alert = () => {};
            window.confirm = () => true;  // confirm defaults to true
            window.prompt = () => null;   // prompt defaults to null
        }''')
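
    # Note: both helpers above run after page.goto(), so images and dialogs
    # that appear during the initial load are not blocked. A stricter
    # (untested here) alternative is request interception, e.g.:
    #   await context.route(re.compile(r'\.(png|jpe?g|gif|webp)'),
    #                       lambda route: route.abort())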

    def process_data(self, result_text, prompt_words, role, ai_host):
        # Organize the fetched data and return the messages to send
        process_send = []
        chat = OllamaChat(ai_host)
        if text_batch:
            # One model call per URL
            for k, v in result_text.items():
                response_context = chat.call_ollama(role, v, prompt_words)
                if response_context:
                    message = f'{k}\n{response_context}\n'
                    process_send.append(message)
        else:
            # One model call over the concatenated text of all URLs
            t = ''
            for k, v in result_text.items():
                t += f'{k}\n{v}\n'
            response_context = chat.call_ollama(role, t, prompt_words)
            if response_context:
                process_send.append(response_context)
        return process_send
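
    # Note: with text_batch falsy, the text of every page goes into a single
    # prompt, which may exceed the model's context window on large pages.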

    def main(self, config, datetime_file_path):
        target_url_list = config['target_url_list']
        prompt_words = config['prompt_words']
        role = config['role']
        use_browser = config['use_browser']
        ai_host = config['ai_host']
        message_bot_key = config['message_bot_key']
        use_ai = config['use_ai']
        # Fetch the HTML content of every page
        if use_browser:
            result_text = asyncio.run(self.get_htmls_with_browser(target_url_list, datetime_file_path))
        else:
            result_text = asyncio.run(self.get_htmls(target_url_list))
        # Save the text
        if result_text:
            print(f'Fetched data from {len(result_text)} URLs')
            self.save_to_txt(result_text, datetime_file_path)
        else:
            print('No data, exiting')
            exit(0)
        if use_ai:
            # Build the text messages to send
            process_send = self.process_data(result_text, prompt_words, role, ai_host)
            # Create the message bot instance
            bot = MatrixBot('message-bot', 'aaaAAA111!!!', message_bot_key)
            # Send the messages
            for process_text in process_send:
                bot.send_message(process_text)


if __name__ == "__main__":
    ainews = AINEWS()
    ainews.create_config_if_not_exists()
    datetime_file_path = ainews.mkdir_save_data()
    for key in key_list:
        config = ainews.load_config(key)
        target_url_list = config['target_url_list']
        print(f'Keyword {key} has {len(target_url_list)} URLs')
        ainews.main(config, datetime_file_path)
    print('done!')
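
# To run (assumed setup; package names are the usual PyPI ones):
#   pip install httpx beautifulsoup4 ollama playwright matrix-client
#   playwright install chromium
#   python ai_news.py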