ai_news.py

# -*- coding: utf-8 -*-
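"""Scrape each configured target URL (plain HTTP requests or a headless
Playwright browser), save the extracted page text under save_data/, and
optionally summarize it with an Ollama model and post the result to a
Matrix room via MatrixBot."""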
import os
import re
import json
import uuid
import httpx
import asyncio
import datetime
from bs4 import BeautifulSoup
from ollama import Client as oClient
from playwright.async_api import async_playwright
from matrix_client.client import MatrixClient
from matrix_client.api import MatrixHttpApi

key_list = ['web3']  # config.json sections to process
text_batch = 0  # 1: summarize each page separately; 0: summarize all pages in one call


class OllamaChat(object):
    def __init__(self, host='http://192.168.31.28:11434'):
        self.host = host

    def call_ollama(self, role, text, prompt_words, model='llava:13b', temperature=0.4):
        # Call a model hosted on the Ollama server
        message = text + '\n\n' + prompt_words
        print(f'use model: {model}')
        try:
            response_iter = oClient(host=self.host).chat(
                model=model,
                messages=[
                    {'role': 'system', 'content': role},
                    {'role': 'user', 'content': message}
                ],
                options={"temperature": temperature},
                stream=False)
            return response_iter['message']['content']
        except Exception as e:
            print(f"\nAn error occurred: {e}")
            return None
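
# Minimal usage sketch for OllamaChat (illustrative only; assumes an Ollama
# server is reachable at the given host and the 'llava:13b' model is pulled):
#
#   chat = OllamaChat(host='http://127.0.0.1:11434')
#   summary = chat.call_ollama(role='You are a summarizer',
#                              text='<scraped page text>',
#                              prompt_words='Summarize the text above')
#   print(summary)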


class MatrixBot:
    def __init__(self, user, password, key):
        # 'key' is the Matrix room ID the bot sends messages to
        self.base_url = "https://matrix.erhe.top"
        self.user = user
        self.password = password
        self.client = MatrixClient(self.base_url)
        self.token = self.login()
        self.to = key

    def login(self):
        self.token = self.client.login(username=self.user, password=self.password)
        return self.token

    def send_message(self, message):
        if self.token:
            try:
                api = MatrixHttpApi(self.base_url, token=self.token)
                api.send_message(self.to, message)
            except Exception as e:
                print(e)
                api = MatrixHttpApi(self.base_url, token=self.token)
                api.send_message(self.to, str(e))
        else:
            print("Bot is not logged in. Please login first.")


class AINEWS:
    def create_config_if_not_exists(self):
        # If there is no config in the current directory, create config.json with an example configuration
        current_dir = os.path.dirname(os.path.abspath(__file__))  # directory containing this file
        # Build the full path to config.json
        config_path = os.path.join(current_dir, 'config.json')
        # Check whether config.json already exists
        if not os.path.exists(config_path):
            # If not, create it and write the default JSON data
            default_config = {
                "example": {
                    "use_browser": 0,
                    "ai_host": 'http://127.0.0.1:11434 (leave empty if this feature is not needed)',
                    "message_bot_key": 'the Matrix room key (leave empty if this feature is not needed)',
                    "target_url_list": ['target website'],
                    "role": "the AI's role, e.g.: you are a chatbot",
                    "prompt_words": "prompt: summarize this for me, reply in Chinese"
                }
            }
            # Write the JSON data to config.json
            with open(config_path, 'w', encoding='utf-8') as f:
                json.dump(default_config, f, indent=4)
            print(f"Created {config_path} with default configuration.")
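
    # Illustrative only: a filled-in config.json section (the values below are
    # placeholders, not the author's real configuration; the section name must
    # match an entry in key_list):
    #
    # {
    #     "web3": {
    #         "use_browser": 1,
    #         "ai_host": "http://127.0.0.1:11434",
    #         "message_bot_key": "!your-room-id:matrix.example.org",
    #         "target_url_list": ["https://example.com/ai-news"],
    #         "role": "You are a news summarization assistant",
    #         "prompt_words": "Summarize the pages above and reply in Chinese"
    #     }
    # }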

    def mkdir_save_data(self):
        # Get the directory containing this script
        current_file_path = os.path.dirname(__file__)
        # Build the path to the save_data folder
        save_file_path = os.path.join(current_file_path, 'save_data')
        # Create the save_data folder if it does not exist
        if not os.path.exists(save_file_path):
            os.makedirs(save_file_path)
        # Inside save_data, create a subfolder named after the current date and time
        datetime_file_name = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
        datetime_file_path = os.path.join(save_file_path, datetime_file_name)
        if not os.path.exists(datetime_file_path):
            os.makedirs(datetime_file_path)
        return datetime_file_path

    def save_to_txt(self, url_to_text, datetime_file_path):
        # Save the scraped news text to a txt file
        file_path = os.path.join(datetime_file_path, 'all_page_data.txt')
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(str(url_to_text))

    def load_config(self, key):
        # Read the config.json configuration file (next to this script)
        config = {}
        config_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'config.json')
        if os.path.exists(config_path):
            with open(config_path, 'r', encoding='utf-8') as f:
                config = json.load(f)
        if not config:
            print('config.json does not exist!')
            exit(0)
        k = config[key]
        return k['target_url_list'], k['prompt_words'], k['role'], k['use_browser'], k['ai_host'], k['message_bot_key']

    async def get_htmls(self, urls):
        # Fetch HTML over plain HTTP
        async with httpx.AsyncClient() as client:
            async def get_html(url):
                try:
                    print(f'Opening: {url}')
                    # Send a GET request for the page content
                    response = await client.get(url)
                    response.raise_for_status()  # make sure the request succeeded
                    # Parse the HTML with BeautifulSoup
                    soup = BeautifulSoup(response.text, 'html.parser')
                    # Extract the plain text content
                    text = soup.get_text(separator=' ', strip=True)
                    # Collapse extra whitespace
                    cleaned_text = re.sub(r'\s+', ' ', text).strip()
                    return url, cleaned_text
                except Exception as e:
                    print(f"Error fetching {url}: {e}")
                    return url, ""

            # Fetch the HTML of all sites concurrently with asyncio.gather
            tasks = [get_html(url) for url in urls]
            results = await asyncio.gather(*tasks)
            # Store the results in a dict
            url_to_text = {url: text for url, text in results}
            return url_to_text

    async def get_htmls_with_browser(self, urls, datetime_file_path):
        # Fetch HTML content with Playwright (for JavaScript-rendered pages)
        async with async_playwright() as p:
            # Launch the browser
            browser = await p.chromium.launch(headless=True)
            # Create a browser context
            context = await browser.new_context()

            async def get_html(url):
                try:
                    print(f'Opening: {url}')
                    # Open a new page in the context
                    page = await context.new_page()
                    # Navigate to the target URL
                    await page.goto(url)
                    # Scroll the page to trigger lazy-loaded content
                    await self.scroll_to_percentage(page)
                    # Take a screenshot while we are here
                    await self.screenshot(page, datetime_file_path)
                    # Get the rendered HTML
                    html = await page.content()
                    # Parse the HTML with BeautifulSoup
                    soup = BeautifulSoup(html, 'html.parser')
                    # Extract the plain text content
                    text = soup.get_text(separator=' ', strip=True)
                    # Collapse extra whitespace
                    cleaned_text = re.sub(r'\s+', ' ', text).strip()
                    # Close the page
                    await page.close()
                    return url, cleaned_text
                except Exception as e:
                    print(f"Error fetching {url}: {e}")
                    return url, ""

            # Fetch the HTML of all sites concurrently with asyncio.gather
            tasks = [get_html(url) for url in urls]
            results = await asyncio.gather(*tasks)
            # Store the results in a dict
            url_to_text = {url: text for url, text in results}
            # Close the context and the browser
            await context.close()
            await browser.close()
        return url_to_text

    @staticmethod
    async def scroll_to_percentage(page):
        # Get the page title and print it
        title = await page.title()
        print(f'Scrolling browser page: {title}')
        percentage_list = [i for i in range(5, 101, 2)]
        for percentage in percentage_list:
            # Compute the scroll position for the given percentage of the page height
            height = await page.evaluate("() => document.body.scrollHeight")
            scroll_position = height * (percentage / 100)
            # Jump to that position
            await page.evaluate(f"window.scrollTo({{top: {scroll_position}, behavior: 'smooth'}})")
            await asyncio.sleep(0.5)  # use an async sleep
        await page.evaluate("window.scrollTo({top: 0, behavior: 'smooth'})")

    @staticmethod
    async def screenshot(page, datetime_file_path):
        # Take a screenshot while we are on the page
        # Get the page title
        title = await page.title()
        # Strip characters that are not allowed in filenames
        cleaned_title = re.sub(r'[\\/:*?"<>|]', '', title)
        # Fall back to a default name if the title is empty
        if not cleaned_title:
            cleaned_title = "untitled"
        # Build the screenshot file path
        screenshot_path = os.path.join(datetime_file_path, f"{cleaned_title}_{uuid.uuid4().hex[:6]}.png")
        # Take a full-page screenshot
        await page.screenshot(path=screenshot_path, full_page=True)
        print(f"Screenshot saved to: {screenshot_path}")

    def process_data(self, result_text, prompt_words, role, ai_host):
        # Organize the scraped data and return the messages to send
        process_send = []
        ollama_chat = OllamaChat(ai_host)
        if text_batch:
            # Summarize each page separately
            for k, v in result_text.items():
                response_context = ollama_chat.call_ollama(role, v, prompt_words)
                if response_context:
                    message = f'{k}\n{response_context}\n'
                    process_send.append(message)
        else:
            # Concatenate all pages and summarize them in a single call
            t = ''
            for k, v in result_text.items():
                t += f'{k}\n{v}\n'
            response_context = ollama_chat.call_ollama(role, t, prompt_words)
            if response_context:
                process_send.append(response_context)
        return process_send

    def main(self, datetime_file_path, target_url_list, prompt_words, role, use_browser, ai_host, message_bot_key):
        # Fetch the HTML content of all pages
        if use_browser:
            result_text = asyncio.run(self.get_htmls_with_browser(target_url_list, datetime_file_path))
        else:
            result_text = asyncio.run(self.get_htmls(target_url_list))
        # Save the text
        if result_text:
            print(f'Fetched data from {len(result_text)} URLs')
            self.save_to_txt(result_text, datetime_file_path)
        else:
            print('No data, exiting')
            exit(0)
        # # If you only need to save the scraped data and do not want to use the AI, comment out the lines below
        # # Process the text data to send
        # process_send = self.process_data(result_text, prompt_words, role, ai_host)
        #
        # # Create the message bot instance
        # bot = MatrixBot('message-bot', 'aaaAAA111!!!', message_bot_key)
        # # Send the messages
        # for process_text in process_send:
        #     bot.send_message(process_text)


if __name__ == "__main__":
    ainews = AINEWS()
    ainews.create_config_if_not_exists()
    datetime_file_path = ainews.mkdir_save_data()
    for key in key_list:
        target_url_list, prompt_words, role, use_browser, ai_host, message_bot_key = ainews.load_config(key)
        print(f'Keyword {key} has {len(target_url_list)} URLs')
        ainews.main(datetime_file_path, target_url_list, prompt_words, role, use_browser, ai_host, message_bot_key)
    print('done!')