ai_news.py

# -*- coding: utf-8 -*-
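"""Crawl a configurable list of news sites, optionally summarize the crawled
text with a model served by Ollama, and (when the commented-out block in
AINEWS.main is enabled) push the summaries to a Matrix room.

Behaviour is driven by config.json (see AINEWS.create_config_if_not_exists for
the expected structure). Module-level settings:
  key_list   -- config keys to process on each run
  text_batch -- truthy: summarize each page separately;
                falsy: summarize all pages together in a single call
"""
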
import os
import re
import json
import httpx
import asyncio
import time
from bs4 import BeautifulSoup
from ollama import Client as oClient
from playwright.async_api import async_playwright
from matrix_client.client import MatrixClient
from matrix_client.api import MatrixHttpApi

key_list = ['web3']
text_batch = 0


class OllamaChat(object):
    def __init__(self, host='http://192.168.31.28:11434'):
        self.host = host

    def call_ollama(self, role, text, prompt_words, model='llava:13b', temperature=0.4):
        # Call a model served by Ollama
        message = text + '\n\n' + prompt_words
        print(f'use model: {model}')
        try:
            response_iter = oClient(host=self.host).chat(
                model=model,
                messages=[
                    {'role': 'system', 'content': role},
                    {'role': 'user', 'content': message}
                ],
                options={"temperature": temperature},
                stream=False)
            return response_iter['message']['content']
        except Exception as e:
            print(f"\nAn error occurred: {e}")
            return None


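# Minimal usage sketch for OllamaChat (an illustration only; it assumes an
# Ollama server is reachable at the given host and that the default
# 'llava:13b' model has been pulled there):
#
#   chat = OllamaChat(host='http://127.0.0.1:11434')
#   summary = chat.call_ollama(role='You are a chatbot',
#                              text='<crawled page text>',
#                              prompt_words='Summarize the text above')
#   print(summary)

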
class MatrixBot:
    def __init__(self, user, password):
        # Homeserver URL and target room ID are hardcoded
        self.base_url = "https://matrix.erhe.top"
        self.user = user
        self.password = password
        self.client = MatrixClient(self.base_url)
        self.token = self.login()
        self.to = "!CgWvWEnLbKYvhXLvil:chat.abeginner.cn"

    def login(self):
        self.token = self.client.login(username=self.user, password=self.password)
        return self.token

    def send_message(self, message):
        if self.token:
            try:
                api = MatrixHttpApi(self.base_url, token=self.token)
                api.send_message(self.to, message)
            except Exception as e:
                # On failure, report the error into the same room
                print(e)
                api = MatrixHttpApi(self.base_url, token=self.token)
                api.send_message(self.to, str(e))
        else:
            print("Bot is not logged in. Please login first.")


class AINEWS:
    def create_config_if_not_exists(self):
        # Create config.json next to this file with an example entry if it does not exist yet
        current_dir = os.path.dirname(os.path.abspath(__file__))  # directory of this file
        # Build the full path to config.json
        config_path = os.path.join(current_dir, 'config.json')
        # Check whether config.json already exists
        if not os.path.exists(config_path):
            # If not, create it with the default JSON data
            default_config = {
                "example": {
                    "use_browser": 0,
                    "ai_host": 'http://127.0.0.1:11434',
                    "target_url_list": ['target website'],
                    "role": "The AI's role, e.g.: you are a chatbot",
                    "prompt_words": "Prompt, e.g.: summarize this for me, reply in Chinese"
                }
            }
            # Write the JSON data to config.json
            with open(config_path, 'w', encoding='utf-8') as f:
                json.dump(default_config, f, indent=4)
            print(f"Created {config_path} with default configuration.")

    def save_to_txt(self, url_to_text):
        # Save the crawled news to a txt file
        current_file_path = os.path.dirname(__file__)
        save_file_path = os.path.join(current_file_path, 'save_txt')
        if not os.path.exists(save_file_path):
            os.makedirs(save_file_path)
        file_path = os.path.join(save_file_path, str(int(time.time())) + '.txt')
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(str(url_to_text))

    def load_config(self, key):
        # Read config.json from the same directory as this file
        config = {}
        config_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'config.json')
        if os.path.exists(config_path):
            with open(config_path, 'r', encoding='utf-8') as f:
                config = json.load(f)
        if not config:
            print('config.json does not exist!')
            exit(0)
        k = config[key]
        return k['target_url_list'], k['prompt_words'], k['role'], k['use_browser'], k['ai_host']

    async def get_htmls(self, urls):
        # Fetch the HTML of each URL with httpx
        async with httpx.AsyncClient() as client:
            async def get_html(url):
                try:
                    print(f'Opening: {url}')
                    # Send a GET request for the page content
                    response = await client.get(url)
                    response.raise_for_status()  # make sure the request succeeded
                    # Parse the HTML with BeautifulSoup
                    soup = BeautifulSoup(response.text, 'html.parser')
                    # Extract the plain text
                    text = soup.get_text(separator=' ', strip=True)
                    # Collapse extra whitespace
                    cleaned_text = re.sub(r'\s+', ' ', text).strip()
                    return url, cleaned_text
                except Exception as e:
                    print(f"Error fetching {url}: {e}")
                    return url, ""

            # Fetch all sites concurrently with asyncio.gather
            tasks = [get_html(url) for url in urls]
            results = await asyncio.gather(*tasks)
        # Store the results in a dict keyed by URL
        url_to_text = {url: text for url, text in results}
        return url_to_text

    async def get_htmls_with_browser(self, urls):
        # Fetch the HTML of each URL with Playwright (for pages that need JS rendering)
        url_to_text = {}
        async with async_playwright() as p:
            # Launch the browser
            browser = await p.chromium.launch(headless=True)
            # Create a browser context
            context = await browser.new_context()

            async def get_html(url):
                try:
                    print(f'Opening: {url}')
                    # Open a new page in the context
                    page = await context.new_page()
                    # Navigate to the target URL
                    await page.goto(url)
                    # Scroll the page to load dynamic content
                    await self.scroll_to_percentage(page)
                    # Get the rendered HTML
                    html = await page.content()
                    # Parse the HTML with BeautifulSoup
                    soup = BeautifulSoup(html, 'html.parser')
                    # Extract the plain text
                    text = soup.get_text(separator=' ', strip=True)
                    # Collapse extra whitespace
                    cleaned_text = re.sub(r'\s+', ' ', text).strip()
                    # Close the page
                    await page.close()
                    return url, cleaned_text
                except Exception as e:
                    print(f"Error fetching {url}: {e}")
                    return url, ""

            # Fetch all sites concurrently with asyncio.gather
            tasks = [get_html(url) for url in urls]
            results = await asyncio.gather(*tasks)
            # Store the results in a dict keyed by URL
            url_to_text = {url: text for url, text in results}
            # Close the context and the browser
            await context.close()
            await browser.close()
        return url_to_text

    @staticmethod
    async def scroll_to_percentage(page):
        # Get and print the page title
        title = await page.title()
        print(f'Scrolling browser page: {title}')
        percentage_list = [i for i in range(5, 101, 2)]
        for percentage in percentage_list:
            # Compute the target scroll offset as a percentage of the page height
            height = await page.evaluate("() => document.body.scrollHeight")
            scroll_position = height * (percentage / 100)
            # Scroll to that position
            await page.evaluate(f"window.scrollTo({{top: {scroll_position}, behavior: 'smooth'}})")
            await asyncio.sleep(0.5)  # use an async sleep
        await page.evaluate("window.scrollTo({top: 0, behavior: 'smooth'})")

    def process_data(self, result_text, prompt_words, role, ai_host):
        # Shape the fetched data and return the list of messages to send
        process_send = []
        O = OllamaChat(ai_host)
        if text_batch:
            # Summarize each page separately
            for k, v in result_text.items():
                response_context = O.call_ollama(role, v, prompt_words)
                if response_context:
                    message = f'{k}\n{response_context}\n'
                    process_send.append(message)
        else:
            # Concatenate all pages and summarize them in a single call
            t = ''
            for k, v in result_text.items():
                t += f'{k}\n{v}\n'
            response_context = O.call_ollama(role, t, prompt_words)
            if response_context:
                process_send.append(response_context)
        return process_send

    def main(self, target_url_list, prompt_words, role, use_browser, ai_host):
        # Fetch the HTML content of every target page
        if use_browser:
            result_text = asyncio.run(self.get_htmls_with_browser(target_url_list))
        else:
            result_text = asyncio.run(self.get_htmls(target_url_list))
        # Save the text
        self.save_to_txt(result_text)

        # # If you only need to save the crawled data without using the AI,
        # # keep the lines below commented out.
        # # Create the message bot instance
        # bot = MatrixBot('message-bot', 'aaaAAA111!!!')
        #
        # # Process the text data to send
        # process_send = self.process_data(result_text, prompt_words, role, ai_host)
        #
        # # Send the messages
        # for process_text in process_send:
        #     bot.send_message(process_text)


if __name__ == "__main__":
    ainews = AINEWS()
    ainews.create_config_if_not_exists()
    for key in key_list:
        target_url_list, prompt_words, role, use_browser, ai_host = ainews.load_config(key)
        ainews.main(target_url_list, prompt_words, role, use_browser, ai_host)
    print('done!')
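
# A hypothetical config.json entry for the 'web3' key used above. This is a
# sketch only: the URL, role, and prompt are illustrative placeholders, not
# values taken from this repository.
#
# {
#     "web3": {
#         "use_browser": 1,
#         "ai_host": "http://127.0.0.1:11434",
#         "target_url_list": ["https://example.com/news"],
#         "role": "You are a news summarization assistant.",
#         "prompt_words": "Summarize the key points of the text above and reply in Chinese."
#     }
# }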