@@ -2,15 +2,15 @@
 import os
 import re
 import json
-from playwright.async_api import async_playwright
+import httpx
 import asyncio
+import time
 from bs4 import BeautifulSoup
-from api_ollama import *
-from api_kimi import *
-from api_deepseek import *
-from send_matrix import *
+from ollama import Client as oClient
+from send_matrix import MatrixBot
 
 key = 'web3'
+text_batch = 0
 
 
 class AINEWS:
@@ -27,8 +27,8 @@ class AINEWS:
         default_config = {
             "example": {
                 "target_url_list": [],
-                "prompt_words": "",
-                "role": ""
+                "role": "",
+                "prompt_words": ""
             }
         }
 
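Note for readers of this diff: the hunk above only reorders the default keys. The JSON config the class loads presumably mirrors that default shape, keyed by the module-level `key = 'web3'` and consumed by the `target_url_list / prompt_words / role` lookup below. A rough sketch of such a config, where the URL, role, and prompt values are placeholders and not taken from the project:

# Hypothetical config contents; only the structure (a top-level key holding
# target_url_list / role / prompt_words) comes from this diff, the values are
# made up for illustration.
example_config = {
    "web3": {
        "target_url_list": ["https://example.com/web3-news"],   # pages to fetch and summarize
        "role": "You are a news summarizer.",                   # system prompt for the model
        "prompt_words": "Summarize the key points in English."  # user prompt appended to the page text
    }
}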
@@ -61,47 +61,23 @@ class AINEWS:
         k = config[key]
         return k['target_url_list'], k['prompt_words'], k['role']
 
-    @staticmethod
-    async def scroll_to_percentage(page):
-        percentage_list = [i for i in range(5, 101, 2)]
-        for percentage in percentage_list:
-            # Calculate the given percentage of the page height
-            height = await page.evaluate("() => document.body.scrollHeight")
-            scroll_position = height * (percentage / 100)
-            # Jump to that percentage position
-            await page.evaluate(f"window.scrollTo({{top: {scroll_position}, behavior: 'smooth'}})")
-            await asyncio.sleep(0.5)  # use an async sleep
-        await page.evaluate("window.scrollTo({top: 0, behavior: 'smooth'})")
-
     async def get_htmls(self, urls):
-        async with async_playwright() as p:
-            # Launch the browser
-            browser = await p.chromium.launch(headless=True)
-            # Create a browser context
-            context = await browser.new_context()
-
+        async with httpx.AsyncClient() as client:
             async def get_html(url):
                 try:
                     print(f'Opening: {url}')
-                    # Open a new page in the context
-                    page = await context.new_page()
+                    # Send a GET request for the page content
+                    response = await client.get(url)
+                    response.raise_for_status()  # make sure the request succeeded
 
-                    # Navigate to the target URL
-                    await page.goto(url, wait_until='networkidle')  # wait for the network to go idle
+                    # Parse the HTML content with BeautifulSoup
+                    soup = BeautifulSoup(response.text, 'html.parser')
 
-                    # Scroll the page to load more content
-                    await self.scroll_to_percentage(page)
+                    # Extract the plain-text content
+                    text = soup.get_text(separator=' ', strip=True)
 
-                    # Grab the rendered HTML
-                    html = await page.content()
-                    # Close the page
-                    await page.close()
-
-                    # Flatten the HTML content with BeautifulSoup
-                    soup = BeautifulSoup(html, 'html.parser')
-                    formatted_html = soup.get_text()
-                    cleaned_text = re.sub(r'[\n\t\r]+', ' ', formatted_html)
-                    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
+                    # Strip redundant whitespace
+                    cleaned_text = re.sub(r'\s+', ' ', text).strip()
 
                     return url, cleaned_text
                 except Exception as e:
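Reviewer note: the hunk above swaps the Playwright page rendering for a plain HTTP fetch. A minimal standalone sketch of that new path, using the same httpx + BeautifulSoup + re calls as get_html (the URL is only an example; there is no timeout or retry handling beyond what the diff itself does):

import asyncio
import re

import httpx
from bs4 import BeautifulSoup


async def fetch_clean_text(url):
    # Fetch the raw HTML over HTTP (no JavaScript rendering, unlike Playwright)
    async with httpx.AsyncClient() as client:
        response = await client.get(url)
        response.raise_for_status()  # raise on 4xx/5xx

    # Strip tags and collapse whitespace, mirroring get_html in this diff
    soup = BeautifulSoup(response.text, 'html.parser')
    text = soup.get_text(separator=' ', strip=True)
    return re.sub(r'\s+', ' ', text).strip()


if __name__ == '__main__':
    # Example URL only; swap in an entry from target_url_list
    print(asyncio.run(fetch_clean_text('https://example.com')))

One consequence of this swap worth flagging: pages that build their content with JavaScript will now come back largely empty, since nothing renders the page any more.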
@@ -115,38 +91,54 @@ class AINEWS:
             # Store the results in a dict
             url_to_text = {url: text for url, text in results}
 
-            # Close the context and the browser
-            await context.close()
-            await browser.close()
-
             return url_to_text
 
+    def call_ollama(self, host, role, text, prompt_words, model='llava:13b', temperature=0.4):
+        message = text + '\n\n' + prompt_words
+        print(f'use model: {model}')
+        try:
+            response_iter = oClient(host=host).chat(model=model,
+                                                    messages=[
+                                                        {'role': 'system', 'content': role},
+                                                        {'role': 'user', 'content': message}
+                                                    ],
+                                                    options={"temperature": temperature},
+                                                    stream=False)
+            return response_iter['message']['content']
+        except Exception as e:
+            print(f"\nAn error occurred: {e}")
+            return None
+
+    def process_data(self, result_text, prompt_words, role):
+        process_send = []
+        if text_batch:
+            for k, v in result_text.items():
+                response_context = self.call_ollama('http://192.168.31.28:11434', role, v, prompt_words)
+                if response_context:
+                    message = f'{k}\n{response_context}\n'
+                    process_send.append(message)
+        else:
+            t = ''
+            for k, v in result_text.items():
+                t += f'{k}\n{v}\n'
+            response_context = self.call_ollama('http://192.168.31.28:11434', role, t, prompt_words)
+            if response_context:
+                process_send.append(response_context)
+        return process_send
+
     def main(self, target_url_list, prompt_words, role):
-        url_to_text = asyncio.run(self.get_htmls(target_url_list))
+        result_text = asyncio.run(self.get_htmls(target_url_list))
+        self.save_to_txt(result_text)
 
         # Create the message bot instance
         bot = MatrixBot('message-bot', 'aaaAAA111!!!')
 
-        self.save_to_txt(url_to_text)
-
-        O = OllamaChat()
-        for k, v in url_to_text.items():
-            response_context = O.call_ollama('http://127.0.0.1:11434', role, v, prompt_words)
-            message = f'{k}\n{response_context}\n'
-            # Send the message
-            bot.send_message(message)
-
-        # K = KIMI()
-        # response_context = K.call_kimi(prompt_words)
-        # print(response_context)
-
-        # D = DeepSeek()
-        # for k, v in url_to_text.items():
-        #     response_context = D.call_deepseek(v, prompt_words)
+        # Prepare the texts to send
+        process_send = self.process_data(result_text, prompt_words, role)
 
-        #     # Save each string as a message to send
-        #     message = f'{k}\n{response_context}\n'
-        #     print(message)
+        # Send the messages
+        for process_text in process_send:
+            bot.send_message(process_text)
 
 
 if __name__ == "__main__":
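For reference, the non-streaming chat call that the new call_ollama wrapper makes can be exercised on its own roughly like this; the host and model are the defaults hard-coded in the diff, and the role/message strings are placeholders:

# Minimal sketch of the chat call wrapped by call_ollama; the host, model and
# prompt strings are assumptions/placeholders, adjust them to your setup.
from ollama import Client

client = Client(host='http://192.168.31.28:11434')
response = client.chat(
    model='llava:13b',
    messages=[
        {'role': 'system', 'content': 'You are a news summarizer.'},       # placeholder system role
        {'role': 'user', 'content': 'Some page text...\n\nSummarize it.'}  # page text + prompt_words
    ],
    options={'temperature': 0.4},
    stream=False,
)
print(response['message']['content'])

With text_batch left at 0, process_data concatenates every fetched page into one prompt before making a single call like this; setting it truthy summarizes each URL separately and sends one Matrix message per page.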