jack 11 months ago
parent
commit
2f65213de0

+ 1 - 0
ai_news/Readme.md

@@ -0,0 +1 @@
+pip install httpx beautifulsoup4 ollama matrix-client playwright && playwright install chromium

+ 265 - 0
ai_news/ai_news.py

@@ -0,0 +1,265 @@
+# -*- coding: utf-8 -*-
+import os
+import re
+import json
+import httpx
+import asyncio
+import time
+from bs4 import BeautifulSoup
+from ollama import Client as oClient
+from playwright.async_api import async_playwright
+from matrix_client.client import MatrixClient
+from matrix_client.api import MatrixHttpApi
+
+key_list = ['web3']  # config.json sections to process on each run
+text_batch = 0  # 1: summarize each page separately; 0: concatenate all pages into a single prompt
+
+
+class OllamaChat(object):
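+    # Thin wrapper around a local Ollama server: sends the scraped text plus prompt words and returns the model's reply as a string.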
+    def __init__(self, host='http://192.168.31.28:11434'):
+        self.host = host
+
+    def call_ollama(self, role, text, prompt_words, model='llava:13b', temperature=0.4):
+        # Call a model hosted on the Ollama server
+        message = text + '\n\n' + prompt_words
+        print(f'use model: {model}')
+        try:
+            response_iter = oClient(host=self.host).chat(model=model,
+                                                         messages=[
+                                                             {'role': 'system', 'content': role},
+                                                             {'role': 'user', 'content': message}
+                                                         ],
+                                                         options={"temperature": temperature},
+                                                         stream=False)
+            return response_iter['message']['content']
+        except Exception as e:
+            print(f"\n发生错误: {e}")
+            return None
+
+
+class MatrixBot:
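+    # Minimal Matrix bot: logs in when constructed and posts messages to a fixed room.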
+    def __init__(self, user, password):
+        self.base_url = "https://matrix.erhe.top"
+        self.user = user
+        self.password = password
+        self.client = MatrixClient(self.base_url)
+        self.token = self.login()
+        self.to = "!CgWvWEnLbKYvhXLvil:chat.abeginner.cn"
+
+    def login(self):
+        self.token = self.client.login(username=self.user, password=self.password)
+        return self.token
+
+    def send_message(self, message):
+        if self.token:
+            try:
+                api = MatrixHttpApi(self.base_url, token=self.token)
+                api.send_message(self.to, message)
+            except Exception as e:
+                print(e)
+                api = MatrixHttpApi(self.base_url, token=self.token)
+                api.send_message(self.to, str(e))
+
+        else:
+            print("Bot is not logged in. Please login first.")
+
+
+class AINEWS:
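+    # Pipeline: load a config section, scrape the target pages (httpx or Playwright), save the raw text, then optionally summarize with Ollama and post via Matrix.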
+    def create_config_if_not_exists(self):
+        # Create config.json next to this script and write an example section if it does not exist
+        current_dir = os.path.dirname(os.path.abspath(__file__))  # directory of this file
+
+        # Build the full path to config.json
+        config_path = os.path.join(current_dir, 'config.json')
+
+        # Check whether config.json already exists
+        if not os.path.exists(config_path):
+            # If not, create it and write the default JSON data
+            default_config = {
+                "example": {
+                    "use_browser": 0,
+                    "ai_host": 'http://127.0.0.1:11434',
+                    "target_url_list": ['目标网站'],
+                    "role": "AI的角色, 例如: 你是一个聊天机器人",
+                    "prompt_words": "提示词: 帮我总结, 用中文回复"
+                }
+            }
+
+            # Write the default JSON data to config.json
+            with open(config_path, 'w', encoding='utf-8') as f:
+                json.dump(default_config, f, indent=4)
+
+            print(f"Created {config_path} with default configuration.")
+
+    def save_to_txt(self, url_to_text):
+        # Save the scraped news text to a timestamped .txt file under save_txt/
+        current_file_path = os.path.dirname(__file__)
+        save_file_path = os.path.join(current_file_path, 'save_txt')
+        if not os.path.exists(save_file_path):
+            os.makedirs(save_file_path)
+        file_path = os.path.join(save_file_path, str(int(time.time())) + '.txt')
+        with open(file_path, 'w', encoding='utf-8') as f:
+            f.write(str(url_to_text))
+
+
+    def load_config(self, key):
+        # Read config.json from the same directory as this script
+        config = {}
+        config_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'config.json')
+        if os.path.exists(config_path):
+            with open(config_path, 'r', encoding='utf-8') as f:
+                config = json.load(f)
+
+        if not config:
+            print('config.json does not exist!')
+            exit(1)
+
+        k = config[key]
+        return k['target_url_list'], k['prompt_words'], k['role'], k['use_browser'], k['ai_host']
+
+    async def get_htmls(self, urls):
+        # Fetch the HTML of every URL concurrently with httpx
+        async with httpx.AsyncClient() as client:
+            async def get_html(url):
+                try:
+                    print(f'Opening: {url}')
+                    # Send a GET request for the page content
+                    response = await client.get(url)
+                    response.raise_for_status()  # make sure the request succeeded
+
+                    # Parse the HTML with BeautifulSoup
+                    soup = BeautifulSoup(response.text, 'html.parser')
+
+                    # Extract the plain text
+                    text = soup.get_text(separator=' ', strip=True)
+
+                    # Collapse extra whitespace
+                    cleaned_text = re.sub(r'\s+', ' ', text).strip()
+
+                    return url, cleaned_text
+                except Exception as e:
+                    print(f"Error fetching {url}: {e}")
+                    return url, ""
+
+            # Fetch all sites concurrently with asyncio.gather
+            tasks = [get_html(url) for url in urls]
+            results = await asyncio.gather(*tasks)
+
+            # Store the results in a dict keyed by URL
+            url_to_text = {url: text for url, text in results}
+
+            return url_to_text
+
+    async def get_htmls_with_browser(self, urls):
+        # Fetch HTML with Playwright for sites that need JavaScript rendering
+        url_to_text = {}
+
+        async with async_playwright() as p:
+            # Launch a headless browser
+            browser = await p.chromium.launch(headless=True)
+            # Create a browser context
+            context = await browser.new_context()
+
+            async def get_html(url):
+                try:
+                    print(f'Opening: {url}')
+                    # Open a new page in the context
+                    page = await context.new_page()
+                    # Navigate to the target URL
+                    await page.goto(url)
+
+                    # Scroll the page to trigger lazily loaded content
+                    await self.scroll_to_percentage(page)
+
+                    # Grab the rendered HTML
+                    html = await page.content()
+                    # Parse the HTML with BeautifulSoup
+                    soup = BeautifulSoup(html, 'html.parser')
+                    # Extract the plain text
+                    text = soup.get_text(separator=' ', strip=True)
+                    # Collapse extra whitespace
+                    cleaned_text = re.sub(r'\s+', ' ', text).strip()
+                    # Close the page
+                    await page.close()
+                    return url, cleaned_text
+                except Exception as e:
+                    print(f"Error fetching {url}: {e}")
+                    return url, ""
+
+            # Fetch all sites concurrently with asyncio.gather
+            tasks = [get_html(url) for url in urls]
+            results = await asyncio.gather(*tasks)
+
+            # Store the results in a dict keyed by URL
+            url_to_text = {url: text for url, text in results}
+
+            # Close the context and the browser
+            await context.close()
+            await browser.close()
+
+        return url_to_text
+
+    @staticmethod
+    async def scroll_to_percentage(page):
+        # Print the page title before scrolling
+        title = await page.title()
+        print(f'Scrolling browser page: {title}')
+
+        percentage_list = [i for i in range(5, 101, 2)]
+        for percentage in percentage_list:
+            # Compute the scroll offset for the given percentage of the page height
+            height = await page.evaluate("() => document.body.scrollHeight")
+            scroll_position = height * (percentage / 100)
+            # Jump to that position
+            await page.evaluate(f"window.scrollTo({{top: {scroll_position}, behavior: 'smooth'}})")
+            await asyncio.sleep(0.5)  # async sleep so other fetches can proceed
+        await page.evaluate("window.scrollTo({top: 0, behavior: 'smooth'})")
+
+    def process_data(self, result_text, prompt_words, role, ai_host):
+        # Organize the fetched data and return the messages ready to send
+        process_send = []
+        O = OllamaChat(ai_host)
+        if text_batch:
+            for k, v in result_text.items():
+                response_context = O.call_ollama(role, v, prompt_words)
+                if response_context:
+                    message = f'{k}\n{response_context}\n'
+                    process_send.append(message)
+        else:
+            t = ''
+            for k, v in result_text.items():
+                t += f'{k}\n{v}\n'
+            response_context = O.call_ollama(role, t, prompt_words)
+            if response_context:
+                process_send.append(response_context)
+        return process_send
+
+    def main(self, target_url_list, prompt_words, role, use_browser, ai_host):
+        # Fetch the HTML content of every target page
+        if use_browser:
+            result_text = asyncio.run(self.get_htmls_with_browser(target_url_list))
+        else:
+            result_text = asyncio.run(self.get_htmls(target_url_list))
+
+        # Save the raw text to disk
+        self.save_to_txt(result_text)
+
+        # # If you only need to save the scraped data and skip the AI, comment out the block below
+        # # Create the message bot instance
+        # bot = MatrixBot('message-bot', 'aaaAAA111!!!')
+        #
+        # # Process the text data to send
+        # process_send = self.process_data(result_text, prompt_words, role, ai_host)
+        #
+        # # Send the messages
+        # for process_text in process_send:
+        #     bot.send_message(process_text)
+
+
+if __name__ == "__main__":
+    ainews = AINEWS()
+    ainews.create_config_if_not_exists()
+
+    for key in key_list:
+        target_url_list, prompt_words, role, use_browser, ai_host = ainews.load_config(key)
+        ainews.main(target_url_list, prompt_words, role, use_browser, ai_host)
+    print('done!')

+ 49 - 0
ai_news/config.json

@@ -0,0 +1,49 @@
+{
+  "example": {
+    "use_browser": 0,
+    "ai_host": "http://127.0.0.1:11434",
+    "target_url_list": [],
+    "prompt_words": "",
+    "role": ""
+  },
+  "web3": {
+    "use_browser": 0,
+    "ai_host": "http://home.erhe.link:36001",
+    "target_url_list": [
+      "https://wublock123.com",
+      "https://m.odaily.news/newsflash",
+      "https://www.chaincatcher.com/news",
+      "https://www.panewslab.com/",
+      "https://www.theblockbeats.info/newsflash",
+      "https://www.jinse.cn/lives",
+      "https://www.techflowpost.com/article/index.html",
+      "https://foresightnews.pro/news",
+      "https://www.web3sj.com/news/"
+    ],
+    "role": "你是一个新闻报播员, 负责理解和报播新闻, 请用中文回答",
+    "prompt_words": "这是最近的数字币相关的新闻, 你现在是一个新闻报播员, 帮我整理并总结以上新闻, 分析一下数字币接下来大概得走势, 再在此基础上, 给每一条新闻做一个情感分析, 最后输出一下总得分"
+  },
+  "A": {
+    "use_browser": 1,
+    "ai_host": "http://home.erhe.link:36001",
+    "target_url_list": [
+      "https://www.eastmoney.com/",
+      "https://www.jrj.com.cn/",
+      "https://www.10jqka.com.cn/",
+      "https://www.stcn.com/",
+      "https://www.cs.com.cn/",
+      "https://www.cnstock.com/",
+      "https://www.cls.cn/",
+      "https://www.yicai.com/",
+      "https://www.hexun.com/",
+      "https://www.wind.com.cn/",
+      "https://www.sse.com.cn/",
+      "https://www.szse.cn/",
+      "https://www.xueqiu.com/",
+      "https://www.jiemian.com/",
+      "https://www.caixin.com/"
+    ],
+    "prompt_words": "给你一个或多个网页的源代码, 里面是未清洗的网页源代码,你可以无视网页源代码的部分,关注内容就行,重复的话就不用说了,帮我总结一下这些网站的内容, 请用中文回答",
+    "role": ""
+  }
+}

+ 27 - 0
ai_news/requirements.txt

@@ -0,0 +1,27 @@
+annotated-types==0.7.0
+anyio==4.8.0
+beautifulsoup4==4.12.3
+certifi==2024.12.14
+charset-normalizer==3.4.1
+distro==1.9.0
+exceptiongroup==1.2.2
+gotify==0.6.0
+greenlet==3.1.1
+h11==0.14.0
+httpcore==1.0.7
+httpx==0.27.2
+idna==3.10
+jiter==0.8.2
+matrix-client==0.4.0
+ollama==0.4.5
+openai==1.59.6
+playwright==1.49.1
+pydantic==2.10.5
+pydantic_core==2.27.2
+pyee==12.0.0
+requests==2.32.3
+sniffio==1.3.1
+soupsieve==2.6
+tqdm==4.67.1
+typing_extensions==4.12.2
+urllib3==1.26.20

+ 7 - 7
message/message_coin_detail.py

@@ -64,7 +64,7 @@ def fetch_coin_data(target):
             # text += f'Diluted Market Value: {dilute}\n'
             # text += f'Logo: {logoUrl}\n'
 
-            return text + '\n'
+            return text
 
 
 def fetch_vix_data():
@@ -159,7 +159,7 @@ def main():
         for retry in range(1, retry_count + 1):
             result = fetch_coin_data(target)
             if result:
-                text += result + '\n\n'
+                text += result + '\n'
                 break
             else:
                 print(f"{target} Failed to fetch data. retry: {retry}")
@@ -181,17 +181,17 @@ def main():
     for retry in range(1, retry_count + 1):
         result = fetch_gas_data()
         if result:
-            text += result + '\n\n'
+            text += '\n' + result + '\n\n'
             break
         else:
             # print(f"Failed to fetch Gas data. retry: {retry}")
             if retry == retry_count:
                 text += f"Failed to fetch Gas data. retry count: {retry}"
 
-    # if text:
-    #     GotifyNotifier('Real-time coin price\n', text, 'AgfOJESqDKftBTQ').send_message()
-    # else:
-    #     print('No Data')
+    if text:
+        GotifyNotifier('Real-time coin price\n', text, 'AgfOJESqDKftBTQ').send_message()
+    else:
+        print('No Data')
 
 
 if __name__ == "__main__":

+ 38 - 0
utils/utils_call_ollama.py

@@ -0,0 +1,38 @@
+# -*- coding: utf-8 -*-
+import time
+from ollama import Client as oClient
+
+
+class OllamaChat(object):
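+    # Standalone Ollama chat helper: the host is passed per call instead of being stored on the instance.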
+    def call_ollama(self, host, role, text, prompt_words, model='llava:13b', temperature=0.4):
+        message = text + '\n\n' + prompt_words
+        print(f'use model: {model}')
+        try:
+            response_iter = oClient(host=host).chat(model=model,
+                                                    messages=[
+                                                        {'role': 'system', 'content': role},
+                                                        {'role': 'user', 'content': message}
+                                                    ],
+                                                    options={"temperature": temperature},
+                                                    stream=False)
+            return response_iter['message']['content']
+        except Exception as e:
+            print(f"\n发生错误: {e}")
+
+
+# if __name__ == "__main__":
+#     C = OllamaChat()
+#     start_time = time.time()
+#
+#     role = 'You are a chatbot'
+#
+#     text = 'hello'
+#
+#     prompt_words = 'Hello there'
+#
+#     response_context = C.call_ollama('http://192.168.31.28:11434', role, text, prompt_words, model='llava:13b')
+#     print(response_context)
+#
+#     end_time = time.time()
+#     run_time = end_time - start_time
+#     print(f"Run time: {run_time} seconds\n")

+ 10 - 10
utils/utils_send_matrix.py

@@ -11,7 +11,7 @@ class MatrixBot:
         self.password = password
         self.client = MatrixClient("https://matrix.erhe.top")
         self.token = self.login()
-        self.to = "!ddrrTpQmepfgivMxeW:chat.abeginner.cn"
+        self.to = "!CgWvWEnLbKYvhXLvil:chat.abeginner.cn"
 
     def login(self):
         self.token = self.client.login(username=self.user, password=self.password)
@@ -25,16 +25,16 @@ class MatrixBot:
             except Exception as e:
                 print(e)
                 api = MatrixHttpApi(self.base_url, token=self.token)
-                api.send_message(self.to, e)
+                api.send_message(self.to, str(e))
 
         else:
             print("Bot is not logged in. Please login first.")
 
-# if __name__ == '__main__':
-# 测试调用
-# user = "bot1"
-# pw = "aaaAAA111!!!"
-# message = "123987456"
-#
-# bot = MatrixBot(user, pw)
-# bot.send_message(message)
+if __name__ == '__main__':
+    # Test call
+    user = "message-bot"
+    pw = "aaaAAA111!!!"
+    message = "123987456"
+
+    bot = MatrixBot(user, pw)
+    bot.send_message(message)