jack 11 月之前
父節點
當前提交
1e9cd82476
共有 2 個文件被更改,包括 63 次插入和 16 次刪除
  1. 56 14
      ai_news/ai_news.py
  2. 7 2
      ai_news/config.json

+ 56 - 14
ai_news/ai_news.py

@@ -6,7 +6,6 @@ import uuid
 import httpx
 import asyncio
 import datetime
-import time
 from bs4 import BeautifulSoup
 from ollama import Client as oClient
 from playwright.async_api import async_playwright
@@ -129,7 +128,8 @@ class AINEWS:
             exit(0)
 
         k = config[key]
-        return k['target_url_list'], k['prompt_words'], k['role'], k['use_browser'], k['ai_host'], k['message_bot_key']
+        return k
+        # return k['target_url_list'], k['prompt_words'], k['role'], k['use_browser'], k['ai_host'], k['message_bot_key']
 
     async def get_htmls(self, urls):
         # 获取 HTML
@@ -182,6 +182,12 @@ class AINEWS:
                     # 导航到指定网址
                     await page.goto(url)
 
+                    # 禁止弹框
+                    await self.disable_dialogs(page)
+
+                    # 调用 disable_images 方法阻止图片加载并隐藏图片
+                    await self.disable_images(page)
+
                     # 滚动页面以加载动态内容
                     await self.scroll_to_percentage(page)
 
@@ -249,6 +255,32 @@ class AINEWS:
         await page.screenshot(path=screenshot_path, full_page=True)
         print(f"截图已保存到: {screenshot_path}")
 
+    async def disable_images(self, page):
+        # Run JavaScript in the page that, for every <img> currently in the DOM, clears its src, removes its srcset, and hides it (display: none).
+        await page.evaluate('''() => {
+            function disableImages() {
+                // 阻止所有图片加载
+                document.querySelectorAll('img').forEach(img => {
+                    img.src = ''; // 清空 src 属性
+                    img.removeAttribute('srcset'); // 移除 srcset 属性(如果有)
+                });
+
+                // 隐藏所有图片
+                document.querySelectorAll('img').forEach(img => {
+                    img.style.display = 'none';
+                });
+            }
+            disableImages(); // 调用函数
+        }''')
+
+    async def disable_dialogs(self, page):
+        # Replace window.alert / window.confirm / window.prompt in the page with stubs (alert does nothing, confirm returns true, prompt returns null).
+        await page.evaluate('''() => {
+            window.alert = () => {};
+            window.confirm = () => true; // confirm 默认返回 true
+            window.prompt = () => null;  // prompt 默认返回 null
+        }''')
+
     def process_data(self, result_text, prompt_words, role, ai_host):
         # 整理获取的数据, 返回准备发送的数据
         process_send = []
@@ -268,7 +300,15 @@ class AINEWS:
                 process_send.append(response_context)
         return process_send
 
-    def main(self, datetime_file_path, target_url_list, prompt_words, role, use_browser, ai_host, message_bot_key):
+    def main(self, config):
+        target_url_list = config['target_url_list']
+        prompt_words = config['prompt_words']
+        role = config['role']
+        use_browser = config['use_browser']
+        ai_host = config['ai_host']
+        message_bot_key = config['message_bot_key']
+        use_ai = config['use_ai']
+
         # 获取所有的网页html内容
         if use_browser:
             result_text = asyncio.run(self.get_htmls_with_browser(target_url_list, datetime_file_path))
@@ -283,15 +323,15 @@ class AINEWS:
             print('无数据, 程序退出')
             exit(0)
 
-        # # 如果只需要保存爬取数据, 不使用 AI, 注释下面
-        # # 处理发送 text 数据
-        # process_send = self.process_data(result_text, prompt_words, role, ai_host)
-        #
-        # # 创建消息bot实例
-        # bot = MatrixBot('message-bot', 'aaaAAA111!!!', message_bot_key)
-        # # 发送消息
-        # for process_text in process_send:
-        #     bot.send_message(process_text)
+        if use_ai:
+            # 处理发送 text 数据
+            process_send = self.process_data(result_text, prompt_words, role, ai_host)
+
+            # 创建消息bot实例
+            bot = MatrixBot('message-bot', 'aaaAAA111!!!', message_bot_key)
+            # 发送消息
+            for process_text in process_send:
+                bot.send_message(process_text)
 
 
 if __name__ == "__main__":
@@ -300,7 +340,9 @@ if __name__ == "__main__":
     datetime_file_path = ainews.mkdir_save_data()
 
     for key in key_list:
-        target_url_list, prompt_words, role, use_browser, ai_host, message_bot_key = ainews.load_config(key)
+        config = ainews.load_config(key)
+        target_url_list = config['target_url_list']
+
         print(f'关键词 {key} 共有 {len(target_url_list)} 个网址')
-        ainews.main(datetime_file_path, target_url_list, prompt_words, role, use_browser, ai_host, message_bot_key)
+        ainews.main(config)
     print('done!')

+ 7 - 2
ai_news/config.json

@@ -3,6 +3,7 @@
     "use_browser": 0,
     "ai_host": "http://127.0.0.1:11434",
     "message_bot_key": "",
+    "use_ai": 1,
     "target_url_list": [],
     "prompt_words": "",
     "role": ""
@@ -11,6 +12,7 @@
     "use_browser": 1,
     "ai_host": "http://home.erhe.link:36001",
     "message_bot_key": "!CgWvWEnLbKYvhXLvil:chat.abeginner.cn",
+    "use_ai": 1,
     "target_url_list": [
       "https://wublock123.com",
       "https://m.odaily.news/newsflash",
@@ -20,7 +22,9 @@
       "https://www.jinse.cn/lives",
       "https://www.techflowpost.com/article/index.html",
       "https://foresightnews.pro/news",
-      "https://www.web3sj.com/news/"
+      "https://www.web3sj.com/news/",
+      "https://www.techflowpost.com/newsletter/index.html",
+      "https://www.theblockbeats.info/newsflash?type=2"
     ],
     "role": "你是一个新闻报播员, 负责理解和报播新闻, 请用中文回答",
     "prompt_words": "这是最近的数字币相关的新闻, 你现在是一个新闻报播员, 帮我整理并总结以上新闻, 分析一下数字币接下来大概得走势, 再在此基础上, 给每一条新闻做一个情感分析, 最后输出一下总得分"
@@ -28,7 +32,8 @@
   "A": {
     "use_browser": 1,
     "ai_host": "http://home.erhe.link:36001",
-    "message_bot_key": "",
+    "message_bot_key": "!CgWvWEnLbKYvhXLvil:chat.abeginner.cn",
+    "use_ai": 1,
     "target_url_list": [
       "https://www.eastmoney.com/",
       "https://www.jrj.com.cn/",