Files overview

ai_news: add screenshot feature

jack committed 11 months ago
commit 461dad81f1
2 files changed, 47 insertions(+), 12 deletions(-)
  1. .gitignore (+1, -1)
  2. ai_news/ai_news.py (+46, -11)

+ 1 - 1
.gitignore

@@ -61,4 +61,4 @@ docs/_build/
 target/
 
 other/split_clash_config/split_config
-ai_news/save_txt
+ai_news/save_data

+ 46 - 11
ai_news/ai_news.py

@@ -2,8 +2,10 @@
 import os
 import re
 import json
+import uuid
 import httpx
 import asyncio
+import datetime
 import time
 from bs4 import BeautifulSoup
 from ollama import Client as oClient
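
The two new imports serve the screenshot feature added below: datetime names the per-run save folder and uuid de-duplicates screenshot file names. A quick illustration of the two expressions the rest of the diff relies on (the example values are made up):

    import uuid
    import datetime

    # Folder name for the current run, e.g. '2024_06_01_12_30_00'
    print(datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S"))
    # Short random suffix for screenshot files, e.g. 'a1b2c3'
    print(uuid.uuid4().hex[:6])
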
@@ -44,7 +46,7 @@ class MatrixBot:
         self.password = password
         self.client = MatrixClient("https://matrix.erhe.top")
         self.token = self.login()
-        self.to = "!CgWvWEnLbKYvhXLvil:chat.abeginner.cn"
+        self.to = key
 
     def login(self):
         self.token = self.client.login(username=self.user, password=self.password)
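
After this change the destination room comes from the caller instead of a hard-coded room ID. A hypothetical construction, assuming the (user, password, key) constructor signature implied by the assignments above; the credentials and room ID here are illustrative:

    # 'key' is the Matrix room ID, now supplied by the caller
    bot = MatrixBot(user="newsbot", password="s3cret",
                    key="!someRoomId:chat.example.org")
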
@@ -92,17 +94,29 @@ class AINEWS:
 
             print(f"Created {config_path} with default configuration.")
 
-    def save_to_txt(self, url_to_text):
-        # Save the scraped news to a txt file
+    def mkdir_save_data(self):
+        # Get the directory of the current script
         current_file_path = os.path.dirname(__file__)
-        save_file_path = os.path.join(current_file_path, 'save_txt')
+        # Build the path of the save_data folder
+        save_file_path = os.path.join(current_file_path, 'save_data')
+        # Create the save_data folder if it does not exist
         if not os.path.exists(save_file_path):
             os.makedirs(save_file_path)
-        file = os.path.join(save_file_path, str(int(time.time())) + '.txt')
+
+        # Inside save_data, create a subfolder named after the current date and time
+        datetime_file_name = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
+        datetime_file_path = os.path.join(save_file_path, datetime_file_name)
+        if not os.path.exists(datetime_file_path):
+            os.makedirs(datetime_file_path)
+
+        return datetime_file_path
+
+    def save_to_txt(self, url_to_text, datetime_file_path):
+        # Save the scraped news to a txt file
+        file = os.path.join(datetime_file_path, 'all_page_data.txt')
         with open(file, 'w', encoding='utf-8') as file:
             file.write(str(url_to_text))
 
-
     def load_config(self, key):
         # Read the config.json configuration file
         config = {}
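
mkdir_save_data gives each run its own timestamped folder under save_data, and save_to_txt now writes into that folder instead of the old flat save_txt directory. A minimal standalone sketch of the same behavior (the folder name depends on when you run it):

    import os
    import datetime

    def mkdir_save_data(base_dir):
        # save_data/<YYYY_MM_DD_HH_MM_SS> under the script directory
        run_dir = os.path.join(base_dir, 'save_data',
                               datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S"))
        # exist_ok=True folds the exists-check and makedirs into a single call
        os.makedirs(run_dir, exist_ok=True)
        return run_dir

    print(mkdir_save_data(os.path.dirname(os.path.abspath(__file__))))
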
@@ -150,7 +164,7 @@ class AINEWS:
 
             return url_to_text
 
-    async def get_htmls_with_browser(self, urls):
+    async def get_htmls_with_browser(self, urls, datetime_file_path):
         # Fetch the HTML content with Playwright
         url_to_text = {}
 
@@ -171,6 +185,9 @@ class AINEWS:
                     # Scroll the page to load dynamic content
                     await self.scroll_to_percentage(page)
 
+                    # Take a screenshot while we are here
+                    await self.screenshot(page, datetime_file_path)
+
                     # Get the rendered HTML
                     html = await page.content()
                     # Parse the HTML content with BeautifulSoup
@@ -215,6 +232,23 @@ class AINEWS:
             await asyncio.sleep(0.5)  # use an asynchronous sleep
         await page.evaluate("window.scrollTo({top: 0, behavior: 'smooth'})")
 
+    @staticmethod
+    async def screenshot(page, datetime_file_path):
+        # Take a screenshot in passing
+        # Get the page title
+        title = await page.title()
+        # Strip characters that are illegal in file names
+        cleaned_title = re.sub(r'[\\/:*?"<>|]', '', title)
+        # Fall back to a default name if the title is empty
+        if not cleaned_title:
+            cleaned_title = "untitled"
+
+        # Build the screenshot file path
+        screenshot_path = os.path.join(datetime_file_path, f"{cleaned_title}_{uuid.uuid4().hex[:6]}.png")
+        # Take a full-page screenshot
+        await page.screenshot(path=screenshot_path, full_page=True)
+        print(f"Screenshot saved to: {screenshot_path}")
+
     def process_data(self, result_text, prompt_words, role, ai_host):
         # Organize the fetched data and return the payload ready to send
         process_send = []
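
The new screenshot helper only needs a Playwright page object and a target folder, so it can be exercised on its own. A self-contained sketch using Playwright's async API; the URL and output directory are placeholders, and demo is a hypothetical driver rather than part of this commit:

    import re
    import uuid
    import asyncio
    from playwright.async_api import async_playwright

    async def demo(url, out_dir):
        async with async_playwright() as p:
            browser = await p.chromium.launch()
            page = await browser.new_page()
            await page.goto(url)
            # Same sanitization as screenshot(): strip characters illegal in file names
            title = re.sub(r'[\\/:*?"<>|]', '', await page.title()) or "untitled"
            # The uuid suffix keeps pages with identical titles from clobbering each other
            path = f"{out_dir}/{title}_{uuid.uuid4().hex[:6]}.png"
            await page.screenshot(path=path, full_page=True)
            await browser.close()

    asyncio.run(demo("https://example.com", "."))
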
@@ -234,17 +268,17 @@ class AINEWS:
                 process_send.append(response_context)
         return process_send
 
-    def main(self, target_url_list, prompt_words, role, use_browser, ai_host, message_bot_key):
+    def main(self, datetime_file_path, target_url_list, prompt_words, role, use_browser, ai_host, message_bot_key):
         # Fetch the HTML content of all the pages
         if use_browser:
-            result_text = asyncio.run(self.get_htmls_with_browser(target_url_list))
+            result_text = asyncio.run(self.get_htmls_with_browser(target_url_list, datetime_file_path))
         else:
             result_text = asyncio.run(self.get_htmls(target_url_list))
 
         # Save the text
         if result_text:
             print(f'Fetched data from {len(result_text)} URLs')
-            self.save_to_txt(result_text)
+            self.save_to_txt(result_text, datetime_file_path)
         else:
             print('No data, exiting')
             exit(0)
@@ -263,9 +297,10 @@ class AINEWS:
 if __name__ == "__main__":
     ainews = AINEWS()
     ainews.create_config_if_not_exists()
+    datetime_file_path = ainews.mkdir_save_data()
 
     for key in key_list:
         target_url_list, prompt_words, role, use_browser, ai_host, message_bot_key = ainews.load_config(key)
         print(f'Keyword {key} has {len(target_url_list)} URLs')
-        ainews.main(target_url_list, prompt_words, role, use_browser, ai_host, message_bot_key)
+        ainews.main(datetime_file_path, target_url_list, prompt_words, role, use_browser, ai_host, message_bot_key)
     print('done!')
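
Because datetime_file_path is created once before the key loop, the screenshots and the text dump for every keyword land in the same run folder. An illustrative layout after one run (file names are hypothetical):

    save_data/
        2024_06_01_12_30_00/
            all_page_data.txt
            Some Page Title_a1b2c3.png
            Another Page Title_d4e5f6.png

Note that save_to_txt always writes the fixed name all_page_data.txt in 'w' mode, so with several keys in key_list each main() call overwrites the previous keyword's dump, while the screenshots accumulate thanks to their uuid suffixes.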