@@ -2,8 +2,10 @@
 import os
 import re
 import json
+import uuid
 import httpx
 import asyncio
+import datetime
 import time
 from bs4 import BeautifulSoup
 from ollama import Client as oClient
@@ -44,7 +46,7 @@ class MatrixBot:
         self.password = password
         self.client = MatrixClient("https://matrix.erhe.top")
         self.token = self.login()
-        self.to = "!CgWvWEnLbKYvhXLvil:chat.abeginner.cn"
+        self.to = key

     def login(self):
         self.token = self.client.login(username=self.user, password=self.password)
@@ -92,17 +94,29 @@ class AINEWS:

             print(f"Created {config_path} with default configuration.")

-    def save_to_txt(self, url_to_text):
-        # Save the scraped news to a txt file
+    def mkdir_save_data(self):
+        # Get the directory containing the current script
         current_file_path = os.path.dirname(__file__)
-        save_file_path = os.path.join(current_file_path, 'save_txt')
+        # Build the path of the save_data folder
+        save_file_path = os.path.join(current_file_path, 'save_data')
+        # Create the save_data folder if it does not exist
         if not os.path.exists(save_file_path):
             os.makedirs(save_file_path)
-        file = os.path.join(save_file_path, str(int(time.time())) + '.txt')
+
+        # Inside save_data, create a subfolder named after the current date and time
+        datetime_file_name = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
+        datetime_file_path = os.path.join(save_file_path, datetime_file_name)
+        if not os.path.exists(datetime_file_path):
+            os.makedirs(datetime_file_path)
+
+        return datetime_file_path
+
+    def save_to_txt(self, url_to_text, datetime_file_path):
+        # Save the scraped news to a txt file
+        file = os.path.join(datetime_file_path, 'all_page_data.txt')
         with open(file, 'w', encoding='utf-8') as file:
             file.write(str(url_to_text))

-
     def load_config(self, key):
         # Read the config.json configuration file
         config = {}
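
A minimal usage sketch of the new save flow, assuming the `ainews` instance created in the __main__ block below; the timestamp in the comments is illustrative:

    run_dir = ainews.mkdir_save_data()
    # run_dir -> <script dir>/save_data/2024_05_01_09_30_00
    ainews.save_to_txt({'https://example.com': 'page text'}, run_dir)
    # writes <script dir>/save_data/2024_05_01_09_30_00/all_page_data.txt
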
@@ -150,7 +164,7 @@ class AINEWS:

         return url_to_text

-    async def get_htmls_with_browser(self, urls):
+    async def get_htmls_with_browser(self, urls, datetime_file_path):
         # Fetch the HTML content with Playwright
         url_to_text = {}

@@ -171,6 +185,9 @@ class AINEWS:
                 # Scroll the page to load dynamic content
                 await self.scroll_to_percentage(page)

+                # Take a screenshot while we're at it
+                await self.screenshot(page, datetime_file_path)
+
                 # Get the rendered HTML
                 html = await page.content()
                 # Parse the HTML content with BeautifulSoup
@@ -215,6 +232,23 @@ class AINEWS:
         await asyncio.sleep(0.5)  # use an async sleep
         await page.evaluate("window.scrollTo({top: 0, behavior: 'smooth'})")

+    @staticmethod
+    async def screenshot(page, datetime_file_path):
+        # Take a screenshot while we're at it
+        # Get the page title
+        title = await page.title()
+        # Strip characters that are illegal in file names
+        cleaned_title = re.sub(r'[\\/:*?"<>|]', '', title)
+        # Use a default name if the title is empty
+        if not cleaned_title:
+            cleaned_title = "untitled"
+
+        # Build the screenshot file path
+        screenshot_path = os.path.join(datetime_file_path, f"{cleaned_title}_{uuid.uuid4().hex[:6]}.png")
+        # Take a full-page screenshot
+        await page.screenshot(path=screenshot_path, full_page=True)
+        print(f"Screenshot saved to: {screenshot_path}")
+
     def process_data(self, result_text, prompt_words, role, ai_host):
         # Organize the fetched data and return the payloads to send
         process_send = []
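
Here `full_page=True` makes Playwright capture the whole scrollable page rather than just the viewport, and the `uuid.uuid4().hex[:6]` suffix keeps two pages that share a title from overwriting each other's screenshot. A small sketch of the file name this produces (title and hex digits are illustrative):

    import re
    import uuid

    title = 'AI News: Today/Now?'
    # Same sanitization and empty-title fallback as the patch
    cleaned_title = re.sub(r'[\\/:*?"<>|]', '', title) or 'untitled'
    print(f'{cleaned_title}_{uuid.uuid4().hex[:6]}.png')  # e.g. AI News TodayNow_3f9c1a.png
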
@@ -234,17 +268,17 @@ class AINEWS:
             process_send.append(response_context)
         return process_send

-    def main(self, target_url_list, prompt_words, role, use_browser, ai_host, message_bot_key):
+    def main(self, datetime_file_path, target_url_list, prompt_words, role, use_browser, ai_host, message_bot_key):
         # Fetch the HTML content of all pages
         if use_browser:
-            result_text = asyncio.run(self.get_htmls_with_browser(target_url_list))
+            result_text = asyncio.run(self.get_htmls_with_browser(target_url_list, datetime_file_path))
         else:
             result_text = asyncio.run(self.get_htmls(target_url_list))

         # Save the text
         if result_text:
             print(f'Fetched data for {len(result_text)} URLs')
-            self.save_to_txt(result_text)
+            self.save_to_txt(result_text, datetime_file_path)
         else:
             print('No data, exiting')
             exit(0)
@@ -263,9 +297,10 @@ class AINEWS:
 if __name__ == "__main__":
     ainews = AINEWS()
     ainews.create_config_if_not_exists()
+    datetime_file_path = ainews.mkdir_save_data()

     for key in key_list:
         target_url_list, prompt_words, role, use_browser, ai_host, message_bot_key = ainews.load_config(key)
         print(f'Keyword {key} has {len(target_url_list)} URLs')
-        ainews.main(target_url_list, prompt_words, role, use_browser, ai_host, message_bot_key)
+        ainews.main(datetime_file_path, target_url_list, prompt_words, role, use_browser, ai_host, message_bot_key)
 print('done!')
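
Note that `mkdir_save_data()` runs once, before the keyword loop, so all keywords share a single timestamped run folder, and each iteration's `save_to_txt()` call rewrites the same `all_page_data.txt`. If one dump per keyword is wanted, a variant (hypothetical, not in this patch) could thread the key into the file name:

    def save_to_txt(self, url_to_text, datetime_file_path, key):
        # One txt file per keyword instead of one shared dump per run
        file = os.path.join(datetime_file_path, f'{key}_page_data.txt')
        with open(file, 'w', encoding='utf-8') as f:
            f.write(str(url_to_text))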