|
|
@@ -1,17 +1,53 @@
|
|
|
# -*- coding: utf-8 -*-
|
|
|
+import os
|
|
|
import re
|
|
|
+import json
|
|
|
from playwright.async_api import async_playwright
|
|
|
import asyncio
|
|
|
from bs4 import BeautifulSoup
|
|
|
-
|
|
|
from api_ollama import *
|
|
|
from api_kimi import *
|
|
|
from api_deepseek import *
|
|
|
-
|
|
|
from send_to_email import *
|
|
|
|
|
|
+key = 'web3'
|
|
|
+
|
|
|
|
|
|
class AINEWS:
|
|
|
def save_to_txt(self, text):
    """Persist *text* to a timestamped UTF-8 .txt file under ./save_txt next to this module.

    Args:
        text: String content to write.
    """
    import time  # NOTE(review): `time` was used here but never imported at module level
    base_dir = os.path.dirname(__file__)
    save_dir = os.path.join(base_dir, 'save_txt')
    # exist_ok avoids the check-then-create race of the original exists()/makedirs() pair.
    os.makedirs(save_dir, exist_ok=True)
    # Unix-timestamp filename keeps saves unique and chronologically sortable.
    path = os.path.join(save_dir, str(int(time.time())) + '.txt')
    # Bug fix: the original reused the name `file` for both the path and the
    # open handle (`with open(file, ...) as file:`), clobbering the path variable.
    with open(path, 'w', encoding='utf-8') as fh:
        fh.write(text)
|
|
|
+
|
|
|
def load_config(self, key):
    """Load the scraping settings for *key* from ./config.json.

    Args:
        key: Top-level section name in config.json (e.g. 'web3').

    Returns:
        tuple: (target_url_list, prompt_words) taken from that section.

    Raises:
        SystemExit: when config.json is missing or empty.
        KeyError: when *key* (or a required field) is absent from the config.

    NOTE(review): the path is resolved against the current working directory,
    not the module directory — confirm that is intended.
    """
    config = {}
    if os.path.exists('config.json'):
        with open('config.json', 'r', encoding='utf-8') as f:
            config = json.load(f)

    if not config:
        # Fixed message grammar, and raise SystemExit(1) explicitly: the
        # original used the site-injected `exit()` helper, which exits with
        # status 0 even though this is an error path.
        print('config.json does not exist!')
        raise SystemExit(1)

    section = config[key]
    return section['target_url_list'], section['prompt_words']
|
|
|
+
|
|
|
@staticmethod
async def scroll_to_percentage(page):
    """Progressively scroll *page* to trigger lazy-loaded content, then return to the top.

    Args:
        page: A Playwright async Page that has finished navigating.
    """
    # 5%, 7%, ..., 99%, then 100%. Bug fix: the original range(5, 101, 2)
    # stops at 99, so the true bottom of the page was never reached and
    # content lazy-loaded at the very end could be missed.
    for percentage in list(range(5, 100, 2)) + [100]:
        # Recompute the height on every step — lazy loading can grow the page.
        height = await page.evaluate("() => document.body.scrollHeight")
        scroll_position = height * (percentage / 100)
        # Smooth-scroll to the computed offset.
        await page.evaluate(f"window.scrollTo({{top: {scroll_position}, behavior: 'smooth'}})")
        await asyncio.sleep(0.5)  # async sleep keeps the event loop responsive
    # Return to the top so subsequent extraction starts from a known state.
    await page.evaluate("window.scrollTo({top: 0, behavior: 'smooth'})")
|
|
|
+
|
|
|
async def get_htmls(self, urls):
|
|
|
htmls = []
|
|
|
async with async_playwright() as p:
|
|
|
@@ -27,6 +63,10 @@ class AINEWS:
|
|
|
page = await context.new_page()
|
|
|
# 导航到指定网址
|
|
|
await page.goto(url)
|
|
|
+
|
|
|
+ # 滚动页面, 获取更多信息
|
|
|
+ await self.scroll_to_percentage(page)
|
|
|
+
|
|
|
# 获取渲染后的 HTML
|
|
|
html = await page.content()
|
|
|
# 关闭页面
|
|
|
@@ -58,17 +98,11 @@ class AINEWS:
|
|
|
|
|
|
return text
|
|
|
|
|
|
- def main(self):
|
|
|
- urls = ["https://www.smzdm.com/jingxuan/", "https://faxian.smzdm.com/"]
|
|
|
- text = asyncio.run(self.get_htmls(urls))
|
|
|
+ def main(self, target_url_list, prompt_words):
|
|
|
+ text = asyncio.run(self.get_htmls(target_url_list))
|
|
|
|
|
|
- # print(text)
|
|
|
+ self.save_to_txt(text)
|
|
|
|
|
|
- prompt_words = '''
|
|
|
- 给你几个个网页的源代码, 里面是未清洗的网页源代码
|
|
|
- 你可以无视网页源代码的部分,关注内容就行,重复的话就不用说了
|
|
|
- 帮我总结一下内容, 请用中文回答
|
|
|
- '''
|
|
|
prompt_words += text
|
|
|
|
|
|
# C = ChatBot('http://erhe.top:27381', prompt_words, 'qwen2.5:3b')
|
|
|
@@ -79,11 +113,13 @@ class AINEWS:
|
|
|
# response_context = K.call_kimi(prompt_words)
|
|
|
# print(response_context)
|
|
|
|
|
|
- D = DeepSeek()
|
|
|
- response_context = D.call_deepseek(prompt_words)
|
|
|
- print(response_context)
|
|
|
+ # D = DeepSeek()
|
|
|
+ # response_context = D.call_deepseek(prompt_words)
|
|
|
+ # print(response_context)
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: read the configured section, run the scrape
    # pipeline, and report completion.
    app = AINEWS()
    urls, prompt = app.load_config(key)
    app.main(urls, prompt)
    print('done!')
|