@@ -0,0 +1,111 @@
+# -*- coding: utf-8 -*-
+import re
+from playwright.async_api import async_playwright
+import asyncio
+from bs4 import BeautifulSoup
+
+from api_ollama import *
+from api_kimi import *
+from api_deepseek import *
+
+from send_to_email import *
+
+
+class AINEWS:
+    async def get_htmls(self, urls):
+        async with async_playwright() as p:
+            # Launch the browser
+            browser = await p.chromium.launch(headless=True)
+            # Create a browser context
+            context = await browser.new_context()
+
+            async def get_html(url):
+                try:
+                    print(f'Opening: {url}')
+                    # Open a new page in the context
+                    page = await context.new_page()
+                    # Navigate to the target URL
+                    await page.goto(url)
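+                    # Note: goto() waits for the "load" event by default; if these
+                    # pages render content late, waiting for network idle is an
+                    # optional, untested tweak:
+                    # await page.goto(url, wait_until="networkidle")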
+                    # Get the rendered HTML
+                    html = await page.content()
+                    # Close the page
+                    await page.close()
+                    return html
+                except Exception as e:
+                    print(f"Error fetching {url}: {e}")
+                    return ""
+
+            # Fetch all pages concurrently with asyncio.gather
+            tasks = [get_html(url) for url in urls]
+            htmls_list = await asyncio.gather(*tasks)
+
+            # Strip markup from each document with BeautifulSoup
+            formatted_htmls = []
+            for html in htmls_list:
+                soup = BeautifulSoup(html, 'html.parser')
+                formatted_html = soup.get_text()
+                cleaned_text = re.sub(r'[\n\t\r]+', ' ', formatted_html)
+                cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
+                formatted_htmls.append(cleaned_text)
+
+            # Join the cleaned texts into a single string
+            text = "\n".join(formatted_htmls)
+
+            # Close the context and the browser
+            await context.close()
+            await browser.close()
+
+        return text
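+
+    # Sketch (not wired in): asyncio.gather above opens every URL at once,
+    # which can strain the browser on long lists. A semaphore-bounded variant
+    # caps how many pages are in flight; the limit of 5 in the usage note is
+    # an assumption, not a tuned value. Usage: sem = asyncio.Semaphore(5);
+    # tasks = [self.get_html_bounded(context, u, sem) for u in urls].
+    async def get_html_bounded(self, context, url, semaphore):
+        async with semaphore:  # at most N pages open at any moment
+            page = await context.new_page()
+            try:
+                await page.goto(url)
+                return await page.content()
+            finally:
+                await page.close()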
+
+    def main(self):
+        urls = ["https://www.smzdm.com/jingxuan/", "https://faxian.smzdm.com/"]
+        text = asyncio.run(self.get_htmls(urls))
+
+        # print(text)
+
+        prompt_words = '''
+        Here is the raw, uncleaned source of a few web pages.
+        Ignore the markup and focus on the content; skip anything repeated.
+        Please summarize the content for me, answering in Chinese.
+        '''
+        prompt_words += text
+
+        # C = ChatBot('http://erhe.top:27381', prompt_words, 'qwen2.5:3b')
+        # response_context = C.start_chat()
+        # print(response_context)
+
+        # K = KIMI()
+        # response_context = K.call_kimi(prompt_words)
+        # print(response_context)
+
+        D = DeepSeek()
+        response_context = D.call_deepseek(prompt_words)
+        print(response_context)
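+
+        # The send_to_email import above is never used; presumably the digest
+        # is meant to be mailed out. The module's actual API is unknown, so
+        # this call is hypothetical and left commented out:
+        # send_email(subject='AI news digest', body=response_context)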
+
+
+if __name__ == "__main__":
+    ainews = AINEWS()
+    ainews.main()