- # -*- coding: utf-8 -*-
- import re
- from playwright.async_api import async_playwright
- import asyncio
- from bs4 import BeautifulSoup
- from api_ollama import *
- from api_kimi import *
- from api_deepseek import *
- from send_to_email import *
class AINEWS:
    """Fetch rendered HTML from deal-aggregator pages, reduce it to clean
    visible text, and print an LLM-generated summary."""

    async def get_htmls(self, urls):
        """Concurrently render each URL in headless Chromium and return the
        visible text of all pages, whitespace-collapsed, joined by newlines.

        Args:
            urls: Iterable of URL strings to fetch.

        Returns:
            One string with one cleaned-text line per URL; a URL that fails
            to load contributes an empty line instead of raising.
        """
        async with async_playwright() as p:
            # One browser + one shared context for all pages.
            browser = await p.chromium.launch(headless=True)
            context = await browser.new_context()

            async def get_html(url):
                # Fetch a single URL in its own page. The page is closed in
                # `finally` so a failed navigation cannot leak it (the
                # original only closed the page on the success path).
                print(f'正在打开: {url}')
                page = None
                try:
                    page = await context.new_page()
                    await page.goto(url)
                    return await page.content()
                except Exception as e:
                    print(f"Error fetching {url}: {e}")
                    return ""
                finally:
                    if page is not None:
                        await page.close()

            # Render every URL concurrently.
            htmls_list = await asyncio.gather(*(get_html(u) for u in urls))

            # Strip markup, then collapse all runs of whitespace (including
            # \n, \t, \r — a single \s+ pass subsumes the original's first
            # [\n\t\r]+ substitution) down to single spaces.
            formatted_htmls = []
            for html in htmls_list:
                soup = BeautifulSoup(html, 'html.parser')
                visible_text = soup.get_text()
                cleaned_text = re.sub(r'\s+', ' ', visible_text).strip()
                formatted_htmls.append(cleaned_text)

            text = "\n".join(formatted_htmls)
            await context.close()
            await browser.close()
        return text

    def main(self):
        """Scrape the smzdm deal pages, build the summarization prompt, and
        print DeepSeek's summary to stdout."""
        urls = ["https://www.smzdm.com/jingxuan/", "https://faxian.smzdm.com/"]
        text = asyncio.run(self.get_htmls(urls))
        # Chinese prompt asking the model to ignore markup残留 and summarize;
        # kept byte-for-byte — it is runtime data sent to the API.
        prompt_words = '''
给你几个个网页的源代码, 里面是未清洗的网页源代码
你可以无视网页源代码的部分,关注内容就行,重复的话就不用说了
帮我总结一下内容, 请用中文回答
'''
        prompt_words += text
        # Alternative backends (ChatBot/ollama, KIMI) are imported at the top
        # of the file and can be swapped in here if DeepSeek is unavailable.
        D = DeepSeek()
        response_context = D.call_deepseek(prompt_words)
        print(response_context)
- if __name__ == "__main__":
- ainews = AINEWS()
- ainews.main()