# -*- coding: utf-8 -*-
import re
import asyncio

from playwright.async_api import async_playwright
from bs4 import BeautifulSoup

from api_ollama import *
from api_kimi import *
from api_deepseek import *
from send_to_email import *


class AINEWS:
    """Scrape deal-aggregation pages with a headless browser and summarize them via an LLM."""

    async def get_htmls(self, urls):
        """Fetch every URL concurrently, strip markup, and return the cleaned texts.

        Args:
            urls: Iterable of page URLs to render.

        Returns:
            A single string: one whitespace-normalized text per page,
            joined with newlines. A page that fails to load contributes
            an empty entry instead of aborting the whole batch.
        """
        async with async_playwright() as p:
            # One shared browser/context; each URL gets its own page.
            browser = await p.chromium.launch(headless=True)
            context = await browser.new_context()

            async def get_html(url):
                # Best-effort fetch: log and return "" on any failure so
                # asyncio.gather still yields a result for every URL.
                try:
                    print(f'正在打开: {url}')
                    page = await context.new_page()
                    try:
                        await page.goto(url)
                        # Rendered (post-JS) HTML, not the raw response body.
                        return await page.content()
                    finally:
                        # Release the page even when goto()/content() raises.
                        await page.close()
                except Exception as e:
                    print(f"Error fetching {url}: {e}")
                    return ""

            # Fetch all pages concurrently; order of results matches `urls`.
            htmls_list = await asyncio.gather(*(get_html(url) for url in urls))

            # Reduce each page to plain text and collapse all runs of
            # whitespace (including \n, \t, \r) to single spaces.
            formatted_htmls = []
            for html in htmls_list:
                soup = BeautifulSoup(html, 'html.parser')
                cleaned_text = re.sub(r'\s+', ' ', soup.get_text()).strip()
                formatted_htmls.append(cleaned_text)

            await context.close()
            await browser.close()

            return "\n".join(formatted_htmls)

    def main(self):
        """Scrape the configured smzdm.com deal pages and print a DeepSeek summary."""
        urls = ["https://www.smzdm.com/jingxuan/", "https://faxian.smzdm.com/"]
        text = asyncio.run(self.get_htmls(urls))

        # Chinese instruction prompt followed by the scraped page text.
        prompt_words = '''
给你几个个网页的源代码, 里面是未清洗的网页源代码
你可以无视网页源代码的部分,关注内容就行,重复的话就不用说了
帮我总结一下内容, 请用中文回答
'''
        prompt_words += text

        # Alternative backends (ChatBot/ollama, KIMI) are imported above
        # and can be swapped in here if DeepSeek is unavailable.
        D = DeepSeek()
        response_context = D.call_deepseek(prompt_words)
        print(response_context)


if __name__ == "__main__":
    ainews = AINEWS()
    ainews.main()