- # -*- coding: utf-8 -*-
- import re
- from playwright.async_api import async_playwright
- import asyncio
- from bs4 import BeautifulSoup
- from api_ollama import *
- from api_kimi import *
- from api_deepseek import *
- from send_to_email import *
class AINEWS:
    """Fetch rendered HTML from deal-aggregator pages, reduce it to clean
    visible text, and print an LLM-generated summary."""

    async def get_htmls(self, urls):
        """Concurrently render each URL in headless Chromium and return the
        visible text of all pages, whitespace-collapsed, joined by newlines.

        Args:
            urls: Iterable of URL strings to fetch.

        Returns:
            One string with one cleaned-text line per URL; a URL that fails
            to load contributes an empty line instead of raising.
        """
        async with async_playwright() as p:
            # One browser + one shared context for all pages.
            browser = await p.chromium.launch(headless=True)
            context = await browser.new_context()

            async def get_html(url):
                # Fetch a single URL in its own page. The page is closed in
                # `finally` so a failed navigation cannot leak it (the
                # original only closed the page on the success path).
                print(f'正在打开: {url}')
                page = None
                try:
                    page = await context.new_page()
                    await page.goto(url)
                    return await page.content()
                except Exception as e:
                    print(f"Error fetching {url}: {e}")
                    return ""
                finally:
                    if page is not None:
                        await page.close()

            # Render every URL concurrently.
            htmls_list = await asyncio.gather(*(get_html(u) for u in urls))

            # Strip markup, then collapse all runs of whitespace (including
            # \n, \t, \r — a single \s+ pass subsumes the original's first
            # [\n\t\r]+ substitution) down to single spaces.
            formatted_htmls = []
            for html in htmls_list:
                soup = BeautifulSoup(html, 'html.parser')
                visible_text = soup.get_text()
                cleaned_text = re.sub(r'\s+', ' ', visible_text).strip()
                formatted_htmls.append(cleaned_text)

            text = "\n".join(formatted_htmls)
            await context.close()
            await browser.close()
        return text

    def main(self):
        """Scrape the smzdm deal pages, build the summarization prompt, and
        print DeepSeek's summary to stdout."""
        urls = ["https://www.smzdm.com/jingxuan/", "https://faxian.smzdm.com/"]
        text = asyncio.run(self.get_htmls(urls))
        # Chinese prompt asking the model to ignore markup残留 and summarize;
        # kept byte-for-byte — it is runtime data sent to the API.
        prompt_words = '''
给你几个个网页的源代码, 里面是未清洗的网页源代码
你可以无视网页源代码的部分,关注内容就行,重复的话就不用说了
帮我总结一下内容, 请用中文回答
'''
        prompt_words += text
        # Alternative backends (ChatBot/ollama, KIMI) are imported at the top
        # of the file and can be swapped in here if DeepSeek is unavailable.
        D = DeepSeek()
        response_context = D.call_deepseek(prompt_words)
        print(response_context)
- if __name__ == "__main__":
- ainews = AINEWS()
- ainews.main()