|
|
@@ -1,17 +1,53 @@
|
|
|
# -*- coding: utf-8 -*-
|
|
|
+import os
|
|
|
import re
|
|
|
+import json
|
|
|
from playwright.async_api import async_playwright
|
|
|
import asyncio
|
|
|
from bs4 import BeautifulSoup
|
|
|
-
|
|
|
from api_ollama import *
|
|
|
from api_kimi import *
|
|
|
from api_deepseek import *
|
|
|
-
|
|
|
from send_to_email import *
|
|
|
|
|
|
+key = 'web3'
|
|
|
+
|
|
|
|
|
|
class AINEWS:
|
|
|
def save_to_txt(self, text):
    """Persist *text* to a timestamped UTF-8 .txt file under ./save_txt next to this module.

    Args:
        text: String content to write.
    """
    import time  # NOTE(review): `time` was used here but never imported at module level
    base_dir = os.path.dirname(__file__)
    save_dir = os.path.join(base_dir, 'save_txt')
    # exist_ok avoids the check-then-create race of the original exists()/makedirs() pair.
    os.makedirs(save_dir, exist_ok=True)
    # Unix-timestamp filename keeps saves unique and chronologically sortable.
    path = os.path.join(save_dir, str(int(time.time())) + '.txt')
    # Bug fix: the original reused the name `file` for both the path and the
    # open handle (`with open(file, ...) as file:`), clobbering the path variable.
    with open(path, 'w', encoding='utf-8') as fh:
        fh.write(text)
|
|
|
+
|
|
|
def load_config(self, key):
    """Load the scraping settings for *key* from ./config.json.

    Args:
        key: Top-level section name in config.json (e.g. 'web3').

    Returns:
        tuple: (target_url_list, prompt_words) taken from that section.

    Raises:
        SystemExit: when config.json is missing or empty.
        KeyError: when *key* (or a required field) is absent from the config.

    NOTE(review): the path is resolved against the current working directory,
    not the module directory — confirm that is intended.
    """
    config = {}
    if os.path.exists('config.json'):
        with open('config.json', 'r', encoding='utf-8') as f:
            config = json.load(f)

    if not config:
        # Fixed message grammar, and raise SystemExit(1) explicitly: the
        # original used the site-injected `exit()` helper, which exits with
        # status 0 even though this is an error path.
        print('config.json does not exist!')
        raise SystemExit(1)

    section = config[key]
    return section['target_url_list'], section['prompt_words']
|
|
|
+
|
|
|
@staticmethod
async def scroll_to_percentage(page):
    """Progressively scroll *page* to trigger lazy-loaded content, then return to the top.

    Args:
        page: A Playwright async Page that has finished navigating.
    """
    # 5%, 7%, ..., 99%, then 100%. Bug fix: the original range(5, 101, 2)
    # stops at 99, so the true bottom of the page was never reached and
    # content lazy-loaded at the very end could be missed.
    for percentage in list(range(5, 100, 2)) + [100]:
        # Recompute the height on every step — lazy loading can grow the page.
        height = await page.evaluate("() => document.body.scrollHeight")
        scroll_position = height * (percentage / 100)
        # Smooth-scroll to the computed offset.
        await page.evaluate(f"window.scrollTo({{top: {scroll_position}, behavior: 'smooth'}})")
        await asyncio.sleep(0.5)  # async sleep keeps the event loop responsive
    # Return to the top so subsequent extraction starts from a known state.
    await page.evaluate("window.scrollTo({top: 0, behavior: 'smooth'})")
|
|
|
+
|
|
|
async def get_htmls(self, urls):
|
|
|
htmls = []
|
|
|
async with async_playwright() as p:
|
|
|
@@ -27,6 +63,10 @@ class AINEWS:
|
|
|
page = await context.new_page()
|
|
|
# 导航到指定网址
|
|
|
await page.goto(url)
|
|
|
+
|
|
|
+ # 滚动页面, 获取更多信息
|
|
|
+ await self.scroll_to_percentage(page)
|
|
|
+
|
|
|
# 获取渲染后的 HTML
|
|
|
html = await page.content()
|
|
|
# 关闭页面
|
|
|
@@ -58,17 +98,11 @@ class AINEWS:
|
|
|
|
|
|
return text
|
|
|
|
|
|
- def main(self):
|
|
|
- urls = ["https://www.smzdm.com/jingxuan/", "https://faxian.smzdm.com/"]
|
|
|
- text = asyncio.run(self.get_htmls(urls))
|
|
|
+ def main(self, target_url_list, prompt_words):
|
|
|
+ text = asyncio.run(self.get_htmls(target_url_list))
|
|
|
|
|
|
- # print(text)
|
|
|
+ self.save_to_txt(text)
|
|
|
|
|
|
- prompt_words = '''
|
|
|
- 给你几个个网页的源代码, 里面是未清洗的网页源代码
|
|
|
- 你可以无视网页源代码的部分,关注内容就行,重复的话就不用说了
|
|
|
- 帮我总结一下内容, 请用中文回答
|
|
|
- '''
|
|
|
prompt_words += text
|
|
|
|
|
|
# C = ChatBot('http://erhe.top:27381', prompt_words, 'qwen2.5:3b')
|
|
|
@@ -79,11 +113,13 @@ class AINEWS:
|
|
|
# response_context = K.call_kimi(prompt_words)
|
|
|
# print(response_context)
|
|
|
|
|
|
- D = DeepSeek()
|
|
|
- response_context = D.call_deepseek(prompt_words)
|
|
|
- print(response_context)
|
|
|
+ # D = DeepSeek()
|
|
|
+ # response_context = D.call_deepseek(prompt_words)
|
|
|
+ # print(response_context)
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: read the configured section, run the scrape
    # pipeline, and report completion.
    app = AINEWS()
    urls, prompt = app.load_config(key)
    app.main(urls, prompt)
    print('done!')
|