jack 11 months ago
parent
commit
04eb52e02c
3 changed files with 63 additions and 15 deletions
  1. .gitignore (+1 -0)
  2. config.json (+11 -0)
  3. main.py (+51 -15)

+ 1 - 0
.gitignore

@@ -51,3 +51,4 @@ coverage.xml
 *.pot
 
 *.log
+save_txt/*

+ 11 - 0
config.json

@@ -0,0 +1,11 @@
+{
+  "web3": {
+    "target_url_list": [
+      "https://wublock123.com",
+      "https://m.odaily.news/newsflash",
+      "https://www.chaincatcher.com/news",
+      "https://m.marsbit.co/flash/"
+    ],
+    "prompt_words": "给你一个或多个网页的源代码, 里面是未清洗的网页源代码,你可以无视网页源代码的部分,关注内容就行,重复的话就不用说了,帮我总结一下这些网站的内容, 请用中文回答"
+  }
+}
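
The new config.json moves the scrape targets and the summarization prompt out of main.py and under a single "web3" key; the Chinese prompt_words string asks the model to ignore the unclean parts of the raw page source, skip repeated items, summarize the listed sites' content, and answer in Chinese. Below is a minimal sketch of how this structure is consumed, mirroring load_config in main.py further down; the explicit key check is illustrative and not part of the commit:

# sketch: read config.json the way main.py's load_config does,
# plus an illustrative check that the expected keys are present
import json

with open('config.json', 'r', encoding='utf-8') as f:
    config = json.load(f)

section = config.get('web3', {})
if 'target_url_list' not in section or 'prompt_words' not in section:
    raise SystemExit('config.json is missing the expected "web3" keys')

target_url_list = section['target_url_list']
prompt_words = section['prompt_words']
print(f'{len(target_url_list)} target URLs, prompt of {len(prompt_words)} characters')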

+ 51 - 15
main.py

@@ -1,17 +1,53 @@
 # -*- coding: utf-8 -*-
+import os
 import re
+import json
 from playwright.async_api import async_playwright
 import asyncio
 from bs4 import BeautifulSoup
-
 from api_ollama import *
 from api_kimi import *
 from api_deepseek import *
-
 from send_to_email import *
 
+key = 'web3'
+
 
 class AINEWS:
+    def save_to_txt(self, text):
+        current_file_path = os.path.dirname(__file__)
+        save_file_path = os.path.join(current_file_path, 'save_txt')
+        if not os.path.exists(save_file_path):
+            os.makedirs(save_file_path)
+        file = os.path.join(save_file_path, str(int(time.time())) + '.txt')
+        with open(file, 'w', encoding='utf-8') as f:
+            f.write(text)
+
+    def load_config(self, key):
+        config = {}
+        if os.path.exists('config.json'):
+            with open('config.json', 'r', encoding='utf-8') as f:
+                config = json.load(f)
+
+        if not config:
+            print('config.json does not exist!')
+            exit()
+
+        k = config[key]
+        return k['target_url_list'], k['prompt_words']
+
+    @staticmethod
+    async def scroll_to_percentage(page):
+        percentage_list = [i for i in range(5, 101, 2)]
+        for percentage in percentage_list:
+            # compute the target height at this percentage of the page
+            height = await page.evaluate("() => document.body.scrollHeight")
+            scroll_position = height * (percentage / 100)
+            # jump to that percentage position
+            await page.evaluate(f"window.scrollTo({{top: {scroll_position}, behavior: 'smooth'}})")
+            await asyncio.sleep(0.5)  # use an async sleep
+        await page.evaluate("window.scrollTo({top: 0, behavior: 'smooth'})")
+
     async def get_htmls(self, urls):
         htmls = []
         async with async_playwright() as p:
@@ -27,6 +63,10 @@ class AINEWS:
                     page = await context.new_page()
                     # navigate to the target URL
                     await page.goto(url)
+
+                    # scroll the page to pick up more content
+                    await self.scroll_to_percentage(page)
+
                     # get the rendered HTML
                     html = await page.content()
                     # close the page
@@ -58,17 +98,11 @@ class AINEWS:
 
             return text
 
-    def main(self):
-        urls = ["https://www.smzdm.com/jingxuan/", "https://faxian.smzdm.com/"]
-        text = asyncio.run(self.get_htmls(urls))
+    def main(self, target_url_list, prompt_words):
+        text = asyncio.run(self.get_htmls(target_url_list))
 
-        # print(text)
+        self.save_to_txt(text)
 
-        prompt_words = '''
-            给你几个个网页的源代码, 里面是未清洗的网页源代码
-            你可以无视网页源代码的部分,关注内容就行,重复的话就不用说了
-            帮我总结一下内容, 请用中文回答
-        '''
         prompt_words += text
 
         # C = ChatBot('http://erhe.top:27381', prompt_words, 'qwen2.5:3b')
@@ -79,11 +113,13 @@ class AINEWS:
         # response_context = K.call_kimi(prompt_words)
         # print(response_context)
 
-        D = DeepSeek()
-        response_context = D.call_deepseek(prompt_words)
-        print(response_context)
+        # D = DeepSeek()
+        # response_context = D.call_deepseek(prompt_words)
+        # print(response_context)
 
 
 if __name__ == "__main__":
     ainews = AINEWS()
-    ainews.main()
+    target_url_list, prompt_words = ainews.load_config(key)
+    ainews.main(target_url_list, prompt_words)
+    print('done!')
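
The main behavioral change in get_htmls is the scrolling pass: scroll_to_percentage steps from 5% to 100% of the page height in 2% increments, pausing 0.5 s per step, presumably so lazily loaded newsflash items have time to render, then returns to the top before page.content() captures the HTML. Here is a standalone sketch of the same idea against a single page; the helper name fetch_scrolled_html and the headless launch option are illustrative, only the scrolling values come from the commit:

# sketch: progressive scroll before capturing rendered HTML, outside the AINEWS class
import asyncio
from playwright.async_api import async_playwright

async def fetch_scrolled_html(url):
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        await page.goto(url)
        # walk from 5% to 100% of the page height in 2% steps, as scroll_to_percentage does
        for percentage in range(5, 101, 2):
            height = await page.evaluate("() => document.body.scrollHeight")
            position = height * percentage / 100
            await page.evaluate(f"window.scrollTo({{top: {position}, behavior: 'smooth'}})")
            await asyncio.sleep(0.5)  # give lazy-loaded content time to appear
        await page.evaluate("window.scrollTo({top: 0, behavior: 'smooth'})")
        html = await page.content()
        await browser.close()
        return html

# usage with one of the configured targets:
# html = asyncio.run(fetch_scrolled_html("https://www.chaincatcher.com/news"))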