# main.py
# -*- coding: utf-8 -*-
import asyncio
import json
import os
import re
import time

from bs4 import BeautifulSoup
from playwright.async_api import async_playwright

from api_deepseek import *
from api_kimi import *
from api_ollama import *
from send_to_email import *
# Section of config.json to load: config[key] holds 'target_url_list' and 'prompt_words'.
key = 'web3'
  13. class AINEWS:
  14. def save_to_txt(self, text):
  15. current_file_path = os.path.dirname(__file__)
  16. save_file_path = os.path.join(current_file_path, 'save_txt')
  17. if not os.path.exists(save_file_path):
  18. os.makedirs(save_file_path)
  19. file = os.path.join(save_file_path, str(int(time.time())) + '.txt')
  20. with open(file, 'w', encoding='utf-8') as file:
  21. file.write(text)
  22. def load_config(self, key):
  23. config = {}
  24. if os.path.exists('config.json'):
  25. with open('config.json', 'r', encoding='utf-8') as f:
  26. config = json.load(f)
  27. if not config:
  28. print('config.json is not exist!')
  29. exit()
  30. k = config[key]
  31. return k['target_url_list'], k['prompt_words']
  32. @staticmethod
  33. async def scroll_to_percentage(page):
  34. percentage_list = [i for i in range(5, 101, 2)]
  35. for percentage in percentage_list:
  36. # 计算页面的指定百分比高度
  37. height = await page.evaluate("() => document.body.scrollHeight")
  38. scroll_position = height * (percentage / 100)
  39. # 跳转到指定的百分比位置
  40. await page.evaluate(f"window.scrollTo({{top: {scroll_position}, behavior: 'smooth'}})")
  41. await asyncio.sleep(0.5) # 使用异步 sleep
  42. await page.evaluate("window.scrollTo({top: 0, behavior: 'smooth'})")
  43. async def get_htmls(self, urls):
  44. htmls = []
  45. async with async_playwright() as p:
  46. # 启动浏览器
  47. browser = await p.chromium.launch(headless=True)
  48. # 创建浏览器上下文
  49. context = await browser.new_context()
  50. async def get_html(url):
  51. try:
  52. print(f'正在打开: {url}')
  53. # 在上下文中打开新页面
  54. page = await context.new_page()
  55. # 导航到指定网址
  56. await page.goto(url)
  57. # 滚动页面, 获取更多信息
  58. await self.scroll_to_percentage(page)
  59. # 获取渲染后的 HTML
  60. html = await page.content()
  61. # 关闭页面
  62. await page.close()
  63. return html
  64. except Exception as e:
  65. print(f"Error fetching {url}: {e}")
  66. return ""
  67. # 使用 asyncio.gather 同时获取所有网站的 HTML
  68. tasks = [get_html(url) for url in urls]
  69. htmls_list = await asyncio.gather(*tasks)
  70. # 使用 BeautifulSoup 格式化每个 HTML 内容
  71. formatted_htmls = []
  72. for html in htmls_list:
  73. soup = BeautifulSoup(html, 'html.parser')
  74. formatted_html = soup.get_text()
  75. cleaned_text = re.sub(r'[\n\t\r]+', ' ', formatted_html)
  76. cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
  77. formatted_htmls.append(cleaned_text)
  78. # 将所有格式化后的 HTML 内容合并到一个字符串中
  79. text = "\n".join(formatted_htmls)
  80. # 关闭上下文和浏览器
  81. await context.close()
  82. await browser.close()
  83. return text
  84. def main(self, target_url_list, prompt_words):
  85. text = asyncio.run(self.get_htmls(target_url_list))
  86. self.save_to_txt(text)
  87. prompt_words += text
  88. # C = ChatBot('http://erhe.top:27381', prompt_words, 'qwen2.5:3b')
  89. # response_context = C.start_chat()
  90. # print(response_context)
  91. # K = KIMI()
  92. # response_context = K.call_kimi(prompt_words)
  93. # print(response_context)
  94. # D = DeepSeek()
  95. # response_context = D.call_deepseek(prompt_words)
  96. # print(response_context)
  97. if __name__ == "__main__":
  98. ainews = AINEWS()
  99. target_url_list, prompt_words = ainews.load_config(key)
  100. ainews.main(target_url_list, prompt_words)
  101. print('done!')