newsnow.py

import httpx
import json
import pandas as pd
from typing import List, Dict, Any
import time


class NewsNowSpider:
    def __init__(self):
        self.base_url = "https://newsnow.busiyi.world/api/s/entire"
        self.headers = {
            'authority': 'newsnow.busiyi.world',
            'accept': 'application/json',
            'accept-encoding': 'gzip, deflate, br, zstd',
            'accept-language': 'zh-CN,zh;q=0.7',
            'content-type': 'application/json',
            'origin': 'https://newsnow.busiyi.world',
            'priority': 'u=1, i',
            'referer': 'https://newsnow.busiyi.world/c/focus',
            'sec-ch-ua': '"Chromium";v="142", "Brave";v="142", "Not_A Brand";v="99"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"macOS"',
            'sec-fetch-dest': 'empty',
            'sec-fetch-mode': 'cors',
            'sec-fetch-site': 'same-origin',
            'sec-gpc': '1',
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36'
        }
        self.payload = {
            "sources": [
                "cls-depth", "cls-hot", "cls-telegraph",
                "fastbull-express", "fastbull-news", "gelonghui",
                "jin10", "mktnews-flash", "wallstreetcn-hot",
                "wallstreetcn-news", "wallstreetcn-quick", "xueqiu-hotstock"
            ]
        }
    def fetch_news(self, timeout: float = 30.0) -> Dict[str, Any]:
        """
        Fetch news data.

        Args:
            timeout: Request timeout in seconds.

        Returns:
            The JSON response returned by the API.
        """
        try:
            with httpx.Client(headers=self.headers, timeout=timeout) as client:
                response = client.post(self.base_url, json=self.payload)
                response.raise_for_status()  # Raise an exception for non-2xx status codes
                data = response.json()
                print(f"Fetched data successfully: {len(data)} news items")
                return data
        except httpx.HTTPStatusError as e:
            print(f"HTTP error: {e.response.status_code} - {e.response.text}")
            return {}
        except httpx.RequestError as e:
            print(f"Request error: {e}")
            return {}
        except json.JSONDecodeError as e:
            print(f"JSON decode error: {e}")
            return {}
        except Exception as e:
            print(f"Unexpected error: {e}")
            return {}
    def parse_news_data(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Parse the raw news data.

        Args:
            data: Raw data returned by the API.

        Returns:
            The parsed list of news items.
        """
        parsed_news = []
        if not data:
            return parsed_news
        for item in data:
            try:
                news_item = {
                    'id': item.get('id', ''),
                    'title': item.get('title', ''),
                    'content': item.get('content', ''),
                    'source': item.get('source', ''),
                    'publish_time': item.get('publishTime', ''),
                    'create_time': item.get('createTime', ''),
                    'url': item.get('url', ''),
                    'image_url': item.get('imageUrl', ''),
                    'importance': item.get('importance', 0),
                    'sentiment': item.get('sentiment', 0),
                    'tags': item.get('tags', []),
                    'related_stocks': item.get('relatedStocks', [])
                }
                parsed_news.append(news_item)
            except Exception as e:
                print(f"Error parsing news item: {e}")
                continue
        return parsed_news
    def save_to_json(self, data: List[Dict[str, Any]], filename: str = None):
        """
        Save data to a JSON file.

        Args:
            data: Data to save.
            filename: Output filename; if None, a timestamped name is used.
        """
        if not filename:
            timestamp = time.strftime("%Y%m%d_%H%M%S")
            filename = f"news_data_{timestamp}.json"
        try:
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
            print(f"Data saved to: {filename}")
        except Exception as e:
            print(f"Error saving JSON file: {e}")
    def save_to_excel(self, data: List[Dict[str, Any]], filename: str = None):
        """
        Save data to an Excel file.

        Args:
            data: Data to save.
            filename: Output filename; if None, a timestamped name is used.
        """
        if not data:
            print("No data to save")
            return
        if not filename:
            timestamp = time.strftime("%Y%m%d_%H%M%S")
            filename = f"news_data_{timestamp}.xlsx"
        try:
            df = pd.DataFrame(data)
            df.to_excel(filename, index=False, engine='openpyxl')
            print(f"Data saved to: {filename}")
        except Exception as e:
            print(f"Error saving Excel file: {e}")
    def run(self, save_json: bool = True, save_excel: bool = True):
        """
        Run the spider.

        Args:
            save_json: Whether to save the results as a JSON file.
            save_excel: Whether to save the results as an Excel file.
        """
        print("Fetching news data...")

        # Fetch raw data
        raw_data = self.fetch_news()
        if not raw_data:
            print("Failed to fetch data")
            return

        # Parse raw data
        parsed_data = self.parse_news_data(raw_data)
        if not parsed_data:
            print("No valid data parsed")
            return
        print(f"Parsed {len(parsed_data)} news items")

        # Save results
        if save_json:
            self.save_to_json(parsed_data)
        if save_excel:
            self.save_to_excel(parsed_data)
        return parsed_data
    def get_news_by_source(self, source: str) -> List[Dict[str, Any]]:
        """
        Filter news items by source.

        Args:
            source: News source to filter by.

        Returns:
            The filtered list of news items.
        """
        raw_data = self.fetch_news()
        if not raw_data:
            return []
        all_news = self.parse_news_data(raw_data)
        filtered_news = [news for news in all_news if news.get('source') == source]
        print(f"News items from source '{source}': {len(filtered_news)}")
        return filtered_news
# Usage example
if __name__ == "__main__":
    # Create a spider instance
    spider = NewsNowSpider()

    # Option 1: run the full pipeline (fetch all data and save it)
    news_data = spider.run(save_json=True, save_excel=True)

    # Option 2: fetch news from a single source only
    # wallstreet_news = spider.get_news_by_source("wallstreetcn")

    # Option 3: fetch and parse without saving
    # raw_data = spider.fetch_news()
    # parsed_data = spider.parse_news_data(raw_data)

    # Print the first few news titles
    if news_data:
        print("\nFirst 3 news titles:")
        for i, news in enumerate(news_data[:3]):
            print(f"{i + 1}. {news['title']} (source: {news['source']})")
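
# Hedged usage sketch: importing the spider from a separate script and keeping
# only items flagged as important. The 'importance' field comes from the
# parse_news_data mapping above; its exact range and meaning in the API
# response are an assumption, not something this code confirms. Shown
# commented out, in the same style as the optional examples above.
#
# from newsnow import NewsNowSpider
#
# spider = NewsNowSpider()
# items = spider.run(save_json=False, save_excel=False) or []
# important = [n for n in items if n.get('importance', 0) > 0]
# print(f"{len(important)} items flagged as important")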