import httpx
import json
import time
from typing import Any, Dict, List, Optional

import pandas as pd


class NewsNowSpider:
    def __init__(self):
        self.base_url = "https://newsnow.busiyi.world/api/s/entire"
        # Browser-style headers so the request matches what the web UI sends
        self.headers = {
            'authority': 'newsnow.busiyi.world',
            'accept': 'application/json',
            'accept-encoding': 'gzip, deflate, br, zstd',
            'accept-language': 'zh-CN,zh;q=0.7',
            'content-type': 'application/json',
            'origin': 'https://newsnow.busiyi.world',
            'priority': 'u=1, i',
            'referer': 'https://newsnow.busiyi.world/c/focus',
            'sec-ch-ua': '"Chromium";v="142", "Brave";v="142", "Not_A Brand";v="99"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"macOS"',
            'sec-fetch-dest': 'empty',
            'sec-fetch-mode': 'cors',
            'sec-fetch-site': 'same-origin',
            'sec-gpc': '1',
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36'
        }
        # News sources requested from the API
        self.payload = {
            "sources": [
                "cls-depth", "cls-hot", "cls-telegraph",
                "fastbull-express", "fastbull-news", "gelonghui",
                "jin10", "mktnews-flash", "wallstreetcn-hot",
                "wallstreetcn-news", "wallstreetcn-quick", "xueqiu-hotstock"
            ]
        }

    def fetch_news(self, timeout: float = 30.0) -> List[Dict[str, Any]]:
        """
        Fetch news data from the API.

        Args:
            timeout: Request timeout in seconds.

        Returns:
            The API's JSON response, treated throughout this script as a list
            of news items (an empty list is returned on any error).
        """
        try:
            with httpx.Client(headers=self.headers, timeout=timeout) as client:
                response = client.post(self.base_url, json=self.payload)
                response.raise_for_status()  # Raise for non-2xx status codes
                data = response.json()
                print(f"Fetched data successfully: {len(data)} news items")
                return data
        except httpx.HTTPStatusError as e:
            print(f"HTTP error: {e.response.status_code} - {e.response.text}")
            return []
        except httpx.RequestError as e:
            print(f"Request error: {e}")
            return []
        except json.JSONDecodeError as e:
            print(f"JSON decode error: {e}")
            return []
        except Exception as e:
            print(f"Unexpected error: {e}")
            return []

    def parse_news_data(self, data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Parse the raw news data.

        Args:
            data: Raw data returned by the API.

        Returns:
            A list of parsed news items.
        """
        parsed_news = []
        if not data:
            return parsed_news
        for item in data:
            try:
                # Map the API fields onto flat, snake_case keys
                news_item = {
                    'id': item.get('id', ''),
                    'title': item.get('title', ''),
                    'content': item.get('content', ''),
                    'source': item.get('source', ''),
                    'publish_time': item.get('publishTime', ''),
                    'create_time': item.get('createTime', ''),
                    'url': item.get('url', ''),
                    'image_url': item.get('imageUrl', ''),
                    'importance': item.get('importance', 0),
                    'sentiment': item.get('sentiment', 0),
                    'tags': item.get('tags', []),
                    'related_stocks': item.get('relatedStocks', [])
                }
                parsed_news.append(news_item)
            except Exception as e:
                print(f"Error while parsing a news item: {e}")
                continue
        return parsed_news

    def save_to_json(self, data: List[Dict[str, Any]], filename: Optional[str] = None) -> None:
        """
        Save the data to a JSON file.

        Args:
            data: The data to save.
            filename: Output file name; if None, a timestamped name is used.
        """
        if not filename:
            timestamp = time.strftime("%Y%m%d_%H%M%S")
            filename = f"news_data_{timestamp}.json"
        try:
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
            print(f"Data saved to: {filename}")
        except Exception as e:
            print(f"Error while saving the JSON file: {e}")

    def save_to_excel(self, data: List[Dict[str, Any]], filename: Optional[str] = None) -> None:
        """
        Save the data to an Excel file.

        Args:
            data: The data to save.
            filename: Output file name; if None, a timestamped name is used.
        """
        if not data:
            print("No data to save")
            return
        if not filename:
            timestamp = time.strftime("%Y%m%d_%H%M%S")
            filename = f"news_data_{timestamp}.xlsx"
        try:
            df = pd.DataFrame(data)
            df.to_excel(filename, index=False, engine='openpyxl')
            print(f"Data saved to: {filename}")
        except Exception as e:
            print(f"Error while saving the Excel file: {e}")

    def run(self, save_json: bool = True, save_excel: bool = True) -> Optional[List[Dict[str, Any]]]:
        """
        Run the spider end to end.

        Args:
            save_json: Whether to save the results as a JSON file.
            save_excel: Whether to save the results as an Excel file.
        """
        print("Fetching news data...")
        # Fetch the raw data
        raw_data = self.fetch_news()
        if not raw_data:
            print("No data could be fetched")
            return None
        # Parse the data
        parsed_data = self.parse_news_data(raw_data)
        if not parsed_data:
            print("No valid data was parsed")
            return None
        print(f"Parsed {len(parsed_data)} news items")
        # Save the data
        if save_json:
            self.save_to_json(parsed_data)
        if save_excel:
            self.save_to_excel(parsed_data)
        return parsed_data

    def get_news_by_source(self, source: str) -> List[Dict[str, Any]]:
        """
        Filter the news by source.

        Args:
            source: The news source to keep.

        Returns:
            The filtered list of news items.
        """
        raw_data = self.fetch_news()
        if not raw_data:
            return []
        all_news = self.parse_news_data(raw_data)
        filtered_news = [news for news in all_news if news.get('source') == source]
        print(f"Number of news items from source '{source}': {len(filtered_news)}")
        return filtered_news


# Usage example
if __name__ == "__main__":
    # Create a spider instance
    spider = NewsNowSpider()

    # Method 1: run the full spider (fetch everything and save it)
    news_data = spider.run(save_json=True, save_excel=True)

    # Method 2: fetch news from a specific source only
    # wallstreet_news = spider.get_news_by_source("wallstreetcn-hot")

    # Method 3: fetch and parse the data without saving it
    # raw_data = spider.fetch_news()
    # parsed_data = spider.parse_news_data(raw_data)

    # Print the first few news items
    if news_data:
        print("\nTitles of the first 3 news items:")
        for i, news in enumerate(news_data[:3]):
            print(f"{i + 1}. {news['title']} (source: {news['source']})")
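    # Method 4 (illustrative sketch, not part of the original script): poll the
    # API periodically and refresh the saved files. The 300-second interval is
    # an arbitrary assumption; adjust it to whatever refresh rate you need.
    # while True:
    #     spider.run(save_json=True, save_excel=False)
    #     time.sleep(300)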