import httpx
import json
import time
from typing import Any, Dict, List, Optional

import pandas as pd


class NewsNowSpider:
    def __init__(self):
        self.base_url = "https://newsnow.busiyi.world/api/s/entire"
        # Browser-style headers so the request matches what the web UI sends
        self.headers = {
            'authority': 'newsnow.busiyi.world',
            'accept': 'application/json',
            'accept-encoding': 'gzip, deflate, br, zstd',
            'accept-language': 'zh-CN,zh;q=0.7',
            'content-type': 'application/json',
            'origin': 'https://newsnow.busiyi.world',
            'priority': 'u=1, i',
            'referer': 'https://newsnow.busiyi.world/c/focus',
            'sec-ch-ua': '"Chromium";v="142", "Brave";v="142", "Not_A Brand";v="99"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"macOS"',
            'sec-fetch-dest': 'empty',
            'sec-fetch-mode': 'cors',
            'sec-fetch-site': 'same-origin',
            'sec-gpc': '1',
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36'
        }
        # News sources requested from the API
        self.payload = {
            "sources": [
                "cls-depth", "cls-hot", "cls-telegraph",
                "fastbull-express", "fastbull-news", "gelonghui",
                "jin10", "mktnews-flash", "wallstreetcn-hot",
                "wallstreetcn-news", "wallstreetcn-quick", "xueqiu-hotstock"
            ]
        }

    def fetch_news(self, timeout: float = 30.0) -> List[Dict[str, Any]]:
        """
        Fetch news data from the API.

        Args:
            timeout: Request timeout in seconds.

        Returns:
            The API's JSON response, treated throughout this script as a list
            of news items (an empty list is returned on any error).
        """
        try:
            with httpx.Client(headers=self.headers, timeout=timeout) as client:
                response = client.post(self.base_url, json=self.payload)
                response.raise_for_status()  # Raise for non-2xx status codes
                data = response.json()
                print(f"Fetched data successfully: {len(data)} news items")
                return data
        except httpx.HTTPStatusError as e:
            print(f"HTTP error: {e.response.status_code} - {e.response.text}")
            return []
        except httpx.RequestError as e:
            print(f"Request error: {e}")
            return []
        except json.JSONDecodeError as e:
            print(f"JSON decode error: {e}")
            return []
        except Exception as e:
            print(f"Unexpected error: {e}")
            return []

    def parse_news_data(self, data: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Parse the raw news data.

        Args:
            data: Raw data returned by the API.

        Returns:
            A list of parsed news items.
        """
        parsed_news = []
        if not data:
            return parsed_news
        for item in data:
            try:
                # Map the API fields onto flat, snake_case keys
                news_item = {
                    'id': item.get('id', ''),
                    'title': item.get('title', ''),
                    'content': item.get('content', ''),
                    'source': item.get('source', ''),
                    'publish_time': item.get('publishTime', ''),
                    'create_time': item.get('createTime', ''),
                    'url': item.get('url', ''),
                    'image_url': item.get('imageUrl', ''),
                    'importance': item.get('importance', 0),
                    'sentiment': item.get('sentiment', 0),
                    'tags': item.get('tags', []),
                    'related_stocks': item.get('relatedStocks', [])
                }
                parsed_news.append(news_item)
            except Exception as e:
                print(f"Error while parsing a news item: {e}")
                continue
        return parsed_news

    def save_to_json(self, data: List[Dict[str, Any]], filename: Optional[str] = None) -> None:
        """
        Save the data to a JSON file.

        Args:
            data: The data to save.
            filename: Output file name; if None, a timestamped name is used.
        """
        if not filename:
            timestamp = time.strftime("%Y%m%d_%H%M%S")
            filename = f"news_data_{timestamp}.json"
        try:
            with open(filename, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=2)
            print(f"Data saved to: {filename}")
        except Exception as e:
            print(f"Error while saving the JSON file: {e}")

    def save_to_excel(self, data: List[Dict[str, Any]], filename: Optional[str] = None) -> None:
        """
        Save the data to an Excel file.

        Args:
            data: The data to save.
            filename: Output file name; if None, a timestamped name is used.
        """
        if not data:
            print("No data to save")
            return
        if not filename:
            timestamp = time.strftime("%Y%m%d_%H%M%S")
            filename = f"news_data_{timestamp}.xlsx"
        try:
            df = pd.DataFrame(data)
            df.to_excel(filename, index=False, engine='openpyxl')
            print(f"Data saved to: {filename}")
        except Exception as e:
            print(f"Error while saving the Excel file: {e}")

    def run(self, save_json: bool = True, save_excel: bool = True) -> Optional[List[Dict[str, Any]]]:
        """
        Run the spider end to end.

        Args:
            save_json: Whether to save the results as a JSON file.
            save_excel: Whether to save the results as an Excel file.
        """
        print("Fetching news data...")
        # Fetch the raw data
        raw_data = self.fetch_news()
        if not raw_data:
            print("No data could be fetched")
            return None
        # Parse the data
        parsed_data = self.parse_news_data(raw_data)
        if not parsed_data:
            print("No valid data was parsed")
            return None
        print(f"Parsed {len(parsed_data)} news items")
        # Save the data
        if save_json:
            self.save_to_json(parsed_data)
        if save_excel:
            self.save_to_excel(parsed_data)
        return parsed_data

    def get_news_by_source(self, source: str) -> List[Dict[str, Any]]:
        """
        Filter the news by source.

        Args:
            source: The news source to keep.

        Returns:
            The filtered list of news items.
        """
        raw_data = self.fetch_news()
        if not raw_data:
            return []
        all_news = self.parse_news_data(raw_data)
        filtered_news = [news for news in all_news if news.get('source') == source]
        print(f"Number of news items from source '{source}': {len(filtered_news)}")
        return filtered_news


# Usage example
if __name__ == "__main__":
    # Create a spider instance
    spider = NewsNowSpider()

    # Method 1: run the full spider (fetch everything and save it)
    news_data = spider.run(save_json=True, save_excel=True)

    # Method 2: fetch news from a specific source only
    # wallstreet_news = spider.get_news_by_source("wallstreetcn-hot")

    # Method 3: fetch and parse the data without saving it
    # raw_data = spider.fetch_news()
    # parsed_data = spider.parse_news_data(raw_data)

    # Print the first few news items
    if news_data:
        print("\nTitles of the first 3 news items:")
        for i, news in enumerate(news_data[:3]):
            print(f"{i + 1}. {news['title']} (source: {news['source']})")
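    # Method 4 (illustrative sketch, not part of the original script): poll the
    # API periodically and refresh the saved files. The 300-second interval is
    # an arbitrary assumption; adjust it to whatever refresh rate you need.
    # while True:
    #     spider.run(save_json=True, save_excel=False)
    #     time.sleep(300)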