| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133 |
- # -*- coding: utf-8 -*-
- '''
- 币世界 文章板块
- '''
- import httpx
- import os
- import sys
- from httpx import HTTPStatusError
- import re
- import time
- from datetime import datetime
- sys.path.append(os.path.join(os.path.abspath(__file__).split('AutoInfo')[0] + 'AutoInfo'))
- from utils.utils_mongo_handle import MongoHandle
- from utils.utils_logs_handle import LogsHandle
- from base.base_load_config import load_config
- config_json = load_config()  # project configuration, loaded once when this module is imported
- DEFAULT_RE_PUSH_TIMES = config_json['DEFAULT_RE_PUSH_TIMES']  # value written to the "repush_times" field of every scraped record
class BiShiJie(object):
    """Scraper for the article list of 币世界 (https://www.528btc.com).

    Pages of the site's AJAX list endpoint are fetched with ``httpx``, the
    article cards are extracted from the returned HTML with regular
    expressions, and every article whose title is not yet present in the
    ``NEWS`` / ``币世界-文章`` MongoDB collection is inserted there.

    Attributes:
        base_url: Site root; prepended to relative article links.
        url: AJAX endpoint that returns one page of the article list as HTML.
        send_email_datas: Records newly inserted by ``save_to_mongo`` during
            this run (presumably consumed by a mail step elsewhere - confirm).
        send_email_now: Initialised to 0; not read anywhere in this class.
        logs_handle: Project ``LogsHandle`` used to record database errors.
        now_day: Local date (``YYYY-MM-DD``) captured at construction time.
        headers: HTTP headers sent with every list request.
        mongo: Project ``MongoHandle`` bound to the target collection.
    """

    # Raw strings: "\S" / "\s" in a normal literal are invalid escape
    # sequences and trigger a warning on current Python versions.
    # One article card; the lazy group stops at the first "</div>" that is
    # followed by two further lines each containing a closing "</div>".
    _CARD_RE = re.compile(
        r'<div class="slices_item_content">([\S\s]*?)</div>\n.*?</div>\n.*?</div>'
    )
    # One single-group pattern per field extracted from a card.
    _FIELD_RES = {
        'title': re.compile(r'<div class="title overflow">(.*?)</div>'),
        'context': re.compile(r'<div class="introduce overflow">(.*?)</div>'),
        'href': re.compile(r'<a target="_blank" href="(.*?)">'),
        'article_type': re.compile(r'<span class="span">(.*?)</span>'),
        'posted_date': re.compile(r'<span class="time">(.*?)</span>'),
    }

    def __init__(self):
        """Set up endpoint URLs, request headers, logging and the Mongo handle."""
        self.base_url = 'https://www.528btc.com'
        self.url = self.base_url + "/e/extend/api/v2/AjaxPageList/"
        self.send_email_datas = []
        self.send_email_now = 0
        self.logs_handle = LogsHandle()
        self.now_day = time.strftime('%Y-%m-%d', time.localtime())
        self.headers = {
            "Accept": "text/html, */*; q=0.01",
            "Accept-Encoding": "gzip, deflate, br, zstd",
            "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
            "Origin": "https://www.528btc.com",
            "Referer": "https://www.528btc.com/kx/",
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:129.0) Gecko/20100101 Firefox/129.0",
            "X-Requested-With": "XMLHttpRequest",
        }
        db = 'NEWS'
        collection = '币世界-文章'
        self.mongo = MongoHandle(
            db=db,
            collection=collection,
            del_db=False,
            del_collection=False,
            auto_remove=0,
        )

    @classmethod
    def _parse_articles(cls, html):
        """Extract the raw text fields of every article card found in *html*.

        Args:
            html: HTML fragment returned by the list endpoint.

        Returns:
            A list with one dict per card, keyed by ``title``, ``context``,
            ``href``, ``article_type`` and ``posted_date``. A field that is
            missing from a card is returned as an empty string.
        """
        articles = []
        for card in cls._CARD_RE.findall(html):
            fields = {}
            for name, pattern in cls._FIELD_RES.items():
                found = pattern.search(card)
                fields[name] = found.group(1) if found else ''
            articles.append(fields)
        return articles

    @staticmethod
    def _build_link(base_url, href):
        """Return the article URL for *href*.

        An href that is already absolute is returned unchanged instead of
        being glued onto *base_url*; anything else (including an empty
        href) is appended to *base_url*.
        """
        if href.startswith(('http://', 'https://')):
            return href
        return base_url + href

    def req(self, max_pages=5):
        """Fetch the article list pages and return the parsed records.

        Args:
            max_pages: Number of list pages to request, starting at page 1.

        Returns:
            A list of dicts ready for insertion into MongoDB. A page whose
            request fails is reported on stdout and skipped, so the result
            may be empty.
        """
        all_data = []
        for page in range(1, max_pages + 1):
            form_data = {
                "pageIndex": f"{page}",
                "module": "newslist-v2",
                "classid": "114",
                "limitpage": "15"
            }
            try:
                response = httpx.post(self.url, headers=self.headers, data=form_data)
                # Turn 4xx / 5xx responses into HTTPStatusError.
                response.raise_for_status()
                html = response.text
            except HTTPStatusError as http_err:
                print(f"HTTP error occurred: {http_err}")
                continue
            except Exception as err:
                # Best effort: one failing page must not abort the whole run.
                print(f"An error occurred: {err}")
                continue

            for fields in self._parse_articles(html):
                all_data.append({
                    "title": fields['title'],
                    "context": fields['context'],
                    "source_url": '',
                    'link': self._build_link(self.base_url, fields['href']),
                    "article_type": fields['article_type'],
                    "article_source": '',
                    "img_url": '',
                    'keyword': fields['article_type'],
                    "posted_date": fields['posted_date'],
                    "create_time": int(time.time()),
                    "create_datetime": datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                    "repush_times": DEFAULT_RE_PUSH_TIMES
                })
        return all_data

    def save_to_mongo(self, data):
        """Insert every record of *data* whose title is not stored yet.

        Newly inserted records are also appended to ``send_email_datas``.

        Args:
            data: Iterable of record dicts as produced by ``req``.

        Returns:
            ``0`` if a ``TypeError`` interrupts the run (remaining records
            are then skipped), otherwise ``None``.
        """
        print('开始储存 币世界文章 数据')
        for data_to_insert in data:
            try:
                # The title is the de-duplication key; a missing title is
                # looked up as the empty string.
                filter_criteria = {'title': data_to_insert.get('title', '')}
                if self.mongo.collection.count_documents(filter_criteria) == 0:
                    # No stored document has this title yet: insert it.
                    self.mongo.collection.insert_one(data_to_insert)
                    self.send_email_datas.append(data_to_insert)
            except TypeError as te:
                print('\n%s' % te)
                self.logs_handle.logs_write('币世界-文章', '写入数据库报错: %s' % te, 'error', False)
                return 0
        print('储存数据完成', datetime.now().strftime('%Y-%m-%d %H:%M:%S'))

    def main(self):
        """Scrape the article list and store new articles; exit if none were found."""
        all_data = self.req()
        if not all_data:
            print('数据为空')
            # sys.exit rather than the site-provided exit(), which is not
            # guaranteed to exist in every interpreter setup.
            sys.exit(0)
        self.save_to_mongo(all_data)
# Script entry point: run one scrape-and-store cycle.
if __name__ == '__main__':
    scraper = BiShiJie()
    scraper.main()
|