coin_world.py 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133
  1. # -*- coding: utf-8 -*-
  2. '''
  3. 币世界 文章板块
  4. '''
  5. import httpx
  6. import os
  7. import sys
  8. from httpx import HTTPStatusError
  9. import re
  10. import time
  11. from datetime import datetime
  12. sys.path.append(os.path.join(os.path.abspath(__file__).split('AutoInfo')[0] + 'AutoInfo'))
  13. from utils.utils_mongo_handle import MongoHandle
  14. from utils.utils_logs_handle import LogsHandle
  15. from base.base_load_config import load_config
  16. config_json = load_config()
  17. DEFAULT_RE_PUSH_TIMES = config_json['DEFAULT_RE_PUSH_TIMES']
  18. class BiShiJie(object):
  19. def __init__(self):
  20. self.base_url = 'https://www.528btc.com'
  21. self.url = self.base_url + "/e/extend/api/v2/AjaxPageList/"
  22. self.send_email_datas = []
  23. self.send_email_now = 0
  24. self.logs_handle = LogsHandle()
  25. self.now_day = time.strftime('%Y-%m-%d', time.localtime())
  26. self.headers = {
  27. "Accept": "text/html, */*; q=0.01",
  28. "Accept-Encoding": "gzip, deflate, br, zstd",
  29. "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
  30. "Origin": "https://www.528btc.com",
  31. "Referer": "https://www.528btc.com/kx/",
  32. "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:129.0) Gecko/20100101 Firefox/129.0",
  33. "X-Requested-With": "XMLHttpRequest",
  34. }
  35. db = 'NEWS'
  36. collection = '币世界-文章'
  37. self.mongo = MongoHandle(db=db, collection=collection, del_db=False, del_collection=False, auto_remove=0)
  38. def req(self):
  39. max_page_num = 1 + 5
  40. all_data = []
  41. for page in range(1, max_page_num):
  42. form_data = {
  43. "pageIndex": f"{page}",
  44. "module": "newslist-v2",
  45. "classid": "114",
  46. "limitpage": "15"
  47. }
  48. try:
  49. response = httpx.post(self.url, headers=self.headers, data=form_data)
  50. # 检查响应状态码
  51. response.raise_for_status()
  52. html = response.text
  53. div_list = re.findall('<div class="slices_item_content">([\S\s]*?)</div>\n.*?</div>\n.*?</div>', html)
  54. for div in div_list:
  55. title_list = re.findall('<div class="title overflow">(.*?)</div>', div)
  56. title = title_list[0] if len(title_list) > 0 else ''
  57. context_list = re.findall('<div class="introduce overflow">(.*?)</div>', div)
  58. context = context_list[0] if len(context_list) > 0 else ''
  59. source_url_list = re.findall('<a target="_blank" href="(.*?)">', div)
  60. source_url = source_url_list[0] if len(source_url_list) > 0 else ''
  61. article_type_list = re.findall('<span class="span">(.*?)</span>', div)
  62. article_type = article_type_list[0] if len(article_type_list) > 0 else ''
  63. posted_date_list = re.findall('<span class="time">(.*?)</span>', div)
  64. posted_date = posted_date_list[0] if len(posted_date_list) > 0 else ''
  65. all_data.append({
  66. "title": title,
  67. "context": context,
  68. "source_url": '',
  69. 'link': self.base_url + source_url,
  70. "article_type": article_type,
  71. "article_source": '',
  72. "img_url": '',
  73. 'keyword': article_type,
  74. "posted_date": posted_date,
  75. "create_time": int(time.time()),
  76. "create_datetime": datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
  77. "repush_times": DEFAULT_RE_PUSH_TIMES
  78. })
  79. except HTTPStatusError as http_err:
  80. print(f"HTTP error occurred: {http_err}")
  81. except Exception as err:
  82. print(f"An error occurred: {err}")
  83. return all_data
  84. def save_to_mongo(self, data):
  85. print('开始储存 币世界文章 数据')
  86. for data_to_insert in data:
  87. try:
  88. # 检查数据库中是否存在匹配的文档
  89. filter_criteria = {'title': data_to_insert.get('title', '')} # 确保 title 字段有值
  90. count = self.mongo.collection.count_documents(filter_criteria)
  91. if count == 0:
  92. # 如果没有找到匹配的文档,插入新文档
  93. result = self.mongo.collection.insert_one(data_to_insert)
  94. self.send_email_datas.append(data_to_insert)
  95. except TypeError as te:
  96. print('\n%s' % te)
  97. self.logs_handle.logs_write('币世界-文章', '写入数据库报错: %s' % te, 'error', False)
  98. return 0
  99. print('储存数据完成', datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
  100. def main(self):
  101. all_data = self.req()
  102. if not all_data:
  103. print('数据为空')
  104. exit(0)
  105. self.save_to_mongo(all_data)
  106. if __name__ == '__main__':
  107. BiShiJie().main()