# coin_world.py (4.9 KB)

  1. # -*- coding: utf-8 -*-
'''
Coin World (币世界) - article section.
'''
  5. import os
  6. import sys
  7. from httpx import HTTPStatusError
  8. import re
  9. sys.path.append(os.path.join(os.path.abspath(__file__).split('AutoInfo')[0] + 'AutoInfo'))
  10. from utils.utils import *
  11. config_json = LoadConfig().load_config()
  12. DEFAULT_RE_PUSH_TIMES = config_json['DEFAULT_RE_PUSH_TIMES']
  13. class BiShiJie(object):
  14. def __init__(self):
  15. self.base_url = 'https://www.528btc.com'
  16. self.url = self.base_url + "/e/extend/api/v2/AjaxPageList/"
  17. self.send_email_datas = []
  18. self.send_email_now = 0
  19. self.logs_handle = LogsHandle()
  20. self.now_day = time.strftime('%Y-%m-%d', time.localtime())
  21. self.headers = {
  22. "Accept": "text/html, */*; q=0.01",
  23. "Accept-Encoding": "gzip, deflate, br, zstd",
  24. "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
  25. "Origin": "https://www.528btc.com",
  26. "Referer": "https://www.528btc.com/kx/",
  27. "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:129.0) Gecko/20100101 Firefox/129.0",
  28. "X-Requested-With": "XMLHttpRequest",
  29. }
  30. db = 'NEWS'
  31. collection = '币世界-文章'
  32. self.mongo = MongoHandle(db=db, collection=collection, del_db=False, del_collection=False, auto_remove=0)
  33. def req(self):
  34. max_page_num = 1 + 5
  35. all_data = []
  36. for page in range(1, max_page_num):
  37. form_data = {
  38. "pageIndex": f"{page}",
  39. "module": "newslist-v2",
  40. "classid": "114",
  41. "limitpage": "15"
  42. }
  43. try:
  44. response = httpx.post(self.url, headers=self.headers, data=form_data)
  45. # 检查响应状态码
  46. response.raise_for_status()
  47. html = response.text
  48. div_list = re.findall('<div class="slices_item_content">([\S\s]*?)</div>\n.*?</div>\n.*?</div>', html)
  49. for div in div_list:
  50. title_list = re.findall('<div class="title overflow">(.*?)</div>', div)
  51. title = title_list[0] if len(title_list) > 0 else ''
  52. context_list = re.findall('<div class="introduce overflow">(.*?)</div>', div)
  53. context = context_list[0] if len(context_list) > 0 else ''
  54. source_url_list = re.findall('<a target="_blank" href="(.*?)">', div)
  55. source_url = source_url_list[0] if len(source_url_list) > 0 else ''
  56. article_type_list = re.findall('<span class="span">(.*?)</span>', div)
  57. article_type = article_type_list[0] if len(article_type_list) > 0 else ''
  58. posted_date_list = re.findall('<span class="time">(.*?)</span>', div)
  59. posted_date = posted_date_list[0] if len(posted_date_list) > 0 else ''
  60. all_data.append({
  61. "title": title,
  62. "context": context,
  63. "source_url": '',
  64. 'link': self.base_url + source_url,
  65. "article_type": article_type,
  66. "article_source": '',
  67. "img_url": '',
  68. 'keyword': article_type,
  69. "posted_date": posted_date,
  70. "create_time": int(time.time()),
  71. "create_datetime": datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
  72. "repush_times": DEFAULT_RE_PUSH_TIMES
  73. })
  74. except HTTPStatusError as http_err:
  75. print(f"HTTP error occurred: {http_err}")
  76. except Exception as err:
  77. print(f"An error occurred: {err}")
  78. return all_data
  79. def save_to_mongo(self, data):
  80. print('开始储存 币世界文章 数据')
  81. for data_to_insert in data:
  82. try:
  83. # 检查数据库中是否存在匹配的文档
  84. filter_criteria = {'title': data_to_insert.get('title', '')} # 确保 title 字段有值
  85. count = self.mongo.collection.count_documents(filter_criteria)
  86. if count == 0:
  87. # 如果没有找到匹配的文档,插入新文档
  88. result = self.mongo.collection.insert_one(data_to_insert)
  89. self.send_email_datas.append(data_to_insert)
  90. except TypeError as te:
  91. print('\n%s' % te)
  92. self.logs_handle.logs_write('币世界-文章', '写入数据库报错: %s' % te, 'error', False)
  93. return 0
  94. print('储存数据完成', datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
  95. def main(self):
  96. all_data = self.req()
  97. if not all_data:
  98. print('数据为空')
  99. exit(0)
  100. self.save_to_mongo(all_data)
  101. if __name__ == '__main__':
  102. BiShiJie().main()