# news_get_news.py (6.4 KB)
  1. # -*- coding: utf-8 -*-
  2. import time
  3. import httpx
  4. from datetime import datetime
  5. import os
  6. import sys
  7. sys.path.append(os.path.join(os.path.abspath(__file__).split('auto')[0] + 'auto'))
  8. from utils.utils_mongo_handle import MongoHandle
  9. from utils.utils_logs_handle import LogsHandle
  10. from utils.utils_send_email import SendEmail
  11. from utils.utils_load_config import load_config
# Load the project configuration once, when this module is imported.
config_json = load_config()
# Value written to the "repush_times" field of every record built by HotNews.req().
DEFAULT_RE_PUSH_TIMES = config_json['DEFAULT_RE_PUSH_TIMES']
  14. class HotNews():
  15. def __init__(self):
  16. self.base_url = 'https://www.anyknew.com/go/'
  17. self.email_subject = '聚合新闻'
  18. self.email_title = 'Anyknew'
  19. self.email_text = '获取数据时间:\n{0}\n{1}\n\n\n\n'.format(datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
  20. ('-' * 90))
  21. self.logs_handle = LogsHandle()
  22. self.now_day = time.strftime('%Y-%m-%d', time.localtime())
  23. self.db = 'NEWS'
  24. self.collection = 'Anyknew_info'
  25. self.targets = {
  26. 'universal': 'https://www.anyknew.com/api/v1/cats/universal',
  27. 'finance': 'https://www.anyknew.com/api/v1/cats/aam',
  28. 'science': 'https://www.anyknew.com/api/v1/cats/st',
  29. 'life': 'https://www.anyknew.com/api/v1/cats/life',
  30. 'binary': 'https://www.anyknew.com/api/v1/cats/binary'
  31. }
  32. self.send_email_datas = []
  33. self.send_email_now = 0
  34. def main(self):
  35. self.logs_handle.logs_write('聚合新闻', '任务开始', 'start', False)
  36. resp_data = self.req()
  37. if resp_data:
  38. self.save_to_mongo(resp_data)
  39. if self.send_email_now:
  40. if self.send_email_datas:
  41. print('准备发送邮件')
  42. self.send_to_email()
  43. else:
  44. print('无新数据')
  45. else:
  46. self.logs_handle.logs_write('聚合新闻', '获取数据为空', 'error', False)
  47. return False
  48. self.logs_handle.logs_write('聚合新闻', '任务完成', 'done', False)
  49. def req(self):
  50. print('开始请求数据')
  51. result_data = []
  52. for target in self.targets:
  53. url = self.targets[target]
  54. try:
  55. resp = httpx.get(url=url)
  56. except Exception as e:
  57. print("请求出错{}, \nurl: {}".format(e, url))
  58. time.sleep(20)
  59. continue
  60. resp_json = resp.json()
  61. data = resp_json.setdefault('data')
  62. cat = data.setdefault('cat')
  63. sites = cat.setdefault('sites')
  64. for site in sites:
  65. site_name = site.setdefault('site')
  66. subs = site.setdefault('subs')
  67. target_and_site = '{}-{}'.format(target, site_name)
  68. for items in subs:
  69. for item in items:
  70. if item == 'items':
  71. detail = items['items']
  72. for d in detail:
  73. if target == 'universal':
  74. tag = 'Anyknew - 综合'
  75. elif target == 'finance':
  76. tag = 'Anyknew - 金融'
  77. elif target == 'science':
  78. tag = 'Anyknew - 科学'
  79. elif target == 'life':
  80. tag = 'Anyknew - 生活'
  81. elif target == 'binary':
  82. tag = 'Anyknew - 二进制'
  83. else:
  84. tag = 'Anyknew'
  85. result_data.append({
  86. "title": d.get('title') or '',
  87. "context": d.get('more') or '',
  88. "source_url": url,
  89. 'link': self.base_url + (str(d.get('iid')) or ''),
  90. "article_type": target_and_site,
  91. "article_source": tag,
  92. "img_url": '',
  93. 'keyword': '',
  94. "posted_date": d.get('add_date') or '',
  95. "create_time": int(time.time()),
  96. "create_datetime": datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
  97. "repush_times": DEFAULT_RE_PUSH_TIMES
  98. })
  99. print('已获取数据')
  100. return result_data
  101. def save_to_mongo(self, source_data):
  102. print(f'开始处理Anyknew数据')
  103. mongo = MongoHandle(db=self.db, collection=self.collection, del_db=False, del_collection=False, auto_remove=0)
  104. for data_to_insert in source_data:
  105. try:
  106. # 检查数据库中是否存在匹配的文档
  107. filter_criteria = {'title': data_to_insert.get('title', '')} # 确保 title 字段有值
  108. count = mongo.collection.count_documents(filter_criteria)
  109. if count == 0:
  110. # 如果没有找到匹配的文档,插入新文档
  111. result = mongo.collection.insert_one(data_to_insert)
  112. self.send_email_datas.append(data_to_insert)
  113. except TypeError as te:
  114. print('\n%s' % te)
  115. self.logs_handle.logs_write('聚合新闻', '写入数据库报错: %s' % te, 'error', False)
  116. return 0
  117. print(f'Anyknew数据处理')
  118. def send_to_email(self):
  119. text = '********************************************************\n'
  120. for data in self.send_email_datas:
  121. text += '标题: {}\n'.format(data['title'])
  122. text += '正文: {}\n'.format(data['context'])
  123. text += '文章地址: {}\n'.format(data['link'])
  124. text += '类型: {}\n'.format(data['article_type'])
  125. text += '板块: {}\n'.format(data['article_source'])
  126. text += '文章时间: {}\n'.format(data['posted_date'])
  127. text += '获取时间: {}\n'.format(data['create_datetime'])
  128. text += '********************************************************\n\n'
  129. send_email = SendEmail(subject='Anyknew', title='Anyknew_info', text=text)
  130. send_email.send()
  131. print('邮件已发送')
  132. if __name__ == '__main__':
  133. HotNews().main()