news_get_36kr_info.py 5.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163
  1. # -*- coding: utf-8 -*-
  2. '''
  3. 获取36kr讯息数据, 通过rsshub获取数据, 可能需要使用代理
  4. https://www.36kr.com/
  5. '''
  6. import datetime
  7. import json
  8. import random
  9. import re
  10. import xmltodict
  11. import time
  12. import httpx
  13. from tools_mongo_handle import MongoHandle
  14. from tools_logs_handle import LogsHandle
  15. from tools_send_email import SendEmail
  16. class Get36krInfo:
  17. def __init__(self):
  18. self.base_url = 'https://rsshub.app/36kr/'
  19. self.local_key = [
  20. 'news',
  21. 'newsflashes',
  22. 'recommend',
  23. 'life',
  24. 'estate',
  25. 'workplace'
  26. ]
  27. self.logs_handle = LogsHandle()
  28. self.now_day = time.strftime('%Y-%m-%d', time.localtime())
  29. self.db = 'NEWS'
  30. self.collection = '36kr_info'
  31. self.send_email_datas = {}
  32. self.send_email_now = 0
  33. def req(self):
  34. result_data = []
  35. local_key = self.local_key[:]
  36. random.shuffle(local_key)
  37. for key in local_key:
  38. url = self.base_url + key
  39. try:
  40. response = httpx.get(url)
  41. except TimeoutError as timeout_error:
  42. print(timeout_error)
  43. continue
  44. if response.status_code != 200:
  45. self.logs_handle.logs_write('36kr_info', '请求失败, 状态码: %s' % response.status_code, 'error', False)
  46. time.sleep(20)
  47. continue
  48. response.encoding = 'utf-8'
  49. html = response.text
  50. xml_dict = xmltodict.parse(html)
  51. source = ''
  52. items = []
  53. try:
  54. source = xml_dict['rss']['channel']['title']
  55. except Exception as e:
  56. print('获取 source 失败')
  57. try:
  58. items = xml_dict['rss']['channel']['item']
  59. except Exception as e:
  60. print('获取 items 失败')
  61. for item in items:
  62. # 清洗
  63. if item.get('description'):
  64. item['description'] = re.sub(r'<[^>]+>', '', item.get('description'))
  65. result_data.append({
  66. "title": item.get('title') or '',
  67. "context": item.get('description') or '',
  68. "source_url": url,
  69. 'link': item.get('link') or '',
  70. "article_type": source,
  71. "article_source": key,
  72. "img_url": '',
  73. 'keyword': '',
  74. "posted_date": item.get('pubDate') or '',
  75. "create_time": int(time.time()),
  76. "create_datetime": datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
  77. })
  78. if len(local_key) != 1:
  79. sleep_time = random.uniform(10, 15)
  80. time.sleep(sleep_time)
  81. return result_data
  82. def save_to_mongo(self, result_data):
  83. new_datas = []
  84. print(f'正在处理 {self.collection}数据')
  85. mongo = MongoHandle(db=self.db, collection=self.collection, del_db=False, del_collection=False, auto_remove=0)
  86. for data_to_insert in result_data:
  87. try:
  88. # 检查数据库中是否存在匹配的文档
  89. filter_criteria = {'title': data_to_insert.get('title', '')} # 确保 title 字段有值
  90. count = mongo.collection.count_documents(filter_criteria)
  91. if count == 0:
  92. # 如果没有找到匹配的文档,插入新文档
  93. result = mongo.collection.insert_one(data_to_insert)
  94. # 准备发送邮件的数据
  95. new_datas.append(data_to_insert)
  96. except TypeError as te:
  97. print('\n%s' % te)
  98. self.logs_handle.logs_write(f'{self.collection}', '写入数据库报错: %s' % te, 'error', False)
  99. return 0
  100. print(f'处理 {self.collection} 数据完成')
  101. return new_datas
  102. def send_to_email(self, new_datas):
  103. title = self.collection
  104. subject = self.collection
  105. text = '********************************************************\n'
  106. for data in new_datas:
  107. text += '标题: {}\n'.format(data['title'])
  108. text += '正文: {}\n'.format(data['context'])
  109. text += '文章地址: {}\n'.format(data['link'])
  110. text += '文章时间: {}\n'.format(data['posted_date'])
  111. text += '获取时间: {}\n'.format(data['create_datetime'])
  112. text += '********************************************************\n\n'
  113. send_email = SendEmail(subject=subject, title=title, text=text)
  114. send_email.send()
  115. self.logs_handle.logs_write(f'self.collection', f'{title}-发送邮件完成', 'done', False)
  116. def main(self):
  117. self.logs_handle.logs_write('36kr - info', '任务开始', 'start', False)
  118. result_data = self.req()
  119. if result_data:
  120. new_datas = self.save_to_mongo(result_data)
  121. if self.send_email_now:
  122. if new_datas:
  123. self.send_to_email(new_datas)
  124. else:
  125. print('无新数据')
  126. self.logs_handle.logs_write('36kr - info', '36kr - info 数据获取完成', 'done', False)
  127. print('done')
  128. else:
  129. self.logs_handle.logs_write('36kr - info', '无法获取 36kr - info 数据', 'error', False)
  130. # if __name__ == '__main__':
  131. # g = Get36krInfo()
  132. # g.main()