# utils_news_data_collation.py (5.7 KB)
  1. from pymongo import MongoClient
  2. from datetime import datetime, timedelta
  3. import time
  4. import re
  5. import smtplib
  6. from email.mime.text import MIMEText
  7. from email.header import Header
  8. # 假设 tools_load_config 模块和相关函数已经正确实现
  9. import tools_load_config
  10. config_json = tools_load_config.load_config()
  11. PROJECT_NAME = config_json.get('PROJECT_NAME')
  12. DB_USER = config_json.get('DB_USER')
  13. DB_PASSWORD = config_json.get('DB_PASSWORD')
  14. DB_IP = config_json.get('DB_IP')
  15. DB_PORT = config_json.get('DB_PORT')
  16. MAIL_HOST = config_json.get('MAIL_HOST')
  17. MAIL_USER = config_json.get('MAIL_USER')
  18. MAIL_PASS = config_json.get('MAIL_PASS')
  19. MAIL_SENDER = config_json.get('MAIL_SENDER')
  20. MAIL_RECEIVERS = config_json.get('MAIL_RECEIVERS')
  21. DB_NAME = config_json.get('DB_NAME') # 确保配置文件中有这个键
  22. MONGO_LINK = f'mongodb://{DB_USER}:{DB_PASSWORD}@{DB_IP}:{DB_PORT}/'.format(**config_json)
  23. now_day = datetime.now().strftime('%Y-%m-%d') # 获取今天的日期
  24. class NewsDataCollation(object):
  25. def __init__(self):
  26. # 第三方 SMTP 服务
  27. self.mail_host = MAIL_HOST # 设置服务器
  28. self.mail_user = MAIL_USER # 用户名
  29. self.mail_pass = MAIL_PASS # 口令
  30. self.sender = MAIL_SENDER
  31. self.receivers = [MAIL_RECEIVERS]
  32. self.processed_data = []
  33. self.filter = 1
  34. self.filter_key = []
  35. self.days = 1
  36. def load_data(self):
  37. client = MongoClient(MONGO_LINK)
  38. db = client['NEWS']
  39. # 根据 self.days 获取日期范围
  40. start_date = (datetime.now() - timedelta(days=self.days - 1)).strftime('%Y-%m-%d')
  41. end_date = datetime.now().strftime('%Y-%m-%d')
  42. # 构造查询条件,匹配日期范围内的日期
  43. query = {
  44. "create_datetime": {
  45. "$regex": f"^{start_date}|{end_date}",
  46. "$options": "i" # 使用不区分大小写的匹配
  47. }
  48. }
  49. # 遍历数据库中的所有集合
  50. for collection_name in db.list_collection_names():
  51. collection = db[collection_name]
  52. cursor = collection.find(query)
  53. for document in cursor:
  54. if not document.get('title'):
  55. continue
  56. data = self.process_data(document)
  57. if data:
  58. self.processed_data.append(data)
  59. # 关闭MongoDB连接
  60. client.close()
  61. def process_data(self, document):
  62. data = {
  63. "title": document.get('title') or '',
  64. "context": document.get('context') or '',
  65. "source_url": document.get('source_url') or '',
  66. 'link': document.get('link') or '',
  67. "article_type": document.get('article_type') or '',
  68. "article_source": document.get('article_source') or '',
  69. "img_url": document.get('img_url') or '',
  70. 'keyword': document.get('keyword') or '',
  71. "posted_date": document.get('posted_date') or '',
  72. "create_time": document.get('create_time') or '',
  73. "create_datetime": document.get('create_datetime') or ''
  74. }
  75. # 过滤打开, 先过滤, 然后清理字符串
  76. if self.filter and self.filter_key:
  77. for key in self.filter_key:
  78. if key in data['title'] or key in data['context']:
  79. data['title'] = self.clean_string(data['title'])
  80. data['context'] = self.clean_string(data['context'])
  81. else:
  82. return None
  83. else:
  84. # 过滤关闭, 直接清理字符串
  85. data['title'] = self.clean_string(data['title'])
  86. data['context'] = self.clean_string(data['context'])
  87. return data
  88. def clean_string(self, input_string):
  89. if not isinstance(input_string, str):
  90. return ''
  91. # 清除换行符\n
  92. cleaned_string = re.sub(r'\n', '', input_string)
  93. # 清除制表符\t
  94. cleaned_string = re.sub(r'\t', '', cleaned_string)
  95. # 清除所有空白字符(包括空格、制表符、换行符等)
  96. cleaned_string = re.sub(r'\s+', '', cleaned_string)
  97. return cleaned_string
  98. def send_email(self):
  99. subject = '新闻汇总sub'
  100. title = '新闻汇总title'
  101. text = '********************************************************\n'
  102. for data in self.processed_data:
  103. text += '标题: {}\n'.format(data['title'])
  104. text += '正文: {}\n'.format(data['context'])
  105. text += '文章地址: {}\n'.format(data['link'])
  106. text += '类型: {}\n'.format(data['article_type'])
  107. text += '板块: {}\n'.format(data['article_source'])
  108. text += '文章时间: {}\n'.format(data['posted_date'])
  109. text += '获取时间: {}\n'.format(data['create_datetime'])
  110. text += '********************************************************\n\n'
  111. message = MIMEText(text, 'plain', 'utf-8')
  112. message['From'] = Header(title, 'utf-8')
  113. message['To'] = Header("auto collection", 'utf-8')
  114. subject = subject
  115. message['Subject'] = Header(subject, 'utf-8')
  116. try:
  117. smtp_obj = smtplib.SMTP()
  118. smtp_obj.connect(self.mail_host, 25)
  119. smtp_obj.login(self.mail_user, self.mail_pass)
  120. smtp_obj.sendmail(self.sender, self.receivers, message.as_string())
  121. print("邮件发送成功")
  122. except smtplib.SMTPException:
  123. print("Error: 无法发送邮件")
  124. def main(self):
  125. # 加载数据
  126. self.load_data()
  127. if not self.processed_data:
  128. print("没有找到任何数据")
  129. return
  130. self.send_email()
  131. if __name__ == '__main__':
  132. NewsDataCollation().main()