base_news_data_collation.py 9.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238
  1. '''
  2. 每日从 mongo 数据库, 做新闻汇总,发送到邮箱
  3. '''
  4. import os
  5. import sys
  6. from utils.utils import LoadConfig
  7. sys.path.append(os.path.join(os.path.abspath(__file__).split('AutoInfo')[0] + 'AutoInfo'))
  8. from pymongo import MongoClient
  9. from datetime import datetime, timedelta
  10. import re
  11. import smtplib
  12. from email.mime.text import MIMEText
  13. from email.header import Header
  14. from utils.utils import *
  15. config_json = LoadConfig().load_config()
  16. base_project = LoadConfig().get_base_path()
  17. PROJECT_NAME = config_json.get('PROJECT_NAME')
  18. DB_USER = config_json.get('DB_USER')
  19. DB_PASSWORD = config_json.get('DB_PASSWORD')
  20. DB_IP = config_json.get('DB_IP')
  21. DB_PORT = config_json.get('DB_PORT')
  22. MAIL_HOST = config_json.get('MAIL_HOST')
  23. MAIL_USER = config_json.get('MAIL_USER')
  24. MAIL_PASS = config_json.get('MAIL_PASS')
  25. MAIL_SENDER = config_json.get('MAIL_SENDER')
  26. MAIL_RECEIVERS = config_json.get('MAIL_RECEIVERS')
  27. DB_NAME = config_json.get('DB_NAME') # 确保配置文件中有这个键
  28. MONGO_LINK = f'mongodb://{DB_USER}:{DB_PASSWORD}@{DB_IP}:{DB_PORT}/'.format(**config_json)
  29. now_day = datetime.now().strftime('%Y-%m-%d') # 获取今天的日期
  30. filter_days = config_json.get('FILTER_DAYS')
  31. filter_keys = config_json.get('FILTER_KEYS')
  32. filter_switch = True
  33. class NewsDataCollation(object):
  34. def __init__(self):
  35. # 第三方 SMTP 服务
  36. self.mail_host = MAIL_HOST # 设置服务器
  37. self.mail_user = MAIL_USER # 用户名
  38. self.mail_pass = MAIL_PASS # 口令
  39. self.sender = MAIL_SENDER
  40. self.receivers = [MAIL_RECEIVERS]
  41. self.processed_data = []
  42. def load_data(self):
  43. processed_data = []
  44. # 读取数据
  45. print('程序正在读取数据')
  46. client = MongoClient(MONGO_LINK)
  47. db = client['NEWS']
  48. # 根据 self.days 获取日期范围
  49. start_date = (datetime.now() - timedelta(days=filter_days - 1)).strftime('%Y-%m-%d')
  50. end_date = datetime.now().strftime('%Y-%m-%d')
  51. # 构造查询条件,匹配日期范围内的日期
  52. query = {
  53. "create_datetime": {
  54. "$regex": f"^{start_date}|{end_date}",
  55. "$options": "i" # 使用不区分大小写的匹配
  56. }
  57. }
  58. # 遍历数据库中的所有集合
  59. for collection_name in db.list_collection_names():
  60. print(collection_name)
  61. collection = db[collection_name]
  62. cursor = collection.find(query)
  63. for document in cursor:
  64. if not document.get('title'):
  65. continue
  66. # 检查 'repush_times' 字段是否存在,如果不存在则默认为 5
  67. repush_times = document.get('repush_times', 5)
  68. # 减少 repush_times 的值
  69. new_repush_times = repush_times - 1
  70. # 更新数据库中的 repush_times 字段
  71. collection.update_one(
  72. {"_id": document['_id']}, # 假设文档中有 _id 字段作为唯一标识
  73. {"$set": {"repush_times": new_repush_times}}
  74. )
  75. data = self.process_data(document)
  76. if data:
  77. processed_data.append(data)
  78. # 关闭MongoDB连接
  79. client.close()
  80. return processed_data
  81. def process_data(self, document):
  82. # 处理数据
  83. data = {
  84. "title": document.get('title') or '',
  85. "context": document.get('context') or '',
  86. "source_url": document.get('source_url') or '',
  87. 'link': document.get('link') or '',
  88. "article_type": document.get('article_type') or '',
  89. "article_source": document.get('article_source') or '',
  90. "img_url": document.get('img_url') or '',
  91. 'keyword': document.get('keyword') or '',
  92. "posted_date": document.get('posted_date') or '',
  93. "create_time": document.get('create_time') or '',
  94. "create_datetime": document.get('create_datetime') or '',
  95. "repush_times": document.get('repush_times', 5) - 1
  96. }
  97. data['title'] = self.clean_string(data['title'], 'title')
  98. data['context'] = self.clean_string(data['context'], 'context')
  99. return data
  100. def clean_string(self, input_string, text_type):
  101. # 清除 title 和 context 中的换行符和制表符
  102. if not isinstance(input_string, str):
  103. return ''
  104. # 清除所有空白字符(包括空格、制表符、换行符等)
  105. cleaned_string = re.sub(r'\s+', '', input_string)
  106. if len(cleaned_string) > 100:
  107. cleaned_string = cleaned_string[:100] + '...'
  108. if text_type == 'context':
  109. pass
  110. return cleaned_string
  111. def send_email(self, processed_data):
  112. # 发送邮件
  113. print('准备发送邮件')
  114. subject = '新闻汇总sub'
  115. title = '新闻汇总title'
  116. text = '********************************************************\n'
  117. for data in processed_data:
  118. text += '标题: {}\n'.format(data['title'])
  119. text += '正文: {}\n'.format(data['context'])
  120. text += '文章地址: {}\n'.format(data['link'])
  121. text += '类型: {}\n'.format(data['article_type'])
  122. text += '板块: {}\n'.format(data['article_source'])
  123. text += '文章时间: {}\n'.format(data['posted_date'])
  124. text += '获取时间: {}\n'.format(data['create_datetime'])
  125. text += '********************************************************\n\n'
  126. message = MIMEText(text, 'plain', 'utf-8')
  127. message['From'] = Header(title, 'utf-8')
  128. message['To'] = Header("auto", 'utf-8')
  129. message['Subject'] = Header(subject, 'utf-8')
  130. try:
  131. smtpObj = smtplib.SMTP_SSL(self.mail_host)
  132. smtpObj.login(self.mail_user, self.mail_pass)
  133. smtpObj.sendmail(self.sender, self.receivers, message.as_string())
  134. print("邮件发送成功")
  135. except smtplib.SMTPException as e:
  136. print("Error: 无法发送邮件", e)
  137. def send_email_with_keyword(self, series, keys, processed_data):
  138. process_send_data = {}
  139. keys = keys.split('|')
  140. have_data_keys = []
  141. for key in keys:
  142. # print(f'通过关键字: {key} 过滤') # 用来调试 key 是否正确
  143. for data in processed_data:
  144. if key in data['title'] or key in data['context']:
  145. # 如果数据里面无 keyword, 用当前 key 替换一下
  146. if not data.get('keyword'):
  147. data['keyword'] = key
  148. if series not in process_send_data:
  149. process_send_data[series] = [data]
  150. else:
  151. process_send_data[series].append(data)
  152. # 储存一下有数据的 key, 输出用
  153. have_data_keys.append(key)
  154. if process_send_data:
  155. print('{}系列, 以下关键字有数据\n{}'.format(series, list(set(have_data_keys))))
  156. # 发送邮件
  157. print('程序正在准备发送邮件的数据')
  158. for key in process_send_data:
  159. subject = '新闻汇总sub - {}'.format(series)
  160. title = '新闻汇总title - {}'.format(series)
  161. text = '********************************************************\n'
  162. for data in process_send_data[key]:
  163. text += '标题: {}\n'.format(data['title'])
  164. text += '正文: {}\n'.format(data['context'])
  165. text += '文章地址: {}\n'.format(data['link'])
  166. text += '类型: {}\n'.format(data['article_type'])
  167. text += '板块: {}\n'.format(data['article_source'])
  168. text += '关键词: {}\n'.format(key)
  169. text += '文章时间: {}\n'.format(data['posted_date'])
  170. text += '获取时间: {}\n'.format(data['create_datetime'])
  171. text += '********************************************************\n\n'
  172. message = MIMEText(text, 'plain', 'utf-8')
  173. message['From'] = Header(title, 'utf-8')
  174. message['To'] = Header("auto", 'utf-8')
  175. message['Subject'] = Header(subject, 'utf-8')
  176. try:
  177. smtpObj = smtplib.SMTP_SSL(self.mail_host)
  178. smtpObj.login(self.mail_user, self.mail_pass)
  179. smtpObj.sendmail(self.sender, self.receivers, message.as_string())
  180. print("关键字: {} 的邮件发送成功".format(series))
  181. except smtplib.SMTPException as e:
  182. print("Error: 无法发送邮件", e)
  183. def main(self):
  184. # 加载指定天数的所有数据
  185. processed_data = self.load_data()
  186. # 如果无数据, 则退出
  187. if not processed_data:
  188. print("没有找到任何数据")
  189. exit(0)
  190. # 发送一次所有数据的邮件
  191. # self.send_email(processed_data)
  192. # # 这里是通过关键词过滤然后再发送邮件
  193. if filter_switch and filter_keys:
  194. for series, keys in filter_keys.items():
  195. self.send_email_with_keyword(series, keys, processed_data)
  196. if __name__ == '__main__':
  197. NewsDataCollation().main()