# news_get_apprcn.py (5.3 KB)
# -*- coding: utf-8 -*-
'''
Apprcn ("反斗限免") scraper.
1. Fetch the Apprcn limited-time-free listing data
2. Store it in MongoDB
3. Send it to the configured email address
'''
  8. import re
  9. import time
  10. from datetime import datetime
  11. import httpx
  12. import sys
  13. import os
  14. sys.path.append(os.path.join(os.path.abspath(__file__).split('auto')[0] + 'auto'))
  15. from utils.utils_mongo_handle import MongoHandle
  16. from utils.utils_logs_handle import LogsHandle
  17. from utils.utils_send_email import SendEmail
  18. from base.base_load_config import load_config
  19. config_json = load_config()
  20. DEFAULT_RE_PUSH_TIMES = config_json['DEFAULT_RE_PUSH_TIMES']
  21. class APPRCN(object):
  22. def __init__(self):
  23. self.logs_handle = LogsHandle()
  24. self.now_day = time.strftime('%Y-%m-%d', time.localtime())
  25. self.base_url = 'https://free.apprcn.com/page/{}/'
  26. self.headers = {
  27. 'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; Media Center PC 6.0; InfoPath.2; MS-RTC LM 8'
  28. }
  29. db = 'NEWS'
  30. collection = 'apprcn_info'
  31. self.mongo = MongoHandle(db=db, collection=collection, del_db=False, del_collection=False, auto_remove=0)
  32. self.send_email_datas = []
  33. self.send_email_now = 0
  34. def main(self):
  35. self.logs_handle.logs_write('apprcn', '开始获取反斗限免数据', 'start', False)
  36. response_data = self.req()
  37. if response_data:
  38. self.save_to_mongo(response_data)
  39. if self.send_email_now:
  40. self.send_to_email()
  41. self.logs_handle.logs_write('apprcn', '反斗限免数据获取完成', 'done', False)
  42. print('done')
  43. else:
  44. self.logs_handle.logs_write('apprcn', '无法获取apprcn数据', 'error', False)
  45. def req(self):
  46. urls = ['https://free.apprcn.com/']
  47. for i in range(2, 10):
  48. urls.append(self.base_url.format(i))
  49. response_data = []
  50. for i in urls:
  51. response = httpx.get(url=i, headers=self.headers)
  52. if response.status_code != 200:
  53. self.logs_handle.logs_write('apprcn', '请求失败, 状态码: %s' % response.status_code, 'error', False)
  54. exit(0)
  55. response.encoding = 'utf-8'
  56. content_list = re.findall('<div class="content">([\S\s]*?)<div class="sidebar">', response.text)
  57. # 清理content数据
  58. content = ''
  59. if content_list:
  60. for i in ['\t', '\n']:
  61. content = content_list[0].replace(i, '')
  62. context_list = re.findall('<p class="note">(.*?)</p>', content)
  63. title_list = re.findall('title="(.*?)"', content)
  64. post_date_list = re.findall('<time>(.*?)</time>', content)
  65. source_data_list = re.findall('<a class="cat" href="(.*?)"', content)
  66. for title, context, post_date, source_data in zip(title_list, context_list, post_date_list,
  67. source_data_list):
  68. response_data.append({
  69. "title": title,
  70. "context": context,
  71. "source_url": source_data,
  72. 'link': '',
  73. "article_type": '',
  74. "article_source": '',
  75. "img_url": '',
  76. 'keyword': '',
  77. "posted_date": post_date,
  78. "create_time": int(time.time()),
  79. "create_datetime": datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
  80. "repush_times": DEFAULT_RE_PUSH_TIMES
  81. })
  82. if response_data:
  83. return response_data
  84. else:
  85. self.logs_handle.logs_write('apprcn', '获取数据失败', 'error', False)
  86. def save_to_mongo(self, data):
  87. print('开始储存 反斗限免 数据')
  88. for data_to_insert in data:
  89. try:
  90. # 检查数据库中是否存在匹配的文档
  91. filter_criteria = {'title': data_to_insert.get('title', '')} # 确保 title 字段有值
  92. count = self.mongo.collection.count_documents(filter_criteria)
  93. if count == 0:
  94. # 如果没有找到匹配的文档,插入新文档
  95. result = self.mongo.collection.insert_one(data_to_insert)
  96. self.send_email_datas.append(data_to_insert)
  97. except TypeError as te:
  98. print('\n%s' % te)
  99. self.logs_handle.logs_write('反斗限免', '写入数据库报错: %s' % te, 'error', False)
  100. return 0
  101. print('储存数据完成', datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
  102. def send_to_email(self):
  103. if self.send_email_datas:
  104. text = ''
  105. for data in self.send_email_datas:
  106. text += '标题: %s\n内容: %s\n时间: %s\n链接: %s\n\n' % (
  107. data['title'], data['context'], data['posted_date'], data['source_url'])
  108. send_email = SendEmail(subject='反斗限免', title='反斗限免', text=text)
  109. send_email.send()
  110. self.logs_handle.logs_write('apprcn', '发送邮件完成', 'done', False)
  111. else:
  112. self.logs_handle.logs_write('apprcn', '没有新数据, 不发送邮件', 'done', False)
  113. if __name__ == "__main__":
  114. APPRCN().main()