news_get_apprcn.py 4.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127
  1. # -*- coding: utf-8 -*-
  2. '''
  3. 反斗限免
  4. 1, 获取反斗限免数据
  5. 2, 储存到mongodb
  6. 3, 发送到指定邮件
  7. '''
  8. import re
  9. import time
  10. from datetime import datetime
  11. import httpx
  12. from tools_mongo_handle import MongoHandle
  13. from tools_logs_handle import LogsHandle
  14. from tools_send_email import SendEmail
  15. class APPRCN(object):
  16. def __init__(self):
  17. self.logs_handle = LogsHandle()
  18. self.now_day = time.strftime('%Y-%m-%d', time.localtime())
  19. self.base_url = 'https://free.apprcn.com/page/{}/'
  20. self.headers = {
  21. 'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; Media Center PC 6.0; InfoPath.2; MS-RTC LM 8'
  22. }
  23. db = 'NEWS'
  24. collection = 'apprcn_info'
  25. self.mongo = MongoHandle(db=db, collection=collection, del_db=False, del_collection=False, auto_remove=0)
  26. self.send_email_datas = []
  27. self.send_email_now = 0
  28. def main(self):
  29. self.logs_handle.logs_write('apprcn', '开始获取反斗限免数据', 'start', False)
  30. response_data = self.req()
  31. if response_data:
  32. self.save_to_mongo(response_data)
  33. if self.send_email_now:
  34. self.send_to_email()
  35. self.logs_handle.logs_write('apprcn', '反斗限免数据获取完成', 'done', False)
  36. print('done')
  37. else:
  38. self.logs_handle.logs_write('apprcn', '无法获取apprcn数据', 'error', False)
  39. def req(self):
  40. urls = ['https://free.apprcn.com/']
  41. for i in range(2, 10):
  42. urls.append(self.base_url.format(i))
  43. response_data = []
  44. for i in urls:
  45. response = httpx.get(url=i, headers=self.headers)
  46. if response.status_code != 200:
  47. self.logs_handle.logs_write('apprcn', '请求失败, 状态码: %s' % response.status_code, 'error', False)
  48. exit(0)
  49. response.encoding = 'utf-8'
  50. content_list = re.findall('<div class="content">([\S\s]*?)<div class="sidebar">', response.text)
  51. # 清理content数据
  52. content = ''
  53. if content_list:
  54. for i in ['\t', '\n']:
  55. content = content_list[0].replace(i, '')
  56. context_list = re.findall('<p class="note">(.*?)</p>', content)
  57. title_list = re.findall('title="(.*?)"', content)
  58. post_date_list = re.findall('<time>(.*?)</time>', content)
  59. source_data_list = re.findall('<a class="cat" href="(.*?)"', content)
  60. for title, context, post_date, source_data in zip(title_list, context_list, post_date_list,
  61. source_data_list):
  62. response_data.append({
  63. "title": title,
  64. "context": context,
  65. "source_url": source_data,
  66. 'link': '',
  67. "article_type": '',
  68. "article_source": '',
  69. "img_url": '',
  70. 'keyword': '',
  71. "posted_date": post_date,
  72. "create_time": int(time.time()),
  73. "create_datetime": datetime.now().strftime('%Y-%m-%d %H:%M:%S')
  74. })
  75. if response_data:
  76. return response_data
  77. else:
  78. self.logs_handle.logs_write('apprcn', '获取数据失败', 'error', False)
  79. def save_to_mongo(self, data):
  80. print('开始储存 反斗限免 数据')
  81. for data_to_insert in data:
  82. try:
  83. # 检查数据库中是否存在匹配的文档
  84. filter_criteria = {'title': data_to_insert.get('title', '')} # 确保 title 字段有值
  85. count = self.mongo.collection.count_documents(filter_criteria)
  86. if count == 0:
  87. # 如果没有找到匹配的文档,插入新文档
  88. result = self.mongo.collection.insert_one(data_to_insert)
  89. self.send_email_datas.append(data_to_insert)
  90. except TypeError as te:
  91. print('\n%s' % te)
  92. self.logs_handle.logs_write('反斗限免', '写入数据库报错: %s' % te, 'error', False)
  93. return 0
  94. print('储存数据完成')
  95. def send_to_email(self):
  96. if self.send_email_datas:
  97. text = ''
  98. for data in self.send_email_datas:
  99. text += '标题: %s\n内容: %s\n时间: %s\n链接: %s\n\n' % (
  100. data['title'], data['context'], data['posted_date'], data['source_url'])
  101. send_email = SendEmail(subject='反斗限免', title='反斗限免', text=text)
  102. send_email.send()
  103. self.logs_handle.logs_write('apprcn', '发送邮件完成', 'done', False)
  104. else:
  105. self.logs_handle.logs_write('apprcn', '没有新数据, 不发送邮件', 'done', False)
  106. # if __name__ == "__main__":
  107. # A = APPRCN()
  108. # A.main()