# -*- coding: UTF-8 -*-
'''
news_get_rsshub.py: search RSS feeds by keyword.
'''
import random
import re
import time
from datetime import datetime, timezone
from email.utils import parsedate_to_datetime
from urllib.parse import quote

import httpx

import tools_load_config
from tools_mongo_handle import MongoHandle
from tools_send_email import SendEmail
  13. config_json = tools_load_config.load_config()
  14. base_project = tools_load_config.get_base_path()
  15. PROJECT_NAME = config_json.get('PROJECT_NAME')
  16. class KeySearch(object):
  17. def __init__(self):
  18. db = 'KeyWordSearch'
  19. collection = 'KeyWordSearch'
  20. self.mongo = MongoHandle(db=db, collection=collection, del_db=False, del_collection=False, auto_remove=0)
  21. def get_data(self, source, key):
  22. if not key:
  23. return None
  24. key_url = {
  25. '什么值得买': f'https://rsshub.app/smzdm/keyword/{key}',
  26. '新浪微博': f'https://rsshub.app/weibo/keyword/{key}',
  27. '36kr': f'https://rsshub.app/36kr/search/articles/{key}',
  28. '虎嗅网': f'https://rsshub.app/huxiu/search/{key}',
  29. }
  30. result_data = {key: []}
  31. url = key_url.get(source)
  32. try:
  33. resp = httpx.get(url)
  34. except Exception as e:
  35. print(f'请求失败: {e}\n目标地址: {url}')
  36. return None
  37. if resp.status_code != 200:
  38. # 发邮件通知
  39. print(f'请求失败, 状态码: {resp.status_code}, 源: {source}, 关键词: {key}')
  40. # LogsHandle().logs_write(title_source=PROJECT_NAME, content=f'请求失败, 状态码: {resp.status_code}', state='error', send_now=True)
  41. time.sleep(random.uniform(3, 5))
  42. return None
  43. resp.encoding = 'utf-8'
  44. # 解析数据
  45. pattern = r"<title><!\[CDATA\[(.*?)\]\]></title>\s*<description><!\[CDATA\[.*?\]\]></description>\s*<pubDate>(.*?)</pubDate>\s*<guid.*?</guid>\s*<link>(.*?)</link>"
  46. re_result = re.findall(pattern, resp.text)
  47. for result in re_result:
  48. if not result[0] or not result[1] or not result[2]:
  49. continue
  50. result_data[key].append([result[0].replace(' ', ""),
  51. datetime.strptime(result[1], '%a, %d %b %Y %H:%M:%S GMT').strftime('%Y-%m-%d %H:%M:%S'),
  52. result[2]])
  53. return result_data
  54. def save_to_mongo(self, result_data):
  55. new_data_to_email = {}
  56. for source, value in result_data.items():
  57. for key, datas in value.items():
  58. for data in datas:
  59. document = self.mongo.collection.find_one({'title': data[0], 'postdate': data[1], 'link': data[2], })
  60. if document is None:
  61. data_to_insert = {
  62. 'source': source,
  63. 'keyword': key,
  64. 'title': data[0],
  65. 'postdate': data[1],
  66. 'link': data[2],
  67. 'create_datetime': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
  68. }
  69. # 如果不存在,添加到列表并插入新文档
  70. if data_to_insert['source'] not in new_data_to_email:
  71. new_data_to_email[data_to_insert['source']] = [data_to_insert]
  72. else:
  73. new_data_to_email[data_to_insert['source']].append(data_to_insert)
  74. # 插入新文档到MongoDB集合中
  75. self.mongo.collection.insert_one(data_to_insert)
  76. else:
  77. # 如果存在,跳过
  78. continue
  79. return new_data_to_email
  80. def main(self):
  81. # 输入的关键字
  82. # input_keys = {
  83. # '什么值得买': [''],
  84. # '新浪微博': [''],
  85. # '36kr': [''],
  86. # '虎嗅网': [''],
  87. # }
  88. input_keys = {
  89. '什么值得买': ['京东', '券', '鼠标', '键盘', '硬盘', '咖啡', '显示器'],
  90. '新浪微博': ['测试网', '比特币', 'web3', 'CoinToEarn', 'YourAirdropETH', 'VIP8888883', 'duola_eth', 'sanyi_eth', 'kuangshenbtc', 'jianshubiji'],
  91. '36kr': ['测试网', '比特币', 'web3', 'CoinToEarn', 'YourAirdropETH', 'VIP8888883', 'duola_eth', 'sanyi_eth', 'kuangshenbtc', 'jianshubiji'],
  92. '虎嗅网': ['测试网', '比特币', 'web3', 'CoinToEarn', 'YourAirdropETH', 'VIP8888883', 'duola_eth', 'sanyi_eth', 'kuangshenbtc', 'jianshubiji'],
  93. }
  94. result_data = {}
  95. for key, value in input_keys.items():
  96. for k in value:
  97. if not k:
  98. continue
  99. print(f'正在获取 {key} - {k} 数据')
  100. datas = self.get_data(key, k)
  101. time.sleep(random.uniform(4, 6))
  102. if not datas:
  103. print(f'{k}: nodata')
  104. continue
  105. if key in result_data:
  106. result_data[key].update(datas)
  107. else:
  108. result_data.update({key: datas})
  109. new_data_to_email = self.save_to_mongo(result_data)
  110. # 如果有新消息, 即时发送邮件
  111. if new_data_to_email:
  112. for source, datas in new_data_to_email.items():
  113. content = f'KeyWord Search Message\n\nSource site: {source}\n\n{"*" * 50}\n\nposted at {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}\n\n{"*" * 50}\n\n'
  114. for data in datas:
  115. content += f'source: {data["source"]}, keyword: {data["keyword"]}\n'
  116. content += f'title: {data["title"]}, postdate: {data["postdate"]}\n'
  117. content += f'link: {data["link"]}\n'
  118. content += f'\n{"*" * 50}\n'
  119. SendEmail(
  120. subject=f"{source} - KeyWord Search Message",
  121. title=f'New Message ({datetime.now().strftime("%Y-%m-%d %H:%M:%S")})',
  122. text=content
  123. ).send()
  124. if __name__ == '__main__':
  125. print('keyword reminder start')
  126. search = KeySearch()
  127. search.main()
  128. print('keyword reminder done')