| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148 |
- # -*- coding: UTF-8 -*-
- '''
- 关键词搜索rss消息
- '''
- import random
- import re
- import time
- import httpx
- from datetime import datetime
- from tools_mongo_handle import MongoHandle
- from tools_send_email import SendEmail
- import tools_load_config
# Project-wide settings, resolved once when the module is imported.
config_json = tools_load_config.load_config()
base_project = tools_load_config.get_base_path()
PROJECT_NAME = config_json.get('PROJECT_NAME')
class KeySearch:
    """Poll RSSHub keyword feeds, persist unseen items and email them.

    Workflow (see ``main``): for every (source, keyword) pair an RSSHub feed
    is downloaded and parsed, items not yet present in the MongoDB collection
    are inserted, and one email per source is sent listing the new items.
    """

    # RSSHub feed URL templates, one per supported source; ``{key}`` is
    # replaced by the search keyword.
    _FEED_URLS = {
        '什么值得买': 'https://rsshub.app/smzdm/keyword/{key}',
        '新浪微博': 'https://rsshub.app/weibo/keyword/{key}',
        '36kr': 'https://rsshub.app/36kr/search/articles/{key}',
        '虎嗅网': 'https://rsshub.app/huxiu/search/{key}',
    }

    # Captures (title, pubDate, link) of each feed item. It relies on the
    # element order title -> description -> pubDate -> guid -> link.
    _ITEM_PATTERN = re.compile(
        r"<title><!\[CDATA\[(.*?)\]\]></title>\s*<description><!\[CDATA\[.*?\]\]></description>\s*<pubDate>(.*?)</pubDate>\s*<guid.*?</guid>\s*<link>(.*?)</link>"
    )

    # Format of the feed's <pubDate> and of the timestamps that are stored.
    _PUBDATE_FORMAT = '%a, %d %b %Y %H:%M:%S GMT'
    _STORED_FORMAT = '%Y-%m-%d %H:%M:%S'

    # Keywords searched per source when ``main`` is called without arguments.
    _DEFAULT_KEYWORDS = {
        '什么值得买': ['京东', '券', '鼠标', '键盘', '硬盘', '咖啡', '显示器'],
        '新浪微博': ['测试网', '比特币', 'web3', 'CoinToEarn', 'YourAirdropETH', 'VIP8888883', 'duola_eth', 'sanyi_eth', 'kuangshenbtc', 'jianshubiji'],
        '36kr': ['测试网', '比特币', 'web3', 'CoinToEarn', 'YourAirdropETH', 'VIP8888883', 'duola_eth', 'sanyi_eth', 'kuangshenbtc', 'jianshubiji'],
        '虎嗅网': ['测试网', '比特币', 'web3', 'CoinToEarn', 'YourAirdropETH', 'VIP8888883', 'duola_eth', 'sanyi_eth', 'kuangshenbtc', 'jianshubiji'],
    }

    def __init__(self):
        """Open the ``KeyWordSearch`` database/collection via ``MongoHandle``."""
        db = 'KeyWordSearch'
        collection = 'KeyWordSearch'
        # NOTE(review): the del_*/auto_remove flags are passed through as-is;
        # their exact semantics live in tools_mongo_handle - confirm there.
        self.mongo = MongoHandle(db=db, collection=collection, del_db=False, del_collection=False, auto_remove=0)

    def get_data(self, source, key):
        """Fetch and parse the RSSHub feed of ``source`` for keyword ``key``.

        Args:
            source: Name of the site; must be a key of ``_FEED_URLS``.
            key: Search keyword inserted into the feed URL.

        Returns:
            ``{key: [[title, postdate, link], ...]}`` on success (the list may
            be empty), or ``None`` when ``key`` is empty, ``source`` is
            unknown, the request fails or the status code is not 200.
        """
        if not key:
            return None

        template = self._FEED_URLS.get(source)
        if template is None:
            # Reject unknown sources up front instead of handing ``None`` to
            # httpx and relying on the resulting exception.
            print(f'请求失败, 未知的源: {source}, 关键词: {key}')
            return None
        url = template.format(key=key)

        try:
            resp = httpx.get(url)
        except Exception as e:
            # Deliberate best-effort boundary: one failing feed must not stop
            # the remaining keywords from being processed.
            print(f'请求失败: {e}\n目标地址: {url}')
            return None

        if resp.status_code != 200:
            # NOTE: the failure is only printed; no email/log alert is sent.
            print(f'请求失败, 状态码: {resp.status_code}, 源: {source}, 关键词: {key}')
            # Extra pause after an HTTP error before the caller continues.
            time.sleep(random.uniform(3, 5))
            return None

        resp.encoding = 'utf-8'
        return {key: self._parse_items(resp.text)}

    @classmethod
    def _parse_items(cls, feed_text):
        """Extract ``[title, postdate, link]`` triples from raw feed XML.

        Items with an empty field, or with a ``<pubDate>`` that does not match
        ``_PUBDATE_FORMAT``, are skipped so that one malformed item cannot
        abort the whole run.
        """
        items = []
        for title, pub_date, link in cls._ITEM_PATTERN.findall(feed_text):
            if not (title and pub_date and link):
                continue
            try:
                posted = datetime.strptime(pub_date, cls._PUBDATE_FORMAT)
            except ValueError:
                print(f'日期解析失败, 已跳过: {pub_date}')
                continue
            # Spaces are stripped from the title before it is stored/compared.
            items.append([title.replace(' ', ""), posted.strftime(cls._STORED_FORMAT), link])
        return items

    def save_to_mongo(self, result_data):
        """Insert items that are not stored yet and return them per source.

        Args:
            result_data: ``{source: {keyword: [[title, postdate, link], ...]}}``
                as assembled by ``main``.

        Returns:
            ``{source: [document, ...]}`` containing only the documents that
            were newly inserted; sources without new items are absent.
        """
        new_data_to_email = {}
        for source, by_keyword in result_data.items():
            for key, datas in by_keyword.items():
                for data in datas:
                    title, postdate, link = data[0], data[1], data[2]
                    # An item counts as already seen when title, post date and
                    # link all match an existing document.
                    existing = self.mongo.collection.find_one({'title': title, 'postdate': postdate, 'link': link, })
                    if existing is not None:
                        continue
                    document = {
                        'source': source,
                        'keyword': key,
                        'title': title,
                        'postdate': postdate,
                        'link': link,
                        'create_datetime': datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                    }
                    new_data_to_email.setdefault(source, []).append(document)
                    self.mongo.collection.insert_one(document)
        return new_data_to_email

    @staticmethod
    def _format_email_body(source, datas, posted_at):
        """Render the plain-text email body for the new items of one source."""
        separator = "*" * 50
        header = (
            f'KeyWord Search Message\n\nSource site: {source}\n\n{separator}\n\n'
            f'posted at {posted_at}\n\n{separator}\n\n'
        )
        entries = [
            f'source: {data["source"]}, keyword: {data["keyword"]}\n'
            f'title: {data["title"]}, postdate: {data["postdate"]}\n'
            f'link: {data["link"]}\n'
            f'\n{separator}\n'
            for data in datas
        ]
        return header + ''.join(entries)

    def main(self, input_keys=None):
        """Run one search cycle: fetch feeds, store new items, send emails.

        Args:
            input_keys: Optional ``{source: [keyword, ...]}`` mapping. When
                omitted, the built-in ``_DEFAULT_KEYWORDS`` table is used.
        """
        if input_keys is None:
            input_keys = self._DEFAULT_KEYWORDS

        result_data = {}
        for source, keywords in input_keys.items():
            for keyword in keywords:
                if not keyword:
                    continue
                print(f'正在获取 {source} - {keyword} 数据')
                datas = self.get_data(source, keyword)
                # Pause between requests to avoid hammering the feed service.
                time.sleep(random.uniform(4, 6))
                if not datas:
                    print(f'{keyword}: nodata')
                    continue
                result_data.setdefault(source, {}).update(datas)

        new_data_to_email = self.save_to_mongo(result_data)

        # Send one email per source as soon as there is anything new.
        for source, datas in new_data_to_email.items():
            content = self._format_email_body(
                source, datas, datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            )
            SendEmail(
                subject=f"{source} - KeyWord Search Message",
                title=f'New Message ({datetime.now().strftime("%Y-%m-%d %H:%M:%S")})',
                text=content
            ).send()
if __name__ == '__main__':
    # Script entry point: run a single search/notify cycle and log its bounds.
    print('keyword reminder start')
    search = KeySearch()
    search.main()
    print('keyword reminder done')
|