|
|
@@ -1,256 +0,0 @@
|
|
|
# -*- coding: utf-8 -*-
'''
Crawl several web news sites.

Articles are stored in MongoDB, which is consulted only to check whether a
message has already been pushed.
'''
import os
import sys
import threading
import time

import httpx

# Make the project root (everything up to and including the 'auto' directory)
# importable before pulling in project-local modules below.
sys.path.append(os.path.join(os.path.abspath(__file__).split('auto')[0] + 'auto'))
from html import unescape
from datetime import datetime
import re
from utils.utils_mongo_handle import MongoHandle
from base.base_load_config import load_config

# Project-wide configuration; DEFAULT_RE_PUSH_TIMES is the initial re-push
# counter stamped onto every stored article.
config_json = load_config()
DEFAULT_RE_PUSH_TIMES = config_json['DEFAULT_RE_PUSH_TIMES']
|
|
|
class MessageSearchKey(object):
    """Crawl several web3 news sites and store unseen articles in MongoDB.

    Each scraper method fetches one site, normalizes every article into a
    common document shape, and inserts it only if no document with the same
    title already exists (titles act as the dedup key).
    """

    def __init__(self):
        # Articles live in NEWS.web3_news; never drop the db/collection and
        # never auto-expire documents (auto_remove=0).
        db_name = 'NEWS'
        collection_name = 'web3_news'
        self.mongo = MongoHandle(db=db_name, collection=collection_name, del_db=False, del_collection=False,
                                 auto_remove=0)
        # Browser-like headers so the target sites serve their normal HTML/JSON.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Accept-Encoding": "gzip, deflate, br",
            "Connection": "keep-alive",
            "Content-Type": "application/json"
        }

    def _build_record(self, title, context, source_url, link, tag_title, posted_date):
        """Assemble one normalized article document for MongoDB.

        :param title: article headline (also the dedup key)
        :param context: short description / body excerpt
        :param source_url: listing page the article was scraped from
        :param link: direct URL of the article itself
        :param tag_title: site tag, stored as both article_type and article_source
        :param posted_date: publish time string as reported (or scrape time)
        :return: dict ready for insert_one
        """
        return {
            'title': title,
            'context': context,
            'source_url': source_url,
            'link': link,
            'article_type': tag_title,
            'article_source': tag_title,
            'img_url': '',
            'keyword': '',
            'posted_date': posted_date,
            'create_time': int(time.time()),
            'create_datetime': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            'repush_times': DEFAULT_RE_PUSH_TIMES
        }

    def _save_if_new(self, data):
        """Insert *data* unless an article with the same title already exists."""
        filter_criteria = {'title': data['title']}
        if self.mongo.collection.count_documents(filter_criteria) == 0:
            self.mongo.collection.insert_one(data)

    def _fetch(self, url, tag_title):
        """GET *url* with the shared headers; return the response, or None on a
        non-200 status (after logging the failure for *tag_title*)."""
        print('前往 url: {}'.format(url))
        resp = httpx.get(url, headers=self.headers, timeout=10)
        if resp.status_code != 200:
            print('{} - 获取数据失败, 状态码: {}'.format(tag_title, resp.status_code))
            return None
        # Force utf-8 decoding; the sites serve Chinese text.
        resp.encoding = 'utf-8'
        return resp

    def techflow(self):
        """Scrape 深潮TechFlow headlines from its 163.com media page."""
        # 深潮TechFlow url: https://www.163.com/dy/media/T1561634363944.html
        tag_title = '深潮TechFlow'
        target = ['https://www.163.com/dy/media/T1561634363944.html']
        for url in target:
            resp = self._fetch(url, tag_title)
            if resp is None:
                return False
            html = resp.text
            # Listing page markup: anchor href, anchor text, and a time span.
            context_urls = re.findall('<a href="(.*?)" class="title">', html)
            title_list = re.findall('class="title">(.*?)</a>', html)
            posted_time_list = re.findall('<span class="time">(.*?)</span>', html)
            for title, context_url, posted_time in zip(title_list, context_urls, posted_time_list):
                # The listing exposes no description, so the title doubles as context.
                self._save_if_new(
                    self._build_record(title, title, url, context_url, tag_title, posted_time))

    def panewslab(self):
        """Scrape PANews: the flash-news JSON API, the in-depth articles page,
        and the news listing page."""
        tag_title = 'panewslab'
        base_url = 'https://www.panewslab.com'

        # ---- flash-news JSON API ------------------------------------------
        try:
            url = 'https://www.panewslab.com/webapi/index/list?Rn=20&LId=1&LastTime=1724891115&TagId=&tw=0'
            resp = self._fetch(url, tag_title)
            if resp is None:
                return False
            for resp_data in resp.json()['data']:
                try:
                    # publishTime is a unix timestamp; store it as a UTC string.
                    posted = datetime.utcfromtimestamp(int(resp_data['publishTime'])).strftime(
                        '%Y-%m-%d %H:%M:%S')
                    self._save_if_new(
                        self._build_record(resp_data['share']['title'], resp_data['desc'], url,
                                           resp_data['share']['url'], tag_title, posted))
                except Exception as e:
                    # One malformed entry must not abort the rest of the feed.
                    print(f'{tag_title}: 数据取值失败, {e}')
                    continue
        except Exception as e:
            print(f'{tag_title}: 数据取值失败, {e}')

        # ---- in-depth articles page ---------------------------------------
        url = 'https://www.panewslab.com/zh/profundity/index.html'
        resp = self._fetch(url, tag_title)
        if resp is None:
            return False
        html = resp.text
        context_urls = re.findall('<div class="list-left" data-v-559b28aa><a href="(.*?)" target="_blank"', html)
        title_list = re.findall('target="_blank" class="n-title" data-v-559b28aa>(.*?)</a>', html)
        context_list = re.findall('<p class="description" data-v-559b28aa>(.*?)</p>', html)
        for title, context, context_url in zip(title_list, context_list, context_urls):
            # This page exposes no publish time; fall back to scrape time.
            self._save_if_new(
                self._build_record(title, context, url, base_url + context_url, tag_title,
                                   datetime.now().strftime("%Y-%m-%d %H:%M:%S")))

        # ---- news listing page --------------------------------------------
        url = 'https://www.panewslab.com/zh/news/index.html'
        resp = self._fetch(url, tag_title)
        if resp is None:
            return False
        html = resp.text
        context_urls = re.findall('class="content" data-v-3376a1f2><a href="(.*?)" target="_blank"', html)
        title_list = re.findall('target="_blank" class="n-title" data-v-3376a1f2>(.*?)</a>', html)
        context_list = re.findall('</a> <p data-v-3376a1f2>(.*?)</p>', html)
        for title, context, context_url in zip(title_list, context_list, context_urls):
            self._save_if_new(
                self._build_record(title, context, url, base_url + context_url, tag_title,
                                   datetime.now().strftime("%Y-%m-%d %H:%M:%S")))

    def foresightnews(self):
        """Scrape the foresightnews.pro front page."""
        tag_title = 'foresightnews'
        base_url = 'https://foresightnews.pro/'

        url = 'https://foresightnews.pro/'
        resp = self._fetch(url, tag_title)
        if resp is None:
            return False
        # The page HTML-escapes its content; unescape before regex matching.
        html = unescape(resp.text)
        context_urls = re.findall('</div></div></div></a><a href="(.*?)" target="_blank"', html)
        title_list = re.findall('<div class="topic-body-title" data-v-3171afda>(.*?)</div>', html)
        context_list = re.findall('<div class="topic-body-content" data-v-3171afda>(.*?)</div>', html)
        posted_time_list = re.findall('div class="topic-time" data-v-3171afda>(.*?)</div>', html)

        for title, context, context_url, posted_time in zip(title_list, context_list, context_urls, posted_time_list):
            # posted_time is captured but the stored posted_date uses scrape time,
            # matching the other HTML scrapers.
            self._save_if_new(
                self._build_record(title, context, url, base_url + context_url, tag_title,
                                   datetime.now().strftime("%Y-%m-%d %H:%M:%S")))

    def main(self):
        """Run every site scraper in its own thread and wait for all of them.

        Threads are appropriate here: each scraper is network-bound, and they
        share no mutable state beyond the Mongo collection.
        """
        functions = [
            self.techflow,
            self.panewslab,
            self.foresightnews
        ]

        # Create and start one thread per scraper.
        print('创建并启动线程')
        threads = []
        for func in functions:
            thread = threading.Thread(target=func)
            thread.start()
            threads.append(thread)

        # Wait for every scraper to finish.
        for thread in threads:
            thread.join()

        print('程序运行结束')
|
|
|
-
|
|
|
-
|
|
|
if __name__ == "__main__":
    # Entry point: build the crawler and run all scrapers.
    crawler = MessageSearchKey()
    crawler.main()
|