web3_news.py

# -*- coding: utf-8 -*-
'''
Crawl several web3 news sites.
Results are stored in MongoDB; the database is only queried to check whether an
item has already been pushed.
'''
import os
import re
import sys
import threading
import time
from datetime import datetime
from html import unescape
from urllib.parse import urljoin

import httpx

# Make the AutoInfo project root importable regardless of the working directory.
sys.path.append(os.path.abspath(__file__).split('AutoInfo')[0] + 'AutoInfo')

from utils.utils_mongo_handle import MongoHandle
from base.base_load_config import load_config

config_json = load_config()
DEFAULT_RE_PUSH_TIMES = config_json['DEFAULT_RE_PUSH_TIMES']
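
# A minimal sketch of what this module assumes from the project-internal
# MongoHandle wrapper (the real implementation lives in
# utils.utils_mongo_handle): only a pymongo Collection exposed as `.collection`
# is used below, roughly:
#
#     from pymongo import MongoClient
#
#     class MongoHandle:
#         def __init__(self, db, collection, del_db=False, del_collection=False, auto_remove=0):
#             self.collection = MongoClient()[db][collection]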


class MessageSearchKey(object):

    def __init__(self):
        db_name = 'NEWS'
        collection_name = 'web3_news'
        self.mongo = MongoHandle(db=db_name, collection=collection_name,
                                 del_db=False, del_collection=False, auto_remove=0)
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Accept-Encoding": "gzip, deflate, br",
            "Connection": "keep-alive",
            "Content-Type": "application/json"
        }
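
        # Note: the count_documents() + insert_one() dedup used by the scrapers
        # below is check-then-act, not atomic, and the scrapers run in parallel
        # threads. A unique index on 'title' (a suggestion, not currently part
        # of the module) would make MongoDB enforce the dedup itself:
        #
        #     self.mongo.collection.create_index('title', unique=True)
        #
        # after which a duplicate insert raises pymongo.errors.DuplicateKeyError.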

    def techflow(self):
        # 深潮TechFlow feed, mirrored on 163.com:
        # https://www.163.com/dy/media/T1561634363944.html
        tag_title = '深潮TechFlow'
        target = ['https://www.163.com/dy/media/T1561634363944.html']
        for url in target:
            print('Fetching url: {}'.format(url))
            resp = httpx.get(url, headers=self.headers, timeout=10)
            if resp.status_code != 200:
                print('深潮TechFlow - failed to fetch data, status code: {}'.format(resp.status_code))
                return False
            resp.encoding = 'utf-8'
            html = resp.text
            context_urls = re.findall('<a href="(.*?)" class="title">', html)
            title_list = re.findall('class="title">(.*?)</a>', html)
            posted_time_list = re.findall('<span class="time">(.*?)</span>', html)
            for title, context_url, posted_time in zip(title_list, context_urls, posted_time_list):
                data = {
                    'title': title,
                    'context': title,  # the list page carries no summary, so reuse the title
                    'source_url': url,
                    'link': context_url,
                    'article_type': tag_title,
                    'article_source': tag_title,
                    'img_url': '',
                    'keyword': '',
                    'posted_date': posted_time,
                    'create_time': int(time.time()),
                    'create_datetime': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                    'repush_times': DEFAULT_RE_PUSH_TIMES
                }
                # Insert only if no record with the same title exists yet.
                filter_criteria = {'title': data['title']}
                count = self.mongo.collection.count_documents(filter_criteria)
                if count == 0:
                    self.mongo.collection.insert_one(data)
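
    # The findall() lists in techflow() are paired with zip(), which silently
    # truncates to the shortest list if the 163.com markup drifts. A single
    # combined pattern would keep each item's fields aligned (a sketch, assuming
    # the current markup keeps the time span after the title anchor):
    #
    #     ITEM_RE = re.compile(
    #         r'<a href="(?P<link>.*?)" class="title">(?P<title>.*?)</a>'
    #         r'.*?<span class="time">(?P<time>.*?)</span>',
    #         re.S,
    #     )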

    def panewslab(self):
        tag_title = 'panewslab'
        base_url = 'https://www.panewslab.com'
        # -- JSON feed ------------------------------------------------------------
        try:
            # LastTime is a hard-coded epoch-seconds value; it appears to act as
            # a pagination cursor for the feed endpoint.
            url = 'https://www.panewslab.com/webapi/index/list?Rn=20&LId=1&LastTime=1724891115&TagId=&tw=0'
            print('Fetching url: {}'.format(url))
            resp = httpx.get(url, headers=self.headers, timeout=10)
            if resp.status_code != 200:
                print('{} - failed to fetch data, status code: {}'.format(tag_title, resp.status_code))
                return False
            resp.encoding = 'utf-8'
            resp_json = resp.json()
            # Each item is expected to look like (inferred from the field accesses):
            # {"desc": ..., "publishTime": <epoch>, "share": {"title": ..., "url": ...}}
            for resp_data in resp_json['data']:
                try:
                    data = {
                        'title': resp_data['share']['title'],
                        'context': resp_data['desc'],
                        'source_url': url,
                        'link': resp_data['share']['url'],
                        'article_type': tag_title,
                        'article_source': tag_title,
                        'img_url': '',
                        'keyword': '',
                        'posted_date': datetime.utcfromtimestamp(int(resp_data['publishTime'])).strftime(
                            '%Y-%m-%d %H:%M:%S'),
                        'create_time': int(time.time()),
                        'create_datetime': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                        'repush_times': DEFAULT_RE_PUSH_TIMES
                    }
                    filter_criteria = {'title': data['title']}
                    count = self.mongo.collection.count_documents(filter_criteria)
                    if count == 0:
                        self.mongo.collection.insert_one(data)
                except Exception as e:
                    print(f'{tag_title}: failed to extract fields from one item, {e}')
                    continue
        except Exception as e:
            print(f'{tag_title}: JSON feed request failed, {e}')
        # -- in-depth articles page -------------------------------------------------
        url = 'https://www.panewslab.com/zh/profundity/index.html'
        print('Fetching url: {}'.format(url))
        resp = httpx.get(url, headers=self.headers, timeout=10)
        if resp.status_code != 200:
            print('{} - failed to fetch data, status code: {}'.format(tag_title, resp.status_code))
            return False
        resp.encoding = 'utf-8'
        html = resp.text
        context_urls = re.findall('<div class="list-left" data-v-559b28aa><a href="(.*?)" target="_blank"', html)
        title_list = re.findall('target="_blank" class="n-title" data-v-559b28aa>(.*?)</a>', html)
        context_list = re.findall('<p class="description" data-v-559b28aa>(.*?)</p>', html)
        for title, context, context_url in zip(title_list, context_list, context_urls):
            data = {
                'title': title,
                'context': context,
                'source_url': url,
                'link': base_url + context_url,
                'article_type': tag_title,
                'article_source': tag_title,
                'img_url': '',
                'keyword': '',
                'posted_date': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                'create_time': int(time.time()),
                'create_datetime': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                'repush_times': DEFAULT_RE_PUSH_TIMES
            }
            filter_criteria = {'title': data['title']}
            count = self.mongo.collection.count_documents(filter_criteria)
            if count == 0:
                self.mongo.collection.insert_one(data)
        # -- news page ----------------------------------------------------------------
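
        # The data-v-559b28aa / data-v-3376a1f2 tokens in these patterns are Vue
        # scoped-CSS hashes that typically change whenever the site is rebuilt;
        # anchoring on the class names alone would be less brittle, e.g. (an
        # untested sketch, not part of the original):
        #
        #     title_list = re.findall(r'class="n-title"[^>]*>(.*?)</a>', html)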
        url = 'https://www.panewslab.com/zh/news/index.html'
        print('Fetching url: {}'.format(url))
        resp = httpx.get(url, headers=self.headers, timeout=10)
        if resp.status_code != 200:
            print('{} - failed to fetch data, status code: {}'.format(tag_title, resp.status_code))
            return False
        resp.encoding = 'utf-8'
        html = resp.text
        context_urls = re.findall('class="content" data-v-3376a1f2><a href="(.*?)" target="_blank"', html)
        title_list = re.findall('target="_blank" class="n-title" data-v-3376a1f2>(.*?)</a>', html)
        context_list = re.findall('</a> <p data-v-3376a1f2>(.*?)</p>', html)
        for title, context, context_url in zip(title_list, context_list, context_urls):
            data = {
                'title': title,
                'context': context,
                'source_url': url,
                'link': base_url + context_url,
                'article_type': tag_title,
                'article_source': tag_title,
                'img_url': '',
                'keyword': '',
                'posted_date': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                'create_time': int(time.time()),
                'create_datetime': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                'repush_times': DEFAULT_RE_PUSH_TIMES
            }
            filter_criteria = {'title': data['title']}
            count = self.mongo.collection.count_documents(filter_criteria)
            if count == 0:
                self.mongo.collection.insert_one(data)
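
    # The page scrapers in this class all build the same record shape and repeat
    # the same check-then-insert dedup; a small helper would keep them in sync
    # (hypothetical, not part of the original module):
    #
    #     def _save_if_new(self, data):
    #         if self.mongo.collection.count_documents({'title': data['title']}) == 0:
    #             self.mongo.collection.insert_one(data)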

    def foresightnews(self):
        # Fetch foresightnews headlines from the front page.
        tag_title = 'foresightnews'
        base_url = 'https://foresightnews.pro/'
        url = 'https://foresightnews.pro/'
        print('Fetching url: {}'.format(url))
        resp = httpx.get(url, headers=self.headers, timeout=10)
        if resp.status_code != 200:
            print('{} - failed to fetch data, status code: {}'.format(tag_title, resp.status_code))
            return False
        resp.encoding = 'utf-8'
        # The page HTML-escapes its text, so unescape before matching.
        html = unescape(resp.text)
        context_urls = re.findall('</div></div></div></a><a href="(.*?)" target="_blank"', html)
        title_list = re.findall('<div class="topic-body-title" data-v-3171afda>(.*?)</div>', html)
        context_list = re.findall('<div class="topic-body-content" data-v-3171afda>(.*?)</div>', html)
        posted_time_list = re.findall('div class="topic-time" data-v-3171afda>(.*?)</div>', html)
        for title, context, context_url, posted_time in zip(title_list, context_list, context_urls, posted_time_list):
            data = {
                'title': title,
                'context': context,
                'source_url': url,
                # urljoin handles both absolute and relative hrefs without
                # doubling the slash after the trailing '/' in base_url.
                'link': urljoin(base_url, context_url),
                'article_type': tag_title,
                'article_source': tag_title,
                'img_url': '',
                'keyword': '',
                'posted_date': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                'create_time': int(time.time()),
                'create_datetime': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                'repush_times': DEFAULT_RE_PUSH_TIMES
            }
            filter_criteria = {'title': data['title']}
            count = self.mongo.collection.count_documents(filter_criteria)
            if count == 0:
                self.mongo.collection.insert_one(data)

    def main(self):
        # Run every site scraper; each one applies its own extraction rules and
        # writes straight to MongoDB, where the per-title lookup decides whether
        # an item has already been pushed.
        functions = [
            self.techflow,
            self.panewslab,
            self.foresightnews
        ]
        # Create and start one thread per scraper.
        print('Creating and starting scraper threads')
        threads = []
        for func in functions:
            thread = threading.Thread(target=func)
            thread.start()
            threads.append(thread)
        # Wait for all threads to finish.
        for thread in threads:
            thread.join()
        print('Run finished')
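

# Hypothetical usage beyond the one-shot run below: for continuous polling,
# wrap main() in a loop (or schedule this script with cron), e.g.:
#
#     scraper = MessageSearchKey()
#     while True:
#         scraper.main()
#         time.sleep(300)  # poll every 5 minutes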

if __name__ == "__main__":
    m = MessageSearchKey()
    m.main()