# web3_news.py
# -*- coding: utf-8 -*-
'''
Scrape several web3 news sites.
Results are stored in MongoDB, which is only queried to check whether an
article has already been pushed.
'''
import os
import re
import sys
import threading
import time
from datetime import datetime
from html import unescape

# Make the project root importable before loading any project modules.
sys.path.append(os.path.join(os.path.abspath(__file__).split('AutoInfo')[0] + 'AutoInfo'))

import httpx

from utils.utils import *  # provides MongoHandle, among others
from utils.utils import LoadConfig

config_json = LoadConfig().load_config()
DEFAULT_RE_PUSH_TIMES = config_json['DEFAULT_RE_PUSH_TIMES']
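
# Every scraper below builds documents of the same shape before insertion:
#     {'title': ..., 'context': ..., 'source_url': ..., 'link': ...,
#      'article_type': ..., 'article_source': ..., 'img_url': '', 'keyword': '',
#      'posted_date': ..., 'create_time': ..., 'create_datetime': ...,
#      'repush_times': DEFAULT_RE_PUSH_TIMES}
# 'title' serves as the de-duplication key checked against MongoDB.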


class MessageSearchKey(object):
    def __init__(self):
        db_name = 'NEWS'
        collection_name = 'web3_news'
        self.mongo = MongoHandle(db=db_name, collection=collection_name, del_db=False, del_collection=False,
                                 auto_remove=0)
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Accept-Encoding": "gzip, deflate, br",
            "Connection": "keep-alive",
            "Content-Type": "application/json"
        }
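        # Note: "Content-Type" describes a request body, so it has no effect on
        # the GET requests below; it is harmless here and kept because the same
        # header set is reused for every request.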

    def techflow(self):
        # 深潮TechFlow, source page: https://www.163.com/dy/media/T1561634363944.html
        tag_title = '深潮TechFlow'
        target = ['https://www.163.com/dy/media/T1561634363944.html']
        for url in target:
            print('Fetching url: {}'.format(url))
            resp = httpx.get(url, headers=self.headers, timeout=10)
            if resp.status_code != 200:
                print('深潮TechFlow - failed to fetch data, status code: {}'.format(resp.status_code))
                return False
            resp.encoding = 'utf-8'
            html = resp.text
            context_urls = re.findall('<a href="(.*?)" class="title">', html)
            title_list = re.findall('class="title">(.*?)</a>', html)
            posted_time_list = re.findall('<span class="time">(.*?)</span>', html)
            for title, context_url, posted_time in zip(title_list, context_urls, posted_time_list):
                data = {
                    'title': title,
                    'context': title,  # the list page carries no summary, so the title doubles as context
                    'source_url': url,
                    'link': context_url,
                    'article_type': tag_title,
                    'article_source': tag_title,
                    'img_url': '',
                    'keyword': '',
                    'posted_date': posted_time,
                    'create_time': int(time.time()),
                    'create_datetime': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                    'repush_times': DEFAULT_RE_PUSH_TIMES
                }
                # Only insert articles whose title has not been stored yet.
                filter_criteria = {'title': data['title']}
                count = self.mongo.collection.count_documents(filter_criteria)
                if count == 0:
                    self.mongo.collection.insert_one(data)
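
        # Note: count_documents() followed by insert_one() is not atomic, so two
        # concurrent runs could insert the same title. Assuming a unique index on
        # 'title' is acceptable, an upsert would close the gap (a sketch, not the
        # original code):
        #
        #     self.mongo.collection.update_one(
        #         {'title': data['title']},   # same dedup key as above
        #         {'$setOnInsert': data},     # written only when no match exists
        #         upsert=True,
        #     )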

    def panewslab(self):
        tag_title = 'panewslab'
        base_url = 'https://www.panewslab.com'
        # ------------------------------------------------------------------------------------------------------------
        try:
            url = 'https://www.panewslab.com/webapi/index/list?Rn=20&LId=1&LastTime=1724891115&TagId=&tw=0'
            print('Fetching url: {}'.format(url))
            resp = httpx.get(url, headers=self.headers, timeout=10)
            if resp.status_code != 200:
                print('{} - failed to fetch data, status code: {}'.format(tag_title, resp.status_code))
                return False
            resp.encoding = 'utf-8'
            resp_json = resp.json()
            for resp_data in resp_json['data']:
                try:
                    data = {
                        'title': resp_data['share']['title'],
                        'context': resp_data['desc'],
                        'source_url': url,
                        'link': resp_data['share']['url'],
                        'article_type': tag_title,
                        'article_source': tag_title,
                        'img_url': '',
                        'keyword': '',
                        'posted_date': datetime.utcfromtimestamp(int(resp_data['publishTime'])).strftime(
                            '%Y-%m-%d %H:%M:%S'),
                        'create_time': int(time.time()),
                        'create_datetime': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                        'repush_times': DEFAULT_RE_PUSH_TIMES
                    }
                    filter_criteria = {'title': data['title']}
                    count = self.mongo.collection.count_documents(filter_criteria)
                    if count == 0:
                        self.mongo.collection.insert_one(data)
                except Exception as e:
                    print(f'{tag_title}: failed to extract fields, {e}')
                    continue
        except Exception as e:
            print(f'{tag_title}: failed to fetch or parse the list API, {e}')
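
        # Shape of the list-API response assumed by the parser above (the API is
        # undocumented, so this is inferred from the fields it reads):
        #     {"data": [{"desc": "...", "publishTime": 1724891115,
        #                "share": {"title": "...", "url": "..."}}, ...]}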
        # -------------------------------------------------------------------------------------------------------------
        url = 'https://www.panewslab.com/zh/profundity/index.html'
        print('Fetching url: {}'.format(url))
        resp = httpx.get(url, headers=self.headers, timeout=10)
        if resp.status_code != 200:
            print('{} - failed to fetch data, status code: {}'.format(tag_title, resp.status_code))
            return False
        resp.encoding = 'utf-8'
        html = resp.text
        context_urls = re.findall('<div class="list-left" data-v-559b28aa><a href="(.*?)" target="_blank"', html)
        title_list = re.findall('target="_blank" class="n-title" data-v-559b28aa>(.*?)</a>', html)
        context_list = re.findall('<p class="description" data-v-559b28aa>(.*?)</p>', html)
        for title, context, context_url in zip(title_list, context_list, context_urls):
            data = {
                'title': title,
                'context': context,
                'source_url': url,
                'link': base_url + context_url,
                'article_type': tag_title,
                'article_source': tag_title,
                'img_url': '',
                'keyword': '',
                'posted_date': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),  # page exposes no timestamp; use scrape time
                'create_time': int(time.time()),
                'create_datetime': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                'repush_times': DEFAULT_RE_PUSH_TIMES
            }
            filter_criteria = {'title': data['title']}
            count = self.mongo.collection.count_documents(filter_criteria)
            if count == 0:
                self.mongo.collection.insert_one(data)
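
        # The data-v-559b28aa / data-v-3376a1f2 attributes are Vue scoped-style
        # hashes baked into the server-rendered markup; they change whenever the
        # site is rebuilt, so these regexes fail silently (empty lists) after a
        # redeploy and need re-checking against the live HTML.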
        # -------------------------------------------------------------------------------------------------------------
        url = 'https://www.panewslab.com/zh/news/index.html'
        print('Fetching url: {}'.format(url))
        resp = httpx.get(url, headers=self.headers, timeout=10)
        if resp.status_code != 200:
            print('{} - failed to fetch data, status code: {}'.format(tag_title, resp.status_code))
            return False
        resp.encoding = 'utf-8'
        html = resp.text
        context_urls = re.findall('class="content" data-v-3376a1f2><a href="(.*?)" target="_blank"', html)
        title_list = re.findall('target="_blank" class="n-title" data-v-3376a1f2>(.*?)</a>', html)
        context_list = re.findall('</a> <p data-v-3376a1f2>(.*?)</p>', html)
        for title, context, context_url in zip(title_list, context_list, context_urls):
            data = {
                'title': title,
                'context': context,
                'source_url': url,
                'link': base_url + context_url,
                'article_type': tag_title,
                'article_source': tag_title,
                'img_url': '',
                'keyword': '',
                'posted_date': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),  # page exposes no timestamp; use scrape time
                'create_time': int(time.time()),
                'create_datetime': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                'repush_times': DEFAULT_RE_PUSH_TIMES
            }
            filter_criteria = {'title': data['title']}
            count = self.mongo.collection.count_documents(filter_criteria)
            if count == 0:
                self.mongo.collection.insert_one(data)
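
        # zip() truncates to the shortest list, so an entry whose sibling regex
        # failed to match is dropped without warning. A minimal guard could look
        # like this (hypothetical helper, not part of the original file):
        #
        #     def aligned(*lists):
        #         if len({len(lst) for lst in lists}) != 1:
        #             print('warning: extraction lists are misaligned:',
        #                   [len(lst) for lst in lists])
        #         return zip(*lists)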

    def foresightnews(self):
        # Fetch news data from foresightnews
        tag_title = 'foresightnews'
        base_url = 'https://foresightnews.pro/'
        # -------------------------------------------------------------------------------------------------------------
        url = 'https://foresightnews.pro/'
        print('Fetching url: {}'.format(url))
        resp = httpx.get(url, headers=self.headers, timeout=10)
        if resp.status_code != 200:
            print('{} - failed to fetch data, status code: {}'.format(tag_title, resp.status_code))
            return False
        resp.encoding = 'utf-8'
        # The page ships HTML-escaped entities; unescape before matching.
        html = unescape(resp.text)
        context_urls = re.findall('</div></div></div></a><a href="(.*?)" target="_blank"', html)
        title_list = re.findall('<div class="topic-body-title" data-v-3171afda>(.*?)</div>', html)
        context_list = re.findall('<div class="topic-body-content" data-v-3171afda>(.*?)</div>', html)
        posted_time_list = re.findall('div class="topic-time" data-v-3171afda>(.*?)</div>', html)
        for title, context, context_url, posted_time in zip(title_list, context_list, context_urls, posted_time_list):
            data = {
                'title': title,
                'context': context,
                'source_url': url,
                'link': base_url + context_url,
                'article_type': tag_title,
                'article_source': tag_title,
                'img_url': '',
                'keyword': '',
                # posted_time keeps the lists aligned, but the stored date is the scrape time
                'posted_date': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                'create_time': int(time.time()),
                'create_datetime': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                'repush_times': DEFAULT_RE_PUSH_TIMES
            }
            filter_criteria = {'title': title}
            count = self.mongo.collection.count_documents(filter_criteria)
            if count == 0:
                self.mongo.collection.insert_one(data)
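
    # The dedup-and-insert tail is identical in all three scrapers; a shared
    # helper would remove the repetition (a sketch, not part of the original):
    #
    #     def _save_if_new(self, data):
    #         if self.mongo.collection.count_documents({'title': data['title']}) == 0:
    #             self.mongo.collection.insert_one(data)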

    def main(self):
        # Run each site's extraction rules; every scraper checks MongoDB itself
        # to decide whether an article has already been pushed (see the schema
        # comment near the top of the file).
        functions = [
            self.techflow,
            self.panewslab,
            self.foresightnews
        ]
        # Create and start one thread per scraper
        print('Creating and starting threads')
        threads = []
        for func in functions:
            thread = threading.Thread(target=func)
            thread.start()
            threads.append(thread)
        # Wait for all threads to finish
        for thread in threads:
            thread.join()
        print('Run finished')
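
        # These scrapers are I/O-bound, so plain threads work fine under the GIL.
        # An equivalent variant using a pool (a sketch, not the original code):
        #
        #     from concurrent.futures import ThreadPoolExecutor
        #     with ThreadPoolExecutor(max_workers=len(functions)) as pool:
        #         for func in functions:
        #             pool.submit(func)
        #     # leaving the with-block joins all workers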


if __name__ == "__main__":
    m = MessageSearchKey()
    m.main()