# web3_news.py
# -*- coding: utf-8 -*-
'''
Crawl several web3 news sites.
Results are stored in Mongo; the collection is only queried to check
whether an article has already been pushed.
'''
import os
import re
import sys
import time
import threading
from datetime import datetime
from html import unescape

import httpx

# Make the project root importable no matter where the script is launched from.
sys.path.append(os.path.join(os.path.abspath(__file__).split('AutoInfo')[0] + 'AutoInfo'))
from utils.utils import *  # expected to provide LoadConfig and MongoHandle

config_json = LoadConfig().load_config()
DEFAULT_RE_PUSH_TIMES = config_json['DEFAULT_RE_PUSH_TIMES']
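
# The wildcard import above is assumed to provide at least LoadConfig and
# MongoHandle; neither is shown in this file. A hedged sketch of the interface
# this script actually relies on:
#
#     class LoadConfig:
#         def load_config(self) -> dict: ...          # parses the project config file
#
#     class MongoHandle:
#         def __init__(self, db, collection, del_db, del_collection, auto_remove): ...
#         collection: pymongo.collection.Collection   # exposes count_documents / insert_one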


class MessageSearchKey(object):
    def __init__(self):
        db_name = 'NEWS'
        collection_name = 'web3_news'
        self.mongo = MongoHandle(db=db_name, collection=collection_name, del_db=False, del_collection=False,
                                 auto_remove=0)
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.5",
            "Accept-Encoding": "gzip, deflate, br",
            "Connection": "keep-alive",
            "Content-Type": "application/json"
        }
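
    def _insert_if_new(self, data):
        # Hedged refactor sketch (helper name is my own, not from the original):
        # every scraper below repeats the same count_documents / insert_one
        # pair, and routing them through this helper would deduplicate that
        # logic. Behavior-identical: insert only when no stored document
        # already carries this title.
        if self.mongo.collection.count_documents({'title': data['title']}) == 0:
            self.mongo.collection.insert_one(data)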

    def techflow(self):
        # 深潮TechFlow, scraped via its 163.com feed:
        # https://www.163.com/dy/media/T1561634363944.html
        tag_title = '深潮TechFlow'
        target = ['https://www.163.com/dy/media/T1561634363944.html']
        for url in target:
            print('Fetching url: {}'.format(url))
            resp = httpx.get(url, headers=self.headers, timeout=10)
            if resp.status_code != 200:
                print('深潮TechFlow - fetch failed, status code: {}'.format(resp.status_code))
                return False
            resp.encoding = 'utf-8'
            html = resp.text
            context_urls = re.findall('<a href="(.*?)" class="title">', html)
            title_list = re.findall('class="title">(.*?)</a>', html)
            posted_time_list = re.findall('<span class="time">(.*?)</span>', html)
            for title, context_url, posted_time in zip(title_list, context_urls, posted_time_list):
                data = {
                    'title': title,
                    'context': title,  # the feed page has no summary, so the title doubles as context
                    'source_url': url,
                    'link': context_url,
                    'article_type': tag_title,
                    'article_source': tag_title,
                    'img_url': '',
                    'keyword': '',
                    'posted_date': posted_time,
                    'create_time': int(time.time()),
                    'create_datetime': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                    'repush_times': DEFAULT_RE_PUSH_TIMES
                }
                filter_criteria = {'title': data['title']}
                if self.mongo.collection.count_documents(filter_criteria) == 0:
                    self.mongo.collection.insert_one(data)
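
    # NB: the scrapers run in parallel threads (see main), so the
    # count_documents-then-insert_one pattern above can race and store
    # duplicates. A unique index on 'title' makes the check atomic; a minimal
    # sketch, assuming self.mongo.collection is a standard PyMongo collection:
    #
    #     self.mongo.collection.create_index('title', unique=True)
    #     try:
    #         self.mongo.collection.insert_one(data)
    #     except pymongo.errors.DuplicateKeyError:
    #         pass  # already stored, i.e. already pushed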

    def panewslab(self):
        tag_title = 'panewslab'
        base_url = 'https://www.panewslab.com'
        # ------------------------------------------------------------------------------------------------------------
        try:
            # NB: LastTime is a pagination cursor hard-coded from the original capture.
            url = 'https://www.panewslab.com/webapi/index/list?Rn=20&LId=1&LastTime=1724891115&TagId=&tw=0'
            print('Fetching url: {}'.format(url))
            resp = httpx.get(url, headers=self.headers, timeout=10)
            if resp.status_code != 200:
                print('{} - fetch failed, status code: {}'.format(tag_title, resp.status_code))
                return False
            resp.encoding = 'utf-8'
            resp_json = resp.json()
            for resp_data in resp_json['data']:
                try:
                    data = {
                        'title': resp_data['share']['title'],
                        'context': resp_data['desc'],
                        'source_url': url,
                        'link': resp_data['share']['url'],
                        'article_type': tag_title,
                        'article_source': tag_title,
                        'img_url': '',
                        'keyword': '',
                        'posted_date': datetime.utcfromtimestamp(int(resp_data['publishTime'])).strftime(
                            '%Y-%m-%d %H:%M:%S'),
                        'create_time': int(time.time()),
                        'create_datetime': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                        'repush_times': DEFAULT_RE_PUSH_TIMES
                    }
                    filter_criteria = {'title': data['title']}
                    if self.mongo.collection.count_documents(filter_criteria) == 0:
                        self.mongo.collection.insert_one(data)
                except Exception as e:
                    print(f'{tag_title}: failed to extract fields, {e}')
                    continue
        except Exception as e:
            print(f'{tag_title}: failed to fetch or parse the API response, {e}')
        # ------------------------------------------------------------------------------------------------------------
        url = 'https://www.panewslab.com/zh/profundity/index.html'
        print('Fetching url: {}'.format(url))
        resp = httpx.get(url, headers=self.headers, timeout=10)
        if resp.status_code != 200:
            print('{} - fetch failed, status code: {}'.format(tag_title, resp.status_code))
            return False
        resp.encoding = 'utf-8'
        html = resp.text
        context_urls = re.findall('<div class="list-left" data-v-559b28aa><a href="(.*?)" target="_blank"', html)
        title_list = re.findall('target="_blank" class="n-title" data-v-559b28aa>(.*?)</a>', html)
        context_list = re.findall('<p class="description" data-v-559b28aa>(.*?)</p>', html)
        for title, context, context_url in zip(title_list, context_list, context_urls):
            data = {
                'title': title,
                'context': context,
                'source_url': url,
                'link': base_url + context_url,
                'article_type': tag_title,
                'article_source': tag_title,
                'img_url': '',
                'keyword': '',
                'posted_date': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                'create_time': int(time.time()),
                'create_datetime': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                'repush_times': DEFAULT_RE_PUSH_TIMES
            }
            filter_criteria = {'title': data['title']}
            if self.mongo.collection.count_documents(filter_criteria) == 0:
                self.mongo.collection.insert_one(data)
        # ------------------------------------------------------------------------------------------------------------
        url = 'https://www.panewslab.com/zh/news/index.html'
        print('Fetching url: {}'.format(url))
        resp = httpx.get(url, headers=self.headers, timeout=10)
        if resp.status_code != 200:
            print('{} - fetch failed, status code: {}'.format(tag_title, resp.status_code))
            return False
        resp.encoding = 'utf-8'
        html = resp.text
        context_urls = re.findall('class="content" data-v-3376a1f2><a href="(.*?)" target="_blank"', html)
        title_list = re.findall('target="_blank" class="n-title" data-v-3376a1f2>(.*?)</a>', html)
        context_list = re.findall('</a> <p data-v-3376a1f2>(.*?)</p>', html)
        for title, context, context_url in zip(title_list, context_list, context_urls):
            data = {
                'title': title,
                'context': context,
                'source_url': url,
                'link': base_url + context_url,
                'article_type': tag_title,
                'article_source': tag_title,
                'img_url': '',
                'keyword': '',
                'posted_date': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                'create_time': int(time.time()),
                'create_datetime': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                'repush_times': DEFAULT_RE_PUSH_TIMES
            }
            filter_criteria = {'title': data['title']}
            if self.mongo.collection.count_documents(filter_criteria) == 0:
                self.mongo.collection.insert_one(data)
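
    # httpx.get raises httpx.RequestError on network failures, which would kill
    # the worker thread mid-run. A hedged retry wrapper (helper name and backoff
    # are my own, not from the original):
    #
    #     def _get_with_retry(self, url, attempts=3):
    #         for i in range(attempts):
    #             try:
    #                 return httpx.get(url, headers=self.headers, timeout=10)
    #             except httpx.RequestError:
    #                 time.sleep(2 ** i)  # exponential backoff between attempts
    #         return None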

    def foresightnews(self):
        # Fetch foresightnews articles from the front page.
        tag_title = 'foresightnews'
        base_url = 'https://foresightnews.pro'  # no trailing slash, so joining with the scraped paths avoids '//'
        # ------------------------------------------------------------------------------------------------------------
        url = 'https://foresightnews.pro/'
        print('Fetching url: {}'.format(url))
        resp = httpx.get(url, headers=self.headers, timeout=10)
        if resp.status_code != 200:
            print('{} - fetch failed, status code: {}'.format(tag_title, resp.status_code))
            return False
        resp.encoding = 'utf-8'
        html = unescape(resp.text)
        context_urls = re.findall('</div></div></div></a><a href="(.*?)" target="_blank"', html)
        title_list = re.findall('<div class="topic-body-title" data-v-3171afda>(.*?)</div>', html)
        context_list = re.findall('<div class="topic-body-content" data-v-3171afda>(.*?)</div>', html)
        posted_time_list = re.findall('div class="topic-time" data-v-3171afda>(.*?)</div>', html)
        for title, context, context_url, posted_time in zip(title_list, context_list, context_urls, posted_time_list):
            data = {
                'title': title,
                'context': context,
                'source_url': url,
                'link': base_url + context_url,
                'article_type': tag_title,
                'article_source': tag_title,
                'img_url': '',
                'keyword': '',
                # posted_time is scraped but not stored; posted_date records the crawl time instead
                'posted_date': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                'create_time': int(time.time()),
                'create_datetime': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                'repush_times': DEFAULT_RE_PUSH_TIMES
            }
            filter_criteria = {'title': data['title']}
            if self.mongo.collection.count_documents(filter_criteria) == 0:
                self.mongo.collection.insert_one(data)
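
    # The regexes in the scrapers above pin Vue scoped-style hashes
    # (data-v-559b28aa, data-v-3376a1f2, data-v-3171afda), which change whenever
    # the sites redeploy their frontends. An HTML parser keyed on class names
    # alone would be sturdier; a minimal sketch, assuming BeautifulSoup (bs4)
    # is installed:
    #
    #     from bs4 import BeautifulSoup
    #     soup = BeautifulSoup(html, 'html.parser')
    #     titles = [a.get_text(strip=True) for a in soup.select('a.n-title')]
    #     links = [a['href'] for a in soup.select('a.n-title')]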

    def main(self):
        # Run every site scraper in its own worker thread. Each scraper writes
        # new articles straight to Mongo in the data-dict format built above,
        # skipping any title that is already stored, i.e. already pushed.
        functions = [
            self.techflow,
            self.panewslab,
            self.foresightnews
        ]
        # Create and start one thread per scraper (the work is I/O-bound, so threads are enough).
        print('Creating and starting threads')
        threads = []
        for func in functions:
            thread = threading.Thread(target=func)
            thread.start()
            threads.append(thread)
        # Wait for all threads to finish.
        for thread in threads:
            thread.join()
        print('Run finished')


if __name__ == "__main__":
    m = MessageSearchKey()
    m.main()
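
# Typically run on a schedule; e.g. a crontab entry (interval and path are my
# own suggestion, not from the original) such as:
#     */10 * * * * /usr/bin/python3 /path/to/AutoInfo/web3_news.py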