news_get_chiphell.py 8.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243
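# -*- coding: utf-8 -*-
'''
chiphell

Crawler for chiphell.com portal category pages: fetches the article lists,
stores previously unseen articles in MongoDB and can email a digest of them.
'''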
  5. import os
  6. import random
  7. import sys
  8. import threading
  9. import re
  10. import time
  11. from datetime import datetime
  12. import httpx
  13. sys.path.append(os.path.join(os.path.abspath(__file__).split('auto')[0] + 'auto'))
  14. from utils.utils_mongo_handle import MongoHandle
  15. from utils.utils_logs_handle import LogsHandle
  16. from utils.utils_send_email import SendEmail
  17. from base.base_load_config import load_config
  18. config_json = load_config()
  19. DEFAULT_RE_PUSH_TIMES = config_json['DEFAULT_RE_PUSH_TIMES']
  20. class CHIPHELL(object):
  21. def __init__(self):
  22. self.logs_handle = LogsHandle()
  23. self.now_day = time.strftime('%Y-%m-%d', time.localtime())
  24. self.base_url = 'https://www.chiphell.com/'
  25. self.href_url = 'portal.php?mod=list&catid={}'
  26. self.db = 'NEWS'
  27. self.collection = 'chiphell_info'
  28. self.headers = {
  29. 'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; Media Center PC 6.0; InfoPath.2; MS-RTC LM 8'
  30. }
  31. self.send_email_datas = []
  32. self.send_email_now = 0
  33. def req(self, source, target):
  34. print(f'正在获取 {source} 数据')
  35. # sleep_time = random.uniform(10, 15)
  36. sleep_time = random.uniform(1, 2)
  37. print(f'睡眠 {sleep_time} 秒')
  38. time.sleep(sleep_time)
  39. result_list = []
  40. try:
  41. resp = httpx.get(url=self.base_url + self.href_url.format(target), headers=self.headers)
  42. except Exception as e:
  43. print(e)
  44. return 0
  45. if resp.status_code == 200:
  46. resp.encoding = 'utf-8'
  47. # print(resp.text)
  48. dl_list = re.findall('<dt class="xs2">([\S\s]*?)</dl>', resp.text)
  49. for dl in dl_list:
  50. if dl:
  51. url_list = re.findall('<a href="(.*?)" target="_blank" ', dl)
  52. title_list = re.findall('class="xi2" style="">(.*?)</a> </dt>', dl)
  53. img_url_list = re.findall('target="_blank"><img src="(.*?)"', dl)
  54. context_list = re.findall('class="tn" /></a></div>([\S\s]*?)</dd>', dl)
  55. post_time_list = re.findall('<span class="xg1"> (.*?)</span>', dl)
  56. for url, title, img_url, context, post_time in zip(url_list, title_list, img_url_list, context_list,
  57. post_time_list):
  58. # 清理正文内容的空格和换行等字符
  59. if context:
  60. for i in [' ', '\n']:
  61. context = context.replace(i, '')
  62. context = context.replace('\r', ' ')
  63. result_list.append({
  64. "title": title,
  65. "context": context,
  66. "source_url": self.base_url + url,
  67. 'link': '',
  68. "article_type": source.split(' - ')[1],
  69. "article_source": source.split(' - ')[0],
  70. "img_url": img_url,
  71. 'keyword': '',
  72. "posted_date": post_time,
  73. "create_time": int(time.time()),
  74. "create_datetime": datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
  75. "repush_times": DEFAULT_RE_PUSH_TIMES
  76. })
  77. else:
  78. print(resp.status_code)
  79. return 0
  80. return result_list
  81. def save_to_mongo(self, collection, source_data):
  82. print(f'正在处理 {self.collection} 数据')
  83. mongo = MongoHandle(db=self.db, collection=self.collection, del_db=False, del_collection=False, auto_remove=0)
  84. for data_to_insert in source_data:
  85. try:
  86. # 检查数据库中是否存在匹配的文档
  87. filter_criteria = {'title': data_to_insert.get('title', '')} # 确保 title 字段有值
  88. count = mongo.collection.count_documents(filter_criteria)
  89. if count == 0:
  90. # 如果没有找到匹配的文档,插入新文档
  91. result = mongo.collection.insert_one(data_to_insert)
  92. # 准备发送邮件的数据
  93. self.send_email_datas.append(data_to_insert)
  94. except TypeError as te:
  95. print('\n%s' % te)
  96. self.logs_handle.logs_write('chiphell', '写入数据库报错: %s' % te, 'error', False)
  97. return 0
  98. print(f'处理 chiphell - {collection}数据完成')
  99. def send_to_email(self):
  100. title = 'chiphell - info'
  101. subject = 'chiphell - info'
  102. text = '********************************************************\n'
  103. for data in self.send_email_datas:
  104. text += '标题: {}\n'.format(data['title'])
  105. text += '正文: {}\n'.format(data['context'])
  106. text += '板块: {}\n'.format(data['article_source'])
  107. text += '类型: {}\n'.format(data['article_type'])
  108. text += '文章地址: {}\n'.format(data['source_url'])
  109. text += '文章时间: {}\n'.format(data['posted_date'])
  110. text += '获取时间: {}\n'.format(data['create_datetime'])
  111. text += '********************************************************\n\n'
  112. send_email = SendEmail(subject=subject, title=title, text=text)
  113. send_email.send()
  114. self.logs_handle.logs_write('chiphell', f'{title}-发送邮件完成', 'done', False)
  115. def main(self):
  116. category = {
  117. '评测': {
  118. '笔记本': '19',
  119. '机箱': '11',
  120. # '处理器': '13',
  121. # '散热器': '14',
  122. # '主板': '15',
  123. # '内存': '137',
  124. # '外设': '18',
  125. # '电源': '35',
  126. '存储': '23',
  127. '显示设备': '21',
  128. # '台式机': '88',
  129. '显卡': '10',
  130. # '相机': '116'
  131. },
  132. '电脑': {
  133. '配件开箱': '98',
  134. '整机搭建': '99',
  135. '桌面书房': '101'
  136. },
  137. '掌设': {
  138. '智能手机': '40',
  139. '智能穿戴': '89',
  140. '笔电平板': '41',
  141. # '周边附件': '92'
  142. },
  143. # '摄影': {
  144. # '微单卡片': '52',
  145. # '单反单电': '51',
  146. # '经典旁轴': '53',
  147. # '怀旧菲林': '54',
  148. # '影音摄像': '57',
  149. # '周边附件': '55'
  150. # },
  151. # '汽车': {
  152. # '买菜车': '58',
  153. # '商务车': '59',
  154. # '性能车': '63',
  155. # '旅行车': '60',
  156. # 'SUV': '61',
  157. # 'MPV': '95',
  158. # '摩托轻骑': '65',
  159. # '改装配件': '96'
  160. # },
  161. # '单车': {
  162. # '山地车': '108',
  163. # '公路车': '109',
  164. # '折叠车': '110',
  165. # '休旅车': '111'
  166. # },
  167. # '腕表': {
  168. # '机械表': '128',
  169. # '电子表': '126'
  170. # },
  171. '视听': {
  172. '耳机耳放': '71',
  173. '音箱功放': '72',
  174. # '解码转盘': '73',
  175. '随身设备': '74'
  176. },
  177. '美食': {
  178. '当地美食': '68',
  179. '世界美食': '117',
  180. '私房菜品': '69',
  181. '美食器材': '70'
  182. },
  183. # '家居': {
  184. # '家居': '132'
  185. # },
  186. }
  187. response_datas = {}
  188. for source1, tags in category.items():
  189. # source1作为表名, 先放到response_datas里面
  190. if source1 not in response_datas:
  191. response_datas[source1] = []
  192. for source2, target in tags.items():
  193. source = source1 + ' - ' + source2
  194. response_data = self.req(source, target)
  195. if response_data != 0:
  196. response_datas[source1] += response_data
  197. if response_datas:
  198. threads = []
  199. for k, v in response_datas.items():
  200. thread = threading.Thread(target=self.save_to_mongo, args=(k, v,))
  201. threads.append(thread)
  202. thread.start()
  203. for thread in threads:
  204. thread.join()
  205. else:
  206. self.logs_handle.logs_write('chiphell', '获取数据为空', 'error', False)
  207. return False
  208. # 如果 self.send_email_datas 中有数据, 则发送邮件
  209. if self.send_email_now:
  210. if self.send_email_datas:
  211. self.send_to_email()
  212. if __name__ == '__main__':
  213. CHIPHELL().main()