chiphell.py 8.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236
  1. # -*- coding: utf-8 -*-
  2. '''
  3. chiphell
  4. '''
  5. import os
  6. import random
  7. import sys
  8. import threading
  9. import re
  10. sys.path.append(os.path.join(os.path.abspath(__file__).split('AutoInfo')[0] + 'AutoInfo'))
  11. from utils.utils import *
  12. config_json = LoadConfig().load_config()
  13. DEFAULT_RE_PUSH_TIMES = config_json['DEFAULT_RE_PUSH_TIMES']
  14. class CHIPHELL(object):
  15. def __init__(self):
  16. self.logs_handle = LogsHandle()
  17. self.now_day = time.strftime('%Y-%m-%d', time.localtime())
  18. self.base_url = 'https://www.chiphell.com/'
  19. self.href_url = 'portal.php?mod=list&catid={}'
  20. self.db = 'NEWS'
  21. self.collection = 'chiphell_info'
  22. self.headers = {
  23. 'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; Media Center PC 6.0; InfoPath.2; MS-RTC LM 8'
  24. }
  25. self.temp_datas = []
  26. def req(self, source, target):
  27. print(f'正在获取 {source} 数据')
  28. # sleep_time = random.uniform(10, 15)
  29. sleep_time = random.uniform(1, 2)
  30. print(f'睡眠 {sleep_time} 秒')
  31. time.sleep(sleep_time)
  32. result_list = []
  33. try:
  34. url = self.base_url + self.href_url.format(target)
  35. print(url)
  36. resp = httpx.get(url=url, headers=self.headers)
  37. except Exception as e:
  38. print(e)
  39. return 0
  40. if resp.status_code == 200:
  41. resp.encoding = 'utf-8'
  42. # print(resp.text)
  43. dl_list = re.findall('<dt class="xs2">([\S\s]*?)</dl>', resp.text)
  44. for dl in dl_list:
  45. if dl:
  46. url_list = re.findall('<a href="(.*?)" target="_blank" ', dl)
  47. title_list = re.findall('class="xi2" style="">(.*?)</a> </dt>', dl)
  48. img_url_list = re.findall('target="_blank"><img src="(.*?)"', dl)
  49. context_list = re.findall('class="tn" /></a></div>([\S\s]*?)</dd>', dl)
  50. post_time_list = re.findall('<span class="xg1"> (.*?)</span>', dl)
  51. for url, title, img_url, context, post_time in zip(url_list, title_list, img_url_list, context_list,
  52. post_time_list):
  53. # 清理正文内容的空格和换行等字符
  54. if context:
  55. for i in [' ', '\n']:
  56. context = context.replace(i, '')
  57. context = context.replace('\r', ' ')
  58. result_list.append({
  59. "title": title,
  60. "context": context,
  61. "source_url": self.base_url + url,
  62. 'link': '',
  63. "article_type": source.split(' - ')[1],
  64. "article_source": source.split(' - ')[0],
  65. "img_url": img_url,
  66. 'keyword': '',
  67. "posted_date": post_time,
  68. "create_time": int(time.time()),
  69. "create_datetime": datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
  70. "repush_times": DEFAULT_RE_PUSH_TIMES
  71. })
  72. else:
  73. print(resp.status_code)
  74. return 0
  75. return result_list
  76. def save_to_mongo(self, collection, source_data):
  77. print(f'正在处理 {self.collection} 数据')
  78. mongo = MongoHandle(db=self.db, collection=self.collection, del_db=False, del_collection=False, auto_remove=0)
  79. for data_to_insert in source_data:
  80. try:
  81. # 检查数据库中是否存在匹配的文档
  82. filter_criteria = {'title': data_to_insert.get('title', '')} # 确保 title 字段有值
  83. count = mongo.collection.count_documents(filter_criteria)
  84. if count == 0:
  85. # 如果没有找到匹配的文档,插入新文档
  86. result = mongo.collection.insert_one(data_to_insert)
  87. # 准备发送邮件的数据
  88. self.temp_datas.append(data_to_insert)
  89. except TypeError as te:
  90. print('\n%s' % te)
  91. self.logs_handle.logs_write('chiphell', '写入数据库报错: %s' % te, 'error', False)
  92. return 0
  93. print(f'处理 chiphell - {collection}数据完成')
  94. def send_to_email(self):
  95. text = '********************************************************\n'
  96. for data in self.temp_datas:
  97. text += '标题: {}\n'.format(data['title'])
  98. text += '正文: {}\n'.format(data['context'])
  99. text += '板块: {}\n'.format(data['article_source'])
  100. text += '类型: {}\n'.format(data['article_type'])
  101. text += '文章地址: {}\n'.format(data['source_url'])
  102. text += '文章时间: {}\n'.format(data['posted_date'])
  103. text += '获取时间: {}\n'.format(data['create_datetime'])
  104. text += '********************************************************\n\n'
  105. title = 'chiphell - info - ' + str(datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
  106. sub = 'chiphell - info'
  107. SendEmail(subject=sub, title=title, text=text).send()
  108. # GotifyNotifier(title=title, message=text, token_name='news').send_message()
  109. self.logs_handle.logs_write('chiphell', f'{title}-发送邮件完成', 'done', False)
  110. def main(self):
  111. category = {
  112. '评测': {
  113. '笔记本': '19',
  114. '机箱': '11',
  115. # '处理器': '13',
  116. # '散热器': '14',
  117. # '主板': '15',
  118. # '内存': '137',
  119. # '外设': '18',
  120. # '电源': '35',
  121. '存储': '23',
  122. '显示设备': '21',
  123. # '台式机': '88',
  124. '显卡': '10',
  125. # '相机': '116'
  126. },
  127. '电脑': {
  128. '配件开箱': '98',
  129. '整机搭建': '99',
  130. '桌面书房': '101'
  131. },
  132. '掌设': {
  133. '智能手机': '40',
  134. '智能穿戴': '89',
  135. '笔电平板': '41',
  136. # '周边附件': '92'
  137. },
  138. # '摄影': {
  139. # '微单卡片': '52',
  140. # '单反单电': '51',
  141. # '经典旁轴': '53',
  142. # '怀旧菲林': '54',
  143. # '影音摄像': '57',
  144. # '周边附件': '55'
  145. # },
  146. # '汽车': {
  147. # '买菜车': '58',
  148. # '商务车': '59',
  149. # '性能车': '63',
  150. # '旅行车': '60',
  151. # 'SUV': '61',
  152. # 'MPV': '95',
  153. # '摩托轻骑': '65',
  154. # '改装配件': '96'
  155. # },
  156. # '单车': {
  157. # '山地车': '108',
  158. # '公路车': '109',
  159. # '折叠车': '110',
  160. # '休旅车': '111'
  161. # },
  162. # '腕表': {
  163. # '机械表': '128',
  164. # '电子表': '126'
  165. # },
  166. '视听': {
  167. '耳机耳放': '71',
  168. '音箱功放': '72',
  169. # '解码转盘': '73',
  170. '随身设备': '74'
  171. },
  172. '美食': {
  173. '当地美食': '68',
  174. '世界美食': '117',
  175. '私房菜品': '69',
  176. '美食器材': '70'
  177. },
  178. # '家居': {
  179. # '家居': '132'
  180. # },
  181. }
  182. response_datas = {}
  183. for source1, tags in category.items():
  184. # source1作为表名, 先放到response_datas里面
  185. if source1 not in response_datas:
  186. response_datas[source1] = []
  187. for source2, target in tags.items():
  188. source = source1 + ' - ' + source2
  189. response_data = self.req(source, target)
  190. if response_data != 0:
  191. response_datas[source1] += response_data
  192. if response_datas:
  193. threads = []
  194. for k, v in response_datas.items():
  195. thread = threading.Thread(target=self.save_to_mongo, args=(k, v,))
  196. threads.append(thread)
  197. thread.start()
  198. for thread in threads:
  199. thread.join()
  200. if self.temp_datas:
  201. self.send_to_email()
  202. return None
  203. else:
  204. self.logs_handle.logs_write('chiphell - info', '获取数据为空', 'error', False)
  205. return False
  206. if __name__ == '__main__':
  207. CHIPHELL().main()