# news_get_chiphell.py (8.5 KB)
  1. # -*- coding: utf-8 -*-
  2. '''
  3. chiphell
  4. '''
  5. import os
  6. import random
  7. import sys
  8. import threading
  9. sys.path.append(os.path.join(os.getcwd().split('auto')[0], 'auto'))
  10. import re
  11. import time
  12. from datetime import datetime
  13. import httpx
  14. from tools_mongo_handle import MongoHandle
  15. from tools_logs_handle import LogsHandle
  16. from tools_send_email import SendEmail
  17. class CHIPHELL(object):
  18. def __init__(self):
  19. self.logs_handle = LogsHandle()
  20. self.now_day = time.strftime('%Y-%m-%d', time.localtime())
  21. self.base_url = 'https://www.chiphell.com/'
  22. self.href_url = 'portal.php?mod=list&catid={}'
  23. self.db = 'NEWS'
  24. self.collection = 'chiphell_info'
  25. self.headers = {
  26. 'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; Media Center PC 6.0; InfoPath.2; MS-RTC LM 8'
  27. }
  28. self.send_email_datas = []
  29. def req(self, source, target):
  30. print(f'正在获取 {source} 数据')
  31. # sleep_time = random.uniform(10, 15)
  32. sleep_time = random.uniform(1, 2)
  33. print(f'睡眠 {sleep_time} 秒')
  34. time.sleep(sleep_time)
  35. result_list = []
  36. try:
  37. resp = httpx.get(url=self.base_url + self.href_url.format(target), headers=self.headers)
  38. except Exception as e:
  39. print(e)
  40. return 0
  41. if resp.status_code == 200:
  42. resp.encoding = 'utf-8'
  43. # print(resp.text)
  44. dl_list = re.findall('<dt class="xs2">([\S\s]*?)</dl>', resp.text)
  45. for dl in dl_list:
  46. if dl:
  47. url_list = re.findall('<a href="(.*?)" target="_blank" ', dl)
  48. title_list = re.findall('class="xi2" style="">(.*?)</a> </dt>', dl)
  49. img_url_list = re.findall('target="_blank"><img src="(.*?)"', dl)
  50. context_list = re.findall('class="tn" /></a></div>([\S\s]*?)</dd>', dl)
  51. post_time_list = re.findall('<span class="xg1"> (.*?)</span>', dl)
  52. for url, title, img_url, context, post_time in zip(url_list, title_list, img_url_list, context_list, post_time_list):
  53. # 清理正文内容的空格和换行等字符
  54. if context:
  55. for i in [' ', '\n']:
  56. context = context.replace(i, '')
  57. context = context.replace('\r', ' ')
  58. result_list.append({
  59. "title": title,
  60. "context": context,
  61. "source_url": self.base_url + url,
  62. 'link': '',
  63. "article_type": source.split(' - ')[1],
  64. "article_source": source.split(' - ')[0],
  65. "img_url": img_url,
  66. 'keyword': '',
  67. "posted_date": post_time,
  68. "create_time": int(time.time()),
  69. "create_datetime": datetime.now().strftime('%Y-%m-%d %H:%M:%S')
  70. })
  71. else:
  72. print(resp.status_code)
  73. return 0
  74. return result_list
  75. def save_to_mongo(self, collection, source_data):
  76. print(f'正在处理 {self.collection} 数据')
  77. mongo = MongoHandle(db=self.db, collection=self.collection, del_db=False, del_collection=False, auto_remove=0)
  78. for data_to_insert in source_data:
  79. try:
  80. # 检查数据库中是否存在匹配的文档
  81. filter_criteria = {'title': data_to_insert.get('title', '')} # 确保 title 字段有值
  82. count = mongo.collection.count_documents(filter_criteria)
  83. if count == 0:
  84. # 如果没有找到匹配的文档,插入新文档
  85. result = mongo.collection.insert_one(data_to_insert)
  86. # 准备发送邮件的数据
  87. self.send_email_datas.append(data_to_insert)
  88. except TypeError as te:
  89. print('\n%s' % te)
  90. self.logs_handle.logs_write('chiphell', '写入数据库报错: %s' % te, 'error', False)
  91. return 0
  92. print(f'处理 chiphell - {collection}数据完成')
  93. def send_to_email(self):
  94. title = 'chiphell - info'
  95. subject = 'chiphell - info'
  96. text = '********************************************************\n'
  97. for data in self.send_email_datas:
  98. text += '标题: {}\n'.format(data['title'])
  99. text += '正文: {}\n'.format(data['context'])
  100. text += '板块: {}\n'.format(data['article_source'])
  101. text += '类型: {}\n'.format(data['article_type'])
  102. text += '文章地址: {}\n'.format(data['source_url'])
  103. text += '文章时间: {}\n'.format(data['posted_date'])
  104. text += '获取时间: {}\n'.format(data['create_datetime'])
  105. text += '********************************************************\n\n'
  106. send_email = SendEmail(subject=subject, title=title, text=text)
  107. send_email.send()
  108. self.logs_handle.logs_write('chiphell', f'{title}-发送邮件完成', 'done', False)
  109. def main(self):
  110. category = {
  111. '评测': {
  112. '笔记本': '19',
  113. '机箱': '11',
  114. # '处理器': '13',
  115. # '散热器': '14',
  116. # '主板': '15',
  117. # '内存': '137',
  118. # '外设': '18',
  119. # '电源': '35',
  120. '存储': '23',
  121. '显示设备': '21',
  122. # '台式机': '88',
  123. '显卡': '10',
  124. # '相机': '116'
  125. },
  126. '电脑': {
  127. '配件开箱': '98',
  128. '整机搭建': '99',
  129. '桌面书房': '101'
  130. },
  131. '掌设': {
  132. '智能手机': '40',
  133. '智能穿戴': '89',
  134. '笔电平板': '41',
  135. # '周边附件': '92'
  136. },
  137. # '摄影': {
  138. # '微单卡片': '52',
  139. # '单反单电': '51',
  140. # '经典旁轴': '53',
  141. # '怀旧菲林': '54',
  142. # '影音摄像': '57',
  143. # '周边附件': '55'
  144. # },
  145. # '汽车': {
  146. # '买菜车': '58',
  147. # '商务车': '59',
  148. # '性能车': '63',
  149. # '旅行车': '60',
  150. # 'SUV': '61',
  151. # 'MPV': '95',
  152. # '摩托轻骑': '65',
  153. # '改装配件': '96'
  154. # },
  155. # '单车': {
  156. # '山地车': '108',
  157. # '公路车': '109',
  158. # '折叠车': '110',
  159. # '休旅车': '111'
  160. # },
  161. # '腕表': {
  162. # '机械表': '128',
  163. # '电子表': '126'
  164. # },
  165. '视听': {
  166. '耳机耳放': '71',
  167. '音箱功放': '72',
  168. # '解码转盘': '73',
  169. '随身设备': '74'
  170. },
  171. '美食': {
  172. '当地美食': '68',
  173. '世界美食': '117',
  174. '私房菜品': '69',
  175. '美食器材': '70'
  176. },
  177. # '家居': {
  178. # '家居': '132'
  179. # },
  180. }
  181. response_datas = {}
  182. for source1, tags in category.items():
  183. # source1作为表名, 先放到response_datas里面
  184. if source1 not in response_datas:
  185. response_datas[source1] = []
  186. for source2, target in tags.items():
  187. source = source1 + ' - ' + source2
  188. response_data = self.req(source, target)
  189. if response_data != 0:
  190. response_datas[source1] += response_data
  191. if response_datas:
  192. threads = []
  193. for k, v in response_datas.items():
  194. thread = threading.Thread(target=self.save_to_mongo, args=(k, v,))
  195. threads.append(thread)
  196. thread.start()
  197. for thread in threads:
  198. thread.join()
  199. else:
  200. self.logs_handle.logs_write('chiphell', '获取数据为空', 'error', False)
  201. return False
  202. # 如果 self.send_email_datas 中有数据, 则发送邮件
  203. if self.send_email_datas:
  204. self.send_to_email()
  205. if __name__ == '__main__':
  206. C = CHIPHELL()
  207. C.main()