utils_check_base.py 2.4 KB

# -*- coding: utf-8 -*-
"""
Message-module base class, used for opening the browser and related operations.
"""
import os
import random
import sys
import time

from playwright.sync_api import sync_playwright

# Add the project root (the 'auto' directory) to sys.path so that utils can be imported.
sys.path.append(os.path.abspath(__file__).split('auto')[0] + 'auto')
from utils.utils_logs_handle import LogsHandle


class CryptoCrawler:
    def __init__(self, url_list, selectors, check_difference=False, headless=True, proxy=False):
        self.url_list = url_list
        self.selectors = selectors
        self.check_difference = check_difference  # switch: whether to check if the data has changed
        self.data_difference = False              # result: whether the data changed (defaults to False)
        self.logs_handle = LogsHandle()           # logging helper
        self.db = 'CHECK'
        self.collection = 'check'
        self.headless = headless
        self.proxy = proxy

    def main(self):
        with sync_playwright() as playwright:
            if self.proxy:
                browser = playwright.webkit.launch(headless=self.headless, proxy={'server': '127.0.0.1:7890'})
            else:
                browser = playwright.webkit.launch(headless=self.headless)
            context = browser.new_context(viewport={'width': 1920, 'height': 1080})
            page = context.new_page()
            all_data = []
            for url_info in self.url_list:
                for key, url in url_info.items():
                    result_list = []
                    try:
                        page.goto(url)
                        page.wait_for_load_state('load')
                        time.sleep(5)  # give the page extra time to finish rendering
                        for selector in self.selectors:
                            element = page.query_selector(selector)
                            if element:
                                # text_content() can return None, so guard before stripping
                                res = (element.text_content() or '').strip()
                                result_list.append({key: res})
                    except Exception as e:
                        err_str = f"Error fetching {url}: {e}"
                        self.logs_handle.logs_write(self.collection, err_str, 'error', False)
                        continue
                    if result_list:
                        all_data.append(result_list)
                    time.sleep(random.randint(1, 3))  # random pause between pages
            browser.close()
        if all_data:
            return all_data
        else:
            return None
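

if __name__ == '__main__':
    # Illustrative usage sketch only: the URLs and CSS selectors below are
    # hypothetical placeholders, not values taken from this project.
    urls = [
        {'btc': 'https://example.com/price/btc'},
        {'eth': 'https://example.com/price/eth'},
    ]
    selectors = ['span.price', 'span.change-24h']
    crawler = CryptoCrawler(urls, selectors, headless=True, proxy=False)
    data = crawler.main()
    print(data)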