DawnC committed on
Commit
14ee6e4
·
verified ·
1 Parent(s): f222f88

Update device_manager.py

Browse files
Files changed (1) hide show
  1. device_manager.py +32 -22
device_manager.py CHANGED
@@ -1,6 +1,5 @@
1
  from functools import wraps
2
  import torch
3
- from huggingface_hub import HfApi
4
  import os
5
  import logging
6
 
@@ -22,48 +21,59 @@ class DeviceManager:
22
 
23
  self._initialized = True
24
  self._current_device = None
 
25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  def check_zero_gpu_availability(self):
 
27
  try:
28
- # 檢查 Hugging Face Space 環境變數
29
- if not os.environ.get('SPACE_ID'):
30
- return False
31
-
32
- # 檢查是否在 Spaces 環境中並且啟用了 ZeroGPU
33
- if os.environ.get('ZERO_GPU_AVAILABLE') == '1':
34
- return True
35
-
36
- return False
37
-
38
  except Exception as e:
39
- logger.warning(f"Error checking ZeroGPU availability: {e}")
40
- return False
41
 
42
  def get_optimal_device(self):
 
43
  if self._current_device is None:
44
  if self.check_zero_gpu_availability():
45
  try:
46
- # 確保 CUDA 可用
47
- if torch.cuda.is_available():
48
- self._current_device = torch.device('cuda')
49
- logger.info("Using ZeroGPU")
50
- else:
51
- raise RuntimeError("CUDA not available")
52
  except Exception as e:
53
- logger.warning(f"Failed to initialize ZeroGPU: {e}")
54
  self._current_device = torch.device('cpu')
55
- logger.info("Fallback to CPU due to GPU initialization failure")
56
  else:
57
  self._current_device = torch.device('cpu')
58
  logger.info("Using CPU (ZeroGPU not available)")
59
  return self._current_device
60
 
61
  def move_to_device(self, tensor_or_model):
 
62
  device = self.get_optimal_device()
63
  try:
64
  if hasattr(tensor_or_model, 'to'):
65
  return tensor_or_model.to(device)
66
- except Exception:
 
67
  self._current_device = torch.device('cpu')
68
  if hasattr(tensor_or_model, 'to'):
69
  return tensor_or_model.to('cpu')
 
1
  from functools import wraps
2
  import torch
 
3
  import os
4
  import logging
5
 
 
21
 
22
  self._initialized = True
23
  self._current_device = None
24
+ self.initialize_zero_gpu()
25
 
26
+ def initialize_zero_gpu(self):
27
+ """初始化 ZeroGPU"""
28
+ try:
29
+ # 檢查是否在 Hugging Face Spaces 環境中
30
+ if os.environ.get('SPACE_ID'):
31
+ # 嘗試初始化 ZeroGPU
32
+ os.environ['CUDA_VISIBLE_DEVICES'] = '0'
33
+ # 設置必要的環境變數
34
+ os.environ['ZERO_GPU'] = '1'
35
+ logger.info("ZeroGPU environment initialized")
36
+ except Exception as e:
37
+ logger.warning(f"Failed to initialize ZeroGPU environment: {e}")
38
+
39
  def check_zero_gpu_availability(self):
40
+ """檢查 ZeroGPU 是否可用"""
41
  try:
42
+ if os.environ.get('SPACE_ID') and os.environ.get('ZERO_GPU') == '1':
43
+ # 確保 CUDA 運行時環境正確設置
44
+ if torch.cuda.is_available():
45
+ torch.cuda.init()
46
+ return True
 
 
 
 
 
47
  except Exception as e:
48
+ logger.warning(f"ZeroGPU check failed: {e}")
49
+ return False
50
 
51
  def get_optimal_device(self):
52
+ """獲取最佳可用設備"""
53
  if self._current_device is None:
54
  if self.check_zero_gpu_availability():
55
  try:
56
+ self._current_device = torch.device('cuda')
57
+ logger.info("Using ZeroGPU")
58
+ # 嘗試進行一次小規模的 CUDA 操作來驗證
59
+ torch.zeros(1).cuda()
 
 
60
  except Exception as e:
61
+ logger.warning(f"Failed to use ZeroGPU: {e}")
62
  self._current_device = torch.device('cpu')
63
+ logger.info("Fallback to CPU")
64
  else:
65
  self._current_device = torch.device('cpu')
66
  logger.info("Using CPU (ZeroGPU not available)")
67
  return self._current_device
68
 
69
  def move_to_device(self, tensor_or_model):
70
+ """將張量或模型移動到最佳設備"""
71
  device = self.get_optimal_device()
72
  try:
73
  if hasattr(tensor_or_model, 'to'):
74
  return tensor_or_model.to(device)
75
+ except Exception as e:
76
+ logger.warning(f"Failed to move to {device}, falling back to CPU: {e}")
77
  self._current_device = torch.device('cpu')
78
  if hasattr(tensor_or_model, 'to'):
79
  return tensor_or_model.to('cpu')