|
import platform |
|
import subprocess |
|
import os |
|
import psutil |
|
import torch |
|
from typing import Optional, Dict, Tuple, Union |
|
|
|
# A single metric reading: raw integers (e.g. byte counts) or floats (GB, %, °C).
NumericValue = Union[int, float]

# Mapping of metric name (e.g. 'memory_used_gb', 'temperature') to its value.
MetricsDict = Dict[str, NumericValue]
|
|
|
class SystemMonitor:
    """Collect GPU / CPU / RAM status for NVIDIA, Apple Silicon and AMD hosts.

    All collectors are best-effort: they are designed not to raise, returning a
    ``(display_name, metrics, warning)`` triple instead, where ``warning``
    explains any degraded or proxied data. Memory values are in GiB,
    temperature in °C, utilization in percent.
    """

    @staticmethod
    def _torch_fallback_metrics() -> MetricsDict:
        """Metrics derived from PyTorch alone, used when nvidia-smi is unusable.

        'memory_used_gb' is the memory *this process* has allocated via
        PyTorch (not the device-wide figure); reserved/temperature/utilization
        are not available through PyTorch and are reported as 0.
        """
        cuda_ok = torch.cuda.is_available()
        return {
            'memory_used_gb': torch.cuda.memory_allocated(0) / 1024**3 if cuda_ok else 0,
            'memory_total_gb': torch.cuda.get_device_properties(0).total_memory / 1024**3 if cuda_ok else 0,
            'memory_reserved_gb': 0.0,
            'temperature': 0.0,
            'utilization': 0.0,
        }

    @staticmethod
    def _zero_metrics() -> MetricsDict:
        """All-zero metrics dict for the 'nothing could be measured' case."""
        return {
            'memory_used_gb': 0.0,
            'memory_total_gb': 0.0,
            'memory_reserved_gb': 0.0,
            'temperature': 0.0,
            'utilization': 0.0,
        }

    @staticmethod
    def get_nvidia_gpu_info() -> Tuple[str, MetricsDict, Optional[str]]:
        """Get NVIDIA GPU information and metrics for GPU 0.

        Returns:
            (gpu_name, metrics, warning) — ``warning`` is None only when
            nvidia-smi produced a full, well-formed reading.
        """
        gpu_name_from_torch = "NVIDIA GPU (name unavailable)"
        try:
            gpu_name_from_torch = f"{torch.cuda.get_device_name(0)}"
        except Exception:
            # If PyTorch cannot even name the device, nvidia-smi is very
            # unlikely to work either; fall back to PyTorch-only numbers.
            return (
                gpu_name_from_torch,
                SystemMonitor._torch_fallback_metrics(),
                "Could not get GPU name via PyTorch. nvidia-smi likely to fail or has failed. Displaying basic PyTorch memory (application-specific).",
            )

        nvidia_smi_common_args = [
            'nvidia-smi',
            '--query-gpu=memory.used,memory.total,memory.reserved,temperature.gpu,utilization.gpu',
            '--format=csv,nounits,noheader'
        ]

        smi_output_str = None
        try:
            # Preferred: query GPU 0 explicitly.
            smi_output_str = subprocess.check_output(
                nvidia_smi_common_args + ['--id=0'],
                encoding='utf-8', timeout=1.5, stderr=subprocess.PIPE
            )
        except (subprocess.SubprocessError, FileNotFoundError, ValueError):
            try:
                # Some setups reject --id; retry unfiltered and keep the first line.
                smi_output_str = subprocess.check_output(
                    nvidia_smi_common_args,
                    encoding='utf-8', timeout=1.5, stderr=subprocess.PIPE
                )
                if smi_output_str:
                    smi_output_str = smi_output_str.strip().split('\n')[0]
            except (subprocess.SubprocessError, FileNotFoundError, ValueError):
                return (
                    gpu_name_from_torch,
                    SystemMonitor._torch_fallback_metrics(),
                    "nvidia-smi failed. GPU Memory Used is PyTorch specific (not total). Other GPU stats unavailable.",
                )

        warning_message = None
        if smi_output_str:
            parts = smi_output_str.strip().split(',')
            parsed = None
            if len(parts) == 5:
                try:
                    parsed = [float(p) for p in parts]
                except ValueError:
                    # e.g. "[N/A]" fields — treat the line as malformed instead
                    # of letting the ValueError escape to the caller.
                    parsed = None
            if parsed is not None:
                memory_used_mib, memory_total_mib, memory_reserved_mib, temp, util = parsed
                metrics = {
                    'memory_used_gb': memory_used_mib / 1024,
                    'memory_total_gb': memory_total_mib / 1024,
                    'memory_reserved_gb': memory_reserved_mib / 1024,
                    'temperature': temp,
                    'utilization': util
                }
            else:
                warning_message = "nvidia-smi output format unexpected. Some GPU stats may be missing or inaccurate."
                metrics = SystemMonitor._torch_fallback_metrics()
                # Salvage the two leading memory fields if they parse.
                if len(parts) >= 2:
                    try:
                        metrics['memory_used_gb'] = float(parts[0]) / 1024
                    except ValueError:
                        pass
                    try:
                        metrics['memory_total_gb'] = float(parts[1]) / 1024
                    except ValueError:
                        pass
        else:
            metrics = SystemMonitor._zero_metrics()
            warning_message = "Failed to get any output from nvidia-smi."

        return gpu_name_from_torch, metrics, warning_message

    @staticmethod
    def get_mac_gpu_info() -> Tuple[str, MetricsDict, Optional[str]]:
        """Get Apple Silicon GPU information without requiring sudo.

        Unified-memory stats come from psutil; GPU load is proxied by CPU
        usage because there is no public, sudo-free per-GPU counter.
        """
        try:
            memory = psutil.virtual_memory()
            metrics = {
                'memory_total_gb': memory.total / (1024**3),
                'memory_used_gb': memory.used / (1024**3),
                'utilization': psutil.cpu_percent(),  # CPU usage as GPU-load proxy
                'memory_reserved_gb': 0.0,  # not measurable without powermetrics/sudo
                'temperature': 0.0
            }
            # The utilization figure is *always* a CPU proxy, so always warn.
            # (Previously this compared against a second cpu_percent() sample,
            # which made the warning appear only intermittently.)
            warning_message = "Mac GPU Load is proxied by CPU Usage."
        except Exception:
            metrics = SystemMonitor._zero_metrics()
            warning_message = "Could not retrieve Mac system info."

        return "Apple Silicon GPU", metrics, warning_message

    @staticmethod
    def _amd_sysfs_fallback(metrics: MetricsDict, source: str, warning_message: str) -> Tuple[str, str]:
        """Fill *metrics* in place from /sys/class/drm/card0; return (source, warning).

        Each counter is read independently so a missing/unreadable file only
        loses that one metric.
        """
        base_path = "/sys/class/drm/card0/device"
        sysfs_found_any = False
        try:
            with open(f"{base_path}/hwmon/hwmon0/temp1_input") as f:
                # hwmon reports millidegrees Celsius.
                metrics['temperature'] = float(f.read().strip()) / 1000
                sysfs_found_any = True
        except (FileNotFoundError, PermissionError, ValueError):
            pass

        try:
            with open(f"{base_path}/mem_info_vram_total") as f:
                metrics['memory_total_gb'] = int(f.read().strip()) / (1024**3)
            with open(f"{base_path}/mem_info_vram_used") as f:
                metrics['memory_used_gb'] = int(f.read().strip()) / (1024**3)
            sysfs_found_any = True
        except (FileNotFoundError, PermissionError, ValueError):
            pass

        try:
            with open(f"{base_path}/gpu_busy_percent") as f:
                metrics['utilization'] = float(f.read().strip())
            sysfs_found_any = True
        except (FileNotFoundError, PermissionError, ValueError):
            pass

        if sysfs_found_any:
            source = "sysfs"
            warning_message += "Using sysfs (may be incomplete)."
        else:
            warning_message += "sysfs also failed or provided no data."
        return source, warning_message

    @staticmethod
    def get_amd_gpu_info() -> Tuple[str, MetricsDict, Optional[str]]:
        """Get AMD GPU information.

        Tries rocm-smi first, then (on Linux) raw sysfs counters for card0.
        The backend that produced the data is recorded in the display name.
        """
        metrics = SystemMonitor._zero_metrics()
        warning_message = None
        source = "unknown"

        try:
            try:
                result = subprocess.check_output(
                    ['rocm-smi', '--showmeminfo', 'vram', '--showtemp', '--showuse'],
                    encoding='utf-8', timeout=1.5, stderr=subprocess.PIPE
                )
                # Parse only GPU[0] lines; expected shapes (roughly):
                #   GPU[0] ... VRAM Usage: <used>M / <total>M
                #   GPU[0] ... Temperature: <t>c
                #   GPU[0] ... GPU Use: <u>%
                for line in result.strip().split('\n'):
                    if not line.startswith("GPU[0]"):
                        continue
                    if "VRAM Usage:" in line:
                        mem_parts = line.split("VRAM Usage:")[1].strip().split('/')
                        metrics['memory_used_gb'] = float(mem_parts[0].replace('M', '')) / 1024
                        metrics['memory_total_gb'] = float(mem_parts[1].replace('M', '')) / 1024
                        source = "rocm-smi"
                    elif "Temperature:" in line:
                        metrics['temperature'] = float(line.split("Temperature:")[1].strip().replace('c', ''))
                        source = "rocm-smi"
                    elif "GPU Use:" in line:
                        metrics['utilization'] = float(line.split("GPU Use:")[1].strip().replace('%', ''))
                        source = "rocm-smi"
                if source != "rocm-smi":
                    warning_message = "rocm-smi ran but output parsing failed."
            except (subprocess.SubprocessError, FileNotFoundError, ValueError):
                warning_message = "rocm-smi not found or failed. "
                if platform.system() == "Linux":
                    source, warning_message = SystemMonitor._amd_sysfs_fallback(
                        metrics, source, warning_message
                    )
                else:
                    warning_message += "Not on Linux, no sysfs fallback."
        except Exception:
            warning_message = (warning_message or "") + " Unexpected error in AMD GPU info gathering."

        return f"AMD GPU ({source})", metrics, warning_message

    @staticmethod
    def is_amd_gpu() -> bool:
        """Heuristically detect an AMD GPU.

        True when rocm-smi is runnable, or (Linux) when card0's PCI vendor id
        is AMD/ATI (0x1002). Never raises.
        """
        try:
            try:
                subprocess.check_call(
                    ['rocm-smi', '-h'],
                    stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, timeout=0.5
                )
                return True
            except (subprocess.SubprocessError, FileNotFoundError):
                pass

            if platform.system() == "Linux" and os.path.exists('/sys/class/drm/card0/device/vendor'):
                with open('/sys/class/drm/card0/device/vendor', 'r') as f:
                    # 0x1002 is the AMD/ATI PCI vendor id.
                    return '0x1002' in f.read()
            return False
        except Exception:
            # Was a bare `except:`; detection must stay best-effort but should
            # not swallow KeyboardInterrupt/SystemExit.
            return False

    @classmethod
    def get_system_info(cls) -> str:
        """Get detailed system status with support for different GPU types.

        Returns a multi-line, emoji-prefixed report. Never raises: any
        unexpected failure is folded into the returned string.
        """
        gpu_name_display: Optional[str] = None
        metrics: MetricsDict = {}
        gpu_warning: Optional[str] = None

        try:
            is_apple_silicon = platform.system() == "Darwin" and platform.processor() == "arm"

            # Probe GPU vendors in priority order: CUDA, Apple Silicon, AMD.
            if torch.cuda.is_available():
                gpu_name_display, metrics, gpu_warning = cls.get_nvidia_gpu_info()
            elif is_apple_silicon:
                gpu_name_display, metrics, gpu_warning = cls.get_mac_gpu_info()
            elif cls.is_amd_gpu():
                gpu_name_display, metrics, gpu_warning = cls.get_amd_gpu_info()
            # else: no recognized GPU; gpu_name_display stays None.

            if gpu_name_display:
                gpu_info_lines = [f"🎮 GPU: {gpu_name_display}"]

                if 'memory_used_gb' in metrics and 'memory_total_gb' in metrics:
                    mem_label = "Unified Memory" if is_apple_silicon else "GPU Memory"
                    gpu_info_lines.append(
                        f"📊 {mem_label}: {metrics.get('memory_used_gb', 0.0):.1f}GB / {metrics.get('memory_total_gb', 0.0):.1f}GB"
                    )

                # A zero temperature means "unavailable"; don't display it.
                if 'temperature' in metrics and metrics.get('temperature', 0.0) > 0:
                    gpu_info_lines.append(f"🌡️ GPU Temp: {metrics.get('temperature', 0.0):.0f}°C")

                if 'utilization' in metrics:
                    gpu_info_lines.append(f"⚡ GPU Load: {metrics.get('utilization', 0.0):.0f}%")

                if gpu_warning:
                    gpu_info_lines.append(f"⚠️ {gpu_warning}")

                gpu_section = "\n".join(gpu_info_lines) + "\n"
            else:
                gpu_section = "🎮 GPU: No dedicated GPU detected or supported\n"

            cpu_count = psutil.cpu_count(logical=False)
            cpu_threads = psutil.cpu_count(logical=True)
            cpu_info = f"💻 CPU: {cpu_count or 'N/A'} Cores, {cpu_threads or 'N/A'} Threads\n"
            cpu_usage = f"⚡ CPU Usage: {psutil.cpu_percent()}%\n"

            ram = psutil.virtual_memory()
            ram_used_gb = ram.used / (1024**3)
            ram_total_gb = ram.total / (1024**3)
            ram_info = f"🎯 System RAM: {ram_used_gb:.1f}GB / {ram_total_gb:.1f}GB ({ram.percent}%)"

            return f"{gpu_section}{cpu_info}{cpu_usage}{ram_info}"

        except Exception as e:
            return f"Error collecting system info: {str(e)}"