import platform
import subprocess
import os
import psutil
import torch
from typing import Optional, Dict, Tuple, Union

NumericValue = Union[int, float]
MetricsDict = Dict[str, NumericValue]

class SystemMonitor:
    @staticmethod
    def get_nvidia_gpu_info() -> Tuple[str, MetricsDict, Optional[str]]:
        """Get NVIDIA GPU information and metrics for GPU 0."""
        metrics = {}
        gpu_name_from_torch = "NVIDIA GPU (name unavailable)"
        warning_message = None # To indicate if nvidia-smi failed and we're using PyTorch fallback

        try:
            gpu_name_from_torch = f"{torch.cuda.get_device_name(0)}"
        except Exception:
            # If even the name fails, nvidia-smi is highly likely to fail too.
            # Prepare basic PyTorch metrics as the ultimate fallback.
            metrics = {
                'memory_used_gb': torch.cuda.memory_allocated(0) / 1024**3 if torch.cuda.is_available() else 0,
                'memory_total_gb': torch.cuda.get_device_properties(0).total_memory / 1024**3 if torch.cuda.is_available() else 0,
                # Add placeholders for other metrics to maintain UI symmetry if nvidia-smi fails
                'memory_reserved_gb': 0.0,
                'temperature': 0.0,
                'utilization': 0.0
            }
            warning_message = "Could not get GPU name via PyTorch. nvidia-smi likely to fail or has failed. Displaying basic PyTorch memory (application-specific)."
            return gpu_name_from_torch, metrics, warning_message

        # Query for memory.used, memory.total, memory.reserved, temperature.gpu, utilization.gpu
        nvidia_smi_common_args = [
            'nvidia-smi',
            '--query-gpu=memory.used,memory.total,memory.reserved,temperature.gpu,utilization.gpu',
            '--format=csv,nounits,noheader'
        ]
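        # Illustrative output line with csv,nounits,noheader (values are made up):
        # "2020, 16384, 120, 45, 12" -> MiB used, MiB total, MiB reserved, deg C, % util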

        smi_output_str = None
        try:
            # Attempt 1: Query specific GPU 0
            smi_output_str = subprocess.check_output(
                nvidia_smi_common_args + ['--id=0'],
                encoding='utf-8', timeout=1.5, stderr=subprocess.PIPE
            )
        except (subprocess.SubprocessError, FileNotFoundError, ValueError) as e1:
            # print(f"nvidia-smi with --id=0 failed: {type(e1).__name__}. Trying general query.")
            try:
                # Attempt 2: Query all GPUs and parse the first line
                smi_output_str = subprocess.check_output(
                    nvidia_smi_common_args, # Without --id=0
                    encoding='utf-8', timeout=1.5, stderr=subprocess.PIPE
                )
                if smi_output_str:
                    smi_output_str = smi_output_str.strip().split('\n')[0] # Take the first line
            except (subprocess.SubprocessError, FileNotFoundError, ValueError) as e2:
                # print(f"nvidia-smi (general query) also failed: {type(e2).__name__}. Falling back to torch.cuda.")
                # Fallback to basic CUDA info from PyTorch, plus placeholders for UI
                metrics = {
                    'memory_used_gb': torch.cuda.memory_allocated(0) / 1024**3 if torch.cuda.is_available() else 0,
                    'memory_total_gb': torch.cuda.get_device_properties(0).total_memory / 1024**3 if torch.cuda.is_available() else 0,
                    'memory_reserved_gb': 0.0, # Placeholder
                    'temperature': 0.0,       # Placeholder
                    'utilization': 0.0        # Placeholder
                }
                warning_message = "nvidia-smi failed. GPU Memory Used is PyTorch specific (not total). Other GPU stats unavailable."
                return gpu_name_from_torch, metrics, warning_message

        if smi_output_str:
            parts = smi_output_str.strip().split(',')
            if len(parts) == 5: # memory.used, memory.total, memory.reserved, temp, util
                memory_used_mib, memory_total_mib, memory_reserved_mib, temp, util = map(float, parts)
                metrics = {
                    'memory_used_gb': memory_used_mib / 1024,
                    'memory_total_gb': memory_total_mib / 1024,
                    'memory_reserved_gb': memory_reserved_mib / 1024, # This is from nvidia-smi's memory.reserved
                    'temperature': temp,
                    'utilization': util
                }
            else:
                # print(f"Unexpected nvidia-smi output format: {smi_output_str}. Parts: {len(parts)}")
                warning_message = "nvidia-smi output format unexpected. Some GPU stats may be missing or inaccurate."
                # Fallback with placeholders to maintain UI structure
                metrics = {
                    'memory_used_gb': torch.cuda.memory_allocated(0) / 1024**3 if torch.cuda.is_available() else 0, # PyTorch fallback
                    'memory_total_gb': torch.cuda.get_device_properties(0).total_memory / 1024**3 if torch.cuda.is_available() else 0, # PyTorch fallback
                    'memory_reserved_gb': 0.0,
                    'temperature': 0.0,
                    'utilization': 0.0
                }
                if len(parts) >= 2: # Try to salvage the memory values if the format is only partially off
                    try: metrics['memory_used_gb'] = float(parts[0]) / 1024
                    except ValueError: pass
                    try: metrics['memory_total_gb'] = float(parts[1]) / 1024
                    except ValueError: pass
        else: # Should have been caught by the try/except above; kept as a final safety net
            metrics = {
                'memory_used_gb': 0.0, 'memory_total_gb': 0.0, 'memory_reserved_gb': 0.0,
                'temperature': 0.0, 'utilization': 0.0
            }
            warning_message = "Failed to get any output from nvidia-smi."


        return gpu_name_from_torch, metrics, warning_message

    @staticmethod
    def get_mac_gpu_info() -> Tuple[str, MetricsDict, Optional[str]]: # Returns a warning for consistency with the NVIDIA path
        """Get Apple Silicon GPU information without requiring sudo."""
        metrics = {}
        warning_message = None
        try:
            memory = psutil.virtual_memory()
            metrics = {
                'memory_total_gb': memory.total / (1024**3),
                'memory_used_gb': memory.used / (1024**3), # This is system RAM, reported as "Unified Memory"
                'utilization': psutil.cpu_percent(),  # Still CPU usage as proxy
                # Placeholders for Mac to match NVIDIA's output structure for UI symmetry
                'memory_reserved_gb': 0.0, # N/A for unified memory in this context
                'temperature': 0.0 # Not easily available without sudo
            }
            # GPU load on Apple Silicon is proxied by CPU usage above; always flag it.
            warning_message = "Mac GPU Load is proxied by CPU Usage."

        except Exception as e:
            # print(f"Error getting Mac info: {e}")
            metrics = {
                'memory_total_gb': 0.0, 'memory_used_gb': 0.0, 'utilization': 0.0,
                'memory_reserved_gb': 0.0, 'temperature': 0.0
            }
            warning_message = "Could not retrieve Mac system info."
        
        return "Apple Silicon GPU", metrics, warning_message # Changed name for clarity

    @staticmethod
    def get_amd_gpu_info() -> Tuple[str, MetricsDict, Optional[str]]: # Returns a warning for consistency with the other backends
        """Get AMD GPU information."""
        metrics = { # Initialize with placeholders for all expected keys for UI symmetry
            'memory_used_gb': 0.0,
            'memory_total_gb': 0.0,
            'memory_reserved_gb': 0.0, # Typically N/A or not reported by rocm-smi in a 'reserved' sense
            'temperature': 0.0,
            'utilization': 0.0
        }
        warning_message = None
        source = "unknown"

        try:
            # Try rocm-smi first
            try:
                result = subprocess.check_output(
                    ['rocm-smi', '--showmeminfo', 'vram', '--showtemp', '--showuse'],
                    encoding='utf-8', timeout=1.5, stderr=subprocess.PIPE
                )
                # Example rocm-smi output parsing (highly dependent on actual output format)
                # This needs to be robust or use a more structured output format like --json if rocm-smi supports it
                # For VRAM Used/Total:
                # GPU[0]		VRAM Usage: 2020M/16368M
                # For Temp:
                # GPU[0]		Temperature: 34c
                # For Util:
                # GPU[0]		GPU Use: 0%
                lines = result.strip().split('\n')
                for line in lines:
                    if line.startswith("GPU[0]"): # Assuming card 0
                        if "VRAM Usage:" in line:
                            mem_parts = line.split("VRAM Usage:")[1].strip().split('/')
                            metrics['memory_used_gb'] = float(mem_parts[0].replace('M', '')) / 1024
                            metrics['memory_total_gb'] = float(mem_parts[1].replace('M', '')) / 1024
                            source = "rocm-smi"
                        elif "Temperature:" in line:
                            metrics['temperature'] = float(line.split("Temperature:")[1].strip().replace('c', ''))
                            source = "rocm-smi"
                        elif "GPU Use:" in line:
                            metrics['utilization'] = float(line.split("GPU Use:")[1].strip().replace('%', ''))
                            source = "rocm-smi"
                if source != "rocm-smi": # if parsing failed or fields were missing
                    warning_message = "rocm-smi ran but output parsing failed."
            except (subprocess.SubprocessError, FileNotFoundError, ValueError) as e_rocm:
                # print(f"rocm-smi failed: {e_rocm}. Trying sysfs.")
                warning_message = "rocm-smi not found or failed. "
                # Try sysfs as fallback on Linux
                if platform.system() == "Linux":
                    base_path = "/sys/class/drm/card0/device" # This assumes card0
                    sysfs_found_any = False
                    try:
                        with open(f"{base_path}/hwmon/hwmon0/temp1_input") as f: # Check for specific hwmon index
                            metrics['temperature'] = float(f.read().strip()) / 1000
                        sysfs_found_any = True
                    except (FileNotFoundError, PermissionError, ValueError): pass # Ignore if specific file not found
                    
                    try:
                        with open(f"{base_path}/mem_info_vram_total") as f:
                            metrics['memory_total_gb'] = int(f.read().strip()) / (1024**3) # Bytes to GiB
                        with open(f"{base_path}/mem_info_vram_used") as f:
                            metrics['memory_used_gb'] = int(f.read().strip()) / (1024**3) # Bytes to GiB
                        sysfs_found_any = True
                    except (FileNotFoundError, PermissionError, ValueError): pass
                            
                    try:
                        with open(f"{base_path}/gpu_busy_percent") as f:
                            metrics['utilization'] = float(f.read().strip())
                        sysfs_found_any = True
                    except (FileNotFoundError, PermissionError, ValueError): pass
                    
                    if sysfs_found_any:
                        source = "sysfs"
                        warning_message += "Using sysfs (may be incomplete)."
                    else:
                        warning_message += "sysfs also failed or provided no data."
                else:
                     warning_message += "Not on Linux, no sysfs fallback."
        
        except Exception as e_amd_main: # Catch-all for unforeseen issues in AMD block
            # print(f"General error in get_amd_gpu_info: {e_amd_main}")
            warning_message = (warning_message or "") + " Unexpected error in AMD GPU info gathering."
        
        return f"AMD GPU ({source})", metrics, warning_message

    @staticmethod
    def is_amd_gpu() -> bool:
        try:
            # Check for rocm-smi first as it's more definitive
            rocm_smi_exists = False
            try:
                subprocess.check_call(['rocm-smi', '-h'], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, timeout=0.5)
                rocm_smi_exists = True
            except (subprocess.SubprocessError, FileNotFoundError):
                pass # rocm-smi not found or errored
            
            if rocm_smi_exists:
                return True

            # Fallback to sysfs check if on Linux
            if platform.system() == "Linux" and os.path.exists('/sys/class/drm/card0/device/vendor'):
                with open('/sys/class/drm/card0/device/vendor', 'r') as f:
                    return '0x1002' in f.read() # AMD's PCI vendor ID
            return False
        except Exception:
            return False

    @classmethod
    def get_system_info(cls) -> str:
        """Get detailed system status with support for different GPU types."""
        gpu_name_display: Optional[str] = None
        metrics: MetricsDict = {}
        gpu_warning: Optional[str] = None

        try:
            # Determine GPU type and get metrics
            if torch.cuda.is_available(): # True for NVIDIA CUDA builds (ROCm builds also expose AMD GPUs via torch.cuda)
                gpu_name_display, metrics, gpu_warning = cls.get_nvidia_gpu_info()
            elif platform.system() == "Darwin" and platform.processor() == "arm": # Apple Silicon
                gpu_name_display, metrics, gpu_warning = cls.get_mac_gpu_info()
            elif cls.is_amd_gpu(): # Check for AMD (works on Linux, might need refinement for Windows if not using PyTorch ROCm)
                gpu_name_display, metrics, gpu_warning = cls.get_amd_gpu_info()
            else: # No specific GPU detected by these primary checks
                # A PyTorch ROCm check could be added here for AMD setups without rocm-smi:
                # ROCm builds set torch.version.hip (it is None on CUDA builds).
                pass


            # Format GPU info based on available metrics
            if gpu_name_display:
                gpu_info_lines = [f"๐ŸŽฎ GPU: {gpu_name_display}"]
                
                # Standard memory reporting
                if 'memory_used_gb' in metrics and 'memory_total_gb' in metrics:
                    mem_label = "GPU Memory"
                    if platform.system() == "Darwin" and platform.processor() == "arm":
                        mem_label = "Unified Memory" # For Apple Silicon
                    
                    gpu_info_lines.append(
                        f"๐Ÿ“Š {mem_label}: {metrics.get('memory_used_gb', 0.0):.1f}GB / {metrics.get('memory_total_gb', 0.0):.1f}GB"
                    )

                # VRAM Reserved (NVIDIA specific from nvidia-smi, or placeholder)
                # if 'memory_reserved_gb' in metrics and torch.cuda.is_available() and not (platform.system() == "Darwin"): # Show for NVIDIA, not Mac
                    # gpu_info_lines.append(f"๐Ÿ’พ VRAM Reserved: {metrics.get('memory_reserved_gb', 0.0):.1f}GB")
                
                if 'temperature' in metrics and metrics.get('temperature', 0.0) > 0: # Only show if temp is valid
                    gpu_info_lines.append(f"๐ŸŒก๏ธ GPU Temp: {metrics.get('temperature', 0.0):.0f}ยฐC")
                
                if 'utilization' in metrics:
                    gpu_info_lines.append(f"โšก GPU Load: {metrics.get('utilization', 0.0):.0f}%")
                
                if gpu_warning: # Display any warning from the GPU info functions
                    gpu_info_lines.append(f"โš ๏ธ {gpu_warning}")
                    
                gpu_section = "\n".join(gpu_info_lines) + "\n"
            else:
                gpu_section = "๐ŸŽฎ GPU: No dedicated GPU detected or supported\n"
            
            # Get CPU info
            cpu_count = psutil.cpu_count(logical=False) # Physical cores
            cpu_threads = psutil.cpu_count(logical=True) # Logical processors
            cpu_info = f"๐Ÿ’ป CPU: {cpu_count or 'N/A'} Cores, {cpu_threads or 'N/A'} Threads\n"
            cpu_usage = f"โšก CPU Usage: {psutil.cpu_percent()}%\n"
            
            # Get RAM info
            ram = psutil.virtual_memory()
            ram_used_gb = ram.used / (1024**3)
            ram_total_gb = ram.total / (1024**3)
            ram_info = f"๐ŸŽฏ System RAM: {ram_used_gb:.1f}GB / {ram_total_gb:.1f}GB ({ram.percent}%)"
            
            return f"{gpu_section}{cpu_info}{cpu_usage}{ram_info}"
            
        except Exception as e:
            # print(f"Overall error in get_system_info: {e}")
            # import traceback; print(traceback.format_exc())
            return f"Error collecting system info: {str(e)}"