Source code for unilab.training.monitoring
"""Hardware monitoring utilities for performance profiling."""
from typing import Dict
import torch
try:
import psutil
HAS_PSUTIL = True
except ImportError:
HAS_PSUTIL = False
[docs]
class HardwareMonitor:
    """Sample CPU, memory, and GPU usage for the current process.

    Three optional sources are probed once at construction time:

    * ``psutil`` -- per-process CPU and memory figures.
    * ``torch.cuda`` -- CUDA allocator statistics.
    * NVML via ``pynvml`` -- device-level GPU utilization.

    A source that is unavailable is skipped; its keys are simply absent from
    the dictionary returned by :meth:`get_metrics`.
    """

    def __init__(self):
        """Detect the available metric sources and initialise them."""
        self.has_psutil = HAS_PSUTIL
        if self.has_psutil:
            self.process = psutil.Process()
            # psutil documents that the first non-blocking cpu_percent() call
            # returns a meaningless 0.0. Prime the counter here so the first
            # get_metrics() sample measures the time since construction.
            self.process.cpu_percent()

        self.has_cuda = torch.cuda.is_available()
        self.has_nvml = False
        if self.has_cuda:
            try:
                import pynvml

                pynvml.nvmlInit()
                # NOTE(review): NVML is pinned to device index 0, whereas the
                # torch.cuda statistics in get_metrics() use torch's current
                # device -- these may differ on multi-GPU hosts; confirm.
                self.nvml_handle = pynvml.nvmlDeviceGetHandleByIndex(0)
                self.has_nvml = True
            except Exception:
                # Best effort: pynvml may be missing or NVML may fail to
                # initialise; monitoring then runs without utilization.
                self.has_nvml = False

    def get_metrics(self) -> Dict[str, float]:
        """Return a snapshot of the hardware metrics currently available.

        Returns:
            A dictionary holding a subset of these keys, depending on which
            sources were detected in ``__init__``:

            * ``cpu_percent``, ``cpu_count``, ``memory_rss_mb``,
              ``memory_percent`` -- when ``psutil`` is available.
            * ``gpu_memory_allocated_mb``, ``gpu_memory_reserved_mb`` -- when
              CUDA is available.
            * ``gpu_utilization``, ``gpu_memory_utilization`` -- when NVML is
              available and the utilization query succeeds.
        """
        metrics: Dict[str, float] = {}

        # CPU & memory (requires psutil)
        if self.has_psutil:
            metrics["cpu_percent"] = self.process.cpu_percent()
            metrics["cpu_count"] = psutil.cpu_count()
            mem = self.process.memory_info()
            metrics["memory_rss_mb"] = mem.rss / 1024 / 1024
            metrics["memory_percent"] = self.process.memory_percent()

        # GPU allocator statistics reported by torch
        if self.has_cuda:
            metrics["gpu_memory_allocated_mb"] = torch.cuda.memory_allocated() / 1024 / 1024
            metrics["gpu_memory_reserved_mb"] = torch.cuda.memory_reserved() / 1024 / 1024

        # Device-level utilization reported by NVML
        if self.has_nvml:
            import pynvml

            try:
                util = pynvml.nvmlDeviceGetUtilizationRates(self.nvml_handle)
            except pynvml.NVMLError:
                # The query can fail even though the handle was obtained;
                # omit the utilization keys rather than crash the caller,
                # in line with the best-effort NVML setup in __init__.
                pass
            else:
                metrics["gpu_utilization"] = util.gpu
                metrics["gpu_memory_utilization"] = util.memory

        return metrics