package gpu import ( "context" "fmt" "os/exec" "strings" ) // Manager manages GPU resources type Manager struct { // GPU tracking would be implemented here gpuInventory map[string]GPUInfo } // GPUInfo represents GPU information type GPUInfo struct { ID string Type string Status string VMID *int Health string Memory int64 Utilization float64 } // NewManager creates a new GPU manager func NewManager() *Manager { return &Manager{ gpuInventory: make(map[string]GPUInfo), } } // AllocateGPU allocates a GPU for a VM func (m *Manager) AllocateGPU(ctx context.Context, vmID int, gpuType string) error { // Find available GPU of the specified type var availableGPU *GPUInfo for id, gpu := range m.gpuInventory { if gpu.Type == gpuType && gpu.Status == "AVAILABLE" && gpu.VMID == nil { availableGPU = &gpu availableGPU.ID = id break } } if availableGPU == nil { return fmt.Errorf("no available GPU of type %s", gpuType) } // Allocate the GPU availableGPU.VMID = &vmID availableGPU.Status = "ALLOCATED" m.gpuInventory[availableGPU.ID] = *availableGPU // In a real implementation, this would: // 1. Update Proxmox VM configuration to attach GPU // 2. Use PCI passthrough or vGPU depending on GPU type // 3. Update resource tracking return nil } // GetGPUHealth gets the health status of a GPU func (m *Manager) GetGPUHealth(ctx context.Context, gpuID string) (string, error) { // Check if GPU exists in inventory gpu, exists := m.gpuInventory[gpuID] if !exists { // Try to discover GPU using nvidia-smi or similar tools health, err := m.discoverGPUHealth(ctx, gpuID) if err != nil { return "UNKNOWN", err } return health, nil } // If GPU is allocated, check actual health via monitoring if gpu.Status == "ALLOCATED" { health, err := m.checkAllocatedGPUHealth(ctx, gpuID) if err != nil { return gpu.Health, err } // Update health in inventory gpu.Health = health m.gpuInventory[gpuID] = gpu return health, nil } return gpu.Health, nil } // discoverGPUHealth discovers GPU health using system tools func (m *Manager) discoverGPUHealth(ctx context.Context, gpuID string) (string, error) { // Try nvidia-smi first (for NVIDIA GPUs) cmd := exec.CommandContext(ctx, "nvidia-smi", "--query-gpu=health", "--format=csv,noheader", "--id="+gpuID) output, err := cmd.Output() if err == nil { health := strings.TrimSpace(string(output)) if health == "Ok" || health == "Healthy" { return "HEALTHY", nil } return "DEGRADED", nil } // Try AMD GPU monitoring (rocm-smi) cmd = exec.CommandContext(ctx, "rocm-smi", "--showtemp", "--id", gpuID) output, err = cmd.Output() if err == nil { // Parse AMD GPU temperature lines := strings.Split(strings.TrimSpace(string(output)), "\n") for _, line := range lines { if strings.Contains(line, "Temperature") { var temp int if _, err := fmt.Sscanf(line, "%*s %d", &temp); err == nil { const maxTemp = 95 // AMD typical max temp if temp >= maxTemp { return "DEGRADED", nil } return "HEALTHY", nil } } } } // Try Intel GPU monitoring (intel_gpu_top or similar) // Note: Intel GPU monitoring varies by generation return "UNKNOWN", fmt.Errorf("could not determine GPU health - no compatible monitoring tool found") } // checkAllocatedGPUHealth checks health of an allocated GPU func (m *Manager) checkAllocatedGPUHealth(ctx context.Context, gpuID string) (string, error) { // Check GPU utilization and temperature cmd := exec.CommandContext(ctx, "nvidia-smi", "--query-gpu=utilization.gpu,temperature.gpu", "--format=csv,noheader", "--id="+gpuID) output, err := cmd.Output() if err != nil { return "UNKNOWN", err } // Parse output parts := strings.Split(strings.TrimSpace(string(output)), ",") if len(parts) >= 2 { // Extract utilization and temperature _ = strings.TrimSpace(parts[0]) // utilStr - reserved for future use tempStr := strings.TrimSpace(parts[1]) // Parse temperature (remove % and extract number) tempParts := strings.Fields(tempStr) if len(tempParts) > 0 { var temp int if _, err := fmt.Sscanf(tempParts[0], "%d", &temp); err == nil { // Temperature thresholds const maxTemp = 83 // NVIDIA default max temp const warningTemp = 75 if temp >= maxTemp { return "DEGRADED", nil } if temp >= warningTemp { return "WARNING", nil } } } return "HEALTHY", nil } return "UNKNOWN", fmt.Errorf("could not parse GPU metrics") } // ListGPUs lists all available GPUs func (m *Manager) ListGPUs(ctx context.Context) ([]GPUInfo, error) { // Discover GPUs using nvidia-smi or similar cmd := exec.CommandContext(ctx, "nvidia-smi", "--query-gpu=index,name,memory.total", "--format=csv,noheader") output, err := cmd.Output() if err != nil { // If nvidia-smi is not available, return inventory gpus := make([]GPUInfo, 0, len(m.gpuInventory)) for _, gpu := range m.gpuInventory { gpus = append(gpus, gpu) } return gpus, nil } // Parse nvidia-smi output and update inventory lines := strings.Split(strings.TrimSpace(string(output)), "\n") for _, line := range lines { parts := strings.Split(line, ",") if len(parts) >= 3 { gpuID := strings.TrimSpace(parts[0]) gpuType := strings.TrimSpace(parts[1]) if _, exists := m.gpuInventory[gpuID]; !exists { m.gpuInventory[gpuID] = GPUInfo{ ID: gpuID, Type: gpuType, Status: "AVAILABLE", Health: "HEALTHY", } } } } gpus := make([]GPUInfo, 0, len(m.gpuInventory)) for _, gpu := range m.gpuInventory { gpus = append(gpus, gpu) } return gpus, nil }