#!/bin/bash
# Pull in the invoking user's shell environment when it exists.
if [ -f ~/.bashrc ]; then
  # shellcheck disable=SC1090
  source ~/.bashrc
fi

# Diagnose VM Issues
# Comprehensive diagnosis of VM problems
#
# Checks template VM 9000 and a fixed list of VMs through the Proxmox API,
# then probes each VM with ping and a TCP connect to port 22.
#
# Environment (optionally loaded from $PROJECT_ROOT/.env):
#   PVE_USERNAME       API user (default: root@pam)
#   PVE_ROOT_PASS      password for that user
#   PROXMOX_ML110_URL  base URL of the Proxmox host
#   PROXMOX_NODE       node name (default: pve)

set -e

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"

# Load environment variables
if [ -f "$PROJECT_ROOT/.env" ]; then
  set -a
  # Drop comment lines, blank lines and trailing comments; keep KEY=VALUE lines.
  # shellcheck disable=SC1090
  source <(grep -v '^#' "$PROJECT_ROOT/.env" | grep -v '^$' | sed 's/#.*$//' | grep '=')
  set +a
fi

# Colors
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m'

PVE_USERNAME="${PVE_USERNAME:-root@pam}"
PVE_PASSWORD="${PVE_ROOT_PASS:-}"
PROXMOX_URL="${PROXMOX_ML110_URL:-https://192.168.1.206:8006}"
PROXMOX_NODE="${PROXMOX_NODE:-pve}"

# Print an informational message ($1) with a green [INFO] tag.
log_info() {
  echo -e "${GREEN}[INFO]${NC} $1"
}

# Print a warning message ($1) with a yellow [WARN] tag.
log_warn() {
  echo -e "${YELLOW}[WARN]${NC} $1"
}

# Print an error message ($1) with a red [ERROR] tag.
log_error() {
  echo -e "${RED}[ERROR]${NC} $1"
}

# Print a detected problem ($1) with a red [ISSUE] tag.
log_issue() {
  echo -e "${RED}[ISSUE]${NC} $1"
}

#######################################
# Request an authentication ticket from the Proxmox API.
# Globals:   PVE_USERNAME, PVE_PASSWORD, PROXMOX_URL (read)
# Outputs:   "<ticket>|<csrf_token>" on stdout, or nothing when no ticket
#            could be obtained.
# Returns:   always 0, so callers under `set -e` can test the output.
#######################################
get_api_token() {
  local response
  # --data-urlencode keeps credentials containing '&', '+', '%' or '=' intact.
  response=$(curl -s -k --connect-timeout 10 --max-time 15 \
    --data-urlencode "username=$PVE_USERNAME" \
    --data-urlencode "password=$PVE_PASSWORD" \
    "$PROXMOX_URL/api2/json/access/ticket") || true

  # Parse the JSON properly; print nothing unless a ticket is present.
  printf '%s' "$response" | python3 -c '
import json
import sys

try:
    data = json.load(sys.stdin).get("data") or {}
    ticket = data.get("ticket") or ""
    csrf = data.get("CSRFPreventionToken") or ""
except Exception:
    sys.exit(0)
if ticket:
    print(ticket + "|" + csrf)
' 2>/dev/null || true
}

#######################################
# Perform an authenticated GET against the configured node.
# Globals:   PROXMOX_URL, PROXMOX_NODE (read)
# Arguments: $1 - ticket, $2 - CSRF token, $3 - path below nodes/<node>/
# Outputs:   raw response body on stdout (possibly empty)
# Returns:   always 0; the request is bounded by the same timeouts as the
#            ticket request so an unresponsive host cannot hang the script.
#######################################
_api_get() {
  local ticket=$1
  local csrf_token=$2
  local path=$3

  curl -s -k --connect-timeout 10 --max-time 15 \
    -H "Cookie: PVEAuthCookie=$ticket" \
    -H "CSRFPreventionToken: $csrf_token" \
    "$PROXMOX_URL/api2/json/nodes/$PROXMOX_NODE/$path" || true
}

#######################################
# Inspect template VM 9000 and flag a scsi0 disk of exactly 600M.
# Returns:   1 when the 600M disk is detected, 0 otherwise (including when
#            authentication fails and the check has to be skipped).
#######################################
diagnose_template() {
  log_info "Diagnosing template VM 9000..."

  local tokens
  tokens=$(get_api_token)
  if [ -z "$tokens" ]; then
    log_error "Could not authenticate against $PROXMOX_URL - skipping template checks"
    return 0
  fi
  local ticket=${tokens%%|*}
  local csrf_token=${tokens#*|}

  local config
  config=$(_api_get "$ticket" "$csrf_token" "qemu/9000/config")

  local disk
  disk=$(printf '%s' "$config" \
    | python3 -c "import sys, json; d=json.load(sys.stdin).get('data') or {}; print(d.get('scsi0', ''))" \
      2>/dev/null) || true

  # The disk spec is a comma-separated option list; pick out size=<value>.
  local size=""
  if [[ "$disk" =~ size=([^,]*) ]]; then
    size=${BASH_REMATCH[1]}
  fi

  if [ "$size" = "600M" ]; then
    log_issue "Template has only 600M disk - likely no OS installed"
    log_warn "Template may need OS installation before cloning"
    return 1
  fi

  return 0
}

#######################################
# Report API status, guest agent state, ping reachability and SSH port
# state for a single VM.
# Arguments: $1 - VM id, $2 - VM name, $3 - VM IP address
# Returns:   0
#######################################
diagnose_vm() {
  local vmid=$1
  local name=$2
  local ip=$3

  log_info "Diagnosing VM $vmid ($name)..."

  local tokens
  tokens=$(get_api_token)

  if [ -z "$tokens" ]; then
    # The network probes below need no API access, so carry on with those.
    log_error "Could not authenticate against $PROXMOX_URL - skipping API checks"
  else
    local ticket=${tokens%%|*}
    local csrf_token=${tokens#*|}

    # Check VM status
    local status
    status=$(_api_get "$ticket" "$csrf_token" "qemu/$vmid/status/current" \
      | python3 -c "import sys, json; print((json.load(sys.stdin).get('data') or {}).get('status', 'unknown'))" \
        2>/dev/null) || true
    echo " Status: ${status:-unknown}"

    # Check QEMU Guest Agent
    local agent_check
    agent_check=$(_api_get "$ticket" "$csrf_token" "qemu/$vmid/agent/network-get-interfaces")
    if grep -q "not running" <<< "$agent_check"; then
      log_issue "QEMU Guest Agent not running - OS may not be installed or agent not installed"
    fi
  fi

  # Check network connectivity
  if ping -c 1 -W 2 "$ip" &>/dev/null; then
    log_info " Network: ✓ Reachable"
  else
    log_issue " Network: ✗ Not reachable"
    log_warn " Possible causes:"
    log_warn " - OS not installed"
    log_warn " - Cloud-init not installed"
    log_warn " - Network configuration failed"
    log_warn " - VM stuck in boot"
  fi

  # Check SSH. The address is passed as an argument rather than spliced into
  # the command string handed to the child shell.
  if timeout 3 bash -c 'cat < /dev/null > "/dev/tcp/$1/22"' _ "$ip" 2>/dev/null; then
    log_info " SSH: ✓ Port 22 open"
  else
    log_issue " SSH: ✗ Port 22 closed"
  fi
}

#######################################
# Entry point: diagnose the template, then every VM in the list.
#######################################
main() {
  log_info "VM Issue Diagnosis"
  echo ""

  # Diagnose template. A non-zero status only means an issue was reported;
  # it must not trip `set -e` and abort before the VMs are checked.
  diagnose_template || true
  echo ""

  # Diagnose VMs ("<vmid> <name> <ip>")
  local vms=(
    "100 cloudflare-tunnel 192.168.1.60"
    "101 k3s-master 192.168.1.188"
    "102 git-server 192.168.1.121"
    "103 observability 192.168.1.82"
  )

  local vm_spec vmid name ip
  for vm_spec in "${vms[@]}"; do
    read -r vmid name ip <<< "$vm_spec"
    diagnose_vm "$vmid" "$name" "$ip"
    echo ""
  done

  log_info "Diagnosis complete"
  log_warn "If template has no OS, VMs need manual OS installation via Proxmox console"
}

main "$@"