#!/usr/bin/env bash
# Disaster Recovery Test Script
#
# Runs a sequence of disaster recovery checks:
#   1. executes the chaindata backup script and records its output,
#   2. syntax-checks the restore and key rotation scripts,
#   3. deletes one validator pod and one RPC pod and compares pod counts
#      after a fixed wait,
#   4. measures how long the validator pods take to become Ready again,
#   5. verifies that every *.tar.gz under BACKUP_DIR can be listed by tar,
#   6. writes a summary file.
#
# Environment:
#   NAMESPACE              Kubernetes namespace to test (default: besu-network)
#   BACKUP_DIR             Directory scanned for backups (default: /tmp/backup-test)
#   FAILOVER_WAIT_SECONDS  Pause after deleting a pod    (default: 60)
#
# WARNING: this script deletes live pods in NAMESPACE.

set -e

SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# log_success / log_warn / log_error are not defined in this file;
# presumably they come from init.sh — verify against lib/init.sh.
source "$SCRIPT_DIR/../lib/init.sh"

PROJECT_ROOT="$(cd "$SCRIPT_DIR/../.." && pwd)"
NAMESPACE="${NAMESPACE:-besu-network}"
BACKUP_DIR="${BACKUP_DIR:-/tmp/backup-test}"
FAILOVER_WAIT_SECONDS="${FAILOVER_WAIT_SECONDS:-60}"
BACKUP_LOG="/tmp/backup-test.log"
RTO_SECONDS=3600

#######################################
# Prints the name of the first pod labelled component=$1 in NAMESPACE.
# Prints nothing when no such pod exists or kubectl fails.
# Arguments: $1 - value of the "component" label
#######################################
first_pod() {
  kubectl get pods -n "$NAMESPACE" -l "component=$1" \
    -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || true
}

#######################################
# Prints the number of pods labelled component=$1 in NAMESPACE.
# Whitespace is stripped because some wc implementations pad the count.
# Arguments: $1 - value of the "component" label
#######################################
count_pods() {
  kubectl get pods -n "$NAMESPACE" -l "component=$1" --no-headers \
    | wc -l \
    | tr -d '[:space:]'
}

#######################################
# Deletes one pod and checks that the pod count for its component is
# unchanged after FAILOVER_WAIT_SECONDS.
# Arguments:
#   $1 - value of the "component" label
#   $2 - name of the pod to delete (empty: test is skipped with a warning)
#   $3 - component name as used mid-sentence in log messages
#   $4 - component name as used at the start of a log message
#   $5 - heading logged when the test starts
#######################################
run_failover_test() {
  local component=$1 pod=$2 noun=$3 title=$4 heading=$5
  local before after

  if [[ -z "$pod" ]]; then
    log_warn "⚠ No ${noun} pods available for failover test"
    return 0
  fi

  log_warn " ${heading}"

  before=$(count_pods "$component")
  log_warn " Initial ${noun} pods: $before"

  log_warn " Deleting pod: $pod"
  kubectl delete pod "$pod" -n "$NAMESPACE" --wait=false

  log_warn " Waiting for pod to be recreated (${FAILOVER_WAIT_SECONDS} seconds)..."
  sleep "$FAILOVER_WAIT_SECONDS"

  after=$(count_pods "$component")
  log_warn " Current ${noun} pods: $after"

  if [[ "$after" -eq "$before" ]]; then
    log_success "✓ ${title} pod was recreated successfully"
  else
    log_warn "⚠ ${title} pod count changed (may still be recovering)"
  fi
}

#######################################
# Confirms that a script exists and parses cleanly with `bash -n`.
# A missing script only produces a warning; a syntax error ends the
# whole test run with exit status 1.
# Arguments:
#   $1 - path of the script to check
#   $2 - human-readable name used in log messages
#######################################
check_script_syntax() {
  local script=$1 label=$2

  if [[ ! -f "$script" ]]; then
    log_warn "⚠ ${label} not found"
    return 0
  fi

  log_success "✓ ${label} exists"
  if bash -n "$script" 2>&1; then
    log_success "✓ ${label} syntax is valid"
  else
    log_error "✗ ${label} has syntax errors"
    exit 1
  fi
}

log_success "Running Disaster Recovery Tests..."

# ---------------------------------------------------------------------------
# Backup procedures
# ---------------------------------------------------------------------------
log_warn "Testing backup procedures..."

mkdir -p "$BACKUP_DIR"

# NOTE(review): BACKUP_DIR is not exported here, so the backup script only
# sees it if the caller exported it — confirm the backup script writes to
# the directory that the validation step below scans.
BACKUP_SCRIPT="$PROJECT_ROOT/scripts/backup/backup-chaindata.sh"
if [[ -f "$BACKUP_SCRIPT" ]]; then
  log_warn " Testing chaindata backup..."

  # pipefail is enabled inside a subshell so that the condition reflects
  # the backup script's exit status instead of tee's, without changing
  # pipeline semantics for the rest of the script.
  if (set -o pipefail; "$BACKUP_SCRIPT" 2>&1 | tee "$BACKUP_LOG"); then
    log_success "✓ Backup script executed successfully"
  else
    log_warn "⚠ Backup script execution had issues (check logs)"
  fi
else
  log_warn "⚠ Backup script not found"
fi

# ---------------------------------------------------------------------------
# Restore procedures
# ---------------------------------------------------------------------------
log_warn "Testing restore procedures..."
check_script_syntax "$PROJECT_ROOT/scripts/backup/restore-chaindata.sh" "Restore script"

# ---------------------------------------------------------------------------
# Failover scenarios
# ---------------------------------------------------------------------------
log_warn "Testing failover scenarios..."

VALIDATOR_PODS=$(first_pod validator)
run_failover_test validator "$VALIDATOR_PODS" \
  "validator" "Validator" "Testing validator pod failure..."

RPC_PODS=$(first_pod rpc)
run_failover_test rpc "$RPC_PODS" \
  "RPC" "RPC" "Testing RPC node failover..."

# ---------------------------------------------------------------------------
# Recovery time objectives
# ---------------------------------------------------------------------------
log_warn "Testing recovery time objectives..."

# The pod deleted by the failover test may have been replaced by a pod with
# a different name, so the target is looked up again instead of reusing the
# earlier value.
VALIDATOR_PODS=$(first_pod validator)
if [[ -n "$VALIDATOR_PODS" ]]; then
  log_warn " Measuring pod restart time..."

  START_TIME=$(date +%s)
  kubectl delete pod "$VALIDATOR_PODS" -n "$NAMESPACE" --wait=false

  # NOTE(review): the wait starts right after an asynchronous delete, so it
  # may observe pods that were already Ready before the deletion took
  # effect — confirm the measured time reflects a real restart.
  if kubectl wait --for=condition=ready pod -l component=validator \
    -n "$NAMESPACE" --timeout=300s 2>/dev/null; then
    END_TIME=$(date +%s)
    RESTART_TIME=$((END_TIME - START_TIME))
    log_success "✓ Pod restarted in ${RESTART_TIME} seconds"

    # Check against RTO (1 hour for critical services)
    if [[ "$RESTART_TIME" -lt "$RTO_SECONDS" ]]; then
      log_success "✓ Restart time is within RTO (1 hour)"
    else
      log_error "✗ Restart time exceeds RTO (1 hour)"
    fi
  else
    log_error "✗ Pod did not restart within timeout"
  fi
else
  log_warn "⚠ No pods available for RTO test"
fi

# ---------------------------------------------------------------------------
# Backup validation
# ---------------------------------------------------------------------------
log_warn "Testing backup validation..."

# Archive names are read NUL-delimited so paths containing spaces or glob
# characters are handled as single entries.
BACKUP_FILES=()
while IFS= read -r -d '' backup; do
  BACKUP_FILES+=("$backup")
done < <(find "$BACKUP_DIR" -name "*.tar.gz" -print0 2>/dev/null)

if [[ ${#BACKUP_FILES[@]} -gt 0 ]]; then
  log_success "✓ Backup files found"

  for backup in "${BACKUP_FILES[@]}"; do
    log_warn " Validating $backup..."
    # Listing the archive forces tar to read and decompress all of it.
    if tar -tzf "$backup" > /dev/null 2>&1; then
      log_success "✓ Backup file is valid: $backup"
    else
      log_error "✗ Backup file is corrupted: $backup"
    fi
  done
else
  log_warn "⚠ No backup files found for validation"
fi

# ---------------------------------------------------------------------------
# Key recovery
# ---------------------------------------------------------------------------
log_warn "Testing key recovery..."
check_script_syntax "$PROJECT_ROOT/scripts/key-management/rotate-keys.sh" "Key rotation script"

# ---------------------------------------------------------------------------
# Test results
# ---------------------------------------------------------------------------
log_warn "Documenting test results..."

TEST_RESULTS_FILE="/tmp/disaster-recovery-test-results.txt"

# NOTE(review): the original report body was cut off in the reviewed source
# and could not be recovered. This minimal report only records values this
# script already holds; restore or extend it from version control.
cat > "$TEST_RESULTS_FILE" <<EOF
Disaster Recovery Test Results
Date: $(date)
Namespace: $NAMESPACE
Backup directory: $BACKUP_DIR
Backup log: $BACKUP_LOG
EOF