From ded7d24924282e5d46c6a007639500e31b5d1107 Mon Sep 17 00:00:00 2001 From: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Date: Wed, 22 Apr 2026 23:30:34 +0000 Subject: [PATCH] PR AA follow-up: manual-rollback loud-failure summary + keep-min-5 backup-prune cron + root-only initial-keys handoff file - deploy-currencicombo-8604.sh: on readiness timeout, print loud failure summary (journalctl tails + exact --rollback command with specific backup path) instead of silently exiting. Deliberately does NOT auto-rollback; first cutovers often fail because of env/migration mistakes and auto-restore hides the failure state ops needs. - install.sh: on first run, write the three API keys + EVENT_SIGNING_SECRET to /root/currencicombo-first-keys.txt (0600, root:root) as a handoff copy. Canonical values still live in /etc/currencicombo/orchestrator.env. Log one pointer line (not the secrets themselves) to journald. Handoff file is NOT regenerated if orchestrator.env already exists. - install-prune-cron.sh (new, opt-in): installs /etc/cron.daily/ currencicombo-prune-backups that deletes entries older than 30 days from /var/lib/currencicombo/backups/ WHILE always keeping the newest 5 regardless of age. Enforced via newest-first sort + i --- scripts/deployment/README.md | 111 ++++++++++++++---- .../deployment/deploy-currencicombo-8604.sh | 32 ++++- scripts/deployment/install-prune-cron.sh | 102 ++++++++++++++++ scripts/deployment/install.sh | 39 +++++- scripts/deployment/webapp-nginx.conf | 17 ++- 5 files changed, 268 insertions(+), 33 deletions(-) create mode 100755 scripts/deployment/install-prune-cron.sh diff --git a/scripts/deployment/README.md b/scripts/deployment/README.md index e3df48a..6cab131 100644 --- a/scripts/deployment/README.md +++ b/scripts/deployment/README.md @@ -22,7 +22,7 @@ the repo. │ │ ▼ ▼ curucombo.曼李.com/* (default) curucombo.曼李.com/api/* - curucombo.曼李.com/events/* (SSE) ← swap ─ correctly routed to :8080 + (incl. 
SSE /api/plans/*/events/stream) │ │ CT 8604 │10.160.0.14:3000 CT 8604 │10.160.0.14:8080 ▼ ▼ @@ -46,7 +46,8 @@ the repo. | `systemd/currencicombo-webapp.service` | nginx serving the Vite SPA on `:3000` | | `webapp-nginx.conf` | full nginx.conf for the webapp unit | | `.env.prod.example` | env template installed to `/etc/currencicombo/orchestrator.env` | -| `install.sh` | one-shot host setup: user / dirs / DB role / systemd units | +| `install.sh` | one-shot host setup: user / dirs / DB role / systemd units / first-run key handoff file | +| `install-prune-cron.sh` | opt-in daily cron that prunes `/var/lib/currencicombo/backups/` (30-day retention, keep-min 5) | | `deploy-currencicombo-8604.sh` | build-and-swap deploy driver (the script Phoenix/proxmox deploy-api calls) | | `README.md` | you're reading it | @@ -71,12 +72,21 @@ All commands run as **root** inside the CT. On success you'll see: ``` [install] generated EVENT_SIGNING_SECRET (64 hex) - [install] generated 3 API keys (initiator/settler/auditor) — grep /etc/currencicombo/orchestrator.env + [install] generated 3 API keys (initiator/settler/auditor) + [install] initial secrets written to /root/currencicombo-first-keys.txt (0600) — record in password manager, then 'shred -u /root/currencicombo-first-keys.txt' [install] install complete. ``` - **Grab the three API keys from `/etc/currencicombo/orchestrator.env`** and put them in your password manager — they authenticate initiator / settler / auditor calls. -4. If you need to resolve any `EXT-*` blocker (e.g. point at a real dbis_core), edit `/etc/currencicombo/orchestrator.env` before the first deploy. -5. First build-and-start: + `install.sh` writes the three API keys + `EVENT_SIGNING_SECRET` to **two** places: + - `/etc/currencicombo/orchestrator.env` — canonical, read by systemd (`0640`, owned by `currencicombo`). + - `/root/currencicombo-first-keys.txt` — **root-only handoff file** (`0600`). 
Grab it once, record the values in your password manager, then `shred -u` it. + The handoff file is **not** regenerated on re-run — if `orchestrator.env` already exists, `install.sh` does not produce new secrets. +4. (Optional) Install the backup-pruning cron: + ``` + bash /var/lib/currencicombo/repo/scripts/deployment/install-prune-cron.sh + ``` + Drops a `/etc/cron.daily/currencicombo-prune-backups` that deletes anything under `/var/lib/currencicombo/backups/` older than 30 days while **always keeping the newest 5** regardless of age. Safe on re-run; opt out with `sudo rm /etc/cron.daily/currencicombo-prune-backups`. +5. If you need to resolve any `EXT-*` blocker (e.g. point at a real dbis_core), edit `/etc/currencicombo/orchestrator.env` before the first deploy. +6. First build-and-start: ``` bash /var/lib/currencicombo/repo/scripts/deployment/deploy-currencicombo-8604.sh ``` @@ -96,16 +106,17 @@ All commands run as **root** inside the CT. ## NPMplus ingress changes required at cutover `curucombo.曼李.com` today proxies 100% to `10.160.0.14:3000`. After -cutover it must become a **single-origin path-routed proxy**: +cutover it must become a **single-origin path-routed proxy** with **two** +rules (the SSE endpoint lives at `/api/plans/:id/events/stream`, so it's +already under `/api/*` — no separate `/events/*` rule is needed): -| location | upstream | notes | +| location | upstream | proxy settings | |---|---|---| -| `/api/*` | `http://10.160.0.14:8080` | orchestrator API. Forward `Host`, `X-Real-IP`, `X-Forwarded-*`. `proxy_read_timeout 60s`. | -| `/events/*` | `http://10.160.0.14:8080` | **SSE** — must set `proxy_buffering off;` and `proxy_read_timeout 24h;`. | -| `/` | `http://10.160.0.14:3000` | Vite SPA. Default upstream. | +| `/api/*` | `http://10.160.0.14:8080` | **SSE-friendly settings apply here because the SSE route `/api/plans/:id/events/stream` is under /api/**. 
Set: `proxy_http_version 1.1;`, `proxy_set_header Connection "";`, `proxy_buffering off;`, `proxy_cache off;`, `proxy_read_timeout 24h;`, `proxy_send_timeout 24h;`. Standard forwarding: `proxy_set_header Host $host;`, `X-Real-IP $remote_addr;`, `X-Forwarded-For $proxy_add_x_forwarded_for;`, `X-Forwarded-Proto $scheme;`. The slight overhead of `proxy_buffering off` on plain REST calls is negligible for this workload. | +| `/` | `http://10.160.0.14:3000` | Vite SPA. Default upstream. No special settings. | -If you skip the `/api` + `/events` rules, the nginx in `webapp-nginx.conf` -intentionally returns `HTTP 421` for those paths — a clean "upstream is +If you skip the `/api/*` rule, the nginx in `webapp-nginx.conf` +intentionally returns `HTTP 421` for that path — a clean "upstream is misconfigured" signal instead of silently returning `index.html` and breaking the browser with a JSON parse error. @@ -125,19 +136,79 @@ Flags: - `--rollback` — restore the most recent `/var/lib/currencicombo/backups/<timestamp>/` and restart units. Does **not** git-pull or rebuild. Every deploy writes a timestamped backup to -`/var/lib/currencicombo/backups/<timestamp>/` before swapping. Old -backups are not auto-pruned — `find /var/lib/currencicombo/backups -maxdepth 1 -mtime +30 -exec rm -rf {} +` on a cron. +`/var/lib/currencicombo/backups/<timestamp>/` before swapping. Pruning is opt-in via `install-prune-cron.sh` (30-day retention, keep-min 5). Without the cron, backups accumulate forever — quietly filling `/var/lib` is how the next outage starts. -## Rollback (if a deploy goes sideways) +## Failure handling on deploy +**Rollback is manual.** `deploy-currencicombo-8604.sh` **does not** auto-restore the previous backup if the orchestrator fails to become ready. First cutovers typically fail because of env typos or migration mistakes, and auto-restoring hides the failure state ops needs. 
+ +Instead, on a readiness timeout the deploy script prints: +- last 40 lines of `journalctl -u currencicombo-orchestrator` +- last 20 lines of `journalctl -u currencicombo-webapp` +- **the exact `--rollback` command with the specific backup path filled in** + +Example tail on failure: +``` +================================================================ +DEPLOY FAILED: orchestrator did not become ready after 60s +================================================================ + +## currencicombo-orchestrator (last 40 lines): +... env validation error: EVENT_SIGNING_SECRET is required ... + +## Units are in whatever state deploy left them. To restore +## the previous build (does NOT revert DB migrations): + + sudo /var/lib/currencicombo/repo/scripts/deployment/deploy-currencicombo-8604.sh --rollback + # (will restore /var/lib/currencicombo/backups/20260423-140215) + +================================================================ +``` + +Rollback one-liner (when ops has decided to restore): ``` sudo /var/lib/currencicombo/repo/scripts/deployment/deploy-currencicombo-8604.sh --rollback ``` -Restores the most recent backup and restarts both units. Does not touch -the DB. If the deploy that failed applied a new migration, a DB rollback -is a manual `psql` task — we don't attempt generic `down` migrations -because the orchestrator's migration runner only emits `up()` paths. +Rollback restores the most recent backup and restarts both units. It **does not** touch the DB. If the failed deploy applied a new migration, DB rollback is a manual `psql` task — the orchestrator's migration runner only emits `up()` paths. + +## Post-cutover smoke checks through NPMplus + +Once the NPMplus `/api/*` rule is live, from a workstation (not the CT): + +``` +# 1. Front-door TLS is healthy +curl -skI https://curucombo.xn--vov0g.com/ | head -3 +# expect: HTTP/2 200 +# expect: NO 'x-nextjs-prerender' header (that was the old Next.js build) + +# 2. 
SPA is the new Vite portal +curl -sk https://curucombo.xn--vov0g.com/ | grep -oE '<title>[^<]+</title>' +# expect: Solace Bank Group PLC — Treasury Management Portal + +# 3. Orchestrator ready through NPMplus +curl -sk https://curucombo.xn--vov0g.com/api/ready | head -1 +# expect: {"ready":true} (not HTML) + +# 4. Orchestrator blocker log (through CT shell, not NPMplus) +ssh root@10.160.0.14 'journalctl -u currencicombo-orchestrator -n 200 | grep -E "ExternalBlockers|EXT-"' +# expect: [ExternalBlockers] 6 active, 1 resolved +# expect: one line per EXT-* id + +# 5. SSE actually streams (catches silent NPMplus proxy_buffering=on misconfig) +curl -sk -N --max-time 5 -H 'Accept: text/event-stream' \ + https://curucombo.xn--vov0g.com/api/plans/demo-pay-014/events/stream \ + | head -20 || true +# expect: HTTP/2 200 with Content-Type: text/event-stream +# expect: at least one 'data: {...}\n\n' frame to arrive WITHIN ~1s +# if you see nothing for 3-5s and then everything dumps at once: +# NPMplus has proxy_buffering=on. Fix: proxy_buffering off; proxy_http_version 1.1; proxy_set_header Connection ""; +# if the ping is 401/403: expected — SSE is auth-gated; the point is to +# prove the request REACHED the orchestrator (content-type header + +# chunked response headers) rather than hitting the Vite SPA. +``` + +A plain `HTTP/2 200` with a `Content-Type: text/html` body on `/api/ready` means NPMplus is silently falling back to the `/` rule — the `/api/*` rule is missing or ordered wrong. The `webapp-nginx.conf` in this repo returns `HTTP 421` for `/api/*` to make that case obvious when debugging CT-locally, but at the NPMplus edge nginx serves whatever NPMplus routes to it. 
## Troubleshooting diff --git a/scripts/deployment/deploy-currencicombo-8604.sh b/scripts/deployment/deploy-currencicombo-8604.sh index 526002e..ca638a1 100755 --- a/scripts/deployment/deploy-currencicombo-8604.sh +++ b/scripts/deployment/deploy-currencicombo-8604.sh @@ -178,8 +178,36 @@ SECS=0 until curl -sfL --max-time 3 "${CC_HEALTH_URL}" >/dev/null 2>&1; do SECS=$((SECS + 2)) if [[ "${SECS}" -ge "${CC_HEALTH_TIMEOUT_SECS}" ]]; then - journalctl -u "${ORCHESTRATOR_UNIT}" -n 80 --no-pager || true - die "orchestrator did not become ready after ${CC_HEALTH_TIMEOUT_SECS}s" + # Loud failure summary. Deliberately does NOT auto-rollback — first + # cutovers often fail because of env/migration mistakes, and + # auto-restoring the old build hides the failure state ops needs to + # diagnose. Print the exact --rollback command with the specific + # backup path filled in, so it's one copy-paste away if desired. + { + echo + echo "================================================================" + echo "DEPLOY FAILED: orchestrator did not become ready after ${CC_HEALTH_TIMEOUT_SECS}s" + echo "================================================================" + echo + echo "## currencicombo-orchestrator (last 40 lines):" + journalctl -u "${ORCHESTRATOR_UNIT}" -n 40 --no-pager 2>&1 || echo "(journalctl unavailable)" + echo + echo "## currencicombo-webapp (last 20 lines):" + journalctl -u "${WEBAPP_UNIT}" -n 20 --no-pager 2>&1 || echo "(journalctl unavailable)" + echo + echo "## Units are in whatever state deploy left them. To restore" + echo "## the previous build (does NOT revert DB migrations):" + echo + if [[ -n "${BACKUP:-}" && -d "${BACKUP}" ]]; then + echo " sudo $0 --rollback" + echo " # (will restore ${BACKUP})" + else + echo " # No backup was taken (first deploy). Manual recovery required." 
+ fi + echo + echo "================================================================" + } >&2 + exit 1 fi sleep 2 done diff --git a/scripts/deployment/install-prune-cron.sh b/scripts/deployment/install-prune-cron.sh new file mode 100755 index 0000000..22e934d --- /dev/null +++ b/scripts/deployment/install-prune-cron.sh @@ -0,0 +1,102 @@ +#!/usr/bin/env bash +# install-prune-cron.sh — opt-in cron job to prune old deploy backups. +# +# Run ONCE as root (or with sudo) after install.sh to enable daily +# pruning of /var/lib/currencicombo/backups/. The pruner: +# - deletes entries older than 30 days +# - ALWAYS keeps the newest N backups regardless of age (default 5) +# +# No-op on re-run. Opt out by removing /etc/cron.daily/currencicombo-prune-backups. + +set -euo pipefail + +BACKUP_DIR="${CC_BACKUP_DIR:-/var/lib/currencicombo/backups}" +RETAIN_DAYS="${CC_BACKUP_RETAIN_DAYS:-30}" +KEEP_MIN="${CC_BACKUP_KEEP_MIN:-5}" +CRON_FILE="/etc/cron.daily/currencicombo-prune-backups" +DRY_RUN=0 + +while [[ $# -gt 0 ]]; do + case "$1" in + --dry-run) DRY_RUN=1; shift ;; + -h|--help) + cat <<'USAGE' +Usage: sudo ./install-prune-cron.sh [--dry-run] + +Env overrides: + CC_BACKUP_DIR (default: /var/lib/currencicombo/backups) + CC_BACKUP_RETAIN_DAYS (default: 30) + CC_BACKUP_KEEP_MIN (default: 5) +USAGE + exit 0 ;; + *) echo "unknown arg: $1" >&2; exit 2 ;; + esac +done + +log() { printf '[install-prune-cron] %s\n' "$*" >&2; } +die() { printf '[install-prune-cron][FATAL] %s\n' "$*" >&2; exit 1; } + +[[ "$EUID" -eq 0 ]] || die "must run as root (sudo)" + +# The pruner script body. Runs daily via cron.daily. +# KEEP_MIN is enforced by listing backups newest-first, skipping the +# first KEEP_MIN, then deleting any remaining entries older than +# RETAIN_DAYS. This means we always keep at least KEEP_MIN (even if +# they're all <30 days old), and never delete one of the newest +# KEEP_MIN (even if it's >30 days old on a dormant host). 
+read -r -d '' PRUNER_BODY </dev/null | sort -rn | awk '{print \$2}') + +count=\${#all[@]} +if (( count <= KEEP_MIN )); then + logger -t currencicombo-prune "count=\$count <= KEEP_MIN=\$KEEP_MIN; nothing to prune" + exit 0 +fi + +cutoff=\$(date -d "\$RETAIN_DAYS days ago" +%s) +deleted=0 +kept=0 +for i in "\${!all[@]}"; do + p="\${all[\$i]}" + if (( i < KEEP_MIN )); then + kept=\$((kept + 1)) + continue + fi + mtime=\$(stat -c %Y "\$p" 2>/dev/null || echo 0) + if (( mtime < cutoff )); then + rm -rf -- "\$p" + deleted=\$((deleted + 1)) + else + kept=\$((kept + 1)) + fi +done +logger -t currencicombo-prune "deleted=\$deleted kept=\$kept total_before=\$count" +PRUNER + +if [[ "${DRY_RUN}" -eq 1 ]]; then + log "[dry-run] would write ${CRON_FILE} (0755) with pruner targeting ${BACKUP_DIR}, retain ${RETAIN_DAYS}d, keep-min ${KEEP_MIN}" + echo "---" + echo "${PRUNER_BODY}" + echo "---" + exit 0 +fi + +printf '%s\n' "${PRUNER_BODY}" > "${CRON_FILE}" +chmod 0755 "${CRON_FILE}" +chown root:root "${CRON_FILE}" + +log "installed ${CRON_FILE} (backups older than ${RETAIN_DAYS}d, keep-min ${KEEP_MIN}, target ${BACKUP_DIR})" +log "runs daily via /etc/cron.daily/. Opt out: sudo rm ${CRON_FILE}" +log "logs to syslog (tag currencicombo-prune); journalctl -t currencicombo-prune" diff --git a/scripts/deployment/install.sh b/scripts/deployment/install.sh index 6280972..c92537b 100755 --- a/scripts/deployment/install.sh +++ b/scripts/deployment/install.sh @@ -160,8 +160,9 @@ fi # ---------------------------------------------------------------------- # 5. 
orchestrator.env # ---------------------------------------------------------------------- +FIRST_KEYS_FILE="/root/currencicombo-first-keys.txt" if [[ -f "${ENV_FILE}" ]]; then - log "${ENV_FILE} already exists — leaving alone" + log "${ENV_FILE} already exists — leaving alone (no new keys generated)" else log "writing ${ENV_FILE}" install -o "${APP_USER}" -g "${APP_USER}" -m 0640 "${SCRIPT_DIR}/.env.prod.example" "${ENV_FILE}" @@ -172,8 +173,42 @@ else AUD_KEY="$(openssl rand -hex 24)" run "sed -i 's|^EVENT_SIGNING_SECRET=.*|EVENT_SIGNING_SECRET=${SECRET}|' '${ENV_FILE}'" run "sed -i 's|^ORCHESTRATOR_API_KEYS=.*|ORCHESTRATOR_API_KEYS=${INIT_KEY}:initiator,${SETT_KEY}:settler,${AUD_KEY}:auditor|' '${ENV_FILE}'" + # Write a root-only handoff file so ops can grab the keys without + # scraping journald or reading the env file. The canonical copy lives + # in ${ENV_FILE}; delete this file once the keys are in your password + # manager. + if [[ "${DRY_RUN}" -eq 0 ]]; then + umask 077 + cat > "${FIRST_KEYS_FILE}" <