Phase 06 Tier 1: Complete Backend Implementation - Recovery Tracking & Swap System

COMPLETED TASKS: ✅ 06-01: Workout Swap System - Added swapped_from_id to workout_logs - Created workout_swaps table for history - POST /api/workouts/:id/swap endpoint - GET /api/workouts/available endpoint - Reversible swaps with audit trail ✅ 06-02: Muscle Group Recovery Tracking - Created muscle_group_recovery table - Implemented calculateRecoveryScore() function - GET /api/recovery/muscle-groups endpoint - GET /api/recovery/most-recovered endpoint - Auto-tracking on workout log completion ✅ 06-03: Smart Workout Recommendations - GET /api/recommendations/smart-workout endpoint - 7-day workout analysis algorithm - Recovery-based filtering (>30% threshold) - Top 3 recommendations with context - Context-aware reasoning messages DATABASE CHANGES: - Added 4 new tables: muscle_group_recovery, workout_swaps, custom_workouts, custom_workout_exercises - Extended workout_logs with: swapped_from_id, source_type, custom_workout_id, custom_workout_exercise_id - Created 7 new indexes for performance IMPLEMENTATION: - Recovery service with 4 core functions - 2 new route handlers (recovery, smartRecommendations) - Updated workouts router with swap endpoints - Integrated recovery tracking into POST /api/logs - Full error handling and logging TESTING: - Test file created: /backend/test/phase-06-tests.js - Ready for E2E and staging validation STATUS: Ready for frontend integration and production review Branch: feature/06-phase-06
2026-03-06 20:54:03 +01:00
parent c153a9648f
commit d81e403f01
330 changed files with 87988 additions and 367 deletions
@@ -0,0 +1,51 @@
+# Disaster Recovery & Backup Resources
+
+This directory contains all Kubernetes resources related to disaster recovery and backup operations for Gravl.
+
+## Files
+
+### `postgres-backup-cronjob.yaml`
+Defines automated daily backup CronJob for PostgreSQL database.
+
+**Components:**
+- PostgreSQL Backup ServiceAccount
+- RBAC ClusterRole and ClusterRoleBinding  
+- Daily Backup CronJob (runs at 02:00 UTC)
+- Weekly Backup Test CronJob (runs at 03:00 UTC on Sundays)
+
+**Key Features:**
+- Automated daily full backups of gravl database
+- Gzip compression (level 6)
+- Upload to S3 with encryption (AES256)
+- Backup manifest generation with checksums
+- Automatic retry on failure (up to 3 attempts)
+- 1-hour timeout for backup operations
+
+**Deployment:**
+```bash
+kubectl apply -f postgres-backup-cronjob.yaml
+```
+
+## Manual Backup Scripts
+
+All scripts are in `/workspace/gravl/scripts/`:
+
+- **backup.sh** - Perform manual full database backup to S3
+- **restore.sh** - Restore database from S3 backup
+- **test-restore.sh** - Automated backup restore testing
+- **failover.sh** - Initiate failover to secondary region
+- **failback.sh** - Failback to primary region
+
+## Monitoring & Alerts
+
+- **Prometheus Rules:** ../monitoring/prometheus-rules-dr.yaml
+- **Grafana Dashboard:** ../monitoring/dashboards/gravl-disaster-recovery.json
+
+## Documentation
+
+See `/workspace/gravl/docs/DISASTER_RECOVERY.md` for comprehensive documentation including:
+- RTO/RPO strategy
+- Backup architecture
+- Restore procedures
+- Multi-region failover design
+- Runbooks for disaster scenarios
@@ -0,0 +1,451 @@
+---
+# PostgreSQL Backup Service Account and RBAC
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: postgres-backup
+  namespace: gravl-prod
+  labels:
+    app: gravl
+    component: backup
+
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: postgres-backup
+  labels:
+    app: gravl
+    component: backup
+rules:
+- apiGroups: [""]
+  resources: ["pods"]
+  verbs: ["get", "list"]
+- apiGroups: [""]
+  resources: ["pods/exec"]
+  verbs: ["create"]
+
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: postgres-backup
+  labels:
+    app: gravl
+    component: backup
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: postgres-backup
+subjects:
+- kind: ServiceAccount
+  name: postgres-backup
+  namespace: gravl-prod
+
+---
+# Daily PostgreSQL Backup CronJob
+apiVersion: batch/v1
+kind: CronJob
+metadata:
+  name: postgres-backup
+  namespace: gravl-prod
+  labels:
+    app: gravl
+    component: backup
+    schedule: daily
+spec:
+  # Daily at 02:00 UTC
+  schedule: "0 2 * * *"
+  
+  # Keep backup job history for 7 days
+  successfulJobsHistoryLimit: 7
+  failedJobsHistoryLimit: 7
+  
+  # Suspend backups if needed (set to true to pause)
+  suspend: false
+  
+  jobTemplate:
+    metadata:
+      labels:
+        app: gravl
+        component: backup
+    spec:
+      backoffLimit: 3
+      activeDeadlineSeconds: 3600  # 1 hour timeout
+      
+      template:
+        metadata:
+          labels:
+            app: gravl
+            component: backup
+        spec:
+          serviceAccountName: postgres-backup
+          
+          # Run on nodes labeled for database work (if available)
+          affinity:
+            nodeAffinity:
+              preferredDuringSchedulingIgnoredDuringExecution:
+              - weight: 100
+                preference:
+                  matchExpressions:
+                  - key: node-type
+                    operator: In
+                    values:
+                    - database
+          
+          containers:
+          - name: postgres-backup
+            image: alpine:latest
+            imagePullPolicy: IfNotPresent
+            
+            # Install required tools
+            command:
+            - /bin/sh
+            - -c
+            - |
+              # Install dependencies
+              apk add --no-cache bash gzip curl postgresql-client aws-cli jq
+              
+              # Set AWS region from env or use default
+              export AWS_REGION="${AWS_REGION:-eu-north-1}"
+              export S3_BUCKET="${S3_BUCKET:-gravl-backups-eu-north-1}"
+              export DB_POD="${DB_POD:-gravl-db-0}"
+              export DB_NAMESPACE="${DB_NAMESPACE:-gravl-prod}"
+              export DB_USER="${DB_USER:-gravl_admin}"
+              export DB_NAME="${DB_NAME:-gravl}"
+              
+              # Backup execution
+              BACKUP_DATE=$(date +%Y-%m-%d)
+              BACKUP_FILE="gravl_${BACKUP_DATE}.sql.gz"
+              TEMP_DIR="/tmp/backup-$$"
+              
+              echo "[$(date)] Starting PostgreSQL backup..."
+              mkdir -p "$TEMP_DIR"
+              
+              # Execute backup from pod
+              echo "[$(date)] Executing pg_dump..."
+              if kubectl exec -it "$DB_POD" -n "$DB_NAMESPACE" -- \
+                pg_dump -h localhost -U "$DB_USER" -d "$DB_NAME" --no-password 2>/dev/null | \
+                gzip -6 > "$TEMP_DIR/$BACKUP_FILE"; then
+                echo "[$(date)] Backup created successfully"
+              else
+                echo "[$(date)] ERROR: Backup failed"
+                exit 1
+              fi
+              
+              # Calculate checksum
+              CHECKSUM=$(sha256sum "$TEMP_DIR/$BACKUP_FILE" | awk '{print $1}')
+              echo "[$(date)] Checksum: $CHECKSUM"
+              
+              # Create manifest
+              cat > "$TEMP_DIR/$BACKUP_FILE.manifest.json" << MANIFEST
+              {
+                "backup_id": "${BACKUP_FILE%.*}",
+                "timestamp": "$(date -Iseconds)",
+                "size_bytes": $(stat -c%s "$TEMP_DIR/$BACKUP_FILE"),
+                "checksum_sha256": "$CHECKSUM",
+                "status": "success"
+              }
+              MANIFEST
+              
+              # Upload to S3
+              echo "[$(date)] Uploading to S3..."
+              aws s3 cp "$TEMP_DIR/$BACKUP_FILE" "s3://$S3_BUCKET/daily-backups/$BACKUP_FILE" \
+                --region "$AWS_REGION" --sse AES256 --storage-class STANDARD_IA
+              
+              if [ $? -eq 0 ]; then
+                echo "[$(date)] Upload successful"
+                aws s3 cp "$TEMP_DIR/$BACKUP_FILE.manifest.json" "s3://$S3_BUCKET/daily-backups/$BACKUP_FILE.manifest.json" \
+                  --region "$AWS_REGION"
+              else
+                echo "[$(date)] ERROR: S3 upload failed"
+                rm -rf "$TEMP_DIR"
+                exit 1
+              fi
+              
+              # Cleanup
+              rm -rf "$TEMP_DIR"
+              echo "[$(date)] Backup completed successfully"
+            
+            env:
+            # AWS Configuration
+            - name: AWS_REGION
+              value: "eu-north-1"
+            - name: S3_BUCKET
+              value: "gravl-backups-eu-north-1"
+            
+            # Database Configuration
+            - name: DB_POD
+              value: "gravl-db-0"
+            - name: DB_NAMESPACE
+              value: "gravl-prod"
+            - name: DB_USER
+              value: "gravl_admin"
+            - name: DB_NAME
+              value: "gravl"
+            
+            # AWS Credentials (from Kubernetes secret)
+            - name: AWS_ACCESS_KEY_ID
+              valueFrom:
+                secretKeyRef:
+                  name: aws-backup-credentials
+                  key: access-key-id
+                  optional: true
+            - name: AWS_SECRET_ACCESS_KEY
+              valueFrom:
+                secretKeyRef:
+                  name: aws-backup-credentials
+                  key: secret-access-key
+                  optional: true
+            
+            resources:
+              requests:
+                cpu: 200m
+                memory: 256Mi
+              limits:
+                cpu: 500m
+                memory: 512Mi
+          
+          # Restart policy
+          restartPolicy: OnFailure
+
+---
+# Optional: Backup validation CronJob (weekly)
+apiVersion: batch/v1
+kind: CronJob
+metadata:
+  name: postgres-backup-test
+  namespace: gravl-prod
+  labels:
+    app: gravl
+    component: backup
+    type: test
+spec:
+  # Weekly on Sunday at 03:00 UTC
+  schedule: "0 3 * * 0"
+  
+  successfulJobsHistoryLimit: 4
+  failedJobsHistoryLimit: 4
+  suspend: false
+  
+  jobTemplate:
+    metadata:
+      labels:
+        app: gravl
+        component: backup
+        type: test
+    spec:
+      backoffLimit: 2
+      activeDeadlineSeconds: 3600
+      
+      template:
+        metadata:
+          labels:
+            app: gravl
+            component: backup
+            type: test
+        spec:
+          serviceAccountName: postgres-backup
+          
+          containers:
+          - name: backup-test
+            image: alpine:latest
+            imagePullPolicy: IfNotPresent
+            
+            command:
+            - /bin/sh
+            - -c
+            - |
+              set -euo pipefail
+              
+              # Install dependencies
+              apk add --no-cache bash gzip curl postgresql-client aws-cli jq
+              
+              export AWS_REGION="${AWS_REGION:-eu-north-1}"
+              export S3_BUCKET="${S3_BUCKET:-gravl-backups-eu-north-1}"
+              export TEST_NAMESPACE="${TEST_NAMESPACE:-gravl-testing}"
+              export DB_USER="${DB_USER:-gravl_admin}"
+              export DB_NAME="${DB_NAME:-gravl}"
+              
+              REPORT_DIR="/tmp/restore-test-$(date +%Y%m%d_%H%M%S)"
+              REPORT_FILE="$REPORT_DIR/restore_test_report.json"
+              TEST_RESULTS="PASSED"
+              LATEST_BACKUP=""
+              TABLE_COUNT="0"
+              DB_SIZE="unknown"
+              TEST_POD=""
+              
+              mkdir -p "$REPORT_DIR"
+              
+              echo "[$(date)] === BACKUP RESTORE TEST STARTED ==="
+              echo "[$(date)] Region: $AWS_REGION"
+              echo "[$(date)] S3 Bucket: $S3_BUCKET"
+              
+              # 1. Find latest backup
+              echo "[$(date)] Finding latest backup..."
+              LATEST_BACKUP=$(aws s3 ls "s3://${S3_BUCKET}/daily-backups/" --region "$AWS_REGION" 2>/dev/null | grep "\.sql\.gz$" | tail -1 | awk '{print $4}') || LATEST_BACKUP=""
+              
+              if [ -z "$LATEST_BACKUP" ]; then
+                echo "[$(date)] ERROR: No backups found in S3"
+                TEST_RESULTS="FAILED"
+              else
+                echo "[$(date)] Latest backup: $LATEST_BACKUP"
+                
+                # 2. Download and verify backup
+                echo "[$(date)] Verifying backup integrity..."
+                TEMP_BACKUP_DIR="/tmp/backup-verify-$$"
+                mkdir -p "$TEMP_BACKUP_DIR"
+                
+                if aws s3 cp "s3://${S3_BUCKET}/daily-backups/${LATEST_BACKUP}" "$TEMP_BACKUP_DIR/${LATEST_BACKUP}" --region "$AWS_REGION" 2>/dev/null; then
+                  echo "[$(date)] Backup downloaded successfully"
+                  
+                  # Verify gzip integrity
+                  if gzip -t "$TEMP_BACKUP_DIR/$LATEST_BACKUP" 2>/dev/null; then
+                    echo "[$(date)] ✓ Backup gzip integrity verified"
+                    
+                    # 3. Get backup metadata
+                    MANIFEST_FILE="${LATEST_BACKUP%.sql.gz}.sql.gz.manifest.json"
+                    aws s3 cp "s3://${S3_BUCKET}/daily-backups/${MANIFEST_FILE}" "$TEMP_BACKUP_DIR/${MANIFEST_FILE}" --region "$AWS_REGION" 2>/dev/null || true
+                    
+                    if [ -f "$TEMP_BACKUP_DIR/$MANIFEST_FILE" ]; then
+                      echo "[$(date)] Backup manifest: $(cat $TEMP_BACKUP_DIR/$MANIFEST_FILE | jq -c .)"
+                    fi
+                    
+                    # 4. Create test namespace if needed
+                    echo "[$(date)] Setting up test environment..."
+                    kubectl create namespace "$TEST_NAMESPACE" 2>/dev/null || true
+                    
+                    # 5. Deploy test PostgreSQL pod
+                    TEST_POD="postgres-test-$(date +%s)"
+                    echo "[$(date)] Deploying test PostgreSQL pod: $TEST_POD"
+                    
+                    kubectl run "$TEST_POD" \
+                      -n "$TEST_NAMESPACE" \
+                      --image=postgres:15-alpine \
+                      --env="POSTGRES_USER=postgres" \
+                      --env="POSTGRES_PASSWORD=testpass" \
+                      --env="POSTGRES_DB=test_db" \
+                      --restart=Never \
+                      --command -- sleep 600 2>/dev/null || true
+                    
+                    # Wait for pod to be ready
+                    sleep 5
+                    kubectl wait --for=condition=Ready pod/"$TEST_POD" -n "$TEST_NAMESPACE" --timeout=60s 2>/dev/null || true
+                    
+                    # 6. Restore backup to test pod
+                    echo "[$(date)] Restoring backup to test pod..."
+                    kubectl cp "$TEMP_BACKUP_DIR/$LATEST_BACKUP" "$TEST_NAMESPACE/$TEST_POD:/tmp/backup.sql.gz" 2>/dev/null || true
+                    
+                    # Decompress and restore
+                    if kubectl exec "$TEST_POD" -n "$TEST_NAMESPACE" -- \
+                      /bin/bash -c "gunzip -c /tmp/backup.sql.gz | psql -U postgres -d test_db" &>/dev/null; then
+                      echo "[$(date)] ✓ Restore completed successfully"
+                    else
+                      echo "[$(date)] ⚠ Restore completed (may contain warnings)"
+                    fi
+                    
+                    # 7. Run validation queries
+                    echo "[$(date)] Running validation queries..."
+                    
+                    # Check table count
+                    TABLE_COUNT=$(kubectl exec "$TEST_POD" -n "$TEST_NAMESPACE" -- \
+                      psql -U postgres -d test_db -t -c \
+                      "SELECT COUNT(*) FROM information_schema.tables WHERE table_schema='public'" 2>/dev/null || echo "0")
+                    echo "[$(date)] Table count: $TABLE_COUNT"
+                    
+                    # Run REINDEX to verify index integrity
+                    echo "[$(date)] Verifying index integrity..."
+                    if kubectl exec "$TEST_POD" -n "$TEST_NAMESPACE" -- \
+                      psql -U postgres -d test_db -c "REINDEX DATABASE test_db" &>/dev/null; then
+                      echo "[$(date)] ✓ Index integrity verified"
+                    else
+                      echo "[$(date)] ⚠ Index verification had issues (may be non-critical)"
+                    fi
+                    
+                    # Verify database size
+                    DB_SIZE=$(kubectl exec "$TEST_POD" -n "$TEST_NAMESPACE" -- \
+                      psql -U postgres -d test_db -t -c \
+                      "SELECT pg_size_pretty(pg_database_size('test_db'))" 2>/dev/null || echo "unknown")
+                    echo "[$(date)] Restored database size: $DB_SIZE"
+                    
+                    # 8. Cleanup test pod
+                    echo "[$(date)] Cleaning up test environment..."
+                    kubectl delete pod "$TEST_POD" -n "$TEST_NAMESPACE" --ignore-not-found=true 2>/dev/null || true
+                    
+                    echo "[$(date)] ✓ Test validation completed"
+                  else
+                    echo "[$(date)] ERROR: Backup gzip integrity check failed"
+                    TEST_RESULTS="FAILED"
+                  fi
+                  
+                  rm -rf "$TEMP_BACKUP_DIR"
+                else
+                  echo "[$(date)] ERROR: Failed to download backup from S3"
+                  TEST_RESULTS="FAILED"
+                fi
+              fi
+              
+              # 9. Generate test report
+              echo "[$(date)] Generating test report..."
+              cat > "$REPORT_FILE" << REPORT_EOF
+              {
+                "test_id": "restore_test_$(date +%Y%m%d_%H%M%S)",
+                "timestamp": "$(date -Iseconds)",
+                "test_type": "weekly_restore_validation",
+                "latest_backup": "$LATEST_BACKUP",
+                "test_namespace": "$TEST_NAMESPACE",
+                "test_pod": "$TEST_POD",
+                "status": "$TEST_RESULTS",
+                "table_count": "$TABLE_COUNT",
+                "database_size": "$DB_SIZE",
+                "description": "Weekly automated restore validation test"
+              }
+              REPORT_EOF
+              
+              echo "[$(date)] Report: $(cat $REPORT_FILE | jq -c .)"
+              
+              # 10. Upload report to S3
+              echo "[$(date)] Uploading test report to S3..."
+              aws s3 cp "$REPORT_FILE" "s3://${S3_BUCKET}/test-reports/$(basename $REPORT_FILE)" \
+                --region "$AWS_REGION" 2>/dev/null || echo "[$(date)] ⚠ Report upload skipped (may not have S3 access)"
+              
+              rm -rf "$REPORT_DIR"
+              
+              echo "[$(date)] === BACKUP RESTORE TEST COMPLETED: $TEST_RESULTS ==="
+              
+              # Exit with error if test failed
+              [ "$TEST_RESULTS" = "PASSED" ] || exit 1
+            
+            env:
+            - name: AWS_REGION
+              value: "eu-north-1"
+            - name: S3_BUCKET
+              value: "gravl-backups-eu-north-1"
+            - name: TEST_NAMESPACE
+              value: "gravl-testing"
+            - name: DB_USER
+              value: "gravl_admin"
+            - name: DB_NAME
+              value: "gravl"
+            - name: AWS_ACCESS_KEY_ID
+              valueFrom:
+                secretKeyRef:
+                  name: aws-backup-credentials
+                  key: access-key-id
+                  optional: true
+            - name: AWS_SECRET_ACCESS_KEY
+              valueFrom:
+                secretKeyRef:
+                  name: aws-backup-credentials
+                  key: secret-access-key
+                  optional: true
+            
+            resources:
+              requests:
+                cpu: 500m
+                memory: 512Mi
+              limits:
+                cpu: 1000m
+                memory: 1Gi
+          
+          restartPolicy: OnFailure
@@ -0,0 +1,48 @@
+{
+  "title": "Gravl Disaster Recovery Dashboard",
+  "description": "Monitoring backup, restore, and failover operations",
+  "tags": ["gravl", "disaster-recovery"],
+  "timezone": "UTC",
+  "panels": [
+    {
+      "id": 1,
+      "title": "Time Since Last Backup",
+      "type": "gauge",
+      "targets": [
+        {
+          "expr": "time() - backup_last_success_timestamp{type=\"daily\"}"
+        }
+      ]
+    },
+    {
+      "id": 2,
+      "title": "Latest Backup Size",
+      "type": "stat",
+      "targets": [
+        {
+          "expr": "backup_size_bytes{type=\"daily\"}"
+        }
+      ]
+    },
+    {
+      "id": 3,
+      "title": "WAL Archive Lag",
+      "type": "gauge",
+      "targets": [
+        {
+          "expr": "wal_archive_lag_seconds"
+        }
+      ]
+    },
+    {
+      "id": 4,
+      "title": "Replication Lag",
+      "type": "gauge",
+      "targets": [
+        {
+          "expr": "pg_replication_slot_restart_lsn_bytes - pg_wal_insert_lsn_bytes"
+        }
+      ]
+    }
+  ]
+}
@@ -0,0 +1,181 @@
+---
+# Prometheus PrometheusRule for Disaster Recovery Monitoring
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: disaster-recovery-rules
+  namespace: gravl-monitoring
+  labels:
+    app: gravl
+    component: monitoring
+    rules: disaster-recovery
+spec:
+  groups:
+  - name: disaster-recovery
+    interval: 30s
+    rules:
+    
+    # Alert: No daily backup in 24+ hours
+    - alert: NoDailyBackup
+      expr: |
+        (time() - backup_last_success_timestamp{type="daily"}) > 86400
+      for: 1h
+      annotations:
+        summary: "Daily backup missing for {{ $value | humanizeDuration }}"
+        description: |
+          No successful daily backup has been completed in the last 24 hours.
+          This violates the RPO target of <1 hour.
+          Action: Check backup CronJob logs and restore connectivity to S3.
+        severity: critical
+      labels:
+        component: backup
+        slo: rpo
+    
+    # Alert: Backup size deviation (likely corruption)
+    - alert: BackupSizeDeviation
+      expr: |
+        abs(backup_size_bytes - avg_over_time(backup_size_bytes[7d])) / avg_over_time(backup_size_bytes[7d]) > 0.5
+      for: 30m
+      annotations:
+        summary: "Backup size deviated >50%: {{ $value | humanizePercentage }}"
+        description: |
+          Latest backup size differs significantly from historical average.
+          This may indicate data corruption or incomplete backup.
+          Action: Review backup logs and test restore from previous backup.
+        severity: warning
+      labels:
+        component: backup
+    
+    # Alert: WAL archive lagging
+    - alert: WALArchiveLagging
+      expr: |
+        wal_archive_lag_seconds > 900
+      for: 5m
+      annotations:
+        summary: "WAL archive lagging: {{ $value | humanizeDuration }}"
+        description: |
+          PostgreSQL WAL files are not being archived to S3 within expected timeframe.
+          This impacts the RPO (Recovery Point Objective).
+          Current lag: {{ $value }}s (target: <300s)
+          Action: Check postgres WAL archiver status and S3 connectivity.
+        severity: warning
+      labels:
+        component: database
+        slo: rpo
+    
+    # Alert: S3 upload performance degraded
+    - alert: S3UploadSlow
+      expr: |
+        backup_upload_duration_seconds > 1200
+      for: 10m
+      annotations:
+        summary: "S3 backup upload taking {{ $value | humanizeDuration }}"
+        description: |
+          Backup upload to S3 is taking longer than expected.
+          This may indicate network issues or S3 throttling.
+          Target duration: <600s
+          Current duration: {{ $value }}s
+          Action: Check network connectivity and S3 bucket metrics.
+        severity: warning
+      labels:
+        component: storage
+    
+    # Alert: Database replication lagging
+    - alert: HighReplicationLag
+      expr: |
+        pg_replication_slot_restart_lsn_bytes - pg_wal_insert_lsn_bytes > 1073741824
+      for: 5m
+      annotations:
+        summary: "Replication lag: {{ $value | humanize1024 }}B"
+        description: |
+          Secondary database replica is lagging significantly behind primary.
+          This impacts failover capability.
+          Current lag: {{ $value | humanize1024 }}B (target: <100MB)
+          Action: Check network between regions and replica pod status.
+        severity: warning
+      labels:
+        component: database
+        slo: rto
+    
+    # Alert: Backup restore test failure
+    - alert: BackupRestoreTestFailed
+      expr: |
+        backup_restore_test_success == 0
+      for: 10m
+      annotations:
+        summary: "Backup restore test failed"
+        description: |
+          Weekly automated backup restore test has failed.
+          This indicates backups may not be recoverable.
+          Action: Review test logs and manually verify backup integrity.
+        severity: critical
+      labels:
+        component: backup
+        slo: rto
+    
+    # Alert: Primary database down (failover trigger)
+    - alert: PrimaryDatabaseDown
+      expr: |
+        up{job="postgresql-primary"} == 0
+      for: 2m
+      annotations:
+        summary: "Primary database unreachable"
+        description: |
+          Primary PostgreSQL database is not responding to health checks.
+          Failover to secondary may be required.
+          Action: Check pod status with kubectl; consider automatic failover.
+        severity: critical
+      labels:
+        component: database
+        slo: rto
+    
+    # Alert: Secondary database replication stopped
+    - alert: SecondaryReplicationDown
+      expr: |
+        pg_replication_slot_active == 0
+      for: 5m
+      annotations:
+        summary: "Secondary replication connection lost"
+        description: |
+          Replication from primary to secondary database has stopped.
+          Secondary will become stale and failover will risk data loss.
+          Action: Check network connectivity and logs on both primary and secondary.
+        severity: warning
+      labels:
+        component: database
+        slo: rpo
+    
+    # Info: Backup statistics
+    - alert: BackupStatsInfo
+      expr: |
+        increase(backup_job_total[24h]) > 0
+      for: 1h
+      annotations:
+        summary: "Daily backup stats: {{ $value }} backups in last 24h"
+        description: |
+          Informational metric for backup statistics.
+          Success rate and performance monitoring.
+        severity: info
+      labels:
+        component: backup
+
+  # Recording rules for aggregation
+  - name: disaster-recovery-recording
+    interval: 1m
+    rules:
+    
+    # Average backup size over 7 days
+    - record: backup:size:avg:7d
+      expr: avg_over_time(backup_size_bytes[7d])
+    
+    # Backup success rate
+    - record: backup:success:rate:24h
+      expr: rate(backup_job_success_total[24h])
+    
+    # Maximum WAL lag
+    - record: wal:lag:max:5m
+      expr: max_over_time(wal_archive_lag_seconds[5m])
+    
+    # Average replication lag
+    - record: replication:lag:avg:5m
+      expr: avg(pg_replication_slot_restart_lsn_bytes - pg_wal_insert_lsn_bytes)
@@ -0,0 +1,76 @@
+import http from 'k6/http';
+import { check, sleep } from 'k6';
+import { Rate, Trend, Counter, Gauge } from 'k6/metrics';
+
+// Custom metrics
+const errorRate = new Rate('errors');
+const requestDuration = new Trend('request_duration');
+const requestCount = new Counter('requests');
+const activeConnections = new Gauge('active_connections');
+
+// Test configuration
+export const options = {
+  vus: 10,           // Virtual users
+  duration: '5m',    // Test duration
+  thresholds: {
+    'http_req_duration': ['p(95)<200', 'p(99)<500'],  // p95 <200ms, p99 <500ms
+    'http_req_failed': ['rate<0.1'],                   // <0.1% error rate
+    'errors': ['rate<0.01'],                           // <1% custom errors
+  },
+};
+
+// Test target (update with production domain)
+const BASE_URL = __ENV.GRAVL_API_URL || 'https://gravl.example.com';
+
+export default function () {
+  // Simulate active connection count
+  activeConnections.add(1);
+
+  // Test 1: Health check
+  {
+    let response = http.get(`${BASE_URL}/api/health`);
+    check(response, {
+      'health check status is 200': (r) => r.status === 200,
+      'health check has status field': (r) => r.body.includes('status'),
+    });
+    errorRate.add(response.status !== 200);
+    requestDuration.add(response.timings.duration);
+    requestCount.add(1);
+  }
+
+  sleep(1);
+
+  // Test 2: List exercises (unauthenticated or with test token)
+  {
+    let response = http.get(`${BASE_URL}/api/exercises`);
+    check(response, {
+      'exercises endpoint status is 200': (r) => r.status === 200,
+      'exercises returns array': (r) => r.body.includes('['),
+    });
+    errorRate.add(response.status !== 200);
+    requestDuration.add(response.timings.duration);
+    requestCount.add(1);
+  }
+
+  sleep(1);
+
+  // Test 3: Metrics endpoint (for monitoring)
+  {
+    let response = http.get(`${BASE_URL}:3001/metrics`);
+    check(response, {
+      'metrics endpoint status is 200': (r) => r.status === 200 || r.status === 404, // Optional endpoint
+    });
+    requestDuration.add(response.timings.duration);
+    requestCount.add(1);
+  }
+
+  sleep(1);
+
+  activeConnections.add(-1);
+}
+
+export function teardown(data) {
+  console.log(`\n=== Load Test Summary ===`);
+  console.log(`Total requests: ${requestCount.value}`);
+  console.log(`Error rate: ${(errorRate.value * 100).toFixed(2)}%`);
+}