---
# PostgreSQL Backup: ServiceAccount and RBAC.
# One identity is shared by the daily backup job (exec into the prod DB pod)
# and the weekly restore-test job (create namespace / run / wait / cp / delete
# a scratch pod). The original ClusterRole was missing pods create/delete/watch
# and namespaces get/create, so the restore test could never manage its pod.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: postgres-backup
  namespace: gravl-prod
  labels:
    app: gravl
    component: backup
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: postgres-backup
  labels:
    app: gravl
    component: backup
rules:
  # get/list: locate the production DB pod.
  # create/delete/watch: kubectl run / wait / delete for the restore-test pod.
  - apiGroups: [""]
    resources: ["pods"]
    verbs: ["get", "list", "watch", "create", "delete"]
  # kubectl exec (pg_dump) and kubectl cp both go through pods/exec.
  - apiGroups: [""]
    resources: ["pods/exec"]
    verbs: ["create"]
  # The restore test creates the scratch namespace on first run.
  - apiGroups: [""]
    resources: ["namespaces"]
    verbs: ["get", "create"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: postgres-backup
  labels:
    app: gravl
    component: backup
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: postgres-backup
subjects:
  - kind: ServiceAccount
    name: postgres-backup
    namespace: gravl-prod
---
# Daily PostgreSQL backup: pg_dump via kubectl exec, gzip, upload to S3.
apiVersion: batch/v1
kind: CronJob
metadata:
  name: postgres-backup
  namespace: gravl-prod
  labels:
    app: gravl
    component: backup
    schedule: daily
spec:
  # Daily at 02:00 UTC
  schedule: "0 2 * * *"
  # Keep backup job history for 7 days
  successfulJobsHistoryLimit: 7
  failedJobsHistoryLimit: 7
  # Set to true to pause backups without deleting the CronJob.
  suspend: false
  jobTemplate:
    metadata:
      labels:
        app: gravl
        component: backup
    spec:
      backoffLimit: 3
      activeDeadlineSeconds: 3600  # 1 hour timeout
      template:
        metadata:
          labels:
            app: gravl
            component: backup
        spec:
          serviceAccountName: postgres-backup
          # Prefer nodes labeled for database work (if available).
          affinity:
            nodeAffinity:
              preferredDuringSchedulingIgnoredDuringExecution:
                - weight: 100
                  preference:
                    matchExpressions:
                      - key: node-type
                        operator: In
                        values:
                          - database
          containers:
            - name: postgres-backup
              # Pinned tag: alpine:latest is mutable and can silently change
              # the toolchain underneath the job.
              image: alpine:3.19
              imagePullPolicy: IfNotPresent
              command:
                - /bin/sh
                - -c
                - |
                  # Fail fast. Without pipefail, a failing pg_dump piped into
                  # a succeeding gzip would be recorded as a good backup.
                  set -euo pipefail

                  # kubectl was missing from the original package list even
                  # though the script depends on it.
                  apk add --no-cache bash gzip curl postgresql-client aws-cli jq kubectl

                  AWS_REGION="${AWS_REGION:-eu-north-1}"
                  S3_BUCKET="${S3_BUCKET:-gravl-backups-eu-north-1}"
                  DB_POD="${DB_POD:-gravl-db-0}"
                  DB_NAMESPACE="${DB_NAMESPACE:-gravl-prod}"
                  DB_USER="${DB_USER:-gravl_admin}"
                  DB_NAME="${DB_NAME:-gravl}"
                  export AWS_REGION S3_BUCKET DB_POD DB_NAMESPACE DB_USER DB_NAME

                  BACKUP_DATE=$(date +%Y-%m-%d)
                  BACKUP_FILE="gravl_${BACKUP_DATE}.sql.gz"
                  TEMP_DIR="/tmp/backup-$$"

                  echo "[$(date)] Starting PostgreSQL backup..."
                  mkdir -p "$TEMP_DIR"

                  # No TTY exists in a Job pod, so the previous `-it` made
                  # kubectl exec fail; pg_dump writes to stdout and reads
                  # nothing from stdin. stderr now stays visible in job logs
                  # instead of being discarded.
                  echo "[$(date)] Executing pg_dump..."
                  if kubectl exec "$DB_POD" -n "$DB_NAMESPACE" -- \
                       pg_dump -h localhost -U "$DB_USER" -d "$DB_NAME" --no-password | \
                       gzip -6 > "$TEMP_DIR/$BACKUP_FILE"; then
                    echo "[$(date)] Backup created successfully"
                  else
                    echo "[$(date)] ERROR: Backup failed"
                    exit 1
                  fi

                  CHECKSUM=$(sha256sum "$TEMP_DIR/$BACKUP_FILE" | awk '{print $1}')
                  echo "[$(date)] Checksum: $CHECKSUM"

                  # NOTE(review): ${BACKUP_FILE%.*} strips only ".gz", so the
                  # backup_id keeps a ".sql" suffix — kept as-is because
                  # downstream consumers may rely on the existing format.
                  cat > "$TEMP_DIR/$BACKUP_FILE.manifest.json" << MANIFEST
                  {
                    "backup_id": "${BACKUP_FILE%.*}",
                    "timestamp": "$(date -Iseconds)",
                    "size_bytes": $(stat -c%s "$TEMP_DIR/$BACKUP_FILE"),
                    "checksum_sha256": "$CHECKSUM",
                    "status": "success"
                  }
                  MANIFEST

                  # Test the upload directly: under `set -e` the previous
                  # `aws ...; if [ $? -eq 0 ]` pattern could never reach the
                  # error branch.
                  echo "[$(date)] Uploading to S3..."
                  if aws s3 cp "$TEMP_DIR/$BACKUP_FILE" "s3://$S3_BUCKET/daily-backups/$BACKUP_FILE" \
                       --region "$AWS_REGION" --sse AES256 --storage-class STANDARD_IA; then
                    echo "[$(date)] Upload successful"
                    aws s3 cp "$TEMP_DIR/$BACKUP_FILE.manifest.json" \
                      "s3://$S3_BUCKET/daily-backups/$BACKUP_FILE.manifest.json" \
                      --region "$AWS_REGION"
                  else
                    echo "[$(date)] ERROR: S3 upload failed"
                    rm -rf "$TEMP_DIR"
                    exit 1
                  fi

                  rm -rf "$TEMP_DIR"
                  echo "[$(date)] Backup completed successfully"
              env:
                # AWS configuration
                - name: AWS_REGION
                  value: "eu-north-1"
                - name: S3_BUCKET
                  value: "gravl-backups-eu-north-1"
                # Database configuration
                - name: DB_POD
                  value: "gravl-db-0"
                - name: DB_NAMESPACE
                  value: "gravl-prod"
                - name: DB_USER
                  value: "gravl_admin"
                - name: DB_NAME
                  value: "gravl"
                # AWS credentials from a Kubernetes secret; optional so the
                # job can fall back to node/IRSA credentials if present.
                - name: AWS_ACCESS_KEY_ID
                  valueFrom:
                    secretKeyRef:
                      name: aws-backup-credentials
                      key: access-key-id
                      optional: true
                - name: AWS_SECRET_ACCESS_KEY
                  valueFrom:
                    secretKeyRef:
                      name: aws-backup-credentials
                      key: secret-access-key
                      optional: true
              resources:
                requests:
                  cpu: 200m
                  memory: 256Mi
                limits:
                  cpu: 500m
                  memory: 512Mi
          restartPolicy: OnFailure
---
# Weekly backup validation: download the newest daily backup, restore it into
# a throwaway PostgreSQL pod, and sanity-check the result.
apiVersion: batch/v1
kind: CronJob
metadata:
  name: postgres-backup-test
  namespace: gravl-prod
  labels:
    app: gravl
    component: backup
    type: test
spec:
  # Weekly on Sunday at 03:00 UTC
  schedule: "0 3 * * 0"
  successfulJobsHistoryLimit: 4
  failedJobsHistoryLimit: 4
  suspend: false
  jobTemplate:
    metadata:
      labels:
        app: gravl
        component: backup
        type: test
    spec:
      backoffLimit: 2
      activeDeadlineSeconds: 3600
      template:
        metadata:
          labels:
            app: gravl
            component: backup
            type: test
        spec:
          serviceAccountName: postgres-backup
          containers:
            - name: backup-test
              image: alpine:3.19
              imagePullPolicy: IfNotPresent
              command:
                - /bin/sh
                - -c
                - |
                  set -euo pipefail

                  # kubectl was missing from the original package list even
                  # though the script depends on it.
                  apk add --no-cache bash gzip curl postgresql-client aws-cli jq kubectl

                  AWS_REGION="${AWS_REGION:-eu-north-1}"
                  S3_BUCKET="${S3_BUCKET:-gravl-backups-eu-north-1}"
                  TEST_NAMESPACE="${TEST_NAMESPACE:-gravl-testing}"
                  DB_USER="${DB_USER:-gravl_admin}"
                  DB_NAME="${DB_NAME:-gravl}"
                  export AWS_REGION S3_BUCKET TEST_NAMESPACE DB_USER DB_NAME

                  REPORT_DIR="/tmp/restore-test-$(date +%Y%m%d_%H%M%S)"
                  REPORT_FILE="$REPORT_DIR/restore_test_report.json"
                  TEST_RESULTS="PASSED"
                  LATEST_BACKUP=""
                  TABLE_COUNT="0"
                  DB_SIZE="unknown"
                  TEST_POD=""
                  mkdir -p "$REPORT_DIR"

                  echo "[$(date)] === BACKUP RESTORE TEST STARTED ==="
                  echo "[$(date)] Region: $AWS_REGION"
                  echo "[$(date)] S3 Bucket: $S3_BUCKET"

                  # 1. Find the latest backup. Keys embed %Y-%m-%d dates, so
                  # S3's lexicographic ordering is also chronological.
                  echo "[$(date)] Finding latest backup..."
                  LATEST_BACKUP=$(aws s3 ls "s3://${S3_BUCKET}/daily-backups/" --region "$AWS_REGION" 2>/dev/null | grep "\.sql\.gz$" | tail -1 | awk '{print $4}') || LATEST_BACKUP=""

                  if [ -z "$LATEST_BACKUP" ]; then
                    echo "[$(date)] ERROR: No backups found in S3"
                    TEST_RESULTS="FAILED"
                  else
                    echo "[$(date)] Latest backup: $LATEST_BACKUP"

                    # 2. Download and verify backup
                    echo "[$(date)] Verifying backup integrity..."
                    TEMP_BACKUP_DIR="/tmp/backup-verify-$$"
                    mkdir -p "$TEMP_BACKUP_DIR"

                    if aws s3 cp "s3://${S3_BUCKET}/daily-backups/${LATEST_BACKUP}" "$TEMP_BACKUP_DIR/${LATEST_BACKUP}" --region "$AWS_REGION" 2>/dev/null; then
                      echo "[$(date)] Backup downloaded successfully"

                      if gzip -t "$TEMP_BACKUP_DIR/$LATEST_BACKUP" 2>/dev/null; then
                        echo "[$(date)] ✓ Backup gzip integrity verified"

                        # 3. Backup metadata (best effort). The manifest key is
                        # simply the backup key plus ".manifest.json".
                        MANIFEST_FILE="${LATEST_BACKUP}.manifest.json"
                        aws s3 cp "s3://${S3_BUCKET}/daily-backups/${MANIFEST_FILE}" "$TEMP_BACKUP_DIR/${MANIFEST_FILE}" --region "$AWS_REGION" 2>/dev/null || true
                        if [ -f "$TEMP_BACKUP_DIR/$MANIFEST_FILE" ]; then
                          echo "[$(date)] Backup manifest: $(jq -c . "$TEMP_BACKUP_DIR/$MANIFEST_FILE")"
                        fi

                        # 4. Test namespace (idempotent)
                        echo "[$(date)] Setting up test environment..."
                        kubectl create namespace "$TEST_NAMESPACE" 2>/dev/null || true

                        # 5. Throwaway PostgreSQL pod. Do NOT override the
                        # command: the original ran `--command -- sleep 600`,
                        # which replaced the postgres entrypoint so the server
                        # never started and every later check silently no-op'd
                        # while the test still reported PASSED.
                        TEST_POD="postgres-test-$(date +%s)"
                        echo "[$(date)] Deploying test PostgreSQL pod: $TEST_POD"
                        kubectl run "$TEST_POD" \
                          -n "$TEST_NAMESPACE" \
                          --image=postgres:15-alpine \
                          --env="POSTGRES_USER=postgres" \
                          --env="POSTGRES_PASSWORD=testpass" \
                          --env="POSTGRES_DB=test_db" \
                          --restart=Never

                        kubectl wait --for=condition=Ready pod/"$TEST_POD" -n "$TEST_NAMESPACE" --timeout=120s || true

                        # Container Ready does not imply postgres accepts
                        # connections yet — poll pg_isready with a bound.
                        READY=0
                        for i in $(seq 1 30); do
                          if kubectl exec "$TEST_POD" -n "$TEST_NAMESPACE" -- pg_isready -U postgres >/dev/null 2>&1; then
                            READY=1
                            break
                          fi
                          sleep 2
                        done

                        if [ "$READY" -ne 1 ]; then
                          echo "[$(date)] ERROR: test PostgreSQL never became ready"
                          TEST_RESULTS="FAILED"
                        else
                          # 6. Restore the backup into the test pod.
                          echo "[$(date)] Restoring backup to test pod..."
                          kubectl cp "$TEMP_BACKUP_DIR/$LATEST_BACKUP" "$TEST_NAMESPACE/$TEST_POD:/tmp/backup.sql.gz"

                          if kubectl exec "$TEST_POD" -n "$TEST_NAMESPACE" -- \
                               /bin/sh -c "gunzip -c /tmp/backup.sql.gz | psql -U postgres -d test_db" >/dev/null 2>&1; then
                            echo "[$(date)] ✓ Restore completed successfully"
                          else
                            echo "[$(date)] ⚠ Restore completed (may contain warnings)"
                          fi

                          # 7. Validation queries
                          echo "[$(date)] Running validation queries..."
                          TABLE_COUNT=$(kubectl exec "$TEST_POD" -n "$TEST_NAMESPACE" -- \
                            psql -U postgres -d test_db -t -c \
                            "SELECT COUNT(*) FROM information_schema.tables WHERE table_schema='public'" 2>/dev/null || echo "0")
                          TABLE_COUNT=$(echo "$TABLE_COUNT" | tr -d '[:space:]')
                          echo "[$(date)] Table count: $TABLE_COUNT"

                          # A restore that produced zero tables validates
                          # nothing — previously this path still PASSED.
                          if [ "${TABLE_COUNT:-0}" = "0" ]; then
                            echo "[$(date)] ERROR: restored database has no tables"
                            TEST_RESULTS="FAILED"
                          fi

                          echo "[$(date)] Verifying index integrity..."
                          if kubectl exec "$TEST_POD" -n "$TEST_NAMESPACE" -- \
                               psql -U postgres -d test_db -c "REINDEX DATABASE test_db" >/dev/null 2>&1; then
                            echo "[$(date)] ✓ Index integrity verified"
                          else
                            echo "[$(date)] ⚠ Index verification had issues (may be non-critical)"
                          fi

                          DB_SIZE=$(kubectl exec "$TEST_POD" -n "$TEST_NAMESPACE" -- \
                            psql -U postgres -d test_db -t -c \
                            "SELECT pg_size_pretty(pg_database_size('test_db'))" 2>/dev/null || echo "unknown")
                          echo "[$(date)] Restored database size: $DB_SIZE"
                        fi

                        # 8. Cleanup test pod
                        echo "[$(date)] Cleaning up test environment..."
                        kubectl delete pod "$TEST_POD" -n "$TEST_NAMESPACE" --ignore-not-found=true 2>/dev/null || true
                        echo "[$(date)] ✓ Test validation completed"
                      else
                        echo "[$(date)] ERROR: Backup gzip integrity check failed"
                        TEST_RESULTS="FAILED"
                      fi
                      rm -rf "$TEMP_BACKUP_DIR"
                    else
                      echo "[$(date)] ERROR: Failed to download backup from S3"
                      TEST_RESULTS="FAILED"
                    fi
                  fi

                  # 9. Generate test report
                  echo "[$(date)] Generating test report..."
                  cat > "$REPORT_FILE" << REPORT_EOF
                  {
                    "test_id": "restore_test_$(date +%Y%m%d_%H%M%S)",
                    "timestamp": "$(date -Iseconds)",
                    "test_type": "weekly_restore_validation",
                    "latest_backup": "$LATEST_BACKUP",
                    "test_namespace": "$TEST_NAMESPACE",
                    "test_pod": "$TEST_POD",
                    "status": "$TEST_RESULTS",
                    "table_count": "$TABLE_COUNT",
                    "database_size": "$DB_SIZE",
                    "description": "Weekly automated restore validation test"
                  }
                  REPORT_EOF
                  echo "[$(date)] Report: $(jq -c . "$REPORT_FILE")"

                  # 10. Upload report to S3 (best effort)
                  echo "[$(date)] Uploading test report to S3..."
                  aws s3 cp "$REPORT_FILE" "s3://${S3_BUCKET}/test-reports/$(basename $REPORT_FILE)" \
                    --region "$AWS_REGION" 2>/dev/null || echo "[$(date)] ⚠ Report upload skipped (may not have S3 access)"

                  rm -rf "$REPORT_DIR"
                  echo "[$(date)] === BACKUP RESTORE TEST COMPLETED: $TEST_RESULTS ==="

                  # Exit non-zero so the Job (and alerting) sees a failure.
                  [ "$TEST_RESULTS" = "PASSED" ] || exit 1
              env:
                - name: AWS_REGION
                  value: "eu-north-1"
                - name: S3_BUCKET
                  value: "gravl-backups-eu-north-1"
                - name: TEST_NAMESPACE
                  value: "gravl-testing"
                - name: DB_USER
                  value: "gravl_admin"
                - name: DB_NAME
                  value: "gravl"
                - name: AWS_ACCESS_KEY_ID
                  valueFrom:
                    secretKeyRef:
                      name: aws-backup-credentials
                      key: access-key-id
                      optional: true
                - name: AWS_SECRET_ACCESS_KEY
                  valueFrom:
                    secretKeyRef:
                      name: aws-backup-credentials
                      key: secret-access-key
                      optional: true
              resources:
                requests:
                  cpu: 500m
                  memory: 512Mi
                limits:
                  cpu: 1000m
                  memory: 1Gi
          restartPolicy: OnFailure