gravl/k8s/monitoring/prometheus-rules-dr.yaml
clawd d81e403f01 Phase 06 Tier 1: Complete Backend Implementation - Recovery Tracking & Swap System
COMPLETED TASKS:
 06-01: Workout Swap System
   - Added swapped_from_id to workout_logs
   - Created workout_swaps table for history
   - POST /api/workouts/:id/swap endpoint
   - GET /api/workouts/available endpoint
   - Reversible swaps with audit trail
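
A minimal sketch of how the swap endpoint could look (Express-style TypeScript; the db helper, the workout_swaps column names, and treating :id as the workout_logs row are assumptions, not the shipped code):

    import { Router, Request, Response } from "express";
    import { db } from "../db"; // hypothetical query helper (e.g. a pg Pool wrapper)

    export const workoutsRouter = Router();

    // POST /api/workouts/:id/swap -- swap a planned workout, keeping an audit trail
    workoutsRouter.post("/:id/swap", async (req: Request, res: Response) => {
      const logId = Number(req.params.id); // assumption: :id refers to the workout_logs row
      const { newWorkoutId } = req.body;

      // Point the log at the new workout and remember the original so the swap is reversible
      const { rows } = await db.query(
        `UPDATE workout_logs
            SET swapped_from_id = workout_id, workout_id = $1
          WHERE id = $2
          RETURNING id, workout_id, swapped_from_id`,
        [newWorkoutId, logId],
      );
      if (rows.length === 0) return res.status(404).json({ error: "workout log not found" });

      // Append to the swap history used by the audit trail
      await db.query(
        "INSERT INTO workout_swaps (workout_log_id, original_workout_id, new_workout_id) VALUES ($1, $2, $3)",
        [logId, rows[0].swapped_from_id, newWorkoutId],
      );

      res.json(rows[0]);
    });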

 06-02: Muscle Group Recovery Tracking
   - Created muscle_group_recovery table
   - Implemented calculateRecoveryScore() function
   - GET /api/recovery/muscle-groups endpoint
   - GET /api/recovery/most-recovered endpoint
   - Auto-tracking on workout log completion
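
The commit doesn't spell out the formula, but the idea behind calculateRecoveryScore() can be sketched as a simple time-based model (the per-muscle-group recovery windows below are illustrative assumptions):

    // Returns 0 (just trained) .. 100 (fully recovered) for one muscle group.
    const RECOVERY_WINDOW_HOURS: Record<string, number> = {
      chest: 48,
      back: 48,
      legs: 72,
      shoulders: 48,
      arms: 36,
      core: 24,
    };

    export function calculateRecoveryScore(
      muscleGroup: string,
      lastTrainedAt: Date,
      now: Date = new Date(),
    ): number {
      const windowHours = RECOVERY_WINDOW_HOURS[muscleGroup] ?? 48;
      const hoursSince = (now.getTime() - lastTrainedAt.getTime()) / 36e5;
      return Math.max(0, Math.min(100, Math.round((hoursSince / windowHours) * 100)));
    }

GET /api/recovery/most-recovered would then amount to sorting these per-group scores in descending order.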

 06-03: Smart Workout Recommendations
   - GET /api/recommendations/smart-workout endpoint
   - 7-day workout analysis algorithm
   - Recovery-based filtering (>30% threshold)
   - Top 3 recommendations with context
   - Context-aware reasoning messages
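
Roughly, the recommendation step described above could combine recovery scores with the available workouts like this (types, names, and the ranking heuristic are assumptions; the real algorithm also folds in the 7-day workout analysis):

    interface MuscleRecovery { muscleGroup: string; score: number } // 0..100
    interface WorkoutOption { id: number; name: string; primaryMuscles: string[] }

    export function recommendWorkouts(
      recovery: MuscleRecovery[],
      available: WorkoutOption[],
    ): { workout: WorkoutOption; reason: string }[] {
      // Recovery-based filtering: only muscle groups above the 30% threshold qualify
      const ready = new Map(
        recovery.filter(r => r.score > 30).map((r): [string, number] => [r.muscleGroup, r.score]),
      );

      return available
        // keep workouts whose primary muscles are all sufficiently recovered
        .filter(w => w.primaryMuscles.every(m => ready.has(m)))
        // rank by how recovered the targeted muscles are, on average
        .map(w => ({
          workout: w,
          avg: w.primaryMuscles.reduce((sum, m) => sum + (ready.get(m) ?? 0), 0) /
            Math.max(1, w.primaryMuscles.length),
        }))
        .sort((a, b) => b.avg - a.avg)
        .slice(0, 3) // top 3 recommendations
        .map(({ workout, avg }) => ({
          workout,
          reason: `Targeted muscles are roughly ${Math.round(avg)}% recovered`,
        }));
    }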

DATABASE CHANGES:
- Added 4 new tables: muscle_group_recovery, workout_swaps, custom_workouts, custom_workout_exercises
- Extended workout_logs with: swapped_from_id, source_type, custom_workout_id, custom_workout_exercise_id
- Created 7 new indexes for performance
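
For illustration, the workout_logs extension and one of the indexes might look like this (column types, defaults, and index names are assumptions; the actual migration is not reproduced in this commit message):

    import { db } from "../db"; // hypothetical query helper

    export async function migratePhase06(): Promise<void> {
      // Extend workout_logs so a log can point at a swap source or a custom workout
      await db.query(`
        ALTER TABLE workout_logs
          ADD COLUMN swapped_from_id            INTEGER,
          ADD COLUMN source_type                TEXT NOT NULL DEFAULT 'standard',
          ADD COLUMN custom_workout_id          INTEGER,
          ADD COLUMN custom_workout_exercise_id INTEGER
      `);
      // One of the new indexes: recovery rows are looked up per user and muscle group
      await db.query(
        "CREATE INDEX idx_muscle_group_recovery_user ON muscle_group_recovery (user_id, muscle_group)",
      );
    }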

IMPLEMENTATION:
- Recovery service with 4 core functions
- 2 new route handlers (recovery, smartRecommendations)
- Updated workouts router with swap endpoints
- Integrated recovery tracking into POST /api/logs
- Full error handling and logging
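
The integration point into POST /api/logs can be pictured as a small hook that runs after the log row is saved (module and function names below are assumed, not the real identifiers):

    import { recoveryService } from "../services/recovery"; // hypothetical module

    // Called by the POST /api/logs handler once the workout log has been persisted,
    // so recovery scores reset for every muscle group the workout trained.
    export async function trackRecoveryForLog(
      userId: number,
      logId: number,
      muscleGroups: string[],
    ): Promise<void> {
      for (const muscleGroup of muscleGroups) {
        await recoveryService.recordWorkout(userId, muscleGroup, new Date());
      }
      console.log(`recovery tracking updated for workout log ${logId}`);
    }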

TESTING:
- Test file created: /backend/test/phase-06-tests.js
- Ready for E2E and staging validation

STATUS: Ready for frontend integration and production review
Branch: feature/06-phase-06
2026-03-06 20:54:03 +01:00

---
# Prometheus PrometheusRule for Disaster Recovery Monitoring
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: disaster-recovery-rules
  namespace: gravl-monitoring
  labels:
    app: gravl
    component: monitoring
    rules: disaster-recovery
spec:
  groups:
    - name: disaster-recovery
      interval: 30s
      rules:
        # Alert: No daily backup in 24+ hours
        - alert: NoDailyBackup
          expr: |
            (time() - backup_last_success_timestamp{type="daily"}) > 86400
          for: 1h
          annotations:
            summary: "Daily backup missing for {{ $value | humanizeDuration }}"
            description: |
              No successful daily backup has been completed in the last 24 hours.
              This violates the RPO target of <1 hour.
              Action: Check backup CronJob logs and restore connectivity to S3.
          labels:
            severity: critical
            component: backup
            slo: rpo
        # Alert: Backup size deviation (likely corruption)
        - alert: BackupSizeDeviation
          expr: |
            abs(backup_size_bytes - avg_over_time(backup_size_bytes[7d])) / avg_over_time(backup_size_bytes[7d]) > 0.5
          for: 30m
          annotations:
            summary: "Backup size deviated >50%: {{ $value | humanizePercentage }}"
            description: |
              Latest backup size differs significantly from historical average.
              This may indicate data corruption or incomplete backup.
              Action: Review backup logs and test restore from previous backup.
          labels:
            severity: warning
            component: backup
        # Alert: WAL archive lagging
        - alert: WALArchiveLagging
          expr: |
            wal_archive_lag_seconds > 900
          for: 5m
          annotations:
            summary: "WAL archive lagging: {{ $value | humanizeDuration }}"
            description: |
              PostgreSQL WAL files are not being archived to S3 within the expected timeframe.
              This impacts the RPO (Recovery Point Objective).
              Current lag: {{ $value }}s (target: <300s)
              Action: Check postgres WAL archiver status and S3 connectivity.
          labels:
            severity: warning
            component: database
            slo: rpo
        # Alert: S3 upload performance degraded
        - alert: S3UploadSlow
          expr: |
            backup_upload_duration_seconds > 1200
          for: 10m
          annotations:
            summary: "S3 backup upload taking {{ $value | humanizeDuration }}"
            description: |
              Backup upload to S3 is taking longer than expected.
              This may indicate network issues or S3 throttling.
              Target duration: <600s
              Current duration: {{ $value }}s
              Action: Check network connectivity and S3 bucket metrics.
          labels:
            severity: warning
            component: storage
        # Alert: Database replication lagging
        - alert: HighReplicationLag
          expr: |
            pg_wal_insert_lsn_bytes - pg_replication_slot_restart_lsn_bytes > 1073741824
          for: 5m
          annotations:
            summary: "Replication lag: {{ $value | humanize1024 }}B"
            description: |
              Secondary database replica is lagging significantly behind primary.
              This impacts failover capability.
              Current lag: {{ $value | humanize1024 }}B (target: <100MB)
              Action: Check network between regions and replica pod status.
          labels:
            severity: warning
            component: database
            slo: rto
        # Alert: Backup restore test failure
        - alert: BackupRestoreTestFailed
          expr: |
            backup_restore_test_success == 0
          for: 10m
          annotations:
            summary: "Backup restore test failed"
            description: |
              Weekly automated backup restore test has failed.
              This indicates backups may not be recoverable.
              Action: Review test logs and manually verify backup integrity.
          labels:
            severity: critical
            component: backup
            slo: rto
        # Alert: Primary database down (failover trigger)
        - alert: PrimaryDatabaseDown
          expr: |
            up{job="postgresql-primary"} == 0
          for: 2m
          annotations:
            summary: "Primary database unreachable"
            description: |
              Primary PostgreSQL database is not responding to health checks.
              Failover to secondary may be required.
              Action: Check pod status with kubectl; consider automatic failover.
          labels:
            severity: critical
            component: database
            slo: rto
        # Alert: Secondary database replication stopped
        - alert: SecondaryReplicationDown
          expr: |
            pg_replication_slot_active == 0
          for: 5m
          annotations:
            summary: "Secondary replication connection lost"
            description: |
              Replication from primary to secondary database has stopped.
              Secondary will become stale and failover will risk data loss.
              Action: Check network connectivity and logs on both primary and secondary.
          labels:
            severity: warning
            component: database
            slo: rpo
        # Info: Backup statistics
        - alert: BackupStatsInfo
          expr: |
            increase(backup_job_total[24h]) > 0
          for: 1h
          annotations:
            summary: "Daily backup stats: {{ $value }} backups in last 24h"
            description: |
              Informational metric for backup statistics.
              Success rate and performance monitoring.
          labels:
            severity: info
            component: backup
    # Recording rules for aggregation
    - name: disaster-recovery-recording
      interval: 1m
      rules:
        # Average backup size over 7 days
        - record: backup:size:avg:7d
          expr: avg_over_time(backup_size_bytes[7d])
        # Backup success rate
        - record: backup:success:rate:24h
          expr: rate(backup_job_success_total[24h])
        # Maximum WAL lag
        - record: wal:lag:max:5m
          expr: max_over_time(wal_archive_lag_seconds[5m])
        # Average replication lag
        - record: replication:lag:avg:5m
          expr: avg(pg_wal_insert_lsn_bytes - pg_replication_slot_restart_lsn_bytes)