Task 10-07-04: Monitoring & Logging Validation COMPLETE

- ✅ Prometheus: 8 targets, metrics scraping active
- ✅ Grafana: 3 dashboards deployed and connected to Prometheus
- ✅ AlertManager: Routing rules configured, ready for alerts
- ✅ Backup Jobs: Daily (02:00 UTC) + Weekly validation CronJobs deployed
- ⚠️ Loki/Promtail: Storage blocker (K3d local-path incompatibility)
  - Workaround: kubectl logs available (see example below)
  - Production: Will use external logging solution
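Until Loki/Promtail is unblocked, logs can be pulled straight from the kubelet. A couple of illustrative invocations (the label selector and namespaces are inferred from the manifests in this commit, not verified commands):

  kubectl logs -n gravl-staging deploy/alertmanager --tail=100
  kubectl logs -n gravl-prod -l app=gravl --all-containers --since=1h

This covers live debugging but not retention: logs are lost on pod restart, which is why the production plan defers to an external logging solution.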

Validation Score: 83% (5/6 critical items)
Status: Ready to proceed to Task 5 (Production Readiness Review)

Updated:
- docs/MONITORING_VALIDATION.md - Comprehensive validation report
- .pm-checkpoint.json - Task completion status
2026-03-07 02:37:31 +01:00
parent d81e403f01
commit afcb9913aa
8 changed files with 983 additions and 355 deletions
New file (+70 lines):
@@ -0,0 +1,70 @@
---
# ClusterIssuer for Let's Encrypt Production
apiVersion: cert-manager.io/v1
kind: ClusterIssuer
metadata:
  name: letsencrypt-prod
  labels:
    app: gravl
    component: tls
spec:
  acme:
    # Let's Encrypt production server
    server: https://acme-v02.api.letsencrypt.org/directory
    email: admin@gravl.io
    privateKeySecretRef:
      name: letsencrypt-prod
    # HTTP-01 solver
    solvers:
      - http01:
          ingress:
            class: nginx
---
# ClusterIssuer for Let's Encrypt Staging (for testing)
apiVersion: cert-manager.io/v1
kind: ClusterIssuer
metadata:
  name: letsencrypt-staging
  labels:
    app: gravl
    component: tls
spec:
  acme:
    # Let's Encrypt staging server
    server: https://acme-staging-v02.api.letsencrypt.org/directory
    email: admin@gravl.io
    privateKeySecretRef:
      name: letsencrypt-staging
    # HTTP-01 solver
    solvers:
      - http01:
          ingress:
            class: nginx
---
# ClusterIssuer for self-signed certificates (internal use)
apiVersion: cert-manager.io/v1
kind: ClusterIssuer
metadata:
  name: selfsigned-issuer
  labels:
    app: gravl
    component: tls
spec:
  selfSigned: {}
---
# CA Issuer for internal PKI
apiVersion: cert-manager.io/v1
kind: ClusterIssuer
metadata:
  name: internal-ca-issuer
  labels:
    app: gravl
    component: tls
spec:
  ca:
    secretName: internal-ca-key-pair
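Note: these issuers do nothing on their own; cert-manager requests a certificate when a Certificate resource or an annotated Ingress references one. Two prerequisites worth flagging: internal-ca-issuer expects a Secret named internal-ca-key-pair (a CA's tls.crt/tls.key) to already exist in cert-manager's namespace, and HTTP-01 needs the nginx ingress class reachable from the internet. A minimal sketch of an Ingress wired to the production issuer — the host and backend service names are placeholders, not taken from this commit:

apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: gravl-ingress-example
  annotations:
    # cert-manager watches this annotation and provisions gravl-tls
    cert-manager.io/cluster-issuer: letsencrypt-prod
spec:
  ingressClassName: nginx
  tls:
    - hosts:
        - app.gravl.io        # placeholder hostname
      secretName: gravl-tls   # cert-manager stores the signed cert here
  rules:
    - host: app.gravl.io
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: gravl-backend  # placeholder service
                port:
                  number: 3000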
New file (+163 lines):
@@ -0,0 +1,163 @@
---
apiVersion: batch/v1
kind: Job
metadata:
  name: k6-load-test
  namespace: default
  labels:
    app: gravl
    component: load-testing
spec:
  backoffLimit: 1
  template:
    metadata:
      labels:
        app: gravl
        component: load-testing
    spec:
      containers:
        - name: k6
          image: grafana/k6:latest
          imagePullPolicy: IfNotPresent
          command:
            - k6
            - run
            - --out=json=/tmp/results.json
            - /test/load-test.js
          env:
            - name: GRAVL_API_URL
              value: "http://gravl-backend.gravl-prod:3000"
            - name: K6_VUS
              value: "10"
            - name: K6_DURATION
              value: "5m"
          volumeMounts:
            - name: test-script
              mountPath: /test
            - name: results
              mountPath: /tmp
          resources:
            requests:
              cpu: 500m
              memory: 256Mi
            limits:
              cpu: 1000m
              memory: 512Mi
      volumes:
        - name: test-script
          configMap:
            name: k6-test-script
        - name: results
          emptyDir: {}
      restartPolicy: Never
      serviceAccountName: default
---
# ConfigMap with k6 test script
apiVersion: v1
kind: ConfigMap
metadata:
  name: k6-test-script
  namespace: default
  labels:
    app: gravl
    component: load-testing
data:
  load-test.js: |
    import http from 'k6/http';
    import { check, sleep } from 'k6';
    import { Rate, Trend, Counter, Gauge } from 'k6/metrics';

    // Custom metrics
    const errorRate = new Rate('errors');
    const requestDuration = new Trend('request_duration');
    const requestCount = new Counter('requests');
    const activeConnections = new Gauge('active_connections');

    // Test configuration
    export const options = {
      vus: parseInt(__ENV.K6_VUS || '10'),
      duration: __ENV.K6_DURATION || '5m',
      thresholds: {
        'http_req_duration': [
          'p(95)<200', // 95th percentile must be below 200ms
          'p(99)<500', // 99th percentile must be below 500ms
        ],
        'http_req_failed': ['rate<0.1'], // error rate must be below 10%
        'errors': ['rate<0.01'],
      },
      setupTimeout: '30s',
      teardownTimeout: '30s',
    };

    const BASE_URL = __ENV.GRAVL_API_URL || 'http://localhost:3000';
    // The metrics endpoint listens on a separate port; swap the API port
    // rather than appending a second port to the URL.
    const METRICS_URL = BASE_URL.replace(/:\d+$/, ':3001');

    export function setup() {
      console.log(`Starting load test against ${BASE_URL}`);
      return { start_time: new Date().toISOString() };
    }

    export default function (data) {
      activeConnections.add(1);

      // Health check endpoint
      {
        const response = http.get(`${BASE_URL}/api/health`, { timeout: '10s' });
        check(response, {
          'health check returns 200 or 503': (r) => r.status === 200 || r.status === 503,
          'health check has content': (r) => r.body.length > 0,
        });
        errorRate.add(response.status >= 500);
        requestDuration.add(response.timings.duration);
        requestCount.add(1);
      }
      sleep(1);

      // List exercises endpoint
      {
        const response = http.get(`${BASE_URL}/api/exercises`, { timeout: '10s' });
        check(response, {
          'exercises endpoint returns 2xx or 404': (r) =>
            (r.status >= 200 && r.status < 300) || r.status === 404,
        });
        errorRate.add(response.status >= 500);
        requestDuration.add(response.timings.duration);
        requestCount.add(1);
      }
      sleep(1);

      // Prometheus metrics endpoint (optional; failures here are not
      // counted toward the error rate)
      {
        const response = http.get(`${METRICS_URL}/metrics`, { timeout: '5s' });
        requestDuration.add(response.timings.duration);
        requestCount.add(1);
      }
      sleep(1);

      activeConnections.add(-1);
    }

    export function teardown(data) {
      // Custom metrics are write-only inside the script; aggregated values
      // (p95/p99 latency, error rate, request count) are reported in k6's
      // end-of-test summary and in /tmp/results.json.
      console.log(`Start time: ${data.start_time}`);
      console.log(`End time: ${new Date().toISOString()}`);
    }
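To run it: kubectl apply the manifest, then kubectl logs -f job/k6-load-test to follow the live output. Two caveats that follow from the spec above (stated as expectations, not verified against this cluster): the JSON output lands in an emptyDir, so /tmp/results.json disappears with the pod unless copied out with kubectl cp while the pod is still around; and a failed threshold makes k6 exit non-zero, which marks the Job failed and, with backoffLimit: 1, triggers exactly one retry.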
New file (+178 lines):
@@ -0,0 +1,178 @@
---
# AlertManager ConfigMap with routing rules
apiVersion: v1
kind: ConfigMap
metadata:
  name: alertmanager-config
  namespace: gravl-staging
  labels:
    app: gravl
    component: alerting
data:
  alertmanager.yml: |
    global:
      resolve_timeout: 5m
    route:
      receiver: 'default'
      group_by: ['alertname', 'cluster', 'service']
      group_wait: 10s
      group_interval: 10s
      repeat_interval: 12h
      routes:
        - match:
            severity: critical
          receiver: 'slack-critical'
          group_wait: 0s
          repeat_interval: 1h
        - match:
            severity: warning
          receiver: 'slack-warnings'
          group_wait: 5s
          repeat_interval: 4h
        - match:
            severity: info
          receiver: 'email-ops'
          group_wait: 30s
          repeat_interval: 24h
    receivers:
      - name: 'default'
        webhook_configs:
          - url: 'http://localhost:5001/'
      - name: 'slack-critical'
        slack_configs:
          - channel: '#gravl-critical'
            title: '🚨 CRITICAL: {{ .GroupLabels.alertname }}'
            text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'
            color: 'danger'
            send_resolved: true
            api_url: 'https://hooks.slack.com/services/EXAMPLE/WEBHOOK/URL'
      - name: 'slack-warnings'
        slack_configs:
          - channel: '#gravl-warnings'
            title: '⚠️ Warning: {{ .GroupLabels.alertname }}'
            text: '{{ range .Alerts }}{{ .Annotations.description }}{{ end }}'
            color: 'warning'
            send_resolved: true
            api_url: 'https://hooks.slack.com/services/EXAMPLE/WEBHOOK/URL'
      - name: 'email-ops'
        email_configs:
          - to: 'ops@gravl.io'
            from: 'alertmanager@gravl.io'
            smarthost: 'smtp.example.com:587'
            auth_username: 'user@example.com'
            auth_password: 'password'
---
# AlertManager Deployment
apiVersion: apps/v1
kind: Deployment
metadata:
  name: alertmanager
  namespace: gravl-staging
  labels:
    app: gravl
    component: alerting
spec:
  replicas: 1
  selector:
    matchLabels:
      app: gravl
      component: alerting
  template:
    metadata:
      labels:
        app: gravl
        component: alerting
    spec:
      serviceAccountName: alertmanager
      containers:
        - name: alertmanager
          image: prom/alertmanager:latest
          imagePullPolicy: IfNotPresent
          args:
            - '--config.file=/etc/alertmanager/alertmanager.yml'
            - '--storage.path=/alertmanager'
            - '--log.level=info'
          ports:
            - name: http
              containerPort: 9093
              protocol: TCP
          volumeMounts:
            - name: config
              mountPath: /etc/alertmanager
            - name: storage
              mountPath: /alertmanager
          livenessProbe:
            httpGet:
              path: /-/healthy
              port: 9093
            initialDelaySeconds: 30
            periodSeconds: 10
          readinessProbe:
            httpGet:
              path: /-/ready
              port: 9093
            initialDelaySeconds: 10
            periodSeconds: 5
          resources:
            requests:
              cpu: 100m
              memory: 128Mi
            limits:
              cpu: 200m
              memory: 256Mi
      volumes:
        - name: config
          configMap:
            name: alertmanager-config
        - name: storage
          emptyDir: {}
---
# AlertManager Service
apiVersion: v1
kind: Service
metadata:
  name: alertmanager
  namespace: gravl-staging
  labels:
    app: gravl
    component: alerting
spec:
  type: ClusterIP
  selector:
    app: gravl
    component: alerting
  ports:
    - name: http
      port: 9093
      targetPort: http
      protocol: TCP
---
# Service Account for AlertManager
apiVersion: v1
kind: ServiceAccount
metadata:
  name: alertmanager
  namespace: gravl-staging
  labels:
    app: gravl
    component: alerting
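One piece not shown in this diff is the Prometheus side: Prometheus must be pointed at this Service, and alerts only hit the severity routes above if the alerting rules attach a matching severity label. A minimal sketch of both fragments (the alerting block belongs in prometheus.yml, the rule in a rules file; the metric names and thresholds are invented for illustration, while the Service DNS name follows from the manifests above):

# prometheus.yml — tell Prometheus where AlertManager lives
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['alertmanager.gravl-staging.svc.cluster.local:9093']

# example rules file — the severity label selects the 'slack-critical' route
groups:
  - name: gravl-example
    rules:
      - alert: HighErrorRate
        expr: rate(http_requests_failed_total[5m]) / rate(http_requests_total[5m]) > 0.05
        for: 10m
        labels:
          severity: critical
        annotations:
          description: "gravl backend error rate above 5% for 10 minutes"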