# VAULTMESH-MONITORING-STACK.md

**Observability for the Civilization Ledger**

> *You cannot govern what you cannot see.*

---

## 1. Prometheus Configuration

```yaml
# config/prometheus.yaml
#
# Scrapes the three VaultMesh services via Kubernetes pod discovery and the
# PostgreSQL / Redis exporters via static targets.

global:
  scrape_interval: 15s
  evaluation_interval: 15s

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093

rule_files:
  - /etc/prometheus/rules/*.yaml

scrape_configs:
  # Portal metrics
  - job_name: 'vaultmesh-portal'
    kubernetes_sd_configs:
      - role: pod
        namespaces:
          names:
            - vaultmesh
    relabel_configs:
      # Keep only pods labelled app.kubernetes.io/name=portal.
      - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
        regex: portal
        action: keep
      # Keep only pods that opted in with prometheus.io/scrape: "true".
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
        regex: "true"
        action: keep
      # Rewrite the scrape address to <pod host>:<prometheus.io/port>.
      # Source labels are joined with ';', so the regex captures the host part
      # of the discovered address and the annotated port. Pods without the
      # annotation do not match and keep their discovered address.
      - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
        target_label: __address__
        regex: '([^:]+)(?::\d+)?;(\d+)'
        replacement: '${1}:${2}'

  # Guardian metrics
  - job_name: 'vaultmesh-guardian'
    kubernetes_sd_configs:
      - role: pod
        namespaces:
          names:
            - vaultmesh
    relabel_configs:
      # Keep only pods labelled app.kubernetes.io/name=guardian.
      - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
        regex: guardian
        action: keep
      # Rewrite the scrape address to <pod host>:<prometheus.io/port>.
      - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
        target_label: __address__
        regex: '([^:]+)(?::\d+)?;(\d+)'
        replacement: '${1}:${2}'

  # Oracle metrics
  - job_name: 'vaultmesh-oracle'
    kubernetes_sd_configs:
      - role: pod
        namespaces:
          names:
            - vaultmesh
    relabel_configs:
      # Keep only pods labelled app.kubernetes.io/name=oracle.
      - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
        regex: oracle
        action: keep
      # Rewrite the scrape address to <pod host>:<prometheus.io/port>.
      - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
        target_label: __address__
        regex: '([^:]+)(?::\d+)?;(\d+)'
        replacement: '${1}:${2}'

  # PostgreSQL metrics
  - job_name: 'postgres'
    static_configs:
      - targets: ['postgres-exporter:9187']

  # Redis metrics
  - job_name: 'redis'
    static_configs:
      - targets: ['redis-exporter:9121']
```

---

## 2.
Alerting Rules

```yaml
# config/prometheus/rules/vaultmesh-alerts.yaml
#
# Alert rules grouped by VaultMesh subsystem. Alerts inherit the labels of the
# series that triggered them; `severity` is added for routing.

groups:
  - name: vaultmesh.receipts
    rules:
      - alert: ReceiptWriteFailure
        expr: rate(vaultmesh_receipt_write_errors_total[5m]) > 0
        for: 1m
        labels:
          severity: critical
          scroll: "{{ $labels.scroll }}"
        annotations:
          summary: "Receipt write failures detected"
          # $value is a per-second rate here, not an absolute count.
          description: "{{ $value }} receipt write errors per second in scroll {{ $labels.scroll }}"

      - alert: ReceiptRateAnomaly
        # Fires when the current 5m rate is more than two standard deviations
        # away from its own 1h average (subquery sampled every 5m).
        expr: |
          abs(
            rate(vaultmesh_receipts_total[5m])
            - avg_over_time(rate(vaultmesh_receipts_total[5m])[1h:5m])
          ) > 2 * stddev_over_time(rate(vaultmesh_receipts_total[5m])[1h:5m])
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Unusual receipt rate detected"
          description: "Receipt rate deviates significantly from baseline"

  - name: vaultmesh.guardian
    rules:
      - alert: AnchorDelayed
        expr: time() - vaultmesh_guardian_last_anchor_timestamp > 7200
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Guardian anchor delayed"
          description: "Last anchor was {{ $value | humanizeDuration }} ago"

      - alert: AnchorCriticallyDelayed
        expr: time() - vaultmesh_guardian_last_anchor_timestamp > 14400
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Guardian anchor critically delayed"
          description: "No anchor in over 4 hours"

      - alert: AnchorFailure
        expr: increase(vaultmesh_guardian_anchor_failures_total[1h]) > 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Guardian anchor failure"
          description: "{{ $value }} anchor failures in the last hour"

      - alert: ProofChainDivergence
        expr: vaultmesh_guardian_proofchain_divergence == 1
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "ProofChain divergence detected"
          description: "Computed Merkle root differs from stored root"

  - name: vaultmesh.oracle
    rules:
      - alert: OracleHighLatency
        expr: histogram_quantile(0.95, rate(vaultmesh_oracle_query_duration_seconds_bucket[5m])) > 30
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Oracle query latency high"
          description: "95th percentile query latency is {{ $value | humanizeDuration }}"

      - alert: OracleLLMErrors
        expr: rate(vaultmesh_oracle_llm_errors_total[5m]) > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Oracle LLM errors elevated"
          description: "{{ $value }} LLM errors per second"

      - alert: OracleCorpusEmpty
        expr: vaultmesh_oracle_corpus_documents_total == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Oracle corpus is empty"
          description: "No documents loaded in compliance corpus"

  - name: vaultmesh.mesh
    rules:
      - alert: NodeUnhealthy
        expr: vaultmesh_mesh_node_healthy == 0
        for: 5m
        labels:
          severity: warning
          node: "{{ $labels.node_id }}"
        annotations:
          summary: "Mesh node unhealthy"
          description: "Node {{ $labels.node_id }} is unhealthy"

      - alert: NodeDown
        expr: time() - vaultmesh_mesh_node_last_seen_timestamp > 600
        for: 5m
        labels:
          severity: critical
          node: "{{ $labels.node_id }}"
        annotations:
          summary: "Mesh node down"
          description: "Node {{ $labels.node_id }} not seen for {{ $value | humanizeDuration }}"

      - alert: RouteUnhealthy
        expr: vaultmesh_mesh_route_healthy == 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Mesh route unhealthy"
          description: "Route {{ $labels.route_id }} is unhealthy"

  - name: vaultmesh.psi
    rules:
      - alert: PhaseProlongedNigredo
        expr: vaultmesh_psi_phase_duration_seconds{phase="nigredo"} > 86400
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: "System in Nigredo phase for extended period"
          description: "System has been in crisis phase for {{ $value | humanizeDuration }}"

      - alert: TransmutationStalled
        # A bare `and` only matches series whose label sets are identical.
        # The status series carries a `status` label that the timestamp series
        # presumably does not, so match explicitly on the transmutation id.
        # NOTE(review): assumes both metrics expose `transmutation_id` - confirm.
        expr: |
          vaultmesh_psi_transmutation_status{status="in_progress"} == 1
          and on (transmutation_id)
          time() - vaultmesh_psi_transmutation_started_timestamp > 86400
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: "Transmutation stalled"
          description: "Transmutation {{ $labels.transmutation_id }} in progress for over 24 hours"

  - name: vaultmesh.governance
    rules:
      - alert: ConstitutionalViolation
        expr: increase(vaultmesh_governance_violations_total[1h]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "Constitutional violation detected"
          description: "{{ $value }} violation(s) in the last hour"

      - alert: EmergencyActive
        expr: vaultmesh_governance_emergency_active == 1
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: "Governance emergency active"
          description: "Emergency powers in effect"

  - name: vaultmesh.federation
    rules:
      - alert: FederationWitnessFailure
        expr: increase(vaultmesh_federation_witness_failures_total[1h]) > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Federation witness failure"
          description: "Failed to witness {{ $labels.remote_mesh }} receipts"

      - alert: FederationDiscrepancy
        expr: vaultmesh_federation_discrepancy_detected == 1
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "Federation discrepancy detected"
          description: "Discrepancy with {{ $labels.remote_mesh }}: {{ $labels.discrepancy_type }}"
```

---

## 3. Grafana Dashboards

### 3.1 Main Dashboard

```json
{
  "dashboard": {
    "title": "VaultMesh Overview",
    "uid": "vaultmesh-overview",
    "tags": ["vaultmesh"],
    "timezone": "browser",
    "panels": [
      {
        "title": "System Status",
        "type": "stat",
        "gridPos": {"h": 4, "w": 6, "x": 0, "y": 0},
        "targets": [
          {
            "expr": "sum(up{job=~\"vaultmesh-.*\"})",
            "legendFormat": "Services Up"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "thresholds": {
              "steps": [
                {"color": "red", "value": 0},
                {"color": "yellow", "value": 2},
                {"color": "green", "value": 3}
              ]
            }
          }
        }
      },
      {
        "title": "Current Phase",
        "type": "stat",
        "gridPos": {"h": 4, "w": 6, "x": 6, "y": 0},
        "targets": [
          {
            "expr": "vaultmesh_psi_current_phase",
            "legendFormat": "Phase"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "mappings": [
              {"type": "value", "options": {"0": {"text": "NIGREDO", "color": "dark-purple"}}},
              {"type": "value", "options": {"1": {"text": "ALBEDO", "color": "white"}}},
              {"type": "value", "options": {"2": {"text": "CITRINITAS", "color": "yellow"}}},
              {"type": "value", "options": {"3": {"text": "RUBEDO", "color": "red"}}}
            ]
          }
        }
      },
      {
        "title": "Last Anchor Age",
        "type": "stat",
        "gridPos": {"h": 4, "w": 6, "x": 12, "y": 0},
        "targets": [
          {
            "expr": "time() - vaultmesh_guardian_last_anchor_timestamp",
            "legendFormat": "Age"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "s",
            "thresholds": {
              "steps": [
                {"color": "green", "value": 0},
                {"color": "yellow", "value": 3600},
                {"color": "red", "value": 7200}
              ]
            }
          }
        }
      },
      {
        "title": "Total Receipts",
        "type": "stat",
        "gridPos": {"h": 4, "w": 6, "x": 18, "y": 0},
        "targets": [
          {
            "expr": "sum(vaultmesh_receipts_total)",
            "legendFormat": "Receipts"
          }
        ]
      },
      {
        "title": "Receipt Rate by Scroll",
        "type": "timeseries",
        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 4},
        "targets": [
          {
            "expr": "sum by (scroll) (rate(vaultmesh_receipts_total[5m]))",
            "legendFormat": "{{ scroll }}"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "ops"
          }
        }
      },
      {
        "title": "Anchor History",
        "type": "timeseries",
        "gridPos": {"h": 8, "w": 12, "x": 12, "y": 4},
        "targets": [
          {
            "expr": "increase(vaultmesh_guardian_anchors_total[1h])",
            "legendFormat": "Successful Anchors"
          },
          {
            "expr": "increase(vaultmesh_guardian_anchor_failures_total[1h])",
            "legendFormat": "Failed Anchors"
          }
        ]
      },
      {
        "title": "Mesh Node Status",
        "type": "table",
        "gridPos": {"h": 6, "w": 12, "x": 0, "y": 12},
        "targets": [
          {
            "expr": "vaultmesh_mesh_node_healthy",
            "format": "table",
            "instant": true
          }
        ],
        "transformations": [
          {
            "id": "organize",
            "options": {
              "excludeByName": {"Time": true, "__name__": true},
              "renameByName": {"node_id": "Node", "Value": "Healthy"}
            }
          }
        ]
      },
      {
        "title": "Oracle Query Latency",
        "type": "timeseries",
        "gridPos": {"h": 6, "w": 12, "x": 12, "y": 12},
        "targets": [
          {
            "expr": "histogram_quantile(0.50, rate(vaultmesh_oracle_query_duration_seconds_bucket[5m]))",
            "legendFormat": "p50"
          },
          {
            "expr": "histogram_quantile(0.95, rate(vaultmesh_oracle_query_duration_seconds_bucket[5m]))",
            "legendFormat": "p95"
          },
          {
            "expr": "histogram_quantile(0.99, rate(vaultmesh_oracle_query_duration_seconds_bucket[5m]))",
            "legendFormat": "p99"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "s"
          }
        }
      }
    ]
  }
}
```

### 3.2 Guardian Dashboard

```json
{
  "dashboard": {
    "title": "VaultMesh Guardian",
    "uid": "vaultmesh-guardian",
    "tags": ["vaultmesh", "guardian"],
    "panels": [
      {
        "title": "Anchor Status",
        "type": "stat",
        "gridPos": {"h": 4, "w": 8, "x": 0, "y": 0},
        "targets": [
          {
            "expr": "vaultmesh_guardian_anchor_status",
            "legendFormat": "Status"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "mappings": [
              {"type": "value", "options": {"0": {"text": "IDLE", "color": "blue"}}},
              {"type": "value", "options": {"1": {"text": "ANCHORING", "color": "yellow"}}},
              {"type": "value", "options": {"2": {"text": "SUCCESS", "color": "green"}}},
              {"type": "value", "options": {"3": {"text": "FAILED", "color": "red"}}}
            ]
          }
        }
      },
      {
        "title": "Receipts Since Last Anchor",
        "type": "stat",
        "gridPos": {"h": 4, "w": 8, "x": 8, "y": 0},
        "targets": [
          {
            "expr": "vaultmesh_guardian_receipts_since_anchor"
          }
        ]
      },
      {
        "title": "Anchor Epochs",
        "type": "stat",
        "gridPos": {"h": 4, "w": 8, "x": 16, "y": 0},
        "targets": [
          {
            "expr": "vaultmesh_guardian_anchor_epoch"
          }
        ]
      },
      {
        "title": "ProofChain Roots by Scroll",
        "type": "table",
        "gridPos": {"h": 8, "w": 24, "x": 0, "y": 4},
        "targets": [
          {
            "expr": "vaultmesh_guardian_proofchain_root_info",
            "format": "table",
            "instant": true
          }
        ]
      },
      {
        "title": "Anchor Duration",
        "type": "timeseries",
        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 12},
        "targets": [
          {
            "expr": "histogram_quantile(0.95, rate(vaultmesh_guardian_anchor_duration_seconds_bucket[1h]))",
            "legendFormat": "p95"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "s"
          }
        }
      },
      {
        "title": "Anchor Events",
        "type": "logs",
        "gridPos": {"h": 8, "w": 12, "x": 12, "y": 12},
        "datasource": "Loki",
        "targets": [
          {
            "expr": "{job=\"vaultmesh-guardian\"} |= \"anchor\""
          }
        ]
      }
    ]
  }
}
```

---

## 4. Metrics Endpoints

### 4.1 Portal Metrics

```rust
// vaultmesh-portal/src/metrics.rs

use prometheus::{
    Counter, CounterVec, Histogram, HistogramVec, Gauge, GaugeVec,
    Opts, Registry, labels,
};
use lazy_static::lazy_static;

lazy_static!
{
    // Process-wide registry that every portal collector below is added to
    // by `register_metrics()`.
    pub static ref REGISTRY: Registry = Registry::new();

    // Receipt metrics
    pub static ref RECEIPTS_TOTAL: CounterVec = CounterVec::new(
        Opts::new("vaultmesh_receipts_total", "Total receipts by scroll"),
        &["scroll", "type"]
    ).unwrap();

    pub static ref RECEIPT_WRITE_DURATION: HistogramVec = HistogramVec::new(
        prometheus::HistogramOpts::new(
            "vaultmesh_receipt_write_duration_seconds",
            "Receipt write duration"
        ).buckets(vec![0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0]),
        &["scroll"]
    ).unwrap();

    pub static ref RECEIPT_WRITE_ERRORS: CounterVec = CounterVec::new(
        Opts::new("vaultmesh_receipt_write_errors_total", "Receipt write errors"),
        &["scroll", "error_type"]
    ).unwrap();

    // API metrics
    pub static ref HTTP_REQUESTS_TOTAL: CounterVec = CounterVec::new(
        Opts::new("vaultmesh_http_requests_total", "Total HTTP requests"),
        &["method", "path", "status"]
    ).unwrap();

    pub static ref HTTP_REQUEST_DURATION: HistogramVec = HistogramVec::new(
        prometheus::HistogramOpts::new(
            "vaultmesh_http_request_duration_seconds",
            "HTTP request duration"
        ).buckets(vec![0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0]),
        &["method", "path"]
    ).unwrap();

    // Connection metrics
    pub static ref ACTIVE_CONNECTIONS: Gauge = Gauge::new(
        "vaultmesh_active_connections",
        "Active connections"
    ).unwrap();

    pub static ref DB_POOL_SIZE: GaugeVec = GaugeVec::new(
        Opts::new("vaultmesh_db_pool_size", "Database pool size"),
        &["state"]
    ).unwrap();
}

/// Registers every portal collector with `REGISTRY`.
///
/// Each registration is unwrapped, so this panics if `Registry::register`
/// returns an error. Call it once during startup.
pub fn register_metrics() {
    REGISTRY.register(Box::new(RECEIPTS_TOTAL.clone())).unwrap();
    REGISTRY.register(Box::new(RECEIPT_WRITE_DURATION.clone())).unwrap();
    REGISTRY.register(Box::new(RECEIPT_WRITE_ERRORS.clone())).unwrap();
    REGISTRY.register(Box::new(HTTP_REQUESTS_TOTAL.clone())).unwrap();
    REGISTRY.register(Box::new(HTTP_REQUEST_DURATION.clone())).unwrap();
    REGISTRY.register(Box::new(ACTIVE_CONNECTIONS.clone())).unwrap();
    REGISTRY.register(Box::new(DB_POOL_SIZE.clone())).unwrap();
}
```

### 4.2 Guardian Metrics

```rust
// vaultmesh-guardian/src/metrics.rs

use prometheus::{
    Counter, CounterVec, Histogram, Gauge, GaugeVec,
    Opts, Registry,
};
use lazy_static::lazy_static;

lazy_static! {
    // Process-wide registry that every guardian collector below is added to
    // by `register_metrics()`.
    pub static ref REGISTRY: Registry = Registry::new();

    // Anchor metrics
    pub static ref ANCHORS_TOTAL: Counter = Counter::new(
        "vaultmesh_guardian_anchors_total",
        "Total successful anchors"
    ).unwrap();

    pub static ref ANCHOR_FAILURES_TOTAL: CounterVec = CounterVec::new(
        Opts::new("vaultmesh_guardian_anchor_failures_total", "Anchor failures by reason"),
        &["reason"]
    ).unwrap();

    pub static ref ANCHOR_DURATION: Histogram = Histogram::with_opts(
        prometheus::HistogramOpts::new(
            "vaultmesh_guardian_anchor_duration_seconds",
            "Anchor cycle duration"
        ).buckets(vec![1.0, 5.0, 10.0, 30.0, 60.0, 120.0, 300.0])
    ).unwrap();

    pub static ref LAST_ANCHOR_TIMESTAMP: Gauge = Gauge::new(
        "vaultmesh_guardian_last_anchor_timestamp",
        "Timestamp of last successful anchor"
    ).unwrap();

    pub static ref ANCHOR_EPOCH: Gauge = Gauge::new(
        "vaultmesh_guardian_anchor_epoch",
        "Current anchor epoch number"
    ).unwrap();

    pub static ref RECEIPTS_SINCE_ANCHOR: Gauge = Gauge::new(
        "vaultmesh_guardian_receipts_since_anchor",
        "Receipts added since last anchor"
    ).unwrap();

    pub static ref ANCHOR_STATUS: Gauge = Gauge::new(
        "vaultmesh_guardian_anchor_status",
        "Current anchor status (0=idle, 1=anchoring, 2=success, 3=failed)"
    ).unwrap();

    // ProofChain metrics
    // NOTE(review): `root_hash` is a label, so every new root creates a new
    // series; presumably the previous root's series should be removed when
    // the root changes - confirm against the code that sets this gauge.
    pub static ref PROOFCHAIN_ROOT_INFO: GaugeVec = GaugeVec::new(
        Opts::new("vaultmesh_guardian_proofchain_root_info", "ProofChain root information"),
        &["scroll", "root_hash"]
    ).unwrap();

    pub static ref PROOFCHAIN_DIVERGENCE: Gauge = Gauge::new(
        "vaultmesh_guardian_proofchain_divergence",
        "ProofChain divergence detected (0=no, 1=yes)"
    ).unwrap();

    // Sentinel metrics
    pub static ref SENTINEL_EVENTS: CounterVec = CounterVec::new(
        Opts::new("vaultmesh_guardian_sentinel_events_total", "Sentinel events"),
        &["event_type", "severity"]
    ).unwrap();
}

/// Registers every guardian collector with `REGISTRY`, mirroring the portal's
/// `register_metrics()`.
///
/// Each registration is unwrapped, so this panics if `Registry::register`
/// returns an error. Call it once during startup.
pub fn register_metrics() {
    REGISTRY.register(Box::new(ANCHORS_TOTAL.clone())).unwrap();
    REGISTRY.register(Box::new(ANCHOR_FAILURES_TOTAL.clone())).unwrap();
    REGISTRY.register(Box::new(ANCHOR_DURATION.clone())).unwrap();
    REGISTRY.register(Box::new(LAST_ANCHOR_TIMESTAMP.clone())).unwrap();
    REGISTRY.register(Box::new(ANCHOR_EPOCH.clone())).unwrap();
    REGISTRY.register(Box::new(RECEIPTS_SINCE_ANCHOR.clone())).unwrap();
    REGISTRY.register(Box::new(ANCHOR_STATUS.clone())).unwrap();
    REGISTRY.register(Box::new(PROOFCHAIN_ROOT_INFO.clone())).unwrap();
    REGISTRY.register(Box::new(PROOFCHAIN_DIVERGENCE.clone())).unwrap();
    REGISTRY.register(Box::new(SENTINEL_EVENTS.clone())).unwrap();
}
```