20 KiB
20 KiB
VAULTMESH-MONITORING-STACK.md
Observability for the Civilization Ledger
You cannot govern what you cannot see.
1. Prometheus Configuration
# config/prometheus.yaml
# Server-wide defaults, Alertmanager wiring and rule file locations.
global:
  # Default interval between scrapes of each target.
  scrape_interval: 15s
  # Default interval between evaluations of the alerting rules.
  evaluation_interval: 15s

alerting:
  alertmanagers:
    - static_configs:
        - targets: ['alertmanager:9093']

# Every YAML file in this directory is loaded as a rule file.
rule_files:
  - '/etc/prometheus/rules/*.yaml'
scrape_configs:
  # Portal metrics: pods in the vaultmesh namespace labelled
  # app.kubernetes.io/name=portal that opt in with prometheus.io/scrape="true".
  - job_name: 'vaultmesh-portal'
    kubernetes_sd_configs:
      - role: pod
        namespaces:
          names:
            - vaultmesh
    relabel_configs:
      - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
        regex: portal
        action: keep
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
        regex: "true"
        action: keep
      # Scrape <pod host>:<prometheus.io/port annotation>. The discovered
      # __address__ has to be one of the source labels: building the target
      # from the annotation alone yields "<port>:9090", which has no host.
      # Pods without the annotation do not match and keep their address.
      - source_labels:
          - __address__
          - __meta_kubernetes_pod_annotation_prometheus_io_port
        regex: '([^:]+)(?::\d+)?;(\d+)'
        replacement: '$1:$2'
        target_label: __address__

  # Guardian metrics.
  # NOTE(review): unlike the portal job there is no prometheus.io/scrape
  # filter here - confirm that is intentional.
  - job_name: 'vaultmesh-guardian'
    kubernetes_sd_configs:
      - role: pod
        namespaces:
          names:
            - vaultmesh
    relabel_configs:
      - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
        regex: guardian
        action: keep
      # Same host/port rewrite as the portal job.
      - source_labels:
          - __address__
          - __meta_kubernetes_pod_annotation_prometheus_io_port
        regex: '([^:]+)(?::\d+)?;(\d+)'
        replacement: '$1:$2'
        target_label: __address__

  # Oracle metrics.
  # NOTE(review): no prometheus.io/scrape filter here either - confirm.
  - job_name: 'vaultmesh-oracle'
    kubernetes_sd_configs:
      - role: pod
        namespaces:
          names:
            - vaultmesh
    relabel_configs:
      - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
        regex: oracle
        action: keep
      # Same host/port rewrite as the portal job.
      - source_labels:
          - __address__
          - __meta_kubernetes_pod_annotation_prometheus_io_port
        regex: '([^:]+)(?::\d+)?;(\d+)'
        replacement: '$1:$2'
        target_label: __address__

  # PostgreSQL metrics (static exporter target)
  - job_name: 'postgres'
    static_configs:
      - targets: ['postgres-exporter:9187']

  # Redis metrics (static exporter target)
  - job_name: 'redis'
    static_configs:
      - targets: ['redis-exporter:9121']
2. Alerting Rules
# config/prometheus/rules/vaultmesh-alerts.yaml
groups:
- name: vaultmesh.receipts
  rules:
    # Any non-zero receipt write error rate, sustained for one minute.
    - alert: ReceiptWriteFailure
      expr: rate(vaultmesh_receipt_write_errors_total[5m]) > 0
      for: 1m
      labels:
        severity: critical
        scroll: "{{ $labels.scroll }}"
      annotations:
        summary: "Receipt write failures detected"
        # $value is the result of rate(), i.e. errors per second, not a count.
        description: "{{ $value }} receipt write errors per second in scroll {{ $labels.scroll }}"

    # Current 5m receipt rate is more than two standard deviations away from
    # its own one-hour average (both computed via a [1h:5m] subquery).
    - alert: ReceiptRateAnomaly
      expr: |
        abs(
          rate(vaultmesh_receipts_total[5m]) -
          avg_over_time(rate(vaultmesh_receipts_total[5m])[1h:5m])
        ) > 2 * stddev_over_time(rate(vaultmesh_receipts_total[5m])[1h:5m])
      for: 10m
      labels:
        severity: warning
      annotations:
        summary: "Unusual receipt rate detected"
        description: "Receipt rate deviates significantly from baseline"
- name: vaultmesh.guardian
  rules:
    # Warning tier: last anchor timestamp is more than 7200 s behind now.
    - alert: AnchorDelayed
      expr: time() - vaultmesh_guardian_last_anchor_timestamp > 7200
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Guardian anchor delayed"
        description: "Last anchor was {{ $value | humanizeDuration }} ago"

    # Critical tier of the same check: more than 14400 s behind now.
    - alert: AnchorCriticallyDelayed
      expr: time() - vaultmesh_guardian_last_anchor_timestamp > 14400
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: "Guardian anchor critically delayed"
        description: "No anchor in over 4 hours"

    # The anchor failure counter increased within the last hour.
    - alert: AnchorFailure
      expr: increase(vaultmesh_guardian_anchor_failures_total[1h]) > 0
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: "Guardian anchor failure"
        description: "{{ $value }} anchor failures in the last hour"

    # The divergence gauge reads 1.
    - alert: ProofChainDivergence
      expr: vaultmesh_guardian_proofchain_divergence == 1
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: "ProofChain divergence detected"
        description: "Computed Merkle root differs from stored root"
- name: vaultmesh.oracle
  rules:
    # p95 of the query duration histogram over 5m exceeds 30 seconds.
    - alert: OracleHighLatency
      expr: histogram_quantile(0.95, rate(vaultmesh_oracle_query_duration_seconds_bucket[5m])) > 30
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Oracle query latency high"
        description: "95th percentile query latency is {{ $value | humanizeDuration }}"

    # LLM error rate above 0.1 per second for five minutes.
    - alert: OracleLLMErrors
      expr: rate(vaultmesh_oracle_llm_errors_total[5m]) > 0.1
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Oracle LLM errors elevated"
        description: "{{ $value }} LLM errors per second"

    # The corpus document gauge reads zero.
    - alert: OracleCorpusEmpty
      expr: vaultmesh_oracle_corpus_documents_total == 0
      for: 1m
      labels:
        severity: critical
      annotations:
        summary: "Oracle corpus is empty"
        description: "No documents loaded in compliance corpus"
- name: vaultmesh.mesh
  rules:
    # A node's health gauge has been 0 for five minutes.
    - alert: NodeUnhealthy
      expr: vaultmesh_mesh_node_healthy == 0
      for: 5m
      labels:
        severity: warning
        # Copies the series' node_id into an alert label named "node".
        node: "{{ $labels.node_id }}"
      annotations:
        summary: "Mesh node unhealthy"
        description: "Node {{ $labels.node_id }} is unhealthy"

    # A node's last-seen timestamp is more than 600 s behind now.
    - alert: NodeDown
      expr: time() - vaultmesh_mesh_node_last_seen_timestamp > 600
      for: 5m
      labels:
        severity: critical
        node: "{{ $labels.node_id }}"
      annotations:
        summary: "Mesh node down"
        description: "Node {{ $labels.node_id }} not seen for {{ $value | humanizeDuration }}"

    # A route's health gauge has been 0 for five minutes.
    - alert: RouteUnhealthy
      expr: vaultmesh_mesh_route_healthy == 0
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Mesh route unhealthy"
        description: "Route {{ $labels.route_id }} is unhealthy"
- name: vaultmesh.psi
  rules:
    # The nigredo phase duration gauge exceeds 86400 s, sustained for an hour.
    - alert: PhaseProlongedNigredo
      expr: vaultmesh_psi_phase_duration_seconds{phase="nigredo"} > 86400
      for: 1h
      labels:
        severity: warning
      annotations:
        summary: "System in Nigredo phase for extended period"
        description: "System has been in crisis phase for {{ $value | humanizeDuration }}"

    # A transmutation is flagged in_progress and started more than 86400 s ago.
    # `and` pairs series only when their label sets are identical. The left
    # side always carries status="in_progress", so `status` is excluded from
    # matching; otherwise a started_timestamp series without that label would
    # never pair and the alert could not fire.
    # NOTE(review): assumes both metrics otherwise share the same labels
    # (e.g. transmutation_id) - confirm against the exporter.
    - alert: TransmutationStalled
      expr: |
        vaultmesh_psi_transmutation_status{status="in_progress"} == 1
        and ignoring (status)
        (time() - vaultmesh_psi_transmutation_started_timestamp) > 86400
      for: 1h
      labels:
        severity: warning
      annotations:
        summary: "Transmutation stalled"
        description: "Transmutation {{ $labels.transmutation_id }} in progress for over 24 hours"
- name: vaultmesh.governance
  rules:
    # The violations counter increased within the last hour; fires without
    # a pending period (for: 0m).
    - alert: ConstitutionalViolation
      expr: increase(vaultmesh_governance_violations_total[1h]) > 0
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: "Constitutional violation detected"
        description: "{{ $value }} violation(s) in the last hour"

    # The emergency gauge reads 1; also fires without a pending period.
    - alert: EmergencyActive
      expr: vaultmesh_governance_emergency_active == 1
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: "Governance emergency active"
        description: "Emergency powers in effect"
- name: vaultmesh.federation
  rules:
    # The witness failure counter increased within the last hour.
    - alert: FederationWitnessFailure
      expr: increase(vaultmesh_federation_witness_failures_total[1h]) > 0
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Federation witness failure"
        description: "Failed to witness {{ $labels.remote_mesh }} receipts"

    # The discrepancy gauge reads 1; fires without a pending period.
    - alert: FederationDiscrepancy
      expr: vaultmesh_federation_discrepancy_detected == 1
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: "Federation discrepancy detected"
        description: "Discrepancy with {{ $labels.remote_mesh }}: {{ $labels.discrepancy_type }}"
3. Grafana Dashboards
3.1 Main Dashboard
{
  "dashboard": {
    "title": "VaultMesh Overview",
    "uid": "vaultmesh-overview",
    "tags": ["vaultmesh"],
    "timezone": "browser",
    "panels": [
      {
        "title": "System Status",
        "type": "stat",
        "gridPos": {"h": 4, "w": 6, "x": 0, "y": 0},
        "targets": [
          {
            "expr": "sum(up{job=~\"vaultmesh-.*\"})",
            "legendFormat": "Services Up"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "thresholds": {
              "steps": [
                {"color": "red", "value": 0},
                {"color": "yellow", "value": 2},
                {"color": "green", "value": 3}
              ]
            }
          }
        }
      },
      {
        "title": "Current Phase",
        "type": "stat",
        "gridPos": {"h": 4, "w": 6, "x": 6, "y": 0},
        "targets": [
          {
            "expr": "vaultmesh_psi_current_phase",
            "legendFormat": "Phase"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "mappings": [
              {"type": "value", "options": {"0": {"text": "NIGREDO", "color": "dark-purple"}}},
              {"type": "value", "options": {"1": {"text": "ALBEDO", "color": "white"}}},
              {"type": "value", "options": {"2": {"text": "CITRINITAS", "color": "yellow"}}},
              {"type": "value", "options": {"3": {"text": "RUBEDO", "color": "red"}}}
            ]
          }
        }
      },
      {
        "title": "Last Anchor Age",
        "type": "stat",
        "gridPos": {"h": 4, "w": 6, "x": 12, "y": 0},
        "targets": [
          {
            "expr": "time() - vaultmesh_guardian_last_anchor_timestamp",
            "legendFormat": "Age"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "s",
            "thresholds": {
              "steps": [
                {"color": "green", "value": 0},
                {"color": "yellow", "value": 3600},
                {"color": "red", "value": 7200}
              ]
            }
          }
        }
      },
      {
        "title": "Total Receipts",
        "type": "stat",
        "gridPos": {"h": 4, "w": 6, "x": 18, "y": 0},
        "targets": [
          {
            "expr": "sum(vaultmesh_receipts_total)",
            "legendFormat": "Receipts"
          }
        ]
      },
      {
        "title": "Receipt Rate by Scroll",
        "type": "timeseries",
        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 4},
        "targets": [
          {
            "expr": "sum by (scroll) (rate(vaultmesh_receipts_total[5m]))",
            "legendFormat": "{{ scroll }}"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "ops"
          }
        }
      },
      {
        "title": "Anchor History",
        "type": "timeseries",
        "gridPos": {"h": 8, "w": 12, "x": 12, "y": 4},
        "targets": [
          {
            "expr": "increase(vaultmesh_guardian_anchors_total[1h])",
            "legendFormat": "Successful Anchors"
          },
          {
            "expr": "sum(increase(vaultmesh_guardian_anchor_failures_total[1h]))",
            "legendFormat": "Failed Anchors"
          }
        ]
      },
      {
        "title": "Mesh Node Status",
        "type": "table",
        "gridPos": {"h": 6, "w": 12, "x": 0, "y": 12},
        "targets": [
          {
            "expr": "vaultmesh_mesh_node_healthy",
            "format": "table",
            "instant": true
          }
        ],
        "transformations": [
          {
            "id": "organize",
            "options": {
              "excludeByName": {"Time": true, "__name__": true},
              "renameByName": {"node_id": "Node", "Value": "Healthy"}
            }
          }
        ]
      },
      {
        "title": "Oracle Query Latency",
        "type": "timeseries",
        "gridPos": {"h": 6, "w": 12, "x": 12, "y": 12},
        "targets": [
          {
            "expr": "histogram_quantile(0.50, rate(vaultmesh_oracle_query_duration_seconds_bucket[5m]))",
            "legendFormat": "p50"
          },
          {
            "expr": "histogram_quantile(0.95, rate(vaultmesh_oracle_query_duration_seconds_bucket[5m]))",
            "legendFormat": "p95"
          },
          {
            "expr": "histogram_quantile(0.99, rate(vaultmesh_oracle_query_duration_seconds_bucket[5m]))",
            "legendFormat": "p99"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "s"
          }
        }
      }
    ]
  }
}
3.2 Guardian Dashboard
{
  "dashboard": {
    "title": "VaultMesh Guardian",
    "uid": "vaultmesh-guardian",
    "tags": [
      "vaultmesh",
      "guardian"
    ],
    "panels": [
      {
        "title": "Anchor Status",
        "type": "stat",
        "gridPos": {
          "h": 4,
          "w": 8,
          "x": 0,
          "y": 0
        },
        "targets": [
          {
            "expr": "vaultmesh_guardian_anchor_status",
            "legendFormat": "Status"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "mappings": [
              {
                "type": "value",
                "options": {
                  "0": {"text": "IDLE", "color": "blue"}
                }
              },
              {
                "type": "value",
                "options": {
                  "1": {"text": "ANCHORING", "color": "yellow"}
                }
              },
              {
                "type": "value",
                "options": {
                  "2": {"text": "SUCCESS", "color": "green"}
                }
              },
              {
                "type": "value",
                "options": {
                  "3": {"text": "FAILED", "color": "red"}
                }
              }
            ]
          }
        }
      },
      {
        "title": "Receipts Since Last Anchor",
        "type": "stat",
        "gridPos": {
          "h": 4,
          "w": 8,
          "x": 8,
          "y": 0
        },
        "targets": [
          {"expr": "vaultmesh_guardian_receipts_since_anchor"}
        ]
      },
      {
        "title": "Anchor Epochs",
        "type": "stat",
        "gridPos": {
          "h": 4,
          "w": 8,
          "x": 16,
          "y": 0
        },
        "targets": [
          {"expr": "vaultmesh_guardian_anchor_epoch"}
        ]
      },
      {
        "title": "ProofChain Roots by Scroll",
        "type": "table",
        "gridPos": {
          "h": 8,
          "w": 24,
          "x": 0,
          "y": 4
        },
        "targets": [
          {
            "expr": "vaultmesh_guardian_proofchain_root_info",
            "format": "table",
            "instant": true
          }
        ]
      },
      {
        "title": "Anchor Duration",
        "type": "timeseries",
        "gridPos": {
          "h": 8,
          "w": 12,
          "x": 0,
          "y": 12
        },
        "targets": [
          {
            "expr": "histogram_quantile(0.95, rate(vaultmesh_guardian_anchor_duration_seconds_bucket[1h]))",
            "legendFormat": "p95"
          }
        ],
        "fieldConfig": {
          "defaults": {"unit": "s"}
        }
      },
      {
        "title": "Anchor Events",
        "type": "logs",
        "gridPos": {
          "h": 8,
          "w": 12,
          "x": 12,
          "y": 12
        },
        "datasource": "Loki",
        "targets": [
          {"expr": "{job=\"vaultmesh-guardian\"} |= \"anchor\""}
        ]
      }
    ]
  }
}
4. Metrics Endpoints
4.1 Portal Metrics
// vaultmesh-portal/src/metrics.rs
use prometheus::{
Counter, CounterVec, Histogram, HistogramVec, Gauge, GaugeVec,
Opts, Registry, labels,
};
use lazy_static::lazy_static;
lazy_static! {
    // Registry the portal's collectors are attached to by `register_metrics`.
    pub static ref REGISTRY: Registry = Registry::new();

    // ---- Receipt metrics ----

    // Counter family keyed by `scroll` and `type`.
    pub static ref RECEIPTS_TOTAL: CounterVec = CounterVec::new(
        Opts::new("vaultmesh_receipts_total", "Total receipts by scroll"),
        &["scroll", "type"],
    )
    .unwrap();

    // Write-duration histogram per `scroll`; buckets span 0.001 to 1.0.
    pub static ref RECEIPT_WRITE_DURATION: HistogramVec = HistogramVec::new(
        prometheus::HistogramOpts::new(
            "vaultmesh_receipt_write_duration_seconds",
            "Receipt write duration",
        )
        .buckets(vec![0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0]),
        &["scroll"],
    )
    .unwrap();

    // Error counter family keyed by `scroll` and `error_type`.
    pub static ref RECEIPT_WRITE_ERRORS: CounterVec = CounterVec::new(
        Opts::new("vaultmesh_receipt_write_errors_total", "Receipt write errors"),
        &["scroll", "error_type"],
    )
    .unwrap();

    // ---- API metrics ----

    // Request counter keyed by `method`, `path` and `status`.
    // NOTE(review): `path` is a label - confirm callers pass a bounded set of values.
    pub static ref HTTP_REQUESTS_TOTAL: CounterVec = CounterVec::new(
        Opts::new("vaultmesh_http_requests_total", "Total HTTP requests"),
        &["method", "path", "status"],
    )
    .unwrap();

    // Request-duration histogram per `method`/`path`; buckets span 0.01 to 10.0.
    pub static ref HTTP_REQUEST_DURATION: HistogramVec = HistogramVec::new(
        prometheus::HistogramOpts::new(
            "vaultmesh_http_request_duration_seconds",
            "HTTP request duration",
        )
        .buckets(vec![0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0]),
        &["method", "path"],
    )
    .unwrap();

    // ---- Connection metrics ----

    // Single unlabelled gauge.
    pub static ref ACTIVE_CONNECTIONS: Gauge = Gauge::with_opts(
        Opts::new("vaultmesh_active_connections", "Active connections"),
    )
    .unwrap();

    // Gauge family keyed by `state`.
    pub static ref DB_POOL_SIZE: GaugeVec = GaugeVec::new(
        Opts::new("vaultmesh_db_pool_size", "Database pool size"),
        &["state"],
    )
    .unwrap();
}
pub fn register_metrics() {
REGISTRY.register(Box::new(RECEIPTS_TOTAL.clone())).unwrap();
REGISTRY.register(Box::new(RECEIPT_WRITE_DURATION.clone())).unwrap();
REGISTRY.register(Box::new(RECEIPT_WRITE_ERRORS.clone())).unwrap();
REGISTRY.register(Box::new(HTTP_REQUESTS_TOTAL.clone())).unwrap();
REGISTRY.register(Box::new(HTTP_REQUEST_DURATION.clone())).unwrap();
REGISTRY.register(Box::new(ACTIVE_CONNECTIONS.clone())).unwrap();
REGISTRY.register(Box::new(DB_POOL_SIZE.clone())).unwrap();
}
4.2 Guardian Metrics
// vaultmesh-guardian/src/metrics.rs
use prometheus::{
Counter, CounterVec, Histogram, Gauge, GaugeVec,
Opts, Registry,
};
use lazy_static::lazy_static;
lazy_static! {
    // Registry instance for the guardian's collectors.
    pub static ref REGISTRY: Registry = Registry::new();

    // ---- Anchor metrics ----

    // Unlabelled counter of successful anchors.
    pub static ref ANCHORS_TOTAL: Counter = Counter::with_opts(
        Opts::new("vaultmesh_guardian_anchors_total", "Total successful anchors"),
    )
    .unwrap();

    // Failure counter family keyed by `reason`.
    pub static ref ANCHOR_FAILURES_TOTAL: CounterVec = CounterVec::new(
        Opts::new("vaultmesh_guardian_anchor_failures_total", "Anchor failures by reason"),
        &["reason"],
    )
    .unwrap();

    // Unlabelled histogram; buckets span 1.0 to 300.0.
    pub static ref ANCHOR_DURATION: Histogram = Histogram::with_opts(
        prometheus::HistogramOpts::new(
            "vaultmesh_guardian_anchor_duration_seconds",
            "Anchor cycle duration",
        )
        .buckets(vec![1.0, 5.0, 10.0, 30.0, 60.0, 120.0, 300.0]),
    )
    .unwrap();

    pub static ref LAST_ANCHOR_TIMESTAMP: Gauge = Gauge::with_opts(
        Opts::new(
            "vaultmesh_guardian_last_anchor_timestamp",
            "Timestamp of last successful anchor",
        ),
    )
    .unwrap();

    pub static ref ANCHOR_EPOCH: Gauge = Gauge::with_opts(
        Opts::new("vaultmesh_guardian_anchor_epoch", "Current anchor epoch number"),
    )
    .unwrap();

    pub static ref RECEIPTS_SINCE_ANCHOR: Gauge = Gauge::with_opts(
        Opts::new(
            "vaultmesh_guardian_receipts_since_anchor",
            "Receipts added since last anchor",
        ),
    )
    .unwrap();

    // Numeric status code; the help text lists the meaning of each value.
    pub static ref ANCHOR_STATUS: Gauge = Gauge::with_opts(
        Opts::new(
            "vaultmesh_guardian_anchor_status",
            "Current anchor status (0=idle, 1=anchoring, 2=success, 3=failed)",
        ),
    )
    .unwrap();

    // ---- ProofChain metrics ----

    // Gauge family keyed by `scroll` and `root_hash`.
    // NOTE(review): `root_hash` is a label, so every distinct hash is its own
    // series - confirm outdated series are removed when a root changes.
    pub static ref PROOFCHAIN_ROOT_INFO: GaugeVec = GaugeVec::new(
        Opts::new("vaultmesh_guardian_proofchain_root_info", "ProofChain root information"),
        &["scroll", "root_hash"],
    )
    .unwrap();

    pub static ref PROOFCHAIN_DIVERGENCE: Gauge = Gauge::with_opts(
        Opts::new(
            "vaultmesh_guardian_proofchain_divergence",
            "ProofChain divergence detected (0=no, 1=yes)",
        ),
    )
    .unwrap();

    // ---- Sentinel metrics ----

    // Event counter family keyed by `event_type` and `severity`.
    pub static ref SENTINEL_EVENTS: CounterVec = CounterVec::new(
        Opts::new("vaultmesh_guardian_sentinel_events_total", "Sentinel events"),
        &["event_type", "severity"],
    )
    .unwrap();
}