Files
vm-core/docs/VAULTMESH-MONITORING-STACK.md
2025-12-27 00:10:32 +00:00

20 KiB

VAULTMESH-MONITORING-STACK.md

Observability for the Civilization Ledger

You cannot govern what you cannot see.


1. Prometheus Configuration

# config/prometheus.yaml
# Server-wide settings: timing, Alertmanager wiring, and rule file discovery.
global:
  # Targets are scraped and rules evaluated on the same 15-second cadence.
  scrape_interval: 15s
  evaluation_interval: 15s

alerting:
  alertmanagers:
    - static_configs:
        # Single statically configured Alertmanager endpoint.
        - targets: ['alertmanager:9093']

rule_files:
  # Glob pattern: every *.yaml file in this directory is loaded as rules.
  - '/etc/prometheus/rules/*.yaml'

scrape_configs:
  # Portal metrics
  # Discovers pods in the vaultmesh namespace and keeps only those that are
  # labelled app.kubernetes.io/name=portal AND annotated
  # prometheus.io/scrape=true.
  - job_name: 'vaultmesh-portal'
    kubernetes_sd_configs:
      - role: pod
        namespaces:
          names:
            - vaultmesh
    relabel_configs:
      - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
        regex: portal
        action: keep
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
        regex: "true"
        action: keep
      # Rewrite the scrape address to <pod host>:<prometheus.io/port>.
      # source_labels are joined with ';', so the regex captures the host
      # part of the discovered address (dropping any discovered port) and the
      # annotated port. When the annotation is absent the regex does not
      # match and the discovered address is left unchanged.
      - source_labels:
          - __address__
          - __meta_kubernetes_pod_annotation_prometheus_io_port
        action: replace
        regex: '([^:]+)(?::\d+)?;(\d+)'
        replacement: '${1}:${2}'
        target_label: __address__

  # Guardian metrics
  # Discovers pods in the vaultmesh namespace and keeps only those labelled
  # app.kubernetes.io/name=guardian.
  - job_name: 'vaultmesh-guardian'
    kubernetes_sd_configs:
      - role: pod
        namespaces:
          names:
            - vaultmesh
    relabel_configs:
      - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
        regex: guardian
        action: keep
      # Rewrite the scrape address to <pod host>:<prometheus.io/port>.
      # source_labels are joined with ';', so the regex captures the host
      # part of the discovered address (dropping any discovered port) and the
      # annotated port. When the annotation is absent the regex does not
      # match and the discovered address is left unchanged.
      - source_labels:
          - __address__
          - __meta_kubernetes_pod_annotation_prometheus_io_port
        action: replace
        regex: '([^:]+)(?::\d+)?;(\d+)'
        replacement: '${1}:${2}'
        target_label: __address__

  # Oracle metrics
  # Discovers pods in the vaultmesh namespace and keeps only those labelled
  # app.kubernetes.io/name=oracle.
  - job_name: 'vaultmesh-oracle'
    kubernetes_sd_configs:
      - role: pod
        namespaces:
          names:
            - vaultmesh
    relabel_configs:
      - source_labels: [__meta_kubernetes_pod_label_app_kubernetes_io_name]
        regex: oracle
        action: keep
      # Rewrite the scrape address to <pod host>:<prometheus.io/port>.
      # source_labels are joined with ';', so the regex captures the host
      # part of the discovered address (dropping any discovered port) and the
      # annotated port. When the annotation is absent the regex does not
      # match and the discovered address is left unchanged.
      - source_labels:
          - __address__
          - __meta_kubernetes_pod_annotation_prometheus_io_port
        action: replace
        regex: '([^:]+)(?::\d+)?;(\d+)'
        replacement: '${1}:${2}'
        target_label: __address__

  # PostgreSQL metrics
  # Scraped through a statically addressed exporter endpoint.
  - job_name: postgres
    static_configs:
      - targets:
          - 'postgres-exporter:9187'

  # Redis metrics
  # Scraped through a statically addressed exporter endpoint.
  - job_name: redis
    static_configs:
      - targets:
          - 'redis-exporter:9121'

2. Alerting Rules

# config/prometheus/rules/vaultmesh-alerts.yaml
groups:
  # Receipt pipeline: write errors and throughput drift.
  - name: vaultmesh.receipts
    rules:
      # Critical once the 5m write-error rate has been above zero for 1m.
      - alert: ReceiptWriteFailure
        expr: rate(vaultmesh_receipt_write_errors_total[5m]) > 0
        for: 1m
        labels:
          severity: critical
          scroll: '{{ $labels.scroll }}'
        annotations:
          summary: 'Receipt write failures detected'
          description: '{{ $value }} receipt write errors in scroll {{ $labels.scroll }}'

      # Warning when the current 5m receipt rate differs from its own 1h
      # average (sampled every 5m) by more than two standard deviations,
      # sustained for 10m.
      - alert: ReceiptRateAnomaly
        expr: |
          abs(
            rate(vaultmesh_receipts_total[5m]) -
            avg_over_time(rate(vaultmesh_receipts_total[5m])[1h:5m])
          ) > 2 * stddev_over_time(rate(vaultmesh_receipts_total[5m])[1h:5m])
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: 'Unusual receipt rate detected'
          description: 'Receipt rate deviates significantly from baseline'

  # Guardian: anchor freshness, anchor failures, and ProofChain integrity.
  - name: vaultmesh.guardian
    rules:
      # Warning when the last-anchor timestamp is more than 7200 s (2 h) old
      # for 5m.
      - alert: AnchorDelayed
        expr: time() - vaultmesh_guardian_last_anchor_timestamp > 7200
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Guardian anchor delayed'
          description: 'Last anchor was {{ $value | humanizeDuration }} ago'

      # Critical when the last-anchor timestamp is more than 14400 s (4 h)
      # old for 5m.
      - alert: AnchorCriticallyDelayed
        expr: time() - vaultmesh_guardian_last_anchor_timestamp > 14400
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'Guardian anchor critically delayed'
          description: 'No anchor in over 4 hours'

      # Critical when the failure counter increased within the last hour,
      # held for 1m.
      - alert: AnchorFailure
        expr: increase(vaultmesh_guardian_anchor_failures_total[1h]) > 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: 'Guardian anchor failure'
          description: '{{ $value }} anchor failures in the last hour'

      # Critical when the divergence gauge reads 1 for 1m.
      - alert: ProofChainDivergence
        expr: vaultmesh_guardian_proofchain_divergence == 1
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: 'ProofChain divergence detected'
          description: 'Computed Merkle root differs from stored root'

  # Oracle: query latency, LLM error rate, and corpus presence.
  - name: vaultmesh.oracle
    rules:
      # Warning when the p95 of the query-duration histogram (5m rate)
      # exceeds 30 s for 5m.
      - alert: OracleHighLatency
        expr: histogram_quantile(0.95, rate(vaultmesh_oracle_query_duration_seconds_bucket[5m])) > 30
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Oracle query latency high'
          description: '95th percentile query latency is {{ $value | humanizeDuration }}'

      # Warning when the 5m LLM error rate exceeds 0.1 per second for 5m.
      - alert: OracleLLMErrors
        expr: rate(vaultmesh_oracle_llm_errors_total[5m]) > 0.1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Oracle LLM errors elevated'
          description: '{{ $value }} LLM errors per second'

      # Critical when the corpus document gauge reads zero for 1m.
      - alert: OracleCorpusEmpty
        expr: vaultmesh_oracle_corpus_documents_total == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: 'Oracle corpus is empty'
          description: 'No documents loaded in compliance corpus'

  # Mesh: per-node and per-route health.
  - name: vaultmesh.mesh
    rules:
      # Warning when a node's health gauge reads 0 for 5m.
      - alert: NodeUnhealthy
        expr: vaultmesh_mesh_node_healthy == 0
        for: 5m
        labels:
          severity: warning
          node: '{{ $labels.node_id }}'
        annotations:
          summary: 'Mesh node unhealthy'
          description: 'Node {{ $labels.node_id }} is unhealthy'

      # Critical when a node's last-seen timestamp is more than 600 s old
      # for 5m.
      - alert: NodeDown
        expr: time() - vaultmesh_mesh_node_last_seen_timestamp > 600
        for: 5m
        labels:
          severity: critical
          node: '{{ $labels.node_id }}'
        annotations:
          summary: 'Mesh node down'
          description: 'Node {{ $labels.node_id }} not seen for {{ $value | humanizeDuration }}'

      # Warning when a route's health gauge reads 0 for 5m.
      - alert: RouteUnhealthy
        expr: vaultmesh_mesh_route_healthy == 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Mesh route unhealthy'
          description: 'Route {{ $labels.route_id }} is unhealthy'

  # PSI: long-running phases and transmutations.
  - name: vaultmesh.psi
    rules:
      # Warning when the nigredo phase-duration gauge exceeds 86400 s (24 h)
      # and stays there for a further hour.
      - alert: PhaseProlongedNigredo
        expr: vaultmesh_psi_phase_duration_seconds{phase="nigredo"} > 86400
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: "System in Nigredo phase for extended period"
          description: "System has been in crisis phase for {{ $value | humanizeDuration }}"

      # Warning when a transmutation is still in progress and its start
      # timestamp is more than 86400 s (24 h) old, sustained for 1h.
      #
      # `and` matches series on their full label set. The left-hand side
      # carries status="in_progress", so `ignoring (status)` is required for
      # it to pair with a start-timestamp series that has no status label.
      # NOTE(review): assumes both metrics otherwise share the same labels
      # (e.g. transmutation_id) - confirm against the PSI exporter.
      - alert: TransmutationStalled
        expr: |
          vaultmesh_psi_transmutation_status{status="in_progress"} == 1
          and ignoring (status)
          time() - vaultmesh_psi_transmutation_started_timestamp > 86400
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: "Transmutation stalled"
          description: "Transmutation {{ $labels.transmutation_id }} in progress for over 24 hours"

  # Governance: violations and emergency state. Both rules use `for: 0m`,
  # so they fire on the first evaluation in which the expression holds.
  - name: vaultmesh.governance
    rules:
      # Critical when the violations counter increased within the last hour.
      - alert: ConstitutionalViolation
        expr: increase(vaultmesh_governance_violations_total[1h]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: 'Constitutional violation detected'
          description: '{{ $value }} violation(s) in the last hour'

      # Warning while the emergency gauge reads 1.
      - alert: EmergencyActive
        expr: vaultmesh_governance_emergency_active == 1
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: 'Governance emergency active'
          description: 'Emergency powers in effect'

  # Federation: witnessing failures and cross-mesh discrepancies.
  - name: vaultmesh.federation
    rules:
      # Warning when the witness-failure counter increased within the last
      # hour, held for 5m.
      - alert: FederationWitnessFailure
        expr: increase(vaultmesh_federation_witness_failures_total[1h]) > 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Federation witness failure'
          description: 'Failed to witness {{ $labels.remote_mesh }} receipts'

      # Critical as soon as the discrepancy gauge reads 1 (`for: 0m`).
      - alert: FederationDiscrepancy
        expr: vaultmesh_federation_discrepancy_detected == 1
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: 'Federation discrepancy detected'
          description: 'Discrepancy with {{ $labels.remote_mesh }}: {{ $labels.discrepancy_type }}'

3. Grafana Dashboards

3.1 Main Dashboard

{
  "dashboard": {
    "title": "VaultMesh Overview",
    "uid": "vaultmesh-overview",
    "tags": ["vaultmesh"],
    "timezone": "browser",
    "panels": [
      {
        "title": "System Status",
        "type": "stat",
        "gridPos": {"h": 4, "w": 6, "x": 0, "y": 0},
        "targets": [
          {
            "expr": "sum(up{job=~\"vaultmesh-.*\"})",
            "legendFormat": "Services Up"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "thresholds": {
              "steps": [
                {"color": "red", "value": 0},
                {"color": "yellow", "value": 2},
                {"color": "green", "value": 3}
              ]
            }
          }
        }
      },
      {
        "title": "Current Phase",
        "type": "stat",
        "gridPos": {"h": 4, "w": 6, "x": 6, "y": 0},
        "targets": [
          {
            "expr": "vaultmesh_psi_current_phase",
            "legendFormat": "Phase"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "mappings": [
              {"type": "value", "options": {"0": {"text": "NIGREDO", "color": "dark-purple"}}},
              {"type": "value", "options": {"1": {"text": "ALBEDO", "color": "white"}}},
              {"type": "value", "options": {"2": {"text": "CITRINITAS", "color": "yellow"}}},
              {"type": "value", "options": {"3": {"text": "RUBEDO", "color": "red"}}}
            ]
          }
        }
      },
      {
        "title": "Last Anchor Age",
        "type": "stat",
        "gridPos": {"h": 4, "w": 6, "x": 12, "y": 0},
        "targets": [
          {
            "expr": "time() - vaultmesh_guardian_last_anchor_timestamp",
            "legendFormat": "Age"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "s",
            "thresholds": {
              "steps": [
                {"color": "green", "value": 0},
                {"color": "yellow", "value": 3600},
                {"color": "red", "value": 7200}
              ]
            }
          }
        }
      },
      {
        "title": "Total Receipts",
        "type": "stat",
        "gridPos": {"h": 4, "w": 6, "x": 18, "y": 0},
        "targets": [
          {
            "expr": "sum(vaultmesh_receipts_total)",
            "legendFormat": "Receipts"
          }
        ]
      },
      {
        "title": "Receipt Rate by Scroll",
        "type": "timeseries",
        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 4},
        "targets": [
          {
            "expr": "sum by (scroll) (rate(vaultmesh_receipts_total[5m]))",
            "legendFormat": "{{ scroll }}"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "ops"
          }
        }
      },
      {
        "title": "Anchor History",
        "type": "timeseries",
        "gridPos": {"h": 8, "w": 12, "x": 12, "y": 4},
        "targets": [
          {
            "expr": "increase(vaultmesh_guardian_anchors_total[1h])",
            "legendFormat": "Successful Anchors"
          },
          {
            "expr": "sum(increase(vaultmesh_guardian_anchor_failures_total[1h]))",
            "legendFormat": "Failed Anchors"
          }
        ]
      },
      {
        "title": "Mesh Node Status",
        "type": "table",
        "gridPos": {"h": 6, "w": 12, "x": 0, "y": 12},
        "targets": [
          {
            "expr": "vaultmesh_mesh_node_healthy",
            "format": "table",
            "instant": true
          }
        ],
        "transformations": [
          {
            "id": "organize",
            "options": {
              "excludeByName": {"Time": true, "__name__": true},
              "renameByName": {"node_id": "Node", "Value": "Healthy"}
            }
          }
        ]
      },
      {
        "title": "Oracle Query Latency",
        "type": "timeseries",
        "gridPos": {"h": 6, "w": 12, "x": 12, "y": 12},
        "targets": [
          {
            "expr": "histogram_quantile(0.50, rate(vaultmesh_oracle_query_duration_seconds_bucket[5m]))",
            "legendFormat": "p50"
          },
          {
            "expr": "histogram_quantile(0.95, rate(vaultmesh_oracle_query_duration_seconds_bucket[5m]))",
            "legendFormat": "p95"
          },
          {
            "expr": "histogram_quantile(0.99, rate(vaultmesh_oracle_query_duration_seconds_bucket[5m]))",
            "legendFormat": "p99"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "s"
          }
        }
      }
    ]
  }
}

3.2 Guardian Dashboard

{
  "dashboard": {
    "title": "VaultMesh Guardian",
    "uid": "vaultmesh-guardian",
    "tags": ["vaultmesh", "guardian"],
    "panels": [
      {
        "title": "Anchor Status",
        "type": "stat",
        "gridPos": {"h": 4, "w": 8, "x": 0, "y": 0},
        "targets": [
          {
            "expr": "vaultmesh_guardian_anchor_status",
            "legendFormat": "Status"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "mappings": [
              {"type": "value", "options": {"0": {"text": "IDLE", "color": "blue"}}},
              {"type": "value", "options": {"1": {"text": "ANCHORING", "color": "yellow"}}},
              {"type": "value", "options": {"2": {"text": "SUCCESS", "color": "green"}}},
              {"type": "value", "options": {"3": {"text": "FAILED", "color": "red"}}}
            ]
          }
        }
      },
      {
        "title": "Receipts Since Last Anchor",
        "type": "stat",
        "gridPos": {"h": 4, "w": 8, "x": 8, "y": 0},
        "targets": [
          {
            "expr": "vaultmesh_guardian_receipts_since_anchor"
          }
        ]
      },
      {
        "title": "Anchor Epochs",
        "type": "stat",
        "gridPos": {"h": 4, "w": 8, "x": 16, "y": 0},
        "targets": [
          {
            "expr": "vaultmesh_guardian_anchor_epoch"
          }
        ]
      },
      {
        "title": "ProofChain Roots by Scroll",
        "type": "table",
        "gridPos": {"h": 8, "w": 24, "x": 0, "y": 4},
        "targets": [
          {
            "expr": "vaultmesh_guardian_proofchain_root_info",
            "format": "table",
            "instant": true
          }
        ]
      },
      {
        "title": "Anchor Duration",
        "type": "timeseries",
        "gridPos": {"h": 8, "w": 12, "x": 0, "y": 12},
        "targets": [
          {
            "expr": "histogram_quantile(0.95, rate(vaultmesh_guardian_anchor_duration_seconds_bucket[1h]))",
            "legendFormat": "p95"
          }
        ],
        "fieldConfig": {
          "defaults": {
            "unit": "s"
          }
        }
      },
      {
        "title": "Anchor Events",
        "type": "logs",
        "gridPos": {"h": 8, "w": 12, "x": 12, "y": 12},
        "datasource": "Loki",
        "targets": [
          {
            "expr": "{job=\"vaultmesh-guardian\"} |= \"anchor\""
          }
        ]
      }
    ]
  }
}

4. Metrics Endpoints

4.1 Portal Metrics

// vaultmesh-portal/src/metrics.rs

use prometheus::{
    Counter, CounterVec, Histogram, HistogramVec, Gauge, GaugeVec,
    Opts, Registry, labels,
};
use lazy_static::lazy_static;

// Portal metric definitions.
//
// Every collector is constructed with `.unwrap()`, so a constructor error
// (for example an invalid metric or label name) panics instead of being
// returned to the caller. Defining a static here does not register it;
// `register_metrics` adds each one to `REGISTRY`.
lazy_static! {
    // Registry created with `Registry::new()`, i.e. separate from any other
    // registry instance.
    pub static ref REGISTRY: Registry = Registry::new();

    // Receipt metrics
    // Counter family with labels `scroll` and `type`.
    pub static ref RECEIPTS_TOTAL: CounterVec = CounterVec::new(
        Opts::new("vaultmesh_receipts_total", "Total receipts by scroll"),
        &["scroll", "type"]
    ).unwrap();

    // Histogram family labelled by `scroll`; explicit bucket bounds run from
    // 0.001 to 1.0 (seconds, per the metric name).
    pub static ref RECEIPT_WRITE_DURATION: HistogramVec = HistogramVec::new(
        prometheus::HistogramOpts::new(
            "vaultmesh_receipt_write_duration_seconds",
            "Receipt write duration"
        ).buckets(vec![0.001, 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1.0]),
        &["scroll"]
    ).unwrap();

    // Counter family with labels `scroll` and `error_type`.
    pub static ref RECEIPT_WRITE_ERRORS: CounterVec = CounterVec::new(
        Opts::new("vaultmesh_receipt_write_errors_total", "Receipt write errors"),
        &["scroll", "error_type"]
    ).unwrap();

    // API metrics
    // Counter family with labels `method`, `path` and `status`.
    // NOTE(review): if callers pass raw request paths as `path`, the number
    // of series is unbounded - confirm a route template is used.
    pub static ref HTTP_REQUESTS_TOTAL: CounterVec = CounterVec::new(
        Opts::new("vaultmesh_http_requests_total", "Total HTTP requests"),
        &["method", "path", "status"]
    ).unwrap();

    // Histogram family with labels `method` and `path`; explicit bucket
    // bounds run from 0.01 to 10.0 (seconds, per the metric name).
    pub static ref HTTP_REQUEST_DURATION: HistogramVec = HistogramVec::new(
        prometheus::HistogramOpts::new(
            "vaultmesh_http_request_duration_seconds",
            "HTTP request duration"
        ).buckets(vec![0.01, 0.05, 0.1, 0.25, 0.5, 1.0, 2.5, 5.0, 10.0]),
        &["method", "path"]
    ).unwrap();

    // Connection metrics
    // Single unlabelled gauge.
    pub static ref ACTIVE_CONNECTIONS: Gauge = Gauge::new(
        "vaultmesh_active_connections",
        "Active connections"
    ).unwrap();

    // Gauge family labelled by `state`.
    // NOTE(review): the set of `state` values is chosen by callers and is not
    // visible here.
    pub static ref DB_POOL_SIZE: GaugeVec = GaugeVec::new(
        Opts::new("vaultmesh_db_pool_size", "Database pool size"),
        &["state"]
    ).unwrap();
}

pub fn register_metrics() {
    REGISTRY.register(Box::new(RECEIPTS_TOTAL.clone())).unwrap();
    REGISTRY.register(Box::new(RECEIPT_WRITE_DURATION.clone())).unwrap();
    REGISTRY.register(Box::new(RECEIPT_WRITE_ERRORS.clone())).unwrap();
    REGISTRY.register(Box::new(HTTP_REQUESTS_TOTAL.clone())).unwrap();
    REGISTRY.register(Box::new(HTTP_REQUEST_DURATION.clone())).unwrap();
    REGISTRY.register(Box::new(ACTIVE_CONNECTIONS.clone())).unwrap();
    REGISTRY.register(Box::new(DB_POOL_SIZE.clone())).unwrap();
}

4.2 Guardian Metrics

// vaultmesh-guardian/src/metrics.rs

use prometheus::{
    Counter, CounterVec, Histogram, Gauge, GaugeVec,
    Opts, Registry,
};
use lazy_static::lazy_static;

// Guardian metric definitions.
//
// Every collector is constructed with `.unwrap()`, so a constructor error
// (for example an invalid metric or label name) panics instead of being
// returned to the caller.
// NOTE(review): no call registering these collectors with `REGISTRY` is
// visible in this excerpt - confirm it happens elsewhere.
lazy_static! {
    // Registry created with `Registry::new()`, i.e. separate from any other
    // registry instance.
    pub static ref REGISTRY: Registry = Registry::new();

    // Anchor metrics
    // Single unlabelled counter.
    pub static ref ANCHORS_TOTAL: Counter = Counter::new(
        "vaultmesh_guardian_anchors_total",
        "Total successful anchors"
    ).unwrap();

    // Counter family labelled by `reason`, so each distinct reason is its
    // own time series.
    pub static ref ANCHOR_FAILURES_TOTAL: CounterVec = CounterVec::new(
        Opts::new("vaultmesh_guardian_anchor_failures_total", "Anchor failures by reason"),
        &["reason"]
    ).unwrap();

    // Unlabelled histogram; explicit bucket bounds run from 1.0 to 300.0
    // (seconds, per the metric name).
    pub static ref ANCHOR_DURATION: Histogram = Histogram::with_opts(
        prometheus::HistogramOpts::new(
            "vaultmesh_guardian_anchor_duration_seconds",
            "Anchor cycle duration"
        ).buckets(vec![1.0, 5.0, 10.0, 30.0, 60.0, 120.0, 300.0])
    ).unwrap();

    // Unlabelled gauge.
    // NOTE(review): presumably Unix epoch seconds, since queries elsewhere in
    // this document subtract it from time() - confirm at the call site.
    pub static ref LAST_ANCHOR_TIMESTAMP: Gauge = Gauge::new(
        "vaultmesh_guardian_last_anchor_timestamp",
        "Timestamp of last successful anchor"
    ).unwrap();

    // Unlabelled gauge.
    pub static ref ANCHOR_EPOCH: Gauge = Gauge::new(
        "vaultmesh_guardian_anchor_epoch",
        "Current anchor epoch number"
    ).unwrap();

    // Unlabelled gauge.
    pub static ref RECEIPTS_SINCE_ANCHOR: Gauge = Gauge::new(
        "vaultmesh_guardian_receipts_since_anchor",
        "Receipts added since last anchor"
    ).unwrap();

    // Unlabelled gauge; the numeric encoding is given in the help text
    // (0=idle, 1=anchoring, 2=success, 3=failed).
    pub static ref ANCHOR_STATUS: Gauge = Gauge::new(
        "vaultmesh_guardian_anchor_status",
        "Current anchor status (0=idle, 1=anchoring, 2=success, 3=failed)"
    ).unwrap();

    // ProofChain metrics
    // Gauge family with labels `scroll` and `root_hash`.
    // NOTE(review): every new `root_hash` value creates a new series -
    // confirm that series for superseded roots are removed by the caller.
    pub static ref PROOFCHAIN_ROOT_INFO: GaugeVec = GaugeVec::new(
        Opts::new("vaultmesh_guardian_proofchain_root_info", "ProofChain root information"),
        &["scroll", "root_hash"]
    ).unwrap();

    // Unlabelled gauge; the help text defines 0=no and 1=yes.
    pub static ref PROOFCHAIN_DIVERGENCE: Gauge = Gauge::new(
        "vaultmesh_guardian_proofchain_divergence",
        "ProofChain divergence detected (0=no, 1=yes)"
    ).unwrap();

    // Sentinel metrics
    // Counter family with labels `event_type` and `severity`.
    pub static ref SENTINEL_EVENTS: CounterVec = CounterVec::new(
        Opts::new("vaultmesh_guardian_sentinel_events_total", "Sentinel events"),
        &["event_type", "severity"]
    ).unwrap();
}