Files
vm-core/docs/skill/OPERATIONS.md
2025-12-27 00:10:32 +00:00

11 KiB

VaultMesh Operations Guide

Daily Operations

Morning Health Check

#!/bin/bash
# scripts/morning-check.sh
#
# Runs the six daily health checks in order and prints each result under a
# numbered heading. Every check is attempted even when an earlier one fails.
# The script's exit status is non-zero if any check command exited non-zero,
# so a cron job or CI wrapper can alert on it (previously it always exited 0).

# Number of check commands that exited non-zero.
failures=0

# run_check TITLE COMMAND [ARGS...]
# Prints a blank line and TITLE, runs COMMAND with ARGS, and increments
# `failures` if the command exits non-zero. Always returns 0 so that the
# remaining checks still run.
run_check() {
  local title=$1
  shift
  printf '\n%s\n' "$title"
  "$@" || failures=$((failures + 1))
}

echo "=== VaultMesh Morning Health Check ==="
echo "Date: $(date -u +%Y-%m-%dT%H:%M:%SZ)"

# 1. System health
run_check "1. System Health" vm-cli system health

# 2. Guardian status
run_check "2. Guardian Status" vm-guardian anchor-status

# 3. Phase status
run_check "3. Current Phase" vm-psi phase current

# 4. Overnight receipts
run_check "4. Receipts (last 12h)" vm-cli receipts count --since 12h

# 5. Any violations
run_check "5. Governance Violations" \
  vm-gov violations list --since 24h --severity high,critical

# 6. Federation health
run_check "6. Federation Status" vm-federation health --all-peers

printf '\n=== Check Complete ===\n'

# Summary goes to stderr so the stdout report stays unchanged.
if (( failures > 0 )); then
  echo "${failures} check(s) failed" >&2
fi

# Keep this as the last command: its status becomes the script's exit status
# (0 when every check succeeded, 1 otherwise).
(( failures == 0 ))

Anchor Monitoring

# Check anchor status
vm-guardian anchor-status

# View anchor history
vm-guardian anchor-history --last 24h

# Trigger manual anchor if needed
vm-guardian anchor-now --wait

# Verify specific receipt
vm-guardian verify-receipt blake3:abc123... --scroll Compliance

Receipt Queries

# Count receipts by scroll
vm-cli receipts count --by-scroll

# Search receipts
vm-cli receipts search --scroll Drills --from 2025-12-01 --to 2025-12-06

# Export receipts
vm-cli receipts export --scroll Compliance --format csv --output compliance.csv

# Verify integrity
vm-guardian verify-all --scroll all

Common Tasks

Add New Node to Mesh

# 1. Create DID for new node
vm-identity did create --type node --id new-node-01

# 2. Issue node credential
vm-identity credential issue \
  --type VaultMeshNodeCredential \
  --subject did:vm:node:new-node-01 \
  --issuer did:vm:node:portal-01

# 3. Add to mesh
vm-mesh node add \
  --did did:vm:node:new-node-01 \
  --endpoint https://new-node-01.vaultmesh.io \
  --type infrastructure

# 4. Grant capabilities
vm-identity capability grant \
  --subject did:vm:node:new-node-01 \
  --capability storage,compute

# 5. Verify
vm-mesh node status new-node-01

Key Rotation Ceremony

# 1. Initiate ceremony
vm-identity key-rotate \
  --did did:vm:node:brick-01 \
  --ceremony-type standard

# 2. Generate new keypair (on target node)
vm-identity key-generate --algorithm ed25519

# 3. Witness signatures (from other nodes)
vm-identity key-witness \
  --ceremony ceremony-2025-12-001 \
  --witness did:vm:node:brick-02

# 4. Publish new key
vm-identity key-publish --ceremony ceremony-2025-12-001

# 5. Verify propagation
vm-identity did resolve did:vm:node:brick-01

Create Security Drill

# 1. Create drill from prompt
vm-drills create \
  --prompt "Detect and respond to ransomware encryption" \
  --severity high \
  --skills detection-defense-ir,kubernetes-security

# 2. Review generated contract
vm-drills show drill-2025-12-001

# 3. Start execution
vm-drills start drill-2025-12-001

# 4. Complete stages
vm-drills complete-stage drill-2025-12-001 stage-1 \
  --outputs cases/drills/drill-2025-12-001/stage-1/ \
  --findings "Identified encryption patterns"

# 5. Seal drill
vm-drills seal drill-2025-12-001

Initiate Transmutation

# 1. Start transmutation from incident
vm-psi transmute start \
  --input INC-2025-12-001 \
  --input-type security_incident \
  --title "SSH Brute Force to Detection"

# 2. Extract IOCs
vm-psi transmute step transmute-2025-12-001 extract

# 3. Dissolve to standard format
vm-psi transmute step transmute-2025-12-001 dissolve

# 4. Purify (validate)
vm-psi transmute step transmute-2025-12-001 purify

# 5. Coagulate (generate rules)
vm-psi transmute step transmute-2025-12-001 coagulate

# 6. Seal
vm-psi transmute seal transmute-2025-12-001

Troubleshooting

Anchor Failures

Symptom: vm-guardian anchor-status shows failures

Diagnosis:

# Check guardian logs
kubectl logs -n vaultmesh -l app.kubernetes.io/name=guardian --tail=100

# Check anchor backend connectivity
vm-guardian test-backend ethereum
vm-guardian test-backend ots

# Check pending receipts
vm-guardian pending-receipts

Common Causes:

  1. Network issues: Check Ethereum RPC connectivity
  2. Insufficient funds: Check anchor wallet balance
  3. Rate limiting: Check if backend is rate limiting
  4. Configuration: Verify anchor config

Resolution:

# Retry anchor
# The retry explicitly targets the OTS backend (--backend ots) and blocks
# until it finishes (--wait).
vm-guardian anchor-now --backend ots --wait

# If Ethereum issues, switch to OTS temporarily
# NOTE(review): "temporarily" implies setting anchor.primary back once
# Ethereum recovers; the previous value is not shown here, so record it
# before changing it.
vm-guardian config set anchor.primary ots

# Check and top up wallet
# NOTE(review): the unit of --amount is not stated (presumably ETH) — confirm
# before funding.
vm-guardian wallet balance
vm-guardian wallet fund --amount 0.1

Receipt Integrity Errors

Symptom: vm-guardian verify-all reports mismatches

Diagnosis:

# Identify affected scroll
vm-guardian verify-all --scroll all --verbose

# Check specific receipt
vm-guardian verify-receipt blake3:... --scroll Compliance --debug

# Compare computed vs stored root
vm-guardian compute-root --scroll Compliance
cat receipts/ROOT.compliance.txt

Common Causes:

  1. Corrupted JSONL: File system issues
  2. Incomplete write: Process interrupted
  3. Manual modification: Violation of AXIOM-001

Resolution:

# If corruption detected, restore from backup
vm-cli backup restore --backup-id backup-2025-12-05 --scroll Compliance

# Recompute root after restore
vm-guardian recompute-root --scroll Compliance

# Trigger anchor to seal restored state
vm-guardian anchor-now --scroll Compliance --wait

Node Connectivity Issues

Symptom: Node showing unhealthy in mesh

Diagnosis:

# Check node status
vm-mesh node status brick-02

# Test connectivity
vm-mesh ping brick-02

# Check routes
vm-mesh routes list --node brick-02

# Check node logs
kubectl logs -n vaultmesh pod/brick-02 --tail=100

Common Causes:

  1. Network partition: Firewall/network issues
  2. Resource exhaustion: Node overloaded
  3. Certificate expiry: TLS cert expired
  4. Process crash: Service died

Resolution:

# Restart node pod
kubectl rollout restart deployment/brick-02 -n vaultmesh

# If cert expired
vm-identity cert-renew --node brick-02

# If persistent issues, remove and re-add
# `https://...` is a placeholder — substitute the node's real endpoint URL.
# NOTE(review): the "Add New Node to Mesh" procedure in this guide also passes
# --type to `vm-mesh node add`; confirm whether it is required here too.
vm-mesh node remove brick-02 --force
vm-mesh node add --did did:vm:node:brick-02 --endpoint https://...

Oracle Query Failures

Symptom: Oracle returning errors

Diagnosis:

# Check oracle health
vm-oracle health

# Check LLM connectivity
vm-oracle test-llm anthropic
vm-oracle test-llm openai

# Check corpus status
vm-oracle corpus status

# Check logs
kubectl logs -n vaultmesh -l app.kubernetes.io/name=oracle --tail=100

Common Causes:

  1. LLM API issues: Rate limiting, key expiry
  2. Corpus empty: Documents not loaded
  3. Index corruption: Vector index issues
  4. Memory exhaustion: OOM conditions

Resolution:

# Rotate API key if expired
# Read the key from a silent prompt (bash/zsh `read -s`) instead of typing it
# into the command line, so the secret value is not saved in shell history.
printf 'New Anthropic API key: '
read -rs NEW_KEY
echo
# `create --dry-run=client -o yaml | kubectl apply` renders the manifest
# locally and applies it, so the same command is used whether or not the
# secret already exists.
# NOTE(review): the generated manifest contains only anthropic-key — confirm
# that any other keys stored in oracle-llm-credentials are not dropped.
kubectl create secret generic oracle-llm-credentials \
  --from-literal=anthropic-key="$NEW_KEY" \
  -n vaultmesh --dry-run=client -o yaml | kubectl apply -f -
# Do not leave the key in the interactive shell's environment.
unset NEW_KEY

# Reload corpus
vm-oracle corpus reload

# Rebuild index
vm-oracle corpus reindex

# Restart oracle
kubectl rollout restart deployment/vaultmesh-oracle -n vaultmesh

Phase Stuck in Nigredo

Symptom: System in Nigredo for extended period

Diagnosis:

# Check phase details
vm-psi phase current --verbose

# Check active incidents
vm-offsec incidents list --status open

# Check for blocking issues
vm-psi blockers

# Review phase history
vm-psi phase history --last 7d

Common Causes:

  1. Unresolved incident: Active security issue
  2. Failed transmutation: Stuck in process
  3. Missing witness: Transmutation waiting for signature
  4. Metric threshold: Health metrics below threshold

Resolution:

# Close incident if resolved
vm-offsec incident close INC-2025-12-001 \
  --resolution "Threat neutralized, systems restored"

# Complete stuck transmutation
vm-psi transmute force-complete transmute-2025-12-001

# Manual phase transition (requires justification)
vm-psi phase transition albedo \
  --reason "Incident resolved, metrics stable" \
  --evidence evidence-report.md

Constitutional Violation Detected

Symptom: gov_violation alert fired

Diagnosis:

# View violation details
vm-gov violations show VIOL-2025-12-001

# Check what was attempted
vm-gov violations evidence VIOL-2025-12-001

# Review enforcement action
vm-gov enforcement show ENF-2025-12-001

Common Causes:

  1. Agent misconfiguration: Automation tried unauthorized action
  2. Capability expiry: Token expired mid-operation
  3. Bug in engine: Logic error attempting violation
  4. Attack attempt: Malicious action blocked

Resolution:

# If false positive, dismiss
vm-gov violations review VIOL-2025-12-001 \
  --decision dismiss \
  --reason "False positive due to timing issue"

# If real, review and uphold enforcement
vm-gov enforcement review ENF-2025-12-001 --decision uphold

# Fix underlying issue
# (depends on specific violation)

Backup & Recovery

Scheduled Backups

# Full backup
vm-cli backup create --type full

# Incremental backup
vm-cli backup create --type incremental

# List backups
vm-cli backup list

# Verify backup integrity
vm-cli backup verify backup-2025-12-05

Recovery Procedures

# 1. Stop services
# --all scales every deployment in the vaultmesh namespace to zero, not only
# the three that step 4 brings back.
kubectl scale deployment -n vaultmesh --replicas=0 --all

# 2. Restore from backup
vm-cli backup restore --backup-id backup-2025-12-05

# 3. Verify integrity
vm-guardian verify-all --scroll all

# 4. Restart services
# Only portal, oracle and guardian are scaled back up here.
# NOTE(review): any other deployment in the namespace (this guide also
# restarts deployment/brick-02) stays at 0 replicas after step 1 — confirm
# whether those must be scaled up as well, and that 2/2/1 are the intended
# replica counts.
kubectl scale deployment -n vaultmesh --replicas=2 \
  vaultmesh-portal vaultmesh-oracle
kubectl scale deployment -n vaultmesh --replicas=1 vaultmesh-guardian

# 5. Trigger anchor to seal restored state
vm-guardian anchor-now --wait

Disaster Recovery

# Full rebuild from backup
./scripts/disaster-recovery.sh --backup backup-2025-12-05

# Verify federation peers
vm-federation verify-all

# Re-establish federation trust if needed
vm-federation re-establish --peer vaultmesh-berlin

Performance Tuning

Receipt Write Optimization

# config.toml
[receipts]
# Batch writes for better throughput
batch_size = 100
batch_timeout_ms = 100

# Compression
compression = "zstd"
compression_level = 3

# Index configuration
index_cache_size_mb = 512

Database Tuning

-- Vacuum and analyze
VACUUM ANALYZE receipts;

-- Check slow queries
-- Requires the pg_stat_statements extension. PostgreSQL 13+ names the column
-- mean_exec_time; on PostgreSQL 12 and earlier use mean_time instead.
SELECT query, calls, mean_exec_time
FROM pg_stat_statements
ORDER BY mean_exec_time DESC
LIMIT 10;

-- Index usage (ascending idx_scan, so rarely used indexes come first)
-- pg_stat_user_indexes exposes the table and index names as relname and
-- indexrelname; tablename/indexname belong to pg_indexes and do not exist here.
SELECT schemaname, relname, indexrelname, idx_scan
FROM pg_stat_user_indexes
ORDER BY idx_scan;

Memory Optimization

# Check memory usage
kubectl top pods -n vaultmesh

# Adjust limits if needed
kubectl patch deployment vaultmesh-oracle -n vaultmesh \
  -p '{"spec":{"template":{"spec":{"containers":[{"name":"oracle","resources":{"limits":{"memory":"8Gi"}}}]}}}}'

Monitoring Dashboards

Key Metrics to Watch

| Metric | Warning | Critical |
|--------|---------|----------|
| vaultmesh_guardian_last_anchor_age | > 2h | > 4h |
| vaultmesh_receipt_write_errors_total | > 0 | > 10/min |
| vaultmesh_mesh_node_unhealthy | any | multiple |
| vaultmesh_oracle_latency_p95 | > 30s | > 60s |
| vaultmesh_governance_violations | any | critical |
| vaultmesh_psi_phase | nigredo > 24h | nigredo > 72h |

Alert Response

# Acknowledge alert
vm-alerts ack ALERT-2025-12-001

# Silence alert (for maintenance)
vm-alerts silence --matcher 'alertname="AnchorDelayed"' --duration 2h

# View active alerts
vm-alerts list --active