11 KiB
11 KiB
VaultMesh Operations Guide
Daily Operations
Morning Health Check
#!/bin/bash
# scripts/morning-check.sh
#
# Runs the daily VaultMesh health checks one after another and prints each
# section's output. Every check runs even when an earlier one fails; the
# script's exit status is non-zero if at least one check command failed, so
# cron/CI wrappers can alert on it.
#
# Deliberately no `set -e`: a failing check must not stop the later checks.

#######################################
# Print a section heading, run one check command, and record a failure.
# Globals:   failed_checks (heading appended when the command fails)
# Arguments: $1 - section heading; remaining arguments - command to run
# Outputs:   heading and command output on stdout; failure note on stderr
# Returns:   0 always, so the caller carries on with the next check
#######################################
run_check() {
  local heading=$1
  shift
  printf '\n%s\n' "$heading"
  local status=0
  "$@" || status=$?
  if (( status != 0 )); then
    failed_checks+=("$heading")
    printf 'FAILED (exit %d): %s\n' "$status" "$*" >&2
  fi
  return 0
}

#######################################
# Run all morning checks in order.
# Globals:   failed_checks (reset, then written via run_check)
# Outputs:   report on stdout; failure summary on stderr
# Returns:   0 if every check command succeeded, 1 otherwise
#######################################
main() {
  failed_checks=()

  printf '%s\n' "=== VaultMesh Morning Health Check ==="
  printf 'Date: %s\n' "$(date -u +%Y-%m-%dT%H:%M:%SZ)"

  run_check "1. System Health" vm-cli system health
  run_check "2. Guardian Status" vm-guardian anchor-status
  run_check "3. Current Phase" vm-psi phase current
  run_check "4. Receipts (last 12h)" vm-cli receipts count --since 12h
  run_check "5. Governance Violations" \
    vm-gov violations list --since 24h --severity high,critical
  run_check "6. Federation Status" vm-federation health --all-peers

  printf '\n%s\n' "=== Check Complete ==="

  if (( ${#failed_checks[@]} > 0 )); then
    printf '%d check(s) failed:\n' "${#failed_checks[@]}" >&2
    printf '  - %s\n' "${failed_checks[@]}" >&2
    return 1
  fi
  return 0
}

main "$@"
Anchor Monitoring
# Check anchor status
vm-guardian anchor-status
# View anchor history
vm-guardian anchor-history --last 24h
# Trigger manual anchor if needed
vm-guardian anchor-now --wait
# Verify specific receipt
vm-guardian verify-receipt blake3:abc123... --scroll Compliance
Receipt Queries
# Count receipts by scroll
vm-cli receipts count --by-scroll
# Search receipts
vm-cli receipts search --scroll Drills --from 2025-12-01 --to 2025-12-06
# Export receipts
vm-cli receipts export --scroll Compliance --format csv --output compliance.csv
# Verify integrity
vm-guardian verify-all --scroll all
Common Tasks
Add New Node to Mesh
# 1. Create DID for new node
vm-identity did create --type node --id new-node-01
# 2. Issue node credential
vm-identity credential issue \
--type VaultMeshNodeCredential \
--subject did:vm:node:new-node-01 \
--issuer did:vm:node:portal-01
# 3. Add to mesh
vm-mesh node add \
--did did:vm:node:new-node-01 \
--endpoint https://new-node-01.vaultmesh.io \
--type infrastructure
# 4. Grant capabilities
vm-identity capability grant \
--subject did:vm:node:new-node-01 \
--capability storage,compute
# 5. Verify
vm-mesh node status new-node-01
Key Rotation Ceremony
# 1. Initiate ceremony
vm-identity key-rotate \
--did did:vm:node:brick-01 \
--ceremony-type standard
# 2. Generate new keypair (on target node)
vm-identity key-generate --algorithm ed25519
# 3. Witness signatures (from other nodes)
vm-identity key-witness \
--ceremony ceremony-2025-12-001 \
--witness did:vm:node:brick-02
# 4. Publish new key
vm-identity key-publish --ceremony ceremony-2025-12-001
# 5. Verify propagation
vm-identity did resolve did:vm:node:brick-01
Create Security Drill
# 1. Create drill from prompt
vm-drills create \
--prompt "Detect and respond to ransomware encryption" \
--severity high \
--skills detection-defense-ir,kubernetes-security
# 2. Review generated contract
vm-drills show drill-2025-12-001
# 3. Start execution
vm-drills start drill-2025-12-001
# 4. Complete stages
vm-drills complete-stage drill-2025-12-001 stage-1 \
--outputs cases/drills/drill-2025-12-001/stage-1/ \
--findings "Identified encryption patterns"
# 5. Seal drill
vm-drills seal drill-2025-12-001
Initiate Transmutation
# 1. Start transmutation from incident
vm-psi transmute start \
--input INC-2025-12-001 \
--input-type security_incident \
--title "SSH Brute Force to Detection"
# 2. Extract IOCs
vm-psi transmute step transmute-2025-12-001 extract
# 3. Dissolve to standard format
vm-psi transmute step transmute-2025-12-001 dissolve
# 4. Purify (validate)
vm-psi transmute step transmute-2025-12-001 purify
# 5. Coagulate (generate rules)
vm-psi transmute step transmute-2025-12-001 coagulate
# 6. Seal
vm-psi transmute seal transmute-2025-12-001
Troubleshooting
Anchor Failures
Symptom: vm-guardian anchor-status shows failures
Diagnosis:
# Check guardian logs
kubectl logs -n vaultmesh -l app.kubernetes.io/name=guardian --tail=100
# Check anchor backend connectivity
vm-guardian test-backend ethereum
vm-guardian test-backend ots
# Check pending receipts
vm-guardian pending-receipts
Common Causes:
- Network issues: Check Ethereum RPC connectivity
- Insufficient funds: Check anchor wallet balance
- Rate limiting: Check if backend is rate limiting
- Configuration: Verify anchor config
Resolution:
# Retry anchor
vm-guardian anchor-now --backend ots --wait
# If Ethereum issues, switch to OTS temporarily
vm-guardian config set anchor.primary ots
# Check and top up wallet
vm-guardian wallet balance
vm-guardian wallet fund --amount 0.1
Receipt Integrity Errors
Symptom: verify-all reports mismatches
Diagnosis:
# Identify affected scroll
vm-guardian verify-all --scroll all --verbose
# Check specific receipt
vm-guardian verify-receipt blake3:... --scroll Compliance --debug
# Compare computed vs stored root
vm-guardian compute-root --scroll Compliance
cat receipts/ROOT.compliance.txt
Common Causes:
- Corrupted JSONL: File system issues
- Incomplete write: Process interrupted
- Manual modification: Violation of AXIOM-001
Resolution:
# If corruption detected, restore from backup
vm-cli backup restore --backup-id backup-2025-12-05 --scroll Compliance
# Recompute root after restore
vm-guardian recompute-root --scroll Compliance
# Trigger anchor to seal restored state
vm-guardian anchor-now --scroll Compliance --wait
Node Connectivity Issues
Symptom: Node showing unhealthy in mesh
Diagnosis:
# Check node status
vm-mesh node status brick-02
# Test connectivity
vm-mesh ping brick-02
# Check routes
vm-mesh routes list --node brick-02
# Check node logs
# brick-02 is addressed as a Deployment in the resolution step
# (kubectl rollout restart deployment/brick-02); its pods carry generated
# name suffixes, so target the Deployment rather than a pod named "brick-02".
kubectl logs -n vaultmesh deployment/brick-02 --tail=100
Common Causes:
- Network partition: Firewall/network issues
- Resource exhaustion: Node overloaded
- Certificate expiry: TLS cert expired
- Process crash: Service died
Resolution:
# Restart node pod
kubectl rollout restart deployment/brick-02 -n vaultmesh
# If cert expired
vm-identity cert-renew --node brick-02
# If persistent issues, remove and re-add
vm-mesh node remove brick-02 --force
vm-mesh node add --did did:vm:node:brick-02 --endpoint https://...
Oracle Query Failures
Symptom: Oracle returning errors
Diagnosis:
# Check oracle health
vm-oracle health
# Check LLM connectivity
vm-oracle test-llm anthropic
vm-oracle test-llm openai
# Check corpus status
vm-oracle corpus status
# Check logs
kubectl logs -n vaultmesh -l app.kubernetes.io/name=oracle --tail=100
Common Causes:
- LLM API issues: Rate limiting, key expiry
- Corpus empty: Documents not loaded
- Index corruption: Vector index issues
- Memory exhaustion: OOM conditions
Resolution:
# Rotate API key if expired
# Read the key from a silent prompt so it never lands in shell history or in
# the process list, which --from-literal=... on the command line would expose.
read -rs -p "New Anthropic API key: " new_anthropic_key && echo
# printf is a shell builtin, so the key is handed to kubectl through a file
# descriptor rather than through any process's argv.
# NOTE(review): the generated manifest contains only anthropic-key - confirm
# that applying it does not drop other keys stored in this secret.
kubectl create secret generic oracle-llm-credentials \
  --from-file=anthropic-key=<(printf '%s' "$new_anthropic_key") \
  -n vaultmesh --dry-run=client -o yaml | kubectl apply -f -
unset new_anthropic_key
# Reload corpus
vm-oracle corpus reload
# Rebuild index
vm-oracle corpus reindex
# Restart oracle
kubectl rollout restart deployment/vaultmesh-oracle -n vaultmesh
Phase Stuck in Nigredo
Symptom: System in Nigredo for extended period
Diagnosis:
# Check phase details
vm-psi phase current --verbose
# Check active incidents
vm-offsec incidents list --status open
# Check for blocking issues
vm-psi blockers
# Review phase history
vm-psi phase history --last 7d
Common Causes:
- Unresolved incident: Active security issue
- Failed transmutation: Stuck in process
- Missing witness: Transmutation waiting for signature
- Metric threshold: Health metrics below threshold
Resolution:
# Close incident if resolved
vm-offsec incident close INC-2025-12-001 \
--resolution "Threat neutralized, systems restored"
# Complete stuck transmutation
vm-psi transmute force-complete transmute-2025-12-001
# Manual phase transition (requires justification)
vm-psi phase transition albedo \
--reason "Incident resolved, metrics stable" \
--evidence evidence-report.md
Constitutional Violation Detected
Symptom: gov_violation alert fired
Diagnosis:
# View violation details
vm-gov violations show VIOL-2025-12-001
# Check what was attempted
vm-gov violations evidence VIOL-2025-12-001
# Review enforcement action
vm-gov enforcement show ENF-2025-12-001
Common Causes:
- Agent misconfiguration: Automation tried unauthorized action
- Capability expiry: Token expired mid-operation
- Bug in engine: Logic error attempting violation
- Attack attempt: Malicious action blocked
Resolution:
# If false positive, dismiss
vm-gov violations review VIOL-2025-12-001 \
--decision dismiss \
--reason "False positive due to timing issue"
# If real, review and uphold enforcement
vm-gov enforcement review ENF-2025-12-001 --decision uphold
# Fix underlying issue
# (depends on specific violation)
Backup & Recovery
Scheduled Backups
# Full backup
vm-cli backup create --type full
# Incremental backup
vm-cli backup create --type incremental
# List backups
vm-cli backup list
# Verify backup integrity
vm-cli backup verify backup-2025-12-05
Recovery Procedures
# 1. Stop services
# Record every deployment's replica count first: --all scales the whole
# namespace to zero, but step 4 only names three deployments explicitly.
replica_file=$(mktemp)
kubectl get deployment -n vaultmesh \
  -o jsonpath='{range .items[*]}{.metadata.name}{" "}{.spec.replicas}{"\n"}{end}' \
  > "$replica_file"
kubectl scale deployment -n vaultmesh --replicas=0 --all
# 2. Restore from backup
vm-cli backup restore --backup-id backup-2025-12-05
# 3. Verify integrity
vm-guardian verify-all --scroll all
# 4. Restart services
kubectl scale deployment -n vaultmesh --replicas=2 \
  vaultmesh-portal vaultmesh-oracle
kubectl scale deployment -n vaultmesh --replicas=1 vaultmesh-guardian
# Bring back any other deployment that step 1 scaled to zero, using the
# replica count recorded before the shutdown.
while read -r name replicas; do
  [[ -n "$name" && -n "$replicas" ]] || continue
  case "$name" in
    vaultmesh-portal | vaultmesh-oracle | vaultmesh-guardian) continue ;;
  esac
  kubectl scale deployment -n vaultmesh --replicas="$replicas" "$name"
done < "$replica_file"
rm -f -- "$replica_file"
# 5. Trigger anchor to seal restored state
vm-guardian anchor-now --wait
Disaster Recovery
# Full rebuild from backup
./scripts/disaster-recovery.sh --backup backup-2025-12-05
# Verify federation peers
vm-federation verify-all
# Re-establish federation trust if needed
vm-federation re-establish --peer vaultmesh-berlin
Performance Tuning
Receipt Write Optimization
# config.toml
[receipts]
# Batch writes for better throughput
batch_size = 100
batch_timeout_ms = 100
# Compression
compression = "zstd"
compression_level = 3
# Index configuration
index_cache_size_mb = 512
Database Tuning
-- Vacuum and analyze
VACUUM ANALYZE receipts;
-- Check slow queries (requires the pg_stat_statements extension).
-- PostgreSQL 13+ names the column mean_exec_time; on PostgreSQL 12 and
-- older use mean_time instead. The alias keeps the output header stable.
SELECT query, calls, mean_exec_time AS mean_time
FROM pg_stat_statements
ORDER BY mean_exec_time DESC
LIMIT 10;
-- Index usage, least-scanned indexes first.
-- pg_stat_user_indexes exposes relname / indexrelname (not tablename /
-- indexname, which belong to pg_indexes); alias them for readable output.
SELECT schemaname, relname AS tablename, indexrelname AS indexname, idx_scan
FROM pg_stat_user_indexes
ORDER BY idx_scan;
Memory Optimization
# Check memory usage
# Lists resource usage for the pods in the vaultmesh namespace.
kubectl top pods -n vaultmesh
# Adjust limits if needed
# Patches the vaultmesh-oracle deployment's pod template: the container
# named "oracle" gets resources.limits.memory set to 8Gi. Only the fields
# present in the JSON patch are named here.
kubectl patch deployment vaultmesh-oracle -n vaultmesh \
-p '{"spec":{"template":{"spec":{"containers":[{"name":"oracle","resources":{"limits":{"memory":"8Gi"}}}]}}}}'
Monitoring Dashboards
Key Metrics to Watch
| Metric | Warning | Critical |
|---|---|---|
| vaultmesh_guardian_last_anchor_age | > 2h | > 4h |
| vaultmesh_receipt_write_errors_total | > 0 | > 10/min |
| vaultmesh_mesh_node_unhealthy | any | multiple |
| vaultmesh_oracle_latency_p95 | > 30s | > 60s |
| vaultmesh_governance_violations | any | critical |
| vaultmesh_psi_phase | nigredo > 24h | nigredo > 72h |
Alert Response
# Acknowledge alert
vm-alerts ack ALERT-2025-12-001
# Silence alert (for maintenance)
vm-alerts silence --matcher 'alertname="AnchorDelayed"' --duration 2h
# View active alerts
vm-alerts list --active