538 lines
11 KiB
Markdown
538 lines
11 KiB
Markdown
# VaultMesh Operations Guide
|
|
|
|
## Daily Operations
|
|
|
|
### Morning Health Check
|
|
|
|
```bash
|
|
#!/bin/bash
|
|
# scripts/morning-check.sh
|
|
|
|
echo "=== VaultMesh Morning Health Check ==="
|
|
echo "Date: $(date -u +%Y-%m-%dT%H:%M:%SZ)"
|
|
|
|
# 1. System health
|
|
echo -e "\n1. System Health"
|
|
vm-cli system health
|
|
|
|
# 2. Guardian status
|
|
echo -e "\n2. Guardian Status"
|
|
vm-guardian anchor-status
|
|
|
|
# 3. Phase status
|
|
echo -e "\n3. Current Phase"
|
|
vm-psi phase current
|
|
|
|
# 4. Overnight receipts
|
|
echo -e "\n4. Receipts (last 12h)"
|
|
vm-cli receipts count --since 12h
|
|
|
|
# 5. Any violations
|
|
echo -e "\n5. Governance Violations"
|
|
vm-gov violations list --since 24h --severity high,critical
|
|
|
|
# 6. Federation health
|
|
echo -e "\n6. Federation Status"
|
|
vm-federation health --all-peers
|
|
|
|
echo -e "\n=== Check Complete ==="
|
|
```
|
|
|
|
### Anchor Monitoring
|
|
|
|
```bash
|
|
# Check anchor status
|
|
vm-guardian anchor-status
|
|
|
|
# View anchor history
|
|
vm-guardian anchor-history --last 24h
|
|
|
|
# Trigger manual anchor if needed
|
|
vm-guardian anchor-now --wait
|
|
|
|
# Verify specific receipt
|
|
vm-guardian verify-receipt blake3:abc123... --scroll Compliance
|
|
```
|
|
|
|
### Receipt Queries
|
|
|
|
```bash
|
|
# Count receipts by scroll
|
|
vm-cli receipts count --by-scroll
|
|
|
|
# Search receipts
|
|
vm-cli receipts search --scroll Drills --from 2025-12-01 --to 2025-12-06
|
|
|
|
# Export receipts
|
|
vm-cli receipts export --scroll Compliance --format csv --output compliance.csv
|
|
|
|
# Verify integrity
|
|
vm-guardian verify-all --scroll all
|
|
```
|
|
|
|
---
|
|
|
|
## Common Tasks
|
|
|
|
### Add New Node to Mesh
|
|
|
|
```bash
|
|
# 1. Create DID for new node
|
|
vm-identity did create --type node --id new-node-01
|
|
|
|
# 2. Issue node credential
|
|
vm-identity credential issue \
|
|
--type VaultMeshNodeCredential \
|
|
--subject did:vm:node:new-node-01 \
|
|
--issuer did:vm:node:portal-01
|
|
|
|
# 3. Add to mesh
|
|
vm-mesh node add \
|
|
--did did:vm:node:new-node-01 \
|
|
--endpoint https://new-node-01.vaultmesh.io \
|
|
--type infrastructure
|
|
|
|
# 4. Grant capabilities
|
|
vm-identity capability grant \
|
|
--subject did:vm:node:new-node-01 \
|
|
--capability storage,compute
|
|
|
|
# 5. Verify
|
|
vm-mesh node status new-node-01
|
|
```
|
|
|
|
### Key Rotation Ceremony
|
|
|
|
```bash
|
|
# 1. Initiate ceremony
|
|
vm-identity key-rotate \
|
|
--did did:vm:node:brick-01 \
|
|
--ceremony-type standard
|
|
|
|
# 2. Generate new keypair (on target node)
|
|
vm-identity key-generate --algorithm ed25519
|
|
|
|
# 3. Witness signatures (from other nodes)
|
|
vm-identity key-witness \
|
|
--ceremony ceremony-2025-12-001 \
|
|
--witness did:vm:node:brick-02
|
|
|
|
# 4. Publish new key
|
|
vm-identity key-publish --ceremony ceremony-2025-12-001
|
|
|
|
# 5. Verify propagation
|
|
vm-identity did resolve did:vm:node:brick-01
|
|
```
|
|
|
|
### Create Security Drill
|
|
|
|
```bash
|
|
# 1. Create drill from prompt
|
|
vm-drills create \
|
|
--prompt "Detect and respond to ransomware encryption" \
|
|
--severity high \
|
|
--skills detection-defense-ir,kubernetes-security
|
|
|
|
# 2. Review generated contract
|
|
vm-drills show drill-2025-12-001
|
|
|
|
# 3. Start execution
|
|
vm-drills start drill-2025-12-001
|
|
|
|
# 4. Complete stages
|
|
vm-drills complete-stage drill-2025-12-001 stage-1 \
|
|
--outputs cases/drills/drill-2025-12-001/stage-1/ \
|
|
--findings "Identified encryption patterns"
|
|
|
|
# 5. Seal drill
|
|
vm-drills seal drill-2025-12-001
|
|
```
|
|
|
|
### Initiate Transmutation
|
|
|
|
```bash
|
|
# 1. Start transmutation from incident
|
|
vm-psi transmute start \
|
|
--input INC-2025-12-001 \
|
|
--input-type security_incident \
|
|
--title "SSH Brute Force to Detection"
|
|
|
|
# 2. Extract IOCs
|
|
vm-psi transmute step transmute-2025-12-001 extract
|
|
|
|
# 3. Dissolve to standard format
|
|
vm-psi transmute step transmute-2025-12-001 dissolve
|
|
|
|
# 4. Purify (validate)
|
|
vm-psi transmute step transmute-2025-12-001 purify
|
|
|
|
# 5. Coagulate (generate rules)
|
|
vm-psi transmute step transmute-2025-12-001 coagulate
|
|
|
|
# 6. Seal
|
|
vm-psi transmute seal transmute-2025-12-001
|
|
```
|
|
|
|
---
|
|
|
|
## Troubleshooting
|
|
|
|
### Anchor Failures
|
|
|
|
**Symptom**: `vm-guardian anchor-status` shows failures
|
|
|
|
**Diagnosis**:
|
|
```bash
|
|
# Check guardian logs
|
|
kubectl logs -n vaultmesh -l app.kubernetes.io/name=guardian --tail=100
|
|
|
|
# Check anchor backend connectivity
|
|
vm-guardian test-backend ethereum
|
|
vm-guardian test-backend ots
|
|
|
|
# Check pending receipts
|
|
vm-guardian pending-receipts
|
|
```
|
|
|
|
**Common Causes**:
|
|
1. **Network issues**: Check Ethereum RPC connectivity
|
|
2. **Insufficient funds**: Check anchor wallet balance
|
|
3. **Rate limiting**: Check if backend is rate limiting
|
|
4. **Configuration**: Verify anchor config
|
|
|
|
**Resolution**:
|
|
```bash
|
|
# Retry anchor using the OTS backend
|
|
vm-guardian anchor-now --backend ots --wait
|
|
|
|
# If Ethereum issues, switch to OTS temporarily
|
|
vm-guardian config set anchor.primary ots
|
|
|
|
# Check and top up wallet
|
|
vm-guardian wallet balance
|
|
vm-guardian wallet fund --amount 0.1
|
|
```
|
|
|
|
### Receipt Integrity Errors
|
|
|
|
**Symptom**: `vm-guardian verify-all` reports mismatches
|
|
|
|
**Diagnosis**:
|
|
```bash
|
|
# Identify affected scroll
|
|
vm-guardian verify-all --scroll all --verbose
|
|
|
|
# Check specific receipt
|
|
vm-guardian verify-receipt blake3:... --scroll Compliance --debug
|
|
|
|
# Compare computed vs stored root
|
|
vm-guardian compute-root --scroll Compliance
|
|
cat receipts/ROOT.compliance.txt
|
|
```
|
|
|
|
**Common Causes**:
|
|
1. **Corrupted JSONL**: File system issues
|
|
2. **Incomplete write**: Process interrupted
|
|
3. **Manual modification**: Violation of AXIOM-001
|
|
|
|
**Resolution**:
|
|
```bash
|
|
# If corruption detected, restore from backup
|
|
vm-cli backup restore --backup-id backup-2025-12-05 --scroll Compliance
|
|
|
|
# Recompute root after restore
|
|
vm-guardian recompute-root --scroll Compliance
|
|
|
|
# Trigger anchor to seal restored state
|
|
vm-guardian anchor-now --scroll Compliance --wait
|
|
```
|
|
|
|
### Node Connectivity Issues
|
|
|
|
**Symptom**: Node shows as unhealthy in the mesh
|
|
|
|
**Diagnosis**:
|
|
```bash
|
|
# Check node status
|
|
vm-mesh node status brick-02
|
|
|
|
# Test connectivity
|
|
vm-mesh ping brick-02
|
|
|
|
# Check routes
|
|
vm-mesh routes list --node brick-02
|
|
|
|
# Check node logs
|
|
kubectl logs -n vaultmesh pod/brick-02 --tail=100
|
|
```
|
|
|
|
**Common Causes**:
|
|
1. **Network partition**: Firewall/network issues
|
|
2. **Resource exhaustion**: Node overloaded
|
|
3. **Certificate expiry**: TLS cert expired
|
|
4. **Process crash**: Service died
|
|
|
|
**Resolution**:
|
|
```bash
|
|
# Restart node pod
|
|
kubectl rollout restart deployment/brick-02 -n vaultmesh
|
|
|
|
# If cert expired
|
|
vm-identity cert-renew --node brick-02
|
|
|
|
# If persistent issues, remove and re-add
|
|
vm-mesh node remove brick-02 --force
|
|
vm-mesh node add --did did:vm:node:brick-02 --endpoint https://...
|
|
```
|
|
|
|
### Oracle Query Failures
|
|
|
|
**Symptom**: Oracle returning errors
|
|
|
|
**Diagnosis**:
|
|
```bash
|
|
# Check oracle health
|
|
vm-oracle health
|
|
|
|
# Check LLM connectivity
|
|
vm-oracle test-llm anthropic
|
|
vm-oracle test-llm openai
|
|
|
|
# Check corpus status
|
|
vm-oracle corpus status
|
|
|
|
# Check logs
|
|
kubectl logs -n vaultmesh -l app.kubernetes.io/name=oracle --tail=100
|
|
```
|
|
|
|
**Common Causes**:
|
|
1. **LLM API issues**: Rate limiting, key expiry
|
|
2. **Corpus empty**: Documents not loaded
|
|
3. **Index corruption**: Vector index issues
|
|
4. **Memory exhaustion**: OOM conditions
|
|
|
|
**Resolution**:
|
|
```bash
|
|
# Rotate API key if expired
|
|
kubectl create secret generic oracle-llm-credentials \
|
|
--from-literal=anthropic-key=NEW_KEY \
|
|
-n vaultmesh --dry-run=client -o yaml | kubectl apply -f -
|
|
|
|
# Reload corpus
|
|
vm-oracle corpus reload
|
|
|
|
# Rebuild index
|
|
vm-oracle corpus reindex
|
|
|
|
# Restart oracle
|
|
kubectl rollout restart deployment/vaultmesh-oracle -n vaultmesh
|
|
```
|
|
|
|
### Phase Stuck in Nigredo
|
|
|
|
**Symptom**: System remains in the Nigredo phase for an extended period
|
|
|
|
**Diagnosis**:
|
|
```bash
|
|
# Check phase details
|
|
vm-psi phase current --verbose
|
|
|
|
# Check active incidents
|
|
vm-offsec incidents list --status open
|
|
|
|
# Check for blocking issues
|
|
vm-psi blockers
|
|
|
|
# Review phase history
|
|
vm-psi phase history --last 7d
|
|
```
|
|
|
|
**Common Causes**:
|
|
1. **Unresolved incident**: Active security issue
|
|
2. **Failed transmutation**: Stuck in process
|
|
3. **Missing witness**: Transmutation waiting for signature
|
|
4. **Metric threshold**: Health metrics below threshold
|
|
|
|
**Resolution**:
|
|
```bash
|
|
# Close incident if resolved
|
|
vm-offsec incident close INC-2025-12-001 \
|
|
--resolution "Threat neutralized, systems restored"
|
|
|
|
# Complete stuck transmutation
|
|
vm-psi transmute force-complete transmute-2025-12-001
|
|
|
|
# Manual phase transition (requires justification)
|
|
vm-psi phase transition albedo \
|
|
--reason "Incident resolved, metrics stable" \
|
|
--evidence evidence-report.md
|
|
```
|
|
|
|
### Constitutional Violation Detected
|
|
|
|
**Symptom**: `gov_violation` alert fired
|
|
|
|
**Diagnosis**:
|
|
```bash
|
|
# View violation details
|
|
vm-gov violations show VIOL-2025-12-001
|
|
|
|
# Check what was attempted
|
|
vm-gov violations evidence VIOL-2025-12-001
|
|
|
|
# Review enforcement action
|
|
vm-gov enforcement show ENF-2025-12-001
|
|
```
|
|
|
|
**Common Causes**:
|
|
1. **Agent misconfiguration**: Automation tried unauthorized action
|
|
2. **Capability expiry**: Token expired mid-operation
|
|
3. **Bug in engine**: Logic error attempting violation
|
|
4. **Attack attempt**: Malicious action blocked
|
|
|
|
**Resolution**:
|
|
```bash
|
|
# If false positive, dismiss
|
|
vm-gov violations review VIOL-2025-12-001 \
|
|
--decision dismiss \
|
|
--reason "False positive due to timing issue"
|
|
|
|
# If real, review and uphold enforcement
|
|
vm-gov enforcement review ENF-2025-12-001 --decision uphold
|
|
|
|
# Fix underlying issue
|
|
# (depends on specific violation)
|
|
```
|
|
|
|
---
|
|
|
|
## Backup & Recovery
|
|
|
|
### Scheduled Backups
|
|
|
|
```bash
|
|
# Full backup
|
|
vm-cli backup create --type full
|
|
|
|
# Incremental backup
|
|
vm-cli backup create --type incremental
|
|
|
|
# List backups
|
|
vm-cli backup list
|
|
|
|
# Verify backup integrity
|
|
vm-cli backup verify backup-2025-12-05
|
|
```
|
|
|
|
### Recovery Procedures
|
|
|
|
```bash
|
|
# 1. Stop services
|
|
kubectl scale deployment -n vaultmesh --replicas=0 --all
|
|
|
|
# 2. Restore from backup
|
|
vm-cli backup restore --backup-id backup-2025-12-05
|
|
|
|
# 3. Verify integrity
|
|
vm-guardian verify-all --scroll all
|
|
|
|
# 4. Restart services
|
|
kubectl scale deployment -n vaultmesh --replicas=2 \
|
|
vaultmesh-portal vaultmesh-oracle
|
|
kubectl scale deployment -n vaultmesh --replicas=1 vaultmesh-guardian
|
|
|
|
# 5. Trigger anchor to seal restored state
|
|
vm-guardian anchor-now --wait
|
|
```
|
|
|
|
### Disaster Recovery
|
|
|
|
```bash
|
|
# Full rebuild from backup
|
|
./scripts/disaster-recovery.sh --backup backup-2025-12-05
|
|
|
|
# Verify federation peers
|
|
vm-federation verify-all
|
|
|
|
# Re-establish federation trust if needed
|
|
vm-federation re-establish --peer vaultmesh-berlin
|
|
```
|
|
|
|
---
|
|
|
|
## Performance Tuning
|
|
|
|
### Receipt Write Optimization
|
|
|
|
```toml
|
|
# config.toml
|
|
[receipts]
|
|
# Batch writes for better throughput
|
|
batch_size = 100
|
|
batch_timeout_ms = 100
|
|
|
|
# Compression
|
|
compression = "zstd"
|
|
compression_level = 3
|
|
|
|
# Index configuration
|
|
index_cache_size_mb = 512
|
|
```
|
|
|
|
### Database Tuning
|
|
|
|
```sql
|
|
-- Vacuum and analyze
|
|
VACUUM ANALYZE receipts;
|
|
|
|
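-- Check slow queries (requires the pg_stat_statements extension; on PostgreSQL 13+ the column is mean_exec_time, not mean_time)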
|
|
SELECT query, calls, mean_time
|
|
FROM pg_stat_statements
|
|
ORDER BY mean_time DESC
|
|
LIMIT 10;
|
|
|
|
-- Index usage (least-scanned indexes first)
|
|
SELECT schemaname, relname AS tablename, indexrelname AS indexname, idx_scan
|
|
FROM pg_stat_user_indexes
|
|
ORDER BY idx_scan;
|
|
```
|
|
|
|
### Memory Optimization
|
|
|
|
```bash
|
|
# Check memory usage
|
|
kubectl top pods -n vaultmesh
|
|
|
|
# Adjust limits if needed
|
|
kubectl patch deployment vaultmesh-oracle -n vaultmesh \
|
|
-p '{"spec":{"template":{"spec":{"containers":[{"name":"oracle","resources":{"limits":{"memory":"8Gi"}}}]}}}}'
|
|
```
|
|
|
|
---
|
|
|
|
## Monitoring Dashboards
|
|
|
|
### Key Metrics to Watch

| Metric | Warning | Critical |
|--------|---------|----------|
| `vaultmesh_guardian_last_anchor_age` | > 2h | > 4h |
| `vaultmesh_receipt_write_errors_total` | > 0 | > 10/min |
| `vaultmesh_mesh_node_unhealthy` | any | multiple |
| `vaultmesh_oracle_latency_p95` | > 30s | > 60s |
| `vaultmesh_governance_violations` | any | critical |
| `vaultmesh_psi_phase` | nigredo > 24h | nigredo > 72h |

### Alert Response
|
|
|
|
```bash
|
|
# Acknowledge alert
|
|
vm-alerts ack ALERT-2025-12-001
|
|
|
|
# Silence alert (for maintenance)
|
|
vm-alerts silence --matcher 'alertname="AnchorDelayed"' --duration 2h
|
|
|
|
# View active alerts
|
|
vm-alerts list --active
|
|
```
|