Initialize repository snapshot
This commit is contained in:
537
docs/skill/OPERATIONS.md
Normal file
537
docs/skill/OPERATIONS.md
Normal file
@@ -0,0 +1,537 @@
|
||||
# VaultMesh Operations Guide
|
||||
|
||||
## Daily Operations
|
||||
|
||||
### Morning Health Check
|
||||
|
||||
```bash
|
||||
#!/bin/bash
|
||||
# scripts/morning-check.sh
|
||||
|
||||
echo "=== VaultMesh Morning Health Check ==="
|
||||
echo "Date: $(date -u +%Y-%m-%dT%H:%M:%SZ)"
|
||||
|
||||
# 1. System health
|
||||
echo -e "\n1. System Health"
|
||||
vm-cli system health
|
||||
|
||||
# 2. Guardian status
|
||||
echo -e "\n2. Guardian Status"
|
||||
vm-guardian anchor-status
|
||||
|
||||
# 3. Phase status
|
||||
echo -e "\n3. Current Phase"
|
||||
vm-psi phase current
|
||||
|
||||
# 4. Overnight receipts
|
||||
echo -e "\n4. Receipts (last 12h)"
|
||||
vm-cli receipts count --since 12h
|
||||
|
||||
# 5. Any violations
|
||||
echo -e "\n5. Governance Violations"
|
||||
vm-gov violations list --since 24h --severity high,critical
|
||||
|
||||
# 6. Federation health
|
||||
echo -e "\n6. Federation Status"
|
||||
vm-federation health --all-peers
|
||||
|
||||
echo -e "\n=== Check Complete ==="
|
||||
```
|
||||
|
||||
### Anchor Monitoring
|
||||
|
||||
```bash
|
||||
# Check anchor status
|
||||
vm-guardian anchor-status
|
||||
|
||||
# View anchor history
|
||||
vm-guardian anchor-history --last 24h
|
||||
|
||||
# Trigger manual anchor if needed
|
||||
vm-guardian anchor-now --wait
|
||||
|
||||
# Verify specific receipt
|
||||
vm-guardian verify-receipt blake3:abc123... --scroll Compliance
|
||||
```
|
||||
|
||||
### Receipt Queries
|
||||
|
||||
```bash
|
||||
# Count receipts by scroll
|
||||
vm-cli receipts count --by-scroll
|
||||
|
||||
# Search receipts
|
||||
vm-cli receipts search --scroll Drills --from 2025-12-01 --to 2025-12-06
|
||||
|
||||
# Export receipts
|
||||
vm-cli receipts export --scroll Compliance --format csv --output compliance.csv
|
||||
|
||||
# Verify integrity
|
||||
vm-guardian verify-all --scroll all
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Common Tasks
|
||||
|
||||
### Add New Node to Mesh
|
||||
|
||||
```bash
|
||||
# 1. Create DID for new node
|
||||
vm-identity did create --type node --id new-node-01
|
||||
|
||||
# 2. Issue node credential
|
||||
vm-identity credential issue \
|
||||
--type VaultMeshNodeCredential \
|
||||
--subject did:vm:node:new-node-01 \
|
||||
--issuer did:vm:node:portal-01
|
||||
|
||||
# 3. Add to mesh
|
||||
vm-mesh node add \
|
||||
--did did:vm:node:new-node-01 \
|
||||
--endpoint https://new-node-01.vaultmesh.io \
|
||||
--type infrastructure
|
||||
|
||||
# 4. Grant capabilities
|
||||
vm-identity capability grant \
|
||||
--subject did:vm:node:new-node-01 \
|
||||
--capability storage,compute
|
||||
|
||||
# 5. Verify
|
||||
vm-mesh node status new-node-01
|
||||
```
|
||||
|
||||
### Key Rotation Ceremony
|
||||
|
||||
```bash
|
||||
# 1. Initiate ceremony
|
||||
vm-identity key-rotate \
|
||||
--did did:vm:node:brick-01 \
|
||||
--ceremony-type standard
|
||||
|
||||
# 2. Generate new keypair (on target node)
|
||||
vm-identity key-generate --algorithm ed25519
|
||||
|
||||
# 3. Witness signatures (from other nodes)
|
||||
vm-identity key-witness \
|
||||
--ceremony ceremony-2025-12-001 \
|
||||
--witness did:vm:node:brick-02
|
||||
|
||||
# 4. Publish new key
|
||||
vm-identity key-publish --ceremony ceremony-2025-12-001
|
||||
|
||||
# 5. Verify propagation
|
||||
vm-identity did resolve did:vm:node:brick-01
|
||||
```
|
||||
|
||||
### Create Security Drill
|
||||
|
||||
```bash
|
||||
# 1. Create drill from prompt
|
||||
vm-drills create \
|
||||
--prompt "Detect and respond to ransomware encryption" \
|
||||
--severity high \
|
||||
--skills detection-defense-ir,kubernetes-security
|
||||
|
||||
# 2. Review generated contract
|
||||
vm-drills show drill-2025-12-001
|
||||
|
||||
# 3. Start execution
|
||||
vm-drills start drill-2025-12-001
|
||||
|
||||
# 4. Complete stages
|
||||
vm-drills complete-stage drill-2025-12-001 stage-1 \
|
||||
--outputs cases/drills/drill-2025-12-001/stage-1/ \
|
||||
--findings "Identified encryption patterns"
|
||||
|
||||
# 5. Seal drill
|
||||
vm-drills seal drill-2025-12-001
|
||||
```
|
||||
|
||||
### Initiate Transmutation
|
||||
|
||||
```bash
|
||||
# 1. Start transmutation from incident
|
||||
vm-psi transmute start \
|
||||
--input INC-2025-12-001 \
|
||||
--input-type security_incident \
|
||||
--title "SSH Brute Force to Detection"
|
||||
|
||||
# 2. Extract IOCs
|
||||
vm-psi transmute step transmute-2025-12-001 extract
|
||||
|
||||
# 3. Dissolve to standard format
|
||||
vm-psi transmute step transmute-2025-12-001 dissolve
|
||||
|
||||
# 4. Purify (validate)
|
||||
vm-psi transmute step transmute-2025-12-001 purify
|
||||
|
||||
# 5. Coagulate (generate rules)
|
||||
vm-psi transmute step transmute-2025-12-001 coagulate
|
||||
|
||||
# 6. Seal
|
||||
vm-psi transmute seal transmute-2025-12-001
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Anchor Failures
|
||||
|
||||
**Symptom**: `vm-guardian anchor-status` shows failures
|
||||
|
||||
**Diagnosis**:
|
||||
```bash
|
||||
# Check guardian logs
|
||||
kubectl logs -n vaultmesh -l app.kubernetes.io/name=guardian --tail=100
|
||||
|
||||
# Check anchor backend connectivity
|
||||
vm-guardian test-backend ethereum
|
||||
vm-guardian test-backend ots
|
||||
|
||||
# Check pending receipts
|
||||
vm-guardian pending-receipts
|
||||
```
|
||||
|
||||
**Common Causes**:
|
||||
1. **Network issues**: Check Ethereum RPC connectivity
|
||||
2. **Insufficient funds**: Check anchor wallet balance
|
||||
3. **Rate limiting**: Check if backend is rate limiting
|
||||
4. **Configuration**: Verify anchor config
|
||||
|
||||
**Resolution**:
|
||||
```bash
|
||||
# Retry anchor using the OTS backend
|
||||
vm-guardian anchor-now --backend ots --wait
|
||||
|
||||
# If Ethereum issues, switch to OTS temporarily
|
||||
vm-guardian config set anchor.primary ots
|
||||
|
||||
# Check and top up wallet
|
||||
vm-guardian wallet balance
|
||||
vm-guardian wallet fund --amount 0.1
|
||||
```
|
||||
|
||||
### Receipt Integrity Errors
|
||||
|
||||
**Symptom**: `verify-all` reports mismatches
|
||||
|
||||
**Diagnosis**:
|
||||
```bash
|
||||
# Identify affected scroll
|
||||
vm-guardian verify-all --scroll all --verbose
|
||||
|
||||
# Check specific receipt
|
||||
vm-guardian verify-receipt blake3:... --scroll Compliance --debug
|
||||
|
||||
# Compare computed vs stored root
|
||||
vm-guardian compute-root --scroll Compliance
|
||||
cat receipts/ROOT.compliance.txt
|
||||
```
|
||||
|
||||
**Common Causes**:
|
||||
1. **Corrupted JSONL**: File system issues
|
||||
2. **Incomplete write**: Process interrupted
|
||||
3. **Manual modification**: Violation of AXIOM-001
|
||||
|
||||
**Resolution**:
|
||||
```bash
|
||||
# If corruption detected, restore from backup
|
||||
vm-cli backup restore --backup-id backup-2025-12-05 --scroll Compliance
|
||||
|
||||
# Recompute root after restore
|
||||
vm-guardian recompute-root --scroll Compliance
|
||||
|
||||
# Trigger anchor to seal restored state
|
||||
vm-guardian anchor-now --scroll Compliance --wait
|
||||
```
|
||||
|
||||
### Node Connectivity Issues
|
||||
|
||||
**Symptom**: Node showing unhealthy in mesh
|
||||
|
||||
**Diagnosis**:
|
||||
```bash
|
||||
# Check node status
|
||||
vm-mesh node status brick-02
|
||||
|
||||
# Test connectivity
|
||||
vm-mesh ping brick-02
|
||||
|
||||
# Check routes
|
||||
vm-mesh routes list --node brick-02
|
||||
|
||||
# Check node logs
|
||||
kubectl logs -n vaultmesh pod/brick-02 --tail=100
|
||||
```
|
||||
|
||||
**Common Causes**:
|
||||
1. **Network partition**: Firewall/network issues
|
||||
2. **Resource exhaustion**: Node overloaded
|
||||
3. **Certificate expiry**: TLS cert expired
|
||||
4. **Process crash**: Service died
|
||||
|
||||
**Resolution**:
|
||||
```bash
|
||||
# Restart node pod
|
||||
kubectl rollout restart deployment/brick-02 -n vaultmesh
|
||||
|
||||
# If cert expired
|
||||
vm-identity cert-renew --node brick-02
|
||||
|
||||
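# If issues persist, remove and re-add (replace https://... with the node's real endpoint; see "Add New Node to Mesh" for the full set of flags)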
|
||||
vm-mesh node remove brick-02 --force
|
||||
vm-mesh node add --did did:vm:node:brick-02 --endpoint https://...
|
||||
```
|
||||
|
||||
### Oracle Query Failures
|
||||
|
||||
**Symptom**: Oracle returning errors
|
||||
|
||||
**Diagnosis**:
|
||||
```bash
|
||||
# Check oracle health
|
||||
vm-oracle health
|
||||
|
||||
# Check LLM connectivity
|
||||
vm-oracle test-llm anthropic
|
||||
vm-oracle test-llm openai
|
||||
|
||||
# Check corpus status
|
||||
vm-oracle corpus status
|
||||
|
||||
# Check logs
|
||||
kubectl logs -n vaultmesh -l app.kubernetes.io/name=oracle --tail=100
|
||||
```
|
||||
|
||||
**Common Causes**:
|
||||
1. **LLM API issues**: Rate limiting, key expiry
|
||||
2. **Corpus empty**: Documents not loaded
|
||||
3. **Index corruption**: Vector index issues
|
||||
4. **Memory exhaustion**: OOM conditions
|
||||
|
||||
**Resolution**:
|
||||
```bash
|
||||
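# Rotate API key if expired (replace NEW_KEY with the real key; the generated manifest contains only the keys listed here, so include every key the secret must hold)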
|
||||
kubectl create secret generic oracle-llm-credentials \
|
||||
--from-literal=anthropic-key=NEW_KEY \
|
||||
-n vaultmesh --dry-run=client -o yaml | kubectl apply -f -
|
||||
|
||||
# Reload corpus
|
||||
vm-oracle corpus reload
|
||||
|
||||
# Rebuild index
|
||||
vm-oracle corpus reindex
|
||||
|
||||
# Restart oracle
|
||||
kubectl rollout restart deployment/vaultmesh-oracle -n vaultmesh
|
||||
```
|
||||
|
||||
### Phase Stuck in Nigredo
|
||||
|
||||
**Symptom**: System in Nigredo for extended period
|
||||
|
||||
**Diagnosis**:
|
||||
```bash
|
||||
# Check phase details
|
||||
vm-psi phase current --verbose
|
||||
|
||||
# Check active incidents
|
||||
vm-offsec incidents list --status open
|
||||
|
||||
# Check for blocking issues
|
||||
vm-psi blockers
|
||||
|
||||
# Review phase history
|
||||
vm-psi phase history --last 7d
|
||||
```
|
||||
|
||||
**Common Causes**:
|
||||
1. **Unresolved incident**: Active security issue
|
||||
2. **Failed transmutation**: Stuck in process
|
||||
3. **Missing witness**: Transmutation waiting for signature
|
||||
4. **Metric threshold**: Health metrics below threshold
|
||||
|
||||
**Resolution**:
|
||||
```bash
|
||||
# Close incident if resolved
|
||||
vm-offsec incident close INC-2025-12-001 \
|
||||
--resolution "Threat neutralized, systems restored"
|
||||
|
||||
# Complete stuck transmutation
|
||||
vm-psi transmute force-complete transmute-2025-12-001
|
||||
|
||||
# Manual phase transition (requires justification)
|
||||
vm-psi phase transition albedo \
|
||||
--reason "Incident resolved, metrics stable" \
|
||||
--evidence evidence-report.md
|
||||
```
|
||||
|
||||
### Constitutional Violation Detected
|
||||
|
||||
**Symptom**: `gov_violation` alert fired
|
||||
|
||||
**Diagnosis**:
|
||||
```bash
|
||||
# View violation details
|
||||
vm-gov violations show VIOL-2025-12-001
|
||||
|
||||
# Check what was attempted
|
||||
vm-gov violations evidence VIOL-2025-12-001
|
||||
|
||||
# Review enforcement action
|
||||
vm-gov enforcement show ENF-2025-12-001
|
||||
```
|
||||
|
||||
**Common Causes**:
|
||||
1. **Agent misconfiguration**: Automation tried unauthorized action
|
||||
2. **Capability expiry**: Token expired mid-operation
|
||||
3. **Bug in engine**: Logic error attempting violation
|
||||
4. **Attack attempt**: Malicious action blocked
|
||||
|
||||
**Resolution**:
|
||||
```bash
|
||||
# If false positive, dismiss
|
||||
vm-gov violations review VIOL-2025-12-001 \
|
||||
--decision dismiss \
|
||||
--reason "False positive due to timing issue"
|
||||
|
||||
# If real, review and uphold enforcement
|
||||
vm-gov enforcement review ENF-2025-12-001 --decision uphold
|
||||
|
||||
# Fix underlying issue
|
||||
# (depends on specific violation)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Backup & Recovery
|
||||
|
||||
### Scheduled Backups
|
||||
|
||||
```bash
|
||||
# Full backup
|
||||
vm-cli backup create --type full
|
||||
|
||||
# Incremental backup
|
||||
vm-cli backup create --type incremental
|
||||
|
||||
# List backups
|
||||
vm-cli backup list
|
||||
|
||||
# Verify backup integrity
|
||||
vm-cli backup verify backup-2025-12-05
|
||||
```
|
||||
|
||||
### Recovery Procedures
|
||||
|
||||
```bash
|
||||
# 1. Stop services
|
||||
kubectl scale deployment -n vaultmesh --replicas=0 --all
|
||||
|
||||
# 2. Restore from backup
|
||||
vm-cli backup restore --backup-id backup-2025-12-05
|
||||
|
||||
# 3. Verify integrity
|
||||
vm-guardian verify-all --scroll all
|
||||
|
||||
# 4. Restart services
|
||||
kubectl scale deployment -n vaultmesh --replicas=2 \
|
||||
vaultmesh-portal vaultmesh-oracle
|
||||
kubectl scale deployment -n vaultmesh --replicas=1 vaultmesh-guardian
|
||||
|
||||
# 5. Trigger anchor to seal restored state
|
||||
vm-guardian anchor-now --wait
|
||||
```
|
||||
|
||||
### Disaster Recovery
|
||||
|
||||
```bash
|
||||
# Full rebuild from backup
|
||||
./scripts/disaster-recovery.sh --backup backup-2025-12-05
|
||||
|
||||
# Verify federation peers
|
||||
vm-federation verify-all
|
||||
|
||||
# Re-establish federation trust if needed
|
||||
vm-federation re-establish --peer vaultmesh-berlin
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Performance Tuning
|
||||
|
||||
### Receipt Write Optimization
|
||||
|
||||
```toml
|
||||
# config.toml
|
||||
[receipts]
|
||||
# Batch writes for better throughput
|
||||
batch_size = 100
|
||||
batch_timeout_ms = 100
|
||||
|
||||
# Compression
|
||||
compression = "zstd"
|
||||
compression_level = 3
|
||||
|
||||
# Index configuration
|
||||
index_cache_size_mb = 512
|
||||
```
|
||||
|
||||
### Database Tuning
|
||||
|
||||
```sql
|
||||
-- Vacuum and analyze
|
||||
VACUUM ANALYZE receipts;
|
||||
|
||||
-- Check slow queries (on PostgreSQL 13+, mean_time was renamed mean_exec_time; adjust both references below)
|
||||
SELECT query, calls, mean_time
|
||||
FROM pg_stat_statements
|
||||
ORDER BY mean_time DESC
|
||||
LIMIT 10;
|
||||
|
||||
-- Index usage
|
||||
SELECT schemaname, relname AS tablename, indexrelname AS indexname, idx_scan
|
||||
FROM pg_stat_user_indexes
|
||||
ORDER BY idx_scan;
|
||||
```
|
||||
|
||||
### Memory Optimization
|
||||
|
||||
```bash
|
||||
# Check memory usage
|
||||
kubectl top pods -n vaultmesh
|
||||
|
||||
# Adjust limits if needed
|
||||
kubectl patch deployment vaultmesh-oracle -n vaultmesh \
|
||||
-p '{"spec":{"template":{"spec":{"containers":[{"name":"oracle","resources":{"limits":{"memory":"8Gi"}}}]}}}}'
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Monitoring Dashboards
|
||||
|
||||
### Key Metrics to Watch
|
||||
|
||||
| Metric | Warning | Critical |
|
||||
|--------|---------|----------|
|
||||
| `vaultmesh_guardian_last_anchor_age` | > 2h | > 4h |
|
||||
| `vaultmesh_receipt_write_errors_total` | > 0 | > 10/min |
|
||||
| `vaultmesh_mesh_node_unhealthy` | any | multiple |
|
||||
| `vaultmesh_oracle_latency_p95` | > 30s | > 60s |
|
||||
| `vaultmesh_governance_violations` | any | critical |
|
||||
| `vaultmesh_psi_phase` | nigredo > 24h | nigredo > 72h |
|
||||
|
||||
### Alert Response
|
||||
|
||||
```bash
|
||||
# Acknowledge alert
|
||||
vm-alerts ack ALERT-2025-12-001
|
||||
|
||||
# Silence alert (for maintenance)
|
||||
vm-alerts silence --matcher 'alertname="AnchorDelayed"' --duration 2h
|
||||
|
||||
# View active alerts
|
||||
vm-alerts list --active
|
||||
```
|
||||
Reference in New Issue
Block a user