chore: initial import

This commit is contained in:
Sovereign
2025-12-18 00:29:15 +01:00
commit 789397eb33
22 changed files with 5944 additions and 0 deletions


@@ -0,0 +1 @@
requests>=2.28.0


@@ -0,0 +1,305 @@
#!/usr/bin/env python3
"""
VaultMesh vm-copilot Agent
A bot that monitors Command Center nodes and posts incidents/notes
when attention conditions are detected.
Usage:
python vm_copilot_agent.py
Environment variables:
VAULTMESH_CC_URL - Command Center URL (default: http://127.0.0.1:8088)
VM_COPILOT_INTERVAL - Poll interval in seconds (default: 60)
VM_COPILOT_NODES - Comma-separated list of node IDs to watch
"""
import os
import sys
import time
import logging
from typing import Optional
import requests

# -----------------------------------------------------------------------------
# Configuration
# -----------------------------------------------------------------------------
CC_URL = os.environ.get("VAULTMESH_CC_URL", "http://127.0.0.1:8088")
POLL_INTERVAL = int(os.environ.get("VM_COPILOT_INTERVAL", "60"))
# Node IDs to watch (can be overridden via env var)
DEFAULT_NODES = [
"b5d5a72e-71d9-46d7-a4d7-a1c89d53224b", # gamma (machine-id derived)
]
WATCH_NODES = os.environ.get("VM_COPILOT_NODES", ",".join(DEFAULT_NODES)).split(",")
WATCH_NODES = [n.strip() for n in WATCH_NODES if n.strip()]
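# Example override (hypothetical IDs; surrounding whitespace is stripped above):
#   export VM_COPILOT_NODES="<uuid-1>, <uuid-2>"
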
AUTHOR = "vm-copilot"
# Logging
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(levelname)s] %(message)s",
datefmt="%Y-%m-%d %H:%M:%S",
)
log = logging.getLogger("vm-copilot")

# -----------------------------------------------------------------------------
# State Tracking
# -----------------------------------------------------------------------------
# Track conditions we've already alerted on: node_id -> set of condition keys
alerted_conditions: dict[str, set[str]] = {}
# Track last event timestamp per node for incremental fetching
last_event_ts: dict[str, str] = {}
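
# Illustrative contents after a few cycles (hypothetical values; the timestamp is
# whatever CC returns in an event's "ts" field):
#   alerted_conditions = {"<node-uuid>": {"critical_findings", "never_scanned"}}
#   last_event_ts      = {"<node-uuid>": "2025-12-18T00:00:00Z"}
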
# -----------------------------------------------------------------------------
# HTTP Helpers
# -----------------------------------------------------------------------------
def get_node(node_id: str) -> Optional[dict]:
"""Fetch a single node's state from CC by querying the nodes list."""
try:
# /nodes returns JSON array of all nodes
resp = requests.get(f"{CC_URL}/nodes", timeout=10)
if resp.status_code == 200:
nodes = resp.json()
# Find the node by ID
for node in nodes:
if node.get("node_id") == node_id:
return node
log.warning(f"Node {node_id} not found in nodes list")
return None
else:
log.error(f"Failed to get nodes: {resp.status_code}")
return None
except requests.RequestException as e:
log.error(f"Request error getting nodes: {e}")
return None
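
# Shape of a node object as assumed by this agent (inferred from the fields read
# here and in analyze_node(); not taken from a CC API spec):
#   {"node_id": "<uuid>", "hostname": "gamma", "attention": ["critical_findings"],
#    "last_seen": "<timestamp>", "last_scan_summary": {"critical": 0, "high": 2}}
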
def get_events(
node_id: str,
since: Optional[str] = None,
kinds: Optional[list[str]] = None,
limit: int = 50,
) -> list[dict]:
"""Query events for a node from CC."""
params = {"node_id": node_id, "limit": str(limit)}
if since:
params["since"] = since
if kinds:
params["kind"] = ",".join(kinds)
try:
resp = requests.get(f"{CC_URL}/api/events", params=params, timeout=10)
if resp.status_code == 200:
return resp.json()
else:
log.error(f"Failed to get events: {resp.status_code}")
return []
except requests.RequestException as e:
log.error(f"Request error getting events: {e}")
return []
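
# Shape of an event object as assumed by this agent: the fields posted below plus
# server-assigned "id" and "ts" read elsewhere (an assumption, not a CC spec):
#   {"id": "<event-id>", "ts": "<timestamp>", "kind": "ack",
#    "node_id": "<uuid>", "author": "operator", "body": {...}}
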
def post_event(kind: str, node_id: str, body: dict) -> Optional[dict]:
"""Post an event to CC."""
payload = {
"kind": kind,
"node_id": node_id,
"author": AUTHOR,
"body": body,
}
try:
resp = requests.post(
f"{CC_URL}/api/events",
json=payload,
headers={"Content-Type": "application/json"},
timeout=10,
)
if resp.status_code == 200:
result = resp.json()
log.info(f"Posted {kind} event: {result.get('id', 'unknown')}")
return result
else:
log.error(f"Failed to post event: {resp.status_code} - {resp.text}")
return None
except requests.RequestException as e:
log.error(f"Request error posting event: {e}")
return None
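
# Example call (hypothetical body, mirroring the note bodies built by analyze_node below):
#   post_event("note", node_id, {"text": "Manual check requested", "severity": "info"})
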
# -----------------------------------------------------------------------------
# Rule Engine
# -----------------------------------------------------------------------------
def analyze_node(node: dict, recent_events: list[dict]) -> list[tuple[str, str, dict]]:
"""
Analyze a node's state and return events to emit.
Returns:
List of (kind, condition_key, body) tuples
"""
to_emit = []
attention = node.get("attention", [])
hostname = node.get("hostname", "unknown")
node_id = node.get("node_id", "unknown")
# Rule 1: critical_findings - needs attention
if "critical_findings" in attention:
scan_summary = node.get("last_scan_summary")
crit_count = 0
if scan_summary:
crit_count = scan_summary.get("critical", 0) + scan_summary.get("high", 0)
to_emit.append((
"incident",
"critical_findings",
{
"title": "Critical Findings Detected",
"description": f"Node {hostname} has {crit_count} critical/high scan findings requiring attention",
"severity": "high",
},
))
to_emit.append((
"note",
"critical_findings_note",
{
"text": f"Auto-detected critical_findings on node {hostname}. Recommend manual review of scan results.",
"severity": "warn",
},
))
# Rule 2: heartbeat_stale - node may be offline
if "heartbeat_stale" in attention:
last_seen = node.get("last_seen", "unknown")
to_emit.append((
"incident",
"heartbeat_stale",
{
"title": "Node Heartbeat Stale",
"description": f"Node {hostname} has not sent a heartbeat recently. Last seen: {last_seen}",
"severity": "high",
},
))
# Rule 3: cloudflare_down - informational note
if "cloudflare_down" in attention:
to_emit.append((
"note",
"cloudflare_down",
{
"text": f"Node {hostname} reports Cloudflare tunnel is down.",
"severity": "warn",
},
))
# Rule 4: never_scanned - needs initial scan
if "never_scanned" in attention:
to_emit.append((
"note",
"never_scanned",
{
"text": f"Node {hostname} has never been scanned. Recommend triggering initial sovereign scan.",
"severity": "info",
},
))
    return to_emit

def has_recent_operator_ack(events: list[dict]) -> bool:
"""Check if there's a recent ack from someone other than vm-copilot."""
for ev in events:
if ev.get("kind") == "ack":
author = ev.get("author", "")
if author and author != AUTHOR:
return True
    return False

# -----------------------------------------------------------------------------
# Main Loop
# -----------------------------------------------------------------------------
def run_cycle():
"""Run a single monitoring cycle."""
for node_id in WATCH_NODES:
log.debug(f"Checking node {node_id}")
# Fetch node state
node = get_node(node_id)
if not node:
continue
hostname = node.get("hostname", node_id[:8])
attention = node.get("attention", [])
log.info(f"Node {hostname}: attention={attention}")
# Fetch recent events
since = last_event_ts.get(node_id)
events = get_events(
node_id,
since=since,
kinds=["note", "incident", "ack", "resolve"],
)
        # Advance the incremental-fetch cursor for this node
        if events:
            # Take the newest timestamp regardless of the order events are returned in
latest_ts = max(ev.get("ts", "") for ev in events)
if latest_ts:
last_event_ts[node_id] = latest_ts
# Check for operator acks
recent_ack = has_recent_operator_ack(events)
if recent_ack:
log.info(f"Node {hostname}: operator ack detected, resetting alerts")
alerted_conditions.pop(node_id, None)
# Analyze and emit events
to_emit = analyze_node(node, events)
for kind, condition_key, body in to_emit:
# Skip if we've already alerted on this condition
node_alerts = alerted_conditions.setdefault(node_id, set())
if condition_key in node_alerts:
log.debug(f"Already alerted on {condition_key} for {hostname}")
continue
# Post the event
result = post_event(kind, node_id, body)
if result:
                node_alerts.add(condition_key)

def main():
"""Main entry point."""
log.info("=" * 60)
log.info("VaultMesh vm-copilot starting")
log.info(f" CC URL: {CC_URL}")
log.info(f" Poll interval: {POLL_INTERVAL}s")
log.info(f" Watching nodes: {WATCH_NODES}")
log.info("=" * 60)
while True:
try:
run_cycle()
except Exception as e:
log.exception(f"Error in monitoring cycle: {e}")
log.info(f"Sleeping {POLL_INTERVAL}s...")
        time.sleep(POLL_INTERVAL)

if __name__ == "__main__":
try:
main()
except KeyboardInterrupt:
log.info("Shutting down...")
sys.exit(0)