#!/usr/bin/env python3
"""
VaultMesh vm-copilot Agent

A bot that monitors Command Center nodes and posts incidents/notes
when attention conditions are detected.

Usage:
    python vm_copilot_agent.py

Environment variables:
    VAULTMESH_CC_URL     - Command Center URL (default: http://127.0.0.1:8088)
    VM_COPILOT_INTERVAL  - Poll interval in seconds (default: 60)
    VM_COPILOT_NODES     - Comma-separated list of node IDs to watch
"""

import os
import sys
import time
import logging
from datetime import datetime, timezone
from typing import Optional

import requests

# -----------------------------------------------------------------------------
# Configuration
# -----------------------------------------------------------------------------

CC_URL = os.environ.get("VAULTMESH_CC_URL", "http://127.0.0.1:8088")
POLL_INTERVAL = int(os.environ.get("VM_COPILOT_INTERVAL", "60"))

# Node IDs to watch (can be overridden via env var)
DEFAULT_NODES = [
    "b5d5a72e-71d9-46d7-a4d7-a1c89d53224b",  # gamma (machine-id derived)
]
WATCH_NODES = os.environ.get("VM_COPILOT_NODES", ",".join(DEFAULT_NODES)).split(",")
WATCH_NODES = [n.strip() for n in WATCH_NODES if n.strip()]

AUTHOR = "vm-copilot"

# Timeout (seconds) applied to every HTTP request made to the Command Center.
_HTTP_TIMEOUT = 10

# Logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
log = logging.getLogger("vm-copilot")

# -----------------------------------------------------------------------------
# State Tracking
# -----------------------------------------------------------------------------

# Track conditions we've already alerted on: node_id -> set of condition keys.
# NOTE(review): entries are only cleared when an operator ack is seen; a
# condition that clears and later re-appears without an ack will not be
# re-alerted. Confirm this is the intended behaviour.
alerted_conditions: dict[str, set[str]] = {}

# Track last event timestamp per node for incremental fetching
last_event_ts: dict[str, str] = {}


# -----------------------------------------------------------------------------
# HTTP Helpers
# -----------------------------------------------------------------------------

def _fetch_nodes() -> Optional[list]:
    """Download the full node list from CC.

    Returns:
        The decoded JSON array from ``GET {CC_URL}/nodes``, or None when the
        request fails, the status is not 200, or the body is not a JSON list.
    """
    try:
        resp = requests.get(f"{CC_URL}/nodes", timeout=_HTTP_TIMEOUT)
        if resp.status_code != 200:
            log.error(f"Failed to get nodes: {resp.status_code}")
            return None
        nodes = resp.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers JSON decode failures on requests versions whose
        # decode error is not a RequestException subclass.
        log.error(f"Request error getting nodes: {e}")
        return None

    if not isinstance(nodes, list):
        log.error(f"Unexpected nodes payload type: {type(nodes).__name__}")
        return None
    return nodes


def _find_node(nodes: list, node_id: str) -> Optional[dict]:
    """Return the entry of *nodes* whose ``node_id`` matches, else None (with a warning)."""
    for node in nodes:
        if isinstance(node, dict) and node.get("node_id") == node_id:
            return node
    log.warning(f"Node {node_id} not found in nodes list")
    return None


def get_node(node_id: str) -> Optional[dict]:
    """Fetch a single node's state from CC by querying the nodes list.

    Args:
        node_id: ID to look for in the ``node_id`` field of each entry.

    Returns:
        The matching node dict, or None if the list could not be fetched or
        contains no such node.
    """
    nodes = _fetch_nodes()
    if nodes is None:
        return None
    return _find_node(nodes, node_id)


def get_events(
    node_id: str,
    since: Optional[str] = None,
    kinds: Optional[list[str]] = None,
    limit: int = 50,
) -> list[dict]:
    """Query events for a node from CC.

    Args:
        node_id: Node whose events are requested.
        since: Optional value sent as the ``since`` query parameter.
        kinds: Optional event kinds, sent comma-joined as ``kind``.
        limit: Maximum number of events requested.

    Returns:
        The list of event dicts, or an empty list on any failure.
    """
    params = {"node_id": node_id, "limit": str(limit)}
    if since:
        params["since"] = since
    if kinds:
        params["kind"] = ",".join(kinds)

    try:
        resp = requests.get(f"{CC_URL}/api/events", params=params, timeout=_HTTP_TIMEOUT)
        if resp.status_code != 200:
            log.error(f"Failed to get events: {resp.status_code}")
            return []
        events = resp.json()
    except (requests.RequestException, ValueError) as e:
        log.error(f"Request error getting events: {e}")
        return []

    if not isinstance(events, list):
        log.error(f"Unexpected events payload type: {type(events).__name__}")
        return []
    # Callers use dict access on every event; drop anything that is not one.
    return [ev for ev in events if isinstance(ev, dict)]


def post_event(kind: str, node_id: str, body: dict) -> Optional[dict]:
    """Post an event to CC.

    Args:
        kind: Event kind (e.g. "incident" or "note").
        node_id: Node the event refers to.
        body: Event payload.

    Returns:
        The decoded JSON response on HTTP 200/201 (an empty dict when the
        response body is empty or not a JSON object), or None if the request
        failed or was rejected. Callers must test ``is not None`` rather than
        truthiness to detect success.
    """
    payload = {
        "kind": kind,
        "node_id": node_id,
        "author": AUTHOR,
        "payload": body,
    }
    try:
        resp = requests.post(
            f"{CC_URL}/api/events",
            json=payload,
            headers={"Content-Type": "application/json"},
            timeout=_HTTP_TIMEOUT,
        )
    except requests.RequestException as e:
        log.error(f"Request error posting event: {e}")
        return None

    if resp.status_code not in (200, 201):
        log.error(f"Failed to post event: {resp.status_code} - {resp.text}")
        return None

    # The event was accepted. An unparsable body must not be reported as a
    # failure, otherwise the caller re-posts the same event every cycle.
    try:
        result = resp.json()
    except ValueError:
        result = None
    if not isinstance(result, dict):
        result = {}

    log.info(f"Posted {kind} event: {result.get('id', 'unknown')}")
    return result


# -----------------------------------------------------------------------------
# Rule Engine
# -----------------------------------------------------------------------------

def analyze_node(node: dict, recent_events: list[dict]) -> list[tuple[str, str, dict]]:
    """
    Analyze a node's state and return events to emit.

    Args:
        node: Node record; the ``attention``, ``hostname``,
            ``last_scan_summary`` and ``last_seen`` fields are read.
        recent_events: Recent events for the node (currently unused by the
            rules, kept for interface compatibility).

    Returns:
        List of (kind, condition_key, body) tuples
    """
    to_emit = []
    # ``or []`` also covers an explicit JSON null, not just a missing key.
    attention = node.get("attention") or []
    hostname = node.get("hostname", "unknown")

    # Rule 1: critical_findings - needs attention
    if "critical_findings" in attention:
        scan_summary = node.get("last_scan_summary")
        crit_count = 0
        if isinstance(scan_summary, dict):
            crit_count = (scan_summary.get("critical") or 0) + (scan_summary.get("high") or 0)

        to_emit.append((
            "incident",
            "critical_findings",
            {
                "title": "Critical Findings Detected",
                "description": f"Node {hostname} has {crit_count} critical/high scan findings requiring attention",
                "severity": "high",
            },
        ))
        to_emit.append((
            "note",
            "critical_findings_note",
            {
                "text": f"Auto-detected critical_findings on node {hostname}. Recommend manual review of scan results.",
                "severity": "warn",
            },
        ))

    # Rule 2: heartbeat_stale - node may be offline
    if "heartbeat_stale" in attention:
        last_seen = node.get("last_seen", "unknown")
        to_emit.append((
            "incident",
            "heartbeat_stale",
            {
                "title": "Node Heartbeat Stale",
                "description": f"Node {hostname} has not sent a heartbeat recently. Last seen: {last_seen}",
                "severity": "high",
            },
        ))

    # Rule 3: cloudflare_down - informational note
    if "cloudflare_down" in attention:
        to_emit.append((
            "note",
            "cloudflare_down",
            {
                "text": f"Node {hostname} reports Cloudflare tunnel is down.",
                "severity": "warn",
            },
        ))

    # Rule 4: never_scanned - needs initial scan
    if "never_scanned" in attention:
        to_emit.append((
            "note",
            "never_scanned",
            {
                "text": f"Node {hostname} has never been scanned. Recommend triggering initial sovereign scan.",
                "severity": "info",
            },
        ))

    return to_emit


def has_recent_operator_ack(events: list[dict]) -> bool:
    """Check if there's a recent ack from someone other than vm-copilot.

    Returns True when any event has kind "ack" and a non-empty author that
    differs from AUTHOR.
    """
    return any(
        isinstance(ev, dict)
        and ev.get("kind") == "ack"
        and bool(ev.get("author"))
        and ev.get("author") != AUTHOR
        for ev in events
    )


# -----------------------------------------------------------------------------
# Main Loop
# -----------------------------------------------------------------------------

def _process_node(node_id: str, node: dict) -> None:
    """Evaluate one node: fetch its events, honour acks, post new alerts."""
    hostname = node.get("hostname") or node_id[:8]
    attention = node.get("attention", [])
    log.info(f"Node {hostname}: attention={attention}")

    # Fetch recent events
    since = last_event_ts.get(node_id)
    events = get_events(
        node_id,
        since=since,
        kinds=["note", "incident", "ack", "resolve"],
    )

    # Remember the newest timestamp seen; events with a missing/null ts are
    # ignored so they cannot break the comparison.
    timestamps = [ev["ts"] for ev in events if ev.get("ts")]
    if timestamps:
        last_event_ts[node_id] = max(timestamps)

    # Check for operator acks
    if has_recent_operator_ack(events):
        log.info(f"Node {hostname}: operator ack detected, resetting alerts")
        alerted_conditions.pop(node_id, None)

    # Analyze and emit events
    node_alerts = alerted_conditions.setdefault(node_id, set())
    for kind, condition_key, body in analyze_node(node, events):
        # Skip if we've already alerted on this condition
        if condition_key in node_alerts:
            log.debug(f"Already alerted on {condition_key} for {hostname}")
            continue

        # Post the event; None means failure, an empty dict is still success.
        if post_event(kind, node_id, body) is not None:
            node_alerts.add(condition_key)


def run_cycle():
    """Run a single monitoring cycle.

    The node list is fetched once and shared by all watched nodes. A failure
    while processing one node is logged and does not stop the others.
    """
    if not WATCH_NODES:
        return

    nodes = _fetch_nodes()
    if nodes is None:
        return

    for node_id in WATCH_NODES:
        log.debug(f"Checking node {node_id}")

        node = _find_node(nodes, node_id)
        if not node:
            continue

        try:
            _process_node(node_id, node)
        except Exception as e:
            # Per-node boundary: keep going so one bad record cannot starve
            # the nodes listed after it.
            log.exception(f"Error checking node {node_id}: {e}")


def main():
    """Main entry point: log the configuration, then poll forever."""
    log.info("=" * 60)
    log.info("VaultMesh vm-copilot starting")
    log.info(f"  CC URL: {CC_URL}")
    log.info(f"  Poll interval: {POLL_INTERVAL}s")
    log.info(f"  Watching nodes: {WATCH_NODES}")
    log.info("=" * 60)

    while True:
        try:
            run_cycle()
        except Exception as e:
            # Top-level boundary: a failed cycle must not kill the agent.
            log.exception(f"Error in monitoring cycle: {e}")

        log.info(f"Sleeping {POLL_INTERVAL}s...")
        time.sleep(POLL_INTERVAL)


if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        log.info("Shutting down...")
        sys.exit(0)