#!/usr/bin/env python3
"""
VaultMesh vm-copilot Agent

A bot that monitors Command Center nodes and posts incidents/notes
when attention conditions are detected.

Usage:
    python vm_copilot_agent.py

Environment variables:
    VAULTMESH_CC_URL    - Command Center URL (default: http://127.0.0.1:8088)
    VM_COPILOT_INTERVAL - Poll interval in seconds (default: 60)
    VM_COPILOT_NODES    - Comma-separated list of node IDs to watch
"""
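
# Example invocation (hypothetical values; adapt to your deployment):
#
#   VAULTMESH_CC_URL=http://cc.internal:8088 \
#   VM_COPILOT_INTERVAL=30 \
#   VM_COPILOT_NODES=<node-uuid-1>,<node-uuid-2> \
#   python vm_copilot_agent.py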

import logging
import os
import sys
import time
from typing import Optional

import requests

# -----------------------------------------------------------------------------
# Configuration
# -----------------------------------------------------------------------------

CC_URL = os.environ.get("VAULTMESH_CC_URL", "http://127.0.0.1:8088")
POLL_INTERVAL = int(os.environ.get("VM_COPILOT_INTERVAL", "60"))

# Node IDs to watch (can be overridden via env var)
DEFAULT_NODES = [
    "b5d5a72e-71d9-46d7-a4d7-a1c89d53224b",  # gamma (machine-id derived)
]
WATCH_NODES = os.environ.get("VM_COPILOT_NODES", ",".join(DEFAULT_NODES)).split(",")
WATCH_NODES = [n.strip() for n in WATCH_NODES if n.strip()]

AUTHOR = "vm-copilot"

# Logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
log = logging.getLogger("vm-copilot")

# -----------------------------------------------------------------------------
# State Tracking
# -----------------------------------------------------------------------------

# Track conditions we've already alerted on: node_id -> set of condition keys
alerted_conditions: dict[str, set[str]] = {}

# Track last event timestamp per node for incremental fetching
last_event_ts: dict[str, str] = {}
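
# Note: both dicts above are process-local. Restarting the agent clears them,
# so still-active conditions will re-alert and incremental event fetching
# starts over. Persisting this state (e.g. to a JSON file) is left as an
# option.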

# -----------------------------------------------------------------------------
# HTTP Helpers
# -----------------------------------------------------------------------------

def get_node(node_id: str) -> Optional[dict]:
    """Fetch a single node's state from CC by querying the nodes list."""
    try:
        # /nodes returns JSON array of all nodes
        resp = requests.get(f"{CC_URL}/nodes", timeout=10)
        if resp.status_code == 200:
            nodes = resp.json()
            # Find the node by ID
            for node in nodes:
                if node.get("node_id") == node_id:
                    return node
            log.warning(f"Node {node_id} not found in nodes list")
            return None
        else:
            log.error(f"Failed to get nodes: {resp.status_code}")
            return None
    except requests.RequestException as e:
        log.error(f"Request error getting nodes: {e}")
        return None
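
# Design note: get_node() downloads the full /nodes list once per watched
# node, so each cycle costs one request per node even though a single request
# would do. A sketch of a once-per-cycle fetch (hypothetical helper, not part
# of the original agent):
#
#   def get_all_nodes() -> dict[str, dict]:
#       resp = requests.get(f"{CC_URL}/nodes", timeout=10)
#       resp.raise_for_status()
#       return {n.get("node_id"): n for n in resp.json()}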


def get_events(
    node_id: str,
    since: Optional[str] = None,
    kinds: Optional[list[str]] = None,
    limit: int = 50,
) -> list[dict]:
    """Query events for a node from CC."""
    params = {"node_id": node_id, "limit": str(limit)}
    if since:
        params["since"] = since
    if kinds:
        params["kind"] = ",".join(kinds)

    try:
        resp = requests.get(f"{CC_URL}/api/events", params=params, timeout=10)
        if resp.status_code == 200:
            return resp.json()
        else:
            log.error(f"Failed to get events: {resp.status_code}")
            return []
    except requests.RequestException as e:
        log.error(f"Request error getting events: {e}")
        return []


def post_event(kind: str, node_id: str, body: dict) -> Optional[dict]:
    """Post an event to CC."""
    payload = {
        "kind": kind,
        "node_id": node_id,
        "author": AUTHOR,
        "payload": body,
    }
    try:
        resp = requests.post(
            f"{CC_URL}/api/events",
            json=payload,
            headers={"Content-Type": "application/json"},
            timeout=10,
        )
        if resp.status_code in (200, 201):
            result = resp.json()
            log.info(f"Posted {kind} event: {result.get('id', 'unknown')}")
            return result
        else:
            log.error(f"Failed to post event: {resp.status_code} - {resp.text}")
            return None
    except requests.RequestException as e:
        log.error(f"Request error posting event: {e}")
        return None
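
# Example: posting an informational note against the first watched node
# (illustrative only; run_cycle() below is the real call site):
#
#   post_event("note", WATCH_NODES[0], {"text": "hello", "severity": "info"})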


# -----------------------------------------------------------------------------
# Rule Engine
# -----------------------------------------------------------------------------

def analyze_node(node: dict, recent_events: list[dict]) -> list[tuple[str, str, dict]]:
    """
    Analyze a node's state and return events to emit.

    recent_events is currently unused by these rules.

    Returns:
        List of (kind, condition_key, body) tuples
    """
    to_emit: list[tuple[str, str, dict]] = []
    attention = node.get("attention", [])
    hostname = node.get("hostname", "unknown")

    # Rule 1: critical_findings - needs attention
    if "critical_findings" in attention:
        scan_summary = node.get("last_scan_summary")
        crit_count = 0
        if scan_summary:
            crit_count = scan_summary.get("critical", 0) + scan_summary.get("high", 0)

        to_emit.append((
            "incident",
            "critical_findings",
            {
                "title": "Critical Findings Detected",
                "description": f"Node {hostname} has {crit_count} critical/high scan findings requiring attention",
                "severity": "high",
            },
        ))
        to_emit.append((
            "note",
            "critical_findings_note",
            {
                "text": f"Auto-detected critical_findings on node {hostname}. Recommend manual review of scan results.",
                "severity": "warn",
            },
        ))

    # Rule 2: heartbeat_stale - node may be offline
    if "heartbeat_stale" in attention:
        last_seen = node.get("last_seen", "unknown")
        to_emit.append((
            "incident",
            "heartbeat_stale",
            {
                "title": "Node Heartbeat Stale",
                "description": f"Node {hostname} has not sent a heartbeat recently. Last seen: {last_seen}",
                "severity": "high",
            },
        ))

    # Rule 3: cloudflare_down - informational note
    if "cloudflare_down" in attention:
        to_emit.append((
            "note",
            "cloudflare_down",
            {
                "text": f"Node {hostname} reports Cloudflare tunnel is down.",
                "severity": "warn",
            },
        ))

    # Rule 4: never_scanned - needs initial scan
    if "never_scanned" in attention:
        to_emit.append((
            "note",
            "never_scanned",
            {
                "text": f"Node {hostname} has never been scanned. Recommend triggering initial sovereign scan.",
                "severity": "info",
            },
        ))

    return to_emit
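
# Adding a rule is mechanical: append another (kind, condition_key, body)
# tuple inside analyze_node(). A hypothetical sketch for a "disk_pressure"
# attention flag (not part of the current rule set):
#
#   if "disk_pressure" in attention:
#       to_emit.append((
#           "note",
#           "disk_pressure",
#           {"text": f"Node {hostname} reports disk pressure.", "severity": "warn"},
#       ))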


def has_recent_operator_ack(events: list[dict]) -> bool:
    """
    Check if there's a recent ack from someone other than vm-copilot.

    run_cycle() uses this to clear a node's dedup state, so conditions that
    are still present re-alert after an operator acknowledges them.
    """
    for ev in events:
        if ev.get("kind") == "ack":
            author = ev.get("author", "")
            if author and author != AUTHOR:
                return True
    return False


# -----------------------------------------------------------------------------
# Main Loop
# -----------------------------------------------------------------------------

def run_cycle():
    """Run a single monitoring cycle."""
    for node_id in WATCH_NODES:
        log.debug(f"Checking node {node_id}")

        # Fetch node state
        node = get_node(node_id)
        if not node:
            continue

        hostname = node.get("hostname", node_id[:8])
        attention = node.get("attention", [])
        log.info(f"Node {hostname}: attention={attention}")

        # Fetch recent events
        since = last_event_ts.get(node_id)
        events = get_events(
            node_id,
            since=since,
            kinds=["note", "incident", "ack", "resolve"],
        )

        # Advance the incremental-fetch cursor to the newest event timestamp
        # (max() makes this independent of the order events are returned in)
        if events:
            latest_ts = max(ev.get("ts", "") for ev in events)
            if latest_ts:
                last_event_ts[node_id] = latest_ts

        # Check for operator acks
        recent_ack = has_recent_operator_ack(events)
        if recent_ack:
            log.info(f"Node {hostname}: operator ack detected, resetting alerts")
            alerted_conditions.pop(node_id, None)

        # Analyze and emit events
        to_emit = analyze_node(node, events)

        for kind, condition_key, body in to_emit:
            # Skip if we've already alerted on this condition
            node_alerts = alerted_conditions.setdefault(node_id, set())
            if condition_key in node_alerts:
                log.debug(f"Already alerted on {condition_key} for {hostname}")
                continue

            # Post the event
            result = post_event(kind, node_id, body)
            if result:
                node_alerts.add(condition_key)


def main():
    """Main entry point."""
    log.info("=" * 60)
    log.info("VaultMesh vm-copilot starting")
    log.info(f"  CC URL: {CC_URL}")
    log.info(f"  Poll interval: {POLL_INTERVAL}s")
    log.info(f"  Watching nodes: {WATCH_NODES}")
    log.info("=" * 60)

    while True:
        try:
            run_cycle()
        except Exception as e:
            log.exception(f"Error in monitoring cycle: {e}")

        log.info(f"Sleeping {POLL_INTERVAL}s...")
        time.sleep(POLL_INTERVAL)


if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        log.info("Shutting down...")
        sys.exit(0)