chore: initial import

vm-copilot/vm_copilot_agent.py (normal file, 305 lines added)
@@ -0,0 +1,305 @@
#!/usr/bin/env python3
"""
VaultMesh vm-copilot Agent

A bot that monitors Command Center nodes and posts incidents/notes
when attention conditions are detected.

Usage:
    python vm_copilot_agent.py

Environment variables:
    VAULTMESH_CC_URL     - Command Center URL (default: http://127.0.0.1:8088)
    VM_COPILOT_INTERVAL  - Poll interval in seconds (default: 60)
    VM_COPILOT_NODES     - Comma-separated list of node IDs to watch
"""

import os
import sys
import time
import logging
from datetime import datetime, timezone
from typing import Optional

import requests

# -----------------------------------------------------------------------------
# Configuration
# -----------------------------------------------------------------------------

CC_URL = os.environ.get("VAULTMESH_CC_URL", "http://127.0.0.1:8088")
POLL_INTERVAL = int(os.environ.get("VM_COPILOT_INTERVAL", "60"))

# Node IDs to watch (can be overridden via env var)
DEFAULT_NODES = [
    "b5d5a72e-71d9-46d7-a4d7-a1c89d53224b",  # gamma (machine-id derived)
]
WATCH_NODES = os.environ.get("VM_COPILOT_NODES", ",".join(DEFAULT_NODES)).split(",")
WATCH_NODES = [n.strip() for n in WATCH_NODES if n.strip()]

AUTHOR = "vm-copilot"

# Logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
log = logging.getLogger("vm-copilot")

# -----------------------------------------------------------------------------
# State Tracking
# -----------------------------------------------------------------------------

# Track conditions we've already alerted on: node_id -> set of condition keys
alerted_conditions: dict[str, set[str]] = {}

# Track last event timestamp per node for incremental fetching
last_event_ts: dict[str, str] = {}
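
# Note: both dictionaries are held only in this process's memory, so alert
# de-duplication and incremental-fetch cursors reset whenever the agent restarts.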

# -----------------------------------------------------------------------------
# HTTP Helpers
# -----------------------------------------------------------------------------

def get_node(node_id: str) -> Optional[dict]:
    """Fetch a single node's state from CC by querying the nodes list."""
    try:
        # /nodes returns JSON array of all nodes
        resp = requests.get(f"{CC_URL}/nodes", timeout=10)
        if resp.status_code == 200:
            nodes = resp.json()
            # Find the node by ID
            for node in nodes:
                if node.get("node_id") == node_id:
                    return node
            log.warning(f"Node {node_id} not found in nodes list")
            return None
        else:
            log.error(f"Failed to get nodes: {resp.status_code}")
            return None
    except requests.RequestException as e:
        log.error(f"Request error getting nodes: {e}")
        return None
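
# Note: get_node() fetches the full /nodes list on each call, so every watched
# node costs one list request per cycle.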


def get_events(
    node_id: str,
    since: Optional[str] = None,
    kinds: Optional[list[str]] = None,
    limit: int = 50,
) -> list[dict]:
    """Query events for a node from CC."""
    params = {"node_id": node_id, "limit": str(limit)}
    if since:
        params["since"] = since
    if kinds:
        params["kind"] = ",".join(kinds)

    try:
        resp = requests.get(f"{CC_URL}/api/events", params=params, timeout=10)
        if resp.status_code == 200:
            return resp.json()
        else:
            log.error(f"Failed to get events: {resp.status_code}")
            return []
    except requests.RequestException as e:
        log.error(f"Request error getting events: {e}")
        return []


def post_event(kind: str, node_id: str, body: dict) -> Optional[dict]:
    """Post an event to CC."""
    payload = {
        "kind": kind,
        "node_id": node_id,
        "author": AUTHOR,
        "body": body,
    }
    try:
        resp = requests.post(
            f"{CC_URL}/api/events",
            json=payload,
            headers={"Content-Type": "application/json"},
            timeout=10,
        )
        if resp.status_code == 200:
            result = resp.json()
            log.info(f"Posted {kind} event: {result.get('id', 'unknown')}")
            return result
        else:
            log.error(f"Failed to post event: {resp.status_code} - {resp.text}")
            return None
    except requests.RequestException as e:
        log.error(f"Request error posting event: {e}")
        return None


# -----------------------------------------------------------------------------
# Rule Engine
# -----------------------------------------------------------------------------

def analyze_node(node: dict, recent_events: list[dict]) -> list[tuple[str, str, dict]]:
    """
    Analyze a node's state and return events to emit.

    Returns:
        List of (kind, condition_key, body) tuples
    """
    to_emit = []
    attention = node.get("attention", [])
    hostname = node.get("hostname", "unknown")
    node_id = node.get("node_id", "unknown")

    # Rule 1: critical_findings - needs attention
    if "critical_findings" in attention:
        scan_summary = node.get("last_scan_summary")
        crit_count = 0
        if scan_summary:
            crit_count = scan_summary.get("critical", 0) + scan_summary.get("high", 0)

        to_emit.append((
            "incident",
            "critical_findings",
            {
                "title": "Critical Findings Detected",
                "description": f"Node {hostname} has {crit_count} critical/high scan findings requiring attention",
                "severity": "high",
            },
        ))
        to_emit.append((
            "note",
            "critical_findings_note",
            {
                "text": f"Auto-detected critical_findings on node {hostname}. Recommend manual review of scan results.",
                "severity": "warn",
            },
        ))

    # Rule 2: heartbeat_stale - node may be offline
    if "heartbeat_stale" in attention:
        last_seen = node.get("last_seen", "unknown")
        to_emit.append((
            "incident",
            "heartbeat_stale",
            {
                "title": "Node Heartbeat Stale",
                "description": f"Node {hostname} has not sent a heartbeat recently. Last seen: {last_seen}",
                "severity": "high",
            },
        ))

    # Rule 3: cloudflare_down - informational note
    if "cloudflare_down" in attention:
        to_emit.append((
            "note",
            "cloudflare_down",
            {
                "text": f"Node {hostname} reports Cloudflare tunnel is down.",
                "severity": "warn",
            },
        ))

    # Rule 4: never_scanned - needs initial scan
    if "never_scanned" in attention:
        to_emit.append((
            "note",
            "never_scanned",
            {
                "text": f"Node {hostname} has never been scanned. Recommend triggering initial sovereign scan.",
                "severity": "info",
            },
        ))

    return to_emit
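
# Illustrative example: given a node dict such as
#   {"hostname": "gamma", "attention": ["heartbeat_stale"],
#    "last_seen": "2025-01-01T00:00:00Z"}
# analyze_node() returns a single
#   ("incident", "heartbeat_stale", {"title": "Node Heartbeat Stale",
#    "description": "Node gamma has not sent a heartbeat recently. "
#                   "Last seen: 2025-01-01T00:00:00Z", "severity": "high"})
# tuple; the hostname and timestamp above are made-up sample values.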


def has_recent_operator_ack(events: list[dict]) -> bool:
    """Check if there's a recent ack from someone other than vm-copilot."""
    for ev in events:
        if ev.get("kind") == "ack":
            author = ev.get("author", "")
            if author and author != AUTHOR:
                return True
    return False


# -----------------------------------------------------------------------------
# Main Loop
# -----------------------------------------------------------------------------

def run_cycle():
    """Run a single monitoring cycle."""
    for node_id in WATCH_NODES:
        log.debug(f"Checking node {node_id}")

        # Fetch node state
        node = get_node(node_id)
        if not node:
            continue

        hostname = node.get("hostname", node_id[:8])
        attention = node.get("attention", [])
        log.info(f"Node {hostname}: attention={attention}")

        # Fetch recent events
        since = last_event_ts.get(node_id)
        events = get_events(
            node_id,
            since=since,
            kinds=["note", "incident", "ack", "resolve"],
        )

        # Update last seen timestamp
        if events:
            # Events are returned oldest-first by default, get the latest
            latest_ts = max(ev.get("ts", "") for ev in events)
            if latest_ts:
                last_event_ts[node_id] = latest_ts

        # Check for operator acks
        recent_ack = has_recent_operator_ack(events)
        if recent_ack:
            log.info(f"Node {hostname}: operator ack detected, resetting alerts")
            alerted_conditions.pop(node_id, None)

        # Analyze and emit events
        to_emit = analyze_node(node, events)

        for kind, condition_key, body in to_emit:
            # Skip if we've already alerted on this condition
            node_alerts = alerted_conditions.setdefault(node_id, set())
            if condition_key in node_alerts:
                log.debug(f"Already alerted on {condition_key} for {hostname}")
                continue

            # Post the event
            result = post_event(kind, node_id, body)
            if result:
                node_alerts.add(condition_key)


def main():
    """Main entry point."""
    log.info("=" * 60)
    log.info("VaultMesh vm-copilot starting")
    log.info(f"  CC URL: {CC_URL}")
    log.info(f"  Poll interval: {POLL_INTERVAL}s")
    log.info(f"  Watching nodes: {WATCH_NODES}")
    log.info("=" * 60)

    while True:
        try:
            run_cycle()
        except Exception as e:
            log.exception(f"Error in monitoring cycle: {e}")

        log.info(f"Sleeping {POLL_INTERVAL}s...")
        time.sleep(POLL_INTERVAL)


if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        log.info("Shutting down...")
        sys.exit(0)