Initial commit: Cloudflare infrastructure with WAF Intelligence
- Complete Cloudflare Terraform configuration (DNS, WAF, tunnels, access) - WAF Intelligence MCP server with threat analysis and ML classification - GitOps automation with PR workflows and drift detection - Observatory monitoring stack with Prometheus/Grafana - IDE operator rules for governed development - Security playbooks and compliance frameworks - Autonomous remediation and state reconciliation
This commit is contained in:
355
observatory/metrics-exporter.py
Normal file
355
observatory/metrics-exporter.py
Normal file
@@ -0,0 +1,355 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Cloudflare Metrics Exporter for Prometheus
|
||||
Exports Cloudflare state and invariant status as Prometheus metrics.
|
||||
|
||||
Usage:
|
||||
python3 metrics-exporter.py --port 9100
|
||||
|
||||
Environment Variables:
|
||||
CLOUDFLARE_API_TOKEN - API token
|
||||
CLOUDFLARE_ZONE_ID - Zone ID
|
||||
CLOUDFLARE_ACCOUNT_ID - Account ID
|
||||
SNAPSHOT_DIR - Directory containing state snapshots
|
||||
ANOMALY_DIR - Directory containing invariant reports
|
||||
"""
|
||||
|
||||
import argparse
import glob
import json
import os
import sys
import time
from datetime import datetime, timezone
from http.server import BaseHTTPRequestHandler, HTTPServer
from typing import Any, Dict, List, Optional

import requests
|
||||
|
||||
# Configuration
CF_API_BASE = "https://api.cloudflare.com/client/v4"  # Cloudflare REST API v4 root URL
DEFAULT_PORT = 9100  # default exporter listen port (conventional Prometheus exporter range)
SCRAPE_INTERVAL = 60  # seconds; minimum gap between upstream Cloudflare API scrapes
|
||||
|
||||
|
||||
class CloudflareMetricsCollector:
    """Collects Cloudflare state and exposes it as Prometheus gauges.

    Metric sources:
      * the Cloudflare v4 REST API (DNS, DNSSEC, tunnels, Access, zone settings),
      * local state-snapshot files in ``snapshot_dir`` (``cloudflare-*.json``),
      * local invariant/anomaly reports in ``anomaly_dir``.

    ``collect_all()`` refreshes ``self.metrics`` at most once per
    SCRAPE_INTERVAL; ``format_prometheus()`` renders the current values in
    Prometheus text exposition format.
    """

    def __init__(self, api_token: str, zone_id: str, account_id: str,
                 snapshot_dir: str, anomaly_dir: str):
        self.api_token = api_token
        self.zone_id = zone_id
        self.account_id = account_id
        self.snapshot_dir = snapshot_dir
        self.anomaly_dir = anomaly_dir
        self.session = requests.Session()
        self.session.headers.update({
            "Authorization": f"Bearer {api_token}",
            "Content-Type": "application/json"
        })
        self.metrics: Dict[str, Any] = {}
        # Scrape-error counters live here, NOT in self.metrics: collect_all()
        # resets self.metrics every scrape, which previously wiped the
        # "*_scrape_errors_total" counters so they could never exceed 1.
        # Keeping them separate makes them behave like real Prometheus counters.
        self.error_counts: Dict[str, int] = {}
        self.last_scrape: float = 0.0

    def _record_error(self, source: str) -> None:
        """Increment the persistent scrape-error counter for *source*."""
        key = f"{source}_scrape_errors_total"
        self.error_counts[key] = self.error_counts.get(key, 0) + 1

    def _cf_request(self, endpoint: str) -> Dict[str, Any]:
        """GET a Cloudflare v4 API endpoint and return the parsed JSON body.

        Raises requests.HTTPError on non-2xx responses.
        """
        response = self.session.get(f"{CF_API_BASE}{endpoint}")
        response.raise_for_status()
        return response.json()

    def _get_latest_file(self, pattern: str) -> Optional[str]:
        """Return the most recently modified file matching *pattern*, or None."""
        files = glob.glob(pattern)
        if not files:
            return None
        return max(files, key=os.path.getmtime)

    def collect_dns_metrics(self):
        """Collect DNS record counts: total, proxied split, and per-type."""
        try:
            # NOTE(review): only the first page (500 records) is fetched;
            # zones with more records will under-count — confirm zone size.
            data = self._cf_request(f"/zones/{self.zone_id}/dns_records?per_page=500")
            records = data.get("result", [])

            type_counts: Dict[str, int] = {}
            proxied_count = 0
            for record in records:
                rtype = record.get("type", "UNKNOWN")
                type_counts[rtype] = type_counts.get(rtype, 0) + 1
                if record.get("proxied"):
                    proxied_count += 1

            self.metrics["dns_records_total"] = len(records)
            self.metrics["dns_records_proxied"] = proxied_count
            self.metrics["dns_records_unproxied"] = len(records) - proxied_count

            for rtype, count in type_counts.items():
                self.metrics[f"dns_records_by_type{{type=\"{rtype}\"}}"] = count

        except Exception:
            self._record_error("dns")

    def collect_dnssec_metrics(self):
        """Collect DNSSEC status: 1 = active, 0 = not active, -1 = scrape failed."""
        try:
            data = self._cf_request(f"/zones/{self.zone_id}/dnssec")
            status = data.get("result", {}).get("status", "unknown")
            self.metrics["dnssec_enabled"] = 1 if status == "active" else 0
        except Exception:
            self.metrics["dnssec_enabled"] = -1

    def collect_tunnel_metrics(self):
        """Collect tunnel counts and connection totals.

        A tunnel counts as "healthy" when it reports at least one live
        connection; deleted tunnels (deleted_at set) are skipped entirely.
        """
        try:
            data = self._cf_request(f"/accounts/{self.account_id}/cfd_tunnel")
            tunnels = data.get("result", [])

            active = 0
            healthy = 0
            total_connections = 0
            for tunnel in tunnels:
                if tunnel.get("deleted_at"):
                    continue
                active += 1
                # Per-tunnel connection lookup is best-effort: one failing
                # tunnel must not abort the whole scrape.
                try:
                    conn_data = self._cf_request(
                        f"/accounts/{self.account_id}/cfd_tunnel/{tunnel['id']}/connections"
                    )
                    conns = conn_data.get("result", [])
                    if conns:
                        healthy += 1
                        total_connections += len(conns)
                except Exception:
                    pass

            self.metrics["tunnels_total"] = active
            self.metrics["tunnels_healthy"] = healthy
            self.metrics["tunnels_unhealthy"] = active - healthy
            self.metrics["tunnel_connections_total"] = total_connections

        except Exception:
            self._record_error("tunnel")

    def collect_access_metrics(self):
        """Collect Cloudflare Access application counts (total and per-type)."""
        try:
            data = self._cf_request(f"/accounts/{self.account_id}/access/apps")
            apps = data.get("result", [])

            self.metrics["access_apps_total"] = len(apps)

            type_counts: Dict[str, int] = {}
            for app in apps:
                app_type = app.get("type", "unknown")
                type_counts[app_type] = type_counts.get(app_type, 0) + 1

            for app_type, count in type_counts.items():
                self.metrics[f"access_apps_by_type{{type=\"{app_type}\"}}"] = count

        except Exception:
            self._record_error("access")

    def collect_zone_settings_metrics(self):
        """Collect zone security settings as 0/1 posture gauges."""
        try:
            data = self._cf_request(f"/zones/{self.zone_id}/settings")
            settings = {s["id"]: s["value"] for s in data.get("result", [])}

            # TLS posture
            ssl = settings.get("ssl", "unknown")
            self.metrics["zone_ssl_strict"] = 1 if ssl in ("strict", "full_strict") else 0
            min_tls = settings.get("min_tls_version", "unknown")
            self.metrics["zone_tls_version_secure"] = 1 if min_tls in ("1.2", "1.3") else 0

            # Security features
            self.metrics["zone_always_https"] = 1 if settings.get("always_use_https") == "on" else 0
            self.metrics["zone_browser_check"] = 1 if settings.get("browser_check") == "on" else 0

        except Exception:
            # Previously swallowed silently; count failures like the other collectors.
            self._record_error("zone_settings")

    def collect_snapshot_metrics(self):
        """Report the age and integrity flag of the newest state snapshot.

        snapshot_age_seconds is -1 when no snapshot exists or it cannot be read.
        """
        latest = self._get_latest_file(os.path.join(self.snapshot_dir, "cloudflare-*.json"))
        if not latest:
            self.metrics["snapshot_age_seconds"] = -1
            return

        try:
            self.metrics["snapshot_age_seconds"] = int(time.time() - os.path.getmtime(latest))

            with open(latest) as f:
                snapshot = json.load(f)

            integrity = snapshot.get("integrity", {})
            self.metrics["snapshot_merkle_root_set"] = 1 if integrity.get("merkle_root") else 0

        except Exception:
            self.metrics["snapshot_age_seconds"] = -1

    def collect_invariant_metrics(self):
        """Collect pass/fail counts and report age from the newest invariant report."""
        latest = self._get_latest_file(os.path.join(self.anomaly_dir, "invariant-report-*.json"))
        if not latest:
            # No report yet: export explicit zeros rather than missing series.
            self.metrics["invariants_total"] = 0
            self.metrics["invariants_passed"] = 0
            self.metrics["invariants_failed"] = 0
            return

        try:
            with open(latest) as f:
                report = json.load(f)

            summary = report.get("summary", {})
            self.metrics["invariants_total"] = summary.get("total", 0)
            self.metrics["invariants_passed"] = summary.get("passed", 0)
            self.metrics["invariants_failed"] = summary.get("failed", 0)
            self.metrics["invariants_pass_rate"] = summary.get("pass_rate", 0)

            # Report age lets alerts fire on a stale invariant pipeline.
            self.metrics["invariant_report_age_seconds"] = int(time.time() - os.path.getmtime(latest))

        except Exception:
            # Previously swallowed silently; count failures like the other collectors.
            self._record_error("invariant")

    def collect_anomaly_metrics(self):
        """Count anomaly receipt files: total, and those modified in the last 24h."""
        anomaly_files = glob.glob(os.path.join(self.anomaly_dir, "anomaly-*.json"))
        self.metrics["anomalies_total"] = len(anomaly_files)

        day_ago = time.time() - 86400
        self.metrics["anomalies_last_24h"] = sum(
            1 for path in anomaly_files if os.path.getmtime(path) > day_ago
        )

    def collect_all(self):
        """Refresh all metrics, at most once per SCRAPE_INTERVAL seconds."""
        now = time.time()
        if now - self.last_scrape < SCRAPE_INTERVAL:
            return  # Rate limit: serve cached metrics between intervals.

        self.last_scrape = now
        self.metrics = {"scrape_timestamp": int(now)}

        self.collect_dns_metrics()
        self.collect_dnssec_metrics()
        self.collect_tunnel_metrics()
        self.collect_access_metrics()
        self.collect_zone_settings_metrics()
        self.collect_snapshot_metrics()
        self.collect_invariant_metrics()
        self.collect_anomaly_metrics()

        # Fold in the cumulative error counters (they persist across scrapes).
        self.metrics.update(self.error_counts)

    def format_prometheus(self) -> str:
        """Render self.metrics in Prometheus text exposition format.

        Non-numeric values are skipped. Keys may already carry a
        {label="..."} suffix; either way the exported name is simply
        prefixed with "cloudflare_".
        """
        lines = [
            "# HELP cloudflare_dns_records_total Total DNS records",
            "# TYPE cloudflare_dns_records_total gauge",
            "# HELP cloudflare_tunnels_total Total active tunnels",
            "# TYPE cloudflare_tunnels_total gauge",
            "# HELP cloudflare_tunnels_healthy Healthy tunnels with connections",
            "# TYPE cloudflare_tunnels_healthy gauge",
            "# HELP cloudflare_invariants_passed Invariants passing",
            "# TYPE cloudflare_invariants_passed gauge",
            "# HELP cloudflare_invariants_failed Invariants failing",
            "# TYPE cloudflare_invariants_failed gauge",
            "",
        ]

        for key, value in self.metrics.items():
            if isinstance(value, (int, float)):
                lines.append(f"cloudflare_{key} {value}")

        return "\n".join(lines)
|
||||
|
||||
|
||||
class MetricsHandler(BaseHTTPRequestHandler):
    """HTTP handler serving /metrics (Prometheus scrape) and /health endpoints."""

    # Shared collector, injected by main() before the server starts.
    # Annotated Optional (and quoted, so the name is not evaluated at
    # class-creation time): it is None until main() assigns it.
    collector: Optional["CloudflareMetricsCollector"] = None

    def do_GET(self):
        if self.path == "/metrics":
            # collect_all() rate-limits itself, so frequent scrapes are cheap.
            self.collector.collect_all()
            output = self.collector.format_prometheus()

            self.send_response(200)
            self.send_header("Content-Type", "text/plain; charset=utf-8")
            self.end_headers()
            self.wfile.write(output.encode())
        elif self.path == "/health":
            self.send_response(200)
            self.send_header("Content-Type", "text/plain")
            self.end_headers()
            self.wfile.write(b"OK")
        else:
            self.send_response(404)
            self.end_headers()

    def log_message(self, format, *args):
        pass  # Suppress BaseHTTPRequestHandler's per-request stderr logging.
|
||||
|
||||
|
||||
def main():
    """Parse configuration, validate credentials, and run the HTTP exporter.

    Configuration comes from argv with environment-variable fallbacks
    (CLOUDFLARE_ZONE_ID, CLOUDFLARE_ACCOUNT_ID, SNAPSHOT_DIR, ANOMALY_DIR).
    The API token is read from CLOUDFLARE_API_TOKEN only — never argv,
    where it would leak into process listings.

    Returns:
        int: process exit code (0 on clean shutdown, 1 on configuration error).
    """
    parser = argparse.ArgumentParser(description="Cloudflare Metrics Exporter")
    parser.add_argument("--port", type=int, default=DEFAULT_PORT,
                        help=f"Port to listen on (default: {DEFAULT_PORT})")
    parser.add_argument("--zone-id", default=os.environ.get("CLOUDFLARE_ZONE_ID"))
    parser.add_argument("--account-id", default=os.environ.get("CLOUDFLARE_ACCOUNT_ID"))
    parser.add_argument("--snapshot-dir",
                        default=os.environ.get("SNAPSHOT_DIR", "../snapshots"))
    parser.add_argument("--anomaly-dir",
                        default=os.environ.get("ANOMALY_DIR", "../anomalies"))
    args = parser.parse_args()

    api_token = os.environ.get("CLOUDFLARE_API_TOKEN")
    if not api_token:
        # Errors go to stderr so stdout stays clean for automation.
        print("Error: CLOUDFLARE_API_TOKEN required", file=sys.stderr)
        return 1

    if not args.zone_id or not args.account_id:
        print("Error: Zone ID and Account ID required", file=sys.stderr)
        return 1

    # Initialize collector and hand it to the request handler class.
    collector = CloudflareMetricsCollector(
        api_token, args.zone_id, args.account_id,
        args.snapshot_dir, args.anomaly_dir
    )
    MetricsHandler.collector = collector

    # Start server (binds all interfaces; front with a firewall/proxy as needed).
    server = HTTPServer(("0.0.0.0", args.port), MetricsHandler)
    print(f"Cloudflare Metrics Exporter listening on :{args.port}")
    print("  /metrics - Prometheus metrics")
    print("  /health - Health check")

    try:
        server.serve_forever()
    except KeyboardInterrupt:
        print("\nShutting down...")
        server.shutdown()

    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # sys.exit() is the supported way to set an exit status in scripts;
    # the exit() builtin is a site.py convenience intended for the REPL.
    sys.exit(main())
|
||||
Reference in New Issue
Block a user