#!/usr/bin/env python3
"""
Cloudflare Metrics Exporter for Prometheus

Exports Cloudflare state and invariant status as Prometheus metrics.

Usage:
    python3 metrics-exporter.py --port 9100

Environment Variables:
    CLOUDFLARE_API_TOKEN  - API token
    CLOUDFLARE_ZONE_ID    - Zone ID
    CLOUDFLARE_ACCOUNT_ID - Account ID
    SNAPSHOT_DIR          - Directory containing state snapshots
    ANOMALY_DIR           - Directory containing invariant reports
"""
import argparse
import glob
import json
import os
import sys
import time
from datetime import datetime, timezone
from http.server import HTTPServer, BaseHTTPRequestHandler
from typing import Any, Dict, List, Optional

import requests

# Configuration
CF_API_BASE = "https://api.cloudflare.com/client/v4"
DEFAULT_PORT = 9100
SCRAPE_INTERVAL = 60  # seconds; minimum time between upstream API sweeps


class CloudflareMetricsCollector:
    """Collects Cloudflare metrics for Prometheus export.

    Each ``collect_*`` method populates ``self.metrics`` (a flat dict of
    metric-name -> numeric value; names may embed Prometheus labels, e.g.
    ``dns_records_by_type{type="A"}``).  ``collect_all`` rebuilds the dict
    on every sweep, so gauges reflect current state; scrape-error counters
    are kept in a separate persistent dict so they stay monotonic across
    sweeps, as Prometheus counters must be.
    """

    def __init__(self, api_token: str, zone_id: str, account_id: str,
                 snapshot_dir: str, anomaly_dir: str):
        self.api_token = api_token
        self.zone_id = zone_id
        self.account_id = account_id
        self.snapshot_dir = snapshot_dir
        self.anomaly_dir = anomaly_dir
        self.session = requests.Session()
        self.session.headers.update({
            "Authorization": f"Bearer {api_token}",
            "Content-Type": "application/json"
        })
        self.metrics: Dict[str, Any] = {}
        self.last_scrape: float = 0
        # Monotonic error counters; survive the per-sweep metrics reset.
        self._error_counts: Dict[str, int] = {}

    def _count_error(self, key: str) -> None:
        """Increment a persistent scrape-error counter."""
        self._error_counts[key] = self._error_counts.get(key, 0) + 1

    def _cf_request(self, endpoint: str) -> Dict[str, Any]:
        """Make a Cloudflare API GET request; raises on HTTP errors."""
        url = f"{CF_API_BASE}{endpoint}"
        response = self.session.get(url)
        response.raise_for_status()
        return response.json()

    def _cf_paginated(self, endpoint: str, per_page: int = 500) -> List[Dict[str, Any]]:
        """Fetch every page of a list endpoint.

        Follows ``result_info.total_pages`` so zones with more records than
        one page still report accurate totals.
        """
        results: List[Dict[str, Any]] = []
        sep = "&" if "?" in endpoint else "?"
        page = 1
        while True:
            data = self._cf_request(f"{endpoint}{sep}page={page}&per_page={per_page}")
            results.extend(data.get("result") or [])
            info = data.get("result_info") or {}
            if page >= info.get("total_pages", 1):
                return results
            page += 1

    def _get_latest_file(self, pattern: str) -> Optional[str]:
        """Get most recent file matching pattern, or None if none exist."""
        files = glob.glob(pattern)
        if not files:
            return None
        return max(files, key=os.path.getmtime)

    def collect_dns_metrics(self):
        """Collect DNS record metrics (counts by type and proxy status)."""
        try:
            records = self._cf_paginated(f"/zones/{self.zone_id}/dns_records")

            type_counts: Dict[str, int] = {}
            proxied_count = 0
            unproxied_count = 0
            for r in records:
                rtype = r.get("type", "UNKNOWN")
                type_counts[rtype] = type_counts.get(rtype, 0) + 1
                if r.get("proxied"):
                    proxied_count += 1
                else:
                    unproxied_count += 1

            self.metrics["dns_records_total"] = len(records)
            self.metrics["dns_records_proxied"] = proxied_count
            self.metrics["dns_records_unproxied"] = unproxied_count
            for rtype, count in type_counts.items():
                self.metrics[f"dns_records_by_type{{type=\"{rtype}\"}}"] = count
        except Exception:
            self._count_error("dns_scrape_errors_total")

    def collect_dnssec_metrics(self):
        """Collect DNSSEC status (1 active, 0 inactive, -1 scrape failure)."""
        try:
            data = self._cf_request(f"/zones/{self.zone_id}/dnssec")
            result = data.get("result", {})
            status = result.get("status", "unknown")
            self.metrics["dnssec_enabled"] = 1 if status == "active" else 0
        except Exception:
            self.metrics["dnssec_enabled"] = -1

    def collect_tunnel_metrics(self):
        """Collect tunnel counts and per-tunnel connection health."""
        try:
            data = self._cf_request(f"/accounts/{self.account_id}/cfd_tunnel")
            tunnels = data.get("result", [])

            active = 0
            healthy = 0
            total_connections = 0
            for t in tunnels:
                if t.get("deleted_at"):
                    continue
                active += 1
                # A tunnel is "healthy" if it has at least one live connection.
                try:
                    conn_data = self._cf_request(
                        f"/accounts/{self.account_id}/cfd_tunnel/{t['id']}/connections"
                    )
                    conns = conn_data.get("result", [])
                    if conns:
                        healthy += 1
                        total_connections += len(conns)
                except Exception:
                    # Best-effort per tunnel: a failed connection lookup just
                    # leaves the tunnel counted as unhealthy.
                    pass

            self.metrics["tunnels_total"] = active
            self.metrics["tunnels_healthy"] = healthy
            self.metrics["tunnels_unhealthy"] = active - healthy
            self.metrics["tunnel_connections_total"] = total_connections
        except Exception:
            self._count_error("tunnel_scrape_errors_total")

    def collect_access_metrics(self):
        """Collect Access application counts, split by app type."""
        try:
            data = self._cf_request(f"/accounts/{self.account_id}/access/apps")
            apps = data.get("result", [])
            self.metrics["access_apps_total"] = len(apps)

            type_counts: Dict[str, int] = {}
            for app in apps:
                app_type = app.get("type", "unknown")
                type_counts[app_type] = type_counts.get(app_type, 0) + 1
            for app_type, count in type_counts.items():
                self.metrics[f"access_apps_by_type{{type=\"{app_type}\"}}"] = count
        except Exception:
            self._count_error("access_scrape_errors_total")

    def collect_zone_settings_metrics(self):
        """Collect zone security settings (TLS posture, HTTPS enforcement)."""
        try:
            data = self._cf_request(f"/zones/{self.zone_id}/settings")
            settings = {s["id"]: s["value"] for s in data.get("result", [])}

            # TLS settings
            ssl = settings.get("ssl", "unknown")
            self.metrics["zone_ssl_strict"] = 1 if ssl in ("strict", "full_strict") else 0
            min_tls = settings.get("min_tls_version", "unknown")
            self.metrics["zone_tls_version_secure"] = 1 if min_tls in ("1.2", "1.3") else 0

            # Security features
            self.metrics["zone_always_https"] = 1 if settings.get("always_use_https") == "on" else 0
            self.metrics["zone_browser_check"] = 1 if settings.get("browser_check") == "on" else 0
        except Exception:
            # Count instead of silently dropping, consistent with other collectors.
            self._count_error("zone_settings_scrape_errors_total")

    def collect_snapshot_metrics(self):
        """Collect metrics from state snapshots (-1 age = missing/unreadable)."""
        latest = self._get_latest_file(os.path.join(self.snapshot_dir, "cloudflare-*.json"))
        if not latest:
            self.metrics["snapshot_age_seconds"] = -1
            return

        try:
            mtime = os.path.getmtime(latest)
            age = time.time() - mtime
            self.metrics["snapshot_age_seconds"] = int(age)

            with open(latest) as f:
                snapshot = json.load(f)

            integrity = snapshot.get("integrity", {})
            self.metrics["snapshot_merkle_root_set"] = 1 if integrity.get("merkle_root") else 0
        except Exception:
            self.metrics["snapshot_age_seconds"] = -1

    def collect_invariant_metrics(self):
        """Collect metrics from the latest invariant report."""
        latest = self._get_latest_file(os.path.join(self.anomaly_dir, "invariant-report-*.json"))
        if not latest:
            self.metrics["invariants_total"] = 0
            self.metrics["invariants_passed"] = 0
            self.metrics["invariants_failed"] = 0
            return

        try:
            with open(latest) as f:
                report = json.load(f)

            summary = report.get("summary", {})
            self.metrics["invariants_total"] = summary.get("total", 0)
            self.metrics["invariants_passed"] = summary.get("passed", 0)
            self.metrics["invariants_failed"] = summary.get("failed", 0)
            self.metrics["invariants_pass_rate"] = summary.get("pass_rate", 0)

            # Report age
            mtime = os.path.getmtime(latest)
            self.metrics["invariant_report_age_seconds"] = int(time.time() - mtime)
        except Exception:
            self._count_error("invariant_scrape_errors_total")

    def collect_anomaly_metrics(self):
        """Count anomaly receipts, total and within the last 24 hours."""
        anomaly_files = glob.glob(os.path.join(self.anomaly_dir, "anomaly-*.json"))
        self.metrics["anomalies_total"] = len(anomaly_files)

        # Recent anomalies (last 24h); tolerate files deleted mid-scan.
        recent = 0
        day_ago = time.time() - 86400
        for f in anomaly_files:
            try:
                if os.path.getmtime(f) > day_ago:
                    recent += 1
            except OSError:
                pass
        self.metrics["anomalies_last_24h"] = recent

    def collect_all(self):
        """Run every collector, rate-limited to one sweep per SCRAPE_INTERVAL.

        Scrapes inside the interval serve the previous sweep's metrics.
        """
        now = time.time()
        if now - self.last_scrape < SCRAPE_INTERVAL:
            return  # Rate limit
        self.last_scrape = now

        self.metrics = {"scrape_timestamp": int(now)}
        self.collect_dns_metrics()
        self.collect_dnssec_metrics()
        self.collect_tunnel_metrics()
        self.collect_access_metrics()
        self.collect_zone_settings_metrics()
        self.collect_snapshot_metrics()
        self.collect_invariant_metrics()
        self.collect_anomaly_metrics()
        # Merge persistent error counters so they are exported monotonically.
        self.metrics.update(self._error_counts)

    def format_prometheus(self) -> str:
        """Format metrics as Prometheus exposition format text."""
        lines = [
            "# HELP cloudflare_dns_records_total Total DNS records",
            "# TYPE cloudflare_dns_records_total gauge",
            "# HELP cloudflare_tunnels_total Total active tunnels",
            "# TYPE cloudflare_tunnels_total gauge",
            "# HELP cloudflare_tunnels_healthy Healthy tunnels with connections",
            "# TYPE cloudflare_tunnels_healthy gauge",
            "# HELP cloudflare_invariants_passed Invariants passing",
            "# TYPE cloudflare_invariants_passed gauge",
            "# HELP cloudflare_invariants_failed Invariants failing",
            "# TYPE cloudflare_invariants_failed gauge",
            "",
        ]

        # Keys may already embed labels (e.g. name{type="A"}); either way the
        # exposition line is simply "cloudflare_<key> <value>".
        for key, value in self.metrics.items():
            if isinstance(value, (int, float)):
                lines.append(f"cloudflare_{key} {value}")

        return "\n".join(lines)


class MetricsHandler(BaseHTTPRequestHandler):
    """HTTP handler for Prometheus scrapes.

    Endpoints: /metrics (exposition text), /health (liveness).
    """

    # Injected by main() before the server starts.
    collector: Optional[CloudflareMetricsCollector] = None

    def do_GET(self):
        if self.path == "/metrics":
            if self.collector is None:
                # Misconfiguration guard: server started without a collector.
                self.send_response(503)
                self.end_headers()
                return
            self.collector.collect_all()
            output = self.collector.format_prometheus()
            self.send_response(200)
            self.send_header("Content-Type", "text/plain; charset=utf-8")
            self.end_headers()
            self.wfile.write(output.encode())
        elif self.path == "/health":
            self.send_response(200)
            self.send_header("Content-Type", "text/plain")
            self.end_headers()
            self.wfile.write(b"OK")
        else:
            self.send_response(404)
            self.end_headers()

    def log_message(self, format, *args):
        pass  # Suppress default per-request logging


def main():
    """Parse configuration, then serve /metrics until interrupted.

    Returns a process exit code (0 on clean shutdown, 1 on bad config).
    """
    parser = argparse.ArgumentParser(description="Cloudflare Metrics Exporter")
    parser.add_argument("--port", type=int, default=DEFAULT_PORT,
                        help=f"Port to listen on (default: {DEFAULT_PORT})")
    parser.add_argument("--zone-id", default=os.environ.get("CLOUDFLARE_ZONE_ID"))
    parser.add_argument("--account-id", default=os.environ.get("CLOUDFLARE_ACCOUNT_ID"))
    parser.add_argument("--snapshot-dir", default=os.environ.get("SNAPSHOT_DIR", "../snapshots"))
    parser.add_argument("--anomaly-dir", default=os.environ.get("ANOMALY_DIR", "../anomalies"))
    args = parser.parse_args()

    api_token = os.environ.get("CLOUDFLARE_API_TOKEN")
    if not api_token:
        print("Error: CLOUDFLARE_API_TOKEN required")
        return 1
    if not args.zone_id or not args.account_id:
        print("Error: Zone ID and Account ID required")
        return 1

    # Initialize collector
    collector = CloudflareMetricsCollector(
        api_token, args.zone_id, args.account_id,
        args.snapshot_dir, args.anomaly_dir
    )
    MetricsHandler.collector = collector

    # Start server
    server = HTTPServer(("0.0.0.0", args.port), MetricsHandler)
    print(f"Cloudflare Metrics Exporter listening on :{args.port}")
    print(f"  /metrics - Prometheus metrics")
    print(f"  /health  - Health check")

    try:
        server.serve_forever()
    except KeyboardInterrupt:
        print("\nShutting down...")
        server.shutdown()

    return 0


if __name__ == "__main__":
    sys.exit(main())