Files
vm-cloudflare/observatory/prometheus/alerts/tunnel-alerts.yml
Vault Sovereign 37a867c485 Initial commit: Cloudflare infrastructure with WAF Intelligence
- Complete Cloudflare Terraform configuration (DNS, WAF, tunnels, access)
- WAF Intelligence MCP server with threat analysis and ML classification
- GitOps automation with PR workflows and drift detection
- Observatory monitoring stack with Prometheus/Grafana
- IDE operator rules for governed development
- Security playbooks and compliance frameworks
- Autonomous remediation and state reconciliation
2025-12-16 18:31:53 +00:00

211 lines
8.2 KiB
YAML

# Tunnel Alert Rules for Cloudflare Mesh Observatory
# Phase 5B - Alerts & Escalation
groups:
  # Rule group covering Cloudflare Tunnel availability, rotation policy,
  # latency, error rate, configuration drift and cloudflared version age.
  - name: tunnel_alerts
    # Every rule in this group is evaluated every 30 seconds.
    interval: 30s
    rules:
      # ============================================
      # CRITICAL - Tunnel Down
      # ============================================
      # Fires for each tunnel series whose status value has been 0 for
      # 2 minutes.
      # NOTE(review): the rules in this group treat status 0 as "down" and
      # 1 as "healthy" - confirm against the exporter.
      - alert: TunnelDown
        expr: cloudflare_tunnel_status == 0
        for: 2m
        labels:
          severity: critical
          component: tunnel
          playbook: tunnel-rotation
        annotations:
          summary: "Cloudflare Tunnel {{ $labels.tunnel_name }} is DOWN"
          description: |
            Tunnel {{ $labels.tunnel_name }} (ID: {{ $labels.tunnel_id }}) has been
            unreachable for more than 2 minutes. Services behind this tunnel are
            likely unreachable.
          impact: "Services behind tunnel are unreachable from the internet"
          runbook_url: "https://wiki.internal/playbooks/tunnel-rotation"
      # ============================================
      # CRITICAL - All Tunnels Down
      # ============================================
      # Fires when at least one tunnel series is reported and none of them
      # has status 1, sustained for 1 minute.
      #
      # Why not `count(cloudflare_tunnel_status == 1) == 0`: count() over an
      # empty vector returns no sample at all (not 0), so that comparison can
      # never be true and the alert would never fire. `unless` drops the
      # left-hand sample only when a healthy-tunnel count exists.
      #
      # The alert value is the number of tunnel series reported. If the
      # metric is absent entirely this rule stays silent; cover that case
      # with a separate absent()-style alert if exporter outages must page.
      - alert: AllTunnelsDown
        expr: |
          count(cloudflare_tunnel_status)
          unless
          count(cloudflare_tunnel_status == 1)
        for: 1m
        labels:
          severity: critical
          component: tunnel
          playbook: tunnel-rotation
        annotations:
          summary: "ALL Cloudflare Tunnels are DOWN"
          description: |
            No healthy tunnels detected. Complete loss of tunnel connectivity.
            This is a P0 incident requiring immediate attention.
          impact: "Complete loss of external connectivity via tunnels"
          runbook_url: "https://wiki.internal/playbooks/tunnel-rotation"
      # ============================================
      # WARNING - Tunnel Degraded
      # ============================================
      # Fires when a tunnel reports fewer than 2 connections for 5 minutes.
      # The threshold also matches 0 connections, so this may fire alongside
      # TunnelDown for the same tunnel.
      - alert: TunnelDegraded
        expr: cloudflare_tunnel_connections < 2
        for: 5m
        labels:
          severity: warning
          component: tunnel
        annotations:
          summary: "Tunnel {{ $labels.tunnel_name }} has reduced connections"
          description: |
            Tunnel {{ $labels.tunnel_name }} has fewer than 2 active connections.
            This may indicate network issues or cloudflared problems.
          runbook_url: "https://wiki.internal/playbooks/tunnel-rotation"
      # ============================================
      # WARNING - Tunnel Rotation Due
      # ============================================
      # Tunnel age is time() minus the creation timestamp; the threshold
      # 86400 * 30 is 30 days expressed in seconds. The condition stays true
      # beyond 45 days, so it fires together with TunnelRotationOverdue
      # unless that is inhibited in Alertmanager.
      # NOTE(review): assumes the timestamp metric is Unix time in seconds
      # (humanizeDuration also expects seconds) - confirm against the exporter.
      - alert: TunnelRotationDue
        expr: (time() - cloudflare_tunnel_created_timestamp) > (86400 * 30)
        for: 1h
        labels:
          severity: warning
          component: tunnel
          playbook: tunnel-rotation
        annotations:
          summary: "Tunnel {{ $labels.tunnel_name }} rotation is due"
          description: |
            Tunnel {{ $labels.tunnel_name }} was created more than 30 days ago.
            Per security policy, tunnels should be rotated monthly.
            Age: {{ $value | humanizeDuration }}
          runbook_url: "https://wiki.internal/playbooks/tunnel-rotation"
      # ============================================
      # CRITICAL - Tunnel Rotation Overdue
      # ============================================
      # Same age calculation as TunnelRotationDue with a 45-day threshold
      # (86400 * 45 seconds), held for 1 hour.
      - alert: TunnelRotationOverdue
        expr: (time() - cloudflare_tunnel_created_timestamp) > (86400 * 45)
        for: 1h
        labels:
          severity: critical
          component: tunnel
          playbook: tunnel-rotation
        annotations:
          summary: "Tunnel {{ $labels.tunnel_name }} rotation is OVERDUE"
          description: |
            Tunnel {{ $labels.tunnel_name }} is more than 45 days old.
            This exceeds the maximum rotation interval and represents a
            security policy violation.
            Age: {{ $value | humanizeDuration }}
          runbook_url: "https://wiki.internal/playbooks/tunnel-rotation"
      # ============================================
      # WARNING - Tunnel High Latency
      # ============================================
      # Fires when the latency gauge stays above 500 for 5 minutes. The
      # metric name and annotation text indicate the unit is milliseconds.
      - alert: TunnelHighLatency
        expr: cloudflare_tunnel_latency_ms > 500
        for: 5m
        labels:
          severity: warning
          component: tunnel
        annotations:
          summary: "High latency on tunnel {{ $labels.tunnel_name }}"
          description: |
            Tunnel {{ $labels.tunnel_name }} is experiencing latency above 500ms.
            Current latency: {{ $value }}ms
            This may impact user experience.
      # ============================================
      # CRITICAL - Tunnel Very High Latency
      # ============================================
      # Fires when the latency gauge stays above 2000 for 2 minutes. Any
      # tunnel matching this rule also matches TunnelHighLatency.
      - alert: TunnelVeryHighLatency
        expr: cloudflare_tunnel_latency_ms > 2000
        for: 2m
        labels:
          severity: critical
          component: tunnel
        annotations:
          summary: "Critical latency on tunnel {{ $labels.tunnel_name }}"
          description: |
            Tunnel {{ $labels.tunnel_name }} latency exceeds 2000ms.
            Current latency: {{ $value }}ms
            Services may be timing out.
      # ============================================
      # WARNING - Tunnel Error Rate High
      # ============================================
      # Ratio of the 5m error rate to the 5m request rate, compared with
      # 0.05 (5%). The value is a 0-1 ratio, which is what humanizePercentage
      # expects. A tunnel with no traffic yields 0/0 = NaN, which never
      # satisfies the comparison.
      # NOTE(review): the division only matches series whose label sets are
      # identical on both metrics - verify against the exporter.
      - alert: TunnelHighErrorRate
        expr: |
          rate(cloudflare_tunnel_errors_total[5m])
          / rate(cloudflare_tunnel_requests_total[5m]) > 0.05
        for: 5m
        labels:
          severity: warning
          component: tunnel
        annotations:
          summary: "High error rate on tunnel {{ $labels.tunnel_name }}"
          description: |
            Tunnel {{ $labels.tunnel_name }} error rate exceeds 5%.
            Current error rate: {{ $value | humanizePercentage }}
      # ============================================
      # CRITICAL - Tunnel Error Rate Critical
      # ============================================
      # Same ratio as TunnelHighErrorRate with a 0.20 (20%) threshold and a
      # shorter 2-minute hold.
      - alert: TunnelCriticalErrorRate
        expr: |
          rate(cloudflare_tunnel_errors_total[5m])
          / rate(cloudflare_tunnel_requests_total[5m]) > 0.20
        for: 2m
        labels:
          severity: critical
          component: tunnel
        annotations:
          summary: "Critical error rate on tunnel {{ $labels.tunnel_name }}"
          description: |
            Tunnel {{ $labels.tunnel_name }} error rate exceeds 20%.
            Current error rate: {{ $value | humanizePercentage }}
            This indicates severe connectivity issues.
      # ============================================
      # INFO - Tunnel Configuration Changed
      # ============================================
      # Fires as soon as the config-hash series has changed value at least
      # once within the last hour (for: 0m means no hold period). Because of
      # the 1h window the alert can stay active for up to an hour after the
      # change.
      - alert: TunnelConfigChanged
        expr: changes(cloudflare_tunnel_config_hash[1h]) > 0
        for: 0m
        labels:
          severity: info
          component: tunnel
        annotations:
          summary: "Tunnel {{ $labels.tunnel_name }} configuration changed"
          description: |
            The configuration for tunnel {{ $labels.tunnel_name }} has changed
            in the last hour. Verify this was an authorized change.
      # ============================================
      # WARNING - Cloudflared Version Outdated
      # ============================================
      # Fires when the reported version age has exceeded 90 for 24 hours.
      # The metric name and annotation text indicate the unit is days.
      - alert: CloudflaredOutdated
        expr: cloudflare_cloudflared_version_age_days > 90
        for: 24h
        labels:
          severity: warning
          component: tunnel
        annotations:
          summary: "cloudflared version is outdated"
          description: |
            The cloudflared binary is more than 90 days old.
            Current version age: {{ $value }} days
            Consider upgrading to latest version for security patches.
      # ============================================
      # WARNING - Tunnel Connection Flapping
      # ============================================
      # Fires when the status series changed value more than 3 times within
      # a 10-minute window, and that condition has held for a further
      # 10 minutes of evaluations.
      - alert: TunnelConnectionFlapping
        expr: changes(cloudflare_tunnel_status[10m]) > 3
        for: 10m
        labels:
          severity: warning
          component: tunnel
        annotations:
          summary: "Tunnel {{ $labels.tunnel_name }} is flapping"
          description: |
            Tunnel {{ $labels.tunnel_name }} has changed state {{ $value }} times
            in the last 10 minutes. This indicates instability.
            Check network connectivity and cloudflared logs.