Initial commit: Cloudflare infrastructure with WAF Intelligence
- Complete Cloudflare Terraform configuration (DNS, WAF, tunnels, access) - WAF Intelligence MCP server with threat analysis and ML classification - GitOps automation with PR workflows and drift detection - Observatory monitoring stack with Prometheus/Grafana - IDE operator rules for governed development - Security playbooks and compliance frameworks - Autonomous remediation and state reconciliation
This commit is contained in:
210
observatory/prometheus/alerts/tunnel-alerts.yml
Normal file
210
observatory/prometheus/alerts/tunnel-alerts.yml
Normal file
@@ -0,0 +1,210 @@
|
||||
# Tunnel Alert Rules for Cloudflare Mesh Observatory
|
||||
# Phase 5B - Alerts & Escalation
|
||||
|
||||
groups:
|
||||
- name: tunnel_alerts
|
||||
interval: 30s
|
||||
rules:
|
||||
# CRITICAL - Tunnel Down
# Fires for each series whose cloudflare_tunnel_status has equalled 0 for
# 2 minutes. The tunnel_name / tunnel_id labels of that series are
# interpolated into the annotations.
- alert: TunnelDown
  expr: cloudflare_tunnel_status == 0
  for: 2m
  labels:
    component: tunnel
    playbook: tunnel-rotation
    severity: critical
  annotations:
    summary: 'Cloudflare Tunnel {{ $labels.tunnel_name }} is DOWN'
    impact: 'Services behind tunnel are unreachable from the internet'
    runbook_url: 'https://wiki.internal/playbooks/tunnel-rotation'
    description: |
      Tunnel {{ $labels.tunnel_name }} (ID: {{ $labels.tunnel_id }}) has been
      unreachable for more than 2 minutes. Services behind this tunnel are
      likely unreachable.
|
||||
|
||||
# CRITICAL - All Tunnels Down
# Fires when no cloudflare_tunnel_status series has equalled 1 for 1 minute.
#
# count() over an empty instant vector returns no sample at all (not 0), so a
# bare `count(...) == 0` can never evaluate to true. `or vector(0)` substitutes
# an explicit 0 when the set of healthy tunnels is empty, letting the
# comparison succeed.
# Side effect: the alert also fires when the metric is missing entirely
# (no cloudflare_tunnel_status series at all).
- alert: AllTunnelsDown
  expr: (count(cloudflare_tunnel_status == 1) or vector(0)) == 0
  for: 1m
  labels:
    severity: critical
    component: tunnel
    playbook: tunnel-rotation
  annotations:
    summary: 'ALL Cloudflare Tunnels are DOWN'
    description: |
      No healthy tunnels detected. Complete loss of tunnel connectivity.
      This is a P0 incident requiring immediate attention.
    impact: 'Complete loss of external connectivity via tunnels'
    runbook_url: 'https://wiki.internal/playbooks/tunnel-rotation'
|
||||
|
||||
# WARNING - Tunnel Degraded
# Fires for each series whose cloudflare_tunnel_connections has stayed below 2
# for 5 minutes.
# NOTE(review): a value of 0 also satisfies `< 2`, so this presumably fires
# alongside TunnelDown for a fully disconnected tunnel - confirm that is
# intended or handled by Alertmanager inhibition.
- alert: TunnelDegraded
  expr: cloudflare_tunnel_connections < 2
  for: 5m
  labels:
    component: tunnel
    severity: warning
  annotations:
    summary: 'Tunnel {{ $labels.tunnel_name }} has reduced connections'
    runbook_url: 'https://wiki.internal/playbooks/tunnel-rotation'
    description: |
      Tunnel {{ $labels.tunnel_name }} has fewer than 2 active connections.
      This may indicate network issues or cloudflared problems.
|
||||
|
||||
# WARNING - Tunnel Rotation Due
# Fires once a tunnel's age has exceeded 30 days (86400 s * 30) for 1 hour.
# The comparison filter keeps the left-hand value, so $value is the age in
# seconds, which humanizeDuration renders in the description.
# Assumes cloudflare_tunnel_created_timestamp is a Unix time in seconds, the
# same unit as time() - TODO confirm against the exporter.
- alert: TunnelRotationDue
  expr: (time() - cloudflare_tunnel_created_timestamp) > (86400 * 30)
  for: 1h
  labels:
    component: tunnel
    playbook: tunnel-rotation
    severity: warning
  annotations:
    summary: 'Tunnel {{ $labels.tunnel_name }} rotation is due'
    runbook_url: 'https://wiki.internal/playbooks/tunnel-rotation'
    description: |
      Tunnel {{ $labels.tunnel_name }} was created more than 30 days ago.
      Per security policy, tunnels should be rotated monthly.
      Age: {{ $value | humanizeDuration }}
|
||||
|
||||
# CRITICAL - Tunnel Rotation Overdue
# Fires once a tunnel's age has exceeded 45 days (86400 s * 45) for 1 hour.
# $value is the age in seconds (left-hand side of the comparison) and is
# rendered with humanizeDuration in the description.
# Assumes cloudflare_tunnel_created_timestamp is a Unix time in seconds, the
# same unit as time() - TODO confirm against the exporter.
- alert: TunnelRotationOverdue
  expr: (time() - cloudflare_tunnel_created_timestamp) > (86400 * 45)
  for: 1h
  labels:
    component: tunnel
    playbook: tunnel-rotation
    severity: critical
  annotations:
    summary: 'Tunnel {{ $labels.tunnel_name }} rotation is OVERDUE'
    runbook_url: 'https://wiki.internal/playbooks/tunnel-rotation'
    description: |
      Tunnel {{ $labels.tunnel_name }} is more than 45 days old.
      This exceeds the maximum rotation interval and represents a
      security policy violation.
      Age: {{ $value | humanizeDuration }}
|
||||
|
||||
# WARNING - Tunnel High Latency
# Fires for each series whose cloudflare_tunnel_latency_ms has stayed above
# 500 for 5 minutes. $value is the sample that crossed the threshold.
- alert: TunnelHighLatency
  expr: cloudflare_tunnel_latency_ms > 500
  for: 5m
  labels:
    component: tunnel
    severity: warning
  annotations:
    summary: 'High latency on tunnel {{ $labels.tunnel_name }}'
    description: |
      Tunnel {{ $labels.tunnel_name }} is experiencing latency above 500ms.
      Current latency: {{ $value }}ms
      This may impact user experience.
|
||||
|
||||
# CRITICAL - Tunnel Very High Latency
# Fires for each series whose cloudflare_tunnel_latency_ms has stayed above
# 2000 for 2 minutes. $value is the sample that crossed the threshold.
- alert: TunnelVeryHighLatency
  expr: cloudflare_tunnel_latency_ms > 2000
  for: 2m
  labels:
    component: tunnel
    severity: critical
  annotations:
    summary: 'Critical latency on tunnel {{ $labels.tunnel_name }}'
    description: |
      Tunnel {{ $labels.tunnel_name }} latency exceeds 2000ms.
      Current latency: {{ $value }}ms
      Services may be timing out.
|
||||
|
||||
# WARNING - Tunnel Error Rate High
# Ratio of the 5-minute error rate to the 5-minute request rate; fires when the
# ratio has stayed above 0.05 (5%) for 5 minutes. $value is that ratio, shown
# through humanizePercentage.
# NOTE(review): the division matches series one-to-one on identical label
# sets - confirm both counters are exported with the same labels.
- alert: TunnelHighErrorRate
  expr: |
    rate(cloudflare_tunnel_errors_total[5m])
    / rate(cloudflare_tunnel_requests_total[5m]) > 0.05
  for: 5m
  labels:
    component: tunnel
    severity: warning
  annotations:
    summary: 'High error rate on tunnel {{ $labels.tunnel_name }}'
    description: |
      Tunnel {{ $labels.tunnel_name }} error rate exceeds 5%.
      Current error rate: {{ $value | humanizePercentage }}
|
||||
|
||||
# CRITICAL - Tunnel Error Rate Critical
# Ratio of the 5-minute error rate to the 5-minute request rate; fires when the
# ratio has stayed above 0.20 (20%) for 2 minutes. $value is that ratio, shown
# through humanizePercentage.
# NOTE(review): the division matches series one-to-one on identical label
# sets - confirm both counters are exported with the same labels.
- alert: TunnelCriticalErrorRate
  expr: |
    rate(cloudflare_tunnel_errors_total[5m])
    / rate(cloudflare_tunnel_requests_total[5m]) > 0.20
  for: 2m
  labels:
    component: tunnel
    severity: critical
  annotations:
    summary: 'Critical error rate on tunnel {{ $labels.tunnel_name }}'
    description: |
      Tunnel {{ $labels.tunnel_name }} error rate exceeds 20%.
      Current error rate: {{ $value | humanizePercentage }}
      This indicates severe connectivity issues.
|
||||
|
||||
# INFO - Tunnel Configuration Changed
# Fires when the value of cloudflare_tunnel_config_hash changed at least once
# within the trailing hour. `for: 0m` means there is no pending period: the
# alert fires on the first evaluation that observes a change.
- alert: TunnelConfigChanged
  expr: changes(cloudflare_tunnel_config_hash[1h]) > 0
  for: 0m
  labels:
    component: tunnel
    severity: info
  annotations:
    summary: 'Tunnel {{ $labels.tunnel_name }} configuration changed'
    description: |
      The configuration for tunnel {{ $labels.tunnel_name }} has changed
      in the last hour. Verify this was an authorized change.
|
||||
|
||||
# WARNING - Cloudflared Version Outdated
# Fires when cloudflare_cloudflared_version_age_days has stayed above 90 for
# 24 hours. $value is the reported age and is printed in the description.
- alert: CloudflaredOutdated
  expr: cloudflare_cloudflared_version_age_days > 90
  for: 24h
  labels:
    component: tunnel
    severity: warning
  annotations:
    summary: 'cloudflared version is outdated'
    description: |
      The cloudflared binary is more than 90 days old.
      Current version age: {{ $value }} days
      Consider upgrading to latest version for security patches.
|
||||
|
||||
# WARNING - Tunnel Connection Flapping
# Fires when cloudflare_tunnel_status changed value more than 3 times within
# the trailing 10 minutes, and that has held for 10 minutes. $value is the
# number of changes counted in the window.
# NOTE(review): with `for` equal to the changes() window, a short burst of
# flapping may age out of the window before the pending period elapses -
# confirm the 10m hold is intentional.
- alert: TunnelConnectionFlapping
  expr: changes(cloudflare_tunnel_status[10m]) > 3
  for: 10m
  labels:
    component: tunnel
    severity: warning
  annotations:
    summary: 'Tunnel {{ $labels.tunnel_name }} is flapping'
    description: |
      Tunnel {{ $labels.tunnel_name }} has changed state {{ $value }} times
      in the last 10 minutes. This indicates instability.
      Check network connectivity and cloudflared logs.
|
||||
Reference in New Issue
Block a user