- Complete Cloudflare Terraform configuration (DNS, WAF, tunnels, access) - WAF Intelligence MCP server with threat analysis and ML classification - GitOps automation with PR workflows and drift detection - Observatory monitoring stack with Prometheus/Grafana - IDE operator rules for governed development - Security playbooks and compliance frameworks - Autonomous remediation and state reconciliation
352 lines
10 KiB
YAML
352 lines
10 KiB
YAML
# Cloudflare Mesh Observatory - Escalation Matrix
# Phase 5B - Alerts & Escalation
#
# This matrix defines who gets notified for what, and when to escalate.
# Used by Alertmanager routing and for human reference.

---
# Document metadata. Both values are quoted so they load as strings rather
# than as a float ("1.0") or a date ("2024-01-01").
version: "1.0"
last_updated: "2024-01-01"

# ==============================================================================
# SEVERITY DEFINITIONS
# ==============================================================================
# One entry per severity level. Each level carries a description, a target
# response time, the channels to notify, and an escalation interval.
severity_definitions:
  critical:
    description: "Service down, security incident, or data integrity issue"
    response_time: "15 minutes"
    notification_channels: ["pagerduty", "slack-critical", "phone"]
    escalation_after: "30 minutes"

  warning:
    description: "Degraded service, policy violation, or impending issue"
    response_time: "1 hour"
    notification_channels: ["slack"]
    escalation_after: "4 hours"

  info:
    description: "Informational, audit, or metric threshold"
    response_time: "Next business day"
    notification_channels: ["email-digest"]
    escalation_after: null  # explicit null: no escalation interval is set

# ==============================================================================
# ESCALATION CHAINS
# ==============================================================================
# Each chain is an ordered list of stages. A stage names the contacts to reach
# (keys of the top-level `contacts` mapping), the channels to use, and a delay.
# NOTE(review): `delay` is presumably measured from the moment the alert
# fires - confirm against the consumer.
escalation_chains:
  infrastructure:
    name: "Infrastructure Team"
    stages:
      - stage: 1
        delay: "0m"
        contacts: ["infra-oncall"]
        channels: ["pagerduty", "slack"]
      - stage: 2
        delay: "30m"
        contacts: ["infra-lead"]
        channels: ["pagerduty", "phone"]
      - stage: 3
        delay: "1h"
        contacts: ["platform-director"]
        channels: ["phone"]

  security:
    name: "Security Team"
    stages:
      - stage: 1
        delay: "0m"
        contacts: ["security-oncall"]
        channels: ["pagerduty", "slack-security"]
      - stage: 2
        delay: "15m"
        contacts: ["security-lead", "ciso"]
        channels: ["pagerduty", "phone"]

  platform:
    name: "Platform Team"
    stages:
      - stage: 1
        delay: "0m"
        contacts: ["platform-oncall"]
        channels: ["slack"]
      - stage: 2
        delay: "1h"
        contacts: ["platform-lead"]
        channels: ["pagerduty"]

# ==============================================================================
# COMPONENT -> ESCALATION CHAIN MAPPING
# ==============================================================================
# Maps each component to a primary and a backup escalation chain (keys of
# `escalation_chains`), its Slack channel, and its playbook documents.
component_ownership:
  tunnel:
    primary_chain: infrastructure
    backup_chain: platform
    slack_channel: "#cloudflare-tunnels"
    playbooks:
      - "TUNNEL-ROTATION-PROTOCOL.md"

  dns:
    primary_chain: infrastructure
    backup_chain: security  # DNS can be security-related
    slack_channel: "#cloudflare-dns"
    playbooks:
      - "DNS-COMPROMISE-PLAYBOOK.md"

  waf:
    primary_chain: security
    backup_chain: infrastructure
    slack_channel: "#cloudflare-waf"
    playbooks:
      - "waf_incident_playbook.md"

  invariant:
    primary_chain: security
    backup_chain: platform
    slack_channel: "#cloudflare-security"
    playbooks:
      - "SECURITY-INVARIANTS.md"

  proofchain:
    primary_chain: platform
    backup_chain: security
    slack_channel: "#cloudflare-proofchain"
    playbooks:
      - "proofchain-incident.md"

# ==============================================================================
# ALERT -> RESPONSE MAPPING
# ==============================================================================
# One entry per alert name. Each entry gives the severity (a key of
# `severity_definitions`), the escalation chain (a key of `escalation_chains`),
# the first-response checklist, the playbook, and an auto-remediation flag.
#
# Every alert links the playbook that `component_ownership` lists for its
# component, so responders to invariant and proofchain alerts get a playbook
# reference just as tunnel, DNS and WAF responders do.
alert_responses:
  # TUNNEL ALERTS
  TunnelDown:
    severity: critical
    escalation_chain: infrastructure
    immediate_actions:
      - "Check cloudflared service status"
      - "Verify network connectivity to origin"
      - "Check Cloudflare status page"
    playbook: "TUNNEL-ROTATION-PROTOCOL.md"
    auto_remediation: false  # Manual intervention required

  AllTunnelsDown:
    severity: critical
    escalation_chain: infrastructure
    immediate_actions:
      - "DECLARE INCIDENT"
      - "Check all cloudflared instances"
      - "Verify DNS resolution"
      - "Check for Cloudflare outage"
    playbook: "TUNNEL-ROTATION-PROTOCOL.md"
    auto_remediation: false

  TunnelRotationDue:
    severity: warning
    escalation_chain: platform
    immediate_actions:
      - "Schedule maintenance window"
      - "Prepare new tunnel credentials"
    playbook: "TUNNEL-ROTATION-PROTOCOL.md"
    auto_remediation: true  # Can be auto-scheduled

  # DNS ALERTS
  DNSHijackDetected:
    severity: critical
    escalation_chain: security
    immediate_actions:
      - "DECLARE SECURITY INCIDENT"
      - "Verify DNS resolution from multiple locations"
      - "Check Cloudflare audit logs"
      - "Preserve evidence"
    playbook: "DNS-COMPROMISE-PLAYBOOK.md"
    auto_remediation: false  # NEVER auto-remediate security incidents

  DNSDriftDetected:
    severity: warning
    escalation_chain: infrastructure
    immediate_actions:
      - "Run state reconciler"
      - "Identify changed records"
      - "Verify authorization"
    playbook: "DNS-COMPROMISE-PLAYBOOK.md"
    auto_remediation: true  # Can auto-reconcile if authorized

  # WAF ALERTS
  WAFMassiveAttack:
    severity: critical
    escalation_chain: security
    immediate_actions:
      - "Verify attack is real (not false positive)"
      - "Consider Under Attack Mode"
      - "Check rate limiting"
      - "Document attack patterns"
    playbook: "waf_incident_playbook.md"
    auto_remediation: false

  WAFRuleBypass:
    severity: critical
    escalation_chain: security
    immediate_actions:
      - "Analyze bypassed requests"
      - "Tighten rule immediately"
      - "Check for related vulnerabilities"
    playbook: "waf_incident_playbook.md"
    auto_remediation: false

  WAFDisabled:
    severity: critical
    escalation_chain: security
    immediate_actions:
      - "IMMEDIATELY investigate why WAF is disabled"
      - "Re-enable unless documented exception"
      - "Review audit logs"
    playbook: "waf_incident_playbook.md"
    auto_remediation: true  # Auto-enable WAF

  # INVARIANT ALERTS
  SSLModeDowngraded:
    severity: critical
    escalation_chain: security
    immediate_actions:
      - "Restore Full (Strict) SSL mode"
      - "Investigate who made the change"
      - "Review audit logs"
    # Playbook taken from component_ownership.invariant.playbooks.
    playbook: "SECURITY-INVARIANTS.md"
    auto_remediation: true  # Auto-restore SSL mode

  AccessPolicyViolation:
    severity: critical
    escalation_chain: security
    immediate_actions:
      - "Review access attempt"
      - "Block if malicious"
      - "Notify affected user if legitimate"
    # Playbook taken from component_ownership.invariant.playbooks.
    playbook: "SECURITY-INVARIANTS.md"
    auto_remediation: false

  # PROOFCHAIN ALERTS
  ProofchainIntegrityFailure:
    severity: critical
    escalation_chain: security
    immediate_actions:
      - "HALT all new receipt generation"
      - "Preserve current state"
      - "Identify last known-good checkpoint"
      - "Do NOT attempt auto-recovery"
    # Playbook taken from component_ownership.proofchain.playbooks.
    playbook: "proofchain-incident.md"
    auto_remediation: false  # NEVER auto-remediate integrity failures

  ReceiptHashMismatch:
    severity: critical
    escalation_chain: security
    immediate_actions:
      - "Identify affected receipt"
      - "Compare against backup"
      - "Preserve for forensics"
    # Playbook taken from component_ownership.proofchain.playbooks.
    playbook: "proofchain-incident.md"
    auto_remediation: false

# ==============================================================================
# CONTACTS
# ==============================================================================
# Contact directory. The keys are the identifiers used in
# `escalation_chains.<chain>.stages[].contacts`. PagerDuty IDs and phone
# numbers below are placeholders.
contacts:
  infra-oncall:
    name: "Infrastructure On-Call"
    pagerduty_service: "PXXXXXX"
    slack_handle: "@infra-oncall"
    schedule: "follow-the-sun"

  infra-lead:
    name: "Infrastructure Team Lead"
    pagerduty_user: "UXXXXXX"
    phone: "+1-XXX-XXX-XXXX"
    email: "infra-lead@company.com"

  security-oncall:
    name: "Security On-Call"
    pagerduty_service: "PXXXXXX"
    slack_handle: "@security-oncall"
    schedule: "24x7"

  security-lead:
    name: "Security Team Lead"
    pagerduty_user: "UXXXXXX"
    phone: "+1-XXX-XXX-XXXX"
    email: "security-lead@company.com"

  ciso:
    name: "Chief Information Security Officer"
    phone: "+1-XXX-XXX-XXXX"
    email: "ciso@company.com"

  platform-oncall:
    name: "Platform On-Call"
    pagerduty_service: "PXXXXXX"
    slack_handle: "@platform-oncall"

  platform-lead:
    name: "Platform Team Lead"
    pagerduty_user: "UXXXXXX"
    email: "platform-lead@company.com"

  platform-director:
    name: "Platform Director"
    phone: "+1-XXX-XXX-XXXX"
    email: "platform-director@company.com"

# ==============================================================================
# NOTIFICATION CHANNELS
# ==============================================================================
# Destination settings per notification transport.
channels:
  # Slack channel names; values start with "#" and so must stay quoted.
  slack:
    default: "#cloudflare-alerts"
    critical: "#cloudflare-critical"
    tunnels: "#cloudflare-tunnels"
    dns: "#cloudflare-dns"
    waf: "#cloudflare-waf"
    security: "#cloudflare-security"
    proofchain: "#cloudflare-proofchain"

  pagerduty:
    # Placeholder for the PAGERDUTY_SERVICE_KEY variable; the secret itself is
    # not stored in this file.
    # NOTE(review): which tool performs the substitution is not visible here.
    integration_key: "${PAGERDUTY_SERVICE_KEY}"
    escalation_policy: "cloudflare-infrastructure"

  email:
    daily_digest: "cloudflare-team@company.com"
    weekly_report: "platform-leadership@company.com"

# ==============================================================================
# AUTO-REMEDIATION POLICIES
# ==============================================================================
# Global auto-remediation policy. Individual alerts carry their own
# `auto_remediation` flag under `alert_responses`.
# NOTE(review): how this policy and the per-alert flags take precedence over
# each other is decided by the consumer and is not visible in this file.
auto_remediation:
  enabled: true
  require_confirmation_for:
    - "critical"
    - "security_incident"
  # Alert names (keys of `alert_responses`) excluded from auto-remediation.
  never_auto_remediate:
    - "ProofchainIntegrityFailure"
    - "ReceiptHashMismatch"
    - "DNSHijackDetected"
    - "WAFRuleBypass"
  max_auto_remediations_per_hour: 5
  cooldown_period: "10m"

# ==============================================================================
# MAINTENANCE WINDOWS
# ==============================================================================
# Recurring windows. Each has a cron-style schedule, a duration, the alert
# names listed under `suppress_alerts`, and a channel to notify.
# Schedules are quoted because "*" is not allowed in a plain YAML scalar here.
maintenance_windows:
  weekly_rotation:
    schedule: "0 3 * * SUN"  # 3 AM Sunday
    duration: "2h"
    suppress_alerts:
      - "TunnelDown"
      - "TunnelDegraded"
    notify_channel: "#cloudflare-alerts"

  monthly_patch:
    schedule: "0 2 15 * *"  # 2 AM on the 15th
    duration: "4h"
    suppress_alerts:
      - "TunnelDown"
      - "CloudflaredOutdated"
    notify_channel: "#cloudflare-alerts"