# Cloudflare Mesh Observatory - Escalation Matrix
# Phase 5B - Alerts & Escalation
#
# This matrix defines who gets notified for what, and when to escalate.
# Used by Alertmanager routing and for human reference.
#
# Layout of this file:
#   severity_definitions -> response targets per severity level
#   escalation_chains    -> ordered notification stages per team
#   component_ownership  -> component -> owning chains, channel, playbooks
#   alert_responses      -> alert name -> severity, chain, first actions
#   contacts             -> contact ids referenced by escalation_chains
#   channels             -> concrete notification destinations
#   auto_remediation     -> global auto-remediation guard rails
#   maintenance_windows  -> scheduled windows that suppress alerts
---
version: "1.0"
last_updated: "2024-01-01"

# ==============================================================================
# SEVERITY DEFINITIONS
# ==============================================================================
# One entry per severity level used in alert_responses below.
# escalation_after is null for "info", i.e. no escalation interval is set.
# NOTE(review): channel names such as "slack-critical" and "email-digest"
# presumably resolve to entries under `channels` - confirm with the consumer.

severity_definitions:
  critical:
    description: "Service down, security incident, or data integrity issue"
    response_time: "15 minutes"
    notification_channels: ["pagerduty", "slack-critical", "phone"]
    escalation_after: "30 minutes"

  warning:
    description: "Degraded service, policy violation, or impending issue"
    response_time: "1 hour"
    notification_channels: ["slack"]
    escalation_after: "4 hours"

  info:
    description: "Informational, audit, or metric threshold"
    response_time: "Next business day"
    notification_channels: ["email-digest"]
    escalation_after: null

# ==============================================================================
# ESCALATION CHAINS
# ==============================================================================
# Each chain is an ordered list of stages. `delay` is the wait before the
# stage is notified; `contacts` are ids defined in the `contacts` section.

escalation_chains:
  infrastructure:
    name: "Infrastructure Team"
    stages:
      - stage: 1
        delay: "0m"
        contacts: ["infra-oncall"]
        channels: ["pagerduty", "slack"]
      - stage: 2
        delay: "30m"
        contacts: ["infra-lead"]
        channels: ["pagerduty", "phone"]
      - stage: 3
        delay: "1h"
        contacts: ["platform-director"]
        channels: ["phone"]

  security:
    name: "Security Team"
    stages:
      - stage: 1
        delay: "0m"
        contacts: ["security-oncall"]
        channels: ["pagerduty", "slack-security"]
      - stage: 2
        delay: "15m"
        contacts: ["security-lead", "ciso"]
        channels: ["pagerduty", "phone"]

  platform:
    name: "Platform Team"
    stages:
      - stage: 1
        delay: "0m"
        contacts: ["platform-oncall"]
        channels: ["slack"]
      - stage: 2
        delay: "1h"
        contacts: ["platform-lead"]
        channels: ["pagerduty"]

# ==============================================================================
# COMPONENT -> ESCALATION CHAIN MAPPING
# ==============================================================================
# primary_chain / backup_chain are keys of escalation_chains above.

component_ownership:
  tunnel:
    primary_chain: infrastructure
    backup_chain: platform
    slack_channel: "#cloudflare-tunnels"
    playbooks:
      - "TUNNEL-ROTATION-PROTOCOL.md"

  dns:
    primary_chain: infrastructure
    backup_chain: security  # DNS can be security-related
    slack_channel: "#cloudflare-dns"
    playbooks:
      - "DNS-COMPROMISE-PLAYBOOK.md"

  waf:
    primary_chain: security
    backup_chain: infrastructure
    slack_channel: "#cloudflare-waf"
    playbooks:
      - "waf_incident_playbook.md"

  invariant:
    primary_chain: security
    backup_chain: platform
    slack_channel: "#cloudflare-security"
    playbooks:
      - "SECURITY-INVARIANTS.md"

  proofchain:
    primary_chain: platform
    backup_chain: security
    slack_channel: "#cloudflare-proofchain"
    playbooks:
      - "proofchain-incident.md"

# ==============================================================================
# ALERT -> RESPONSE MAPPING
# ==============================================================================
# Keyed by alert name. `severity` is a key of severity_definitions,
# `escalation_chain` a key of escalation_chains. `playbook` mirrors the
# playbook listed for the alert's component in component_ownership.

alert_responses:
  # TUNNEL ALERTS
  TunnelDown:
    severity: critical
    escalation_chain: infrastructure
    immediate_actions:
      - "Check cloudflared service status"
      - "Verify network connectivity to origin"
      - "Check Cloudflare status page"
    playbook: "TUNNEL-ROTATION-PROTOCOL.md"
    auto_remediation: false  # Manual intervention required

  AllTunnelsDown:
    severity: critical
    escalation_chain: infrastructure
    immediate_actions:
      - "DECLARE INCIDENT"
      - "Check all cloudflared instances"
      - "Verify DNS resolution"
      - "Check for Cloudflare outage"
    playbook: "TUNNEL-ROTATION-PROTOCOL.md"
    auto_remediation: false

  TunnelRotationDue:
    severity: warning
    escalation_chain: platform
    immediate_actions:
      - "Schedule maintenance window"
      - "Prepare new tunnel credentials"
    playbook: "TUNNEL-ROTATION-PROTOCOL.md"
    auto_remediation: true  # Can be auto-scheduled

  # DNS ALERTS
  DNSHijackDetected:
    severity: critical
    escalation_chain: security
    immediate_actions:
      - "DECLARE SECURITY INCIDENT"
      - "Verify DNS resolution from multiple locations"
      - "Check Cloudflare audit logs"
      - "Preserve evidence"
    playbook: "DNS-COMPROMISE-PLAYBOOK.md"
    auto_remediation: false  # NEVER auto-remediate security incidents

  DNSDriftDetected:
    severity: warning
    escalation_chain: infrastructure
    immediate_actions:
      - "Run state reconciler"
      - "Identify changed records"
      - "Verify authorization"
    playbook: "DNS-COMPROMISE-PLAYBOOK.md"
    auto_remediation: true  # Can auto-reconcile if authorized

  # WAF ALERTS
  WAFMassiveAttack:
    severity: critical
    escalation_chain: security
    immediate_actions:
      - "Verify attack is real (not false positive)"
      - "Consider Under Attack Mode"
      - "Check rate limiting"
      - "Document attack patterns"
    playbook: "waf_incident_playbook.md"
    auto_remediation: false

  WAFRuleBypass:
    severity: critical
    escalation_chain: security
    immediate_actions:
      - "Analyze bypassed requests"
      - "Tighten rule immediately"
      - "Check for related vulnerabilities"
    playbook: "waf_incident_playbook.md"
    auto_remediation: false

  WAFDisabled:
    severity: critical
    escalation_chain: security
    immediate_actions:
      - "IMMEDIATELY investigate why WAF is disabled"
      - "Re-enable unless documented exception"
      - "Review audit logs"
    playbook: "waf_incident_playbook.md"
    auto_remediation: true  # Auto-enable WAF

  # INVARIANT ALERTS
  # Playbook taken from component_ownership.invariant.playbooks.
  SSLModeDowngraded:
    severity: critical
    escalation_chain: security
    immediate_actions:
      - "Restore Full (Strict) SSL mode"
      - "Investigate who made the change"
      - "Review audit logs"
    playbook: "SECURITY-INVARIANTS.md"
    auto_remediation: true  # Auto-restore SSL mode

  AccessPolicyViolation:
    severity: critical
    escalation_chain: security
    immediate_actions:
      - "Review access attempt"
      - "Block if malicious"
      - "Notify affected user if legitimate"
    playbook: "SECURITY-INVARIANTS.md"
    auto_remediation: false

  # PROOFCHAIN ALERTS
  # Playbook taken from component_ownership.proofchain.playbooks.
  ProofchainIntegrityFailure:
    severity: critical
    escalation_chain: security
    immediate_actions:
      - "HALT all new receipt generation"
      - "Preserve current state"
      - "Identify last known-good checkpoint"
      - "Do NOT attempt auto-recovery"
    playbook: "proofchain-incident.md"
    auto_remediation: false  # NEVER auto-remediate integrity failures

  ReceiptHashMismatch:
    severity: critical
    escalation_chain: security
    immediate_actions:
      - "Identify affected receipt"
      - "Compare against backup"
      - "Preserve for forensics"
    playbook: "proofchain-incident.md"
    auto_remediation: false

# ==============================================================================
# CONTACTS
# ==============================================================================
# Keys are the contact ids referenced from escalation_chains.*.stages.contacts.
# NOTE(review): "PXXXXXX", "UXXXXXX" and "+1-XXX-XXX-XXXX" look like
# placeholders - confirm real values are supplied before this is used to page.

contacts:
  infra-oncall:
    name: "Infrastructure On-Call"
    pagerduty_service: "PXXXXXX"
    slack_handle: "@infra-oncall"
    schedule: "follow-the-sun"

  infra-lead:
    name: "Infrastructure Team Lead"
    pagerduty_user: "UXXXXXX"
    phone: "+1-XXX-XXX-XXXX"
    email: "infra-lead@company.com"

  security-oncall:
    name: "Security On-Call"
    pagerduty_service: "PXXXXXX"
    slack_handle: "@security-oncall"
    schedule: "24x7"

  security-lead:
    name: "Security Team Lead"
    pagerduty_user: "UXXXXXX"
    phone: "+1-XXX-XXX-XXXX"
    email: "security-lead@company.com"

  ciso:
    name: "Chief Information Security Officer"
    phone: "+1-XXX-XXX-XXXX"
    email: "ciso@company.com"

  platform-oncall:
    name: "Platform On-Call"
    pagerduty_service: "PXXXXXX"
    slack_handle: "@platform-oncall"

  platform-lead:
    name: "Platform Team Lead"
    pagerduty_user: "UXXXXXX"
    email: "platform-lead@company.com"

  platform-director:
    name: "Platform Director"
    phone: "+1-XXX-XXX-XXXX"
    email: "platform-director@company.com"

# ==============================================================================
# NOTIFICATION CHANNELS
# ==============================================================================
# Concrete destinations per transport. The PagerDuty integration key is
# written as a ${PAGERDUTY_SERVICE_KEY} reference rather than a literal value.

channels:
  slack:
    default: "#cloudflare-alerts"
    critical: "#cloudflare-critical"
    tunnels: "#cloudflare-tunnels"
    dns: "#cloudflare-dns"
    waf: "#cloudflare-waf"
    security: "#cloudflare-security"
    proofchain: "#cloudflare-proofchain"

  pagerduty:
    integration_key: "${PAGERDUTY_SERVICE_KEY}"
    escalation_policy: "cloudflare-infrastructure"

  email:
    daily_digest: "cloudflare-team@company.com"
    weekly_report: "platform-leadership@company.com"

# ==============================================================================
# AUTO-REMEDIATION POLICIES
# ==============================================================================
# Global guard rails for the per-alert `auto_remediation` flags above.
# Every alert named in never_auto_remediate also has auto_remediation: false
# in alert_responses.
# NOTE(review): WAFDisabled and SSLModeDowngraded are critical and set
# auto_remediation: true, so they presumably fall under
# require_confirmation_for - confirm that is the intended behaviour.

auto_remediation:
  enabled: true
  require_confirmation_for:
    - "critical"
    - "security_incident"
  never_auto_remediate:
    - "ProofchainIntegrityFailure"
    - "ReceiptHashMismatch"
    - "DNSHijackDetected"
    - "WAFRuleBypass"
  max_auto_remediations_per_hour: 5
  cooldown_period: "10m"

# ==============================================================================
# MAINTENANCE WINDOWS
# ==============================================================================
# `schedule` is a five-field cron expression; `suppress_alerts` lists alert
# names silenced for `duration` from the scheduled start.
# NOTE(review): "TunnelDegraded" and "CloudflaredOutdated" have no entry in
# alert_responses in this file - confirm they are defined elsewhere.

maintenance_windows:
  weekly_rotation:
    schedule: "0 3 * * SUN"  # 3 AM Sunday
    duration: "2h"
    suppress_alerts:
      - "TunnelDown"
      - "TunnelDegraded"
    notify_channel: "#cloudflare-alerts"

  monthly_patch:
    schedule: "0 2 15 * *"  # 2 AM on the 15th
    duration: "4h"
    suppress_alerts:
      - "TunnelDown"
      - "CloudflaredOutdated"
    notify_channel: "#cloudflare-alerts"