Files
vm-cloudflare/observatory/escalation-matrix.yml
Vault Sovereign 37a867c485 Initial commit: Cloudflare infrastructure with WAF Intelligence
- Complete Cloudflare Terraform configuration (DNS, WAF, tunnels, access)
- WAF Intelligence MCP server with threat analysis and ML classification
- GitOps automation with PR workflows and drift detection
- Observatory monitoring stack with Prometheus/Grafana
- IDE operator rules for governed development
- Security playbooks and compliance frameworks
- Autonomous remediation and state reconciliation
2025-12-16 18:31:53 +00:00

352 lines
10 KiB
YAML

# Cloudflare Mesh Observatory - Escalation Matrix
# Phase 5B - Alerts & Escalation
#
# This matrix defines who gets notified for what, and when to escalate.
# Used by Alertmanager routing and for human reference.
---
# Schema version of this matrix. Quoted so it is read as the string "1.0"
# rather than a float.
version: "1.0"
# Date of the last revision (YYYY-MM-DD). Quoted so parsers keep it a string
# instead of converting it to a date object.
# NOTE(review): this value looks like a placeholder - confirm and update it
# whenever the matrix changes.
last_updated: "2024-01-01"
# ==============================================================================
# SEVERITY DEFINITIONS
# ==============================================================================
# Maps each severity label (critical / warning / info) to its handling rules.
# Every level carries the same four fields:
#   description           - what kind of event gets this severity
#   response_time         - human-readable response target
#   notification_channels - logical channel names to notify
#   escalation_after      - human-readable duration, or null for none
# NOTE(review): exact field semantics are inferred from the key names;
# confirm against whatever consumes this file.
severity_definitions:
  critical:
    description: "Service down, security incident, or data integrity issue"
    response_time: "15 minutes"
    # NOTE(review): "slack-critical" and "phone" are not keys of the
    # top-level `channels` mapping - confirm how these names are resolved.
    notification_channels: ["pagerduty", "slack-critical", "phone"]
    escalation_after: "30 minutes"
  warning:
    description: "Degraded service, policy violation, or impending issue"
    response_time: "1 hour"
    notification_channels: ["slack"]
    escalation_after: "4 hours"
  info:
    description: "Informational, audit, or metric threshold"
    response_time: "Next business day"
    # NOTE(review): "email-digest" is not a key of the top-level `channels`
    # mapping - confirm how this name is resolved.
    notification_channels: ["email-digest"]
    # Explicit null: no escalation is configured for this severity.
    escalation_after: null
# ==============================================================================
# ESCALATION CHAINS
# ==============================================================================
# Named escalation chains. Each chain has a display `name` and an ordered
# list of `stages`. A stage records:
#   stage    - 1-based position in the chain
#   delay    - duration string ("0m", "30m", "1h")
#   contacts - ids that match keys of the top-level `contacts` mapping
#   channels - logical channel names used to reach those contacts
# NOTE(review): whether `delay` is measured from the first alert or from the
# previous stage is not stated in this file - confirm with the consumer.
escalation_chains:
  infrastructure:
    name: "Infrastructure Team"
    stages:
      - stage: 1
        delay: "0m"
        contacts: ["infra-oncall"]
        channels: ["pagerduty", "slack"]
      - stage: 2
        delay: "30m"
        contacts: ["infra-lead"]
        channels: ["pagerduty", "phone"]
      - stage: 3
        delay: "1h"
        contacts: ["platform-director"]
        channels: ["phone"]
  security:
    name: "Security Team"
    stages:
      - stage: 1
        delay: "0m"
        contacts: ["security-oncall"]
        channels: ["pagerduty", "slack-security"]
      - stage: 2
        delay: "15m"
        # NOTE(review): the `ciso` contact entry lists a phone and an email
        # but no PagerDuty id - confirm "pagerduty" applies to security-lead
        # only.
        contacts: ["security-lead", "ciso"]
        channels: ["pagerduty", "phone"]
  platform:
    name: "Platform Team"
    stages:
      - stage: 1
        delay: "0m"
        contacts: ["platform-oncall"]
        channels: ["slack"]
      - stage: 2
        delay: "1h"
        contacts: ["platform-lead"]
        channels: ["pagerduty"]
# ==============================================================================
# COMPONENT -> ESCALATION CHAIN MAPPING
# ==============================================================================
# Ownership record per component. Each component names:
#   primary_chain / backup_chain - keys of `escalation_chains`
#   slack_channel                - the component's Slack channel
#   playbooks                    - playbook file names for the component
# Channel names are quoted because a leading "#" would otherwise start a
# YAML comment.
component_ownership:
  tunnel:
    primary_chain: infrastructure
    backup_chain: platform
    slack_channel: "#cloudflare-tunnels"
    playbooks:
      - "TUNNEL-ROTATION-PROTOCOL.md"
  dns:
    primary_chain: infrastructure
    backup_chain: security  # DNS can be security-related
    slack_channel: "#cloudflare-dns"
    playbooks:
      - "DNS-COMPROMISE-PLAYBOOK.md"
  waf:
    primary_chain: security
    backup_chain: infrastructure
    slack_channel: "#cloudflare-waf"
    playbooks:
      - "waf_incident_playbook.md"
  invariant:
    primary_chain: security
    backup_chain: platform
    slack_channel: "#cloudflare-security"
    playbooks:
      - "SECURITY-INVARIANTS.md"
  proofchain:
    primary_chain: platform
    backup_chain: security
    slack_channel: "#cloudflare-proofchain"
    playbooks:
      - "proofchain-incident.md"
# ==============================================================================
# ALERT -> RESPONSE MAPPING
# ==============================================================================
# Response record per alert name. Each alert carries:
#   severity          - a key of `severity_definitions`
#   escalation_chain  - a key of `escalation_chains`
#   immediate_actions - ordered checklist for the responder
#   playbook          - playbook file name, or null when none is assigned
#   auto_remediation  - per-alert boolean flag
# NOTE(review): how this per-alert flag interacts with the top-level
# `auto_remediation` policy block is not stated in this file - confirm.
alert_responses:
  # TUNNEL ALERTS
  TunnelDown:
    severity: critical
    escalation_chain: infrastructure
    immediate_actions:
      - "Check cloudflared service status"
      - "Verify network connectivity to origin"
      - "Check Cloudflare status page"
    playbook: "TUNNEL-ROTATION-PROTOCOL.md"
    auto_remediation: false  # Manual intervention required
  AllTunnelsDown:
    severity: critical
    escalation_chain: infrastructure
    immediate_actions:
      - "DECLARE INCIDENT"
      - "Check all cloudflared instances"
      - "Verify DNS resolution"
      - "Check for Cloudflare outage"
    playbook: "TUNNEL-ROTATION-PROTOCOL.md"
    auto_remediation: false
  TunnelRotationDue:
    severity: warning
    escalation_chain: platform
    immediate_actions:
      - "Schedule maintenance window"
      - "Prepare new tunnel credentials"
    playbook: "TUNNEL-ROTATION-PROTOCOL.md"
    auto_remediation: true  # Can be auto-scheduled

  # DNS ALERTS
  DNSHijackDetected:
    severity: critical
    escalation_chain: security
    immediate_actions:
      - "DECLARE SECURITY INCIDENT"
      - "Verify DNS resolution from multiple locations"
      - "Check Cloudflare audit logs"
      - "Preserve evidence"
    playbook: "DNS-COMPROMISE-PLAYBOOK.md"
    auto_remediation: false  # NEVER auto-remediate security incidents
  DNSDriftDetected:
    severity: warning
    escalation_chain: infrastructure
    immediate_actions:
      - "Run state reconciler"
      - "Identify changed records"
      - "Verify authorization"
    playbook: "DNS-COMPROMISE-PLAYBOOK.md"
    auto_remediation: true  # Can auto-reconcile if authorized

  # WAF ALERTS
  WAFMassiveAttack:
    severity: critical
    escalation_chain: security
    immediate_actions:
      - "Verify attack is real (not false positive)"
      - "Consider Under Attack Mode"
      - "Check rate limiting"
      - "Document attack patterns"
    playbook: "waf_incident_playbook.md"
    auto_remediation: false
  WAFRuleBypass:
    severity: critical
    escalation_chain: security
    immediate_actions:
      - "Analyze bypassed requests"
      - "Tighten rule immediately"
      - "Check for related vulnerabilities"
    playbook: "waf_incident_playbook.md"
    auto_remediation: false
  WAFDisabled:
    severity: critical
    escalation_chain: security
    immediate_actions:
      - "IMMEDIATELY investigate why WAF is disabled"
      - "Re-enable unless documented exception"
      - "Review audit logs"
    playbook: "waf_incident_playbook.md"
    auto_remediation: true  # Auto-enable WAF

  # INVARIANT ALERTS
  # NOTE(review): `component_ownership.invariant` lists
  # "SECURITY-INVARIANTS.md", yet both alerts below have `playbook: null` -
  # confirm whether that playbook should be referenced here.
  SSLModeDowngraded:
    severity: critical
    escalation_chain: security
    immediate_actions:
      - "Restore Full (Strict) SSL mode"
      - "Investigate who made the change"
      - "Review audit logs"
    playbook: null
    auto_remediation: true  # Auto-restore SSL mode
  AccessPolicyViolation:
    severity: critical
    escalation_chain: security
    immediate_actions:
      - "Review access attempt"
      - "Block if malicious"
      - "Notify affected user if legitimate"
    playbook: null
    auto_remediation: false

  # PROOFCHAIN ALERTS
  # NOTE(review): `component_ownership.proofchain` lists
  # "proofchain-incident.md", yet both alerts below have `playbook: null` -
  # confirm whether that playbook should be referenced here.
  ProofchainIntegrityFailure:
    severity: critical
    escalation_chain: security
    immediate_actions:
      - "HALT all new receipt generation"
      - "Preserve current state"
      - "Identify last known-good checkpoint"
      - "Do NOT attempt auto-recovery"
    playbook: null
    auto_remediation: false  # NEVER auto-remediate integrity failures
  ReceiptHashMismatch:
    severity: critical
    escalation_chain: security
    immediate_actions:
      - "Identify affected receipt"
      - "Compare against backup"
      - "Preserve for forensics"
    playbook: null
    auto_remediation: false
# ==============================================================================
# CONTACTS
# ==============================================================================
# Contact directory keyed by the ids used in `escalation_chains` stages.
# On-call rotations carry a `pagerduty_service` and a `slack_handle`;
# individuals carry some of `pagerduty_user`, `phone` and `email`.
# The PagerDuty ids and phone numbers below are masked placeholders
# ("PXXXXXX", "UXXXXXX", "+1-XXX-XXX-XXXX").
# NOTE(review): confirm the real values are injected from outside version
# control rather than committed here.
contacts:
  infra-oncall:
    name: "Infrastructure On-Call"
    pagerduty_service: "PXXXXXX"
    slack_handle: "@infra-oncall"
    schedule: "follow-the-sun"
  infra-lead:
    name: "Infrastructure Team Lead"
    pagerduty_user: "UXXXXXX"
    phone: "+1-XXX-XXX-XXXX"
    email: "infra-lead@company.com"
  security-oncall:
    name: "Security On-Call"
    pagerduty_service: "PXXXXXX"
    slack_handle: "@security-oncall"
    schedule: "24x7"
  security-lead:
    name: "Security Team Lead"
    pagerduty_user: "UXXXXXX"
    phone: "+1-XXX-XXX-XXXX"
    email: "security-lead@company.com"
  ciso:
    name: "Chief Information Security Officer"
    phone: "+1-XXX-XXX-XXXX"
    email: "ciso@company.com"
  # Unlike the other two on-call entries, this one has no `schedule`.
  platform-oncall:
    name: "Platform On-Call"
    pagerduty_service: "PXXXXXX"
    slack_handle: "@platform-oncall"
  platform-lead:
    name: "Platform Team Lead"
    pagerduty_user: "UXXXXXX"
    email: "platform-lead@company.com"
  platform-director:
    name: "Platform Director"
    phone: "+1-XXX-XXX-XXXX"
    email: "platform-director@company.com"
# ==============================================================================
# NOTIFICATION CHANNELS
# ==============================================================================
# Notification channel settings, grouped by transport.
channels:
  # Slack channel names. Quoted because a leading "#" would otherwise start
  # a YAML comment.
  slack:
    default: "#cloudflare-alerts"
    critical: "#cloudflare-critical"
    tunnels: "#cloudflare-tunnels"
    dns: "#cloudflare-dns"
    waf: "#cloudflare-waf"
    security: "#cloudflare-security"
    proofchain: "#cloudflare-proofchain"
  pagerduty:
    # Placeholder for the PAGERDUTY_SERVICE_KEY variable; the key itself is
    # not stored in this file.
    # NOTE(review): confirm the consumer performs the ${...} substitution.
    integration_key: "${PAGERDUTY_SERVICE_KEY}"
    escalation_policy: "cloudflare-infrastructure"
  email:
    daily_digest: "cloudflare-team@company.com"
    weekly_report: "platform-leadership@company.com"
# ==============================================================================
# AUTO-REMEDIATION POLICIES
# ==============================================================================
# Global auto-remediation policy.
# NOTE(review): precedence between this block and the per-alert
# `auto_remediation` flags in `alert_responses` is not stated in this file -
# confirm with the consumer.
auto_remediation:
  # Master switch.
  enabled: true
  # Categories listed as requiring confirmation.
  require_confirmation_for:
    - "critical"
    - "security_incident"
  # Alert names excluded from auto-remediation. Each one is also marked
  # `auto_remediation: false` in `alert_responses`.
  never_auto_remediate:
    - "ProofchainIntegrityFailure"
    - "ReceiptHashMismatch"
    - "DNSHijackDetected"
    - "WAFRuleBypass"
  # Rate limits.
  # NOTE(review): presumably global rather than per-alert - confirm.
  max_auto_remediations_per_hour: 5
  cooldown_period: "10m"
# ==============================================================================
# MAINTENANCE WINDOWS
# ==============================================================================
# Recurring maintenance windows. Each window has:
#   schedule        - five-field cron expression, quoted because it
#                     contains "*"
#   duration        - duration string
#   suppress_alerts - alert names listed for suppression
#   notify_channel  - Slack channel for the window
# NOTE(review): the time zone used to evaluate `schedule` is not stated in
# this file - confirm.
maintenance_windows:
  weekly_rotation:
    schedule: "0 3 * * SUN"  # 3 AM Sunday
    duration: "2h"
    # NOTE(review): "TunnelDegraded" has no entry in `alert_responses` -
    # confirm it is defined in the alert rules.
    suppress_alerts:
      - "TunnelDown"
      - "TunnelDegraded"
    notify_channel: "#cloudflare-alerts"
  monthly_patch:
    schedule: "0 2 15 * *"  # 2 AM on the 15th
    duration: "4h"
    # NOTE(review): "CloudflaredOutdated" has no entry in `alert_responses` -
    # confirm it is defined in the alert rules.
    suppress_alerts:
      - "TunnelDown"
      - "CloudflaredOutdated"
    notify_channel: "#cloudflare-alerts"