Initial commit: Cloudflare infrastructure with WAF Intelligence
- Complete Cloudflare Terraform configuration (DNS, WAF, tunnels, access) - WAF Intelligence MCP server with threat analysis and ML classification - GitOps automation with PR workflows and drift detection - Observatory monitoring stack with Prometheus/Grafana - IDE operator rules for governed development - Security playbooks and compliance frameworks - Autonomous remediation and state reconciliation
This commit is contained in:
351
observatory/escalation-matrix.yml
Normal file
351
observatory/escalation-matrix.yml
Normal file
@@ -0,0 +1,351 @@
|
||||
# Cloudflare Mesh Observatory - Escalation Matrix
|
||||
# Phase 5B - Alerts & Escalation
|
||||
#
|
||||
# This matrix defines who gets notified for what, and when to escalate.
|
||||
# Used by Alertmanager routing and for human reference.
|
||||
|
||||
---
|
||||
# Version of this escalation matrix document (quoted so it stays a string).
version: "1.0"
# Date of the last edit to this matrix (quoted so it is not parsed as a date).
last_updated: "2024-01-01"
|
||||
|
||||
# ==============================================================================
|
||||
# SEVERITY DEFINITIONS
|
||||
# ==============================================================================
|
||||
# Severity levels, keyed by the `severity` value used in alert_responses.
# Each level carries a human-readable description, the target response time,
# the channels to notify, and how long to wait before escalating.
severity_definitions:
  critical:
    description: "Service down, security incident, or data integrity issue"
    response_time: "15 minutes"
    notification_channels:
      - "pagerduty"
      - "slack-critical"
      - "phone"
    escalation_after: "30 minutes"

  warning:
    description: "Degraded service, policy violation, or impending issue"
    response_time: "1 hour"
    notification_channels:
      - "slack"
    escalation_after: "4 hours"

  info:
    description: "Informational, audit, or metric threshold"
    response_time: "Next business day"
    notification_channels:
      - "email-digest"
    # No escalation deadline is configured for info-level alerts.
    escalation_after: null
|
||||
|
||||
# ==============================================================================
|
||||
# ESCALATION CHAINS
|
||||
# ==============================================================================
|
||||
# Escalation chains, keyed by the chain name referenced from
# component_ownership (primary_chain / backup_chain) and from
# alert_responses (escalation_chain).
# Each stage lists the contacts to reach (keys of the top-level `contacts`
# map) and the channels to use.
# NOTE(review): `delay` is presumably measured from the moment the alert
# fires - confirm against the consumer of this file.
escalation_chains:
  infrastructure:
    name: "Infrastructure Team"
    stages:
      - stage: 1
        delay: "0m"
        contacts:
          - "infra-oncall"
        channels:
          - "pagerduty"
          - "slack"
      - stage: 2
        delay: "30m"
        contacts:
          - "infra-lead"
        channels:
          - "pagerduty"
          - "phone"
      - stage: 3
        delay: "1h"
        contacts:
          - "platform-director"
        channels:
          - "phone"

  security:
    name: "Security Team"
    stages:
      - stage: 1
        delay: "0m"
        contacts:
          - "security-oncall"
        channels:
          - "pagerduty"
          - "slack-security"
      - stage: 2
        delay: "15m"
        contacts:
          - "security-lead"
          - "ciso"
        channels:
          - "pagerduty"
          - "phone"

  platform:
    name: "Platform Team"
    stages:
      - stage: 1
        delay: "0m"
        contacts:
          - "platform-oncall"
        channels:
          - "slack"
      - stage: 2
        delay: "1h"
        contacts:
          - "platform-lead"
        channels:
          - "pagerduty"
|
||||
|
||||
# ==============================================================================
|
||||
# COMPONENT -> ESCALATION CHAIN MAPPING
|
||||
# ==============================================================================
|
||||
# Ownership per component: which escalation chain is paged first, which one
# backs it up, the component's Slack channel, and its playbook file names.
# Chain values are keys of the top-level `escalation_chains` map.
component_ownership:
  tunnel:
    primary_chain: infrastructure
    backup_chain: platform
    slack_channel: "#cloudflare-tunnels"
    playbooks:
      - "TUNNEL-ROTATION-PROTOCOL.md"

  dns:
    primary_chain: infrastructure
    # DNS problems can be security-related, so security is the backup.
    backup_chain: security
    slack_channel: "#cloudflare-dns"
    playbooks:
      - "DNS-COMPROMISE-PLAYBOOK.md"

  waf:
    primary_chain: security
    backup_chain: infrastructure
    slack_channel: "#cloudflare-waf"
    playbooks:
      - "waf_incident_playbook.md"

  invariant:
    primary_chain: security
    backup_chain: platform
    slack_channel: "#cloudflare-security"
    playbooks:
      - "SECURITY-INVARIANTS.md"

  proofchain:
    primary_chain: platform
    backup_chain: security
    slack_channel: "#cloudflare-proofchain"
    playbooks:
      - "proofchain-incident.md"
|
||||
|
||||
# ==============================================================================
|
||||
# ALERT -> RESPONSE MAPPING
|
||||
# ==============================================================================
|
||||
# Per-alert response plan, keyed by alert name.
#   severity          - a key of severity_definitions
#   escalation_chain  - a key of escalation_chains
#   immediate_actions - first-response checklist for the responder
#   playbook          - playbook file name; matches the owning component's
#                       entry under component_ownership.<component>.playbooks
#   auto_remediation  - per-alert flag; the top-level `auto_remediation`
#                       policy (require_confirmation_for,
#                       never_auto_remediate) is defined separately.
#                       NOTE(review): how the two interact is decided by the
#                       consumer of this file - confirm.
alert_responses:
  # TUNNEL ALERTS
  TunnelDown:
    severity: critical
    escalation_chain: infrastructure
    immediate_actions:
      - "Check cloudflared service status"
      - "Verify network connectivity to origin"
      - "Check Cloudflare status page"
    playbook: "TUNNEL-ROTATION-PROTOCOL.md"
    auto_remediation: false  # Manual intervention required

  AllTunnelsDown:
    severity: critical
    escalation_chain: infrastructure
    immediate_actions:
      - "DECLARE INCIDENT"
      - "Check all cloudflared instances"
      - "Verify DNS resolution"
      - "Check for Cloudflare outage"
    playbook: "TUNNEL-ROTATION-PROTOCOL.md"
    auto_remediation: false

  TunnelRotationDue:
    severity: warning
    escalation_chain: platform
    immediate_actions:
      - "Schedule maintenance window"
      - "Prepare new tunnel credentials"
    playbook: "TUNNEL-ROTATION-PROTOCOL.md"
    auto_remediation: true  # Can be auto-scheduled

  # DNS ALERTS
  DNSHijackDetected:
    severity: critical
    escalation_chain: security
    immediate_actions:
      - "DECLARE SECURITY INCIDENT"
      - "Verify DNS resolution from multiple locations"
      - "Check Cloudflare audit logs"
      - "Preserve evidence"
    playbook: "DNS-COMPROMISE-PLAYBOOK.md"
    auto_remediation: false  # NEVER auto-remediate security incidents

  DNSDriftDetected:
    severity: warning
    escalation_chain: infrastructure
    immediate_actions:
      - "Run state reconciler"
      - "Identify changed records"
      - "Verify authorization"
    playbook: "DNS-COMPROMISE-PLAYBOOK.md"
    auto_remediation: true  # Can auto-reconcile if authorized

  # WAF ALERTS
  WAFMassiveAttack:
    severity: critical
    escalation_chain: security
    immediate_actions:
      - "Verify attack is real (not false positive)"
      - "Consider Under Attack Mode"
      - "Check rate limiting"
      - "Document attack patterns"
    playbook: "waf_incident_playbook.md"
    auto_remediation: false

  WAFRuleBypass:
    severity: critical
    escalation_chain: security
    immediate_actions:
      - "Analyze bypassed requests"
      - "Tighten rule immediately"
      - "Check for related vulnerabilities"
    playbook: "waf_incident_playbook.md"
    auto_remediation: false

  WAFDisabled:
    severity: critical
    escalation_chain: security
    immediate_actions:
      - "IMMEDIATELY investigate why WAF is disabled"
      - "Re-enable unless documented exception"
      - "Review audit logs"
    playbook: "waf_incident_playbook.md"
    # Critical severity: "critical" is listed under the top-level
    # auto_remediation.require_confirmation_for.
    auto_remediation: true  # Auto-enable WAF

  # INVARIANT ALERTS
  # Playbook is the one listed under component_ownership.invariant.playbooks.
  SSLModeDowngraded:
    severity: critical
    escalation_chain: security
    immediate_actions:
      - "Restore Full (Strict) SSL mode"
      - "Investigate who made the change"
      - "Review audit logs"
    playbook: "SECURITY-INVARIANTS.md"
    # Critical severity: "critical" is listed under the top-level
    # auto_remediation.require_confirmation_for.
    auto_remediation: true  # Auto-restore SSL mode

  AccessPolicyViolation:
    severity: critical
    escalation_chain: security
    immediate_actions:
      - "Review access attempt"
      - "Block if malicious"
      - "Notify affected user if legitimate"
    playbook: "SECURITY-INVARIANTS.md"
    auto_remediation: false

  # PROOFCHAIN ALERTS
  # Playbook is the one listed under component_ownership.proofchain.playbooks.
  ProofchainIntegrityFailure:
    severity: critical
    escalation_chain: security
    immediate_actions:
      - "HALT all new receipt generation"
      - "Preserve current state"
      - "Identify last known-good checkpoint"
      - "Do NOT attempt auto-recovery"
    playbook: "proofchain-incident.md"
    auto_remediation: false  # NEVER auto-remediate integrity failures

  ReceiptHashMismatch:
    severity: critical
    escalation_chain: security
    immediate_actions:
      - "Identify affected receipt"
      - "Compare against backup"
      - "Preserve for forensics"
    playbook: "proofchain-incident.md"
    auto_remediation: false
|
||||
|
||||
# ==============================================================================
|
||||
# CONTACTS
|
||||
# ==============================================================================
|
||||
# Contact directory, keyed by the identifiers used in
# escalation_chains.<chain>.stages[].contacts.
# NOTE(review): the PagerDuty IDs ("PXXXXXX" / "UXXXXXX") and phone numbers
# ("+1-XXX-XXX-XXXX") look like placeholders - replace with real values
# before relying on this matrix.
contacts:
  infra-oncall:
    name: "Infrastructure On-Call"
    pagerduty_service: "PXXXXXX"
    slack_handle: "@infra-oncall"
    schedule: "follow-the-sun"

  infra-lead:
    name: "Infrastructure Team Lead"
    pagerduty_user: "UXXXXXX"
    phone: "+1-XXX-XXX-XXXX"
    email: "infra-lead@company.com"

  security-oncall:
    name: "Security On-Call"
    pagerduty_service: "PXXXXXX"
    slack_handle: "@security-oncall"
    schedule: "24x7"

  security-lead:
    name: "Security Team Lead"
    pagerduty_user: "UXXXXXX"
    phone: "+1-XXX-XXX-XXXX"
    email: "security-lead@company.com"

  ciso:
    name: "Chief Information Security Officer"
    phone: "+1-XXX-XXX-XXXX"
    email: "ciso@company.com"

  platform-oncall:
    name: "Platform On-Call"
    pagerduty_service: "PXXXXXX"
    slack_handle: "@platform-oncall"

  platform-lead:
    name: "Platform Team Lead"
    pagerduty_user: "UXXXXXX"
    email: "platform-lead@company.com"

  platform-director:
    name: "Platform Director"
    phone: "+1-XXX-XXX-XXXX"
    email: "platform-director@company.com"
|
||||
|
||||
# ==============================================================================
|
||||
# NOTIFICATION CHANNELS
|
||||
# ==============================================================================
|
||||
# Concrete destinations for each notification medium.
channels:
  # Slack channel names, by purpose.
  slack:
    default: "#cloudflare-alerts"
    critical: "#cloudflare-critical"
    tunnels: "#cloudflare-tunnels"
    dns: "#cloudflare-dns"
    waf: "#cloudflare-waf"
    security: "#cloudflare-security"
    proofchain: "#cloudflare-proofchain"

  pagerduty:
    # NOTE(review): "${...}" is presumably substituted from the environment
    # by whatever loads this file - confirm; the key itself is not stored here.
    integration_key: "${PAGERDUTY_SERVICE_KEY}"
    escalation_policy: "cloudflare-infrastructure"

  # Recipient addresses for the periodic email summaries.
  email:
    daily_digest: "cloudflare-team@company.com"
    weekly_report: "platform-leadership@company.com"
|
||||
|
||||
# ==============================================================================
|
||||
# AUTO-REMEDIATION POLICIES
|
||||
# ==============================================================================
|
||||
# Global auto-remediation policy. Individual alerts also carry their own
# `auto_remediation` flag under alert_responses.
auto_remediation:
  enabled: true
  # Categories listed here need confirmation before remediation runs.
  require_confirmation_for:
    - "critical"
    - "security_incident"
  # Alert names (keys of alert_responses) that must never be auto-remediated.
  never_auto_remediate:
    - "ProofchainIntegrityFailure"
    - "ReceiptHashMismatch"
    - "DNSHijackDetected"
    - "WAFRuleBypass"
  # Rate limit and cooldown for automated remediations.
  max_auto_remediations_per_hour: 5
  cooldown_period: "10m"
|
||||
|
||||
# ==============================================================================
|
||||
# MAINTENANCE WINDOWS
|
||||
# ==============================================================================
|
||||
# Recurring maintenance windows. `schedule` is a five-field cron expression,
# `duration` is the window length, and `suppress_alerts` names the alerts to
# silence while the window is open.
# NOTE(review): "TunnelDegraded" and "CloudflaredOutdated" have no entry under
# alert_responses in this file - presumably defined in the alert rules
# elsewhere; confirm. The timezone of the cron schedules is not stated here.
maintenance_windows:
  weekly_rotation:
    schedule: "0 3 * * SUN"  # 3 AM Sunday
    duration: "2h"
    suppress_alerts:
      - "TunnelDown"
      - "TunnelDegraded"
    notify_channel: "#cloudflare-alerts"

  monthly_patch:
    schedule: "0 2 15 * *"  # 2 AM on the 15th
    duration: "4h"
    suppress_alerts:
      - "TunnelDown"
      - "CloudflaredOutdated"
    notify_channel: "#cloudflare-alerts"
|
||||
Reference in New Issue
Block a user