422 lines
18 KiB
Python
422 lines
18 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Cloudflare Incident Response Playbooks
|
|
Standardized procedures for common infrastructure incidents
|
|
"""
|
|
|
|
from enum import Enum
|
|
from typing import Dict, List, Optional
|
|
from dataclasses import dataclass
|
|
from datetime import datetime
|
|
|
|
|
|
class IncidentSeverity(str, Enum):
    """Incident severity levels.

    Members mix in ``str``, so each member compares equal to its lowercase
    string value (e.g. ``IncidentSeverity.LOW == "low"``).
    """

    # Declared in ascending order of severity; iteration follows this order.
    LOW = "low"
    MEDIUM = "medium"
    HIGH = "high"
    CRITICAL = "critical"
|
|
|
|
|
|
class IncidentType(str, Enum):
    """Types of infrastructure incidents.

    Members mix in ``str``; each value is the snake_case identifier that the
    command-line interface accepts for ``--type``.
    """

    DNS_OUTAGE = "dns_outage"
    WAF_BYPASS = "waf_bypass"
    TUNNEL_FAILURE = "tunnel_failure"
    SECURITY_BREACH = "security_breach"
    CONFIGURATION_ERROR = "configuration_error"
    PERFORMANCE_DEGRADATION = "performance_degradation"
|
|
|
|
|
|
@dataclass
class IncidentResponse:
    """Incident response procedure for a single incident type."""

    # Kind of incident this procedure applies to.
    incident_type: IncidentType
    # Severity assigned to this kind of incident.
    severity: IncidentSeverity
    # Ordered, human-readable steps for each phase of the response.
    immediate_actions: List[str]
    investigation_steps: List[str]
    recovery_procedures: List[str]
    prevention_measures: List[str]
    # Roles/teams to contact, in the order listed.
    escalation_path: List[str]
    # Free-form estimate of the resolution time, e.g. "1-4 hours".
    time_to_resolve: str
|
|
|
|
|
|
class IncidentResponsePlaybook:
    """Collection of incident response playbooks.

    The playbooks are built once at construction time and stored in
    ``self.playbooks``, a dict keyed by ``IncidentType``.
    """

    def __init__(self):
        """Build the playbook table."""
        self.playbooks = self._initialize_playbooks()

    def _initialize_playbooks(self) -> Dict[IncidentType, IncidentResponse]:
        """Initialize all incident response playbooks.

        Returns:
            A dict mapping every ``IncidentType`` member to its
            ``IncidentResponse`` procedure.
        """
        return {
            IncidentType.DNS_OUTAGE: IncidentResponse(
                incident_type=IncidentType.DNS_OUTAGE,
                severity=IncidentSeverity.HIGH,
                immediate_actions=[
                    "Verify DNS resolution using external tools (dig, nslookup)",
                    "Check Cloudflare DNS dashboard for zone status",
                    "Review recent DNS changes in version control",
                    "Verify origin server connectivity",
                    "Check Cloudflare status page for service issues",
                ],
                investigation_steps=[
                    "Examine DNS record changes in Git history",
                    "Check Terraform state for unexpected modifications",
                    "Review Cloudflare audit logs for recent changes",
                    "Verify DNS propagation using multiple geographic locations",
                    "Check for DNSSEC configuration issues",
                ],
                recovery_procedures=[
                    "Rollback recent DNS changes using Terraform",
                    "Manually restore critical DNS records if needed",
                    "Update TTL values for faster propagation",
                    "Contact Cloudflare support if service-related",
                    "Implement traffic rerouting if necessary",
                ],
                prevention_measures=[
                    "Implement DNS change approval workflows",
                    "Use Terraform plan/apply with peer review",
                    "Monitor DNS resolution from multiple locations",
                    "Implement automated DNS health checks",
                    "Maintain backup DNS configurations",
                ],
                escalation_path=[
                    "Primary DNS Administrator",
                    "Infrastructure Team Lead",
                    "Cloudflare Support",
                    "Security Team",
                ],
                time_to_resolve="1-4 hours",
            ),
            IncidentType.WAF_BYPASS: IncidentResponse(
                incident_type=IncidentType.WAF_BYPASS,
                severity=IncidentSeverity.CRITICAL,
                immediate_actions=[
                    "Immediately review WAF event logs for suspicious activity",
                    "Check for recent WAF rule modifications",
                    "Verify WAF rule package status and mode",
                    "Temporarily block suspicious IP addresses",
                    "Enable challenge mode for suspicious traffic patterns",
                ],
                investigation_steps=[
                    "Analyze WAF rule changes in version control",
                    "Review Cloudflare firewall event logs",
                    "Check for anomalous traffic patterns",
                    "Verify WAF rule effectiveness using test payloads",
                    "Examine rate limiting and threat score thresholds",
                ],
                recovery_procedures=[
                    "Rollback WAF rule changes to known good state",
                    "Implement emergency WAF rules to block attack patterns",
                    "Update threat intelligence feeds",
                    "Increase security level for affected zones",
                    "Deploy additional security measures (Bot Fight Mode, etc.)",
                ],
                prevention_measures=[
                    "Implement WAF change approval workflows",
                    "Regular security testing of WAF rules",
                    "Monitor WAF event logs for anomalies",
                    "Implement automated WAF rule validation",
                    "Regular security awareness training",
                ],
                escalation_path=[
                    "Security Incident Response Team",
                    "WAF Administrator",
                    "Infrastructure Security Lead",
                    "CISO/Management",
                ],
                time_to_resolve="2-6 hours",
            ),
            IncidentType.TUNNEL_FAILURE: IncidentResponse(
                incident_type=IncidentType.TUNNEL_FAILURE,
                severity=IncidentSeverity.MEDIUM,
                immediate_actions=[
                    "Check Cloudflare Tunnel status and connectivity",
                    "Verify origin server availability and configuration",
                    "Check tunnel connector logs for errors",
                    "Restart tunnel connector service if needed",
                    "Verify DNS records point to correct tunnel endpoints",
                ],
                investigation_steps=[
                    "Review recent tunnel configuration changes",
                    "Check network connectivity between connector and Cloudflare",
                    "Examine tunnel connector resource usage",
                    "Verify certificate validity and renewal status",
                    "Check for firewall/network policy changes",
                ],
                recovery_procedures=[
                    "Restart tunnel connector with updated configuration",
                    "Rollback recent tunnel configuration changes",
                    "Recreate tunnel connector if necessary",
                    "Update DNS records to alternative endpoints",
                    "Implement traffic failover mechanisms",
                ],
                prevention_measures=[
                    "Implement tunnel health monitoring",
                    "Use redundant tunnel configurations",
                    "Regular tunnel connector updates and maintenance",
                    "Monitor certificate expiration dates",
                    "Implement automated tunnel failover",
                ],
                escalation_path=[
                    "Network Administrator",
                    "Infrastructure Team",
                    "Cloudflare Support",
                    "Security Team",
                ],
                time_to_resolve="1-3 hours",
            ),
            IncidentType.SECURITY_BREACH: IncidentResponse(
                incident_type=IncidentType.SECURITY_BREACH,
                severity=IncidentSeverity.CRITICAL,
                immediate_actions=[
                    "Isolate affected systems and services immediately",
                    "Preserve logs and evidence for forensic analysis",
                    "Change all relevant credentials and API tokens",
                    "Notify security incident response team",
                    "Implement emergency security controls",
                ],
                investigation_steps=[
                    "Conduct forensic analysis of compromised systems",
                    "Review Cloudflare audit logs for unauthorized access",
                    "Check for API token misuse or unauthorized changes",
                    "Examine DNS/WAF/Tunnel configuration changes",
                    "Coordinate with legal and compliance teams",
                ],
                recovery_procedures=[
                    "Rotate all Cloudflare API tokens and credentials",
                    "Restore configurations from verified backups",
                    "Implement enhanced security monitoring",
                    "Conduct post-incident security assessment",
                    "Update incident response procedures based on lessons learned",
                ],
                prevention_measures=[
                    "Implement multi-factor authentication",
                    "Regular security audits and penetration testing",
                    "Monitor for suspicious API activity",
                    "Implement least privilege access controls",
                    "Regular security awareness training",
                ],
                escalation_path=[
                    "Security Incident Response Team",
                    "CISO/Management",
                    "Legal Department",
                    "External Security Consultants",
                ],
                time_to_resolve="4-24 hours",
            ),
            IncidentType.CONFIGURATION_ERROR: IncidentResponse(
                incident_type=IncidentType.CONFIGURATION_ERROR,
                severity=IncidentSeverity.MEDIUM,
                immediate_actions=[
                    "Identify the specific configuration error",
                    "Assess impact on services and users",
                    "Check version control for recent changes",
                    "Verify Terraform plan output for unexpected changes",
                    "Communicate status to stakeholders",
                ],
                investigation_steps=[
                    "Review Git commit history for configuration changes",
                    "Examine Terraform state differences",
                    "Check Cloudflare configuration against documented standards",
                    "Verify configuration consistency across environments",
                    "Identify root cause of configuration error",
                ],
                recovery_procedures=[
                    "Rollback configuration using Terraform",
                    "Apply corrected configuration changes",
                    "Verify service restoration and functionality",
                    "Update configuration documentation",
                    "Implement configuration validation checks",
                ],
                prevention_measures=[
                    "Implement configuration change approval workflows",
                    "Use infrastructure as code with peer review",
                    "Implement automated configuration validation",
                    "Regular configuration audits",
                    "Maintain configuration documentation",
                ],
                escalation_path=[
                    "Configuration Administrator",
                    "Infrastructure Team Lead",
                    "Quality Assurance Team",
                    "Management",
                ],
                time_to_resolve="1-4 hours",
            ),
            IncidentType.PERFORMANCE_DEGRADATION: IncidentResponse(
                incident_type=IncidentType.PERFORMANCE_DEGRADATION,
                severity=IncidentSeverity.LOW,
                immediate_actions=[
                    "Monitor performance metrics and identify bottlenecks",
                    "Check Cloudflare analytics for traffic patterns",
                    "Verify origin server performance and resource usage",
                    "Review recent configuration changes",
                    "Implement temporary performance optimizations",
                ],
                investigation_steps=[
                    "Analyze performance metrics over time",
                    "Check for DDoS attacks or abnormal traffic patterns",
                    "Review caching configuration and hit rates",
                    "Examine origin server response times",
                    "Identify specific performance bottlenecks",
                ],
                recovery_procedures=[
                    "Optimize caching configuration",
                    "Adjust performance settings (Polish, Mirage, etc.)",
                    "Implement rate limiting if under attack",
                    "Scale origin server resources if needed",
                    "Update CDN configuration for better performance",
                ],
                prevention_measures=[
                    "Implement performance monitoring and alerting",
                    "Regular performance testing and optimization",
                    "Capacity planning and resource forecasting",
                    "Implement automated scaling mechanisms",
                    "Regular performance reviews and optimizations",
                ],
                escalation_path=[
                    "Performance Monitoring Team",
                    "Infrastructure Team",
                    "Application Development Team",
                    "Management",
                ],
                time_to_resolve="2-8 hours",
            ),
        }

    def get_playbook(self, incident_type: IncidentType) -> Optional[IncidentResponse]:
        """Get the playbook for a specific incident type.

        Returns:
            The matching ``IncidentResponse``, or ``None`` when
            ``incident_type`` is not a key of ``self.playbooks``.
        """
        return self.playbooks.get(incident_type)

    def list_playbooks(self) -> List[IncidentType]:
        """List the incident types that have a playbook, in definition order."""
        return list(self.playbooks)

    def execute_playbook(
        self, incident_type: IncidentType, custom_context: Optional[Dict] = None
    ) -> Dict:
        """Execute a specific incident response playbook.

        Execution is simulated: no external action is triggered; the method
        only builds and returns a log describing the initiated response.

        Args:
            incident_type: Incident type whose playbook should be run.
            custom_context: Optional extra data to attach to the log. It is
                shallow-copied so that later changes to the log do not leak
                into the caller's dict (and vice versa).

        Returns:
            ``{"error": ...}`` when no playbook exists for ``incident_type``;
            otherwise a dict with the keys ``incident_type``, ``severity``,
            ``start_time`` (ISO 8601, local naive time), ``steps_completed``,
            ``custom_context``, ``estimated_resolution_time`` and
            ``completion_status`` (always ``"in_progress"``).
        """
        playbook = self.get_playbook(incident_type)

        if playbook is None:
            # Interpolating a str-mixin enum member directly can render
            # differently between Python versions; use the plain value when
            # there is one so the message stays stable.
            label = getattr(incident_type, "value", incident_type)
            return {"error": f"No playbook found for incident type: {label}"}

        execution_log = {
            "incident_type": incident_type.value,
            "severity": playbook.severity.value,
            "start_time": datetime.now().isoformat(),
            "steps_completed": [],
            "custom_context": dict(custom_context) if custom_context else {},
        }

        # Simulate execution (in real implementation, this would trigger actual actions)
        execution_log["steps_completed"].extend(
            [
                f"Initiated {incident_type.value} response procedure",
                f"Severity level: {playbook.severity.value}",
                "Notified escalation path contacts",
            ]
        )

        execution_log["estimated_resolution_time"] = playbook.time_to_resolve
        execution_log["completion_status"] = "in_progress"

        return execution_log
|
|
|
|
|
|
def main():
    """Command-line interface for incident response playbooks.

    Actions:
        list:    print every playbook with its severity and resolution time.
        show:    print the full procedure for ``--type``.
        execute: run the (simulated) playbook for ``--type`` and print the
                 resulting log.

    ``show`` and ``execute`` require ``--type``; when it is missing, or no
    playbook exists for it, an error is written to stderr and the process
    exits with status 1 so calling scripts can detect the failure.
    """
    import argparse
    import sys

    parser = argparse.ArgumentParser(
        description="Cloudflare Incident Response Playbooks"
    )
    parser.add_argument(
        "action", choices=["list", "show", "execute"], help="Action to perform"
    )
    parser.add_argument(
        "--type", choices=[t.value for t in IncidentType], help="Incident type"
    )

    args = parser.parse_args()

    playbook_manager = IncidentResponsePlaybook()

    if args.action == "list":
        print("📋 Available Incident Response Playbooks:")
        print("-" * 50)
        for incident_type in playbook_manager.list_playbooks():
            playbook = playbook_manager.get_playbook(incident_type)
            if playbook is None:
                continue

            print(f"🔸 {incident_type.value}")
            print(f" Severity: {playbook.severity.value}")
            print(f" Resolution Time: {playbook.time_to_resolve}")
            print()
        return

    # Both remaining actions ("show" and "execute") operate on one type.
    if not args.type:
        print("❌ Error: --type argument required", file=sys.stderr)
        sys.exit(1)

    # argparse's `choices` already restricts --type to valid enum values,
    # so this conversion cannot raise ValueError.
    incident_type = IncidentType(args.type)

    if args.action == "show":
        playbook = playbook_manager.get_playbook(incident_type)
        if playbook is None:
            print(f"❌ Error: No playbook found for {args.type}", file=sys.stderr)
            sys.exit(1)

        print(f"🔍 Incident Response Playbook: {incident_type.value}")
        print("=" * 60)
        print(f"Severity: {playbook.severity.value}")
        print(f"Estimated Resolution: {playbook.time_to_resolve}")

        # Each section is a heading followed by a 1-based numbered list.
        sections = [
            ("\n🚨 Immediate Actions:", playbook.immediate_actions),
            ("\n🔍 Investigation Steps:", playbook.investigation_steps),
            ("\n🔄 Recovery Procedures:", playbook.recovery_procedures),
            ("\n🛡️ Prevention Measures:", playbook.prevention_measures),
            ("\n📞 Escalation Path:", playbook.escalation_path),
        ]
        for heading, entries in sections:
            print(heading)
            for i, entry in enumerate(entries, 1):
                print(f" {i}. {entry}")

    elif args.action == "execute":
        result = playbook_manager.execute_playbook(incident_type)
        print(f"🚀 Executing {incident_type.value} Incident Response")
        print(f"📊 Result: {result}")
|
|
|
|
|
|
# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()
|