Initial commit: Cloudflare infrastructure with WAF Intelligence

- Complete Cloudflare Terraform configuration (DNS, WAF, tunnels, access)
- WAF Intelligence MCP server with threat analysis and ML classification
- GitOps automation with PR workflows and drift detection
- Observatory monitoring stack with Prometheus/Grafana
- IDE operator rules for governed development
- Security playbooks and compliance frameworks
- Autonomous remediation and state reconciliation
Vault Sovereign
2025-12-16 18:31:53 +00:00
commit 37a867c485
123 changed files with 25407 additions and 0 deletions

mcp/waf_intelligence/__init__.py

@@ -0,0 +1,41 @@
"""
WAF Intelligence Engine - Analyze, audit, and generate Cloudflare WAF rules.
This module provides tools to:
- Analyze existing WAF rules for gaps and compliance issues
- Generate new WAF rules based on threat models
- Map rules to compliance frameworks (NIST, PCI-DSS, GDPR, etc.)
- Validate Terraform WAF configurations
The primary classes and functions are re-exported below.
"""
from mcp.waf_intelligence.analyzer import (
WAFRuleAnalyzer,
RuleViolation,
AnalysisResult,
)
from mcp.waf_intelligence.generator import (
WAFRuleGenerator,
GeneratedRule,
)
from mcp.waf_intelligence.compliance import (
ComplianceMapper,
FrameworkMapping,
)
from mcp.waf_intelligence.orchestrator import (
WAFIntelligence,
WAFInsight,
)
__all__ = [
"WAFRuleAnalyzer",
"WAFRuleGenerator",
"ComplianceMapper",
"WAFIntelligence",
"WAFInsight",
"RuleViolation",
"AnalysisResult",
"GeneratedRule",
"FrameworkMapping",
]
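
A minimal usage sketch of the exported API (illustrative, not part of the commit; assumes a Terraform WAF file at terraform/waf.tf):

# Usage sketch (illustrative):
from mcp.waf_intelligence import WAFIntelligence

intel = WAFIntelligence()
for insight in intel.analyze_and_recommend("terraform/waf.tf", limit=3):
    if insight.violation:
        print(insight.violation.severity, insight.violation.message)
    if insight.suggested_rule:
        print("suggested:", insight.suggested_rule.name)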

mcp/waf_intelligence/__main__.py

@@ -0,0 +1,132 @@
from __future__ import annotations
import argparse
import json
import sys
from dataclasses import asdict
from pathlib import Path
from typing import Any, Dict, List
from .orchestrator import WAFInsight, WAFIntelligence
def _insight_to_dict(insight: WAFInsight) -> Dict[str, Any]:
"""Convert a WAFInsight dataclass into a plain dict."""
return asdict(insight)
def _has_error(insights: List[WAFInsight]) -> bool:
"""Return True if any violation is error-severity."""
for insight in insights:
if insight.violation and insight.violation.severity == "error":
return True
return False
def run_cli(argv: List[str] | None = None) -> int:
parser = argparse.ArgumentParser(
prog="python -m mcp.waf_intelligence",
description="Analyze Cloudflare WAF Terraform configs and produce curated security + compliance insights.",
)
parser.add_argument(
"--file",
"-f",
required=True,
help="Path to the Terraform WAF file (e.g. terraform/waf.tf)",
)
parser.add_argument(
"--limit",
"-n",
type=int,
default=3,
help="Maximum number of high-priority insights to return (default: 3)",
)
parser.add_argument(
"--format",
"-o",
choices=["text", "json"],
default="text",
help="Output format: text (human-readable) or json (machine-readable). Default: text.",
)
parser.add_argument(
"--fail-on-error",
action="store_true",
help="Exit with non-zero code if any error-severity violations are found.",
)
args = parser.parse_args(argv)
path = Path(args.file)
if not path.exists():
print(f"[error] file not found: {path}", file=sys.stderr)
return 1
intel = WAFIntelligence()
insights = intel.analyze_and_recommend(str(path), limit=args.limit)
if args.format == "json":
payload = {
"file": str(path),
"insights": [_insight_to_dict(insight) for insight in insights],
}
print(json.dumps(payload, indent=2))
if args.fail_on_error and _has_error(insights):
print(
"[waf_intel] error-severity violations present, failing as requested.",
file=sys.stderr,
)
return 2
return 0
print(f"\nWAF Intelligence Report for: {path}\n{'-' * 72}")
if not insights:
print("No high-severity, high-confidence issues detected based on current heuristics.")
return 0
for idx, insight in enumerate(insights, start=1):
print(f"\nInsight #{idx}")
print("-" * 40)
if insight.violation:
violation = insight.violation
print(f"Problem : {violation.message}")
print(f"Severity : {violation.severity.upper()}")
print(f"Confidence: {int(violation.confidence * 100)}%")
if violation.location:
print(f"Location : {violation.location}")
if violation.hint:
print(f"Remediate : {violation.hint}")
if insight.suggested_rule:
rule = insight.suggested_rule
print("\nSuggested Rule:")
print(f" Name : {rule.name}")
print(f" Severity: {rule.severity.upper()}")
print(f" Impact : {int(rule.impact_score * 100)}%")
print(f" Effort : {int(rule.effort_score * 100)}%")
print(f" Summary : {rule.description}")
if insight.mappings:
print("\nCompliance Mapping:")
for mapping in insight.mappings:
print(f" - {mapping.framework} {mapping.control_id}: {mapping.description}")
print()
if args.fail_on_error and _has_error(insights):
print(
"[waf_intel] error-severity violations present, failing as requested.",
file=sys.stderr,
)
return 2
return 0
def main() -> None:
raise SystemExit(run_cli())
if __name__ == "__main__":
main()
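
The entry point can also be driven programmatically; a small sketch (the file path is an example):

# Programmatic invocation sketch (illustrative):
from mcp.waf_intelligence.__main__ import run_cli

code = run_cli(["--file", "terraform/waf.tf", "--format", "json", "--fail-on-error"])
print("exit code:", code)  # 0 = clean, 1 = file not found, 2 = error-severity findings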

mcp/waf_intelligence/analyzer.py

@@ -0,0 +1,231 @@
from __future__ import annotations
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Optional
@dataclass
class RuleViolation:
"""Represents a potential issue in a WAF rule or configuration."""
rule_id: Optional[str]
message: str
severity: str # "info" | "warning" | "error"
framework_refs: List[str] = field(default_factory=list)
location: Optional[str] = None
confidence: float = 0.5 # 0.0-1.0: how sure we are
hint: Optional[str] = None # short suggestion on how to fix
@dataclass
class AnalysisResult:
"""High-level result of analyzing one or more WAF configs."""
source: str
violations: List[RuleViolation] = field(default_factory=list)
metadata: Dict[str, Any] = field(default_factory=dict)
@property
def has_issues(self) -> bool:
return any(v.severity in ("warning", "error") for v in self.violations)
def top_violations(
self,
*,
min_severity: str = "warning",
min_confidence: float = 0.7,
limit: int = 5,
) -> List[RuleViolation]:
"""Return a small, high-quality subset of violations."""
severity_order = {"info": 0, "warning": 1, "error": 2}
min_level = severity_order.get(min_severity, 1)
ranked = [
v
for v in self.violations
if severity_order.get(v.severity, 0) >= min_level
and v.confidence >= min_confidence
]
ranked.sort(key=lambda v: (v.severity != "error", -v.confidence))
return ranked[:limit]
class WAFRuleAnalyzer:
"""
Analyze Cloudflare WAF rules from Terraform with a quality-first posture.
"""
def analyze_file(
self,
path: str | Path,
*,
min_severity: str = "warning",
min_confidence: float = 0.6,
) -> AnalysisResult:
path = Path(path)
text = path.read_text(encoding="utf-8")
violations: List[RuleViolation] = []
# Example heuristic: no managed rules present
if "managed_rules" not in text:
violations.append(
RuleViolation(
rule_id=None,
message="No managed WAF rules detected in this file.",
severity="warning",
confidence=0.9,
framework_refs=["PCI-DSS 6.6", "OWASP-ASVS 13"],
location=str(path),
hint="Enable Cloudflare managed WAF rulesets (SQLi, XSS, RCE, bots) for this zone.",
)
)
# Example heuristic: overly broad allow
if '"*"' in text and "allow" in text:
violations.append(
RuleViolation(
rule_id=None,
message="Potentially overly broad allow rule detected ('*').",
severity="error",
confidence=0.85,
framework_refs=["Zero-Trust Principle"],
location=str(path),
hint="Narrow the rule expression to specific paths, methods, or IP ranges.",
)
)
result = AnalysisResult(
source=str(path),
violations=violations,
metadata={
"file_size": path.stat().st_size,
"heuristics_version": "0.2.0",
},
)
result.violations = result.top_violations(
min_severity=min_severity,
min_confidence=min_confidence,
limit=5,
)
return result
def analyze_terraform_text(
self,
source_name: str,
text: str,
*,
min_severity: str = "warning",
min_confidence: float = 0.6,
) -> AnalysisResult:
"""Same as analyze_file but for already-loaded text."""
tmp_path = Path(source_name)
violations: List[RuleViolation] = []
if "managed_rules" not in text:
violations.append(
RuleViolation(
rule_id=None,
message="No managed WAF rules detected in this snippet.",
severity="warning",
confidence=0.9,
framework_refs=["PCI-DSS 6.6", "OWASP-ASVS 13"],
location=source_name,
hint="Enable Cloudflare managed WAF rulesets (SQLi, XSS, RCE, bots) for this zone.",
)
)
result = AnalysisResult(
source=str(tmp_path),
violations=violations,
metadata={"heuristics_version": "0.2.0"},
)
result.violations = result.top_violations(
min_severity=min_severity,
min_confidence=min_confidence,
limit=5,
)
return result
def analyze_with_threat_intel(
self,
path: str | Path,
threat_indicators: List[Any],
*,
min_severity: str = "warning",
min_confidence: float = 0.6,
) -> AnalysisResult:
"""
Enhanced analysis using threat intelligence data.
Args:
path: WAF config file path
threat_indicators: List of ThreatIndicator objects from threat_intel module
min_severity: Minimum severity to include
min_confidence: Minimum confidence threshold
Returns:
AnalysisResult with violations informed by threat intel
"""
# Start with base analysis
base_result = self.analyze_file(path, min_severity=min_severity, min_confidence=min_confidence)
path = Path(path)
text = path.read_text(encoding="utf-8")
text_lower = text.lower()
# Check if threat indicators are addressed by existing rules
critical_ips = [i for i in threat_indicators if i.indicator_type == "ip" and i.severity in ("critical", "high")]
critical_patterns = [i for i in threat_indicators if i.indicator_type == "pattern" and i.severity in ("critical", "high")]
# Check for IP blocking coverage
if critical_ips:
ip_block_present = "ip.src" in text_lower or "cf.client.ip" in text_lower
if not ip_block_present:
base_result.violations.append(
RuleViolation(
rule_id=None,
message=f"Threat intel identified {len(critical_ips)} high-risk IPs not addressed by WAF rules.",
severity="error",
confidence=0.85,
framework_refs=["Zero-Trust", "Threat Intelligence"],
location=str(path),
hint=f"Add IP blocking rules for identified threat actors. Sample IPs: {', '.join(i.value for i in critical_ips[:3])}",
)
)
# Check for pattern-based attack coverage
attack_types_seen = set()
for ind in critical_patterns:
for tag in ind.tags:
if tag in ("sqli", "xss", "rce", "path_traversal"):
attack_types_seen.add(tag)
# Check managed ruleset coverage
for attack_type in attack_types_seen:
if attack_type not in text_lower and f'"{attack_type}"' not in text_lower:
base_result.violations.append(
RuleViolation(
rule_id=None,
message=f"Threat intel detected {attack_type.upper()} attacks but no explicit protection found.",
severity="warning",
confidence=0.8,
framework_refs=["OWASP Top 10", "Threat Intelligence"],
location=str(path),
hint=f"Enable Cloudflare managed rules for {attack_type.upper()} protection.",
)
)
# Update metadata with threat intel stats
base_result.metadata["threat_intel"] = {
"critical_ips": len(critical_ips),
"critical_patterns": len(critical_patterns),
"attack_types_seen": list(attack_types_seen),
}
return base_result
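
A quick sketch of the text-based entry point (illustrative; the HCL snippet is a stand-in):

# Usage sketch (illustrative):
analyzer = WAFRuleAnalyzer()
result = analyzer.analyze_terraform_text(
    "inline.tf",
    'resource "cloudflare_ruleset" "zone_waf" {}',  # no managed_rules block
)
print(result.has_issues)  # True: the managed_rules heuristic fires at 0.9 confidence
for v in result.violations:
    print(v.severity, v.message)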

mcp/waf_intelligence/classifier.py

@@ -0,0 +1,564 @@
"""
Phase 7: ML-Based Threat Classifier
Uses simple but effective ML techniques for:
- Attack pattern classification (SQLi, XSS, RCE, etc.)
- Anomaly scoring based on request features
- Risk-level prediction for proposed rules
Designed to work offline without heavy dependencies.
Uses scikit-learn-style interface but can run with pure Python fallback.
"""
from __future__ import annotations
import hashlib
import json
import math
import re
from collections import Counter, defaultdict
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Optional, Set, Tuple
# Optional sklearn import. Note: HAS_SKLEARN is currently informational only;
# the classifiers below run in pure Python either way.
try:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
HAS_SKLEARN = True
except ImportError:
HAS_SKLEARN = False
@dataclass
class ClassificationResult:
"""Result of classifying a threat indicator or pattern."""
label: str # "sqli", "xss", "rce", "path_traversal", "scanner", "benign", etc.
confidence: float # 0.0-1.0
probabilities: Dict[str, float] = field(default_factory=dict)
features_used: List[str] = field(default_factory=list)
explanation: str = ""
@dataclass
class AnomalyScore:
"""Anomaly detection result."""
score: float # 0.0-1.0 (higher = more anomalous)
baseline_deviation: float # standard deviations from mean
anomalous_features: List[str] = field(default_factory=list)
recommendation: str = ""
class FeatureExtractor:
"""Extract features from request/log data for ML classification."""
# Character distribution features
SPECIAL_CHARS = set("'\"<>(){}[];=&|`$\\")
# Known attack signatures for feature detection
SQLI_PATTERNS = [
r"(?i)union\s+select",
r"(?i)select\s+.*\s+from",
r"(?i)insert\s+into",
r"(?i)update\s+.*\s+set",
r"(?i)delete\s+from",
r"(?i)drop\s+table",
r"(?i);\s*--",
r"(?i)'\s*or\s+'?1'?\s*=\s*'?1",
r"(?i)'\s*and\s+'?1'?\s*=\s*'?1",
]
XSS_PATTERNS = [
r"(?i)<script",
r"(?i)javascript:",
r"(?i)on\w+\s*=",
r"(?i)alert\s*\(",
r"(?i)document\.",
r"(?i)window\.",
r"(?i)eval\s*\(",
]
RCE_PATTERNS = [
r"(?i);\s*(?:cat|ls|id|whoami|pwd)",
r"(?i)\|\s*(?:cat|ls|id|whoami)",
r"(?i)`[^`]+`",
r"(?i)\$\([^)]+\)",
r"(?i)system\s*\(",
r"(?i)exec\s*\(",
r"(?i)passthru\s*\(",
]
PATH_TRAVERSAL_PATTERNS = [
r"\.\./",
r"\.\.\\",
r"(?i)etc/passwd",
r"(?i)windows/system32",
]
def extract(self, text: str) -> Dict[str, float]:
"""Extract numerical features from text."""
features: Dict[str, float] = {}
if not text:
return features
text_lower = text.lower()
text_len = len(text)
# Length features
features["length"] = min(text_len / 1000, 1.0) # normalized
features["length_log"] = math.log1p(text_len) / 10
# Character distribution
special_count = sum(1 for c in text if c in self.SPECIAL_CHARS)
features["special_char_ratio"] = special_count / max(text_len, 1)
features["uppercase_ratio"] = sum(1 for c in text if c.isupper()) / max(text_len, 1)
features["digit_ratio"] = sum(1 for c in text if c.isdigit()) / max(text_len, 1)
# Entropy (randomness indicator)
features["entropy"] = self._calculate_entropy(text)
# Pattern-based features
features["sqli_score"] = self._pattern_score(text, self.SQLI_PATTERNS)
features["xss_score"] = self._pattern_score(text, self.XSS_PATTERNS)
features["rce_score"] = self._pattern_score(text, self.RCE_PATTERNS)
features["path_traversal_score"] = self._pattern_score(text, self.PATH_TRAVERSAL_PATTERNS)
# Structural features
features["quote_count"] = (text.count("'") + text.count('"')) / max(text_len, 1)
features["paren_count"] = (text.count("(") + text.count(")")) / max(text_len, 1)
features["bracket_count"] = (text.count("[") + text.count("]") + text.count("{") + text.count("}")) / max(text_len, 1)
# Keyword presence
features["has_select"] = 1.0 if "select" in text_lower else 0.0
features["has_script"] = 1.0 if "<script" in text_lower else 0.0
features["has_etc_passwd"] = 1.0 if "etc/passwd" in text_lower else 0.0
return features
def _calculate_entropy(self, text: str) -> float:
"""Calculate Shannon entropy of text."""
if not text:
return 0.0
freq = Counter(text)
length = len(text)
entropy = 0.0
for count in freq.values():
prob = count / length
if prob > 0:
entropy -= prob * math.log2(prob)
# Normalize to 0-1 range (max entropy for ASCII is ~7)
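# e.g. "aaaa" scores 0.0, while text drawn uniformly from ~95 printable
# ASCII characters approaches log2(95) ≈ 6.57 bits, i.e. ≈ 0.94 normalized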
return min(entropy / 7, 1.0)
def _pattern_score(self, text: str, patterns: List[str]) -> float:
"""Calculate pattern match score."""
matches = sum(1 for p in patterns if re.search(p, text))
return min(matches / max(len(patterns), 1), 1.0)
class NaiveBayesClassifier:
"""
Simple Naive Bayes classifier for attack type classification.
Works with or without sklearn.
"""
LABELS = ["sqli", "xss", "rce", "path_traversal", "scanner", "benign"]
def __init__(self):
self.feature_extractor = FeatureExtractor()
self._trained = False
# Training data (curated examples)
self._training_data = self._get_training_data()
# Feature statistics per class (for pure Python implementation)
self._class_priors: Dict[str, float] = {}
self._feature_means: Dict[str, Dict[str, float]] = defaultdict(dict)
self._feature_vars: Dict[str, Dict[str, float]] = defaultdict(dict)
def _get_training_data(self) -> List[Tuple[str, str]]:
"""Return curated training examples."""
return [
# SQLi examples
("' OR '1'='1", "sqli"),
("1; DROP TABLE users--", "sqli"),
("UNION SELECT * FROM passwords", "sqli"),
("admin'--", "sqli"),
("1' AND 1=1--", "sqli"),
("'; INSERT INTO users VALUES('hack','hack')--", "sqli"),
# XSS examples
("<script>alert('xss')</script>", "xss"),
("<img src=x onerror=alert(1)>", "xss"),
("javascript:alert(document.cookie)", "xss"),
("<svg onload=alert(1)>", "xss"),
("'\"><script>alert('XSS')</script>", "xss"),
# RCE examples
("; cat /etc/passwd", "rce"),
("| ls -la", "rce"),
("`id`", "rce"),
("$(whoami)", "rce"),
("; rm -rf /", "rce"),
("system('cat /etc/passwd')", "rce"),
# Path traversal
("../../../etc/passwd", "path_traversal"),
("..\\..\\..\\windows\\system32\\config\\sam", "path_traversal"),
("/etc/passwd%00", "path_traversal"),
("....//....//etc/passwd", "path_traversal"),
# Scanner signatures
("Mozilla/5.0 (compatible; Nmap Scripting Engine)", "scanner"),
("sqlmap/1.0", "scanner"),
("Nikto/2.1.5", "scanner"),
("masscan/1.0", "scanner"),
# Benign examples
("/api/users/123", "benign"),
("Mozilla/5.0 (Windows NT 10.0; Win64; x64)", "benign"),
("/products?category=electronics&page=2", "benign"),
("GET /index.html HTTP/1.1", "benign"),
("/static/css/main.css", "benign"),
]
def train(self) -> None:
"""Train the classifier on built-in examples."""
# Extract features for all training data
X: List[Dict[str, float]] = []
y: List[str] = []
for text, label in self._training_data:
features = self.feature_extractor.extract(text)
X.append(features)
y.append(label)
# Calculate class priors
label_counts = Counter(y)
total = len(y)
for label, count in label_counts.items():
self._class_priors[label] = count / total
# Calculate feature means and variances per class
all_features = set()
for features in X:
all_features.update(features.keys())
for label in self.LABELS:
class_features = [X[i] for i in range(len(X)) if y[i] == label]
if not class_features:
continue
for feature in all_features:
values = [f.get(feature, 0.0) for f in class_features]
mean = sum(values) / len(values)
var = sum((v - mean) ** 2 for v in values) / len(values)
self._feature_means[label][feature] = mean
self._feature_vars[label][feature] = max(var, 1e-6) # avoid division by zero
self._trained = True
def classify(self, text: str) -> ClassificationResult:
"""Classify text into attack category."""
if not self._trained:
self.train()
features = self.feature_extractor.extract(text)
# Calculate log probabilities for each class
log_probs: Dict[str, float] = {}
for label in self.LABELS:
if label not in self._class_priors:
continue
log_prob = math.log(self._class_priors[label])
for feature, value in features.items():
if feature in self._feature_means[label]:
mean = self._feature_means[label][feature]
var = self._feature_vars[label][feature]
# Gaussian likelihood
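# log N(x; mu, var) = -0.5*log(2*pi*var) - (x - mu)^2 / (2*var),
# summed per feature under the naive conditional-independence assumption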
log_prob += -0.5 * math.log(2 * math.pi * var)
log_prob += -0.5 * ((value - mean) ** 2) / var
log_probs[label] = log_prob
# Convert to probabilities via softmax
max_log_prob = max(log_probs.values()) if log_probs else 0
exp_probs = {k: math.exp(v - max_log_prob) for k, v in log_probs.items()}
total = sum(exp_probs.values())
probs = {k: v / total for k, v in exp_probs.items()}
# Find best label
best_label = max(probs, key=probs.get) if probs else "benign"
confidence = probs.get(best_label, 0.0)
# Generate explanation
explanation = self._generate_explanation(text, features, best_label)
return ClassificationResult(
label=best_label,
confidence=confidence,
probabilities=probs,
features_used=list(features.keys()),
explanation=explanation
)
def _generate_explanation(self, text: str, features: Dict[str, float], label: str) -> str:
"""Generate human-readable explanation for classification."""
reasons = []
if features.get("sqli_score", 0) > 0.3:
reasons.append("SQL injection patterns detected")
if features.get("xss_score", 0) > 0.3:
reasons.append("XSS patterns detected")
if features.get("rce_score", 0) > 0.3:
reasons.append("Command injection patterns detected")
if features.get("path_traversal_score", 0) > 0.3:
reasons.append("Path traversal patterns detected")
if features.get("special_char_ratio", 0) > 0.2:
reasons.append("High special character ratio")
if features.get("entropy", 0) > 0.7:
reasons.append("High entropy (possible encoding/obfuscation)")
if not reasons:
reasons.append(f"General pattern matching suggests {label}")
return "; ".join(reasons)
class AnomalyDetector:
"""
Detect anomalous requests based on baseline behavior.
Uses statistical methods (z-score, IQR) without requiring ML libraries.
"""
def __init__(self):
self.feature_extractor = FeatureExtractor()
self._baseline_stats: Dict[str, Dict[str, float]] = {}
self._observations: List[Dict[str, float]] = []
def add_observation(self, text: str) -> None:
"""Add an observation to the baseline."""
features = self.feature_extractor.extract(text)
self._observations.append(features)
# Recalculate baseline after enough observations
if len(self._observations) >= 10:
self._update_baseline()
def _update_baseline(self) -> None:
"""Update baseline statistics."""
if not self._observations:
return
all_features = set()
for obs in self._observations:
all_features.update(obs.keys())
for feature in all_features:
values = [obs.get(feature, 0.0) for obs in self._observations]
mean = sum(values) / len(values)
var = sum((v - mean) ** 2 for v in values) / len(values)
std = math.sqrt(var) if var > 0 else 0.001
self._baseline_stats[feature] = {
"mean": mean,
"std": std,
"min": min(values),
"max": max(values),
}
def score(self, text: str) -> AnomalyScore:
"""Score how anomalous a request is."""
features = self.feature_extractor.extract(text)
if not self._baseline_stats:
# No baseline yet, use heuristics
return self._heuristic_score(features)
z_scores: Dict[str, float] = {}
anomalous_features: List[str] = []
for feature, value in features.items():
if feature in self._baseline_stats:
stats = self._baseline_stats[feature]
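# z-score: deviation from the baseline mean, in standard deviations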
z = (value - stats["mean"]) / stats["std"]
z_scores[feature] = abs(z)
if abs(z) > 2: # More than 2 std deviations
anomalous_features.append(f"{feature} (z={z:.2f})")
# Overall anomaly score: normalized max z-score; the mean z-score is reported as baseline_deviation
if z_scores:
avg_z = sum(z_scores.values()) / len(z_scores)
max_z = max(z_scores.values())
score = min(max_z / 5, 1.0) # Normalize to 0-1
baseline_deviation = avg_z
else:
score = 0.5
baseline_deviation = 0.0
# Generate recommendation
if score > 0.8:
recommendation = "BLOCK: Highly anomalous, likely attack"
elif score > 0.5:
recommendation = "CHALLENGE: Moderately anomalous, requires verification"
elif score > 0.3:
recommendation = "LOG: Slightly unusual, monitor closely"
else:
recommendation = "ALLOW: Within normal parameters"
return AnomalyScore(
score=score,
baseline_deviation=baseline_deviation,
anomalous_features=anomalous_features,
recommendation=recommendation
)
def _heuristic_score(self, features: Dict[str, float]) -> AnomalyScore:
"""Score based on heuristics when no baseline exists."""
score = 0.0
anomalous_features: List[str] = []
# Check for attack indicators
for attack_type in ["sqli_score", "xss_score", "rce_score", "path_traversal_score"]:
if features.get(attack_type, 0) > 0.3:
score += 0.25
anomalous_features.append(attack_type)
# Check for suspicious characteristics
if features.get("special_char_ratio", 0) > 0.15:
score += 0.15
anomalous_features.append("high_special_chars")
if features.get("entropy", 0) > 0.8:
score += 0.1
anomalous_features.append("high_entropy")
score = min(score, 1.0)
if score > 0.7:
recommendation = "BLOCK: Multiple attack indicators"
elif score > 0.4:
recommendation = "CHALLENGE: Suspicious characteristics"
else:
recommendation = "ALLOW: No obvious threats"
return AnomalyScore(
score=score,
baseline_deviation=0.0,
anomalous_features=anomalous_features,
recommendation=recommendation
)
class ThreatClassifier:
"""
High-level threat classifier combining multiple techniques.
Usage:
classifier = ThreatClassifier()
result = classifier.classify("' OR '1'='1")
print(f"Label: {result.label}, Confidence: {result.confidence}")
"""
def __init__(self, model_path: Optional[Path] = None):
self.naive_bayes = NaiveBayesClassifier()
self.anomaly_detector = AnomalyDetector()
self.model_path = model_path
# Train on startup
self.naive_bayes.train()
def classify(self, text: str) -> ClassificationResult:
"""Classify a request/pattern."""
return self.naive_bayes.classify(text)
def score_anomaly(self, text: str) -> AnomalyScore:
"""Score how anomalous a request is."""
return self.anomaly_detector.score(text)
def analyze(self, text: str) -> Dict[str, Any]:
"""Full analysis combining classification and anomaly detection."""
classification = self.classify(text)
anomaly = self.score_anomaly(text)
return {
"classification": {
"label": classification.label,
"confidence": classification.confidence,
"probabilities": classification.probabilities,
"explanation": classification.explanation,
},
"anomaly": {
"score": anomaly.score,
"baseline_deviation": anomaly.baseline_deviation,
"anomalous_features": anomaly.anomalous_features,
"recommendation": anomaly.recommendation,
},
"risk_level": self._compute_risk_level(classification, anomaly),
}
def _compute_risk_level(
self,
classification: ClassificationResult,
anomaly: AnomalyScore
) -> str:
"""Compute overall risk level."""
# High-risk attack types
high_risk_labels = {"sqli", "xss", "rce"}
if classification.label in high_risk_labels and classification.confidence > 0.7:
return "critical"
if classification.label in high_risk_labels and classification.confidence > 0.4:
return "high"
if anomaly.score > 0.7:
return "high"
if classification.label == "scanner":
return "medium"
if anomaly.score > 0.4:
return "medium"
return "low"
# CLI for testing
if __name__ == "__main__":
import sys
classifier = ThreatClassifier()
test_inputs = [
"' OR '1'='1",
"<script>alert('xss')</script>",
"; cat /etc/passwd",
"../../../etc/passwd",
"Mozilla/5.0 (Windows NT 10.0)",
"/api/users/123",
]
if len(sys.argv) > 1:
test_inputs = sys.argv[1:]
print("\n🤖 ML Threat Classifier Test")
print("=" * 60)
for text in test_inputs:
result = classifier.analyze(text)
print(f"\nInput: {text[:50]}...")
print(f" Label: {result['classification']['label']}")
print(f" Confidence: {result['classification']['confidence']:.2%}")
print(f" Risk Level: {result['risk_level'].upper()}")
print(f" Anomaly Score: {result['anomaly']['score']:.2%}")
print(f" Recommendation: {result['anomaly']['recommendation']}")

mcp/waf_intelligence/compliance.py

@@ -0,0 +1,83 @@
from __future__ import annotations
from dataclasses import dataclass
from typing import Dict, List, Optional
@dataclass
class FrameworkMapping:
"""
Mapping between a WAF concept (e.g. 'SQLi protection') and references
in one or more compliance frameworks.
"""
control_id: str
framework: str # e.g. "PCI-DSS", "NIST-800-53", "GDPR"
description: str
references: List[str]
class ComplianceMapper:
"""
Map WAF rules / violations to compliance frameworks.
This starts as a simple static lookup table that we can extend over time.
"""
def __init__(self) -> None:
self._mappings: Dict[str, List[FrameworkMapping]] = self._build_default_mappings()
def _build_default_mappings(self) -> Dict[str, List[FrameworkMapping]]:
return {
"sqli_protection": [
FrameworkMapping(
control_id="6.6",
framework="PCI-DSS",
description="Ensure web-facing applications are protected against attacks such as SQL injection.",
references=["PCI-DSS v4.0 6.6", "OWASP Top 10 - A03:2021"],
)
],
"xss_protection": [
FrameworkMapping(
control_id="A5",
framework="OWASP-ASVS",
description="Verify that all user-controllable input is properly encoded or escaped.",
references=["OWASP Top 10 - A3: Cross-Site Scripting"],
)
],
"baseline_waf": [
FrameworkMapping(
control_id="13",
framework="OWASP-ASVS",
description="Centralized input validation, filtering, and WAF as compensating control.",
references=["OWASP-ASVS 13", "PCI-DSS 6.4.1"],
)
],
}
def map_concept(self, concept: str) -> List[FrameworkMapping]:
"""
Map a high-level WAF concept to compliance controls.
Example concepts:
- "sqli_protection"
- "xss_protection"
- "baseline_waf"
"""
return self._mappings.get(concept, [])
def best_effort_from_violation(self, message: str) -> List[FrameworkMapping]:
"""
Try to infer framework mappings from a violation message string.
This allows the analyzer to stay dumb while still attaching controls.
"""
msg = message.lower()
if "sql" in msg and "inject" in msg:
return self.map_concept("sqli_protection")
if "xss" in msg or "cross-site scripting" in msg:
return self.map_concept("xss_protection")
if "waf" in msg or "managed rules" in msg:
return self.map_concept("baseline_waf")
return []
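
A mapping sketch driven by a violation message (illustrative):

# Usage sketch (illustrative):
mapper = ComplianceMapper()
for m in mapper.best_effort_from_violation("No managed WAF rules detected in this file."):
    print(m.framework, m.control_id, "-", m.description)
# "waf" in the message maps to baseline_waf -> OWASP-ASVS 13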

mcp/waf_intelligence/generator.py

@@ -0,0 +1,120 @@
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Dict, List, Optional
@dataclass
class GeneratedRule:
"""Represents a Terraform WAF rule we propose to add."""
name: str
description: str
terraform_snippet: str
severity: str # "low" | "medium" | "high" | "critical"
tags: List[str] = field(default_factory=list)
notes: Optional[str] = None
impact_score: float = 0.5 # 0-1: estimated security impact
effort_score: float = 0.5 # 0-1: estimated effort to implement
class WAFRuleGenerator:
"""
Generate Cloudflare WAF Terraform rules with a quality-first strategy.
"""
def generate_from_scenario(
self,
scenario: str,
*,
limit: int = 3,
max_effort: float = 0.8,
) -> List[GeneratedRule]:
"""
Return a small set of high-impact, reasonable-effort rules.
"""
scenario_lower = scenario.lower()
candidates: List[GeneratedRule] = []
if "sql injection" in scenario_lower or "sqli" in scenario_lower:
candidates.append(self._sql_injection_rule())
if "xss" in scenario_lower:
candidates.append(self._xss_rule())
# If nothing matched, fallback to baseline
if not candidates:
candidates.append(self._baseline_waf_rule())
# Filter by effort & sort by impact
filtered = [r for r in candidates if r.effort_score <= max_effort]
if not filtered:
filtered = candidates
filtered.sort(key=lambda r: (-r.impact_score, r.effort_score))
return filtered[:limit]
def _sql_injection_rule(self) -> GeneratedRule:
snippet = '''resource "cloudflare_ruleset" "waf_sqli_protection" {
# TODO: adjust zone_id / account_id and phase for your setup
name = "WAF - SQLi protection"
kind = "zone"
phase = "http_request_firewall_managed"
rules = [{
action = "block"
expression = "(cf.waf.ruleset eq \\"sqli\\")"
enabled = true
}]
}
'''
return GeneratedRule(
name="waf_sqli_protection",
description="Enable blocking against SQL injection attempts using Cloudflare managed rules.",
terraform_snippet=snippet,
severity="high",
tags=["sqli", "managed_rules", "waf"],
impact_score=0.95,
effort_score=0.3,
)
def _xss_rule(self) -> GeneratedRule:
snippet = '''resource "cloudflare_ruleset" "waf_xss_protection" {
name = "WAF - XSS protection"
kind = "zone"
phase = "http_request_firewall_managed"
rules = [{
action = "block"
expression = "(cf.waf.ruleset eq \\"xss\\")"
enabled = true
}]
}
'''
return GeneratedRule(
name="waf_xss_protection",
description="Enable blocking against cross-site scripting (XSS) attacks.",
terraform_snippet=snippet,
severity="high",
tags=["xss", "managed_rules", "waf"],
impact_score=0.9,
effort_score=0.3,
)
def _baseline_waf_rule(self) -> GeneratedRule:
snippet = '''# Baseline WAF hardening (placeholder - customize for your environment)
# Consider enabling Cloudflare managed WAF rulesets for:
# - SQLi
# - XSS
# - RCE
# - Bot protection
'''
return GeneratedRule(
name="waf_baseline_hardening",
description="Baseline recommendation to enable managed WAF rulesets.",
terraform_snippet=snippet,
severity="medium",
tags=["baseline", "waf"],
impact_score=0.7,
effort_score=0.1,
)
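
A generation sketch (illustrative; the scenario string is an example):

# Usage sketch (illustrative):
generator = WAFRuleGenerator()
rules = generator.generate_from_scenario("Spike in SQL injection probes", limit=2)
for rule in rules:
    print(rule.name, rule.severity, f"impact={rule.impact_score:.0%}")
print(rules[0].terraform_snippet)  # review before committing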

mcp/waf_intelligence/orchestrator.py

@@ -0,0 +1,370 @@
from __future__ import annotations
import os
from dataclasses import dataclass, field
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List, Optional
from mcp.waf_intelligence.analyzer import AnalysisResult, RuleViolation, WAFRuleAnalyzer
from mcp.waf_intelligence.compliance import ComplianceMapper, FrameworkMapping
from mcp.waf_intelligence.generator import GeneratedRule, WAFRuleGenerator
# Optional advanced modules (Phase 7)
try:
from mcp.waf_intelligence.threat_intel import (
ThreatIntelCollector,
ThreatIntelReport,
ThreatIndicator,
)
_HAS_THREAT_INTEL = True
except ImportError:
_HAS_THREAT_INTEL = False
ThreatIntelCollector = None
try:
from mcp.waf_intelligence.classifier import (
ThreatClassifier,
ClassificationResult,
)
_HAS_CLASSIFIER = True
except ImportError:
_HAS_CLASSIFIER = False
ThreatClassifier = None
@dataclass
class WAFInsight:
"""Single high-quality insight across analysis + generation + compliance."""
violation: RuleViolation | None
suggested_rule: GeneratedRule | None
mappings: List[FrameworkMapping]
@dataclass
class ThreatAssessment:
"""Phase 7: Comprehensive threat assessment result."""
analysis_result: Optional[AnalysisResult] = None
threat_report: Optional[Any] = None # ThreatIntelReport when available
classification_summary: Dict[str, int] = field(default_factory=dict)
risk_score: float = 0.0
recommended_actions: List[str] = field(default_factory=list)
generated_at: datetime = field(default_factory=datetime.utcnow)
@property
def risk_level(self) -> str:
if self.risk_score >= 0.8:
return "critical"
elif self.risk_score >= 0.6:
return "high"
elif self.risk_score >= 0.4:
return "medium"
else:
return "low"
class WAFIntelligence:
"""
Quality-first orchestration layer:
- analyze WAF config
- propose a few rules
- attach compliance mappings
- Phase 7: integrate threat intel and ML classification
"""
def __init__(
self,
workspace_path: Optional[str] = None,
enable_threat_intel: bool = True,
enable_ml_classifier: bool = True,
) -> None:
self.workspace = Path(workspace_path) if workspace_path else Path.cwd()
# Core components
self.analyzer = WAFRuleAnalyzer()
self.generator = WAFRuleGenerator()
self.mapper = ComplianceMapper()
# Phase 7 components (optional)
self.threat_intel: Optional[Any] = None
self.classifier: Optional[Any] = None
if enable_threat_intel and _HAS_THREAT_INTEL:
try:
self.threat_intel = ThreatIntelCollector()
except Exception:
pass
if enable_ml_classifier and _HAS_CLASSIFIER:
try:
self.classifier = ThreatClassifier()
except Exception:
pass
def analyze_and_recommend(
self,
path: str,
*,
limit: int = 3,
min_severity: str = "warning",
) -> List[WAFInsight]:
analysis: AnalysisResult = self.analyzer.analyze_file(
path,
min_severity=min_severity,
)
top_violations = analysis.top_violations(
min_severity=min_severity,
limit=limit,
)
insights: List[WAFInsight] = []
for violation in top_violations:
mappings = self.mapper.best_effort_from_violation(violation.message)
scenario = violation.message
rules = self.generator.generate_from_scenario(scenario, limit=1)
suggested = rules[0] if rules else None
insights.append(
WAFInsight(
violation=violation,
suggested_rule=suggested,
mappings=mappings,
)
)
return insights
# ─────────────────────────────────────────────────────────────────────────
# Phase 7: Advanced threat intelligence methods
# ─────────────────────────────────────────────────────────────────────────
def collect_threat_intel(
self,
log_paths: Optional[List[str]] = None,
max_indicators: int = 100,
) -> Optional[Any]:
"""
Collect threat intelligence from logs and external feeds.
Args:
log_paths: Directories to scan for Cloudflare logs
max_indicators: Maximum indicators to collect
Returns:
ThreatIntelReport or None if unavailable
"""
if not self.threat_intel:
return None
# Default log paths
if log_paths is None:
log_paths = [
str(self.workspace / "logs"),
"/var/log/cloudflare",
]
return self.threat_intel.collect(
log_dirs=log_paths,  # ThreatIntelCollector.collect expects log_dirs
max_indicators=max_indicators,
)
def classify_threat(self, payload: str) -> Optional[Any]:
"""
Classify a payload using ML classifier.
Args:
payload: Request payload to classify
Returns:
ClassificationResult or None
"""
if not self.classifier:
return None
return self.classifier.classify(payload)
def full_assessment(
self,
waf_config_path: Optional[str] = None,
log_paths: Optional[List[str]] = None,
include_threat_intel: bool = True,
) -> ThreatAssessment:
"""
Phase 7: Perform comprehensive threat assessment.
Combines:
- WAF configuration analysis
- Threat intelligence collection
- ML classification summary
- Risk scoring
Args:
waf_config_path: Path to WAF Terraform file
log_paths: Paths to log files
include_threat_intel: Whether to collect threat intel
Returns:
ThreatAssessment with full analysis results
"""
assessment = ThreatAssessment()
risk_factors: List[float] = []
recommendations: List[str] = []
# 1. Analyze WAF configuration
if waf_config_path is None:
waf_config_path = str(self.workspace / "terraform" / "waf.tf")
if Path(waf_config_path).exists():
assessment.analysis_result = self.analyzer.analyze_file(
waf_config_path,
min_severity="info",
)
# Calculate risk from violations
severity_weights = {"error": 0.8, "warning": 0.5, "info": 0.2}
for violation in assessment.analysis_result.violations:
weight = severity_weights.get(violation.severity, 0.3)
risk_factors.append(weight)
# Generate recommendations
critical_count = sum(
1 for v in assessment.analysis_result.violations
if v.severity == "error"
)
if critical_count > 0:
recommendations.append(
f"🔴 Fix {critical_count} critical WAF configuration issues"
)
# 2. Collect threat intelligence
if include_threat_intel and self.threat_intel:
try:
assessment.threat_report = self.collect_threat_intel(
log_paths=log_paths,
max_indicators=50,
)
if assessment.threat_report:
indicators = assessment.threat_report.indicators
# Count by severity
severity_counts = {"critical": 0, "high": 0, "medium": 0, "low": 0}
for ind in indicators:
sev = getattr(ind, "severity", "low")
severity_counts[sev] = severity_counts.get(sev, 0) + 1
# Add to classification summary
assessment.classification_summary["threat_indicators"] = len(indicators)
assessment.classification_summary.update(severity_counts)
# Calculate threat intel risk
if indicators:
critical_ratio = severity_counts["critical"] / len(indicators)
high_ratio = severity_counts["high"] / len(indicators)
risk_factors.append(critical_ratio * 0.9 + high_ratio * 0.7)
if severity_counts["critical"] > 0:
recommendations.append(
f"🚨 Block {severity_counts['critical']} critical threat IPs immediately"
)
except Exception:
pass
# 3. ML classification summary (from any collected data)
if self.classifier and assessment.threat_report:
try:
attack_types = {"sqli": 0, "xss": 0, "rce": 0, "path_traversal": 0, "scanner": 0, "benign": 0}
indicators = assessment.threat_report.indicators
pattern_indicators = [
i for i in indicators
if getattr(i, "indicator_type", "") == "pattern"
]
for ind in pattern_indicators[:20]: # Sample first 20
result = self.classifier.classify(ind.value)
if result:
label = result.label
attack_types[label] = attack_types.get(label, 0) + 1
assessment.classification_summary["ml_classifications"] = attack_types
# Add ML risk factor
dangerous = attack_types.get("sqli", 0) + attack_types.get("rce", 0)
if dangerous > 5:
risk_factors.append(0.8)
recommendations.append(
f"⚠️ ML detected {dangerous} dangerous attack patterns"
)
except Exception:
pass
# 4. Calculate final risk score
if risk_factors:
assessment.risk_score = min(1.0, sum(risk_factors) / max(len(risk_factors), 1))
else:
assessment.risk_score = 0.3 # Baseline risk
assessment.recommended_actions = recommendations
return assessment
def generate_gitops_proposals(
self,
threat_report: Optional[Any] = None,
max_proposals: int = 5,
) -> List[Dict[str, Any]]:
"""
Generate GitOps-ready rule proposals.
Args:
threat_report: ThreatIntelReport to use
max_proposals: Maximum proposals to generate
Returns:
List of proposal dicts ready for MR creation
"""
proposals: List[Dict[str, Any]] = []
if not threat_report:
return proposals
try:
# Import proposer dynamically
from gitops.waf_rule_proposer import WAFRuleProposer
proposer = WAFRuleProposer(workspace_path=str(self.workspace))
batch = proposer.generate_proposals(
threat_report=threat_report,
max_proposals=max_proposals,
)
for proposal in batch.proposals:
proposals.append({
"name": proposal.rule_name,
"type": proposal.rule_type,
"severity": proposal.severity,
"confidence": proposal.confidence,
"terraform": proposal.terraform_code,
"justification": proposal.justification,
"auto_deploy": proposal.auto_deploy_eligible,
})
except ImportError:
pass
return proposals
@property
def capabilities(self) -> Dict[str, bool]:
"""Report available capabilities."""
return {
"core_analysis": True,
"rule_generation": True,
"compliance_mapping": True,
"threat_intel": self.threat_intel is not None,
"ml_classification": self.classifier is not None,
}
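
An orchestration sketch (illustrative; the workspace path is an example):

# Usage sketch (illustrative):
intel = WAFIntelligence(workspace_path="/path/to/cloudflare")
print(intel.capabilities)  # which optional Phase 7 components loaded
assessment = intel.full_assessment()  # defaults to <workspace>/terraform/waf.tf
print(assessment.risk_level, f"{assessment.risk_score:.2f}")
for action in assessment.recommended_actions:
    print("-", action)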

mcp/waf_intelligence/server.py (Executable file)

@@ -0,0 +1,279 @@
#!/usr/bin/env python3
"""
WAF Intelligence MCP Server for VS Code Copilot.
This implements the Model Context Protocol (MCP) stdio interface
so VS Code can communicate with your WAF Intelligence system.
"""
import json
import sys
from dataclasses import asdict
from pathlib import Path
from typing import Any
# Make the repository root importable regardless of where the checkout lives
sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
from mcp.waf_intelligence.orchestrator import WAFIntelligence
from mcp.waf_intelligence.analyzer import WAFRuleAnalyzer
class WAFIntelligenceMCPServer:
"""MCP Server wrapper for WAF Intelligence."""
def __init__(self):
self.waf = WAFIntelligence()
self.analyzer = WAFRuleAnalyzer()
def get_capabilities(self) -> dict:
"""Return server capabilities."""
return {
"tools": [
{
"name": "waf_analyze",
"description": "Analyze WAF logs and detect attack patterns",
"inputSchema": {
"type": "object",
"properties": {
"log_file": {
"type": "string",
"description": "Path to WAF log file (optional)"
},
"zone_id": {
"type": "string",
"description": "Cloudflare zone ID (optional)"
}
}
}
},
{
"name": "waf_assess",
"description": "Run full security assessment with threat intel and ML classification",
"inputSchema": {
"type": "object",
"properties": {
"zone_id": {
"type": "string",
"description": "Cloudflare zone ID"
}
},
"required": ["zone_id"]
}
},
{
"name": "waf_generate_rules",
"description": "Generate Terraform WAF rules from threat intelligence",
"inputSchema": {
"type": "object",
"properties": {
"zone_id": {
"type": "string",
"description": "Cloudflare zone ID"
},
"min_confidence": {
"type": "number",
"description": "Minimum confidence threshold (0-1)",
"default": 0.7
}
},
"required": ["zone_id"]
}
},
{
"name": "waf_capabilities",
"description": "List available WAF Intelligence capabilities",
"inputSchema": {
"type": "object",
"properties": {}
}
}
]
}
def handle_tool_call(self, name: str, arguments: dict) -> dict:
"""Handle a tool invocation."""
try:
if name == "waf_capabilities":
return {
"content": [
{
"type": "text",
"text": json.dumps({
"capabilities": self.waf.capabilities,
"status": "operational"
}, indent=2)
}
]
}
elif name == "waf_analyze":
log_file = arguments.get("log_file")
zone_id = arguments.get("zone_id")
if log_file:
# WAFRuleAnalyzer exposes analyze_file; convert the dataclass for JSON output
result = asdict(self.analyzer.analyze_file(log_file))
else:
result = {
"message": "No log file provided. Use zone_id for live analysis.",
"capabilities": self.waf.capabilities
}
return {
"content": [
{"type": "text", "text": json.dumps(result, indent=2, default=str)}
]
}
elif name == "waf_assess":
zone_id = arguments.get("zone_id")
# full_assessment uses workspace paths, not zone_id
assessment = self.waf.full_assessment(
include_threat_intel=True
)
# Build result from ThreatAssessment dataclass
result = {
"zone_id": zone_id,
"risk_score": assessment.risk_score,
"risk_level": assessment.risk_level,
"classification_summary": assessment.classification_summary,
"recommended_actions": assessment.recommended_actions[:10], # Top 10
"has_analysis": assessment.analysis_result is not None,
"has_threat_intel": assessment.threat_report is not None,
"generated_at": str(assessment.generated_at)
}
return {
"content": [
{"type": "text", "text": json.dumps(result, indent=2, default=str)}
]
}
elif name == "waf_generate_rules":
zone_id = arguments.get("zone_id")
min_confidence = arguments.get("min_confidence", 0.7)
# Generate proposals (doesn't use zone_id directly)
proposals = self.waf.generate_gitops_proposals(
max_proposals=5
)
result = {
"zone_id": zone_id,
"min_confidence": min_confidence,
"proposals_count": len(proposals),
"proposals": proposals
}
return {
"content": [
{"type": "text", "text": json.dumps(result, indent=2, default=str) if proposals else "No rules generated (no threat data available)"}
]
}
else:
return {
"content": [
{"type": "text", "text": f"Unknown tool: {name}"}
],
"isError": True
}
except Exception as e:
return {
"content": [
{"type": "text", "text": f"Error: {str(e)}"}
],
"isError": True
}
def run(self):
"""Run the MCP server (stdio mode)."""
# Per MCP, the server sends nothing unsolicited: it waits for the
# client's "initialize" request and replies over stdout.
# Main loop - read JSON-RPC messages from stdin
for line in sys.stdin:
try:
message = json.loads(line.strip())
if message.get("method") == "initialize":
response = {
"jsonrpc": "2.0",
"id": message.get("id"),
"result": {
"protocolVersion": "2024-11-05",
"serverInfo": {
"name": "waf-intelligence",
"version": "1.0.0"
},
"capabilities": {
"tools": {}
}
}
}
print(json.dumps(response), flush=True)
elif message.get("method") == "tools/list":
response = {
"jsonrpc": "2.0",
"id": message.get("id"),
"result": self.get_capabilities()
}
print(json.dumps(response), flush=True)
elif message.get("method") == "tools/call":
params = message.get("params", {})
tool_name = params.get("name")
tool_args = params.get("arguments", {})
result = self.handle_tool_call(tool_name, tool_args)
response = {
"jsonrpc": "2.0",
"id": message.get("id"),
"result": result
}
print(json.dumps(response), flush=True)
elif message.get("method") == "notifications/initialized":
# Client acknowledged initialization
pass
else:
# Unknown method
response = {
"jsonrpc": "2.0",
"id": message.get("id"),
"error": {
"code": -32601,
"message": f"Method not found: {message.get('method')}"
}
}
print(json.dumps(response), flush=True)
except json.JSONDecodeError:
continue
except Exception as e:
error_response = {
"jsonrpc": "2.0",
"id": None,
"error": {
"code": -32603,
"message": str(e)
}
}
print(json.dumps(error_response), flush=True)
if __name__ == "__main__":
server = WAFIntelligenceMCPServer()
server.run()
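
A hand-driven smoke test of the stdio loop (illustrative; assumes the server is launched from the repository root):

# Smoke-test sketch (illustrative):
import json
import subprocess

proc = subprocess.Popen(
    ["python", "mcp/waf_intelligence/server.py"],
    stdin=subprocess.PIPE, stdout=subprocess.PIPE, text=True,
)
request = {"jsonrpc": "2.0", "id": 1, "method": "tools/list"}
proc.stdin.write(json.dumps(request) + "\n")
proc.stdin.flush()
print(proc.stdout.readline())  # JSON-RPC response listing the four tools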

mcp/waf_intelligence/threat_intel.py

@@ -0,0 +1,445 @@
"""
Phase 7: Multi-Source Threat Intelligence Collector
Aggregates threat data from:
- Cloudflare Analytics API (WAF events, firewall logs)
- External threat feeds (AbuseIPDB, Emerging Threats, etc.)
- Local honeypot signals (if configured)
- Historical attack patterns from receipts/logs
Produces scored ThreatIndicators for ML classification and rule generation.
"""
from __future__ import annotations
import hashlib
import json
import os
import re
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from pathlib import Path
from typing import Any, Dict, List, Optional, Set
from urllib.parse import urlparse
# Optional: requests for external API calls
try:
import requests
HAS_REQUESTS = True
except ImportError:
HAS_REQUESTS = False
@dataclass
class ThreatIndicator:
"""Single threat indicator with scoring metadata."""
indicator_type: str # "ip", "ua", "path", "pattern", "country"
value: str
confidence: float # 0.0-1.0
severity: str # "low", "medium", "high", "critical"
sources: List[str] = field(default_factory=list)
tags: List[str] = field(default_factory=list)
first_seen: Optional[datetime] = None
last_seen: Optional[datetime] = None
hit_count: int = 1
context: Dict[str, Any] = field(default_factory=dict)
@property
def fingerprint(self) -> str:
"""Unique identifier for deduplication."""
raw = f"{self.indicator_type}:{self.value}"
return hashlib.sha256(raw.encode()).hexdigest()[:16]
def merge(self, other: "ThreatIndicator") -> None:
"""Merge another indicator into this one (for deduplication)."""
self.hit_count += other.hit_count
self.confidence = max(self.confidence, other.confidence)
self.sources = list(set(self.sources + other.sources))
self.tags = list(set(self.tags + other.tags))
if other.first_seen and (not self.first_seen or other.first_seen < self.first_seen):
self.first_seen = other.first_seen
if other.last_seen and (not self.last_seen or other.last_seen > self.last_seen):
self.last_seen = other.last_seen
@dataclass
class ThreatIntelReport:
"""Aggregated threat intelligence from all sources."""
indicators: List[ThreatIndicator] = field(default_factory=list)
sources_queried: List[str] = field(default_factory=list)
collection_time: datetime = field(default_factory=datetime.utcnow)
metadata: Dict[str, Any] = field(default_factory=dict)
@property
def critical_count(self) -> int:
return sum(1 for i in self.indicators if i.severity == "critical")
@property
def high_count(self) -> int:
return sum(1 for i in self.indicators if i.severity == "high")
def top_indicators(self, limit: int = 10) -> List[ThreatIndicator]:
"""Return highest-priority indicators."""
severity_order = {"critical": 4, "high": 3, "medium": 2, "low": 1}
sorted_indicators = sorted(
self.indicators,
key=lambda x: (severity_order.get(x.severity, 0), x.confidence, x.hit_count),
reverse=True
)
return sorted_indicators[:limit]
class CloudflareLogParser:
"""Parse Cloudflare WAF/firewall logs for threat indicators."""
# Common attack patterns in URIs
ATTACK_PATTERNS = [
(r"(?i)(?:union\s+select|select\s+.*\s+from)", "sqli", "high"),
(r"(?i)<script[^>]*>", "xss", "high"),
(r"(?i)(?:\.\./|\.\.\\)", "path_traversal", "medium"),
(r"(?i)(?:cmd=|exec=|system\()", "rce", "critical"),
(r"(?i)(?:wp-admin|wp-login|xmlrpc\.php)", "wordpress_probe", "low"),
(r"(?i)(?:\.env|\.git|\.htaccess)", "sensitive_file", "medium"),
(r"(?i)(?:phpmyadmin|adminer|mysql)", "db_probe", "medium"),
(r"(?i)(?:eval\(|base64_decode)", "code_injection", "high"),
]
# Known bad user agents
BAD_USER_AGENTS = [
("sqlmap", "sqli_tool", "high"),
("nikto", "scanner", "medium"),
("nmap", "scanner", "medium"),
("masscan", "scanner", "medium"),
("zgrab", "scanner", "low"),
("python-requests", "bot", "low"), # contextual
("curl", "bot", "low"), # contextual
]
def parse_log_file(self, path: Path) -> List[ThreatIndicator]:
"""Parse a log file and extract threat indicators."""
indicators: List[ThreatIndicator] = []
if not path.exists():
return indicators
try:
with open(path, "r", encoding="utf-8", errors="ignore") as f:
for line in f:
indicators.extend(self._parse_log_line(line))
except Exception:
pass
return indicators
def _parse_log_line(self, line: str) -> List[ThreatIndicator]:
"""Extract indicators from a single log line."""
indicators: List[ThreatIndicator] = []
# Try JSON format first
try:
data = json.loads(line)
indicators.extend(self._parse_json_log(data))
return indicators
except json.JSONDecodeError:
pass
# Fall back to pattern matching on raw line
indicators.extend(self._scan_for_patterns(line))
return indicators
def _parse_json_log(self, data: Dict[str, Any]) -> List[ThreatIndicator]:
"""Parse structured JSON log entry."""
indicators: List[ThreatIndicator] = []
# Extract IP if blocked or challenged
action = data.get("action", "").lower()
if action in ("block", "challenge", "managed_challenge"):
ip = data.get("clientIP") or data.get("client_ip") or data.get("ip")
if ip:
indicators.append(ThreatIndicator(
indicator_type="ip",
value=ip,
confidence=0.8 if action == "block" else 0.6,
severity="high" if action == "block" else "medium",
sources=["cloudflare_log"],
tags=[action, data.get("ruleId", "unknown_rule")],
context={"rule": data.get("ruleName", ""), "action": action}
))
# Extract URI patterns
uri = data.get("clientRequestURI") or data.get("uri") or data.get("path", "")
if uri:
indicators.extend(self._scan_for_patterns(uri))
# Extract user agent
ua = data.get("clientRequestUserAgent") or data.get("user_agent", "")  # user-agent field, not the Host header
if ua:
for pattern, tag, severity in self.BAD_USER_AGENTS:
if pattern.lower() in ua.lower():
indicators.append(ThreatIndicator(
indicator_type="ua",
value=ua[:200], # truncate
confidence=0.7,
severity=severity,
sources=["cloudflare_log"],
tags=[tag, "bad_ua"]
))
break
return indicators
def _scan_for_patterns(self, text: str) -> List[ThreatIndicator]:
"""Scan text for known attack patterns."""
indicators: List[ThreatIndicator] = []
for pattern, tag, severity in self.ATTACK_PATTERNS:
if re.search(pattern, text):
indicators.append(ThreatIndicator(
indicator_type="pattern",
value=text[:500], # truncate
confidence=0.75,
severity=severity,
sources=["pattern_match"],
tags=[tag, "attack_pattern"]
))
return indicators
class ExternalThreatFeed:
"""Fetch threat intelligence from external APIs."""
def __init__(self, api_keys: Optional[Dict[str, str]] = None):
self.api_keys = api_keys or {}
self._cache: Dict[str, ThreatIndicator] = {}
def query_abuseipdb(self, ip: str) -> Optional[ThreatIndicator]:
"""Query AbuseIPDB for IP reputation."""
if not HAS_REQUESTS:
return None
api_key = self.api_keys.get("abuseipdb") or os.getenv("ABUSEIPDB_API_KEY")
if not api_key:
return None
cache_key = f"abuseipdb:{ip}"
if cache_key in self._cache:
return self._cache[cache_key]
try:
resp = requests.get(
"https://api.abuseipdb.com/api/v2/check",
headers={"Key": api_key, "Accept": "application/json"},
params={"ipAddress": ip, "maxAgeInDays": 90},
timeout=5
)
if resp.status_code == 200:
data = resp.json().get("data", {})
abuse_score = data.get("abuseConfidenceScore", 0)
if abuse_score > 0:
severity = "critical" if abuse_score > 80 else "high" if abuse_score > 50 else "medium"
indicator = ThreatIndicator(
indicator_type="ip",
value=ip,
confidence=abuse_score / 100,
severity=severity,
sources=["abuseipdb"],
tags=["external_intel", "ip_reputation"],
hit_count=data.get("totalReports", 1),
context={
"abuse_score": abuse_score,
"country": data.get("countryCode"),
"isp": data.get("isp"),
"domain": data.get("domain"),
"usage_type": data.get("usageType"),
}
)
self._cache[cache_key] = indicator
return indicator
except Exception:
pass
return None
def query_emerging_threats(self, ip: str) -> Optional[ThreatIndicator]:
"""Check IP against Emerging Threats blocklist (free, no API key)."""
if not HAS_REQUESTS:
return None
# This is a simplified check - real implementation would cache the blocklist
# For demo purposes, we return None and rely on other sources
return None
def enrich_indicator(self, indicator: ThreatIndicator) -> ThreatIndicator:
"""Enrich an indicator with external intelligence."""
if indicator.indicator_type == "ip":
external = self.query_abuseipdb(indicator.value)
if external:
indicator.merge(external)
return indicator
class ThreatIntelCollector:
"""
Main collector that aggregates from all sources.
Usage:
collector = ThreatIntelCollector(workspace_path="/path/to/cloudflare")
report = collector.collect()
for indicator in report.top_indicators(10):
print(f"{indicator.severity}: {indicator.indicator_type}={indicator.value}")
"""
def __init__(
self,
workspace_path: Optional[str] = None,
api_keys: Optional[Dict[str, str]] = None,
enable_external: bool = True
):
self.workspace = Path(workspace_path) if workspace_path else Path.cwd()
self.log_parser = CloudflareLogParser()
self.external_feed = ExternalThreatFeed(api_keys) if enable_external else None
self._indicators: Dict[str, ThreatIndicator] = {}
def collect(
self,
log_dirs: Optional[List[str]] = None,
enrich_external: bool = True,
max_indicators: int = 1000
) -> ThreatIntelReport:
"""
Collect threat intelligence from all configured sources.
Args:
log_dirs: Directories to scan for logs (default: observatory/, anomalies/)
enrich_external: Whether to query external APIs for enrichment
max_indicators: Maximum indicators to return
Returns:
ThreatIntelReport with deduplicated, scored indicators
"""
sources_queried: List[str] = []
# Default log directories
if log_dirs is None:
log_dirs = ["observatory", "anomalies", "archive_runtime/receipts"]
# Collect from local logs
for log_dir in log_dirs:
dir_path = self.workspace / log_dir
if dir_path.exists():
sources_queried.append(f"local:{log_dir}")
self._collect_from_directory(dir_path)
# Collect from Terraform state (extract referenced IPs/patterns)
tf_path = self.workspace / "terraform"
if tf_path.exists():
sources_queried.append("terraform_state")
self._collect_from_terraform(tf_path)
# Enrich with external intel if enabled
if enrich_external and self.external_feed:
sources_queried.append("external_apis")
self._enrich_all_indicators()
# Build report
all_indicators = list(self._indicators.values())
# Sort by priority and truncate
severity_order = {"critical": 4, "high": 3, "medium": 2, "low": 1}
all_indicators.sort(
key=lambda x: (severity_order.get(x.severity, 0), x.confidence, x.hit_count),
reverse=True
)
return ThreatIntelReport(
indicators=all_indicators[:max_indicators],
sources_queried=sources_queried,
metadata={
"workspace": str(self.workspace),
"total_raw": len(self._indicators),
"external_enabled": enrich_external and self.external_feed is not None
}
)
def _collect_from_directory(self, dir_path: Path) -> None:
"""Scan a directory for log files and extract indicators."""
log_patterns = ["*.log", "*.json", "*.jsonl"]
for pattern in log_patterns:
for log_file in dir_path.rglob(pattern):
for indicator in self.log_parser.parse_log_file(log_file):
self._add_indicator(indicator)
def _collect_from_terraform(self, tf_path: Path) -> None:
"""Extract indicators referenced in Terraform files."""
for tf_file in tf_path.glob("*.tf"):
try:
content = tf_file.read_text(encoding="utf-8")
# Extract IPs from allow/block rules
ip_pattern = r'\b(?:\d{1,3}\.){3}\d{1,3}(?:/\d{1,2})?\b'
for match in re.finditer(ip_pattern, content):
ip = match.group()
# Only flag if in a block context
context_start = max(0, match.start() - 100)
context = content[context_start:match.start()].lower()
if "block" in context or "deny" in context:
self._add_indicator(ThreatIndicator(
indicator_type="ip",
value=ip,
confidence=0.9,
severity="medium",
sources=["terraform_blocklist"],
tags=["existing_rule", "blocked_ip"],
context={"file": str(tf_file.name)}
))
except Exception:
pass
def _add_indicator(self, indicator: ThreatIndicator) -> None:
"""Add indicator with deduplication."""
key = indicator.fingerprint
if key in self._indicators:
self._indicators[key].merge(indicator)
else:
self._indicators[key] = indicator
def _enrich_all_indicators(self) -> None:
"""Enrich all IP indicators with external intelligence."""
if not self.external_feed:
return
for key, indicator in list(self._indicators.items()):
if indicator.indicator_type == "ip":
self.external_feed.enrich_indicator(indicator)
# CLI interface for testing
if __name__ == "__main__":
import sys
workspace = sys.argv[1] if len(sys.argv) > 1 else "."
collector = ThreatIntelCollector(
workspace_path=workspace,
enable_external=False # Don't hit APIs in CLI test
)
report = collector.collect()
print(f"\n🔍 Threat Intelligence Report")
print(f"=" * 50)
print(f"Sources: {', '.join(report.sources_queried)}")
print(f"Total indicators: {len(report.indicators)}")
print(f"Critical: {report.critical_count} | High: {report.high_count}")
print(f"\nTop 10 Indicators:")
print("-" * 50)
for ind in report.top_indicators(10):
print(f" [{ind.severity.upper():8}] {ind.indicator_type}={ind.value[:50]}")
print(f" confidence={ind.confidence:.2f} hits={ind.hit_count} sources={ind.sources}")