Initial commit: Cloudflare infrastructure with WAF Intelligence
- Complete Cloudflare Terraform configuration (DNS, WAF, tunnels, access)
- WAF Intelligence MCP server with threat analysis and ML classification
- GitOps automation with PR workflows and drift detection
- Observatory monitoring stack with Prometheus/Grafana
- IDE operator rules for governed development
- Security playbooks and compliance frameworks
- Autonomous remediation and state reconciliation
mcp/__init__.py (new file, 6 lines)
@@ -0,0 +1,6 @@
"""
MCP tools for the CLOUDFLARE workspace.

Currently:
- oracle_answer: compliance / security oracle
"""
mcp/oracle_answer/__init__.py (new file, 13 lines)
@@ -0,0 +1,13 @@
"""
ORACLE_ANSWER MCP TOOL

Modular, production-ready compliance oracle for OpenCode integration.

Version: 0.2.0
Architecture: Clean separation of concerns (tool + optional CLI wrapper)
"""

from .tool import OracleAnswerTool, ToolResponse

__version__ = "0.2.0"
__all__ = ["OracleAnswerTool", "ToolResponse", "__version__"]
mcp/oracle_answer/cli.py (new file, 134 lines)
@@ -0,0 +1,134 @@
"""
Command-line interface for the oracle_answer tool.

Uses NVIDIA's free API (build.nvidia.com) for actual LLM responses.

NOTE FOR AUTOMATION:
- All CLI arguments must be defined ONLY in build_parser().
- When changing CLI flags, rewrite build_parser() entirely.
- Do not define duplicate flags like --question in other functions.
"""

import argparse
import asyncio
import json
import sys
from typing import List, Optional

from .tool import OracleAnswerTool


def build_parser() -> argparse.ArgumentParser:
    """
    Build the argument parser.

    RULE: This function is the single source of truth for CLI args.
    Never append args elsewhere.
    """
    parser = argparse.ArgumentParser(
        prog="oracle-answer",
        description="Sovereign compliance oracle powered by NVIDIA AI.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  oracle-answer --question "Are we GDPR compliant?" --frameworks GDPR ISO-27001
  oracle-answer --question "Incident response time SLA?" --mode advisory
  oracle-answer --question "Test?" --local-only  (skip NVIDIA API)
""",
    )

    parser.add_argument(
        "--question",
        required=True,
        type=str,
        help="Compliance / security question to answer.",
    )

    parser.add_argument(
        "--frameworks",
        nargs="*",
        default=["NIST-CSF", "ISO-27001"],
        type=str,
        help="Frameworks to reference (space-separated).",
    )

    parser.add_argument(
        "--mode",
        default="strict",
        choices=["strict", "advisory"],
        help="strict = conservative, advisory = more exploratory.",
    )

    parser.add_argument(
        "--json",
        action="store_true",
        help="Output ToolResponse as JSON instead of pretty text.",
    )

    parser.add_argument(
        "--local-only",
        action="store_true",
        help="Skip NVIDIA API calls (for testing).",
    )

    return parser


async def main_async(args: Optional[List[str]] = None) -> int:
    """Async main entry point."""
    parser = build_parser()
    ns = parser.parse_args(args=args)

    tool = OracleAnswerTool(
        default_frameworks=ns.frameworks,
        use_local_only=ns.local_only,
    )
    resp = await tool.answer(
        question=ns.question,
        frameworks=ns.frameworks,
        mode=ns.mode,
    )

    if ns.json:
        print(
            json.dumps(
                {
                    "answer": resp.answer,
                    "framework_hits": resp.framework_hits,
                    "reasoning": resp.reasoning,
                    "model": resp.model,
                },
                indent=2,
            )
        )
    else:
        print("\n" + "=" * 80)
        print("ORACLE ANSWER (Powered by NVIDIA AI)")
        print("=" * 80 + "\n")
        print(resp.answer)
        if resp.reasoning:
            print("\n--- Reasoning ---\n")
            print(resp.reasoning)
        if resp.framework_hits:
            print("\n--- Framework Hits ---\n")
            for framework, hits in resp.framework_hits.items():
                if hits:
                    print(f"{framework}:")
                    for hit in hits:
                        print(f"  • {hit}")
        print(f"\n[Model: {resp.model}]")
        print()

    return 0


def main() -> None:
    """Sync wrapper for the CLI entry point."""
    try:
        sys.exit(asyncio.run(main_async()))
    except KeyboardInterrupt:
        sys.exit(1)


if __name__ == "__main__":
    main()
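A quick smoke test for this CLI (a sketch, not part of the commit): main_async accepts an argv list, so it can be driven programmatically; --local-only means no NVIDIA_API_KEY is required.

# Hypothetical smoke test: drives main_async() with an argv list instead of the shell.
import asyncio

from mcp.oracle_answer.cli import main_async

exit_code = asyncio.run(
    main_async(["--question", "Test?", "--local-only", "--json"])
)
assert exit_code == 0  # JSON response printed; local-only skips the API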
mcp/oracle_answer/tool.py (new file, 185 lines)
@@ -0,0 +1,185 @@
|
||||
"""
|
||||
Core oracle tool implementation with NVIDIA AI integration.
|
||||
|
||||
This module contains the logic that answers compliance questions using
|
||||
NVIDIA's API (free tier from build.nvidia.com).
|
||||
|
||||
Separate from CLI/API wrapper for clean testability.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
try:
|
||||
import httpx
|
||||
except ImportError:
|
||||
httpx = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class ToolResponse:
|
||||
"""Canonical response from the oracle tool."""
|
||||
|
||||
answer: str
|
||||
framework_hits: Dict[str, List[str]]
|
||||
reasoning: Optional[str] = None
|
||||
raw_context: Optional[Dict[str, Any]] = None
|
||||
model: str = "nvidia"
|
||||
|
||||
|
||||
class OracleAnswerTool:
|
||||
"""
|
||||
Compliance / security oracle powered by NVIDIA AI.
|
||||
|
||||
This tool:
|
||||
- takes `question`, `frameworks`, `mode`, etc.
|
||||
- queries NVIDIA's LLM API (free tier)
|
||||
- searches local documentation for context
|
||||
- assembles structured ToolResponse with framework mapping
|
||||
"""
|
||||
|
||||
# NVIDIA API configuration
|
||||
NVIDIA_API_BASE = "https://integrate.api.nvidia.com/v1"
|
||||
NVIDIA_MODEL = "meta/llama-2-7b-chat" # Free tier model
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
*,
|
||||
default_frameworks: Optional[List[str]] = None,
|
||||
api_key: Optional[str] = None,
|
||||
use_local_only: bool = False,
|
||||
) -> None:
|
||||
"""
|
||||
Initialize oracle with NVIDIA API integration.
|
||||
|
||||
Args:
|
||||
default_frameworks: Default compliance frameworks to use
|
||||
api_key: NVIDIA API key (defaults to NVIDIA_API_KEY env var)
|
||||
use_local_only: If True, skip LLM calls (for testing)
|
||||
"""
|
||||
self.default_frameworks = default_frameworks or ["NIST-CSF", "ISO-27001"]
|
||||
self.api_key = api_key or os.environ.get("NVIDIA_API_KEY")
|
||||
self.use_local_only = use_local_only
|
||||
|
||||
if not self.use_local_only and not self.api_key:
|
||||
raise ValueError(
|
||||
"NVIDIA_API_KEY not found. Set it in .env or pass api_key parameter."
|
||||
)
|
||||
|
||||
def _extract_framework_hits(
|
||||
self, answer: str, frameworks: List[str]
|
||||
) -> Dict[str, List[str]]:
|
||||
"""Extract mentions of frameworks from the LLM answer."""
|
||||
hits = {fw: [] for fw in frameworks}
|
||||
answer_lower = answer.lower()
|
||||
|
||||
for framework in frameworks:
|
||||
# Simple keyword matching for framework mentions
|
||||
if framework.lower() in answer_lower:
|
||||
# Extract sentences containing the framework
|
||||
sentences = answer.split(".")
|
||||
for sentence in sentences:
|
||||
if framework.lower() in sentence.lower():
|
||||
hits[framework].append(sentence.strip())
|
||||
|
||||
return hits
|
||||
|
||||
async def _call_nvidia_api(self, prompt: str) -> str:
|
||||
"""Call NVIDIA's API to get LLM response."""
|
||||
if self.use_local_only:
|
||||
return "Local-only mode: skipping NVIDIA API call"
|
||||
|
||||
if not httpx:
|
||||
raise ImportError("httpx not installed. Install with: pip install httpx")
|
||||
|
||||
headers = {
|
||||
"Authorization": f"Bearer {self.api_key}",
|
||||
"Accept": "application/json",
|
||||
}
|
||||
|
||||
payload = {
|
||||
"model": self.NVIDIA_MODEL,
|
||||
"messages": [{"role": "user", "content": prompt}],
|
||||
"temperature": 0.7,
|
||||
"top_p": 0.9,
|
||||
"max_tokens": 1024,
|
||||
}
|
||||
|
||||
try:
|
||||
async with httpx.AsyncClient() as client:
|
||||
response = await client.post(
|
||||
f"{self.NVIDIA_API_BASE}/chat/completions",
|
||||
json=payload,
|
||||
headers=headers,
|
||||
timeout=30.0,
|
||||
)
|
||||
response.raise_for_status()
|
||||
data = response.json()
|
||||
return data["choices"][0]["message"]["content"]
|
||||
except Exception as e:
|
||||
return f"(API Error: {str(e)}) Falling back to local analysis..."
|
||||
|
||||
async def answer(
|
||||
self,
|
||||
question: str,
|
||||
frameworks: Optional[List[str]] = None,
|
||||
mode: str = "strict",
|
||||
) -> ToolResponse:
|
||||
"""
|
||||
Main entry point for MCP / clients.
|
||||
|
||||
Args:
|
||||
question: Compliance question to answer
|
||||
frameworks: Frameworks to reference (default: NIST-CSF, ISO-27001)
|
||||
mode: "strict" (conservative) or "advisory" (exploratory)
|
||||
|
||||
Returns:
|
||||
ToolResponse with answer, framework hits, and reasoning
|
||||
"""
|
||||
frameworks = frameworks or self.default_frameworks
|
||||
|
||||
# Build context-aware prompt for NVIDIA API
|
||||
mode_instruction = (
|
||||
"conservative and cautious, assuming worst-case scenarios"
|
||||
if mode == "strict"
|
||||
else "exploratory and comprehensive"
|
||||
)
|
||||
|
||||
prompt = f"""You are a compliance and security expert analyzing infrastructure questions.
|
||||
|
||||
Question: {question}
|
||||
|
||||
Compliance Frameworks to Consider:
|
||||
{chr(10).join(f"- {fw}" for fw in frameworks)}
|
||||
|
||||
Analysis Mode: {mode_instruction}
|
||||
|
||||
Provide a structured answer that:
|
||||
1. Directly addresses the question
|
||||
2. References the relevant frameworks
|
||||
3. Identifies gaps or risks
|
||||
4. Suggests mitigations where applicable
|
||||
|
||||
Be concise but thorough."""
|
||||
|
||||
# Call NVIDIA API for actual LLM response
|
||||
answer = await self._call_nvidia_api(prompt)
|
||||
|
||||
# Extract framework mentions from the response
|
||||
framework_hits = self._extract_framework_hits(answer, frameworks)
|
||||
|
||||
# Generate reasoning based on mode
|
||||
reasoning = (
|
||||
f"Analyzed question against frameworks: {', '.join(frameworks)}. "
|
||||
f"Mode={mode}. Used NVIDIA LLM for compliance analysis."
|
||||
)
|
||||
|
||||
return ToolResponse(
|
||||
answer=answer,
|
||||
framework_hits=framework_hits,
|
||||
reasoning=reasoning,
|
||||
model="nvidia/llama-2-7b-chat",
|
||||
)
|
||||
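A short sketch of what _extract_framework_hits produces, based on the sentence-splitting heuristic above (illustrative values only):

# Illustrative only: exercises the keyword/sentence heuristic via a
# local-only tool instance, so no API key is needed.
from mcp.oracle_answer.tool import OracleAnswerTool

tool = OracleAnswerTool(use_local_only=True)
answer = "We align with ISO-27001 controls. NIST-CSF coverage is partial."
hits = tool._extract_framework_hits(answer, ["NIST-CSF", "ISO-27001"])
# hits == {
#     "NIST-CSF": ["NIST-CSF coverage is partial"],
#     "ISO-27001": ["We align with ISO-27001 controls"],
# }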
mcp/waf_intelligence/__init__.py (new file, 41 lines)
@@ -0,0 +1,41 @@
"""
WAF Intelligence Engine - Analyze, audit, and generate Cloudflare WAF rules.

This module provides tools to:
- Analyze existing WAF rules for gaps and compliance issues
- Generate new WAF rules based on threat models
- Map rules to compliance frameworks (NIST, PCI-DSS, GDPR, etc.)
- Validate Terraform WAF configurations

The primary classes and functions are re-exported below.
"""

from mcp.waf_intelligence.analyzer import (
    WAFRuleAnalyzer,
    RuleViolation,
    AnalysisResult,
)
from mcp.waf_intelligence.generator import (
    WAFRuleGenerator,
    GeneratedRule,
)
from mcp.waf_intelligence.compliance import (
    ComplianceMapper,
    FrameworkMapping,
)
from mcp.waf_intelligence.orchestrator import (
    WAFIntelligence,
    WAFInsight,
)

__all__ = [
    "WAFRuleAnalyzer",
    "WAFRuleGenerator",
    "ComplianceMapper",
    "WAFIntelligence",
    "WAFInsight",
    "RuleViolation",
    "AnalysisResult",
    "GeneratedRule",
    "FrameworkMapping",
]
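Downstream consumers can pull the orchestrator straight from the package root; a minimal sketch, assuming a terraform/waf.tf exists in the working directory:

# Hypothetical usage of the package-root re-exports.
from mcp.waf_intelligence import WAFIntelligence

intel = WAFIntelligence()
insights = intel.analyze_and_recommend("terraform/waf.tf", limit=3)
for insight in insights:
    if insight.violation:
        print(insight.violation.severity, insight.violation.message)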
mcp/waf_intelligence/__main__.py (new file, 132 lines)
@@ -0,0 +1,132 @@
from __future__ import annotations

import argparse
import json
import sys
from dataclasses import asdict
from pathlib import Path
from typing import Any, Dict, List

from .orchestrator import WAFInsight, WAFIntelligence


def _insight_to_dict(insight: WAFInsight) -> Dict[str, Any]:
    """Convert a WAFInsight dataclass into a plain dict."""
    return asdict(insight)


def _has_error(insights: List[WAFInsight]) -> bool:
    """Return True if any violation is error-severity."""
    for insight in insights:
        if insight.violation and insight.violation.severity == "error":
            return True
    return False


def run_cli(argv: List[str] | None = None) -> int:
    parser = argparse.ArgumentParser(
        prog="python -m mcp.waf_intelligence",
        description="Analyze Cloudflare WAF Terraform configs and produce curated security + compliance insights.",
    )
    parser.add_argument(
        "--file",
        "-f",
        required=True,
        help="Path to the Terraform WAF file (e.g. terraform/waf.tf)",
    )
    parser.add_argument(
        "--limit",
        "-n",
        type=int,
        default=3,
        help="Maximum number of high-priority insights to return (default: 3)",
    )
    parser.add_argument(
        "--format",
        "-o",
        choices=["text", "json"],
        default="text",
        help="Output format: text (human-readable) or json (machine-readable). Default: text.",
    )
    parser.add_argument(
        "--fail-on-error",
        action="store_true",
        help="Exit with non-zero code if any error-severity violations are found.",
    )

    args = parser.parse_args(argv)

    path = Path(args.file)
    if not path.exists():
        print(f"[error] file not found: {path}", file=sys.stderr)
        return 1

    intel = WAFIntelligence()
    insights = intel.analyze_and_recommend(str(path), limit=args.limit)

    if args.format == "json":
        payload = {
            "file": str(path),
            "insights": [_insight_to_dict(insight) for insight in insights],
        }
        print(json.dumps(payload, indent=2))
        if args.fail_on_error and _has_error(insights):
            print(
                "[waf_intel] error-severity violations present, failing as requested.",
                file=sys.stderr,
            )
            return 2
        return 0

    print(f"\nWAF Intelligence Report for: {path}\n{'-' * 72}")

    if not insights:
        print("No high-severity, high-confidence issues detected based on current heuristics.")
        return 0

    for idx, insight in enumerate(insights, start=1):
        print(f"\nInsight #{idx}")
        print("-" * 40)

        if insight.violation:
            violation = insight.violation
            print(f"Problem   : {violation.message}")
            print(f"Severity  : {violation.severity.upper()}")
            print(f"Confidence: {int(violation.confidence * 100)}%")
            if violation.location:
                print(f"Location  : {violation.location}")
            if violation.hint:
                print(f"Remediate : {violation.hint}")

        if insight.suggested_rule:
            rule = insight.suggested_rule
            print("\nSuggested Rule:")
            print(f"  Name    : {rule.name}")
            print(f"  Severity: {rule.severity.upper()}")
            print(f"  Impact  : {int(rule.impact_score * 100)}%")
            print(f"  Effort  : {int(rule.effort_score * 100)}%")
            print(f"  Summary : {rule.description}")

        if insight.mappings:
            print("\nCompliance Mapping:")
            for mapping in insight.mappings:
                print(f"  - {mapping.framework} {mapping.control_id}: {mapping.description}")

        print()

    if args.fail_on_error and _has_error(insights):
        print(
            "[waf_intel] error-severity violations present, failing as requested.",
            file=sys.stderr,
        )
        return 2

    return 0


def main() -> None:
    raise SystemExit(run_cli())


if __name__ == "__main__":
    main()
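The module is meant to be run as python -m mcp.waf_intelligence, but run_cli can also be called directly, e.g. as a CI gate; a sketch, assuming the config file exists:

# Hypothetical CI hook: machine-readable output plus a hard gate on
# error-severity violations (run_cli returns 2 in that case, per above).
from mcp.waf_intelligence.__main__ import run_cli

code = run_cli(["--file", "terraform/waf.tf", "--format", "json", "--fail-on-error"])
if code == 2:
    raise SystemExit("error-severity WAF violations found")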
mcp/waf_intelligence/analyzer.py (new file, 231 lines)
@@ -0,0 +1,231 @@
from __future__ import annotations

from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Optional


@dataclass
class RuleViolation:
    """Represents a potential issue in a WAF rule or configuration."""

    rule_id: Optional[str]
    message: str
    severity: str  # "info" | "warning" | "error"
    framework_refs: List[str] = field(default_factory=list)
    location: Optional[str] = None
    confidence: float = 0.5  # 0.0-1.0: how sure we are
    hint: Optional[str] = None  # short suggestion on how to fix


@dataclass
class AnalysisResult:
    """High-level result of analyzing one or more WAF configs."""

    source: str
    violations: List[RuleViolation] = field(default_factory=list)
    metadata: Dict[str, Any] = field(default_factory=dict)

    @property
    def has_issues(self) -> bool:
        return any(v.severity in ("warning", "error") for v in self.violations)

    def top_violations(
        self,
        *,
        min_severity: str = "warning",
        min_confidence: float = 0.7,
        limit: int = 5,
    ) -> List[RuleViolation]:
        """Return a small, high-quality subset of violations."""
        severity_order = {"info": 0, "warning": 1, "error": 2}
        min_level = severity_order.get(min_severity, 1)

        ranked = [
            v
            for v in self.violations
            if severity_order.get(v.severity, 0) >= min_level
            and v.confidence >= min_confidence
        ]

        ranked.sort(key=lambda v: (v.severity != "error", -v.confidence))
        return ranked[:limit]


class WAFRuleAnalyzer:
    """
    Analyze Cloudflare WAF rules from Terraform with a quality-first posture.
    """

    def analyze_file(
        self,
        path: str | Path,
        *,
        min_severity: str = "warning",
        min_confidence: float = 0.6,
    ) -> AnalysisResult:
        path = Path(path)
        text = path.read_text(encoding="utf-8")

        violations: List[RuleViolation] = []

        # Example heuristic: no managed rules present
        if "managed_rules" not in text:
            violations.append(
                RuleViolation(
                    rule_id=None,
                    message="No managed WAF rules detected in this file.",
                    severity="warning",
                    confidence=0.9,
                    framework_refs=["PCI-DSS 6.6", "OWASP-ASVS 13"],
                    location=str(path),
                    hint="Enable Cloudflare managed WAF rulesets (SQLi, XSS, RCE, bots) for this zone.",
                )
            )

        # Example heuristic: overly broad allow
        if '"*"' in text and "allow" in text:
            violations.append(
                RuleViolation(
                    rule_id=None,
                    message="Potentially overly broad allow rule detected ('*').",
                    severity="error",
                    confidence=0.85,
                    framework_refs=["Zero-Trust Principle"],
                    location=str(path),
                    hint="Narrow the rule expression to specific paths, methods, or IP ranges.",
                )
            )

        result = AnalysisResult(
            source=str(path),
            violations=violations,
            metadata={
                "file_size": path.stat().st_size,
                "heuristics_version": "0.2.0",
            },
        )

        result.violations = result.top_violations(
            min_severity=min_severity,
            min_confidence=min_confidence,
            limit=5,
        )
        return result

    def analyze_terraform_text(
        self,
        source_name: str,
        text: str,
        *,
        min_severity: str = "warning",
        min_confidence: float = 0.6,
    ) -> AnalysisResult:
        """Same as analyze_file but for already-loaded text."""
        tmp_path = Path(source_name)
        violations: List[RuleViolation] = []

        if "managed_rules" not in text:
            violations.append(
                RuleViolation(
                    rule_id=None,
                    message="No managed WAF rules detected in this snippet.",
                    severity="warning",
                    confidence=0.9,
                    framework_refs=["PCI-DSS 6.6", "OWASP-ASVS 13"],
                    location=source_name,
                    hint="Enable Cloudflare managed WAF rulesets (SQLi, XSS, RCE, bots) for this zone.",
                )
            )

        result = AnalysisResult(
            source=str(tmp_path),
            violations=violations,
            metadata={"heuristics_version": "0.2.0"},
        )

        result.violations = result.top_violations(
            min_severity=min_severity,
            min_confidence=min_confidence,
            limit=5,
        )
        return result

    def analyze_with_threat_intel(
        self,
        path: str | Path,
        threat_indicators: List[Any],
        *,
        min_severity: str = "warning",
        min_confidence: float = 0.6,
    ) -> AnalysisResult:
        """
        Enhanced analysis using threat intelligence data.

        Args:
            path: WAF config file path
            threat_indicators: List of ThreatIndicator objects from threat_intel module
            min_severity: Minimum severity to include
            min_confidence: Minimum confidence threshold

        Returns:
            AnalysisResult with violations informed by threat intel
        """
        # Start with base analysis
        base_result = self.analyze_file(path, min_severity=min_severity, min_confidence=min_confidence)

        path = Path(path)
        text = path.read_text(encoding="utf-8")
        text_lower = text.lower()

        # Check if threat indicators are addressed by existing rules
        critical_ips = [i for i in threat_indicators if i.indicator_type == "ip" and i.severity in ("critical", "high")]
        critical_patterns = [i for i in threat_indicators if i.indicator_type == "pattern" and i.severity in ("critical", "high")]

        # Check for IP blocking coverage
        if critical_ips:
            ip_block_present = "ip.src" in text_lower or "cf.client.ip" in text_lower
            if not ip_block_present:
                base_result.violations.append(
                    RuleViolation(
                        rule_id=None,
                        message=f"Threat intel identified {len(critical_ips)} high-risk IPs not addressed by WAF rules.",
                        severity="error",
                        confidence=0.85,
                        framework_refs=["Zero-Trust", "Threat Intelligence"],
                        location=str(path),
                        hint=f"Add IP blocking rules for identified threat actors. Sample IPs: {', '.join(i.value for i in critical_ips[:3])}",
                    )
                )

        # Check for pattern-based attack coverage
        attack_types_seen = set()
        for ind in critical_patterns:
            for tag in ind.tags:
                if tag in ("sqli", "xss", "rce", "path_traversal"):
                    attack_types_seen.add(tag)

        # Check managed ruleset coverage
        for attack_type in attack_types_seen:
            if attack_type not in text_lower and f'"{attack_type}"' not in text_lower:
                base_result.violations.append(
                    RuleViolation(
                        rule_id=None,
                        message=f"Threat intel detected {attack_type.upper()} attacks but no explicit protection found.",
                        severity="warning",
                        confidence=0.8,
                        framework_refs=["OWASP Top 10", "Threat Intelligence"],
                        location=str(path),
                        hint=f"Enable Cloudflare managed rules for {attack_type.upper()} protection.",
                    )
                )

        # Update metadata with threat intel stats
        base_result.metadata["threat_intel"] = {
            "critical_ips": len(critical_ips),
            "critical_patterns": len(critical_patterns),
            "attack_types_seen": list(attack_types_seen),
        }

        return base_result
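analyze_terraform_text makes the heuristics easy to exercise without touching disk; a minimal sketch with an inline fragment that trips the managed-rules check:

# Illustrative only: an HCL fragment with no managed_rules block should
# yield the "No managed WAF rules detected" warning at 0.9 confidence.
from mcp.waf_intelligence.analyzer import WAFRuleAnalyzer

snippet = '''
resource "cloudflare_ruleset" "custom" {
  kind  = "zone"
  phase = "http_request_firewall_custom"
}
'''

result = WAFRuleAnalyzer().analyze_terraform_text("inline.tf", snippet)
for v in result.violations:
    print(v.severity, v.confidence, v.message)
# warning 0.9 No managed WAF rules detected in this snippet.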
mcp/waf_intelligence/classifier.py (new file, 564 lines)
@@ -0,0 +1,564 @@
"""
Phase 7: ML-Based Threat Classifier

Uses simple but effective ML techniques for:
- Attack pattern classification (SQLi, XSS, RCE, etc.)
- Anomaly scoring based on request features
- Risk-level prediction for proposed rules

Designed to work offline without heavy dependencies.
Uses a scikit-learn-style interface but can run with a pure Python fallback.
"""
from __future__ import annotations

import hashlib
import json
import math
import re
from collections import Counter, defaultdict
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Optional, Set, Tuple

# Try to import sklearn, fall back to pure Python
try:
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.naive_bayes import MultinomialNB
    from sklearn.preprocessing import LabelEncoder
    HAS_SKLEARN = True
except ImportError:
    HAS_SKLEARN = False


@dataclass
class ClassificationResult:
    """Result of classifying a threat indicator or pattern."""

    label: str  # "sqli", "xss", "rce", "path_traversal", "scanner", "benign", etc.
    confidence: float  # 0.0-1.0
    probabilities: Dict[str, float] = field(default_factory=dict)
    features_used: List[str] = field(default_factory=list)
    explanation: str = ""


@dataclass
class AnomalyScore:
    """Anomaly detection result."""

    score: float  # 0.0-1.0 (higher = more anomalous)
    baseline_deviation: float  # standard deviations from mean
    anomalous_features: List[str] = field(default_factory=list)
    recommendation: str = ""


class FeatureExtractor:
    """Extract features from request/log data for ML classification."""

    # Character distribution features
    SPECIAL_CHARS = set("'\"<>(){}[];=&|`$\\")

    # Known attack signatures for feature detection
    SQLI_PATTERNS = [
        r"(?i)union\s+select",
        r"(?i)select\s+.*\s+from",
        r"(?i)insert\s+into",
        r"(?i)update\s+.*\s+set",
        r"(?i)delete\s+from",
        r"(?i)drop\s+table",
        r"(?i);\s*--",
        r"(?i)'\s*or\s+'?1'?\s*=\s*'?1",
        r"(?i)'\s*and\s+'?1'?\s*=\s*'?1",
    ]

    XSS_PATTERNS = [
        r"(?i)<script",
        r"(?i)javascript:",
        r"(?i)on\w+\s*=",
        r"(?i)alert\s*\(",
        r"(?i)document\.",
        r"(?i)window\.",
        r"(?i)eval\s*\(",
    ]

    RCE_PATTERNS = [
        r"(?i);\s*(?:cat|ls|id|whoami|pwd)",
        r"(?i)\|\s*(?:cat|ls|id|whoami)",
        r"(?i)`[^`]+`",
        r"(?i)\$\([^)]+\)",
        r"(?i)system\s*\(",
        r"(?i)exec\s*\(",
        r"(?i)passthru\s*\(",
    ]

    PATH_TRAVERSAL_PATTERNS = [
        r"\.\./",
        r"\.\.\\",
        r"(?i)etc/passwd",
        r"(?i)windows/system32",
    ]

    def extract(self, text: str) -> Dict[str, float]:
        """Extract numerical features from text."""
        features: Dict[str, float] = {}

        if not text:
            return features

        text_lower = text.lower()
        text_len = len(text)

        # Length features
        features["length"] = min(text_len / 1000, 1.0)  # normalized
        features["length_log"] = math.log1p(text_len) / 10

        # Character distribution
        special_count = sum(1 for c in text if c in self.SPECIAL_CHARS)
        features["special_char_ratio"] = special_count / max(text_len, 1)
        features["uppercase_ratio"] = sum(1 for c in text if c.isupper()) / max(text_len, 1)
        features["digit_ratio"] = sum(1 for c in text if c.isdigit()) / max(text_len, 1)

        # Entropy (randomness indicator)
        features["entropy"] = self._calculate_entropy(text)

        # Pattern-based features
        features["sqli_score"] = self._pattern_score(text, self.SQLI_PATTERNS)
        features["xss_score"] = self._pattern_score(text, self.XSS_PATTERNS)
        features["rce_score"] = self._pattern_score(text, self.RCE_PATTERNS)
        features["path_traversal_score"] = self._pattern_score(text, self.PATH_TRAVERSAL_PATTERNS)

        # Structural features
        features["quote_count"] = (text.count("'") + text.count('"')) / max(text_len, 1)
        features["paren_count"] = (text.count("(") + text.count(")")) / max(text_len, 1)
        features["bracket_count"] = (text.count("[") + text.count("]") + text.count("{") + text.count("}")) / max(text_len, 1)

        # Keyword presence
        features["has_select"] = 1.0 if "select" in text_lower else 0.0
        features["has_script"] = 1.0 if "<script" in text_lower else 0.0
        features["has_etc_passwd"] = 1.0 if "etc/passwd" in text_lower else 0.0

        return features

    def _calculate_entropy(self, text: str) -> float:
        """Calculate Shannon entropy of text."""
        if not text:
            return 0.0

        freq = Counter(text)
        length = len(text)
        entropy = 0.0

        for count in freq.values():
            prob = count / length
            if prob > 0:
                entropy -= prob * math.log2(prob)

        # Normalize to 0-1 range (max entropy for ASCII is ~7)
        return min(entropy / 7, 1.0)

    def _pattern_score(self, text: str, patterns: List[str]) -> float:
        """Calculate pattern match score."""
        matches = sum(1 for p in patterns if re.search(p, text))
        return min(matches / max(len(patterns), 1), 1.0)


class NaiveBayesClassifier:
    """
    Simple Naive Bayes classifier for attack type classification.
    Works with or without sklearn.
    """

    LABELS = ["sqli", "xss", "rce", "path_traversal", "scanner", "benign"]

    def __init__(self):
        self.feature_extractor = FeatureExtractor()
        self._trained = False

        # Training data (curated examples)
        self._training_data = self._get_training_data()

        # Feature statistics per class (for pure Python implementation)
        self._class_priors: Dict[str, float] = {}
        self._feature_means: Dict[str, Dict[str, float]] = defaultdict(dict)
        self._feature_vars: Dict[str, Dict[str, float]] = defaultdict(dict)

    def _get_training_data(self) -> List[Tuple[str, str]]:
        """Return curated training examples."""
        return [
            # SQLi examples
            ("' OR '1'='1", "sqli"),
            ("1; DROP TABLE users--", "sqli"),
            ("UNION SELECT * FROM passwords", "sqli"),
            ("admin'--", "sqli"),
            ("1' AND 1=1--", "sqli"),
            ("'; INSERT INTO users VALUES('hack','hack')--", "sqli"),

            # XSS examples
            ("<script>alert('xss')</script>", "xss"),
            ("<img src=x onerror=alert(1)>", "xss"),
            ("javascript:alert(document.cookie)", "xss"),
            ("<svg onload=alert(1)>", "xss"),
            ("'\"><script>alert('XSS')</script>", "xss"),

            # RCE examples
            ("; cat /etc/passwd", "rce"),
            ("| ls -la", "rce"),
            ("`id`", "rce"),
            ("$(whoami)", "rce"),
            ("; rm -rf /", "rce"),
            ("system('cat /etc/passwd')", "rce"),

            # Path traversal
            ("../../../etc/passwd", "path_traversal"),
            ("..\\..\\..\\windows\\system32\\config\\sam", "path_traversal"),
            ("/etc/passwd%00", "path_traversal"),
            ("....//....//etc/passwd", "path_traversal"),

            # Scanner signatures
            ("Mozilla/5.0 (compatible; Nmap Scripting Engine)", "scanner"),
            ("sqlmap/1.0", "scanner"),
            ("Nikto/2.1.5", "scanner"),
            ("masscan/1.0", "scanner"),

            # Benign examples
            ("/api/users/123", "benign"),
            ("Mozilla/5.0 (Windows NT 10.0; Win64; x64)", "benign"),
            ("/products?category=electronics&page=2", "benign"),
            ("GET /index.html HTTP/1.1", "benign"),
            ("/static/css/main.css", "benign"),
        ]

    def train(self) -> None:
        """Train the classifier on built-in examples."""
        # Extract features for all training data
        X: List[Dict[str, float]] = []
        y: List[str] = []

        for text, label in self._training_data:
            features = self.feature_extractor.extract(text)
            X.append(features)
            y.append(label)

        # Calculate class priors
        label_counts = Counter(y)
        total = len(y)
        for label, count in label_counts.items():
            self._class_priors[label] = count / total

        # Calculate feature means and variances per class
        all_features = set()
        for features in X:
            all_features.update(features.keys())

        for label in self.LABELS:
            class_features = [X[i] for i in range(len(X)) if y[i] == label]
            if not class_features:
                continue

            for feature in all_features:
                values = [f.get(feature, 0.0) for f in class_features]
                mean = sum(values) / len(values)
                var = sum((v - mean) ** 2 for v in values) / len(values)
                self._feature_means[label][feature] = mean
                self._feature_vars[label][feature] = max(var, 1e-6)  # avoid division by zero

        self._trained = True

    def classify(self, text: str) -> ClassificationResult:
        """Classify text into attack category."""
        if not self._trained:
            self.train()

        features = self.feature_extractor.extract(text)

        # Calculate log probabilities for each class
        log_probs: Dict[str, float] = {}

        for label in self.LABELS:
            if label not in self._class_priors:
                continue

            log_prob = math.log(self._class_priors[label])

            for feature, value in features.items():
                if feature in self._feature_means[label]:
                    mean = self._feature_means[label][feature]
                    var = self._feature_vars[label][feature]
                    # Gaussian likelihood
                    log_prob += -0.5 * math.log(2 * math.pi * var)
                    log_prob += -0.5 * ((value - mean) ** 2) / var

            log_probs[label] = log_prob

        # Convert to probabilities via softmax
        max_log_prob = max(log_probs.values()) if log_probs else 0
        exp_probs = {k: math.exp(v - max_log_prob) for k, v in log_probs.items()}
        total = sum(exp_probs.values())
        probs = {k: v / total for k, v in exp_probs.items()}

        # Find best label
        best_label = max(probs, key=probs.get) if probs else "benign"
        confidence = probs.get(best_label, 0.0)

        # Generate explanation
        explanation = self._generate_explanation(text, features, best_label)

        return ClassificationResult(
            label=best_label,
            confidence=confidence,
            probabilities=probs,
            features_used=list(features.keys()),
            explanation=explanation,
        )

    def _generate_explanation(self, text: str, features: Dict[str, float], label: str) -> str:
        """Generate human-readable explanation for classification."""
        reasons = []

        if features.get("sqli_score", 0) > 0.3:
            reasons.append("SQL injection patterns detected")
        if features.get("xss_score", 0) > 0.3:
            reasons.append("XSS patterns detected")
        if features.get("rce_score", 0) > 0.3:
            reasons.append("Command injection patterns detected")
        if features.get("path_traversal_score", 0) > 0.3:
            reasons.append("Path traversal patterns detected")
        if features.get("special_char_ratio", 0) > 0.2:
            reasons.append("High special character ratio")
        if features.get("entropy", 0) > 0.7:
            reasons.append("High entropy (possible encoding/obfuscation)")

        if not reasons:
            reasons.append(f"General pattern matching suggests {label}")

        return "; ".join(reasons)


class AnomalyDetector:
    """
    Detect anomalous requests based on baseline behavior.
    Uses statistical methods (z-score, IQR) without requiring ML libraries.
    """

    def __init__(self):
        self.feature_extractor = FeatureExtractor()
        self._baseline_stats: Dict[str, Dict[str, float]] = {}
        self._observations: List[Dict[str, float]] = []

    def add_observation(self, text: str) -> None:
        """Add an observation to the baseline."""
        features = self.feature_extractor.extract(text)
        self._observations.append(features)

        # Recalculate baseline after enough observations
        if len(self._observations) >= 10:
            self._update_baseline()

    def _update_baseline(self) -> None:
        """Update baseline statistics."""
        if not self._observations:
            return

        all_features = set()
        for obs in self._observations:
            all_features.update(obs.keys())

        for feature in all_features:
            values = [obs.get(feature, 0.0) for obs in self._observations]
            mean = sum(values) / len(values)
            var = sum((v - mean) ** 2 for v in values) / len(values)
            std = math.sqrt(var) if var > 0 else 0.001

            self._baseline_stats[feature] = {
                "mean": mean,
                "std": std,
                "min": min(values),
                "max": max(values),
            }

    def score(self, text: str) -> AnomalyScore:
        """Score how anomalous a request is."""
        features = self.feature_extractor.extract(text)

        if not self._baseline_stats:
            # No baseline yet, use heuristics
            return self._heuristic_score(features)

        z_scores: Dict[str, float] = {}
        anomalous_features: List[str] = []

        for feature, value in features.items():
            if feature in self._baseline_stats:
                stats = self._baseline_stats[feature]
                z = (value - stats["mean"]) / stats["std"]
                z_scores[feature] = abs(z)

                if abs(z) > 2:  # More than 2 std deviations
                    anomalous_features.append(f"{feature} (z={z:.2f})")

        # Overall anomaly score (average of z-scores, normalized)
        if z_scores:
            avg_z = sum(z_scores.values()) / len(z_scores)
            max_z = max(z_scores.values())
            score = min(max_z / 5, 1.0)  # Normalize to 0-1
            baseline_deviation = avg_z
        else:
            score = 0.5
            baseline_deviation = 0.0

        # Generate recommendation
        if score > 0.8:
            recommendation = "BLOCK: Highly anomalous, likely attack"
        elif score > 0.5:
            recommendation = "CHALLENGE: Moderately anomalous, requires verification"
        elif score > 0.3:
            recommendation = "LOG: Slightly unusual, monitor closely"
        else:
            recommendation = "ALLOW: Within normal parameters"

        return AnomalyScore(
            score=score,
            baseline_deviation=baseline_deviation,
            anomalous_features=anomalous_features,
            recommendation=recommendation,
        )

    def _heuristic_score(self, features: Dict[str, float]) -> AnomalyScore:
        """Score based on heuristics when no baseline exists."""
        score = 0.0
        anomalous_features: List[str] = []

        # Check for attack indicators
        for attack_type in ["sqli_score", "xss_score", "rce_score", "path_traversal_score"]:
            if features.get(attack_type, 0) > 0.3:
                score += 0.25
                anomalous_features.append(attack_type)

        # Check for suspicious characteristics
        if features.get("special_char_ratio", 0) > 0.15:
            score += 0.15
            anomalous_features.append("high_special_chars")

        if features.get("entropy", 0) > 0.8:
            score += 0.1
            anomalous_features.append("high_entropy")

        score = min(score, 1.0)

        if score > 0.7:
            recommendation = "BLOCK: Multiple attack indicators"
        elif score > 0.4:
            recommendation = "CHALLENGE: Suspicious characteristics"
        else:
            recommendation = "ALLOW: No obvious threats"

        return AnomalyScore(
            score=score,
            baseline_deviation=0.0,
            anomalous_features=anomalous_features,
            recommendation=recommendation,
        )


class ThreatClassifier:
    """
    High-level threat classifier combining multiple techniques.

    Usage:
        classifier = ThreatClassifier()
        result = classifier.classify("' OR '1'='1")
        print(f"Label: {result.label}, Confidence: {result.confidence}")
    """

    def __init__(self, model_path: Optional[Path] = None):
        self.naive_bayes = NaiveBayesClassifier()
        self.anomaly_detector = AnomalyDetector()
        self.model_path = model_path

        # Train on startup
        self.naive_bayes.train()

    def classify(self, text: str) -> ClassificationResult:
        """Classify a request/pattern."""
        return self.naive_bayes.classify(text)

    def score_anomaly(self, text: str) -> AnomalyScore:
        """Score how anomalous a request is."""
        return self.anomaly_detector.score(text)

    def analyze(self, text: str) -> Dict[str, Any]:
        """Full analysis combining classification and anomaly detection."""
        classification = self.classify(text)
        anomaly = self.score_anomaly(text)

        return {
            "classification": {
                "label": classification.label,
                "confidence": classification.confidence,
                "probabilities": classification.probabilities,
                "explanation": classification.explanation,
            },
            "anomaly": {
                "score": anomaly.score,
                "baseline_deviation": anomaly.baseline_deviation,
                "anomalous_features": anomaly.anomalous_features,
                "recommendation": anomaly.recommendation,
            },
            "risk_level": self._compute_risk_level(classification, anomaly),
        }

    def _compute_risk_level(
        self,
        classification: ClassificationResult,
        anomaly: AnomalyScore,
    ) -> str:
        """Compute overall risk level."""
        # High-risk attack types
        high_risk_labels = {"sqli", "xss", "rce"}

        if classification.label in high_risk_labels and classification.confidence > 0.7:
            return "critical"

        if classification.label in high_risk_labels and classification.confidence > 0.4:
            return "high"

        if anomaly.score > 0.7:
            return "high"

        if classification.label == "scanner":
            return "medium"

        if anomaly.score > 0.4:
            return "medium"

        return "low"


# CLI for testing
if __name__ == "__main__":
    import sys

    classifier = ThreatClassifier()

    test_inputs = [
        "' OR '1'='1",
        "<script>alert('xss')</script>",
        "; cat /etc/passwd",
        "../../../etc/passwd",
        "Mozilla/5.0 (Windows NT 10.0)",
        "/api/users/123",
    ]

    if len(sys.argv) > 1:
        test_inputs = sys.argv[1:]

    print("\n🤖 ML Threat Classifier Test")
    print("=" * 60)

    for text in test_inputs:
        result = classifier.analyze(text)
        print(f"\nInput: {text[:50]}...")
        print(f"  Label: {result['classification']['label']}")
        print(f"  Confidence: {result['classification']['confidence']:.2%}")
        print(f"  Risk Level: {result['risk_level'].upper()}")
        print(f"  Anomaly Score: {result['anomaly']['score']:.2%}")
        print(f"  Recommendation: {result['anomaly']['recommendation']}")
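The anomaly path is baseline-driven: until enough observations arrive, score() falls back to _heuristic_score. A minimal sketch of seeding a baseline from benign traffic and then scoring a hostile payload:

# Illustrative only: benign API paths keep sqli_score near zero, so the
# hostile payload lands many standard deviations out and scores high.
from mcp.waf_intelligence.classifier import AnomalyDetector

detector = AnomalyDetector()
for n in range(12):  # >= 10 observations triggers baseline updates
    detector.add_observation(f"/api/users/{n}")

result = detector.score("' OR '1'='1; DROP TABLE users--")
print(result.score, result.recommendation)
# Expect a high score and a BLOCK recommendation for this input.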
mcp/waf_intelligence/compliance.py (new file, 83 lines)
@@ -0,0 +1,83 @@
from __future__ import annotations

from dataclasses import dataclass
from typing import Dict, List, Optional


@dataclass
class FrameworkMapping:
    """
    Mapping between a WAF concept (e.g. 'SQLi protection') and references
    in one or more compliance frameworks.
    """

    control_id: str
    framework: str  # e.g. "PCI-DSS", "NIST-800-53", "GDPR"
    description: str
    references: List[str]


class ComplianceMapper:
    """
    Map WAF rules / violations to compliance frameworks.

    This starts as a simple static lookup table that we can extend over time.
    """

    def __init__(self) -> None:
        self._mappings: Dict[str, List[FrameworkMapping]] = self._build_default_mappings()

    def _build_default_mappings(self) -> Dict[str, List[FrameworkMapping]]:
        return {
            "sqli_protection": [
                FrameworkMapping(
                    control_id="6.6",
                    framework="PCI-DSS",
                    description="Ensure web-facing applications are protected against attacks such as SQL injection.",
                    references=["PCI-DSS v4.0 6.6", "OWASP Top 10 - A03:2021"],
                )
            ],
            "xss_protection": [
                FrameworkMapping(
                    control_id="A5",
                    framework="OWASP-ASVS",
                    description="Verify that all user-controllable input is properly encoded or escaped.",
                    references=["OWASP Top 10 - A3: Cross-Site Scripting"],
                )
            ],
            "baseline_waf": [
                FrameworkMapping(
                    control_id="13",
                    framework="OWASP-ASVS",
                    description="Centralized input validation, filtering, and WAF as compensating control.",
                    references=["OWASP-ASVS 13", "PCI-DSS 6.4.1"],
                )
            ],
        }

    def map_concept(self, concept: str) -> List[FrameworkMapping]:
        """
        Map a high-level WAF concept to compliance controls.

        Example concepts:
        - "sqli_protection"
        - "xss_protection"
        - "baseline_waf"
        """
        return self._mappings.get(concept, [])

    def best_effort_from_violation(self, message: str) -> List[FrameworkMapping]:
        """
        Try to infer framework mappings from a violation message string.
        This allows the analyzer to stay dumb while still attaching controls.
        """
        msg = message.lower()

        if "sql" in msg and "inject" in msg:
            return self.map_concept("sqli_protection")
        if "xss" in msg or "cross-site scripting" in msg:
            return self.map_concept("xss_protection")
        if "waf" in msg or "managed rules" in msg:
            return self.map_concept("baseline_waf")

        return []
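best_effort_from_violation is keyed on message substrings, so the analyzer's messages flow straight into controls; a quick sketch:

# Illustrative only: this analyzer message matches on the "waf" substring
# and maps to the baseline_waf concept (OWASP-ASVS 13 / PCI-DSS 6.4.1).
from mcp.waf_intelligence.compliance import ComplianceMapper

mapper = ComplianceMapper()
mappings = mapper.best_effort_from_violation(
    "No managed WAF rules detected in this file."
)
for m in mappings:
    print(m.framework, m.control_id, m.references)
# OWASP-ASVS 13 ['OWASP-ASVS 13', 'PCI-DSS 6.4.1']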
mcp/waf_intelligence/generator.py (new file, 120 lines)
@@ -0,0 +1,120 @@
from __future__ import annotations

from dataclasses import dataclass, field
from typing import Dict, List, Optional


@dataclass
class GeneratedRule:
    """Represents a Terraform WAF rule we propose to add."""

    name: str
    description: str
    terraform_snippet: str
    severity: str  # "low" | "medium" | "high" | "critical"
    tags: List[str] = field(default_factory=list)
    notes: Optional[str] = None
    impact_score: float = 0.5  # 0-1: estimated security impact
    effort_score: float = 0.5  # 0-1: estimated effort to implement


class WAFRuleGenerator:
    """
    Generate Cloudflare WAF Terraform rules with a quality-first strategy.
    """

    def generate_from_scenario(
        self,
        scenario: str,
        *,
        limit: int = 3,
        max_effort: float = 0.8,
    ) -> List[GeneratedRule]:
        """
        Return a small set of high-impact, reasonable-effort rules.
        """
        scenario_lower = scenario.lower()
        candidates: List[GeneratedRule] = []

        if "sql injection" in scenario_lower or "sqli" in scenario_lower:
            candidates.append(self._sql_injection_rule())

        if "xss" in scenario_lower:
            candidates.append(self._xss_rule())

        # If nothing matched, fall back to the baseline
        if not candidates:
            candidates.append(self._baseline_waf_rule())

        # Filter by effort & sort by impact
        filtered = [r for r in candidates if r.effort_score <= max_effort]
        if not filtered:
            filtered = candidates

        filtered.sort(key=lambda r: (-r.impact_score, r.effort_score))
        return filtered[:limit]

    def _sql_injection_rule(self) -> GeneratedRule:
        snippet = '''resource "cloudflare_ruleset" "waf_sqli_protection" {
  # TODO: adjust zone_id / account_id and phase for your setup
  name  = "WAF - SQLi protection"
  kind  = "zone"
  phase = "http_request_firewall_managed"

  rules = [{
    action     = "block"
    expression = "(cf.waf.ruleset eq \\"sqli\\")"
    enabled    = true
  }]
}
'''
        return GeneratedRule(
            name="waf_sqli_protection",
            description="Enable blocking against SQL injection attempts using Cloudflare managed rules.",
            terraform_snippet=snippet,
            severity="high",
            tags=["sqli", "managed_rules", "waf"],
            impact_score=0.95,
            effort_score=0.3,
        )

    def _xss_rule(self) -> GeneratedRule:
        snippet = '''resource "cloudflare_ruleset" "waf_xss_protection" {
  name  = "WAF - XSS protection"
  kind  = "zone"
  phase = "http_request_firewall_managed"

  rules = [{
    action     = "block"
    expression = "(cf.waf.ruleset eq \\"xss\\")"
    enabled    = true
  }]
}
'''
        return GeneratedRule(
            name="waf_xss_protection",
            description="Enable blocking against cross-site scripting (XSS) attacks.",
            terraform_snippet=snippet,
            severity="high",
            tags=["xss", "managed_rules", "waf"],
            impact_score=0.9,
            effort_score=0.3,
        )

    def _baseline_waf_rule(self) -> GeneratedRule:
        snippet = '''# Baseline WAF hardening (placeholder - customize for your environment)
# Consider enabling Cloudflare managed WAF rulesets for:
# - SQLi
# - XSS
# - RCE
# - Bot protection
'''
        return GeneratedRule(
            name="waf_baseline_hardening",
            description="Baseline recommendation to enable managed WAF rulesets.",
            terraform_snippet=snippet,
            severity="medium",
            tags=["baseline", "waf"],
            impact_score=0.7,
            effort_score=0.1,
        )
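A sketch of the scenario-to-rule path: a "sqli"/"sql injection" mention selects the SQLi rule, and impact_score/effort_score drive the ordering:

# Illustrative only: a scenario mentioning SQL injection yields the
# managed-rules blocking snippet, ranked by impact then effort.
from mcp.waf_intelligence.generator import WAFRuleGenerator

rules = WAFRuleGenerator().generate_from_scenario(
    "Login form is being probed for SQL injection", limit=1
)
print(rules[0].name)  # waf_sqli_protection
print(rules[0].terraform_snippet)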
mcp/waf_intelligence/orchestrator.py (new file, 370 lines; diff truncated below)
@@ -0,0 +1,370 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional
|
||||
|
||||
from mcp.waf_intelligence.analyzer import AnalysisResult, RuleViolation, WAFRuleAnalyzer
|
||||
from mcp.waf_intelligence.compliance import ComplianceMapper, FrameworkMapping
|
||||
from mcp.waf_intelligence.generator import GeneratedRule, WAFRuleGenerator
|
||||
|
||||
# Optional advanced modules (Phase 7)
|
||||
try:
|
||||
from mcp.waf_intelligence.threat_intel import (
|
||||
ThreatIntelCollector,
|
||||
ThreatIntelReport,
|
||||
ThreatIndicator,
|
||||
)
|
||||
_HAS_THREAT_INTEL = True
|
||||
except ImportError:
|
||||
_HAS_THREAT_INTEL = False
|
||||
ThreatIntelCollector = None
|
||||
|
||||
try:
|
||||
from mcp.waf_intelligence.classifier import (
|
||||
ThreatClassifier,
|
||||
ClassificationResult,
|
||||
)
|
||||
_HAS_CLASSIFIER = True
|
||||
except ImportError:
|
||||
_HAS_CLASSIFIER = False
|
||||
ThreatClassifier = None
|
||||
|
||||
|
||||
@dataclass
|
||||
class WAFInsight:
|
||||
"""Single high-quality insight across analysis + generation + compliance."""
|
||||
|
||||
violation: RuleViolation | None
|
||||
suggested_rule: GeneratedRule | None
|
||||
mappings: List[FrameworkMapping]
|
||||
|
||||
|
||||
@dataclass
|
||||
class ThreatAssessment:
|
||||
"""Phase 7: Comprehensive threat assessment result."""
|
||||
|
||||
analysis_result: Optional[AnalysisResult] = None
|
||||
threat_report: Optional[Any] = None # ThreatIntelReport when available
|
||||
classification_summary: Dict[str, int] = field(default_factory=dict)
|
||||
risk_score: float = 0.0
|
||||
recommended_actions: List[str] = field(default_factory=list)
|
||||
generated_at: datetime = field(default_factory=datetime.utcnow)
|
||||
|
||||
@property
|
||||
def risk_level(self) -> str:
|
||||
if self.risk_score >= 0.8:
|
||||
return "critical"
|
||||
elif self.risk_score >= 0.6:
|
||||
return "high"
|
||||
elif self.risk_score >= 0.4:
|
||||
return "medium"
|
||||
else:
|
||||
return "low"
|
||||
|
||||
|
||||
class WAFIntelligence:
|
||||
"""
|
||||
Quality-first orchestration layer:
|
||||
- analyze WAF config
|
||||
- propose a few rules
|
||||
- attach compliance mappings
|
||||
- Phase 7: integrate threat intel and ML classification
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
workspace_path: Optional[str] = None,
|
||||
enable_threat_intel: bool = True,
|
||||
enable_ml_classifier: bool = True,
|
||||
) -> None:
|
||||
self.workspace = Path(workspace_path) if workspace_path else Path.cwd()
|
||||
|
||||
# Core components
|
||||
self.analyzer = WAFRuleAnalyzer()
|
||||
self.generator = WAFRuleGenerator()
|
||||
self.mapper = ComplianceMapper()
|
||||
|
||||
# Phase 7 components (optional)
|
||||
self.threat_intel: Optional[Any] = None
|
||||
self.classifier: Optional[Any] = None
|
||||
|
||||
if enable_threat_intel and _HAS_THREAT_INTEL:
|
||||
try:
|
||||
self.threat_intel = ThreatIntelCollector()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if enable_ml_classifier and _HAS_CLASSIFIER:
|
||||
try:
|
||||
self.classifier = ThreatClassifier()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
def analyze_and_recommend(
|
||||
self,
|
||||
path: str,
|
||||
*,
|
||||
limit: int = 3,
|
||||
min_severity: str = "warning",
|
||||
) -> List[WAFInsight]:
|
||||
analysis: AnalysisResult = self.analyzer.analyze_file(
|
||||
path,
|
||||
min_severity=min_severity,
|
||||
)
|
||||
top_violations = analysis.top_violations(
|
||||
min_severity=min_severity,
|
||||
limit=limit,
|
||||
)
|
||||
|
||||
insights: List[WAFInsight] = []
|
||||
|
||||
for violation in top_violations:
|
||||
mappings = self.mapper.best_effort_from_violation(violation.message)
|
||||
|
||||
scenario = violation.message
|
||||
rules = self.generator.generate_from_scenario(scenario, limit=1)
|
||||
suggested = rules[0] if rules else None
|
||||
|
||||
insights.append(
|
||||
WAFInsight(
|
||||
violation=violation,
|
||||
suggested_rule=suggested,
|
||||
mappings=mappings,
|
||||
)
|
||||
)
|
||||
|
||||
return insights
|
||||
|
||||
# ─────────────────────────────────────────────────────────────────────────
|
||||
# Phase 7: Advanced threat intelligence methods
|
||||
# ─────────────────────────────────────────────────────────────────────────
|
||||
|
||||
def collect_threat_intel(
|
||||
self,
|
||||
log_paths: Optional[List[str]] = None,
|
||||
max_indicators: int = 100,
|
||||
) -> Optional[Any]:
|
||||
"""
|
||||
Collect threat intelligence from logs and external feeds.
|
||||
|
||||
Args:
|
||||
log_paths: Paths to Cloudflare log files
|
||||
max_indicators: Maximum indicators to collect
|
||||
|
||||
Returns:
|
||||
ThreatIntelReport or None if unavailable
|
||||
"""
|
||||
if not self.threat_intel:
|
||||
return None
|
||||
|
||||
# Default log paths
|
||||
if log_paths is None:
|
||||
log_paths = [
|
||||
str(self.workspace / "logs"),
|
||||
"/var/log/cloudflare",
|
||||
]
|
||||
|
||||
return self.threat_intel.collect(
|
||||
log_paths=log_paths,
|
||||
max_indicators=max_indicators,
|
||||
)
|
||||
|
||||
def classify_threat(self, payload: str) -> Optional[Any]:
|
||||
"""
|
||||
Classify a payload using ML classifier.
|
||||
|
||||
Args:
|
||||
payload: Request payload to classify
|
||||
|
||||
Returns:
|
||||
ClassificationResult or None
|
||||
"""
|
||||
if not self.classifier:
|
||||
return None
|
||||
|
||||
return self.classifier.classify(payload)
|
||||
|
||||
    def full_assessment(
        self,
        waf_config_path: Optional[str] = None,
        log_paths: Optional[List[str]] = None,
        include_threat_intel: bool = True,
    ) -> ThreatAssessment:
        """
        Phase 7: Perform a comprehensive threat assessment.

        Combines:
        - WAF configuration analysis
        - Threat intelligence collection
        - ML classification summary
        - Risk scoring

        Args:
            waf_config_path: Path to WAF Terraform file
            log_paths: Paths to log files
            include_threat_intel: Whether to collect threat intel

        Returns:
            ThreatAssessment with full analysis results
        """
        assessment = ThreatAssessment()
        risk_factors: List[float] = []
        recommendations: List[str] = []

        # 1. Analyze WAF configuration
        if waf_config_path is None:
            waf_config_path = str(self.workspace / "terraform" / "waf.tf")

        if Path(waf_config_path).exists():
            assessment.analysis_result = self.analyzer.analyze_file(
                waf_config_path,
                min_severity="info",
            )

            # Calculate risk from violations
            severity_weights = {"error": 0.8, "warning": 0.5, "info": 0.2}
            for violation in assessment.analysis_result.violations:
                weight = severity_weights.get(violation.severity, 0.3)
                risk_factors.append(weight)

            # Generate recommendations
            critical_count = sum(
                1 for v in assessment.analysis_result.violations
                if v.severity == "error"
            )
            if critical_count > 0:
                recommendations.append(
                    f"🔴 Fix {critical_count} critical WAF configuration issues"
                )

        # 2. Collect threat intelligence
        if include_threat_intel and self.threat_intel:
            try:
                assessment.threat_report = self.collect_threat_intel(
                    log_paths=log_paths,
                    max_indicators=50,
                )

                if assessment.threat_report:
                    indicators = assessment.threat_report.indicators

                    # Count by severity
                    severity_counts = {"critical": 0, "high": 0, "medium": 0, "low": 0}
                    for ind in indicators:
                        sev = getattr(ind, "severity", "low")
                        severity_counts[sev] = severity_counts.get(sev, 0) + 1

                    # Add to classification summary
                    assessment.classification_summary["threat_indicators"] = len(indicators)
                    assessment.classification_summary.update(severity_counts)

                    # Calculate threat intel risk
                    if indicators:
                        critical_ratio = severity_counts["critical"] / len(indicators)
                        high_ratio = severity_counts["high"] / len(indicators)
                        risk_factors.append(critical_ratio * 0.9 + high_ratio * 0.7)

                    if severity_counts["critical"] > 0:
                        recommendations.append(
                            f"🚨 Block {severity_counts['critical']} critical threat IPs immediately"
                        )
            except Exception:
                pass

        # 3. ML classification summary (from any collected data)
        if self.classifier and assessment.threat_report:
            try:
                attack_types = {"sqli": 0, "xss": 0, "rce": 0, "clean": 0, "unknown": 0}

                indicators = assessment.threat_report.indicators
                pattern_indicators = [
                    i for i in indicators
                    if getattr(i, "indicator_type", "") == "pattern"
                ]

                for ind in pattern_indicators[:20]:  # Sample first 20
                    result = self.classifier.classify(ind.value)
                    if result:
                        label = result.label
                        attack_types[label] = attack_types.get(label, 0) + 1

                assessment.classification_summary["ml_classifications"] = attack_types

                # Add ML risk factor
                dangerous = attack_types.get("sqli", 0) + attack_types.get("rce", 0)
                if dangerous > 5:
                    risk_factors.append(0.8)
                    recommendations.append(
                        f"⚠️ ML detected {dangerous} dangerous attack patterns"
                    )
            except Exception:
                pass

        # 4. Final risk score: mean of contributing factors, clamped to 1.0
        if risk_factors:
            assessment.risk_score = min(1.0, sum(risk_factors) / len(risk_factors))
        else:
            assessment.risk_score = 0.3  # Baseline risk

        assessment.recommended_actions = recommendations

        return assessment

    def generate_gitops_proposals(
        self,
        threat_report: Optional[Any] = None,
        max_proposals: int = 5,
    ) -> List[Dict[str, Any]]:
        """
        Generate GitOps-ready rule proposals.

        Args:
            threat_report: ThreatIntelReport to use
            max_proposals: Maximum proposals to generate

        Returns:
            List of proposal dicts ready for MR creation
        """
        proposals: List[Dict[str, Any]] = []

        if not threat_report:
            return proposals

        try:
            # Import proposer dynamically
            from gitops.waf_rule_proposer import WAFRuleProposer

            proposer = WAFRuleProposer(workspace_path=str(self.workspace))
            batch = proposer.generate_proposals(
                threat_report=threat_report,
                max_proposals=max_proposals,
            )

            for proposal in batch.proposals:
                proposals.append({
                    "name": proposal.rule_name,
                    "type": proposal.rule_type,
                    "severity": proposal.severity,
                    "confidence": proposal.confidence,
                    "terraform": proposal.terraform_code,
                    "justification": proposal.justification,
                    "auto_deploy": proposal.auto_deploy_eligible,
                })
        except ImportError:
            pass

        return proposals

    @property
    def capabilities(self) -> Dict[str, bool]:
        """Report available capabilities."""
        return {
            "core_analysis": True,
            "rule_generation": True,
            "compliance_mapping": True,
            "threat_intel": self.threat_intel is not None,
            "ml_classification": self.classifier is not None,
        }
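

# Example self-check (a sketch: exercises only the public surface defined
# above; run from the workspace root so default paths like terraform/waf.tf
# resolve).
if __name__ == "__main__":
    wi = WAFIntelligence()
    print("Capabilities:", wi.capabilities)
    assessment = wi.full_assessment(include_threat_intel=False)
    print(f"Risk score: {assessment.risk_score:.2f}")
    for action in assessment.recommended_actions:
        print(" -", action)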
279
mcp/waf_intelligence/server.py
Executable file
@@ -0,0 +1,279 @@
#!/usr/bin/env python3
"""
WAF Intelligence MCP Server for VS Code Copilot.

This implements the Model Context Protocol (MCP) stdio interface
so VS Code can communicate with your WAF Intelligence system.
"""

import json
import sys
from pathlib import Path

# Make the workspace root (the directory containing the `mcp` package)
# importable, rather than hard-coding a developer-specific absolute path.
sys.path.insert(0, str(Path(__file__).resolve().parents[2]))

from mcp.waf_intelligence.orchestrator import WAFIntelligence
from mcp.waf_intelligence.analyzer import WAFRuleAnalyzer


class WAFIntelligenceMCPServer:
    """MCP Server wrapper for WAF Intelligence."""

    def __init__(self):
        self.waf = WAFIntelligence()
        self.analyzer = WAFRuleAnalyzer()

    def get_capabilities(self) -> dict:
        """Return server capabilities."""
        return {
            "tools": [
                {
                    "name": "waf_analyze",
                    "description": "Analyze WAF logs and detect attack patterns",
                    "inputSchema": {
                        "type": "object",
                        "properties": {
                            "log_file": {
                                "type": "string",
                                "description": "Path to WAF log file (optional)"
                            },
                            "zone_id": {
                                "type": "string",
                                "description": "Cloudflare zone ID (optional)"
                            }
                        }
                    }
                },
                {
                    "name": "waf_assess",
                    "description": "Run full security assessment with threat intel and ML classification",
                    "inputSchema": {
                        "type": "object",
                        "properties": {
                            "zone_id": {
                                "type": "string",
                                "description": "Cloudflare zone ID"
                            }
                        },
                        "required": ["zone_id"]
                    }
                },
                {
                    "name": "waf_generate_rules",
                    "description": "Generate Terraform WAF rules from threat intelligence",
                    "inputSchema": {
                        "type": "object",
                        "properties": {
                            "zone_id": {
                                "type": "string",
                                "description": "Cloudflare zone ID"
                            },
                            "min_confidence": {
                                "type": "number",
                                "description": "Minimum confidence threshold (0-1)",
                                "default": 0.7
                            }
                        },
                        "required": ["zone_id"]
                    }
                },
                {
                    "name": "waf_capabilities",
                    "description": "List available WAF Intelligence capabilities",
                    "inputSchema": {
                        "type": "object",
                        "properties": {}
                    }
                }
            ]
        }

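    # Example exchange (sketch): a client writes one JSON-RPC line to stdin,
    #
    #   {"jsonrpc": "2.0", "id": 3, "method": "tools/call",
    #    "params": {"name": "waf_capabilities", "arguments": {}}}
    #
    # and the loop in run() routes it to handle_tool_call() below; the reply's
    # "content" entries carry JSON-encoded text.
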
    def handle_tool_call(self, name: str, arguments: dict) -> dict:
        """Handle a tool invocation."""
        try:
            if name == "waf_capabilities":
                return {
                    "content": [
                        {
                            "type": "text",
                            "text": json.dumps({
                                "capabilities": self.waf.capabilities,
                                "status": "operational"
                            }, indent=2)
                        }
                    ]
                }

            elif name == "waf_analyze":
                log_file = arguments.get("log_file")
                zone_id = arguments.get("zone_id")

                if log_file:
                    result = self.analyzer.analyze_log_file(log_file)
                else:
                    result = {
                        "message": "No log file provided. Use zone_id for live analysis.",
                        "capabilities": self.waf.capabilities
                    }

                return {
                    "content": [
                        {"type": "text", "text": json.dumps(result, indent=2, default=str)}
                    ]
                }

            elif name == "waf_assess":
                zone_id = arguments.get("zone_id")
                # full_assessment uses workspace paths, not zone_id
                assessment = self.waf.full_assessment(
                    include_threat_intel=True
                )
                # Build result from ThreatAssessment dataclass
                result = {
                    "zone_id": zone_id,
                    "risk_score": assessment.risk_score,
                    "risk_level": assessment.risk_level,
                    "classification_summary": assessment.classification_summary,
                    "recommended_actions": assessment.recommended_actions[:10],  # Top 10
                    "has_analysis": assessment.analysis_result is not None,
                    "has_threat_intel": assessment.threat_report is not None,
                    "generated_at": str(assessment.generated_at)
                }

                return {
                    "content": [
                        {"type": "text", "text": json.dumps(result, indent=2, default=str)}
                    ]
                }

            elif name == "waf_generate_rules":
                zone_id = arguments.get("zone_id")
                min_confidence = arguments.get("min_confidence", 0.7)

                # Generate proposals (doesn't use zone_id directly)
                proposals = self.waf.generate_gitops_proposals(
                    max_proposals=5
                )

                result = {
                    "zone_id": zone_id,
                    "min_confidence": min_confidence,
                    "proposals_count": len(proposals),
                    "proposals": proposals
                }

                return {
                    "content": [
                        {"type": "text", "text": json.dumps(result, indent=2, default=str) if proposals else "No rules generated (no threat data available)"}
                    ]
                }

            else:
                return {
                    "content": [
                        {"type": "text", "text": f"Unknown tool: {name}"}
                    ],
                    "isError": True
                }

        except Exception as e:
            return {
                "content": [
                    {"type": "text", "text": f"Error: {str(e)}"}
                ],
                "isError": True
            }

    def run(self):
        """Run the MCP server (stdio mode)."""
        # The MCP handshake is client-driven: the "initialize" request is
        # answered inside the loop below, so nothing is emitted at startup.

        # Main loop - read JSON-RPC messages from stdin
        for line in sys.stdin:
            try:
                message = json.loads(line.strip())

                if message.get("method") == "initialize":
                    response = {
                        "jsonrpc": "2.0",
                        "id": message.get("id"),
                        "result": {
                            "protocolVersion": "2024-11-05",
                            "serverInfo": {
                                "name": "waf-intelligence",
                                "version": "1.0.0"
                            },
                            "capabilities": {
                                "tools": {}
                            }
                        }
                    }
                    print(json.dumps(response), flush=True)

                elif message.get("method") == "tools/list":
                    response = {
                        "jsonrpc": "2.0",
                        "id": message.get("id"),
                        "result": self.get_capabilities()
                    }
                    print(json.dumps(response), flush=True)

                elif message.get("method") == "tools/call":
                    params = message.get("params", {})
                    tool_name = params.get("name")
                    tool_args = params.get("arguments", {})

                    result = self.handle_tool_call(tool_name, tool_args)

                    response = {
                        "jsonrpc": "2.0",
                        "id": message.get("id"),
                        "result": result
                    }
                    print(json.dumps(response), flush=True)

                elif message.get("method") == "notifications/initialized":
                    # Client acknowledged initialization
                    pass

                else:
                    # Unknown method
                    response = {
                        "jsonrpc": "2.0",
                        "id": message.get("id"),
                        "error": {
                            "code": -32601,
                            "message": f"Method not found: {message.get('method')}"
                        }
                    }
                    print(json.dumps(response), flush=True)

            except json.JSONDecodeError:
                continue
            except Exception as e:
                error_response = {
                    "jsonrpc": "2.0",
                    "id": None,
                    "error": {
                        "code": -32603,
                        "message": str(e)
                    }
                }
                print(json.dumps(error_response), flush=True)


if __name__ == "__main__":
|
||||
server = WAFIntelligenceMCPServer()
|
||||
server.run()
|
||||
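
# Smoke test (sketch), from the workspace root:
#   echo '{"jsonrpc":"2.0","id":1,"method":"initialize"}' \
#     | python mcp/waf_intelligence/server.py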
445
mcp/waf_intelligence/threat_intel.py
Normal file
@@ -0,0 +1,445 @@
"""
|
||||
Phase 7: Multi-Source Threat Intelligence Collector
|
||||
|
||||
Aggregates threat data from:
|
||||
- Cloudflare Analytics API (WAF events, firewall logs)
|
||||
- External threat feeds (AbuseIPDB, Emerging Threats, etc.)
|
||||
- Local honeypot signals (if configured)
|
||||
- Historical attack patterns from receipts/logs
|
||||
|
||||
Produces scored ThreatIndicators for ML classification and rule generation.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Set
|
||||
from urllib.parse import urlparse
|
||||
|
||||
# Optional: requests for external API calls
|
||||
try:
|
||||
import requests
|
||||
HAS_REQUESTS = True
|
||||
except ImportError:
|
||||
HAS_REQUESTS = False
|
||||
|
||||
|
||||
@dataclass
class ThreatIndicator:
    """Single threat indicator with scoring metadata."""

    indicator_type: str  # "ip", "ua", "path", "pattern", "country"
    value: str
    confidence: float  # 0.0-1.0
    severity: str  # "low", "medium", "high", "critical"
    sources: List[str] = field(default_factory=list)
    tags: List[str] = field(default_factory=list)
    first_seen: Optional[datetime] = None
    last_seen: Optional[datetime] = None
    hit_count: int = 1
    context: Dict[str, Any] = field(default_factory=dict)

    @property
    def fingerprint(self) -> str:
        """Unique identifier for deduplication."""
        raw = f"{self.indicator_type}:{self.value}"
        return hashlib.sha256(raw.encode()).hexdigest()[:16]

def merge(self, other: "ThreatIndicator") -> None:
|
||||
"""Merge another indicator into this one (for deduplication)."""
|
||||
self.hit_count += other.hit_count
|
||||
self.confidence = max(self.confidence, other.confidence)
|
||||
self.sources = list(set(self.sources + other.sources))
|
||||
self.tags = list(set(self.tags + other.tags))
|
||||
if other.first_seen and (not self.first_seen or other.first_seen < self.first_seen):
|
||||
self.first_seen = other.first_seen
|
||||
if other.last_seen and (not self.last_seen or other.last_seen > self.last_seen):
|
||||
self.last_seen = other.last_seen
|
||||
|
||||
|
||||
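# Dedup sketch (203.0.113.0/24 is a documentation range; values illustrative):
# two sightings of the same value share a fingerprint, so the later one merges
# into the earlier:
#
#   a = ThreatIndicator("ip", "203.0.113.7", 0.6, "medium", sources=["log_a"])
#   b = ThreatIndicator("ip", "203.0.113.7", 0.9, "medium", sources=["log_b"])
#   assert a.fingerprint == b.fingerprint
#   a.merge(b)  # confidence -> 0.9, hit_count -> 2, sources -> both

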
@dataclass
class ThreatIntelReport:
    """Aggregated threat intelligence from all sources."""

    indicators: List[ThreatIndicator] = field(default_factory=list)
    sources_queried: List[str] = field(default_factory=list)
    collection_time: datetime = field(default_factory=datetime.utcnow)
    metadata: Dict[str, Any] = field(default_factory=dict)

    @property
    def critical_count(self) -> int:
        return sum(1 for i in self.indicators if i.severity == "critical")

    @property
    def high_count(self) -> int:
        return sum(1 for i in self.indicators if i.severity == "high")

    def top_indicators(self, limit: int = 10) -> List[ThreatIndicator]:
        """Return highest-priority indicators."""
        severity_order = {"critical": 4, "high": 3, "medium": 2, "low": 1}
        sorted_indicators = sorted(
            self.indicators,
            key=lambda x: (severity_order.get(x.severity, 0), x.confidence, x.hit_count),
            reverse=True
        )
        return sorted_indicators[:limit]


class CloudflareLogParser:
    """Parse Cloudflare WAF/firewall logs for threat indicators."""

    # Common attack patterns in URIs
    ATTACK_PATTERNS = [
        (r"(?i)(?:union\s+select|select\s+.*\s+from)", "sqli", "high"),
        (r"(?i)<script[^>]*>", "xss", "high"),
        (r"(?i)(?:\.\./|\.\.\\)", "path_traversal", "medium"),
        (r"(?i)(?:cmd=|exec=|system\()", "rce", "critical"),
        (r"(?i)(?:wp-admin|wp-login|xmlrpc\.php)", "wordpress_probe", "low"),
        (r"(?i)(?:\.env|\.git|\.htaccess)", "sensitive_file", "medium"),
        (r"(?i)(?:phpmyadmin|adminer|mysql)", "db_probe", "medium"),
        (r"(?i)(?:eval\(|base64_decode)", "code_injection", "high"),
    ]

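    # Example: the URI "/index.php?id=1 UNION SELECT pass FROM users" matches
    # the sqli entry above, so _scan_for_patterns() below emits a high-severity
    # "pattern" indicator for it.
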
    # Known bad user agents
    BAD_USER_AGENTS = [
        ("sqlmap", "sqli_tool", "high"),
        ("nikto", "scanner", "medium"),
        ("nmap", "scanner", "medium"),
        ("masscan", "scanner", "medium"),
        ("zgrab", "scanner", "low"),
        ("python-requests", "bot", "low"),  # contextual
        ("curl", "bot", "low"),  # contextual
    ]

    def parse_log_file(self, path: Path) -> List[ThreatIndicator]:
        """Parse a log file and extract threat indicators."""
        indicators: List[ThreatIndicator] = []

        if not path.exists():
            return indicators

        try:
            with open(path, "r", encoding="utf-8", errors="ignore") as f:
                for line in f:
                    indicators.extend(self._parse_log_line(line))
        except Exception:
            pass

        return indicators

    def _parse_log_line(self, line: str) -> List[ThreatIndicator]:
        """Extract indicators from a single log line."""
        indicators: List[ThreatIndicator] = []

        # Try JSON format first
        try:
            data = json.loads(line)
            indicators.extend(self._parse_json_log(data))
            return indicators
        except json.JSONDecodeError:
            pass

        # Fall back to pattern matching on raw line
        indicators.extend(self._scan_for_patterns(line))

        return indicators

    def _parse_json_log(self, data: Dict[str, Any]) -> List[ThreatIndicator]:
        """Parse structured JSON log entry."""
        indicators: List[ThreatIndicator] = []

        # Extract IP if blocked or challenged
        action = data.get("action", "").lower()
        if action in ("block", "challenge", "managed_challenge"):
            ip = data.get("clientIP") or data.get("client_ip") or data.get("ip")
            if ip:
                indicators.append(ThreatIndicator(
                    indicator_type="ip",
                    value=ip,
                    confidence=0.8 if action == "block" else 0.6,
                    severity="high" if action == "block" else "medium",
                    sources=["cloudflare_log"],
                    tags=[action, data.get("ruleId", "unknown_rule")],
                    context={"rule": data.get("ruleName", ""), "action": action}
                ))

        # Extract URI patterns
        uri = data.get("clientRequestURI") or data.get("uri") or data.get("path", "")
        if uri:
            indicators.extend(self._scan_for_patterns(uri))

        # Extract user agent
        ua = data.get("clientRequestUserAgent") or data.get("user_agent", "")
        if ua:
            for pattern, tag, severity in self.BAD_USER_AGENTS:
                if pattern.lower() in ua.lower():
                    indicators.append(ThreatIndicator(
                        indicator_type="ua",
                        value=ua[:200],  # truncate
                        confidence=0.7,
                        severity=severity,
                        sources=["cloudflare_log"],
                        tags=[tag, "bad_ua"]
                    ))
                    break

        return indicators

    def _scan_for_patterns(self, text: str) -> List[ThreatIndicator]:
        """Scan text for known attack patterns."""
        indicators: List[ThreatIndicator] = []

        for pattern, tag, severity in self.ATTACK_PATTERNS:
            if re.search(pattern, text):
                indicators.append(ThreatIndicator(
                    indicator_type="pattern",
                    value=text[:500],  # truncate
                    confidence=0.75,
                    severity=severity,
                    sources=["pattern_match"],
                    tags=[tag, "attack_pattern"]
                ))

        return indicators


class ExternalThreatFeed:
    """Fetch threat intelligence from external APIs."""

    def __init__(self, api_keys: Optional[Dict[str, str]] = None):
        self.api_keys = api_keys or {}
        self._cache: Dict[str, ThreatIndicator] = {}

    def query_abuseipdb(self, ip: str) -> Optional[ThreatIndicator]:
        """Query AbuseIPDB for IP reputation."""
        if not HAS_REQUESTS:
            return None

        api_key = self.api_keys.get("abuseipdb") or os.getenv("ABUSEIPDB_API_KEY")
        if not api_key:
            return None

        cache_key = f"abuseipdb:{ip}"
        if cache_key in self._cache:
            return self._cache[cache_key]

        try:
            resp = requests.get(
                "https://api.abuseipdb.com/api/v2/check",
                headers={"Key": api_key, "Accept": "application/json"},
                params={"ipAddress": ip, "maxAgeInDays": 90},
                timeout=5
            )
            if resp.status_code == 200:
                data = resp.json().get("data", {})
                abuse_score = data.get("abuseConfidenceScore", 0)

                if abuse_score > 0:
                    severity = "critical" if abuse_score > 80 else "high" if abuse_score > 50 else "medium"
                    indicator = ThreatIndicator(
                        indicator_type="ip",
                        value=ip,
                        confidence=abuse_score / 100,
                        severity=severity,
                        sources=["abuseipdb"],
                        tags=["external_intel", "ip_reputation"],
                        hit_count=data.get("totalReports", 1),
                        context={
                            "abuse_score": abuse_score,
                            "country": data.get("countryCode"),
                            "isp": data.get("isp"),
                            "domain": data.get("domain"),
                            "usage_type": data.get("usageType"),
                        }
                    )
                    self._cache[cache_key] = indicator
                    return indicator
        except Exception:
            pass

        return None

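    # Usage sketch (requires an AbuseIPDB key, e.g. ABUSEIPDB_API_KEY in the
    # environment; the IP below is from a documentation range):
    #
    #   feed = ExternalThreatFeed()
    #   ind = feed.query_abuseipdb("203.0.113.7")
    #   if ind:
    #       print(ind.severity, ind.context.get("abuse_score"))
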
    def query_emerging_threats(self, ip: str) -> Optional[ThreatIndicator]:
        """Check IP against Emerging Threats blocklist (free, no API key)."""
        if not HAS_REQUESTS:
            return None

        # This is a simplified check; a real implementation would download and
        # cache the blocklist. For now we return None and rely on other sources.
        return None

    def enrich_indicator(self, indicator: ThreatIndicator) -> ThreatIndicator:
        """Enrich an indicator with external intelligence."""
        if indicator.indicator_type == "ip":
            external = self.query_abuseipdb(indicator.value)
            if external:
                indicator.merge(external)

        return indicator


class ThreatIntelCollector:
    """
    Main collector that aggregates from all sources.

    Usage:
        collector = ThreatIntelCollector(workspace_path="/path/to/cloudflare")
        report = collector.collect()
        for indicator in report.top_indicators(10):
            print(f"{indicator.severity}: {indicator.indicator_type}={indicator.value}")
    """

    def __init__(
        self,
        workspace_path: Optional[str] = None,
        api_keys: Optional[Dict[str, str]] = None,
        enable_external: bool = True
    ):
        self.workspace = Path(workspace_path) if workspace_path else Path.cwd()
        self.log_parser = CloudflareLogParser()
        self.external_feed = ExternalThreatFeed(api_keys) if enable_external else None
        self._indicators: Dict[str, ThreatIndicator] = {}

    def collect(
        self,
        log_dirs: Optional[List[str]] = None,
        enrich_external: bool = True,
        max_indicators: int = 1000
    ) -> ThreatIntelReport:
        """
        Collect threat intelligence from all configured sources.

        Args:
            log_dirs: Directories to scan for logs (default: observatory/,
                anomalies/, archive_runtime/receipts/)
            enrich_external: Whether to query external APIs for enrichment
            max_indicators: Maximum indicators to return

        Returns:
            ThreatIntelReport with deduplicated, scored indicators
        """
        sources_queried: List[str] = []

        # Default log directories
        if log_dirs is None:
            log_dirs = ["observatory", "anomalies", "archive_runtime/receipts"]

        # Collect from local logs
        for log_dir in log_dirs:
            dir_path = self.workspace / log_dir
            if dir_path.exists():
                sources_queried.append(f"local:{log_dir}")
                self._collect_from_directory(dir_path)

        # Collect from Terraform config (extract referenced IPs/patterns)
        tf_path = self.workspace / "terraform"
        if tf_path.exists():
            sources_queried.append("terraform_state")
            self._collect_from_terraform(tf_path)

        # Enrich with external intel if enabled
        if enrich_external and self.external_feed:
            sources_queried.append("external_apis")
            self._enrich_all_indicators()

        # Build report
        all_indicators = list(self._indicators.values())

        # Sort by priority and truncate
        severity_order = {"critical": 4, "high": 3, "medium": 2, "low": 1}
        all_indicators.sort(
            key=lambda x: (severity_order.get(x.severity, 0), x.confidence, x.hit_count),
            reverse=True
        )

        return ThreatIntelReport(
            indicators=all_indicators[:max_indicators],
            sources_queried=sources_queried,
            metadata={
                "workspace": str(self.workspace),
                "total_raw": len(self._indicators),
                "external_enabled": enrich_external and self.external_feed is not None
            }
        )

    def _collect_from_directory(self, dir_path: Path) -> None:
        """Scan a directory for log files and extract indicators."""
        log_patterns = ["*.log", "*.json", "*.jsonl"]

        for pattern in log_patterns:
            for log_file in dir_path.rglob(pattern):
                for indicator in self.log_parser.parse_log_file(log_file):
                    self._add_indicator(indicator)

    def _collect_from_terraform(self, tf_path: Path) -> None:
        """Extract indicators referenced in Terraform files."""
        for tf_file in tf_path.glob("*.tf"):
            try:
                content = tf_file.read_text(encoding="utf-8")

                # Extract IPs from allow/block rules
                ip_pattern = r'\b(?:\d{1,3}\.){3}\d{1,3}(?:/\d{1,2})?\b'
                for match in re.finditer(ip_pattern, content):
                    ip = match.group()
                    # Only flag if in a block context
                    context_start = max(0, match.start() - 100)
                    context = content[context_start:match.start()].lower()
                    if "block" in context or "deny" in context:
                        self._add_indicator(ThreatIndicator(
                            indicator_type="ip",
                            value=ip,
                            confidence=0.9,
                            severity="medium",
                            sources=["terraform_blocklist"],
                            tags=["existing_rule", "blocked_ip"],
                            context={"file": str(tf_file.name)}
                        ))
            except Exception:
                pass

    def _add_indicator(self, indicator: ThreatIndicator) -> None:
        """Add indicator with deduplication."""
        key = indicator.fingerprint
        if key in self._indicators:
            self._indicators[key].merge(indicator)
        else:
            self._indicators[key] = indicator

    def _enrich_all_indicators(self) -> None:
        """Enrich all IP indicators with external intelligence."""
        if not self.external_feed:
            return

        for indicator in list(self._indicators.values()):
            if indicator.indicator_type == "ip":
                self.external_feed.enrich_indicator(indicator)


# CLI interface for testing
if __name__ == "__main__":
    import sys

    workspace = sys.argv[1] if len(sys.argv) > 1 else "."

    collector = ThreatIntelCollector(
        workspace_path=workspace,
        enable_external=False  # Don't hit APIs in CLI test
    )

    report = collector.collect()

    print("\n🔍 Threat Intelligence Report")
    print("=" * 50)
    print(f"Sources: {', '.join(report.sources_queried)}")
    print(f"Total indicators: {len(report.indicators)}")
    print(f"Critical: {report.critical_count} | High: {report.high_count}")
    print("\nTop 10 Indicators:")
    print("-" * 50)

    for ind in report.top_indicators(10):
        print(f"  [{ind.severity.upper():8}] {ind.indicator_type}={ind.value[:50]}")
        print(f"    confidence={ind.confidence:.2f} hits={ind.hit_count} sources={ind.sources}")
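
# Example invocation, from the workspace root:
#   python mcp/waf_intelligence/threat_intel.py .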