# --- Source listing metadata (converted to comments; was raw page text that
# --- would break Python parsing) ---
# Files: vm-cloudflare/oracle_runner.py
# Snapshot: 2025-12-17 00:02:39 +00:00 — 455 lines, 14 KiB, Python, executable file.
#!/usr/bin/env python3
"""
COMPLIANCE ORACLE RUNNER
v0.4.0 - Production Ready
End-to-end compliance oracle that:
1. Searches documentation for answers
2. Builds context from multiple frameworks
3. Queries LLM for oracle answers
4. Validates answers with typing
5. Emits receipt with sha256 hash
6. Logs to compliance ledger
Usage:
python3 oracle_runner.py "What are our incident response obligations under NIS2?"
python3 oracle_runner.py "Are we compliant with GDPR Article 33?"
python3 oracle_runner.py "Summarize WAF rules for PCI-DSS" --frameworks pci-dss,gdpr
"""
import json
import sys
import os
import hashlib
import datetime
from pathlib import Path
from typing import Optional, List, Dict, Any
from dataclasses import dataclass, asdict, field
from enum import Enum
import re
from layer0 import layer0_entry
from layer0.shadow_classifier import ShadowEvalResult
class ComplianceFramework(str, Enum):
    """Supported compliance frameworks.

    Subclasses ``str`` so members compare equal to their plain string
    values (e.g. ``ComplianceFramework.GDPR == "gdpr"``).

    NOTE(review): this enum is not referenced elsewhere in this file;
    framework keys are passed around as plain strings — confirm intended use.
    """
    PCI_DSS = "pci-dss"    # Payment Card Industry Data Security Standard
    GDPR = "gdpr"          # EU General Data Protection Regulation
    NIS2 = "nis2"          # EU Network and Information Security Directive 2
    AI_ACT = "ai-act"      # EU Artificial Intelligence Act
    SOC2 = "soc2"          # AICPA SOC 2
    ISO27001 = "iso27001"  # ISO/IEC 27001 information security standard
    HIPAA = "hipaa"        # US Health Insurance Portability and Accountability Act
    ALL = "all"            # presumably a wildcard meaning "every framework" — confirm
@dataclass
class Citation:
    """Single citation pointing into a source document.

    Produced by OracleRunner.search_documents via simple keyword matching.
    """
    document_id: str               # filename with its ".md" suffix removed
    filename: str                  # document filename under the docs directory
    framework: str                 # framework key the document was searched under
    snippet: str                   # short excerpt around the first keyword hit
    relevance_score: float = 0.85  # fraction of question words found in the document
@dataclass
class ComplianceGap:
    """Identified gap in compliance coverage.

    Emitted by OracleRunner._identify_gaps (currently only when fewer than
    two supporting citations were found).
    """
    framework: str                     # framework key, or "all" for a cross-cutting gap
    requirement: str                   # the requirement that is not fully covered
    current_state: str                 # short description of the present situation
    gap_description: str               # what is missing or insufficient
    remediation: Optional[str] = None  # suggested fix, when one is known
@dataclass
class OracleAnswer:
    """Core oracle answer schema (v0.4.0).

    Aggregates the generated answer text with its supporting citations,
    identified compliance gaps, and per-framework coverage flags.
    """
    question: str          # the question as asked
    answer: str            # generated answer text
    frameworks: List[str]  # framework keys the question was evaluated against
    citations: List[Citation]      # supporting document excerpts
    gaps: List[ComplianceGap]      # coverage gaps discovered while answering
    insufficient_context: bool = False  # True when documentation coverage was thin
    confidence_level: str = "high"      # one of: high, medium, low
    compliance_flags: Dict[str, str] = field(default_factory=dict)  # framework -> status

    def to_json(self) -> str:
        """Serialize to canonical JSON (sorted keys, compact separators).

        The canonical form keeps the SHA-256 receipt hash stable for
        identical answers. ``asdict`` already converts the nested Citation
        and ComplianceGap dataclasses recursively, so the previous per-field
        re-conversion of ``citations`` and ``gaps`` was redundant and has
        been removed.
        """
        return json.dumps(asdict(self), sort_keys=True, separators=(",", ":"))
@dataclass
class OracleReceipt:
    """Receipt binding a serialized oracle answer to its SHA-256 digest (v0.4.0)."""
    timestamp: str      # ISO-8601 time the receipt was created
    oracle_answer: str  # the full JSON-serialized answer
    answer_hash: str    # hex digest of oracle_answer
    hash_algorithm: str = "sha256"
    version: str = "v0.4.0"

    def to_json(self) -> str:
        """Render this receipt as pretty-printed (indented) JSON."""
        payload = asdict(self)
        return json.dumps(payload, indent=2)
class OracleRunner:
    """End-to-end compliance oracle.

    Pipeline: search local markdown documentation for relevant passages,
    assemble an OracleAnswer (the LLM call is currently stubbed with a
    template), validate it, then emit a SHA-256 receipt and append it to
    the compliance ledger (JSONL).
    """

    def __init__(self, base_path: str = "/Users/sovereign/Desktop/CLOUDFLARE"):
        """Anchor the documentation directory and ledger under *base_path*."""
        self.base_path = Path(base_path)
        self.docs_path = self.base_path
        self.compliance_ledger = self.base_path / "COMPLIANCE_LEDGER.jsonl"
        # Framework key -> candidate documentation filenames (relative to docs_path).
        self.framework_docs: Dict[str, List[str]] = {
            "pci-dss": [
                "cloudflare_waf_baseline.md",
                "WEB-INFRA-SECURITY-PATTERNS.md",
            ],
            "gdpr": [
                "zero_trust_architecture.md",
                "WEB-INFRA-SECURITY-PATTERNS.md",
                "cloudflare_dns_manifest.md",
            ],
            "nis2": [
                "TUNNEL-HARDENING.md",
                "WEB-INFRA-SECURITY-PATTERNS.md",
            ],
            "ai-act": [
                "zero_trust_architecture.md",
                "WEB-INFRA-SECURITY-PATTERNS.md",
            ],
        }

    def search_documents(
        self, question: str, frameworks: Optional[List[str]] = None, max_docs: int = 5
    ) -> List[Citation]:
        """Search documentation for content relevant to *question*.

        Relevance is the fraction of the question's words appearing in a
        document; documents scoring above 0.2 yield a Citation. Returns at
        most *max_docs* citations, highest relevance first.
        """
        citations: List[Citation] = []
        if frameworks is None:
            frameworks = ["pci-dss", "gdpr", "nis2"]
        # Tokenize the question once: the word set is invariant across every
        # document scanned below (it was previously recomputed per file).
        question_words = set(re.findall(r"\b\w+\b", question.lower()))
        for framework in frameworks:
            for doc_filename in self.framework_docs.get(framework, []):
                doc_path = self.docs_path / doc_filename
                if not doc_path.exists():
                    continue
                try:
                    content = doc_path.read_text(encoding="utf-8")
                except (OSError, UnicodeDecodeError) as e:
                    # Unreadable document: warn and keep scanning the rest.
                    print(
                        f"Warning: Error reading {doc_filename}: {e}", file=sys.stderr
                    )
                    continue
                content_lower = content.lower()
                matches = sum(1 for word in question_words if word in content_lower)
                relevance = min(1.0, matches / max(1, len(question_words)))
                if relevance > 0.2:  # relevance threshold
                    citations.append(
                        Citation(
                            document_id=doc_filename.replace(".md", ""),
                            filename=doc_filename,
                            framework=framework,
                            snippet=self._extract_snippet(content, question_words),
                            relevance_score=relevance,
                        )
                    )
        # Highest-relevance citations first, capped at max_docs.
        citations.sort(key=lambda c: c.relevance_score, reverse=True)
        return citations[:max_docs]

    def _extract_snippet(
        self, content: str, keywords: set, snippet_len: int = 200
    ) -> str:
        """Return up to *snippet_len* chars of context around the first keyword hit.

        Scans line by line; on the first line containing any keyword, returns
        that line plus two lines of context on either side (truncated, with an
        ellipsis when cut). Falls back to the head of the document when no
        line matches.
        """
        lines = content.split("\n")
        for i, line in enumerate(lines):
            lowered = line.lower()  # lowercase once per line, not per keyword
            if any(keyword in lowered for keyword in keywords):
                start = max(0, i - 2)
                end = min(len(lines), i + 3)
                snippet = "\n".join(lines[start:end])
                return snippet[:snippet_len] + (
                    "..." if len(snippet) > snippet_len else ""
                )
        return content[:snippet_len] + ("..." if len(content) > snippet_len else "")

    def validate_oracle_answer(self, answer: OracleAnswer) -> bool:
        """Validate (and lightly normalize) an answer in place.

        Returns False when question/answer text or the framework list is
        missing. Side effects: flags insufficient context when there are no
        citations, and fills any missing compliance flag with "unknown".
        """
        if not answer.question or not answer.answer:
            return False
        if not answer.frameworks:  # empty list is falsy; explicit len check was redundant
            return False
        if not answer.citations:
            answer.insufficient_context = True
        for framework in answer.frameworks:
            if framework not in answer.compliance_flags:
                answer.compliance_flags[framework] = "unknown"
        return True

    def emit_oracle_receipt(self, answer: OracleAnswer) -> OracleReceipt:
        """Hash *answer* with SHA-256 and append a receipt line to the ledger.

        The ledger write is best-effort: failures are reported on stderr but
        never abort the pipeline.
        """
        answer_json = answer.to_json()
        answer_hash = hashlib.sha256(answer_json.encode()).hexdigest()
        receipt = OracleReceipt(
            timestamp=datetime.datetime.now(datetime.timezone.utc).isoformat(),
            oracle_answer=answer_json,
            answer_hash=answer_hash,
        )
        try:
            with open(self.compliance_ledger, "a", encoding="utf-8") as f:
                # The ledger is JSONL: exactly one compact JSON object per
                # line. Writing receipt.to_json() here (indent=2, multi-line)
                # corrupted the line-oriented format.
                f.write(json.dumps(asdict(receipt)) + "\n")
        except OSError as e:
            print(f"Warning: Could not write to ledger: {e}", file=sys.stderr)
        return receipt

    def run(
        self,
        question: str,
        frameworks: Optional[List[str]] = None,
        verbose: bool = False,
    ) -> Dict[str, Any]:
        """Run the complete oracle pipeline for *question*.

        Returns {"answer": OracleAnswer, "receipt": OracleReceipt}.
        Exits the process (code 1) if the assembled answer fails validation.
        """
        if verbose:
            print(f"\n[ORACLE] Question: {question}\n", file=sys.stderr)
        # Step 1: gather supporting citations.
        if verbose:
            print("[ORACLE] Searching documentation...", file=sys.stderr)
        citations = self.search_documents(question, frameworks)
        if verbose:
            print(
                f"[ORACLE] Found {len(citations)} relevant documents\n", file=sys.stderr
            )
        # Step 2: build the answer. In production this would call an LLM;
        # for now a template is assembled from the citations.
        frameworks_list = frameworks or ["pci-dss", "gdpr"]
        answer = OracleAnswer(
            question=question,
            answer=self._generate_answer(question, citations),
            frameworks=frameworks_list,
            citations=citations,
            gaps=self._identify_gaps(question, citations),
            insufficient_context=len(citations) < 2,
            compliance_flags={
                framework: "covered"
                if any(c.framework == framework for c in citations)
                else "uncovered"
                for framework in frameworks_list
            },
        )
        # Step 3: validate (also normalizes missing compliance flags).
        if not self.validate_oracle_answer(answer):
            # NOTE(review): exiting from library code is abrupt; non-CLI
            # callers may prefer an exception. Kept for compatibility.
            print("[ERROR] Answer validation failed", file=sys.stderr)
            sys.exit(1)
        if verbose:
            print("[ORACLE] Answer validated\n", file=sys.stderr)
        # Step 4: hash the answer, emit the receipt, append to the ledger.
        receipt = self.emit_oracle_receipt(answer)
        if verbose:
            print(
                f"[ORACLE] Receipt emitted with hash: {receipt.answer_hash[:16]}...\n",
                file=sys.stderr,
            )
        return {"answer": answer, "receipt": receipt}

    def _generate_answer(self, question: str, citations: List[Citation]) -> str:
        """Assemble a template answer from up to three top citations.

        Placeholder for an LLM call in production.
        """
        if not citations:
            return (
                "Based on the available documentation, I could not find sufficient context "
                "to answer this question. Please provide more specific details or add relevant "
                "documentation to the knowledge base."
            )
        citation_text = "\n\n".join(
            f"From {c.filename} ({c.framework}):\n{c.snippet}" for c in citations[:3]
        )
        return (
            f"Based on the available documentation:\n\n{citation_text}\n\n"
            "[Note: In production, this would be replaced with an LLM-generated answer]"
        )

    def _identify_gaps(
        self, question: str, citations: List[Citation]
    ) -> List[ComplianceGap]:
        """Flag a blanket coverage gap when fewer than two citations exist."""
        gaps: List[ComplianceGap] = []
        if len(citations) < 2:
            gaps.append(
                ComplianceGap(
                    framework="all",
                    requirement="Full coverage",
                    current_state="Documented",
                    gap_description="Insufficient documentation found for comprehensive answer",
                )
            )
        return gaps
def parse_frameworks(arg_value: str) -> List[str]:
    """Parse a comma-separated framework list into trimmed, non-empty names.

    Empty tokens (e.g. from a trailing comma in "gdpr,") are dropped so
    downstream lookups never see "" as a framework name.
    """
    return [name for name in (part.strip() for part in arg_value.split(",")) if name]
def main() -> int:
    """CLI entry point: parse argv, gate via Layer 0, run the oracle.

    Returns a process exit code: 0 on success, 1 on usage error or when
    Layer 0 refuses the request.
    """
    if len(sys.argv) < 2:
        print("Usage: oracle_runner.py <question> [--frameworks framework1,framework2]")
        print("\nExample:")
        print(' oracle_runner.py "Are we GDPR compliant?" --frameworks gdpr')
        print(' oracle_runner.py "What are NIS2 obligations?" --frameworks nis2')
        return 1
    question = sys.argv[1]
    frameworks: Optional[List[str]] = None
    verbose = "--verbose" in sys.argv or "-v" in sys.argv
    # Layer 0: pre-boot Shadow Eval gate before any processing.
    routing_action, shadow = layer0_entry(question)
    if routing_action != "HANDOFF_TO_LAYER1":
        # Rendering is inlined here rather than calling _render_layer0_block:
        # that helper is defined *below* the __main__ guard, so it is not yet
        # bound when main() runs as a script and calling it raised NameError.
        if routing_action == "FAIL_CLOSED":
            print("Layer 0: cannot comply with this request.", file=sys.stderr)
        elif routing_action == "HANDOFF_TO_GUARDRAILS":
            reason = shadow.reason or "governance_violation"
            print(
                f"Layer 0: governance violation detected ({reason}).",
                file=sys.stderr,
            )
        elif routing_action == "PROMPT_FOR_CLARIFICATION":
            print(
                "Layer 0: request is ambiguous. Please add specifics before rerunning.",
                file=sys.stderr,
            )
        else:
            print(
                "Layer 0: unrecognized routing action; refusing request.",
                file=sys.stderr,
            )
        return 1
    # Parse the frameworks flag: "--frameworks=a,b" or "--frameworks a,b".
    for i, arg in enumerate(sys.argv[2:], 2):
        if arg.startswith("--frameworks="):
            frameworks = parse_frameworks(arg.split("=", 1)[1])
        elif arg == "--frameworks" and i + 1 < len(sys.argv):
            frameworks = parse_frameworks(sys.argv[i + 1])
    runner = OracleRunner()
    result = runner.run(question, frameworks=frameworks, verbose=verbose)
    answer = result["answer"]
    receipt = result["receipt"]
    # --- Human-readable report ---
    print("\n" + "=" * 80)
    print("COMPLIANCE ORACLE ANSWER")
    print("=" * 80)
    print(f"\nQuestion: {answer.question}\n")
    print(f"Answer:\n{answer.answer}\n")
    print(f"Frameworks: {', '.join(answer.frameworks)}")
    print(f"Confidence: {answer.confidence_level}")
    print(f"Insufficient Context: {answer.insufficient_context}\n")
    if answer.citations:
        print("Citations:")
        for i, citation in enumerate(answer.citations, 1):
            print(f" [{i}] {citation.filename} ({citation.framework})")
            print(f" Relevance: {citation.relevance_score:.2%}")
            print(f" Snippet: {citation.snippet[:100]}...")
    if answer.gaps:
        print("\nIdentified Gaps:")
        for gap in answer.gaps:
            print(f" - {gap.framework}: {gap.gap_description}")
            if gap.remediation:
                print(f" Remediation: {gap.remediation}")
    print("\nCompliance Status:")
    for framework, status in answer.compliance_flags.items():
        # NOTE(review): both branches are empty strings; the covered/uncovered
        # markers (likely emoji) appear to have been lost — confirm upstream.
        symbol = "" if status == "covered" else ""
        print(f" {symbol} {framework}: {status}")
    print(f"\nReceipt Hash (sha256): {receipt.answer_hash}")
    print(f"Timestamp: {receipt.timestamp}")
    print("=" * 80)
    return 0
# NOTE(review): this guard executes before _render_layer0_block (defined
# below it) is bound, so Layer 0 refusal paths inside main() would raise
# NameError when run as a script. Consider moving this guard to the end of
# the file.
if __name__ == "__main__":
    sys.exit(main())
def _render_layer0_block(routing_action: str, shadow: ShadowEvalResult) -> None:
    """Print a minimal stderr notice for a Layer 0 refusal decision.

    Selects the message for the given routing action, falling back to a
    generic refusal for anything unrecognized. Only the guardrails path
    reads *shadow* (for its optional ``reason``).
    """
    if routing_action == "FAIL_CLOSED":
        message = "Layer 0: cannot comply with this request."
    elif routing_action == "HANDOFF_TO_GUARDRAILS":
        reason = shadow.reason or "governance_violation"
        message = f"Layer 0: governance violation detected ({reason})."
    elif routing_action == "PROMPT_FOR_CLARIFICATION":
        message = "Layer 0: request is ambiguous. Please add specifics before rerunning."
    else:
        message = "Layer 0: unrecognized routing action; refusing request."
    print(message, file=sys.stderr)