feat: enforce layer0 gate and add tests

2025-12-17 00:02:39 +00:00
parent 37a867c485
commit 7f2e60e1c5
21 changed files with 2066 additions and 16 deletions
--- a/layer0/__init__.py
+++ b/layer0/__init__.py
@@ -0,0 +1,7 @@
+"""
+Layer 0 package: pre-boot Shadow Eval classifier and logger.
+"""
+
+# Pull the entrypoint up to package level so callers can import it from the
+# package itself instead of reaching into the ``entrypoint`` submodule.
+from .entrypoint import layer0_entry  # re-export for convenience
+
+# Names exported by ``from <package> import *``; only the entrypoint is listed.
+__all__ = ["layer0_entry"]
--- a/layer0/entrypoint.py
+++ b/layer0/entrypoint.py
@@ -0,0 +1,17 @@
+from .shadow_classifier import ShadowClassifier, Classification, ShadowEvalResult
+from .preboot_logger import PrebootLogger
+
+# Module-level classifier instance, created once at import time and reused by
+# every call to layer0_entry() below.
+classifier = ShadowClassifier()
+
+
+def layer0_entry(query: str) -> tuple[str, ShadowEvalResult]:
+    """
+    Classify *query* at Layer 0, before Layer 1 (Doctrine Load) runs.
+
+    The query is passed to the module-level ``classifier``. When the verdict is
+    CATASTROPHIC or FORBIDDEN it is handed to ``PrebootLogger.log`` together
+    with the raw query text.
+
+    Returns a ``(routing_action, result)`` tuple: the string produced by
+    ``result.to_routing_action()`` and the full evaluation result.
+    """
+    verdict = classifier.classify(query)
+
+    # Only these two levels count as violations worth recording.
+    violation_levels = (Classification.CATASTROPHIC, Classification.FORBIDDEN)
+    if verdict.classification in violation_levels:
+        PrebootLogger.log(verdict, query)
+
+    action = verdict.to_routing_action()
+    return action, verdict
--- a/layer0/preboot_logger.py
+++ b/layer0/preboot_logger.py
@@ -0,0 +1,33 @@
+import datetime
+import json
+import os
+from typing import Optional
+
+from .shadow_classifier import ShadowEvalResult, Classification
+
+
+class PrebootLogger:
+    """Appends Layer 0 violation records to a JSON Lines file."""
+
+    # Relative path, so it resolves against the process's current working
+    # directory at the time log() is called.
+    LOG_PATH = "anomalies/preboot_shield.jsonl"
+
+    @staticmethod
+    def log(event: ShadowEvalResult, query: str, reason_override: Optional[str] = None) -> None:
+        """
+        Append one JSON record describing a violation to ``LOG_PATH``.
+
+        Events whose classification is neither CATASTROPHIC nor FORBIDDEN are
+        ignored and nothing is written.
+
+        Args:
+            event: Evaluation result to record.
+            query: Raw query text; stored verbatim in the record.
+            reason_override: If truthy, stored as ``reason`` instead of
+                ``event.reason``.
+
+        Raises:
+            OSError: If the log directory cannot be created or the file
+                cannot be opened or written.
+        """
+        if event.classification not in (Classification.CATASTROPHIC, Classification.FORBIDDEN):
+            return  # Only violations get logged
+
+        # datetime.utcnow() is deprecated; take an aware UTC "now" and drop the
+        # tzinfo so isoformat() yields the same naive string, then append "Z".
+        now_utc = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
+
+        record = {
+            "timestamp": now_utc.isoformat() + "Z",
+            "query": query,
+            "classification": event.classification.value,
+            "reason": reason_override or event.reason,
+            "trace_id": event.trace_id,
+            "metadata": {
+                "risk_score": event.risk_score,
+                "flags": event.flags,
+                "source": "layer0",
+            },
+        }
+
+        # dirname() is "" when LOG_PATH is a bare filename, and makedirs("")
+        # raises FileNotFoundError, so only create a directory when one is named.
+        log_dir = os.path.dirname(PrebootLogger.LOG_PATH)
+        if log_dir:
+            os.makedirs(log_dir, exist_ok=True)
+
+        # Append mode: one JSON object per line, existing records are kept.
+        with open(PrebootLogger.LOG_PATH, "a", encoding="utf-8") as f:
+            f.write(json.dumps(record) + "\n")
--- a/layer0/shadow_classifier.py
+++ b/layer0/shadow_classifier.py
@@ -0,0 +1,93 @@
+from enum import Enum
+from typing import Optional, List
+import uuid
+
+
+class Classification(str, Enum):
+    """Categories a Layer 0 evaluation can assign to a query.
+
+    The ``str`` mix-in makes each member a string equal to its lowercase value.
+    The routing action named beside each member is the one returned by
+    ``ShadowEvalResult.to_routing_action``.
+    """
+    BLESSED = "blessed"  # routed as "HANDOFF_TO_LAYER1"
+    AMBIGUOUS = "ambiguous"  # routed as "PROMPT_FOR_CLARIFICATION"
+    FORBIDDEN = "forbidden"  # routed as "HANDOFF_TO_GUARDRAILS"
+    CATASTROPHIC = "catastrophic"  # routed as "FAIL_CLOSED"
+
+
+class ShadowEvalResult:
+    """Outcome of one Layer 0 classification.
+
+    Attributes:
+        classification: The ``Classification`` assigned to the query.
+        reason: Optional reason string; ``None`` when not given.
+        risk_score: Integer risk score; defaults to 0.
+        flags: List of flag strings; a falsy ``flags`` argument becomes ``[]``.
+        trace_id: Random UUID4 string generated when the result is created.
+    """
+
+    def __init__(
+        self,
+        classification: Classification,
+        reason: Optional[str] = None,
+        risk_score: int = 0,
+        flags: Optional[List[str]] = None,
+    ):
+        # Every result gets its own identifier, independent of the arguments.
+        self.trace_id = str(uuid.uuid4())
+        self.classification = classification
+        self.reason = reason
+        self.risk_score = risk_score
+        self.flags = flags if flags else []
+
+    def to_routing_action(self) -> str:
+        """Return the routing action string for this result's classification."""
+        # Checked in order with ``==``; anything that matches none of these
+        # (including BLESSED) falls through to the Layer 1 hand-off.
+        routes = (
+            (Classification.CATASTROPHIC, "FAIL_CLOSED"),
+            (Classification.FORBIDDEN, "HANDOFF_TO_GUARDRAILS"),
+            (Classification.AMBIGUOUS, "PROMPT_FOR_CLARIFICATION"),
+        )
+        for level, action in routes:
+            if self.classification == level:
+                return action
+        return "HANDOFF_TO_LAYER1"
+
+
+class ShadowClassifier:
+    """
+    Minimal doctrinal classifier for Layer 0 (Shadow Eval).
+
+    Classification is case-insensitive substring matching against fixed phrase
+    tables. Tiers are checked in descending risk_score order (catastrophic,
+    forbidden, ambiguous) and the first tier that matches wins; a query that
+    matches none is blessed.
+    """
+
+    # Phrase tables live on the class so they are built once rather than on
+    # every classify() call.
+    _CATASTROPHIC_PHRASES = (
+        "disable guardrails",
+        "override agent permissions",
+        "bypass governance",
+        "self-modifying",
+    )
+    _FORBIDDEN_PHRASES = (
+        "skip git",
+        "apply directly",
+        "dashboard",
+        "manual change",
+    )
+    _AMBIGUOUS_PHRASES = (
+        "fix it",
+        "change this",
+        "update stuff",
+    )
+
+    @staticmethod
+    def _contains_any(text: str, phrases) -> bool:
+        """Return True if any of *phrases* occurs as a substring of *text*."""
+        return any(phrase in text for phrase in phrases)
+
+    def classify(self, query: str) -> ShadowEvalResult:
+        """Return a doctrinal classification for the incoming query.
+
+        Args:
+            query: Raw query text.
+
+        Returns:
+            A ``ShadowEvalResult`` whose classification, reason, risk_score and
+            flags depend on which tier (if any) the query matched.
+        """
+
+        # Lowercase and collapse every run of whitespace (spaces, tabs,
+        # newlines) to a single space. Without this, "disable  guardrails" or
+        # "skip\tgit" would not contain the multi-word phrases and would slip
+        # past the gate. This also trims leading/trailing whitespace.
+        q = " ".join(query.lower().split())
+
+        # 1. Catastrophic (fail closed)
+        if self._contains_any(q, self._CATASTROPHIC_PHRASES):
+            return ShadowEvalResult(
+                classification=Classification.CATASTROPHIC,
+                reason="catastrophic_indicator",
+                risk_score=5,
+                flags=["permission_override", "guardrail_disable"],
+            )
+
+        # 2. Forbidden (governance violation)
+        if self._contains_any(q, self._FORBIDDEN_PHRASES):
+            return ShadowEvalResult(
+                classification=Classification.FORBIDDEN,
+                reason="governance_violation",
+                risk_score=3,
+                flags=["gitops_bypass"],
+            )
+
+        # 3. Ambiguous (needs clarification): a vague phrase, or a query of at
+        #    most two words (which includes the empty query).
+        if self._contains_any(q, self._AMBIGUOUS_PHRASES) or len(q.split()) <= 2:
+            return ShadowEvalResult(
+                classification=Classification.AMBIGUOUS,
+                reason="insufficient_context",
+                risk_score=1,
+                flags=["needs_clarification"],
+            )
+
+        # 4. Blessed (valid + lawful)
+        return ShadowEvalResult(
+            classification=Classification.BLESSED,
+            reason=None,
+            risk_score=0,
+        )