feat: enforce layer0 gate and add tests

This commit is contained in:
Vault Sovereign
2025-12-17 00:02:39 +00:00
parent 37a867c485
commit 7f2e60e1c5
21 changed files with 2066 additions and 16 deletions

7
layer0/__init__.py Normal file
View File

@@ -0,0 +1,7 @@
"""
Layer 0 package: pre-boot Shadow Eval classifier and logger.
"""
from .entrypoint import layer0_entry # re-export for convenience
__all__ = ["layer0_entry"]

17
layer0/entrypoint.py Normal file
View File

@@ -0,0 +1,17 @@
from .shadow_classifier import ShadowClassifier, Classification, ShadowEvalResult
from .preboot_logger import PrebootLogger
# Single module-level classifier instance reused by every call to layer0_entry.
classifier = ShadowClassifier()


def layer0_entry(query: str) -> tuple[str, ShadowEvalResult]:
    """
    Run the Layer 0 gate on *query* before Layer 1 (Doctrine Load).

    The query is classified; CATASTROPHIC and FORBIDDEN verdicts are handed
    to PrebootLogger before returning.

    Returns:
        A ``(routing_action, result)`` pair: the action string produced by
        ``result.to_routing_action()`` and the full evaluation result.
    """
    verdict = classifier.classify(query)

    violation_levels = (Classification.CATASTROPHIC, Classification.FORBIDDEN)
    is_violation = verdict.classification in violation_levels
    if is_violation:
        PrebootLogger.log(verdict, query)

    action = verdict.to_routing_action()
    return action, verdict

33
layer0/preboot_logger.py Normal file
View File

@@ -0,0 +1,33 @@
import datetime
import json
import os
from typing import Optional
from .shadow_classifier import ShadowEvalResult, Classification
class PrebootLogger:
    """Append-only JSON Lines logger for Layer 0 violations."""

    # Destination file (relative path); one JSON object is appended per line.
    LOG_PATH = "anomalies/preboot_shield.jsonl"

    @staticmethod
    def log(event: ShadowEvalResult, query: str, reason_override: Optional[str] = None):
        """
        Append one record describing a violating query to ``LOG_PATH``.

        Args:
            event: Evaluation result; only CATASTROPHIC and FORBIDDEN
                classifications are written, anything else is ignored.
            query: The query text, stored verbatim in the record.
            reason_override: When truthy, recorded instead of ``event.reason``.

        Raises:
            OSError: If the log directory or file cannot be created/written.
        """
        if event.classification not in (Classification.CATASTROPHIC, Classification.FORBIDDEN):
            return  # Only violations get logged

        # datetime.utcnow() is deprecated since Python 3.12. Take an aware UTC
        # "now" and drop tzinfo so the output keeps the naive ISO form + "Z".
        now_utc = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)

        record = {
            "timestamp": now_utc.isoformat() + "Z",
            "query": query,
            "classification": event.classification.value,
            "reason": reason_override or event.reason,
            "trace_id": event.trace_id,
            "metadata": {
                "risk_score": event.risk_score,
                "flags": event.flags,
                "source": "layer0",
            },
        }

        # dirname() is "" when LOG_PATH is a bare filename, and makedirs("")
        # raises FileNotFoundError, so only create a directory when one exists
        # in the path.
        log_dir = os.path.dirname(PrebootLogger.LOG_PATH)
        if log_dir:
            os.makedirs(log_dir, exist_ok=True)

        with open(PrebootLogger.LOG_PATH, "a", encoding="utf-8") as f:
            f.write(json.dumps(record) + "\n")

View File

@@ -0,0 +1,93 @@
from enum import Enum
from typing import Optional, List
import uuid
class Classification(str, Enum):
    """Verdicts a Layer 0 evaluation can assign to a query.

    Members are also ``str`` instances, so they compare equal to their
    lowercase string values.
    """

    # No rule matched; the query proceeds.
    BLESSED = "blessed"

    # Too little context to decide.
    AMBIGUOUS = "ambiguous"

    # Governance violation.
    FORBIDDEN = "forbidden"

    # Most severe verdict.
    CATASTROPHIC = "catastrophic"
class ShadowEvalResult:
    """Result of one Layer 0 evaluation, stamped with a fresh UUID4 trace id."""

    def __init__(
        self,
        classification: Classification,
        reason: Optional[str] = None,
        risk_score: int = 0,
        flags: Optional[List[str]] = None,
    ):
        """
        Store the verdict details and generate ``trace_id``.

        A missing or empty ``flags`` argument is replaced by a new empty list.
        """
        self.classification = classification
        self.reason = reason
        self.risk_score = risk_score
        self.flags = flags if flags else []
        self.trace_id = str(uuid.uuid4())

    def to_routing_action(self) -> str:
        """Translate the classification into the routing action string."""
        # Ordered (verdict, action) pairs; compared with == so the check
        # behaves exactly like a chain of equality tests.
        routes = (
            (Classification.CATASTROPHIC, "FAIL_CLOSED"),
            (Classification.FORBIDDEN, "HANDOFF_TO_GUARDRAILS"),
            (Classification.AMBIGUOUS, "PROMPT_FOR_CLARIFICATION"),
        )
        for verdict, action in routes:
            if self.classification == verdict:
                return action
        # Anything else (including BLESSED) continues to Layer 1.
        return "HANDOFF_TO_LAYER1"
class ShadowClassifier:
    """
    Minimal doctrinal classifier for Layer 0 (Shadow Eval).

    Classification is a case-insensitive substring match against fixed phrase
    lists, checked in order of severity; the first rule that matches wins.
    """

    # Phrase lists are class-level tuples so they are built once rather than
    # on every classify() call.
    _CATASTROPHIC_PHRASES = (
        "disable guardrails",
        "override agent permissions",
        "bypass governance",
        "self-modifying",
    )
    _FORBIDDEN_PHRASES = (
        "skip git",
        "apply directly",
        "dashboard",
        "manual change",
    )
    _AMBIGUOUS_PHRASES = (
        "fix it",
        "change this",
        "update stuff",
    )

    def classify(self, query: str) -> ShadowEvalResult:
        """
        Return a doctrinal classification for the incoming query.

        The query is lowercased and every run of whitespace is collapsed to a
        single space before matching, so phrases cannot be slipped past the
        gate with extra spaces, tabs or newlines between words.

        Args:
            query: Raw query text.

        Returns:
            A ShadowEvalResult: CATASTROPHIC, FORBIDDEN or AMBIGUOUS when the
            matching rule fires (queries of two words or fewer are always
            AMBIGUOUS unless a stricter rule matched), otherwise BLESSED.
        """
        words = query.lower().split()
        # Normalised form: single spaces only, no leading/trailing whitespace.
        q = " ".join(words)

        # 1. Catastrophic (fail closed)
        if any(phrase in q for phrase in self._CATASTROPHIC_PHRASES):
            return ShadowEvalResult(
                classification=Classification.CATASTROPHIC,
                reason="catastrophic_indicator",
                risk_score=5,
                flags=["permission_override", "guardrail_disable"],
            )

        # 2. Forbidden (governance violation)
        if any(phrase in q for phrase in self._FORBIDDEN_PHRASES):
            return ShadowEvalResult(
                classification=Classification.FORBIDDEN,
                reason="governance_violation",
                risk_score=3,
                flags=["gitops_bypass"],
            )

        # 3. Ambiguous (needs clarification)
        if any(phrase in q for phrase in self._AMBIGUOUS_PHRASES) or len(words) <= 2:
            return ShadowEvalResult(
                classification=Classification.AMBIGUOUS,
                reason="insufficient_context",
                risk_score=1,
                flags=["needs_clarification"],
            )

        # 4. Blessed (valid + lawful)
        return ShadowEvalResult(
            classification=Classification.BLESSED,
            reason=None,
            risk_score=0,
        )