feat: enforce layer0 gate and add tests

This commit is contained in:
Vault Sovereign
2025-12-17 00:02:39 +00:00
parent 37a867c485
commit 7f2e60e1c5
21 changed files with 2066 additions and 16 deletions

View File

@@ -0,0 +1,93 @@
from enum import Enum
from typing import Optional, List
import uuid
class Classification(str, Enum):
    """Verdict labels produced by the Layer 0 classifier.

    The ``str`` mix-in means every member is also a plain string equal to
    its value, so ``Classification.BLESSED == "blessed"`` holds and
    ``Classification("blessed")`` looks the member up by value.
    """

    # Nothing objectionable was detected in the query.
    BLESSED = "blessed"

    # The query does not carry enough context to act on.
    AMBIGUOUS = "ambiguous"

    # The query matches a governance-violation indicator.
    FORBIDDEN = "forbidden"

    # The query matches a catastrophic indicator.
    CATASTROPHIC = "catastrophic"
class ShadowEvalResult:
    """Result of evaluating one query: verdict plus supporting metadata.

    Attributes:
        classification: The verdict assigned to the query.
        reason: Optional short machine-readable reason code.
        risk_score: Integer risk score; defaults to 0.
        flags: List of flag strings; never ``None`` after construction.
        trace_id: String form of a random UUID4 generated per instance.
    """

    def __init__(
        self,
        classification: Classification,
        reason: Optional[str] = None,
        risk_score: int = 0,
        flags: Optional[List[str]] = None,
    ):
        """Store the verdict fields and mint a fresh ``trace_id``."""
        self.classification = classification
        self.reason = reason
        self.risk_score = risk_score
        # A missing (or empty) ``flags`` argument is replaced by a new list,
        # so instances never share a default container.
        self.flags = flags if flags else []
        self.trace_id = str(uuid.uuid4())

    def to_routing_action(self) -> str:
        """Translate the stored classification into a routing action name.

        Returns:
            ``"FAIL_CLOSED"`` for catastrophic, ``"HANDOFF_TO_GUARDRAILS"``
            for forbidden, ``"PROMPT_FOR_CLARIFICATION"`` for ambiguous, and
            ``"HANDOFF_TO_LAYER1"`` for anything else.
        """
        routes = (
            (Classification.CATASTROPHIC, "FAIL_CLOSED"),
            (Classification.FORBIDDEN, "HANDOFF_TO_GUARDRAILS"),
            (Classification.AMBIGUOUS, "PROMPT_FOR_CLARIFICATION"),
        )
        for verdict, action in routes:
            if self.classification == verdict:
                return action
        # Every classification not listed above proceeds to Layer 1.
        return "HANDOFF_TO_LAYER1"
class ShadowClassifier:
    """
    Minimal doctrinal classifier for Layer 0 (Shadow Eval).

    Classification is substring matching against a normalized copy of the
    query. Tiers are checked from most to least severe and the first tier
    that matches decides the result.
    """

    # 1. Catastrophic indicators (fail closed).
    _CATASTROPHIC_PHRASES = (
        "disable guardrails",
        "override agent permissions",
        "bypass governance",
        "self-modifying",
    )

    # 2. Forbidden indicators (governance violation).
    _FORBIDDEN_PHRASES = (
        "skip git",
        "apply directly",
        "dashboard",
        "manual change",
    )

    # 3. Ambiguous indicators (needs clarification).
    _AMBIGUOUS_PHRASES = (
        "fix it",
        "change this",
        "update stuff",
    )

    # Queries with at most this many words are treated as lacking context.
    _MAX_UNDERSPECIFIED_WORDS = 2

    def classify(self, query: str) -> ShadowEvalResult:
        """Return a doctrinal classification for the incoming query.

        The query is lowercased and every run of whitespace (spaces, tabs,
        newlines, other Unicode whitespace) is collapsed to a single space
        before matching, so a multi-word indicator cannot be evaded by
        padding or line-breaking it (e.g. ``"disable   guardrails"``).

        Args:
            query: Raw query text.

        Returns:
            A ``ShadowEvalResult`` whose classification is CATASTROPHIC,
            FORBIDDEN, AMBIGUOUS or BLESSED, with the matching reason,
            risk score and flags.
        """
        words = query.lower().split()
        # Re-joining on single spaces normalizes internal whitespace; the
        # word count used below is unaffected by this normalization.
        q = " ".join(words)

        # 1. Catastrophic (fail closed)
        if any(phrase in q for phrase in self._CATASTROPHIC_PHRASES):
            return ShadowEvalResult(
                classification=Classification.CATASTROPHIC,
                reason="catastrophic_indicator",
                risk_score=5,
                flags=["permission_override", "guardrail_disable"],
            )

        # 2. Forbidden (governance violation)
        if any(phrase in q for phrase in self._FORBIDDEN_PHRASES):
            return ShadowEvalResult(
                classification=Classification.FORBIDDEN,
                reason="governance_violation",
                risk_score=3,
                flags=["gitops_bypass"],
            )

        # 3. Ambiguous (needs clarification): a vague phrase, or a query of
        # two words or fewer (which includes the empty query).
        if (
            any(phrase in q for phrase in self._AMBIGUOUS_PHRASES)
            or len(words) <= self._MAX_UNDERSPECIFIED_WORDS
        ):
            return ShadowEvalResult(
                classification=Classification.AMBIGUOUS,
                reason="insufficient_context",
                risk_score=1,
                flags=["needs_clarification"],
            )

        # 4. Blessed (valid + lawful)
        return ShadowEvalResult(
            classification=Classification.BLESSED,
            reason=None,
            risk_score=0,
        )