feat: enforce layer0 gate and add tests
This commit is contained in:
93
layer0/shadow_classifier.py
Normal file
93
layer0/shadow_classifier.py
Normal file
@@ -0,0 +1,93 @@
|
||||
from enum import Enum
|
||||
from typing import Optional, List
|
||||
import uuid
|
||||
|
||||
|
||||
class Classification(str, Enum):
    """Doctrinal verdict assigned to a query by Layer 0 (Shadow Eval).

    Members mix in ``str``, so each one compares equal to its lowercase
    string value (for example ``Classification.BLESSED == "blessed"``).
    """

    BLESSED = "blessed"  # valid + lawful
    AMBIGUOUS = "ambiguous"  # needs clarification
    FORBIDDEN = "forbidden"  # governance violation
    CATASTROPHIC = "catastrophic"  # fail closed
|
||||
|
||||
|
||||
class ShadowEvalResult:
    """Outcome of a Layer 0 (Shadow Eval) classification.

    Attributes:
        classification: The verdict for the query.
        reason: Optional short reason code; ``None`` when not supplied.
        risk_score: Integer score supplied by the creator (default ``0``).
        flags: List of flag strings; always a list, never ``None``.
        trace_id: Random UUID4 string generated when the result is created.
    """

    def __init__(
        self,
        classification: Classification,
        reason: Optional[str] = None,
        risk_score: int = 0,
        flags: Optional[List[str]] = None,
    ):
        self.classification = classification
        self.reason = reason
        self.risk_score = risk_score
        # Copy the flags so the result never shares a list object with the
        # caller; a later mutation on either side must not leak across.
        self.flags = list(flags) if flags else []
        self.trace_id = str(uuid.uuid4())

    def __repr__(self) -> str:
        return (
            f"{type(self).__name__}("
            f"classification={self.classification!r}, "
            f"reason={self.reason!r}, "
            f"risk_score={self.risk_score!r}, "
            f"flags={self.flags!r}, "
            f"trace_id={self.trace_id!r})"
        )

    def to_routing_action(self) -> str:
        """Map the classification to the routing action for the next stage.

        Returns:
            ``"FAIL_CLOSED"`` for CATASTROPHIC,
            ``"HANDOFF_TO_GUARDRAILS"`` for FORBIDDEN,
            ``"PROMPT_FOR_CLARIFICATION"`` for AMBIGUOUS,
            ``"HANDOFF_TO_LAYER1"`` for BLESSED, and
            ``"FAIL_CLOSED"`` for any value that is none of the above.
        """
        # ``==`` is used (not a dict lookup) so a raw string such as
        # "blessed" keeps comparing equal to the str-mixin enum member.
        if self.classification == Classification.CATASTROPHIC:
            return "FAIL_CLOSED"
        if self.classification == Classification.FORBIDDEN:
            return "HANDOFF_TO_GUARDRAILS"
        if self.classification == Classification.AMBIGUOUS:
            return "PROMPT_FOR_CLARIFICATION"
        if self.classification == Classification.BLESSED:
            return "HANDOFF_TO_LAYER1"
        # An unrecognized classification must never be treated as blessed:
        # the gate fails closed instead of handing the query onward.
        return "FAIL_CLOSED"
|
||||
|
||||
|
||||
class ShadowClassifier:
    """
    Minimal doctrinal classifier for Layer 0 (Shadow Eval).

    Classification is keyword based: the query is lowercased, its whitespace
    is normalized, and it is checked against fixed phrase lists in order of
    severity (catastrophic, forbidden, ambiguous). The first tier that
    matches wins; a query that matches no tier is blessed.
    """

    # Phrases are matched as plain substrings of the normalized query.
    _CATASTROPHIC_PHRASES = (
        "disable guardrails",
        "override agent permissions",
        "bypass governance",
        "self-modifying",
    )
    _FORBIDDEN_PHRASES = (
        "skip git",
        "apply directly",
        "dashboard",
        "manual change",
    )
    _AMBIGUOUS_PHRASES = (
        "fix it",
        "change this",
        "update stuff",
    )

    @staticmethod
    def _mentions_any(text, phrases) -> bool:
        """Return True when *text* contains at least one of *phrases*."""
        return any(phrase in text for phrase in phrases)

    def classify(self, query: str) -> ShadowEvalResult:
        """Return a doctrinal classification for the incoming query.

        Args:
            query: Raw query text.

        Returns:
            A ``ShadowEvalResult`` whose classification is, in priority
            order: CATASTROPHIC (risk 5) when a catastrophic phrase occurs,
            FORBIDDEN (risk 3) when a forbidden phrase occurs, AMBIGUOUS
            (risk 1) when an ambiguous phrase occurs or the query has two
            words or fewer (including an empty query), else BLESSED (risk 0).
        """
        # Lowercase, then collapse every whitespace run (spaces, tabs, line
        # breaks) into a single space. Without this, "disable   guardrails"
        # or a newline between the two words would slip past the
        # single-space phrases below and bypass the gate.
        words = query.lower().split()
        q = " ".join(words)

        # 1. Catastrophic (fail closed)
        if self._mentions_any(q, self._CATASTROPHIC_PHRASES):
            return ShadowEvalResult(
                classification=Classification.CATASTROPHIC,
                reason="catastrophic_indicator",
                risk_score=5,
                flags=["permission_override", "guardrail_disable"],
            )

        # 2. Forbidden (governance violation)
        if self._mentions_any(q, self._FORBIDDEN_PHRASES):
            return ShadowEvalResult(
                classification=Classification.FORBIDDEN,
                reason="governance_violation",
                risk_score=3,
                flags=["gitops_bypass"],
            )

        # 3. Ambiguous (needs clarification)
        if self._mentions_any(q, self._AMBIGUOUS_PHRASES) or len(words) <= 2:
            return ShadowEvalResult(
                classification=Classification.AMBIGUOUS,
                reason="insufficient_context",
                risk_score=1,
                flags=["needs_clarification"],
            )

        # 4. Blessed (valid + lawful)
        return ShadowEvalResult(
            classification=Classification.BLESSED,
            reason=None,
            risk_score=0,
        )
|
||||
Reference in New Issue
Block a user