94 lines
2.7 KiB
Python
94 lines
2.7 KiB
Python
from enum import Enum
|
|
from typing import Optional, List
|
|
import uuid
|
|
|
|
|
|
class Classification(str, Enum):
    """Doctrinal verdict attached to an incoming query.

    Because ``str`` is mixed in, each member is also a plain string and
    compares equal to its lowercase value, e.g.
    ``Classification.BLESSED == "blessed"``.
    """

    # Valid and lawful request.
    BLESSED = "blessed"

    # Not enough context to act on; needs clarification.
    AMBIGUOUS = "ambiguous"

    # Governance violation.
    FORBIDDEN = "forbidden"

    # Fail-closed condition.
    CATASTROPHIC = "catastrophic"
|
|
|
|
|
|
class ShadowEvalResult:
    """Outcome of one Shadow Eval classification.

    Attributes:
        classification: The ``Classification`` verdict for the query.
        reason: Optional short machine-readable reason code.
        risk_score: Integer risk score supplied by the caller (default 0).
        flags: List of flag strings; always a list, never ``None``.
        trace_id: Random UUID4 string generated when the result is created.
    """

    def __init__(
        self,
        classification: Classification,
        reason: Optional[str] = None,
        risk_score: int = 0,
        flags: Optional[List[str]] = None,
    ):
        """Store the verdict and generate a fresh ``trace_id``.

        Args:
            classification: Verdict for the query.
            reason: Optional reason code.
            risk_score: Risk score to record.
            flags: Optional flag strings; ``None`` or an empty list is
                stored as a new empty list.
        """
        self.classification = classification
        self.reason = reason
        self.risk_score = risk_score
        # Copy the list so later mutation of the caller's list (or of
        # self.flags) cannot leak between the two.
        self.flags = list(flags) if flags else []
        self.trace_id = str(uuid.uuid4())

    def __repr__(self) -> str:
        return (
            f"{type(self).__name__}("
            f"classification={self.classification!r}, "
            f"reason={self.reason!r}, "
            f"risk_score={self.risk_score!r}, "
            f"flags={self.flags!r}, "
            f"trace_id={self.trace_id!r})"
        )

    def to_routing_action(self) -> str:
        """Map the classification to the routing action string.

        Returns:
            ``"FAIL_CLOSED"`` for CATASTROPHIC,
            ``"HANDOFF_TO_GUARDRAILS"`` for FORBIDDEN,
            ``"PROMPT_FOR_CLARIFICATION"`` for AMBIGUOUS,
            ``"HANDOFF_TO_LAYER1"`` for BLESSED, and ``"FAIL_CLOSED"`` for
            any value that is not a recognised classification.
        """
        if self.classification == Classification.CATASTROPHIC:
            return "FAIL_CLOSED"
        if self.classification == Classification.FORBIDDEN:
            return "HANDOFF_TO_GUARDRAILS"
        if self.classification == Classification.AMBIGUOUS:
            return "PROMPT_FOR_CLARIFICATION"
        if self.classification == Classification.BLESSED:
            return "HANDOFF_TO_LAYER1"
        # Unrecognised value (None, a typo, a member added later without a
        # route): refuse rather than forward it as if it were blessed.
        return "FAIL_CLOSED"
|
|
|
|
|
|
class ShadowClassifier:
    """
    Minimal doctrinal classifier for Layer 0 (Shadow Eval).

    Classification is plain case-insensitive substring matching against
    fixed phrase lists, checked in order of severity: catastrophic,
    forbidden, ambiguous, and finally blessed when nothing matched.
    """

    # Phrases whose presence anywhere in the query triggers each verdict.
    _CATASTROPHIC_PHRASES = (
        "disable guardrails",
        "override agent permissions",
        "bypass governance",
        "self-modifying",
    )
    _FORBIDDEN_PHRASES = (
        "skip git",
        "apply directly",
        "dashboard",
        "manual change",
    )
    _AMBIGUOUS_PHRASES = (
        "fix it",
        "change this",
        "update stuff",
    )

    def classify(self, query: str) -> ShadowEvalResult:
        """Return a doctrinal classification for the incoming query.

        Args:
            query: Raw query text. ``None`` is treated like an empty
                string.

        Returns:
            A ``ShadowEvalResult`` whose classification is the first of
            CATASTROPHIC, FORBIDDEN, AMBIGUOUS whose rule matches, or
            BLESSED when none does. Queries of two words or fewer
            (including empty ones) are AMBIGUOUS unless a more severe
            rule matched first.
        """
        # Lowercase and collapse every whitespace run (spaces, tabs,
        # newlines) to one space, so "disable   guardrails" or a phrase
        # split across lines still matches the phrase lists.
        words = (query or "").lower().split()
        q = " ".join(words)

        # 1. Catastrophic (fail closed)
        if any(phrase in q for phrase in self._CATASTROPHIC_PHRASES):
            return ShadowEvalResult(
                classification=Classification.CATASTROPHIC,
                reason="catastrophic_indicator",
                risk_score=5,
                # Both flags are attached whichever phrase matched.
                flags=["permission_override", "guardrail_disable"],
            )

        # 2. Forbidden (governance violation)
        if any(phrase in q for phrase in self._FORBIDDEN_PHRASES):
            return ShadowEvalResult(
                classification=Classification.FORBIDDEN,
                reason="governance_violation",
                risk_score=3,
                flags=["gitops_bypass"],
            )

        # 3. Ambiguous (needs clarification): a vague phrase, or too few
        #    words to carry enough context.
        if any(phrase in q for phrase in self._AMBIGUOUS_PHRASES) or len(words) <= 2:
            return ShadowEvalResult(
                classification=Classification.AMBIGUOUS,
                reason="insufficient_context",
                risk_score=1,
                flags=["needs_clarification"],
            )

        # 4. Blessed (valid + lawful)
        return ShadowEvalResult(
            classification=Classification.BLESSED,
            reason=None,
            risk_score=0,
        )
|