chore: pre-migration snapshot
Layer0, MCP servers, Terraform consolidation
layer0/pattern_store.py (new file, 331 lines)
@@ -0,0 +1,331 @@
from __future__ import annotations

import json
import os
import re
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Iterable, Sequence

THIS_FILE = Path(__file__).resolve()
LAYER0_DIR = THIS_FILE.parent
REPO_ROOT = LAYER0_DIR.parent.parent


_RE_URL = re.compile(r"\bhttps?://\S+\b", re.IGNORECASE)
_RE_EMAIL = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
_RE_IPV4 = re.compile(r"\b(?:\d{1,3}\.){3}\d{1,3}\b")
_RE_IPV6 = re.compile(r"\b(?:[0-9a-f]{0,4}:){2,}[0-9a-f]{0,4}\b", re.IGNORECASE)
_RE_UUID = re.compile(
    r"\b[0-9a-f]{8}-[0-9a-f]{4}-[1-5][0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}\b",
    re.IGNORECASE,
)
_RE_HEX_LONG = re.compile(r"\b[0-9a-f]{32,}\b", re.IGNORECASE)
_RE_BASE64ISH = re.compile(r"\b[A-Za-z0-9+/]{28,}={0,2}\b")
_RE_PATHISH = re.compile(r"(?:(?:\.\.?/)|/)[A-Za-z0-9._~/-]{2,}")
_RE_NUMBER = re.compile(r"\b\d+\b")
_RE_TOKEN = re.compile(r"[a-z][a-z_-]{1,31}", re.IGNORECASE)
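# Illustrative matches for the less obvious scrubbers (hypothetical inputs):
#   _RE_HEX_LONG:  "deadbeef" * 4 (a 32-char hex run, e.g. an MD5/SHA-like digest)
#   _RE_BASE64ISH: any run of 28+ base64 characters, e.g. "QUFB" * 10 + "=="
#   _RE_PATHISH:   "/etc/passwd", "../secrets/key"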


SAFE_VOCAB = {
    # Governance / safety verbs
    "disable",
    "override",
    "bypass",
    "skip",
    "ignore",
    "evade",
    "break",
    "force",
    "apply",
    "deploy",
    "destroy",
    "delete",
    "drop",
    "remove",
    "exfiltrate",
    # Critical nouns / domains
    "guardrails",
    "permissions",
    "governance",
    "git",
    "gitops",
    "dashboard",
    "manual",
    "prod",
    "production",
    "staging",
    "terraform",
    "waf",
    "dns",
    "tunnel",
    "access",
    "token",
    "secret",
    "key",
    "credential",
    "admin",
    "root",
    # Phrases often seen in L0 rules (tokenized)
    "self",
    "modifying",
    "directly",
}


def _utc_now_iso_z() -> str:
    return (
        datetime.now(timezone.utc)
        .replace(microsecond=0)
        .isoformat()
        .replace("+00:00", "Z")
    )
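# Example of the shape produced above (illustrative instant): "2025-06-01T12:00:00Z".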


def normalize_query_for_matching(query: str) -> str:
    """
    Produce a low-leakage normalized string suitable for storing and matching.

    Invariants:
    - Never stores raw URLs, IPs, emails, long hex strings, base64ish blobs, UUIDs, or paths.
    - Numbers are stripped to <num>.
    - Only safe vocabulary tokens are preserved; other words are dropped.
    """
    q = (query or "").lower().strip()
    if not q:
        return ""

    # Keep placeholders lowercase to make matching stable across sources.
    q = _RE_URL.sub("<url>", q)
    q = _RE_EMAIL.sub("<email>", q)
    q = _RE_IPV4.sub("<ip>", q)
    q = _RE_IPV6.sub("<ip>", q)
    q = _RE_UUID.sub("<uuid>", q)
    q = _RE_PATHISH.sub("<path>", q)
    q = _RE_HEX_LONG.sub("<hex>", q)
    q = _RE_BASE64ISH.sub("<b64>", q)
    q = _RE_NUMBER.sub("<num>", q)

    # Tokenize; keep placeholders and a tight safe vocabulary.
    tokens: list[str] = []
    for raw in re.split(r"[^a-z0-9_<>/-]+", q):
        t = raw.strip()
        if not t:
            continue
        if t.startswith("<") and t.endswith(">"):
            tokens.append(t)
            continue
        if _RE_TOKEN.fullmatch(t) and t in SAFE_VOCAB:
            tokens.append(t)

    # De-dupe while preserving order.
    seen: set[str] = set()
    out: list[str] = []
    for t in tokens:
        if t in seen:
            continue
        seen.add(t)
        out.append(t)
    return " ".join(out)
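# Worked example of the normalization above (hypothetical input):
#   normalize_query_for_matching("Bypass guardrails via https://x.io on port 8080")
#   -> "bypass guardrails <url> <num>"
# "via", "on", and "port" fall outside SAFE_VOCAB and are dropped; the URL and
# port number survive only as placeholders.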


def normalized_tokens(query: str) -> list[str]:
    s = normalize_query_for_matching(query)
    return s.split() if s else []


@dataclass(frozen=True)
class LearnedPattern:
    pattern_id: str
    tokens_all: tuple[str, ...]
    classification: str
    reason: str | None
    risk_score: int
    flags: tuple[str, ...]
    specificity_score: int
    min_support: int
    last_seen: str | None
    source: dict[str, Any] | None
    mode: str  # "escalate" | "relax"

    def matches(self, normalized_query: str) -> bool:
        if not normalized_query:
            return False
        hay = set(normalized_query.split())
        return all(t in hay for t in self.tokens_all)
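# Matching is subset containment over whitespace-delimited tokens: a pattern with
# tokens_all=("bypass", "guardrails") matches "bypass guardrails <url>" but not
# "bypass <url>" alone (hypothetical values).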


def _default_active_path() -> Path:
    configured = os.environ.get("LAYER0_ACTIVE_PATTERNS_PATH")
    if configured:
        return Path(configured).expanduser().resolve()
    return (REPO_ROOT / ".state" / "layer0_patterns_active.json").resolve()
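# The active snapshot location can be overridden per environment via
# LAYER0_ACTIVE_PATTERNS_PATH (absolute or ~-relative); otherwise it defaults
# to <repo>/.state/layer0_patterns_active.json.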


class PatternStore:
    """
    Read-only active pattern snapshot.

    This is intentionally immutable during request handling; mutations happen in
    offline jobs (learn/replay) that write a new snapshot and log an artifact.
    """

    def __init__(self, active_path: Path | None = None):
        self._active_path = active_path or _default_active_path()
        self._active: list[LearnedPattern] = []
        self._loaded = False

    @property
    def active_path(self) -> Path:
        return self._active_path

    def load(self) -> None:
        if self._loaded:
            return
        self._loaded = True
        self._active = self._load_patterns_file(self._active_path)

    def patterns(self) -> list[LearnedPattern]:
        self.load()
        return list(self._active)

    def match_ordered(self, normalized_query: str) -> list[LearnedPattern]:
        self.load()
        matched = [p for p in self._active if p.matches(normalized_query)]
        severity_rank = {
            "blessed": 0,
            "ambiguous": 1,
            "forbidden": 2,
            "catastrophic": 3,
        }
        matched.sort(
            key=lambda p: (
                severity_rank.get(p.classification, 0),
                p.specificity_score,
                p.min_support,
                p.last_seen or "",
            ),
            reverse=True,
        )
        return matched
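    # Most severe first: a "catastrophic" match sorts ahead of a "forbidden" one
    # regardless of specificity; ties fall back to specificity, then support,
    # then recency of last_seen (ISO-8601 Z strings compare chronologically).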

    @staticmethod
    def _load_patterns_file(path: Path) -> list[LearnedPattern]:
        if not path.exists():
            return []
        data = json.loads(path.read_text(encoding="utf-8"))
        items = data.get("patterns") if isinstance(data, dict) else data
        if not isinstance(items, list):
            return []

        patterns: list[LearnedPattern] = []
        for item in items:
            if not isinstance(item, dict):
                continue
            tokens = item.get("tokens_all") or item.get("tokens") or []
            if not isinstance(tokens, list) or not tokens:
                continue
            tokens_norm = tuple(
                t.lower()
                for t in tokens
                if isinstance(t, str)
                and t
                and (t.startswith("<") or t.lower() in SAFE_VOCAB)
            )
            if not tokens_norm:
                continue
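            # Unknown tokens are filtered rather than rejected above, so an item
            # whose tokens all fall outside SAFE_VOCAB (and are not placeholders)
            # is skipped here because tokens_norm ends up empty.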

            classification = item.get("classification")
            if classification not in {
                "blessed",
                "ambiguous",
                "forbidden",
                "catastrophic",
            }:
                continue

            flags = item.get("flags") or []
            if not isinstance(flags, list):
                flags = []

            mode = item.get("mode") or "escalate"
            if mode not in {"escalate", "relax"}:
                mode = "escalate"

            min_support = int(item.get("min_support") or item.get("support") or 0)
            specificity = int(item.get("specificity_score") or len(tokens_norm))
            risk_score = int(item.get("risk_score") or 0)

            patterns.append(
                LearnedPattern(
                    pattern_id=str(item.get("pattern_id") or item.get("id") or ""),
                    tokens_all=tokens_norm,
                    classification=classification,
                    reason=item.get("reason"),
                    risk_score=risk_score,
                    flags=tuple(str(f) for f in flags if isinstance(f, str)),
                    specificity_score=specificity,
                    min_support=min_support,
                    last_seen=item.get("last_seen"),
                    source=item.get("source")
                    if isinstance(item.get("source"), dict)
                    else None,
                    mode=mode,
                )
            )

        severity_rank = {
            "blessed": 0,
            "ambiguous": 1,
            "forbidden": 2,
            "catastrophic": 3,
        }
        patterns.sort(
            key=lambda p: (
                severity_rank.get(p.classification, 0),
                p.specificity_score,
                p.min_support,
                p.last_seen or "",
            ),
            reverse=True,
        )
        return patterns


def pattern_dict(
    *,
    tokens_all: Sequence[str],
    classification: str,
    reason: str | None,
    risk_score: int,
    flags: Sequence[str],
    min_support: int,
    last_seen: str | None = None,
    source: dict[str, Any] | None = None,
    mode: str = "escalate",
    pattern_id: str | None = None,
) -> dict[str, Any]:
    tokens = [t for t in tokens_all if isinstance(t, str) and t]
    return {
        "pattern_id": pattern_id or "",
        "tokens_all": tokens,
        "classification": classification,
        "reason": reason,
        "risk_score": int(risk_score),
        "flags": list(flags),
        "specificity_score": int(len(tokens)),
        "min_support": int(min_support),
        "last_seen": last_seen or _utc_now_iso_z(),
        "source": source or {},
        "mode": mode,
    }
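# Illustrative round-trip: a dict built above passes _load_patterns_file
# validation, e.g. pattern_dict(tokens_all=["bypass", "guardrails"],
# classification="forbidden", reason=None, risk_score=80, flags=[], min_support=3)
# yields specificity_score=2, mode="escalate", and a fresh last_seen timestamp.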


def write_pattern_snapshot(path: Path, patterns: Iterable[dict[str, Any]]) -> None:
    path.parent.mkdir(parents=True, exist_ok=True)
    payload = {"generated_at": _utc_now_iso_z(), "patterns": list(patterns)}
    path.write_text(
        json.dumps(payload, ensure_ascii=False, sort_keys=True, indent=2) + "\n",
        encoding="utf-8",
    )
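

# Minimal end-to-end sketch (assumption: the demo values and temp-file path
# below are illustrative, not part of the module contract). Writes a
# one-pattern snapshot, reloads it through PatternStore, and matches a
# normalized query against it.
if __name__ == "__main__":
    import tempfile

    demo = pattern_dict(
        tokens_all=["bypass", "guardrails"],
        classification="forbidden",
        reason="demo only",
        risk_score=80,
        flags=["demo"],
        min_support=3,
    )
    snap = Path(tempfile.gettempdir()) / "layer0_patterns_demo.json"
    write_pattern_snapshot(snap, [demo])

    store = PatternStore(active_path=snap)
    q = normalize_query_for_matching("please bypass the guardrails at https://example.com")
    print(q)  # bypass guardrails <url>
    print([p.classification for p in store.match_ordered(q)])  # ['forbidden']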