chore: pre-migration snapshot
Layer0, MCP servers, Terraform consolidation
layer0/pattern_store.py (new file, 331 lines)
@@ -0,0 +1,331 @@
from __future__ import annotations

import json
import os
import re
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Iterable, Sequence

THIS_FILE = Path(__file__).resolve()
LAYER0_DIR = THIS_FILE.parent
REPO_ROOT = LAYER0_DIR.parent.parent


_RE_URL = re.compile(r"\bhttps?://\S+\b", re.IGNORECASE)
_RE_EMAIL = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b", re.IGNORECASE)
_RE_IPV4 = re.compile(r"\b(?:\d{1,3}\.){3}\d{1,3}\b")
_RE_IPV6 = re.compile(r"\b(?:[0-9a-f]{0,4}:){2,}[0-9a-f]{0,4}\b", re.IGNORECASE)
_RE_UUID = re.compile(
    r"\b[0-9a-f]{8}-[0-9a-f]{4}-[1-5][0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}\b",
    re.IGNORECASE,
)
_RE_HEX_LONG = re.compile(r"\b[0-9a-f]{32,}\b", re.IGNORECASE)
_RE_BASE64ISH = re.compile(r"\b[A-Za-z0-9+/]{28,}={0,2}\b")
_RE_PATHISH = re.compile(r"(?:(?:\.\.?/)|/)[A-Za-z0-9._~/-]{2,}")
_RE_NUMBER = re.compile(r"\b\d+\b")
_RE_TOKEN = re.compile(r"[a-z][a-z_-]{1,31}", re.IGNORECASE)
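# Illustrative matches for the less obvious scrubbers (hypothetical inputs):
#   _RE_HEX_LONG:  "deadbeef" * 4 (a 32-char hex run, e.g. an MD5/SHA-like digest)
#   _RE_BASE64ISH: any run of 28+ base64 characters, e.g. "QUFB" * 10 + "=="
#   _RE_PATHISH:   "/etc/passwd", "../secrets/key"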


SAFE_VOCAB = {
    # Governance / safety verbs
    "disable",
    "override",
    "bypass",
    "skip",
    "ignore",
    "evade",
    "break",
    "force",
    "apply",
    "deploy",
    "destroy",
    "delete",
    "drop",
    "remove",
    "exfiltrate",
    # Critical nouns / domains
    "guardrails",
    "permissions",
    "governance",
    "git",
    "gitops",
    "dashboard",
    "manual",
    "prod",
    "production",
    "staging",
    "terraform",
    "waf",
    "dns",
    "tunnel",
    "access",
    "token",
    "secret",
    "key",
    "credential",
    "admin",
    "root",
    # Phrases often seen in L0 rules (tokenized)
    "self",
    "modifying",
    "directly",
}


def _utc_now_iso_z() -> str:
    return (
        datetime.now(timezone.utc)
        .replace(microsecond=0)
        .isoformat()
        .replace("+00:00", "Z")
    )
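# Example of the shape produced above (illustrative instant): "2025-06-01T12:00:00Z".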


def normalize_query_for_matching(query: str) -> str:
    """
    Produce a low-leakage normalized string suitable for storing and matching.

    Invariants:
    - Never stores raw URLs, IPs, emails, long hex strings, base64ish blobs, UUIDs, or paths.
    - Numbers are stripped to <num>.
    - Only safe vocabulary tokens are preserved; other words are dropped.
    """
    q = (query or "").lower().strip()
    if not q:
        return ""

    # Keep placeholders lowercase to make matching stable across sources.
    q = _RE_URL.sub("<url>", q)
    q = _RE_EMAIL.sub("<email>", q)
    q = _RE_IPV4.sub("<ip>", q)
    q = _RE_IPV6.sub("<ip>", q)
    q = _RE_UUID.sub("<uuid>", q)
    q = _RE_PATHISH.sub("<path>", q)
    q = _RE_HEX_LONG.sub("<hex>", q)
    q = _RE_BASE64ISH.sub("<b64>", q)
    q = _RE_NUMBER.sub("<num>", q)

    # Tokenize; keep placeholders and a tight safe vocabulary.
    tokens: list[str] = []
    for raw in re.split(r"[^a-z0-9_<>/-]+", q):
        t = raw.strip()
        if not t:
            continue
        if t.startswith("<") and t.endswith(">"):
            tokens.append(t)
            continue
        if _RE_TOKEN.fullmatch(t) and t in SAFE_VOCAB:
            tokens.append(t)

    # De-dupe while preserving order.
    seen: set[str] = set()
    out: list[str] = []
    for t in tokens:
        if t in seen:
            continue
        seen.add(t)
        out.append(t)
    return " ".join(out)
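# Worked example of the normalization above (hypothetical input):
#   normalize_query_for_matching("Bypass guardrails via https://x.io on port 8080")
#   -> "bypass guardrails <url> <num>"
# "via", "on", and "port" fall outside SAFE_VOCAB and are dropped; the URL and
# port number survive only as placeholders.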


def normalized_tokens(query: str) -> list[str]:
    s = normalize_query_for_matching(query)
    return s.split() if s else []


@dataclass(frozen=True)
class LearnedPattern:
    pattern_id: str
    tokens_all: tuple[str, ...]
    classification: str
    reason: str | None
    risk_score: int
    flags: tuple[str, ...]
    specificity_score: int
    min_support: int
    last_seen: str | None
    source: dict[str, Any] | None
    mode: str  # "escalate" | "relax"

    def matches(self, normalized_query: str) -> bool:
        if not normalized_query:
            return False
        hay = set(normalized_query.split())
        return all(t in hay for t in self.tokens_all)
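# Matching is subset containment over whitespace-delimited tokens: a pattern with
# tokens_all=("bypass", "guardrails") matches "bypass guardrails <url>" but not
# "bypass <url>" alone (hypothetical values).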


def _default_active_path() -> Path:
    configured = os.environ.get("LAYER0_ACTIVE_PATTERNS_PATH")
    if configured:
        return Path(configured).expanduser().resolve()
    return (REPO_ROOT / ".state" / "layer0_patterns_active.json").resolve()
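# The active snapshot location can be overridden per environment via
# LAYER0_ACTIVE_PATTERNS_PATH (absolute or ~-relative); otherwise it defaults
# to <repo>/.state/layer0_patterns_active.json.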


class PatternStore:
    """
    Read-only active pattern snapshot.

    This is intentionally immutable during request handling; mutations happen in
    offline jobs (learn/replay) that write a new snapshot and log an artifact.
    """

    def __init__(self, active_path: Path | None = None):
        self._active_path = active_path or _default_active_path()
        self._active: list[LearnedPattern] = []
        self._loaded = False

    @property
    def active_path(self) -> Path:
        return self._active_path

    def load(self) -> None:
        if self._loaded:
            return
        self._loaded = True
        self._active = self._load_patterns_file(self._active_path)

    def patterns(self) -> list[LearnedPattern]:
        self.load()
        return list(self._active)

    def match_ordered(self, normalized_query: str) -> list[LearnedPattern]:
        self.load()
        matched = [p for p in self._active if p.matches(normalized_query)]
        severity_rank = {
            "blessed": 0,
            "ambiguous": 1,
            "forbidden": 2,
            "catastrophic": 3,
        }
        matched.sort(
            key=lambda p: (
                severity_rank.get(p.classification, 0),
                p.specificity_score,
                p.min_support,
                p.last_seen or "",
            ),
            reverse=True,
        )
        return matched
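    # Most severe first: a "catastrophic" match sorts ahead of a "forbidden" one
    # regardless of specificity; ties fall back to specificity, then support,
    # then recency of last_seen (ISO-8601 Z strings compare chronologically).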

    @staticmethod
    def _load_patterns_file(path: Path) -> list[LearnedPattern]:
        if not path.exists():
            return []
        data = json.loads(path.read_text(encoding="utf-8"))
        items = data.get("patterns") if isinstance(data, dict) else data
        if not isinstance(items, list):
            return []

        patterns: list[LearnedPattern] = []
        for item in items:
            if not isinstance(item, dict):
                continue
            tokens = item.get("tokens_all") or item.get("tokens") or []
            if not isinstance(tokens, list) or not tokens:
                continue
            tokens_norm = tuple(
                t.lower()
                for t in tokens
                if isinstance(t, str)
                and t
                and (t.startswith("<") or t.lower() in SAFE_VOCAB)
            )
            if not tokens_norm:
                continue
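            # Unknown tokens are filtered rather than rejected above, so an item
            # whose tokens all fall outside SAFE_VOCAB (and are not placeholders)
            # is skipped here because tokens_norm ends up empty.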

            classification = item.get("classification")
            if classification not in {
                "blessed",
                "ambiguous",
                "forbidden",
                "catastrophic",
            }:
                continue

            flags = item.get("flags") or []
            if not isinstance(flags, list):
                flags = []

            mode = item.get("mode") or "escalate"
            if mode not in {"escalate", "relax"}:
                mode = "escalate"

            min_support = int(item.get("min_support") or item.get("support") or 0)
            specificity = int(item.get("specificity_score") or len(tokens_norm))
            risk_score = int(item.get("risk_score") or 0)

            patterns.append(
                LearnedPattern(
                    pattern_id=str(item.get("pattern_id") or item.get("id") or ""),
                    tokens_all=tokens_norm,
                    classification=classification,
                    reason=item.get("reason"),
                    risk_score=risk_score,
                    flags=tuple(str(f) for f in flags if isinstance(f, str)),
                    specificity_score=specificity,
                    min_support=min_support,
                    last_seen=item.get("last_seen"),
                    source=item.get("source")
                    if isinstance(item.get("source"), dict)
                    else None,
                    mode=mode,
                )
            )

        severity_rank = {
            "blessed": 0,
            "ambiguous": 1,
            "forbidden": 2,
            "catastrophic": 3,
        }
        patterns.sort(
            key=lambda p: (
                severity_rank.get(p.classification, 0),
                p.specificity_score,
                p.min_support,
                p.last_seen or "",
            ),
            reverse=True,
        )
        return patterns


def pattern_dict(
    *,
    tokens_all: Sequence[str],
    classification: str,
    reason: str | None,
    risk_score: int,
    flags: Sequence[str],
    min_support: int,
    last_seen: str | None = None,
    source: dict[str, Any] | None = None,
    mode: str = "escalate",
    pattern_id: str | None = None,
) -> dict[str, Any]:
    tokens = [t for t in tokens_all if isinstance(t, str) and t]
    return {
        "pattern_id": pattern_id or "",
        "tokens_all": tokens,
        "classification": classification,
        "reason": reason,
        "risk_score": int(risk_score),
        "flags": list(flags),
        "specificity_score": int(len(tokens)),
        "min_support": int(min_support),
        "last_seen": last_seen or _utc_now_iso_z(),
        "source": source or {},
        "mode": mode,
    }
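# Illustrative round-trip: a dict built above passes _load_patterns_file
# validation, e.g. pattern_dict(tokens_all=["bypass", "guardrails"],
# classification="forbidden", reason=None, risk_score=80, flags=[], min_support=3)
# yields specificity_score=2, mode="escalate", and a fresh last_seen timestamp.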


def write_pattern_snapshot(path: Path, patterns: Iterable[dict[str, Any]]) -> None:
    path.parent.mkdir(parents=True, exist_ok=True)
    payload = {"generated_at": _utc_now_iso_z(), "patterns": list(patterns)}
    path.write_text(
        json.dumps(payload, ensure_ascii=False, sort_keys=True, indent=2) + "\n",
        encoding="utf-8",
    )
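

# Minimal end-to-end sketch (assumption: the demo values and temp-file path
# below are illustrative, not part of the module contract). Writes a
# one-pattern snapshot, reloads it through PatternStore, and matches a
# normalized query against it.
if __name__ == "__main__":
    import tempfile

    demo = pattern_dict(
        tokens_all=["bypass", "guardrails"],
        classification="forbidden",
        reason="demo only",
        risk_score=80,
        flags=["demo"],
        min_support=3,
    )
    snap = Path(tempfile.gettempdir()) / "layer0_patterns_demo.json"
    write_pattern_snapshot(snap, [demo])

    store = PatternStore(active_path=snap)
    q = normalize_query_for_matching("please bypass the guardrails at https://example.com")
    print(q)  # bypass guardrails <url>
    print([p.classification for p in store.match_ordered(q)])  # ['forbidden']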