Co-Authored-By: Claude <noreply@anthropic.com> Co-Authored-By: XNLLLLH <XNLLLLH@users.noreply.github.com>
308 lines
10 KiB
Python
308 lines
10 KiB
Python
"""OPS-07 prompt-injection shield (D-30, D-31) -- Plan 02-05.
|
||
|
||
Three-tier deployment per D-31:
  HARD_BLOCK       -> L0 identity + S5 invariant writes (reject on detection)
  FLAG_FOR_REVIEW  -> profile updates (flag + warn, write proceeds)
  LOG_ONLY         -> content records (log only, allow)

D-30 threat model (three severities):
  - Direct override (e.g. "forget X, now Y") -> HARD BLOCK via signal words
  - Gradual drift (subtle lies over weeks)   -> DETECT via trajectory M4 anomaly
                                                (see s5.detect_drift_anomaly)
  - Data poisoning (intentional false write) -> MITIGATE via ART vigilance
                                                + user-approval UX

Global-product mandate: signal words cover 7+ languages
(en + ru + ja + ar + de + fr + es + zh) at minimum. The module exports
`SHIELD_LANGUAGES_SUPPORTED` as the authoritative set; downstream acceptance
tests grep against it.

The shield is a PURE LOCAL filter: no LLM call, no network. Detection uses
case-insensitive substring matching against curated signal-word lists. The
tier policy is additive: warning signals escalate to critical in the
HARD_BLOCK tier (L0 is sacred).
|
||
"""
|
||
from __future__ import annotations
|
||
|
||
from dataclasses import dataclass, field
|
||
from enum import Enum
|
||
from typing import Any
|
||
from uuid import UUID
|
||
|
||
from iai_mcp.events import write_event
|
||
|
||
|
||
# ------------------------------------------------------------ constitutional constants

# Confidence values attached to a shield verdict. They are fixed per match
# class (critical vs. warning-only) and exist for downstream analytics; the
# action taken is decided by the tier and by whether anything matched, never
# by these numbers.
SHIELD_SIGNAL_WORDS_MAX_CONFIDENCE: float = 0.9  # at least one critical word matched
SHIELD_FLAG_CONFIDENCE: float = 0.6  # only warning words matched

# Global-product mandate: 7+ languages supported.
SHIELD_LANGUAGES_SUPPORTED: frozenset[str] = frozenset(
    ("en", "ru", "ja", "ar", "de", "fr", "es", "zh")
)

# Gradual-drift detection threshold. It is consumed by
# s5.detect_drift_anomaly but lives here so that every shield threshold sits
# in one authoritative file (downstream greps one file).
DRIFT_M4_ANOMALY_SIGMA: float = 3.0
|
||
|
||
|
||
# ------------------------------------------------------------ signal-word catalogues

# English critical signals: the classic prompt-injection imperatives.
SIGNAL_WORDS_CRITICAL_EN: list[str] = [
    "forget",
    "override",
    "ignore previous",
    "you are now",
    "from now on",
    "system:",
    "admin:",
    "instruction:",
    "disregard",
    "new instructions",
    "ignore previous instructions",
]

# English warning signals: softer rephrasings that are still suspicious.
SIGNAL_WORDS_WARNING_EN: list[str] = [
    "different",
    "instead",
    "actually",
    "update",
]

# Critical signal words per language (D-02a mandate), keyed by ISO-639-1 code.
# Each list holds only a minimal set of strictly imperative tokens. This is
# deliberately conservative: at this tier a false positive on legitimate
# non-English chatter costs more than a false negative, because the shield is
# just one slice of defence-in-depth.
SIGNAL_WORDS_CRITICAL_BY_LANG: dict[str, list[str]] = {
    "ru": [
        "забудь",
        "забыть",
        "игнорируй",
        "отмени",
        "сбрось",
        "новые инструкции",
        "теперь ты",
        "с этого момента",
    ],
    "ja": [
        "忘れて",
        "無視",
        "リセット",
        "新しい指示",
        "これから",
        "今から",
    ],
    "ar": [
        "انسى",
        "تجاهل",
        "إعادة تعيين",
        "تعليمات جديدة",
        "أنت الآن",
    ],
    "de": [
        "vergiss",
        "ignoriere",
        "überschreibe",
        "neue anweisungen",
        "ab jetzt",
    ],
    "fr": [
        "oublie",
        "ignore",
        "remplace",
        "nouvelles instructions",
    ],
    "es": [
        "olvida",
        "ignora",
        "sobrescribe",
        "nuevas instrucciones",
    ],
    "zh": [
        "忘记",
        "忽略",
        "重置",
        "新指令",
        "从现在开始",
    ],
}
|
||
|
||
|
||
# ------------------------------------------------------------ enums + types
|
||
|
||
|
||
class ShieldTier(str, Enum):
    """Deployment tiers of the shield (D-31).

    Members are ``str`` subclasses, so a member compares equal to its plain
    string value.
    """

    # L0 identity + S5 invariants.
    HARD_BLOCK = "hard_block"
    # Profile updates.
    FLAG_FOR_REVIEW = "flag"
    # Content records.
    LOG_ONLY = "log"
|
||
|
||
|
||
@dataclass
class ShieldVerdict:
    """Outcome of an injection-risk evaluation of one piece of text."""

    # Tier the text was evaluated under.
    tier: ShieldTier
    # True when at least one signal pattern matched.
    detected: bool
    # Signal patterns that were found in the text.
    matched_patterns: list[str] = field(default_factory=list)
    # One of "info", "warning" or "critical".
    severity: str = "info"
    # One of "reject", "flag" or "log_allow".
    action: str = "log_allow"
    # Human-readable explanation of the verdict.
    reason: str = ""
    # Language code the evaluation was run for, if any.
    language: str | None = None
    # Analytics-only confidence attached to the verdict.
    confidence: float = 0.0
|
||
|
||
|
||
# ------------------------------------------------------------ private helpers
|
||
|
||
|
||
def _signal_lists_for_language(
    lang: str | None,
) -> tuple[list[str], list[str]]:
    """Return fresh ``(critical, warning)`` signal-word lists for ``lang``.

    English signals are ALWAYS included (prompt-injection attempts are often
    copy-pasted English regardless of the user's native language). When
    ``lang`` names a language with its own catalogue, that language's
    critical words are appended to the English ones. There are no
    per-language warning words, so the warning list is always the English one.

    The tag is matched on its lower-cased primary subtag, so ``"RU"``,
    ``"ru-RU"``, ``"zh-cn"`` and ``"zh_CN"`` select the same catalogue as
    ``"ru"`` / ``"zh"``. An exact, case-sensitive lookup would silently drop
    the per-language words for such tags and leave the shield English-only.

    Both returned lists are copies; callers may mutate them freely.
    """
    critical = list(SIGNAL_WORDS_CRITICAL_EN)
    warning = list(SIGNAL_WORDS_WARNING_EN)
    # Non-string (or missing) tags keep the English-only behaviour.
    if isinstance(lang, str):
        # "zh_CN" -> "zh-cn" -> "zh": keep only the primary language subtag.
        primary = lang.strip().lower().replace("_", "-").split("-", 1)[0]
        critical.extend(SIGNAL_WORDS_CRITICAL_BY_LANG.get(primary, ()))
    return critical, warning
|
||
|
||
|
||
def _match_patterns(text: str, patterns: list[str]) -> list[str]:
|
||
"""Return the subset of patterns present in the (lowercased) text.
|
||
|
||
For Latin-script patterns we lowercase both sides. For non-ASCII scripts
|
||
(Cyrillic, Hiragana, CJK, Arabic) lowercasing is either identity-preserving
|
||
(CJK has no case) or handled uniformly by str.lower() which is safe for
|
||
our lists.
|
||
"""
|
||
t = (text or "").lower()
|
||
out: list[str] = []
|
||
for p in patterns:
|
||
if p.lower() in t:
|
||
out.append(p)
|
||
return out
|
||
|
||
|
||
# ------------------------------------------------------------ public API
|
||
|
||
|
||
def evaluate_injection_risk(
    text: str,
    tier: ShieldTier,
    target_language: str | None = None,
) -> ShieldVerdict:
    """Score ``text`` for injection signals under the given tier.

    This is the detection core and emits no events. The outcome depends on
    the tier once anything (critical or warning) has matched:

        HARD_BLOCK       -> action "reject",    severity "critical"
        FLAG_FOR_REVIEW  -> action "flag",      severity "warning"
        LOG_ONLY         -> action "log_allow", severity "info"

    With no match the verdict has ``detected=False``, action "log_allow" and
    confidence 0.0. Any tier other than the first two is handled as LOG_ONLY.
    """
    critical_words, warning_words = _signal_lists_for_language(target_language)
    critical_hits = _match_patterns(text, critical_words)
    hits = critical_hits + _match_patterns(text, warning_words)

    # Nothing matched: clean verdict, independent of the tier.
    if not hits:
        return ShieldVerdict(
            tier=tier,
            detected=False,
            matched_patterns=[],
            severity="info",
            action="log_allow",
            reason="no signal patterns detected",
            language=target_language,
            confidence=0.0,
        )

    # A critical hit reports the higher confidence; warning-only the lower.
    if critical_hits:
        confidence = SHIELD_SIGNAL_WORDS_MAX_CONFIDENCE
    else:
        confidence = SHIELD_FLAG_CONFIDENCE

    # Only these three fields (and the label in the reason) vary by tier.
    if tier == ShieldTier.HARD_BLOCK:
        severity, action, label = "critical", "reject", "HARD_BLOCK"
    elif tier == ShieldTier.FLAG_FOR_REVIEW:
        severity, action, label = "warning", "flag", "FLAG"
    else:
        severity, action, label = "info", "log_allow", "LOG"

    return ShieldVerdict(
        tier=tier,
        detected=True,
        matched_patterns=hits,
        severity=severity,
        action=action,
        reason=f"injection signals detected in {label} tier: {hits}",
        language=target_language,
        confidence=confidence,
    )
|
||
|
||
|
||
def apply_shield(
    store: Any,   # MemoryStore
    record: Any,  # MemoryRecord (avoids import cycle with types)
    tier: ShieldTier,
    session_id: str = "-",
) -> ShieldVerdict:
    """Evaluate ``record`` and write a shield event when something matched.

    The record's ``literal_surface`` is evaluated with its ``language`` as
    the target language. The event kind follows the verdict's action:

      - "reject"    -> "shield_rejection"
      - "flag"      -> "shield_flag"
      - "log_allow" -> "shield_log"

    and the event severity is the verdict's severity. A verdict with
    ``detected=False`` writes nothing, keeping the events table free of
    noise. The verdict is returned in every case.
    """
    verdict = evaluate_injection_risk(
        record.literal_surface or "",
        tier,
        target_language=record.language or None,
    )
    if not verdict.detected:
        return verdict

    # Any action other than reject/flag is recorded as a plain log event.
    event_kinds = {"reject": "shield_rejection", "flag": "shield_flag"}
    event_kind = event_kinds.get(verdict.action, "shield_log")

    # Cap the payload (10 patterns, 80 characters each) so the events table
    # cannot grow unbounded on adversarial input.
    matched_clipped = [str(p)[:80] for p in verdict.matched_patterns[:10]]

    record_id = record.id
    # Only a genuine UUID is passed on as a source id.
    source_ids: list[UUID] = [record_id] if isinstance(record_id, UUID) else []

    write_event(
        store,
        kind=event_kind,
        data={
            "record_id": str(record_id) if record_id is not None else None,
            "tier": verdict.tier.value,
            "matched": matched_clipped,
            "language": record.language,
            "action": verdict.action,
            "confidence": verdict.confidence,
        },
        severity=verdict.severity,
        session_id=session_id,
        source_ids=source_ids,
    )
    return verdict
|
||
|
||
|
||
# Public API of the module: constants first, then types, then functions.
__all__ = [
    "DRIFT_M4_ANOMALY_SIGMA", "SHIELD_FLAG_CONFIDENCE",
    "SHIELD_LANGUAGES_SUPPORTED", "SHIELD_SIGNAL_WORDS_MAX_CONFIDENCE",
    "SIGNAL_WORDS_CRITICAL_BY_LANG", "SIGNAL_WORDS_CRITICAL_EN",
    "SIGNAL_WORDS_WARNING_EN",
    "ShieldTier", "ShieldVerdict",
    "apply_shield", "evaluate_injection_risk",
]
|