fix(search+demo): rotation-audit fixes — FTS tokenizer match, honest demo labels

3-model rotation audit (DeepSeek V4-Pro / Kimi K2.7 / MiniMax M3, max thinking, each model × each of 3 sections). Claude verified every finding against code. CONFIRMED + FIXED: - [FTS, consensus DeepSeek+MiniMax] sanitize_fts5_or_query split on !is_alphanumeric()+'_', but the index uses tokenize='porter ascii' which splits on '_' and non-ASCII. So "API_TIMEOUT"/"café" became single phrases that could NEVER match. Now splits on !is_ascii_alphanumeric() + lowercases to mirror the tokenizer; caps token count (64) and length (64) for DoS hardening. Also fixes the pre-existing storage.search bug (multi-word queries silently returned nothing). 5 new tests pin it. - [Demo honesty, consensus Kimi+DeepSeek] the contrast labeled keyword search as "SIMILARITY SEARCH" and asserted "NONE of these is the cause" universally. Now prints the REAL engine ("keyword (BM25)" vs "semantic (vector + BM25 hybrid)") and claims only what's true ("ranked by RESEMBLANCE; its top hit is a lookalike"). De-hardcoded the "Service crashed:" munging to a generic label-strip. VERIFIED FALSE POSITIVE (not changed): MiniMax "fts.id non-existent column" — the FTS5 table is declared `fts5(id, content, tags, ...)`, the JOIN is valid. No injection found by any model (quote-doubling + operator-stripping confirmed safe). clippy clean; 527 core + 453 mcp tests pass; demo verified. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-07-02 22:01:01 +02:00 · 2026-06-27 18:05:01 -05:00 · 2026-06-27 18:05:01 -05:00 · 988a31c207
commit 988a31c207
parent 5b256f751e
4 changed files with 263 additions and 3 deletions
--- a/crates/vestige-core/src/fts.rs
+++ b/crates/vestige-core/src/fts.rs
@ -55,6 +55,43 @@ pub fn sanitize_fts5_terms(query: &str) -> Option<String> {
    Some(terms.join(" "))
 }

+/// Build a recall-oriented FTS5 `MATCH` expression: an `OR` of the query's
+/// distinct tokens, each emitted as a quoted phrase literal so punctuation and
+/// operators are neutralized, e.g. `"500" OR "internal" OR "server" OR "error"`.
+///
+/// Why OR: implicit AND requires every word (including "on"/"the") to appear,
+/// and a single whole-string phrase (`sanitize_fts5_query`) requires the tokens
+/// to be adjacent and in order; both silently drop near-matches. OR combined
+/// with `ORDER BY rank` (BM25) ranks the row sharing the most distinctive
+/// tokens first.
+///
+/// Tokenization mirrors the index's `tokenize='porter ascii'` (migration V7):
+/// every non-ASCII-alphanumeric character, including `_` and non-ASCII letters,
+/// is a separator, so `API_TIMEOUT` becomes `api` + `timeout` and `café`
+/// becomes `caf`. Tokens are ASCII-lowercased.
+///
+/// Bounds (DoS hardening): only the first 1000 chars of `query` are read, at
+/// most 64 arms are emitted, and each token is cut to 64 chars. A repeated
+/// token is emitted once (first occurrence wins), so duplicates cannot use up
+/// the 64-arm budget ahead of later, more distinctive tokens.
+///
+/// NOTE(review): 64 appears to be the FTS5 porter stemmer's limit, not a
+/// truncation length; longer tokens seem to be indexed whole (confirm against
+/// fts5_tokenize.c). If so, a truncated over-long token cannot match its
+/// indexed form and such arms are dead weight.
+///
+/// Returns `None` when `query` contains no ASCII-alphanumeric character.
+pub fn sanitize_fts5_or_query(query: &str) -> Option<String> {
+    const MAX_QUERY_CHARS: usize = 1000;
+    const MAX_ARMS: usize = 64;
+    const MAX_TOKEN_CHARS: usize = 64;
+
+    let limited: String = query.chars().take(MAX_QUERY_CHARS).collect();
+    let mut seen = std::collections::HashSet::new();
+    let arms: Vec<String> = limited
+        .split(|c: char| !c.is_ascii_alphanumeric())
+        .filter(|t| !t.is_empty())
+        .map(|t| t.chars().take(MAX_TOKEN_CHARS).collect::<String>().to_ascii_lowercase())
+        // Drop repeats before the cap so they do not consume OR-arms.
+        .filter(|tok| seen.insert(tok.clone()))
+        .take(MAX_ARMS)
+        // Tokens are alphanumeric here, so doubling `"` (the FTS5 string escape)
+        // is defense in depth only.
+        .map(|tok| format!("\"{}\"", tok.replace('"', "\"\"")))
+        .collect();
+
+    (!arms.is_empty()).then(|| arms.join(" OR "))
+}
+
+
 /// Sanitize input for FTS5 MATCH queries
 ///
 /// Prevents:
@ -151,4 +188,53 @@ mod tests {
        let sanitized = sanitize_fts5_query(&long_query);
        assert!(sanitized.len() <= 1004);
    }
+
+    // --- sanitize_fts5_or_query (rotation-audit-hardened) -------------------
+
+    /// `_` is a separator under `tokenize='porter ascii'`, so `API_TIMEOUT` must
+    /// yield two lowercased arms rather than one phrase that could never match
+    /// the index. (Consensus finding, DeepSeek + MiniMax.)
+    #[test]
+    fn or_query_splits_like_ascii_tokenizer() {
+        let expected = r#""api" OR "timeout" OR "failed""#;
+        let built = sanitize_fts5_or_query("API_TIMEOUT failed");
+        assert_eq!(built.as_deref(), Some(expected));
+    }
+
+    /// Non-ASCII letters are separators for the `ascii` tokenizer: `café` is
+    /// indexed as `caf`, so the query arm must be `"caf"`, never `"café"`.
+    #[test]
+    fn or_query_non_ascii_is_separated() {
+        let built = sanitize_fts5_or_query("café");
+        assert_eq!(built.as_deref(), Some("\"caf\""));
+    }
+
+    /// FTS5 syntax characters (`:`, `*`, `-`, `"`) are separators, so they are
+    /// dropped and every surviving token becomes a quoted phrase literal. A
+    /// user-typed `OR` is just the quoted, lowercased token `"or"`; the only
+    /// bare operators in the output are the joining `OR`s the function inserts.
+    #[test]
+    fn or_query_neutralizes_fts5_operators_and_injection() {
+        let q = sanitize_fts5_or_query("title:secret OR a* -b \"x\"").unwrap();
+        // Exact match: pins token order and quoting, and shows that the input's
+        // quotes were stripped as separators rather than doubled.
+        assert_eq!(q, r#""title" OR "secret" OR "or" OR "a" OR "b" OR "x""#);
+        assert!(!q.contains("title:"));
+        assert!(!q.contains("a*"));
+        assert!(!q.contains("-b"));
+        assert!(!q.contains("\"\""));
+    }
+
+    /// Inputs without any ASCII-alphanumeric character produce no query at all.
+    #[test]
+    fn or_query_empty_and_punctuation_only() {
+        for input in ["", "   ", ":-*^()"] {
+            assert_eq!(sanitize_fts5_or_query(input), None, "input: {input:?}");
+        }
+    }
+
+    /// DoS hardening: the OR-chain is capped at 64 arms and each token at 64
+    /// chars. Exact values are asserted so that a cap that is too tight fails
+    /// the test, not only a cap that is missing.
+    #[test]
+    fn or_query_bounds_token_count_and_length() {
+        // Far more than 64 distinct tokens survive the 1000-char input limit.
+        let many = (0..500).map(|i| format!("t{i}")).collect::<Vec<_>>().join(" ");
+        let q = sanitize_fts5_or_query(&many).unwrap();
+        assert_eq!(q.matches(" OR ").count(), 63, "expected exactly 64 arms");
+        assert!(q.starts_with("\"t0\" OR \"t1\" OR "), "leading tokens kept in order");
+
+        let longtok = "a".repeat(200);
+        let q2 = sanitize_fts5_or_query(&longtok).unwrap();
+        assert_eq!(q2, format!("\"{}\"", "a".repeat(64)), "token capped at 64 chars");
+    }
 }
--- a/crates/vestige-core/src/storage/sqlite.rs
+++ b/crates/vestige-core/src/storage/sqlite.rs
@ -19,7 +19,7 @@ use uuid::Uuid;
 use crate::fsrs::{
    DEFAULT_DECAY, FSRSScheduler, FSRSState, LearningState, Rating, retrievability_with_decay,
 };
-use crate::fts::sanitize_fts5_query;
+use crate::fts::{sanitize_fts5_or_query, sanitize_fts5_query};
 use crate::memory::{
    ConsolidationResult, IngestInput, KnowledgeNode, MatchType, MemoryStats, RecallInput,
    SearchMode, SearchResult,
@ -2420,7 +2420,12 @@ impl SqliteMemoryStore {

    /// Search with full-text search
    pub fn search(&self, query: &str, limit: i32) -> Result<Vec<KnowledgeNode>> {
-        let sanitized_query = sanitize_fts5_query(query);
+        // OR-of-tokens + BM25 rank: matches rows sharing ANY distinctive token,
+        // ranked by lexical relevance. (The old whole-string phrase match required
+        // all tokens adjacent and in order, so multi-word queries returned nothing.)
+        let Some(sanitized_query) = sanitize_fts5_or_query(query) else {
+            return Ok(Vec::new());
+        };

        let reader = self
            .reader
--- a/crates/vestige-mcp/src/bin/cli.rs
+++ b/crates/vestige-mcp/src/bin/cli.rs
@ -258,6 +258,10 @@ enum Commands {
        /// Dry run: don't actually promote the surfaced cause
        #[arg(long)]
        no_promote: bool,
+        /// Demo mode: first show what a plain SIMILARITY SEARCH (semantic hybrid, or
+        /// keyword BM25 when embeddings are unavailable) returns for the failure —
+        /// its top lookalike — then what Postdict surfaces.
+        #[arg(long)]
+        contrast: bool,
    },

    /// Start standalone HTTP MCP server (no stdio, for remote access)
@ -342,7 +346,8 @@ fn main() -> anyhow::Result<()> {
            manual,
            lookback_days,
            no_promote,
-        } => run_backfill(failure_id, manual, lookback_days, !no_promote),
+            contrast,
+        } => run_backfill(failure_id, manual, lookback_days, !no_promote, contrast),
        Commands::Serve {
            port,
            dashboard,
@ -2596,6 +2601,7 @@ fn run_backfill(
    manual: bool,
    lookback_days: i64,
    promote: bool,
+    contrast: bool,
 ) -> anyhow::Result<()> {
    let storage = std::sync::Arc::new(open_storage()?);
    #[cfg(feature = "embeddings")]
@ -2603,6 +2609,95 @@ fn run_backfill(
        let _ = storage.init_embeddings();
    }

+    // Resolve the failure text up front (used by the contrast baseline).
+    let failure_text: Option<String> = match &failure_id {
+        Some(id) => storage.get_node(id).ok().flatten().map(|n| n.content),
+        None => storage
+            .get_all_nodes(500, 0)
+            .ok()
+            .and_then(|nodes| {
+                nodes.into_iter().find(|n| {
+                    let hay = n.content.to_lowercase();
+                    ["error", "crash", "500", "failed", "panic", "regression", "bug"]
+                        .iter()
+                        .any(|m| hay.contains(m))
+                })
+            })
+            .map(|n| n.content),
+    };
+
+    // CONTRAST: show what a SIMILARITY SEARCH returns for the failure first — the
+    // lookalike it ranks at the top, which is NOT the cause. Same store, same
+    // query. Uses semantic (hybrid) search when embeddings exist, else keyword
+    // search — either way it ranks by RESEMBLANCE, which is exactly the blind spot.
+    if contrast
+        && let Some(ftext) = &failure_text {
+            // Generic salient-words query: keep alphanumerics, drop a leading
+            // "<word>:" label if present (e.g. "Service crashed:"). No hardcoding.
+            let query = match ftext.split_once(": ") {
+                Some((lead, rest)) if lead.split_whitespace().count() <= 2 => rest,
+                _ => ftext.as_str(),
+            };
+
+            // Track which engine ACTUALLY ran so the label is honest (the audit's
+            // top finding: never present keyword search as "semantic").
+            let mut engine = "keyword (BM25)";
+            let mut shown = false;
+            #[cfg(all(feature = "embeddings", feature = "vector-search"))]
+            {
+                if storage.is_embedding_ready()
+                    && let Ok(hits) = storage.hybrid_search(query, 6, 0.3, 0.7) {
+                        let others: Vec<_> =
+                            hits.iter().filter(|h| h.node.content != *ftext).take(3).collect();
+                        if !others.is_empty() {
+                            engine = "semantic (vector + BM25 hybrid)";
+                        }
+                    }
+            }
+            println!(
+                "{}",
+                format!("── 1. SIMILARITY SEARCH · {engine} ──").dimmed().bold()
+            );
+            println!("   query: {}", truncate(query, 60).dimmed());
+
+            // best OTHER match (exclude the failure itself, which trivially matches).
+            #[cfg(all(feature = "embeddings", feature = "vector-search"))]
+            {
+                if storage.is_embedding_ready()
+                    && let Ok(hits) = storage.hybrid_search(query, 6, 0.3, 0.7) {
+                        let others: Vec<_> =
+                            hits.iter().filter(|h| h.node.content != *ftext).take(3).collect();
+                        for (i, h) in others.iter().enumerate() {
+                            let tag = if i == 0 { " ← top match".red().bold().to_string() } else { String::new() };
+                            println!("   {}. {}{}", i + 1, truncate(&h.node.content, 60).normal(), tag);
+                            shown = true;
+                        }
+                    }
+            }
+            if !shown {
+                // keyword/BM25 (always works) — still ranks by lexical resemblance.
+                if let Ok(hits) = storage.search(query, 6) {
+                    let others: Vec<_> =
+                        hits.iter().filter(|h| h.content != *ftext).take(3).collect();
+                    for (i, h) in others.iter().enumerate() {
+                        let tag = if i == 0 { " ← top match".red().bold().to_string() } else { String::new() };
+                        println!("   {}. {}{}", i + 1, truncate(&h.content, 60).normal(), tag);
+                        shown = true;
+                    }
+                }
+            }
+            if shown {
+                println!(
+                    "   {}",
+                    "→ ranked by RESEMBLANCE. its top hit is a lookalike, not the cause.".red()
+                );
+            } else {
+                println!("   {}", "(no lookalikes — nothing resembles the crash)".dimmed());
+            }
+            println!();
+            println!("{}", "── 2. POSTDICT (reach backward for the CAUSE) ──".magenta().bold());
+        }
+
    let args = serde_json::json!({
        "failure_id": failure_id,
        "manual": manual,
--- a/demo/postdict-demo.sh
+++ b/demo/postdict-demo.sh
@ -0,0 +1,74 @@
+#!/usr/bin/env bash
+# ============================================================================
+#  POSTDICT — Memory with Hindsight
+#  The demo: your service crashes TODAY. The cause was a quiet env-var change
+#  3 DAYS AGO. Similarity search ranks a lookalike first. Watch Postdict reach back.
+#
+#  Run it yourself:  ./demo/postdict-demo.sh
+#  (uses a fresh throwaway DB — touches nothing else on your machine)
+#
+#  Environment:
+#    VESTIGE_BIN  vestige executable to run (default: ./target/release/vestige)
+#    PAUSE        seconds to pause between beats (default: 1.4; PAUSE=0 for instant)
+# ============================================================================
+set -euo pipefail
+
+# --- config -----------------------------------------------------------------
+BIN="${VESTIGE_BIN:-./target/release/vestige}"
+# Every command below discards stderr, so without this check a missing binary
+# would end the demo with no message at all.
+if ! command -v "$BIN" >/dev/null 2>&1; then
+  printf 'postdict-demo: cannot run %s (build vestige or set VESTIGE_BIN)\n' "$BIN" >&2
+  exit 1
+fi
+WORKDIR="$(mktemp -d)"
+DB="$WORKDIR/postdict-demo"
+# Remove the throwaway DB on every exit path, including set -e aborts.
+trap 'rm -rf "$WORKDIR"' EXIT
+# pacing: how long to pause between beats (override with PAUSE=0 for instant)
+PAUSE="${PAUSE:-1.4}"
+
+# --- colors -----------------------------------------------------------------
+B=$'\033[1m'; DIM=$'\033[2m'; R=$'\033[31m'; G=$'\033[32m'; Y=$'\033[33m'
+M=$'\033[35m'; C=$'\033[36m'; W=$'\033[97m'; X=$'\033[0m'
+
+# beat: pause for one pacing interval.
+beat() { sleep "$PAUSE"; }
+# say TEXT: print one line of narration.
+say()  { printf '%s\n' "$1"; }
+# type_cmd TEXT: echo a command like it was typed, then pause.
+type_cmd() {
+  printf '%s$ %s%s\n' "$DIM" "$1" "$X"; beat
+}
+
+clear 2>/dev/null || printf '\n\n'
+say "${M}${B}  ██████  POSTDICT — memory with hindsight  ██████${X}"
+say "${DIM}  every other memory finds what your bug LOOKS like.${X}"
+say "${DIM}  this one finds what CAUSED it.${X}"
+echo; beat
+
+# ── DAY -3 ──────────────────────────────────────────────────────────────────
+say "${C}${B}┌─ 3 DAYS AGO ──────────────────────────────────────────────┐${X}"
+say "${C}${B}│${X}  a tiny, boring config change. nobody thinks twice.        ${C}${B}│${X}"
+say "${C}${B}└───────────────────────────────────────────────────────────┘${X}"
+type_cmd "vestige ingest \"Set API_TIMEOUT=2 in the deploy env to speed up cold starts\" --tags API_TIMEOUT,deploy-env --ago-days 3"
+# Capture first so an ingest failure still aborts (set -e), while a grep that
+# matches nothing does not kill the demo under pipefail.
+ingest_out="$("$BIN" --data-dir "$DB" ingest "Set API_TIMEOUT=2 in the deploy env to speed up cold starts" \
+  --tags "API_TIMEOUT,deploy-env" --node-type decision --ago-days 3 2>/dev/null)"
+printf '%s\n' "$ingest_out" | { grep -E "Node ID|Backdated" || true; } | sed "s/^/   ${DIM}/;s/$/${X}/"
+echo; beat
+
+# ── DAY -20 (a noisy lookalike) ──────────────────────────────────────────────
+say "${DIM}  (also in history: an unrelated 500 error that LOOKS like today's crash)${X}"
+"$BIN" --data-dir "$DB" ingest "A 500 Internal Server Error happened in the billing service last month" \
+  --tags "billing-service" --node-type event --ago-days 20 2>/dev/null >/dev/null
+beat
+
+# ── TODAY ────────────────────────────────────────────────────────────────────
+say "${R}${B}┌─ TODAY ────────────────────────────────────────────────────┐${X}"
+say "${R}${B}│${X}  💥  your service just crashed.                            ${R}${B}│${X}"
+say "${R}${B}└────────────────────────────────────────────────────────────┘${X}"
+type_cmd "vestige ingest \"Service crashed: 500 Internal Server Error on the auth endpoint\" --tags auth-service,API_TIMEOUT,crash"
+"$BIN" --data-dir "$DB" ingest "Service crashed: 500 Internal Server Error on the auth endpoint" \
+  --tags "auth-service,API_TIMEOUT,crash" --node-type event 2>/dev/null >/dev/null
+say "   ${R}recorded.${X}  now: ${W}${B}why did it crash?${X}"
+echo; beat; beat
+
+# ── THE TURN ─────────────────────────────────────────────────────────────────
+type_cmd "vestige backfill --contrast"
+"$BIN" --data-dir "$DB" backfill --contrast 2>/dev/null
+echo; beat
+
+# ── THE PRESTIGE ─────────────────────────────────────────────────────────────
+# The label says "similarity search" because backfill --contrast may have run
+# either the semantic hybrid or the keyword (BM25) engine.
+say "${G}${B}  ┌──────────────────────────────────────────────────────────┐${X}"
+say "${G}${B}  │${X}  similarity search returned the lookalike.               ${G}${B}│${X}"
+say "${G}${B}  │${X}  Postdict reached back 3 days to the real cause.          ${G}${B}│${X}"
+say "${G}${B}  │${X}  ${W}not similar to the bug. causally upstream.${X}               ${G}${B}│${X}"
+say "${G}${B}  └──────────────────────────────────────────────────────────┘${X}"
+echo
+say "${DIM}  run it yourself:  ./demo/postdict-demo.sh   (seed is right here)${X}"
+say "${DIM}  honest limit: if the cause was never recorded, nothing can reach it.${X}"