diff --git a/crates/vestige-core/src/fts.rs b/crates/vestige-core/src/fts.rs
index efebb71..66e2299 100644
--- a/crates/vestige-core/src/fts.rs
+++ b/crates/vestige-core/src/fts.rs
@@ -55,6 +55,62 @@ pub fn sanitize_fts5_terms(query: &str) -> Option<String> {
     Some(terms.join(" "))
 }
 
+/// Build a RECALL-friendly FTS5 query that matches rows containing ANY of the
+/// query's tokens, each quoted as a phrase literal so punctuation/operators are
+/// neutralized. Produces e.g. `"500" OR "internal" OR "server" OR "error"`.
+///
+/// This is the correct default for natural-language similarity search: implicit
+/// AND (the old behavior) requires every word — including "on"/"the" — to appear,
+/// which silently drops near-matches; wrapping the whole string in one phrase
+/// (the prior `sanitize_fts5_query`) requires the tokens to be adjacent and in
+/// order, which drops nearly everything. OR + `ORDER BY rank` (BM25) ranks the
+/// row sharing the most distinctive tokens first — true lexical resemblance.
+///
+/// Tokenization mirrors the index's `tokenize='porter ascii'` (migration V7).
+/// Per https://sqlite.org/fts5.html the `ascii` tokenizer treats ASCII
+/// non-alphanumerics (including `_`) as separators, but non-ASCII characters
+/// are ALWAYS token characters, and case is folded for ASCII only. So we split
+/// on ASCII non-alphanumerics only: `API_TIMEOUT` becomes `"api" OR "timeout"`
+/// (either half can match on its own), while `café` stays the single token
+/// `"café"`, exactly as indexed. Splitting on non-ASCII as well would emit
+/// `"caf"`, which does not match an indexed `café`.
+///
+/// Bounds (DoS hardening): only the first 1000 characters are examined, at most
+/// 64 tokens are kept, and each token is truncated to 64 characters. A truncated
+/// token no longer matches its longer indexed form; that is the accepted cost.
+///
+/// Returns `None` when the query contains no token characters.
+pub fn sanitize_fts5_or_query(query: &str) -> Option<String> {
+    const MAX_QUERY_CHARS: usize = 1000;
+    const MAX_TOKENS: usize = 64;
+    const MAX_TOKEN_CHARS: usize = 64;
+
+    // Borrow the bounded prefix instead of copying it; an offset taken from
+    // `char_indices` always lies on a char boundary, so the slice cannot panic.
+    let limited = match query.char_indices().nth(MAX_QUERY_CHARS) {
+        Some((end, _)) => &query[..end],
+        None => query,
+    };
+    let arms: Vec<String> = limited
+        // separator == ASCII and not alphanumeric; non-ASCII stays in the token
+        .split(|c: char| c.is_ascii() && !c.is_ascii_alphanumeric())
+        .filter(|t| !t.is_empty())
+        .take(MAX_TOKENS) // bound the OR-chain length
+        .map(|t| {
+            // ASCII-only lowercasing mirrors the tokenizer's case folding.
+            let tok: String = t.chars().take(MAX_TOKEN_CHARS).collect();
+            // `"` is a separator so it cannot occur here; doubling it anyway
+            // keeps the phrase literal safe if the split rule ever changes.
+            format!("\"{}\"", tok.to_ascii_lowercase().replace('"', "\"\""))
+        })
+        .collect();
+    if arms.is_empty() {
+        None
+    } else {
+        Some(arms.join(" OR "))
+    }
+}
+
 /// Sanitize input for FTS5 MATCH queries
 ///
 /// Prevents:
@@ -151,4 +207,57 @@ mod tests {
         let sanitized = sanitize_fts5_query(&long_query);
         assert!(sanitized.len() <= 1004);
     }
+
+    // --- sanitize_fts5_or_query (rotation-audit-hardened) -------------------
+
+    #[test]
+    fn or_query_splits_like_ascii_tokenizer() {
+        // The index uses tokenize='porter ascii': '_' is a separator there, so
+        // API_TIMEOUT is indexed as two tokens. Emit them as two lowercased OR
+        // arms so either half can match on its own.
+        let q = sanitize_fts5_or_query("API_TIMEOUT failed").unwrap();
+        assert_eq!(q, "\"api\" OR \"timeout\" OR \"failed\"");
+    }
+
+    #[test]
+    fn or_query_non_ascii_stays_in_token() {
+        // The ascii tokenizer always treats non-ASCII characters as token
+        // characters, so the index holds "café", not "caf". The query must
+        // keep the word whole or it can never match.
+        let q = sanitize_fts5_or_query("café").unwrap();
+        assert_eq!(q, "\"café\"");
+        // Case folding is ASCII-only: "É" is left untouched.
+        let q = sanitize_fts5_or_query("CAFÉ au lait").unwrap();
+        assert_eq!(q, "\"cafÉ\" OR \"au\" OR \"lait\"");
+    }
+
+    #[test]
+    fn or_query_neutralizes_fts5_operators_and_injection() {
+        // Operators/columns/wildcards/quotes are all separators -> stripped, and
+        // every surviving token is emitted as a quoted phrase literal. No bare
+        // operator survives except our own joining OR.
+        let q = sanitize_fts5_or_query("title:secret OR a* -b \"x\"").unwrap();
+        assert_eq!(
+            q,
+            "\"title\" OR \"secret\" OR \"or\" OR \"a\" OR \"b\" OR \"x\""
+        );
+    }
+
+    #[test]
+    fn or_query_empty_and_punctuation_only() {
+        assert_eq!(sanitize_fts5_or_query(""), None);
+        assert_eq!(sanitize_fts5_or_query(" "), None);
+        assert_eq!(sanitize_fts5_or_query(":-*^()"), None);
+    }
+
+    #[test]
+    fn or_query_bounds_token_count_and_length() {
+        // DoS hardening: <=64 arms, each token <=64 chars.
+        let many = (0..500).map(|i| format!("t{i}")).collect::<Vec<_>>().join(" ");
+        let q = sanitize_fts5_or_query(&many).unwrap();
+        assert!(q.matches(" OR ").count() <= 63, "OR-chain must be bounded");
+        let longtok = "a".repeat(200);
+        let q2 = sanitize_fts5_or_query(&longtok).unwrap();
+        assert!(q2.len() <= 66, "single token capped at 64 + quotes, got {}", q2.len());
+    }
 }
diff --git a/crates/vestige-core/src/storage/sqlite.rs b/crates/vestige-core/src/storage/sqlite.rs
index 17fd645..078cbec 100644
--- a/crates/vestige-core/src/storage/sqlite.rs
+++ b/crates/vestige-core/src/storage/sqlite.rs
@@ -19,7 +19,7 @@ use uuid::Uuid;
 use crate::fsrs::{
     DEFAULT_DECAY, FSRSScheduler, FSRSState, LearningState, Rating, retrievability_with_decay,
 };
-use crate::fts::sanitize_fts5_query;
+use crate::fts::{sanitize_fts5_or_query, sanitize_fts5_query};
 use crate::memory::{
     ConsolidationResult, IngestInput, KnowledgeNode, MatchType, MemoryStats, RecallInput,
     SearchMode, SearchResult,
@@ -2420,7 +2420,12 @@ impl SqliteMemoryStore {
 
     /// Search with full-text search
     pub fn search(&self, query: &str, limit: i32) -> Result<Vec<KnowledgeNode>> {
-        let sanitized_query = sanitize_fts5_query(query);
+        // OR-of-tokens + BM25 rank: matches rows sharing ANY distinctive token,
+        // ranked by lexical relevance. (The old whole-string phrase match required
+        // all tokens adjacent and in order, so multi-word queries returned nothing.)
+        let Some(sanitized_query) = sanitize_fts5_or_query(query) else {
+            return Ok(Vec::new());
+        };
 
         let reader = self
             .reader
diff --git a/crates/vestige-mcp/src/bin/cli.rs b/crates/vestige-mcp/src/bin/cli.rs
index a200204..ceb7d5a 100644
--- a/crates/vestige-mcp/src/bin/cli.rs
+++ b/crates/vestige-mcp/src/bin/cli.rs
@@ -258,6 +258,10 @@ enum Commands {
         /// Dry run: don't actually promote the surfaced cause
         #[arg(long)]
         no_promote: bool,
+        /// Demo mode: first show what a plain SIMILARITY SEARCH (semantic if
+        /// embeddings are ready, else keyword) returns, then what Postdict surfaces.
+        #[arg(long)]
+        contrast: bool,
     },
 
     /// Start standalone HTTP MCP server (no stdio, for remote access)
@@ -342,7 +346,8 @@ fn main() -> anyhow::Result<()> {
             manual,
             lookback_days,
             no_promote,
-        } => run_backfill(failure_id, manual, lookback_days, !no_promote),
+            contrast,
+        } => run_backfill(failure_id, manual, lookback_days, !no_promote, contrast),
         Commands::Serve {
             port,
             dashboard,
@@ -2596,6 +2601,7 @@ fn run_backfill(
     manual: bool,
     lookback_days: i64,
     promote: bool,
+    contrast: bool,
 ) -> anyhow::Result<()> {
     let storage = std::sync::Arc::new(open_storage()?);
     #[cfg(feature = "embeddings")]
@@ -2603,6 +2609,102 @@ fn run_backfill(
         let _ = storage.init_embeddings();
     }
 
+    // Resolve the failure text up front. It is only needed for the contrast
+    // baseline, so skip the lookup (and the 500-node scan) without --contrast.
+    let failure_text: Option<String> = if !contrast {
+        None
+    } else {
+        match &failure_id {
+            Some(id) => storage.get_node(id).ok().flatten().map(|n| n.content),
+            // No explicit id: take the first of up to 500 nodes whose text
+            // contains a failure-looking marker word.
+            None => storage
+                .get_all_nodes(500, 0)
+                .ok()
+                .and_then(|nodes| {
+                    nodes.into_iter().find(|n| {
+                        let hay = n.content.to_lowercase();
+                        ["error", "crash", "500", "failed", "panic", "regression", "bug"]
+                            .iter()
+                            .any(|m| hay.contains(m))
+                    })
+                })
+                .map(|n| n.content),
+        }
+    };
+
+    // CONTRAST: show what a SIMILARITY SEARCH returns for the failure first — the
+    // lookalike it ranks at the top, which is NOT the cause. Same store, same
+    // query. Uses semantic (hybrid) search when embeddings exist, else keyword
+    // search — either way it ranks by RESEMBLANCE, which is exactly the blind spot.
+    if let Some(ftext) = &failure_text {
+        // Generic salient-words query: drop a short leading "<label>: " prefix
+        // (at most two words, e.g. "Service crashed: ") if present. No hardcoding.
+        let query = match ftext.split_once(": ") {
+            Some((lead, rest)) if lead.split_whitespace().count() <= 2 => rest,
+            _ => ftext.as_str(),
+        };
+
+        // Run the similarity search exactly ONCE and record which engine
+        // produced the hits, so the printed label can never disagree with the
+        // results (never present keyword search as "semantic"). The failure
+        // itself is excluded: it trivially matches its own text.
+        #[allow(unused_mut)] // only reassigned when the embedding features are on
+        let mut engine = "keyword (BM25)";
+        let mut lookalikes: Vec<String> = Vec::new();
+        #[cfg(all(feature = "embeddings", feature = "vector-search"))]
+        {
+            if storage.is_embedding_ready()
+                && let Ok(hits) = storage.hybrid_search(query, 6, 0.3, 0.7)
+            {
+                lookalikes = hits
+                    .iter()
+                    .filter(|h| h.node.content != *ftext)
+                    .take(3)
+                    .map(|h| h.node.content.clone())
+                    .collect();
+                if !lookalikes.is_empty() {
+                    engine = "semantic (vector + BM25 hybrid)";
+                }
+            }
+        }
+        if lookalikes.is_empty() {
+            // keyword/BM25 (always works) — still ranks by lexical resemblance.
+            if let Ok(hits) = storage.search(query, 6) {
+                lookalikes = hits
+                    .iter()
+                    .filter(|h| h.content != *ftext)
+                    .take(3)
+                    .map(|h| h.content.clone())
+                    .collect();
+            }
+        }
+
+        println!(
+            "{}",
+            format!("── 1. SIMILARITY SEARCH · {engine} ──").dimmed().bold()
+        );
+        println!(" query: {}", truncate(query, 60).dimmed());
+        for (i, content) in lookalikes.iter().enumerate() {
+            let tag = if i == 0 {
+                " ← top match".red().bold().to_string()
+            } else {
+                String::new()
+            };
+            println!(" {}. {}{}", i + 1, truncate(content, 60).normal(), tag);
+        }
+        if lookalikes.is_empty() {
+            println!(" {}", "(no lookalikes — nothing resembles the crash)".dimmed());
+        } else {
+            println!(
+                " {}",
+                "→ ranked by RESEMBLANCE. its top hit is a lookalike, not the cause.".red()
+            );
+        }
+        println!();
+        println!("{}", "── 2. POSTDICT (reach backward for the CAUSE) ──".magenta().bold());
+    }
+
     let args = serde_json::json!({
         "failure_id": failure_id,
         "manual": manual,
diff --git a/demo/postdict-demo.sh b/demo/postdict-demo.sh
new file mode 100755
index 0000000..9477d4d
--- /dev/null
+++ b/demo/postdict-demo.sh
@@ -0,0 +1,74 @@
+#!/usr/bin/env bash
+# ============================================================================
+# POSTDICT — Memory with Hindsight
+# The demo: your service crashes TODAY. The cause was a quiet env-var change
+# 3 DAYS AGO that vector search will never find. Watch Postdict reach back.
+#
+# Run it yourself: ./demo/postdict-demo.sh
+# (uses a fresh throwaway DB — touches nothing else on your machine)
+# ============================================================================
+set -euo pipefail
+
+# --- config -----------------------------------------------------------------
+BIN="${VESTIGE_BIN:-./target/release/vestige}"
+DB="$(mktemp -d)/postdict-demo"
+# always remove the throwaway DB dir, even when a step fails under `set -e`
+trap 'rm -rf "$(dirname "$DB")"' EXIT
+# pacing: how long to pause between beats (override with PAUSE=0 for instant)
+PAUSE="${PAUSE:-1.4}"
+
+# --- colors -----------------------------------------------------------------
+B=$'\033[1m'; DIM=$'\033[2m'; R=$'\033[31m'; G=$'\033[32m'; Y=$'\033[33m'
+M=$'\033[35m'; C=$'\033[36m'; W=$'\033[97m'; X=$'\033[0m'
+
+beat() { sleep "$PAUSE"; }
+say() { printf '%s\n' "$1"; }
+type_cmd() { # echo a command like it was typed
+  printf '%s$ %s%s\n' "$DIM" "$1" "$X"; beat
+}
+
+clear 2>/dev/null || printf '\n\n'
+say "${M}${B} ██████ POSTDICT — memory with hindsight ██████${X}"
+say "${DIM} every other memory finds what your bug LOOKS like.${X}"
+say "${DIM} this one finds what CAUSED it.${X}"
+echo; beat
+
+# ── DAY -3 ──────────────────────────────────────────────────────────────────
+say "${C}${B}┌─ 3 DAYS AGO ──────────────────────────────────────────────┐${X}"
+say "${C}${B}│${X}  a tiny, boring config change. nobody thinks twice.       ${C}${B}│${X}"
+say "${C}${B}└───────────────────────────────────────────────────────────┘${X}"
+type_cmd "vestige ingest \"Set API_TIMEOUT=2 in the deploy env to speed up cold starts\" --tags API_TIMEOUT,deploy-env --ago-days 3"
+"$BIN" --data-dir "$DB" ingest "Set API_TIMEOUT=2 in the deploy env to speed up cold starts" \
+  --tags "API_TIMEOUT,deploy-env" --node-type decision --ago-days 3 2>/dev/null | { grep -E "Node ID|Backdated" || true; } | sed "s/^/ ${DIM}/;s/$/${X}/"
+echo; beat
+
+# ── DAY -20 (a noisy lookalike) ──────────────────────────────────────────────
+say "${DIM} (also in history: an unrelated 500 error that LOOKS like today's crash)${X}"
+"$BIN" --data-dir "$DB" ingest "A 500 Internal Server Error happened in the billing service last month" \
+  --tags "billing-service" --node-type event --ago-days 20 2>/dev/null >/dev/null
+beat
+
+# ── TODAY ────────────────────────────────────────────────────────────────────
+say "${R}${B}┌─ TODAY ────────────────────────────────────────────────────┐${X}"
+say "${R}${B}│${X}  💥 your service just crashed.                             ${R}${B}│${X}"
+say "${R}${B}└────────────────────────────────────────────────────────────┘${X}"
+type_cmd "vestige ingest \"Service crashed: 500 Internal Server Error on the auth endpoint\" --tags auth-service,API_TIMEOUT,crash"
+"$BIN" --data-dir "$DB" ingest "Service crashed: 500 Internal Server Error on the auth endpoint" \
+  --tags "auth-service,API_TIMEOUT,crash" --node-type event 2>/dev/null >/dev/null
+say " ${R}recorded.${X} now: ${W}${B}why did it crash?${X}"
+echo; beat; beat
+
+# ── THE TURN ─────────────────────────────────────────────────────────────────
+type_cmd "vestige backfill --contrast"
+"$BIN" --data-dir "$DB" backfill --contrast 2>/dev/null
+echo; beat
+
+# ── THE PRESTIGE ─────────────────────────────────────────────────────────────
+say "${G}${B} ┌──────────────────────────────────────────────────────────┐${X}"
+say "${G}${B} │${X}  similarity search returned the lookalike.               ${G}${B}│${X}"
+say "${G}${B} │${X}  Postdict reached back 3 days to the real cause.         ${G}${B}│${X}"
+say "${G}${B} │${X}  ${W}${B}not similar to the bug. causally upstream.${X}              ${G}${B}│${X}"
+say "${G}${B} └──────────────────────────────────────────────────────────┘${X}"
+echo
+say "${DIM} run it yourself: ./demo/postdict-demo.sh (seed is right here)${X}"
+say "${DIM} honest limit: if the cause was never recorded, nothing can reach it.${X}"