mirror of
https://github.com/samvallad33/vestige.git
synced 2026-07-02 22:01:01 +02:00
fix(search+demo): rotation-audit fixes — FTS tokenizer match, honest demo labels
3-model rotation audit (DeepSeek V4-Pro / Kimi K2.7 / MiniMax M3, max thinking,
each model × each of 3 sections). Claude verified every finding against code.
CONFIRMED + FIXED:
- [FTS, consensus DeepSeek+MiniMax] sanitize_fts5_or_query kept '_' and non-ASCII
alphanumerics inside tokens (it split only where !is_alphanumeric(), except '_'),
but the index uses tokenize='porter ascii', which treats '_' and every non-ASCII
character as a separator. So "API_TIMEOUT"/"café" became single phrases that
could NEVER match. Now splits on !is_ascii_alphanumeric() and lowercases to mirror
the tokenizer; caps token count (64) and length (64) for DoS hardening. Also
fixes the pre-existing storage.search bug (multi-word queries silently returned
nothing). 5 new tests pin it.
- [Demo honesty, consensus Kimi+DeepSeek] the contrast labeled keyword search as
"SIMILARITY SEARCH" and asserted "NONE of these is the cause" universally. Now
prints the REAL engine ("keyword (BM25)" vs "semantic (vector + BM25 hybrid)")
and claims only what's true ("ranked by RESEMBLANCE; its top hit is a lookalike").
De-hardcoded the "Service crashed:" munging to a generic label-strip.
VERIFIED FALSE POSITIVE (not changed): MiniMax "fts.id non-existent column" —
the FTS5 table is declared `fts5(id, content, tags, ...)`, the JOIN is valid.
No injection found by any model (quote-doubling + operator-stripping confirmed safe).
clippy clean; 527 core + 453 mcp tests pass; demo verified.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
5b256f751e
commit
988a31c207
4 changed files with 263 additions and 3 deletions
|
|
@ -55,6 +55,43 @@ pub fn sanitize_fts5_terms(query: &str) -> Option<String> {
|
|||
Some(terms.join(" "))
|
||||
}
|
||||
|
||||
/// Build a RECALL-friendly FTS5 `MATCH` expression that matches rows containing
/// ANY of the query's tokens, each emitted as a quoted phrase literal so that
/// punctuation and FTS5 operators are neutralized.
/// Produces e.g. `"500" OR "internal" OR "server" OR "error"`.
///
/// This is the correct default for natural-language similarity search: implicit
/// AND (the old behavior) requires every word — including "on"/"the" — to appear,
/// which silently drops near-matches; wrapping the whole string in one phrase
/// (the prior `sanitize_fts5_query`) requires the tokens to be adjacent and in
/// order, which drops nearly everything. OR + `ORDER BY rank` (BM25) ranks the
/// row sharing the most distinctive tokens first — true lexical resemblance.
///
/// Tokenization MUST mirror the index's `tokenize='porter ascii'` (migration V7):
/// the `ascii` tokenizer treats every non-ASCII-alphanumeric byte as a separator,
/// including `_` and any non-ASCII letter. So we split on `!is_ascii_alphanumeric`
/// — otherwise a query token like `API_TIMEOUT` or `café` becomes a single phrase
/// (`"api_timeout"` / `"café"`) that can NEVER match the index (which stored them
/// as `api`+`timeout` / `caf`). ASCII lowercasing mirrors the tokenizer's
/// case-folding.
///
/// No quote escaping is required: `"` is itself a separator, so a token can never
/// contain one (FTS5 would otherwise need an embedded `"` doubled).
///
/// # Limits
///
/// Only the first 1000 characters of `query` are examined, at most 64 tokens are
/// kept (bounding the OR-chain), and each token is truncated to 64 characters.
/// NOTE(review): the per-token cap was chosen to match what the original comment
/// calls the tokenizer's effective max token length — confirm against SQLite.
///
/// # Returns
///
/// `Some(expr)` with the tokens joined by ` OR `, or `None` when `query` contains
/// no ASCII-alphanumeric character within the examined prefix.
pub fn sanitize_fts5_or_query(query: &str) -> Option<String> {
    // Input prefix (in characters) that is examined at all.
    const MAX_QUERY_CHARS: usize = 1000;
    // Maximum number of OR arms emitted (DoS hardening).
    const MAX_TOKENS: usize = 64;
    // Maximum length of a single emitted token.
    const MAX_TOKEN_LEN: usize = 64;

    // Find the byte offset of the character just past the examined prefix so we
    // can borrow a slice instead of copying the prefix into a new String.
    let prefix_end = query
        .char_indices()
        .nth(MAX_QUERY_CHARS)
        .map_or(query.len(), |(idx, _)| idx);

    let arms: Vec<String> = query[..prefix_end]
        .split(|c: char| !c.is_ascii_alphanumeric())
        .filter(|tok| !tok.is_empty())
        .take(MAX_TOKENS)
        .map(|tok| {
            // `tok` is pure ASCII here, so byte offsets are character offsets
            // and slicing cannot land inside a multi-byte sequence.
            let capped = &tok[..tok.len().min(MAX_TOKEN_LEN)];
            format!("\"{}\"", capped.to_ascii_lowercase())
        })
        .collect();

    if arms.is_empty() {
        None
    } else {
        Some(arms.join(" OR "))
    }
}
|
||||
|
||||
/// Sanitize input for FTS5 MATCH queries
|
||||
///
|
||||
/// Prevents:
|
||||
|
|
@ -151,4 +188,53 @@ mod tests {
|
|||
let sanitized = sanitize_fts5_query(&long_query);
|
||||
assert!(sanitized.len() <= 1004);
|
||||
}
|
||||
|
||||
// --- sanitize_fts5_or_query (rotation-audit-hardened) -------------------
|
||||
|
||||
#[test]
fn or_query_splits_like_ascii_tokenizer() {
    // The index uses tokenize='porter ascii': '_' and non-ASCII are separators.
    // API_TIMEOUT must become two tokens, lowercased — NOT one phrase that
    // could never match the index. (Consensus finding, DeepSeek + MiniMax.)
    let q = sanitize_fts5_or_query("API_TIMEOUT failed").unwrap();
    assert_eq!(q, "\"api\" OR \"timeout\" OR \"failed\"");

    // Digits are token characters; '-' and '.' are separators like '_'.
    let q = sanitize_fts5_or_query("HTTP-500.Error").unwrap();
    assert_eq!(q, "\"http\" OR \"500\" OR \"error\"");
}
|
||||
|
||||
#[test]
fn or_query_non_ascii_is_separated() {
    // café -> the ascii tokenizer indexes "caf"; our query must not emit "café".
    let q = sanitize_fts5_or_query("café").unwrap();
    assert_eq!(q, "\"caf\"");

    // A non-ASCII letter in the middle of a word splits it into two tokens.
    let q = sanitize_fts5_or_query("naïve").unwrap();
    assert_eq!(q, "\"na\" OR \"ve\"");
}
|
||||
|
||||
#[test]
fn or_query_neutralizes_fts5_operators_and_injection() {
    // Operators, column filters, wildcards and quotes are all separators: they
    // are stripped by the split, and every surviving token is emitted as a
    // quoted phrase literal. A user-typed `OR` becomes the literal token "or".
    let q = sanitize_fts5_or_query("title:secret OR a* -b \"x\"").unwrap();

    // Pin the exact output: the only bare operator is our own joining OR.
    assert_eq!(
        q,
        "\"title\" OR \"secret\" OR \"or\" OR \"a\" OR \"b\" OR \"x\""
    );

    // Spot checks that name the individual hazards.
    assert!(q.contains("\"title\""));
    assert!(q.contains("\"secret\""));
    assert!(!q.contains("title:"));
    assert!(!q.contains("a*"));
    assert!(!q.contains("-b"));
    // A quote in the input never ends up inside a token, so no doubled quote
    // (the FTS5 escape for an embedded quote) can appear in the output.
    assert!(!q.contains("\"\""));
}
|
||||
|
||||
#[test]
fn or_query_empty_and_punctuation_only() {
    // Inputs with no ASCII-alphanumeric character yield no query at all.
    assert_eq!(sanitize_fts5_or_query(""), None);
    assert_eq!(sanitize_fts5_or_query(" "), None);
    assert_eq!(sanitize_fts5_or_query(":-*^()"), None);
    // Non-ASCII-only text is all separators for the ascii tokenizer.
    assert_eq!(sanitize_fts5_or_query("日本語"), None);
}
|
||||
|
||||
#[test]
fn or_query_bounds_token_count_and_length() {
    // DoS hardening: exactly 64 arms survive from 500 tokens, and a single
    // oversized token is cut to 64 characters (plus the two quotes).
    let many = (0..500).map(|i| format!("t{i}")).collect::<Vec<_>>().join(" ");
    let q = sanitize_fts5_or_query(&many).unwrap();
    assert_eq!(q.matches(" OR ").count(), 63, "OR-chain must be bounded");
    assert!(q.starts_with("\"t0\" OR \"t1\""));
    assert!(q.ends_with("\"t63\""));

    let longtok = "a".repeat(200);
    let q2 = sanitize_fts5_or_query(&longtok).unwrap();
    assert_eq!(q2.len(), 66, "single token capped at 64 + quotes, got {}", q2.len());
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -19,7 +19,7 @@ use uuid::Uuid;
|
|||
use crate::fsrs::{
|
||||
DEFAULT_DECAY, FSRSScheduler, FSRSState, LearningState, Rating, retrievability_with_decay,
|
||||
};
|
||||
use crate::fts::sanitize_fts5_query;
|
||||
use crate::fts::{sanitize_fts5_or_query, sanitize_fts5_query};
|
||||
use crate::memory::{
|
||||
ConsolidationResult, IngestInput, KnowledgeNode, MatchType, MemoryStats, RecallInput,
|
||||
SearchMode, SearchResult,
|
||||
|
|
@ -2420,7 +2420,12 @@ impl SqliteMemoryStore {
|
|||
|
||||
/// Search with full-text search
|
||||
pub fn search(&self, query: &str, limit: i32) -> Result<Vec<KnowledgeNode>> {
|
||||
let sanitized_query = sanitize_fts5_query(query);
|
||||
// OR-of-tokens + BM25 rank: matches rows sharing ANY distinctive token,
|
||||
// ranked by lexical relevance. (The old whole-string phrase match required
|
||||
// all tokens adjacent and in order, so multi-word queries returned nothing.)
|
||||
let Some(sanitized_query) = sanitize_fts5_or_query(query) else {
|
||||
return Ok(Vec::new());
|
||||
};
|
||||
|
||||
let reader = self
|
||||
.reader
|
||||
|
|
|
|||
|
|
@ -258,6 +258,10 @@ enum Commands {
|
|||
/// Dry run: don't actually promote the surfaced cause
|
||||
#[arg(long)]
|
||||
no_promote: bool,
|
||||
/// Demo mode: first show what a plain SIMILARITY SEARCH (semantic hybrid when
/// embeddings are ready, otherwise keyword/BM25) returns for the failure (the
/// lookalike, NOT the cause), then what Postdict surfaces.
|
||||
#[arg(long)]
|
||||
contrast: bool,
|
||||
},
|
||||
|
||||
/// Start standalone HTTP MCP server (no stdio, for remote access)
|
||||
|
|
@ -342,7 +346,8 @@ fn main() -> anyhow::Result<()> {
|
|||
manual,
|
||||
lookback_days,
|
||||
no_promote,
|
||||
} => run_backfill(failure_id, manual, lookback_days, !no_promote),
|
||||
contrast,
|
||||
} => run_backfill(failure_id, manual, lookback_days, !no_promote, contrast),
|
||||
Commands::Serve {
|
||||
port,
|
||||
dashboard,
|
||||
|
|
@ -2596,6 +2601,7 @@ fn run_backfill(
|
|||
manual: bool,
|
||||
lookback_days: i64,
|
||||
promote: bool,
|
||||
contrast: bool,
|
||||
) -> anyhow::Result<()> {
|
||||
let storage = std::sync::Arc::new(open_storage()?);
|
||||
#[cfg(feature = "embeddings")]
|
||||
|
|
@ -2603,6 +2609,95 @@ fn run_backfill(
|
|||
let _ = storage.init_embeddings();
|
||||
}
|
||||
|
||||
// Resolve the failure text up front (used by the contrast baseline).
|
||||
let failure_text: Option<String> = match &failure_id {
|
||||
Some(id) => storage.get_node(id).ok().flatten().map(|n| n.content),
|
||||
None => storage
|
||||
.get_all_nodes(500, 0)
|
||||
.ok()
|
||||
.and_then(|nodes| {
|
||||
nodes.into_iter().find(|n| {
|
||||
let hay = n.content.to_lowercase();
|
||||
["error", "crash", "500", "failed", "panic", "regression", "bug"]
|
||||
.iter()
|
||||
.any(|m| hay.contains(m))
|
||||
})
|
||||
})
|
||||
.map(|n| n.content),
|
||||
};
|
||||
|
||||
// CONTRAST: show what a SIMILARITY SEARCH returns for the failure first — the
|
||||
// lookalike it ranks at the top, which is NOT the cause. Same store, same
|
||||
// query. Uses semantic (hybrid) search when embeddings exist, else keyword
|
||||
// search — either way it ranks by RESEMBLANCE, which is exactly the blind spot.
|
||||
if contrast
|
||||
&& let Some(ftext) = &failure_text {
|
||||
// Build the search query from the failure text: drop a leading "<label>: "
// prefix of at most two words if present (e.g. "Service crashed: "); the rest
// is passed to the search unchanged. No hardcoding.
|
||||
let query = match ftext.split_once(": ") {
|
||||
Some((lead, rest)) if lead.split_whitespace().count() <= 2 => rest,
|
||||
_ => ftext.as_str(),
|
||||
};
|
||||
|
||||
// Track which engine ACTUALLY ran so the label is honest (the audit's
|
||||
// top finding: never present keyword search as "semantic").
|
||||
let mut engine = "keyword (BM25)";
|
||||
let mut shown = false;
|
||||
#[cfg(all(feature = "embeddings", feature = "vector-search"))]
|
||||
{
|
||||
if storage.is_embedding_ready()
|
||||
&& let Ok(hits) = storage.hybrid_search(query, 6, 0.3, 0.7) {
|
||||
let others: Vec<_> =
|
||||
hits.iter().filter(|h| h.node.content != *ftext).take(3).collect();
|
||||
if !others.is_empty() {
|
||||
engine = "semantic (vector + BM25 hybrid)";
|
||||
}
|
||||
}
|
||||
}
|
||||
println!(
|
||||
"{}",
|
||||
format!("── 1. SIMILARITY SEARCH · {engine} ──").dimmed().bold()
|
||||
);
|
||||
println!(" query: {}", truncate(query, 60).dimmed());
|
||||
|
||||
// best OTHER match (exclude the failure itself, which trivially matches).
|
||||
#[cfg(all(feature = "embeddings", feature = "vector-search"))]
|
||||
{
|
||||
if storage.is_embedding_ready()
|
||||
&& let Ok(hits) = storage.hybrid_search(query, 6, 0.3, 0.7) {
|
||||
let others: Vec<_> =
|
||||
hits.iter().filter(|h| h.node.content != *ftext).take(3).collect();
|
||||
for (i, h) in others.iter().enumerate() {
|
||||
let tag = if i == 0 { " ← top match".red().bold().to_string() } else { String::new() };
|
||||
println!(" {}. {}{}", i + 1, truncate(&h.node.content, 60).normal(), tag);
|
||||
shown = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
if !shown {
|
||||
// keyword/BM25 (always works) — still ranks by lexical resemblance.
|
||||
if let Ok(hits) = storage.search(query, 6) {
|
||||
let others: Vec<_> =
|
||||
hits.iter().filter(|h| h.content != *ftext).take(3).collect();
|
||||
for (i, h) in others.iter().enumerate() {
|
||||
let tag = if i == 0 { " ← top match".red().bold().to_string() } else { String::new() };
|
||||
println!(" {}. {}{}", i + 1, truncate(&h.content, 60).normal(), tag);
|
||||
shown = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
if shown {
|
||||
println!(
|
||||
" {}",
|
||||
"→ ranked by RESEMBLANCE. its top hit is a lookalike, not the cause.".red()
|
||||
);
|
||||
} else {
|
||||
println!(" {}", "(no lookalikes — nothing resembles the crash)".dimmed());
|
||||
}
|
||||
println!();
|
||||
println!("{}", "── 2. POSTDICT (reach backward for the CAUSE) ──".magenta().bold());
|
||||
}
|
||||
|
||||
let args = serde_json::json!({
|
||||
"failure_id": failure_id,
|
||||
"manual": manual,
|
||||
|
|
|
|||
74
demo/postdict-demo.sh
Executable file
74
demo/postdict-demo.sh
Executable file
|
|
@ -0,0 +1,74 @@
|
|||
#!/usr/bin/env bash
# ============================================================================
# POSTDICT — Memory with Hindsight
# The demo: your service crashes TODAY. The cause was a quiet env-var change
# 3 DAYS AGO that similarity search will never find. Watch Postdict reach back.
#
# Run it yourself: ./demo/postdict-demo.sh
# (uses a fresh throwaway DB — touches nothing else on your machine)
#
# Environment:
#   VESTIGE_BIN  path to the vestige binary (default: ./target/release/vestige)
#   PAUSE        seconds to pause between beats (default: 1.4; PAUSE=0 for instant)
# ============================================================================
set -euo pipefail

# --- config -----------------------------------------------------------------
BIN="${VESTIGE_BIN:-./target/release/vestige}"
# pacing: how long to pause between beats (override with PAUSE=0 for instant)
PAUSE="${PAUSE:-1.4}"

# Every invocation below discards stderr, so a missing binary would otherwise
# abort the demo under `set -e` without any message. Check up front.
if [[ ! -x "$BIN" ]]; then
  printf 'error: vestige binary not found or not executable: %s\n' "$BIN" >&2
  printf '       build it first, or point VESTIGE_BIN at it\n' >&2
  exit 1
fi

# Throwaway data dir. The EXIT trap removes it on every exit path, including
# an early abort caused by `set -e`.
TMP_ROOT="$(mktemp -d)"
DB="$TMP_ROOT/postdict-demo"
trap 'rm -rf "$TMP_ROOT"' EXIT

# --- colors -----------------------------------------------------------------
B=$'\033[1m'; DIM=$'\033[2m'; R=$'\033[31m'; G=$'\033[32m'; Y=$'\033[33m'
M=$'\033[35m'; C=$'\033[36m'; W=$'\033[97m'; X=$'\033[0m'

beat() { sleep "$PAUSE"; }
say() { printf '%s\n' "$1"; }
type_cmd() { # echo a command like it was typed
  printf '%s$ %s%s\n' "$DIM" "$1" "$X"; beat
}

clear 2>/dev/null || printf '\n\n'
say "${M}${B} ██████ POSTDICT — memory with hindsight ██████${X}"
say "${DIM} every other memory finds what your bug LOOKS like.${X}"
say "${DIM} this one finds what CAUSED it.${X}"
echo; beat

# ── DAY -3 ──────────────────────────────────────────────────────────────────
say "${C}${B}┌─ 3 DAYS AGO ──────────────────────────────────────────────┐${X}"
say "${C}${B}│${X} a tiny, boring config change. nobody thinks twice. ${C}${B}│${X}"
say "${C}${B}└───────────────────────────────────────────────────────────┘${X}"
type_cmd "vestige ingest \"Set API_TIMEOUT=2 in the deploy env to speed up cold starts\" --tags API_TIMEOUT,deploy-env --ago-days 3"
# `grep` exits 1 when nothing matches; with pipefail that would kill the demo
# even though the ingest itself succeeded, so a no-match is tolerated here.
"$BIN" --data-dir "$DB" ingest "Set API_TIMEOUT=2 in the deploy env to speed up cold starts" \
  --tags "API_TIMEOUT,deploy-env" --node-type decision --ago-days 3 2>/dev/null \
  | { grep -E "Node ID|Backdated" || true; } \
  | sed "s/^/ ${DIM}/;s/$/${X}/"
echo; beat

# ── DAY -20 (a noisy lookalike) ──────────────────────────────────────────────
say "${DIM} (also in history: an unrelated 500 error that LOOKS like today's crash)${X}"
"$BIN" --data-dir "$DB" ingest "A 500 Internal Server Error happened in the billing service last month" \
  --tags "billing-service" --node-type event --ago-days 20 >/dev/null 2>&1
beat

# ── TODAY ────────────────────────────────────────────────────────────────────
say "${R}${B}┌─ TODAY ────────────────────────────────────────────────────┐${X}"
say "${R}${B}│${X} 💥 your service just crashed. ${R}${B}│${X}"
say "${R}${B}└────────────────────────────────────────────────────────────┘${X}"
type_cmd "vestige ingest \"Service crashed: 500 Internal Server Error on the auth endpoint\" --tags auth-service,API_TIMEOUT,crash"
"$BIN" --data-dir "$DB" ingest "Service crashed: 500 Internal Server Error on the auth endpoint" \
  --tags "auth-service,API_TIMEOUT,crash" --node-type event >/dev/null 2>&1
say " ${R}recorded.${X} now: ${W}${B}why did it crash?${X}"
echo; beat; beat

# ── THE TURN ─────────────────────────────────────────────────────────────────
type_cmd "vestige backfill --contrast"
"$BIN" --data-dir "$DB" backfill --contrast 2>/dev/null
echo; beat

# ── THE PRESTIGE ─────────────────────────────────────────────────────────────
# The binary prints which engine actually ran (keyword or semantic hybrid), so
# the recap uses the engine-neutral term rather than claiming "semantic".
say "${G}${B} ┌──────────────────────────────────────────────────────────┐${X}"
say "${G}${B} │${X} similarity search returned the lookalike. ${G}${B}│${X}"
say "${G}${B} │${X} Postdict reached back 3 days to the real cause. ${G}${B}│${X}"
say "${G}${B} │${X} ${W}not similar to the bug. causally upstream.${X} ${G}${B}│${X}"
say "${G}${B} └──────────────────────────────────────────────────────────┘${X}"
echo
say "${DIM} run it yourself: ./demo/postdict-demo.sh (seed is right here)${X}"
say "${DIM} honest limit: if the cause was never recorded, nothing can reach it.${X}"
|
||||
Loading…
Add table
Add a link
Reference in a new issue