From 5e411833f51bfcab86b9e3c5f24f990970786b3c Mon Sep 17 00:00:00 2001 From: Jan De Landtsheer Date: Tue, 21 Apr 2026 21:43:28 +0200 Subject: [PATCH] fix(fts): match multi-word queries as implicit-AND, not adjacent phrase sanitize_fts5_query wraps queries in quotes, producing FTS5 phrase search where the words must be adjacent. So "quantum physics" against a doc containing "quantum entanglement superposition physics" returned no FTS hit; semantic search hid the issue whenever embeddings were enabled. Add sanitize_fts5_terms that splits into space-separated terms (FTS5 implicit AND, any order, any position), and use it in: - keyword_search_with_scores (hybrid-search FTS leg) so multi-word queries return docs containing all words regardless of adjacency - a new SqliteMemoryStore::search_terms inherent method for callers that want individual-term matching without the full hybrid pipeline sanitize_fts5_query stays in place; KeywordSearcher still uses it (phrase semantics preserved where they were wanted). --- crates/vestige-core/src/fts.rs | 47 +++++++++++++++++++++++ crates/vestige-core/src/storage/sqlite.rs | 41 +++++++++++++++++++- 2 files changed, 86 insertions(+), 2 deletions(-) diff --git a/crates/vestige-core/src/fts.rs b/crates/vestige-core/src/fts.rs index e4cadfb..eae8ed8 100644 --- a/crates/vestige-core/src/fts.rs +++ b/crates/vestige-core/src/fts.rs @@ -7,6 +7,53 @@ /// Dangerous FTS5 operators that could be used for injection or DoS const FTS5_OPERATORS: &[&str] = &["OR", "AND", "NOT", "NEAR"]; +/// Sanitize input for FTS5 MATCH queries using individual term matching. +/// +/// Unlike `sanitize_fts5_query` which wraps in quotes for a phrase search, +/// this function produces individual terms joined with implicit AND. +/// This matches documents that contain ALL the query words in any order. +/// +/// Use this when you want "find all records containing these words" rather +/// than "find records with this exact phrase". 
+pub fn sanitize_fts5_terms(query: &str) -> Option<String> {
+    // Bound the work done on caller-supplied input (the limit is counted in chars).
+    let limited: String = query.chars().take(1000).collect();
+
+    // Keep only the characters FTS5 accepts in an unquoted bareword: ASCII
+    // letters and digits, `_`, and every non-ASCII character. Anything else
+    // becomes a term separator. That covers FTS5 syntax (`*`, `:`, `^`, `-`,
+    // `"`, brackets) as well as punctuation such as `'`, `,`, `.` and `+`,
+    // which would otherwise make the MATCH fail with a syntax error.
+    let cleaned: String = limited
+        .chars()
+        .map(|c| {
+            if c.is_ascii_alphanumeric() || c == '_' || !c.is_ascii() {
+                c
+            } else {
+                ' '
+            }
+        })
+        .collect();
+
+    // Drop every token that is an FTS5 operator, compared case-insensitively.
+    // Filtering whole tokens (rather than replacing " OP " substrings) also
+    // removes repeated operators ("a OR OR b"), operators separated by tabs
+    // or newlines, and a query that is nothing but an operator, while words
+    // that merely contain one ("ORacle", "android") are left untouched.
+    let terms: Vec<&str> = cleaned
+        .split_whitespace()
+        .filter(|term| !FTS5_OPERATORS.iter().any(|op| term.eq_ignore_ascii_case(op)))
+        .collect();
+
+    // Nothing searchable is left for empty, punctuation-only or operator-only
+    // input; `None` lets the caller skip the MATCH query altogether.
+    if terms.is_empty() {
+        return None;
+    }
+    // Join with space: FTS5 implicit AND — all terms must appear
+    Some(terms.join(" "))
+}
+
 /// Sanitize input for FTS5 MATCH queries
 ///
 /// Prevents:
diff --git a/crates/vestige-core/src/storage/sqlite.rs b/crates/vestige-core/src/storage/sqlite.rs
index 81197cb..398db9f 100644
--- a/crates/vestige-core/src/storage/sqlite.rs
+++ b/crates/vestige-core/src/storage/sqlite.rs
@@ -1520,6 +1520,38 @@ impl Storage {
         Ok(result)
     }
 
+    /// FTS5 keyword search using individual-term matching (implicit AND).
+    ///
+    /// Unlike `search()` which uses phrase matching (words must be adjacent),
+    /// this returns documents containing ALL query words in any order and position.
+    /// This is more useful for free-text queries from external callers.
+ pub fn search_terms(&self, query: &str, limit: i32) -> Result> { + use crate::fts::sanitize_fts5_terms; + let Some(terms) = sanitize_fts5_terms(query) else { + return Ok(vec![]); + }; + + let reader = self + .reader + .lock() + .map_err(|_| StorageError::Init("Reader lock poisoned".into()))?; + let mut stmt = reader.prepare( + "SELECT n.* FROM knowledge_nodes n + JOIN knowledge_fts fts ON n.id = fts.id + WHERE knowledge_fts MATCH ?1 + ORDER BY rank + LIMIT ?2", + )?; + + let nodes = stmt.query_map(params![terms, limit], Self::row_to_node)?; + + let mut result = Vec::new(); + for node in nodes { + result.push(node?); + } + Ok(result) + } + /// Get all nodes (paginated) pub fn get_all_nodes(&self, limit: i32, offset: i32) -> Result> { let reader = self @@ -1841,7 +1873,12 @@ impl Storage { include_types: Option<&[String]>, exclude_types: Option<&[String]>, ) -> Result> { - let sanitized_query = sanitize_fts5_query(query); + // Use individual-term matching (implicit AND) so multi-word queries find + // documents where all words appear anywhere, not just as adjacent phrases. + use crate::fts::sanitize_fts5_terms; + let Some(terms_query) = sanitize_fts5_terms(query) else { + return Ok(vec![]); + }; // Build the type filter clause and collect parameter values. // We use numbered parameters: ?1 = query, ?2 = limit, ?3.. = type strings. @@ -1887,7 +1924,7 @@ impl Storage { // Build the parameter list: [query, limit, ...type_values] let mut param_values: Vec> = Vec::new(); - param_values.push(Box::new(sanitized_query.clone())); + param_values.push(Box::new(terms_query)); param_values.push(Box::new(limit)); for tv in &type_values { param_values.push(Box::new(tv.to_string()));