fix(fts): match multi-word queries as implicit-AND, not adjacent phrase

sanitize_fts5_query wraps queries in quotes, producing FTS5 phrase search
where the words must be adjacent. So "quantum physics" against a doc
containing "quantum entanglement superposition physics" returned no FTS
hit; semantic search hid the issue whenever embeddings were enabled.

Add sanitize_fts5_terms that splits into space-separated terms (FTS5
implicit AND, any order, any position), and use it in:

- keyword_search_with_scores (hybrid-search FTS leg) so multi-word
  queries return docs containing all words regardless of adjacency
- a new SqliteMemoryStore::search_terms inherent method for callers
  that want individual-term matching without the full hybrid pipeline

sanitize_fts5_query stays in place; KeywordSearcher still uses it
(phrase semantics preserved where they were wanted).
This commit is contained in:
Jan De Landtsheer 2026-04-21 21:43:28 +02:00
parent 2391acf480
commit 5e411833f5
No known key found for this signature in database
GPG key ID: 95CD37F0C226040B
2 changed files with 86 additions and 2 deletions

View file

@ -7,6 +7,53 @@
/// Dangerous FTS5 operators that could be used for injection or DoS
const FTS5_OPERATORS: &[&str] = &["OR", "AND", "NOT", "NEAR"];
/// Sanitize input for FTS5 MATCH queries using individual term matching.
///
/// Unlike `sanitize_fts5_query`, which wraps the input in quotes for a phrase
/// search, this function produces bareword terms separated by single spaces.
/// FTS5 treats that as an implicit AND: a document matches only when it
/// contains ALL the terms, in any order and at any position.
///
/// Use this when you want "find all records containing these words" rather
/// than "find records with this exact phrase".
///
/// Sanitization steps:
///
/// 1. Only the first 1000 characters of `query` are considered, which bounds
///    the size of the expression handed to FTS5.
/// 2. Every ASCII character that is not a letter, digit or underscore becomes
///    a term separator. FTS5 only accepts those characters (plus non-ASCII
///    ones) in an unquoted term; anything else (`,`, `.`, `'`, `+`, `*`,
///    `:`, quotes, brackets, ...) is either query syntax or a syntax error.
/// 3. Terms equal to an entry of `FTS5_OPERATORS` (ASCII case-insensitive) are
///    dropped wherever they occur: alone, repeated, or separated by tabs or
///    newlines instead of spaces.
///
/// Returns `None` when nothing searchable is left (empty or whitespace-only
/// input, or input made up solely of punctuation and operator words).
pub fn sanitize_fts5_terms(query: &str) -> Option<String> {
    /// Upper bound on the number of input characters that are inspected.
    const MAX_QUERY_CHARS: usize = 1000;

    // Keep only characters that are legal inside an FTS5 bareword; everything
    // else is turned into a space so it acts as a term boundary.
    let cleaned: String = query
        .chars()
        .take(MAX_QUERY_CHARS)
        .map(|c| {
            if c.is_ascii_alphanumeric() || c == '_' || !c.is_ascii() {
                c
            } else {
                ' '
            }
        })
        .collect();

    // Filter operators per token rather than per substring so that no
    // arrangement of whitespace or repetition lets one slip through.
    let terms: Vec<&str> = cleaned
        .split_whitespace()
        .filter(|term| !FTS5_OPERATORS.iter().any(|op| term.eq_ignore_ascii_case(op)))
        .collect();

    if terms.is_empty() {
        return None;
    }

    // Join with space: FTS5 implicit AND — all terms must appear
    Some(terms.join(" "))
}
/// Sanitize input for FTS5 MATCH queries
///
/// Prevents:

View file

@ -1520,6 +1520,38 @@ impl Storage {
Ok(result)
}
/// Keyword search over the FTS5 index that requires every query word to match.
///
/// The raw `query` is reduced to sanitized terms by
/// `crate::fts::sanitize_fts5_terms` and bound as the `MATCH` expression, so
/// a document qualifies when it contains ALL the words, in any order and at
/// any position (FTS5 implicit AND), in contrast to the phrase matching
/// (words must be adjacent) that `search()` is documented to use.
/// This is more useful for free-text queries from external callers.
///
/// Results are ordered by `rank` and capped at `limit`. When sanitization
/// leaves no usable term, an empty list is returned before the reader lock is
/// taken.
pub fn search_terms(&self, query: &str, limit: i32) -> Result<Vec<KnowledgeNode>> {
    use crate::fts::sanitize_fts5_terms;

    // Nothing searchable: answer without touching the database.
    let match_expr = match sanitize_fts5_terms(query) {
        Some(expr) => expr,
        None => return Ok(Vec::new()),
    };

    let conn = self
        .reader
        .lock()
        .map_err(|_| StorageError::Init("Reader lock poisoned".into()))?;

    // ?1 = sanitized MATCH expression, ?2 = maximum number of rows.
    let sql = "SELECT n.* FROM knowledge_nodes n
JOIN knowledge_fts fts ON n.id = fts.id
WHERE knowledge_fts MATCH ?1
ORDER BY rank
LIMIT ?2";
    let mut statement = conn.prepare(sql)?;
    let rows = statement.query_map(params![match_expr, limit], Self::row_to_node)?;

    // Stop at the first row that fails to convert and propagate its error.
    let found = rows.collect::<std::result::Result<Vec<_>, _>>()?;
    Ok(found)
}
/// Get all nodes (paginated)
pub fn get_all_nodes(&self, limit: i32, offset: i32) -> Result<Vec<KnowledgeNode>> {
let reader = self
@ -1841,7 +1873,12 @@ impl Storage {
include_types: Option<&[String]>,
exclude_types: Option<&[String]>,
) -> Result<Vec<(String, f32)>> {
let sanitized_query = sanitize_fts5_query(query);
// Use individual-term matching (implicit AND) so multi-word queries find
// documents where all words appear anywhere, not just as adjacent phrases.
use crate::fts::sanitize_fts5_terms;
let Some(terms_query) = sanitize_fts5_terms(query) else {
return Ok(vec![]);
};
// Build the type filter clause and collect parameter values.
// We use numbered parameters: ?1 = query, ?2 = limit, ?3.. = type strings.
@ -1887,7 +1924,7 @@ impl Storage {
// Build the parameter list: [query, limit, ...type_values]
let mut param_values: Vec<Box<dyn rusqlite::ToSql>> = Vec::new();
param_values.push(Box::new(sanitized_query.clone()));
param_values.push(Box::new(terms_query));
param_values.push(Box::new(limit));
for tv in &type_values {
param_values.push(Box::new(tv.to_string()));