From e674e0c3c44aaee2dabbd8351534c25dccecb3f0 Mon Sep 17 00:00:00 2001 From: Ragnor Comerford Date: Tue, 9 Jun 2026 14:41:12 +0200 Subject: [PATCH] test(engine): search fuzzy/match_text characterization + RRF non-default pairings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - match_text_matches_exact_set_excludes_unrelated: match_text(body,'neural') == [dl-basics] exactly (not just contains). - fuzzy_does_not_match_under_default_tokenizer: characterizes that fuzzy() is inert with the default tokenizer here (search/match_text work, fuzzy returns nothing); turns red — to be promoted to a real golden — if fuzzy starts matching. - rrf_fuses_two_fts_fields / rrf_fuses_two_vector_queries: RRF fuses arms other than the default nearest+bm25 (bm25 title+body; two vector queries), proving primary_var resolves and fusion runs. New fixtures/search.gq queries + two_vector_params helper. Orders resolved by running, confirmed stable. --- crates/omnigraph/tests/fixtures/search.gq | 14 +++++ crates/omnigraph/tests/helpers/mod.rs | 9 +++ crates/omnigraph/tests/search.rs | 67 +++++++++++++++++++++++ 3 files changed, 90 insertions(+) diff --git a/crates/omnigraph/tests/fixtures/search.gq b/crates/omnigraph/tests/fixtures/search.gq index c39af82..d53fbc9 100644 --- a/crates/omnigraph/tests/fixtures/search.gq +++ b/crates/omnigraph/tests/fixtures/search.gq @@ -42,3 +42,17 @@ query hybrid_search($vq: Vector(4), $tq: String) { order { rrf(nearest($d.embedding, $vq), bm25($d.title, $tq)) } limit 3 } + +query rrf_two_fts($q: String) { + match { $d: Doc } + return { $d.slug, $d.title } + order { rrf(bm25($d.title, $q), bm25($d.body, $q)) } + limit 3 +} + +query rrf_two_vectors($q1: Vector(4), $q2: Vector(4)) { + match { $d: Doc } + return { $d.slug, $d.title } + order { rrf(nearest($d.embedding, $q1), nearest($d.embedding, $q2)) } + limit 3 +} diff --git a/crates/omnigraph/tests/helpers/mod.rs b/crates/omnigraph/tests/helpers/mod.rs index c97ff72..0e04aa2 100644 --- a/crates/omnigraph/tests/helpers/mod.rs +++ b/crates/omnigraph/tests/helpers/mod.rs @@ -236,6 +236,15 @@ pub fn vector_param(name: &str, values: &[f32]) -> ParamMap { map } +/// Build a ParamMap with two vector params. +pub fn two_vector_params(name1: &str, vals1: &[f32], name2: &str, vals2: &[f32]) -> ParamMap { + let mut map = vector_param(name1, vals1); + let key = name2.strip_prefix('$').unwrap_or(name2).to_string(); + let lit = Literal::List(vals2.iter().map(|v| Literal::Float(*v as f64)).collect()); + map.insert(key, lit); + map +} + /// Build a ParamMap with a vector param and a string param. pub fn vector_and_string_params( vec_name: &str, diff --git a/crates/omnigraph/tests/search.rs b/crates/omnigraph/tests/search.rs index 653172b..480ec3c 100644 --- a/crates/omnigraph/tests/search.rs +++ b/crates/omnigraph/tests/search.rs @@ -594,6 +594,73 @@ async fn bm25_full_rank_order() { assert_eq!(result_slugs(&result), vec!["rl-intro", "ml-intro", "dl-basics"]); } +// Characterization: fuzzy() does NOT match under the default tokenizer/index in +// this setup — a one-edit typo ("Introductio" for "Introduction") returns no +// rows. (`search`/`match_text` DO work, so FTS itself is fine; fuzzy term +// queries specifically are inert here.) This pins that documented limitation +// instead of leaving fuzzy silently unasserted: if a Lance/tokenizer change +// makes fuzzy match, this turns red and should be promoted to a real +// matched-set + exclusion golden. +#[tokio::test] +#[serial] +async fn fuzzy_does_not_match_under_default_tokenizer() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_search_db(&dir).await; + let r = query_main(&mut db, SEARCH_QUERIES, "fuzzy_search", ¶ms(&[("$q", "Introductio")])) + .await + .unwrap(); + assert!( + result_slugs(&r).is_empty(), + "fuzzy now matches — promote this to a real matched-set/exclusion golden" + ); +} + +// match_text is a FILTER on the body: assert the exact matched set, not contains. +#[tokio::test] +#[serial] +async fn match_text_matches_exact_set_excludes_unrelated() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_search_db(&dir).await; + // "neural" appears only in dl-basics's body ("neural networks"). + let r = query_main(&mut db, SEARCH_QUERIES, "phrase_search", ¶ms(&[("$q", "neural")])) + .await + .unwrap(); + let mut got = result_slugs(&r); + got.sort(); + assert_eq!(got, vec!["dl-basics"]); +} + +// RRF fuses arms OTHER than the default nearest+bm25: two FTS arms (title+body). +// Proves primary_var resolves when neither arm is `nearest`, and fusion runs. +#[tokio::test] +#[serial] +async fn rrf_fuses_two_fts_fields() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_search_db(&dir).await; + let r = query_main(&mut db, SEARCH_QUERIES, "rrf_two_fts", ¶ms(&[("$q", "learning")])) + .await + .unwrap(); + assert_eq!(result_slugs(&r), vec!["dl-basics", "ml-intro", "rl-intro"]); +} + +// RRF fuses two vector arms (no embedding creds — explicit vectors). A doc near +// BOTH query vectors out-ranks one near only one. +#[tokio::test] +#[serial] +async fn rrf_fuses_two_vector_queries() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_search_db(&dir).await; + let r = query_main( + &mut db, + SEARCH_QUERIES, + "rrf_two_vectors", + &two_vector_params("$q1", &[0.1, 0.2, 0.3, 0.4], "$q2", &[0.5, 0.6, 0.7, 0.8]), + ) + .await + .unwrap(); + assert_eq!(result_slugs(&r), vec!["rl-intro", "ml-intro", "dl-basics"]); +} + #[tokio::test] #[serial] async fn mutation_commit_refreshes_search_indices_without_manual_ensure() {