test(engine): search fuzzy/match_text characterization + RRF non-default pairings

- match_text_matches_exact_set_excludes_unrelated: match_text(body,'neural') ==
  [dl-basics] exactly (not just contains).
- fuzzy_does_not_match_under_default_tokenizer: characterizes that fuzzy() is
  inert with the default tokenizer here (search/match_text work, fuzzy returns
  nothing); turns red — to be promoted to a real golden — if fuzzy starts matching.
- rrf_fuses_two_fts_fields / rrf_fuses_two_vector_queries: RRF fuses arms other
  than the default nearest+bm25 (bm25 title+body; two vector queries), proving
  that primary_var resolves and fusion runs. Adds new fixtures/search.gq queries
  and the two_vector_params helper. Expected orders were determined by running
  the queries and confirmed stable.
This commit is contained in:
Ragnor Comerford 2026-06-09 14:41:12 +02:00
parent e2784cad58
commit e674e0c3c4
No known key found for this signature in database
3 changed files with 90 additions and 0 deletions

View file

@ -42,3 +42,17 @@ query hybrid_search($vq: Vector(4), $tq: String) {
order { rrf(nearest($d.embedding, $vq), bm25($d.title, $tq)) }
limit 3
}
// Orders Doc nodes by rrf() over two bm25 arms that share the same $q: one on
// the title field and one on the body field. Returns slug and title, capped at
// 3 rows. Neither arm is `nearest`.
query rrf_two_fts($q: String) {
match { $d: Doc }
return { $d.slug, $d.title }
order { rrf(bm25($d.title, $q), bm25($d.body, $q)) }
limit 3
}
// Orders Doc nodes by rrf() over two nearest() arms on the same embedding
// field, one per 4-dimensional query vector ($q1, $q2). Returns slug and title,
// capped at 3 rows.
query rrf_two_vectors($q1: Vector(4), $q2: Vector(4)) {
match { $d: Doc }
return { $d.slug, $d.title }
order { rrf(nearest($d.embedding, $q1), nearest($d.embedding, $q2)) }
limit 3
}

View file

@ -236,6 +236,15 @@ pub fn vector_param(name: &str, values: &[f32]) -> ParamMap {
map
}
/// Build a `ParamMap` carrying two vector params.
///
/// The first pair (`name1`, `vals1`) is delegated to `vector_param`. The second
/// is inserted under `name2` with one leading `$` stripped if present, as a
/// `Literal::List` of `Literal::Float` values widened from `f32` to `f64`.
pub fn two_vector_params(name1: &str, vals1: &[f32], name2: &str, vals2: &[f32]) -> ParamMap {
    let second_key = match name2.strip_prefix('$') {
        Some(bare) => bare.to_string(),
        None => name2.to_string(),
    };
    let second_list = Literal::List(
        vals2
            .iter()
            .copied()
            .map(|component| Literal::Float(f64::from(component)))
            .collect(),
    );

    let mut both = vector_param(name1, vals1);
    both.insert(second_key, second_list);
    both
}
/// Build a ParamMap with a vector param and a string param.
pub fn vector_and_string_params(
vec_name: &str,

View file

@ -594,6 +594,73 @@ async fn bm25_full_rank_order() {
assert_eq!(result_slugs(&result), vec!["rl-intro", "ml-intro", "dl-basics"]);
}
// Characterization test: with the default tokenizer/index used in this setup,
// fuzzy() matches nothing — even a one-edit typo ("Introductio" for
// "Introduction") yields zero rows. `search`/`match_text` DO return matches, so
// FTS as a whole is fine; only fuzzy term queries are inert here. Pinning this
// documented limitation keeps fuzzy from going silently unasserted: if a
// Lance/tokenizer change makes fuzzy start matching, this test turns red and
// should be promoted to a real matched-set + exclusion golden.
#[tokio::test]
#[serial]
async fn fuzzy_does_not_match_under_default_tokenizer() {
    let dir = tempfile::tempdir().unwrap();
    let mut db = init_search_db(&dir).await;

    // One character short of "Introduction".
    let typo_params = params(&[("$q", "Introductio")]);
    let outcome = query_main(&mut db, SEARCH_QUERIES, "fuzzy_search", &typo_params)
        .await
        .unwrap();

    let matched = result_slugs(&outcome);
    assert!(
        matched.is_empty(),
        "fuzzy now matches — promote this to a real matched-set/exclusion golden"
    );
}
// match_text acts as a FILTER on the body, so the assertion pins the exact
// matched set rather than a mere "contains" check.
#[tokio::test]
#[serial]
async fn match_text_matches_exact_set_excludes_unrelated() {
    let dir = tempfile::tempdir().unwrap();
    let mut db = init_search_db(&dir).await;

    // Only dl-basics has "neural" in its body ("neural networks").
    let term_params = params(&[("$q", "neural")]);
    let outcome = query_main(&mut db, SEARCH_QUERIES, "phrase_search", &term_params)
        .await
        .unwrap();

    // Sort so the comparison does not depend on row order.
    let mut slugs = result_slugs(&outcome);
    slugs.sort();
    let expected = vec!["dl-basics"];
    assert_eq!(slugs, expected);
}
// RRF with arms OTHER than the default nearest+bm25 pairing: both arms are FTS
// (bm25 on title and bm25 on body). Shows that primary_var resolves when
// neither arm is `nearest`, and that fusion runs.
#[tokio::test]
#[serial]
async fn rrf_fuses_two_fts_fields() {
    let dir = tempfile::tempdir().unwrap();
    let mut db = init_search_db(&dir).await;

    let term_params = params(&[("$q", "learning")]);
    let outcome = query_main(&mut db, SEARCH_QUERIES, "rrf_two_fts", &term_params)
        .await
        .unwrap();

    let ranked = result_slugs(&outcome);
    let expected = vec!["dl-basics", "ml-intro", "rl-intro"];
    assert_eq!(ranked, expected);
}
// RRF over two vector arms, using explicit vectors so no embedding creds are
// needed. A doc near BOTH query vectors out-ranks one near only one.
#[tokio::test]
#[serial]
async fn rrf_fuses_two_vector_queries() {
    let dir = tempfile::tempdir().unwrap();
    let mut db = init_search_db(&dir).await;

    let first_query: [f32; 4] = [0.1, 0.2, 0.3, 0.4];
    let second_query: [f32; 4] = [0.5, 0.6, 0.7, 0.8];
    let vector_args = two_vector_params("$q1", &first_query, "$q2", &second_query);

    let outcome = query_main(&mut db, SEARCH_QUERIES, "rrf_two_vectors", &vector_args)
        .await
        .unwrap();

    let ranked = result_slugs(&outcome);
    let expected = vec!["rl-intro", "ml-intro", "dl-basics"];
    assert_eq!(ranked, expected);
}
#[tokio::test]
#[serial]
async fn mutation_commit_refreshes_search_indices_without_manual_ensure() {