fix(engine): lower date/datetime filter literals as typed Arrow scalars

`literal_to_expr` lowered `Date`/`DateTime` query literals as Utf8 strings,
relying on DataFusion implicit casts. Against a physical `Date32`/`Date64`
column, that coercion can land on the column side (`CAST(col AS Utf8)`), which
defeats a scalar BTREE and degrades the scan to a full filtered read. Lower to typed
`Date32`/`Date64` scalars instead (reusing the loader's
`parse_date32_literal`/`parse_date64_literal`, already used by the in-memory
comparison arm), so the predicate stays a direct column comparison and the
index is used. Malformed literals fall back to the Utf8 string so pushdown
behavior never regresses.

Tests: unit goldens asserting the lowered literal is typed (red before, green
after) + inline-binding pushdown equality in literal_filters confirming the
epoch conversion selects the right rows.
This commit is contained in:
Ragnor Comerford 2026-06-13 18:42:58 +02:00
parent e4334deb14
commit e4ef67b0bb
No known key found for this signature in database
2 changed files with 63 additions and 5 deletions

View file

@ -2073,11 +2073,24 @@ fn literal_to_expr(lit: &Literal) -> Option<datafusion::prelude::Expr> {
Literal::Integer(n) => df_lit(*n),
Literal::Float(f) => df_lit(*f),
Literal::Bool(b) => df_lit(*b),
// Date/DateTime stored as strings; pass through as string literals
// — Lance/DataFusion handles the comparison against typed columns
// via implicit cast, matching the existing string-SQL behavior.
Literal::Date(s) => df_lit(s.clone()),
Literal::DateTime(s) => df_lit(s.clone()),
// Date/DateTime columns are physically Date32/Date64 (see the loader's
// `to_arrow`). Lower the literal to the matching TYPED Arrow scalar so
// the predicate stays a direct column comparison and the persisted
// BTREE is used. A Utf8 literal would force DataFusion to coerce one
// side; if it casts the column (`CAST(col AS Utf8)`) the scalar index
// is defeated and the scan degrades to a full filtered read. This
// matches the already-typed in-memory comparison arm in
// `projection.rs::literal_to_array`. On a malformed literal, fall back
// to the Utf8 string so pushdown behavior never regresses (the
// in-memory path surfaces the parse error if it is load-bearing).
Literal::Date(s) => match crate::loader::parse_date32_literal(s) {
Ok(days) => df_lit(datafusion::scalar::ScalarValue::Date32(Some(days))),
Err(_) => df_lit(s.clone()),
},
Literal::DateTime(s) => match crate::loader::parse_date64_literal(s) {
Ok(ms) => df_lit(datafusion::scalar::ScalarValue::Date64(Some(ms))),
Err(_) => df_lit(s.clone()),
},
Literal::List(_) => return None,
})
}
@ -2285,3 +2298,42 @@ mod expand_chooser_tests {
assert_eq!(choose_expand_mode(&i), ExpandMode::Csr);
}
}
#[cfg(test)]
mod literal_lowering_tests {
    use super::*;
    use datafusion::prelude::Expr;
    use datafusion::scalar::ScalarValue;

    // Unit goldens for how `literal_to_expr` lowers `Literal::Date` and
    // `Literal::DateTime` into DataFusion expressions.
    //
    // Date/DateTime filter literals must lower to TYPED Arrow scalars
    // (Date32 / Date64), not Utf8 strings. A Utf8 literal against a typed
    // Date column forces DataFusion to coerce one side; if it casts the
    // column (`CAST(col AS Utf8)`) the persisted BTREE is defeated and the
    // scan falls back to a full filtered read. A typed literal keeps the
    // predicate a direct column comparison so the scalar index is used.
    #[test]
    fn date_literals_lower_to_typed_arrow_scalars() {
        let dt = literal_to_expr(&Literal::DateTime("2024-06-01T12:00:00Z".into())).unwrap();
        assert!(
            matches!(dt, Expr::Literal(ScalarValue::Date64(Some(_)), ..)),
            "DateTime literal must lower to a typed Date64 scalar, got {dt:?}"
        );

        let d = literal_to_expr(&Literal::Date("2024-06-01".into())).unwrap();
        assert!(
            matches!(d, Expr::Literal(ScalarValue::Date32(Some(_)), ..)),
            "Date literal must lower to a typed Date32 scalar, got {d:?}"
        );
    }

    // A malformed date string must not panic or error in the (infallible)
    // lowering: it falls back to the Utf8 literal so pushdown behavior never
    // regresses (the in-memory path surfaces the parse error if it matters).
    //
    // Both the `Date` and the `DateTime` arm carry their own fallback, so both
    // are exercised here, and the fallback must carry the ORIGINAL text
    // through unchanged rather than merely producing some Utf8 value.
    #[test]
    fn malformed_date_literal_falls_back_to_string() {
        const MALFORMED: &str = "not-a-date";

        let cases = [
            ("DateTime", Literal::DateTime(MALFORMED.into())),
            ("Date", Literal::Date(MALFORMED.into())),
        ];

        for (kind, literal) in cases {
            let bad = literal_to_expr(&literal).unwrap();
            // Match through a reference so `bad` stays available for the
            // failure message below.
            let preserved = matches!(
                &bad,
                Expr::Literal(ScalarValue::Utf8(Some(text)), ..) if text.as_str() == MALFORMED
            );
            assert!(
                preserved,
                "malformed {kind} literal should fall back to a Utf8 literal holding the original text, got {bad:?}"
            );
        }
    }
}

View file

@ -88,9 +88,15 @@ async fn date_and_datetime_literal_filters_execute() {
let q = r#"
query born_ge() { match { $m: Metric $m.born >= date("2024-01-01") } return { $m.name } }
query seen_lt() { match { $m: Metric $m.seen < datetime("2024-01-01T00:00:00Z") } return { $m.name } }
query born_eq() { match { $m: Metric { born: date("2024-06-01") } } return { $m.name } }
query seen_eq() { match { $m: Metric { seen: datetime("2024-06-01T12:00:00Z") } } return { $m.name } }
"#;
// born: m1 2024-06, m3 2025 >= 2024-01-01
assert_eq!(sorted_metric_names(&mut db, q, "born_ge").await, vec!["m1", "m3"]);
// seen: m2 2023, m4 2022 < 2024-01-01
assert_eq!(sorted_metric_names(&mut db, q, "seen_lt").await, vec!["m2", "m4"]);
// Inline-binding equality exercises the Lance-pushdown arm with a typed
// Date32/Date64 literal: the epoch conversion must select exactly m1.
assert_eq!(sorted_metric_names(&mut db, q, "born_eq").await, vec!["m1"]);
assert_eq!(sorted_metric_names(&mut db, q, "seen_eq").await, vec!["m1"]);
}