diff --git a/crates/omnigraph/src/exec/projection.rs b/crates/omnigraph/src/exec/projection.rs index dec13a8..7280ec5 100644 --- a/crates/omnigraph/src/exec/projection.rs +++ b/crates/omnigraph/src/exec/projection.rs @@ -422,6 +422,35 @@ pub(super) fn apply_ordering( }); } + // Deterministic tie-break for a TOTAL order. `lexsort_to_indices` is unstable + // and the input row order is not guaranteed (scan parallelism, upstream + // hashing), so equal user-sort keys would otherwise come out run-dependent — + // making `ORDER ... LIMIT` non-deterministic. Append the bound entities' key + // columns (`.id`, unique per row) in canonical (name-sorted) order as + // ascending tie-breaks. The combination of all bound keys uniquely identifies + // a result row, so the order is total and reproducible. (Aggregate results + // have no `.id` columns; their group rows are already distinct on the + // projected group keys.) + let mut tiebreak_cols: Vec = source + .schema() + .fields() + .iter() + .map(|f| f.name().to_string()) + .filter(|name| name.ends_with(".id")) + .collect(); + tiebreak_cols.sort(); + for name in &tiebreak_cols { + if let Some(col) = source.column_by_name(name) { + sort_columns.push(SortColumn { + values: col.clone(), + options: Some(arrow_schema::SortOptions { + descending: false, + nulls_first: true, + }), + }); + } + } + let indices = lexsort_to_indices(&sort_columns, None).map_err(|e| OmniError::Lance(e.to_string()))?; diff --git a/crates/omnigraph/tests/ordering.rs b/crates/omnigraph/tests/ordering.rs new file mode 100644 index 0000000..4e9296b --- /dev/null +++ b/crates/omnigraph/tests/ordering.rs @@ -0,0 +1,134 @@ +//! ORDER BY golden coverage: descending, multi-key precedence, deterministic +//! tie-break (total order), and NULL placement. +//! +//! These pin the observable output-ordering contract (deny-list: "output +//! ordering … become dependencies once shipped"). `apply_ordering` appends the +//! bound entities' key columns as an ascending tie-break, so equal user-sort +//! keys yield a TOTAL, deterministic order (and `ORDER … LIMIT` is +//! deterministic). NULL placement is `nulls_first = !descending` (NULLs first +//! under ASC, last under DESC). Both are documented in +//! `docs/user/query-language.md`. + +mod helpers; + +use arrow_array::{Array, StringArray}; + +use omnigraph::db::Omnigraph; +use omnigraph::loader::{LoadMode, load_jsonl}; +use omnigraph_compiler::ir::ParamMap; +use omnigraph_compiler::result::QueryResult; + +use helpers::*; + +/// Names in result ROW order (not sorted) — these tests assert positional order. +fn names_in_order(result: &QueryResult) -> Vec { + let batch = result.concat_batches().unwrap(); + if batch.num_rows() == 0 { + return Vec::new(); + } + let col = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + (0..col.len()).map(|i| col.value(i).to_string()).collect() +} + +/// Init the standard schema and load a custom Person-only dataset. +async fn init_people(dir: &tempfile::TempDir, jsonl: &str) -> Omnigraph { + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap(); + load_jsonl(&mut db, jsonl, LoadMode::Overwrite).await.unwrap(); + db +} + +#[tokio::test] +async fn ordering_descending() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + let q = r#" +query q() { + match { $p: Person } + return { $p.name } + order { $p.age desc } +} +"#; + let got = names_in_order(&query_main(&mut db, q, "q", &ParamMap::new()).await.unwrap()); + // Charlie(35), Alice(30), Diana(28), Bob(25) + assert_eq!(got, vec!["Charlie", "Alice", "Diana", "Bob"]); +} + +#[tokio::test] +async fn ordering_multi_key_age_desc_name_asc() { + let dir = tempfile::tempdir().unwrap(); + // Alice & Bob tie at age 30; loaded Bob-first so the expected output order + // cannot be the load order. + let data = r#"{"type":"Person","data":{"name":"Bob","age":30}} +{"type":"Person","data":{"name":"Alice","age":30}} +{"type":"Person","data":{"name":"Charlie","age":25}}"#; + let mut db = init_people(&dir, data).await; + let q = r#" +query q() { + match { $p: Person } + return { $p.name } + order { $p.age desc, $p.name asc } +} +"#; + let got = names_in_order(&query_main(&mut db, q, "q", &ParamMap::new()).await.unwrap()); + // age desc -> [30,30,25]; the 30-tie broken by name asc -> Alice before Bob. + assert_eq!(got, vec!["Alice", "Bob", "Charlie"]); +} + +#[tokio::test] +async fn ordering_tiebreak_by_key_is_deterministic() { + let dir = tempfile::tempdir().unwrap(); + // Same tie at age 30, NO secondary sort key. Loaded Bob-first; the tie must + // break by the entity key (name) ascending -> Alice before Bob, regardless + // of load order. This locks the total-order tie-break in apply_ordering. + let data = r#"{"type":"Person","data":{"name":"Bob","age":30}} +{"type":"Person","data":{"name":"Alice","age":30}} +{"type":"Person","data":{"name":"Charlie","age":25}}"#; + let mut db = init_people(&dir, data).await; + let q = r#" +query q() { + match { $p: Person } + return { $p.name } + order { $p.age asc } +} +"#; + let got = names_in_order(&query_main(&mut db, q, "q", &ParamMap::new()).await.unwrap()); + // age asc -> Charlie(25), then the 30-tie broken by key asc -> Alice, Bob. + assert_eq!(got, vec!["Charlie", "Alice", "Bob"]); +} + +#[tokio::test] +async fn ordering_nulls_placement_asc_and_desc() { + let dir = tempfile::tempdir().unwrap(); + // Bob has a NULL age. + let data = r#"{"type":"Person","data":{"name":"Alice","age":30}} +{"type":"Person","data":{"name":"Bob","age":null}} +{"type":"Person","data":{"name":"Charlie","age":25}}"#; + let mut db = init_people(&dir, data).await; + + let asc = r#" +query q() { + match { $p: Person } + return { $p.name } + order { $p.age asc } +} +"#; + let got_asc = names_in_order(&query_main(&mut db, asc, "q", &ParamMap::new()).await.unwrap()); + // ASC: nulls_first -> Bob(null), then 25, 30. + assert_eq!(got_asc, vec!["Bob", "Charlie", "Alice"]); + + let desc = r#" +query q() { + match { $p: Person } + return { $p.name } + order { $p.age desc } +} +"#; + let got_desc = names_in_order(&query_main(&mut db, desc, "q", &ParamMap::new()).await.unwrap()); + // DESC: nulls last -> 30, 25, then Bob(null). + assert_eq!(got_desc, vec!["Alice", "Charlie", "Bob"]); +} diff --git a/docs/user/query-language.md b/docs/user/query-language.md index 9ed5686..bc22064 100644 --- a/docs/user/query-language.md +++ b/docs/user/query-language.md @@ -55,6 +55,8 @@ Used inside MATCH or as expressions inside RETURN/ORDER: - `order { [asc|desc], … }` — supports plain expressions and `nearest(...)`. - `limit ` — required when there is a `nearest(...)` ordering. +- **Total, deterministic order.** Rows with equal user-sort keys are broken by the bound entities' key columns (`.id`, ascending) appended as a final tie-break, so the result is a *total* order — reproducible across runs, and `order … limit N` returns a deterministic top-N even when ties straddle the cutoff. (Aggregate results have no entity-key columns; their group rows are already distinct on the projected group keys.) +- **NULL placement** is *nulls-first ascending, nulls-last descending* (i.e. `nulls_first = !descending`): a NULL sorts as if smaller than any value. ## Mutation statements