fix(engine): total deterministic ORDER via entity-key tie-break + NULL contract

apply_ordering used an unstable lexsort with no tie-break, so rows with equal
user-sort keys came out in a run-dependent order (the input order depends on
scan parallelism / upstream hashing) — making ORDER ... LIMIT non-deterministic,
a latent deny-list violation (no nondeterministic result ordering).

Append the bound entities' key columns (<var>.id, unique per row) in canonical
name-sorted order as ascending tie-breaks, giving a total, reproducible order
(and a deterministic top-N when ties straddle the LIMIT cutoff). NULL placement
(nulls_first = !descending) is unchanged and now documented as the contract.

New tests/ordering.rs locks descending, multi-key precedence, the deterministic
key tie-break (data loaded in a different order than the expected output, so it
proves the tie sorts by key not by load order), and NULL placement under ASC/DESC.
docs/user/query-language.md documents the total-order + NULL contract.
This commit is contained in:
Ragnor Comerford 2026-06-09 13:50:59 +02:00
parent d746a69237
commit 8b13096183
No known key found for this signature in database
3 changed files with 165 additions and 0 deletions

View file

@ -422,6 +422,35 @@ pub(super) fn apply_ordering(
});
}
// Deterministic tie-break for a TOTAL order. `lexsort_to_indices` is unstable
// and the input row order is not guaranteed (scan parallelism, upstream
// hashing), so equal user-sort keys would otherwise come out run-dependent —
// making `ORDER ... LIMIT` non-deterministic. Append the bound entities' key
// columns (`<var>.id`, unique per row) in canonical (name-sorted) order as
// ascending tie-breaks. The combination of all bound keys uniquely identifies
// a result row, so the order is total and reproducible. (Aggregate results
// have no `.id` columns; their group rows are already distinct on the
// projected group keys.)
let mut tiebreak_cols: Vec<String> = source
.schema()
.fields()
.iter()
.map(|f| f.name().to_string())
.filter(|name| name.ends_with(".id"))
.collect();
tiebreak_cols.sort();
for name in &tiebreak_cols {
if let Some(col) = source.column_by_name(name) {
sort_columns.push(SortColumn {
values: col.clone(),
options: Some(arrow_schema::SortOptions {
descending: false,
nulls_first: true,
}),
});
}
}
let indices =
lexsort_to_indices(&sort_columns, None).map_err(|e| OmniError::Lance(e.to_string()))?;

View file

@ -0,0 +1,134 @@
//! ORDER BY golden coverage: descending, multi-key precedence, deterministic
//! tie-break (total order), and NULL placement.
//!
//! These pin the observable output-ordering contract (deny-list: "output
//! ordering … become dependencies once shipped"). `apply_ordering` appends the
//! bound entities' key columns as an ascending tie-break, so equal user-sort
//! keys yield a TOTAL, deterministic order (and `ORDER … LIMIT` is
//! deterministic). NULL placement is `nulls_first = !descending` (NULLs first
//! under ASC, last under DESC). Both are documented in
//! `docs/user/query-language.md`.
mod helpers;
use arrow_array::{Array, StringArray};
use omnigraph::db::Omnigraph;
use omnigraph::loader::{LoadMode, load_jsonl};
use omnigraph_compiler::ir::ParamMap;
use omnigraph_compiler::result::QueryResult;
use helpers::*;
/// Names in result ROW order (not sorted) — these tests assert positional order.
fn names_in_order(result: &QueryResult) -> Vec<String> {
let batch = result.concat_batches().unwrap();
if batch.num_rows() == 0 {
return Vec::new();
}
let col = batch
.column(0)
.as_any()
.downcast_ref::<StringArray>()
.unwrap();
(0..col.len()).map(|i| col.value(i).to_string()).collect()
}
/// Init the standard schema and load a custom Person-only dataset.
async fn init_people(dir: &tempfile::TempDir, jsonl: &str) -> Omnigraph {
let uri = dir.path().to_str().unwrap();
let mut db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap();
load_jsonl(&mut db, jsonl, LoadMode::Overwrite).await.unwrap();
db
}
#[tokio::test]
async fn ordering_descending() {
let dir = tempfile::tempdir().unwrap();
let mut db = init_and_load(&dir).await;
let q = r#"
query q() {
match { $p: Person }
return { $p.name }
order { $p.age desc }
}
"#;
let got = names_in_order(&query_main(&mut db, q, "q", &ParamMap::new()).await.unwrap());
// Charlie(35), Alice(30), Diana(28), Bob(25)
assert_eq!(got, vec!["Charlie", "Alice", "Diana", "Bob"]);
}
#[tokio::test]
async fn ordering_multi_key_age_desc_name_asc() {
let dir = tempfile::tempdir().unwrap();
// Alice & Bob tie at age 30; loaded Bob-first so the expected output order
// cannot be the load order.
let data = r#"{"type":"Person","data":{"name":"Bob","age":30}}
{"type":"Person","data":{"name":"Alice","age":30}}
{"type":"Person","data":{"name":"Charlie","age":25}}"#;
let mut db = init_people(&dir, data).await;
let q = r#"
query q() {
match { $p: Person }
return { $p.name }
order { $p.age desc, $p.name asc }
}
"#;
let got = names_in_order(&query_main(&mut db, q, "q", &ParamMap::new()).await.unwrap());
// age desc -> [30,30,25]; the 30-tie broken by name asc -> Alice before Bob.
assert_eq!(got, vec!["Alice", "Bob", "Charlie"]);
}
#[tokio::test]
async fn ordering_tiebreak_by_key_is_deterministic() {
let dir = tempfile::tempdir().unwrap();
// Same tie at age 30, NO secondary sort key. Loaded Bob-first; the tie must
// break by the entity key (name) ascending -> Alice before Bob, regardless
// of load order. This locks the total-order tie-break in apply_ordering.
let data = r#"{"type":"Person","data":{"name":"Bob","age":30}}
{"type":"Person","data":{"name":"Alice","age":30}}
{"type":"Person","data":{"name":"Charlie","age":25}}"#;
let mut db = init_people(&dir, data).await;
let q = r#"
query q() {
match { $p: Person }
return { $p.name }
order { $p.age asc }
}
"#;
let got = names_in_order(&query_main(&mut db, q, "q", &ParamMap::new()).await.unwrap());
// age asc -> Charlie(25), then the 30-tie broken by key asc -> Alice, Bob.
assert_eq!(got, vec!["Charlie", "Alice", "Bob"]);
}
#[tokio::test]
async fn ordering_nulls_placement_asc_and_desc() {
let dir = tempfile::tempdir().unwrap();
// Bob has a NULL age.
let data = r#"{"type":"Person","data":{"name":"Alice","age":30}}
{"type":"Person","data":{"name":"Bob","age":null}}
{"type":"Person","data":{"name":"Charlie","age":25}}"#;
let mut db = init_people(&dir, data).await;
let asc = r#"
query q() {
match { $p: Person }
return { $p.name }
order { $p.age asc }
}
"#;
let got_asc = names_in_order(&query_main(&mut db, asc, "q", &ParamMap::new()).await.unwrap());
// ASC: nulls_first -> Bob(null), then 25, 30.
assert_eq!(got_asc, vec!["Bob", "Charlie", "Alice"]);
let desc = r#"
query q() {
match { $p: Person }
return { $p.name }
order { $p.age desc }
}
"#;
let got_desc = names_in_order(&query_main(&mut db, desc, "q", &ParamMap::new()).await.unwrap());
// DESC: nulls last -> 30, 25, then Bob(null).
assert_eq!(got_desc, vec!["Alice", "Charlie", "Bob"]);
}

View file

@ -55,6 +55,8 @@ Used inside MATCH or as expressions inside RETURN/ORDER:
- `order { <expr> [asc|desc], … }` — supports plain expressions and `nearest(...)`.
- `limit <integer>` — required when there is a `nearest(...)` ordering.
- **Total, deterministic order.** Rows with equal user-sort keys are broken by the bound entities' key columns (`<var>.id`, ascending) appended as a final tie-break, so the result is a *total* order — reproducible across runs, and `order … limit N` returns a deterministic top-N even when ties straddle the cutoff. (Aggregate results have no entity-key columns; their group rows are already distinct on the projected group keys.)
- **NULL placement** is *nulls-first ascending, nulls-last descending* (i.e. `nulls_first = !descending`): a NULL sorts as if smaller than any value.
## Mutation statements