From 505303f93edfbc367efa89022e97236b70eb61fc Mon Sep 17 00:00:00 2001 From: Ragnor Comerford Date: Tue, 9 Jun 2026 09:12:36 +0200 Subject: [PATCH] test(engine): pin Lance scalar-index coverage + system-column/deletion-metadata surface Add three Lance surface guards de-risking a future persisted-adjacency cache: - a compile-only guard pinning the fragment physical_rows + index-detail surface that key_column_index_coverage mirrors (the C6 fallback); - a runtime probe confirming a scalar BTREE on the system column _row_last_updated_at_version is not buildable via the normal create-index path (the column is not in the user schema), so a version-column range delta is not viable as drafted; - a runtime probe confirming per-fragment deletion metadata (deletion_file.num_deleted_rows) is available as cheap O(fragments) metadata, the primitive a fragment-coverage delete model would rely on. The probes turn the two largest substrate assumptions into green/red CI facts before any cache work begins. --- .../omnigraph/tests/lance_surface_guards.rs | 130 ++++++++++++++++++ 1 file changed, 130 insertions(+) diff --git a/crates/omnigraph/tests/lance_surface_guards.rs b/crates/omnigraph/tests/lance_surface_guards.rs index 1d60c08..bf8e83d 100644 --- a/crates/omnigraph/tests/lance_surface_guards.rs +++ b/crates/omnigraph/tests/lance_surface_guards.rs @@ -32,7 +32,10 @@ use lance::dataset::builder::DatasetBuilder; use lance::dataset::optimize::{CompactionOptions, compact_files}; use lance::dataset::write::delete::DeleteResult; use lance::dataset::{MergeInsertBuilder, WhenMatched, WhenNotMatched, WriteMode, WriteParams}; +use lance::index::DatasetIndexExt; use lance_file::version::LanceFileVersion; +use lance_index::IndexType; +use lance_index::scalar::ScalarIndexParams; use lance_namespace::LanceNamespace; use lance_table::io::commit::ManifestNamingScheme; @@ -375,3 +378,130 @@ async fn compact_files_still_fails_on_blob_columns() { shifted): {err}" ); } + +// --- Guard 11: scalar-index coverage surface (physical_rows + index details) --- +// +// `table_store.rs::key_column_index_coverage` mirrors Lance's `create_filter_plan` +// C6 fallback: it reads `fragment.physical_rows` (the field whose absence on ANY +// fragment disables the scalar index for the whole scan) and sniffs the BTREE via +// `load_indices()` → `index.fields` / `index.index_details.type_url`. This is the +// one real Lance-internal coupling on the indexed-traversal read path. If any of +// these surfaces renames or changes type, the coverage check (and the cost-based +// traversal chooser that consumes it) silently misclassifies. Compile-only. + +#[allow( + dead_code, + unreachable_code, + unused_variables, + unused_mut, + clippy::diverging_sub_expression +)] +async fn _compile_scalar_index_coverage_surface() -> lance::Result<()> { + let ds: Dataset = unimplemented!(); + // The create_filter_plan coupling: a fragment lacking `physical_rows` + // disables the scalar index for the entire scan. + for frag in ds.fragments().iter() { + let _physical_rows: Option = frag.physical_rows; + } + // The index sniff: BTREE presence is detected by single-field index whose + // details type_url ends with "BTreeIndexDetails". + let indices = ds.load_indices().await?; + for index in indices.iter() { + let _fields: &Vec = &index.fields; + if let Some(details) = index.index_details.as_ref() { + let _type_url: &str = details.type_url.as_str(); + } + } + Ok(()) +} + +// --- Guard 12: can a scalar BTREE be built on a system version column? -------- +// +// The deferred persisted-adjacency artifact plan assumed a cheap delta read of +// `_row_last_updated_at_version > V` could be a BTREE range lookup. Lance resolves +// index columns from the dataset schema, and the version columns are system +// metadata — so this probe documents whether the assumption holds. The outcome is +// the load-bearing fact, not a pass/fail of intent: if this starts SUCCEEDING when +// it currently errors (or vice versa), the artifact's delta-cost story changes. + +#[tokio::test] +async fn scalar_index_on_system_version_column_probe() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().join("guard12.lance"); + let mut ds = fresh_dataset(uri.to_str().unwrap()).await; + + // Sanity: the system version column is present (stable row ids + V2_2). + assert!( + ds.schema().field("_row_last_updated_at_version").is_none(), + "PROBE NOTE: `_row_last_updated_at_version` is NOT in the user schema \ + (it is system metadata); indexing it resolves through a different path." + ); + + let result = ds + .create_index_builder( + &["_row_last_updated_at_version"], + IndexType::BTree, + &ScalarIndexParams::default(), + ) + .replace(true) + .await; + + // Pin the observed behavior: a scalar index on the system version column is + // NOT buildable via the normal create-index path in this Lance. If this turns + // green (Ok), the artifact delta CAN use a version-column BTREE — revisit the + // deferred plan's Phase-2 delta-cost note in docs/dev/traversal handoff. + assert!( + result.is_err(), + "create_index on `_row_last_updated_at_version` unexpectedly SUCCEEDED — \ + a system-column scalar index is now buildable; the persisted-artifact \ + delta read could use it. Update the deferred-design notes." + ); +} + +// --- Guard 13: per-fragment deletion metadata is exposed without a scan ------- +// +// The deferred artifact's delete-correctness coverage model needs to detect, +// cheaply (O(fragments), no row scan), that a covered fragment acquired new +// deletions. That hinges on Lance tracking deletions at fragment-metadata level. +// This pins that a delete populates `fragment.deletion_file`, and probes whether +// the deleted-row COUNT is available as metadata (`num_deleted_rows`) — the +// difference between an O(fragments) coverage check and an O(|E|) scan. + +#[tokio::test] +async fn fragment_deletion_metadata_is_available() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().join("guard13.lance"); + let ds = fresh_dataset(uri.to_str().unwrap()).await; // 2 rows: alice, bob + + let deleted: DeleteResult = { + let mut ds = ds; + ds.delete("id = 'alice'").await.unwrap() + }; + assert_eq!(deleted.num_deleted_rows, 1, "one row deleted"); + let ds = deleted.new_dataset; + + // A delete must be tracked at fragment-metadata level (not only in data). + let with_deletion = ds + .fragments() + .iter() + .find(|f| f.deletion_file.is_some()) + .expect( + "after a delete, some fragment must carry a deletion_file — if not, \ + Lance changed deletion tracking; the artifact coverage model's \ + cheap delete-detection assumption is invalid.", + ); + + // Probe: is the deleted-row count available as metadata (cheap), or must the + // deletion vector be read? Pin whichever holds so the artifact plan knows. + let count: Option = with_deletion + .deletion_file + .as_ref() + .and_then(|df| df.num_deleted_rows); + assert_eq!( + count, + Some(1), + "PROBE: deletion_file.num_deleted_rows is not a populated metadata count \ + (got {count:?}); the artifact coverage model cannot cheaply detect \ + per-fragment deletions and would need to read the deletion vector.", + ); +}