test(engine): pin Lance scalar-index coverage + system-column/deletion-metadata surface

Add three Lance surface guards de-risking a future persisted-adjacency cache:
- a compile-only guard pinning the fragment physical_rows + index-detail
  surface that key_column_index_coverage mirrors (the C6 fallback);
- a runtime probe confirming a scalar BTREE on the system column
  _row_last_updated_at_version is not buildable via the normal create-index
  path (the column is not in the user schema), so a version-column range delta
  is not viable as drafted;
- a runtime probe confirming per-fragment deletion metadata
  (deletion_file.num_deleted_rows) is available as cheap O(fragments) metadata,
  the primitive a fragment-coverage delete model would rely on.

The probes turn the two largest substrate assumptions into green/red CI facts
before any cache work begins.
This commit is contained in:
Ragnor Comerford 2026-06-09 09:12:36 +02:00
parent 787d41ec5f
commit 505303f93e
No known key found for this signature in database

View file

@ -32,7 +32,10 @@ use lance::dataset::builder::DatasetBuilder;
use lance::dataset::optimize::{CompactionOptions, compact_files};
use lance::dataset::write::delete::DeleteResult;
use lance::dataset::{MergeInsertBuilder, WhenMatched, WhenNotMatched, WriteMode, WriteParams};
use lance::index::DatasetIndexExt;
use lance_file::version::LanceFileVersion;
use lance_index::IndexType;
use lance_index::scalar::ScalarIndexParams;
use lance_namespace::LanceNamespace;
use lance_table::io::commit::ManifestNamingScheme;
@ -375,3 +378,130 @@ async fn compact_files_still_fails_on_blob_columns() {
shifted): {err}"
);
}
// --- Guard 11: scalar-index coverage surface (physical_rows + index details) ---
//
// `table_store.rs::key_column_index_coverage` mirrors Lance's `create_filter_plan`
// C6 fallback: it reads `fragment.physical_rows` (the field whose absence on ANY
// fragment disables the scalar index for the whole scan) and sniffs the BTREE via
// `load_indices()` → `index.fields` / `index.index_details.type_url`. This is the
// one real Lance-internal coupling on the indexed-traversal read path. If any of
// these surfaces renames or changes type, the coverage check (and the cost-based
// traversal chooser that consumes it) silently misclassifies. Compile-only.
#[allow(
dead_code,
unreachable_code,
unused_variables,
unused_mut,
clippy::diverging_sub_expression
)]
async fn _compile_scalar_index_coverage_surface() -> lance::Result<()> {
let ds: Dataset = unimplemented!();
// The create_filter_plan coupling: a fragment lacking `physical_rows`
// disables the scalar index for the entire scan.
for frag in ds.fragments().iter() {
let _physical_rows: Option<usize> = frag.physical_rows;
}
// The index sniff: BTREE presence is detected by single-field index whose
// details type_url ends with "BTreeIndexDetails".
let indices = ds.load_indices().await?;
for index in indices.iter() {
let _fields: &Vec<i32> = &index.fields;
if let Some(details) = index.index_details.as_ref() {
let _type_url: &str = details.type_url.as_str();
}
}
Ok(())
}
// --- Guard 12: can a scalar BTREE be built on a system version column? --------
//
// The deferred persisted-adjacency artifact plan assumed a cheap delta read of
// `_row_last_updated_at_version > V` could be a BTREE range lookup. Lance resolves
// index columns from the dataset schema, and the version columns are system
// metadata — so this probe documents whether the assumption holds. The outcome is
// the load-bearing fact, not a pass/fail of intent: if this starts SUCCEEDING when
// it currently errors (or vice versa), the artifact's delta-cost story changes.
#[tokio::test]
async fn scalar_index_on_system_version_column_probe() {
let dir = tempfile::tempdir().unwrap();
let uri = dir.path().join("guard12.lance");
let mut ds = fresh_dataset(uri.to_str().unwrap()).await;
// Sanity: the system version column is present (stable row ids + V2_2).
assert!(
ds.schema().field("_row_last_updated_at_version").is_none(),
"PROBE NOTE: `_row_last_updated_at_version` is NOT in the user schema \
(it is system metadata); indexing it resolves through a different path."
);
let result = ds
.create_index_builder(
&["_row_last_updated_at_version"],
IndexType::BTree,
&ScalarIndexParams::default(),
)
.replace(true)
.await;
// Pin the observed behavior: a scalar index on the system version column is
// NOT buildable via the normal create-index path in this Lance. If this turns
// green (Ok), the artifact delta CAN use a version-column BTREE — revisit the
// deferred plan's Phase-2 delta-cost note in docs/dev/traversal handoff.
assert!(
result.is_err(),
"create_index on `_row_last_updated_at_version` unexpectedly SUCCEEDED — \
a system-column scalar index is now buildable; the persisted-artifact \
delta read could use it. Update the deferred-design notes."
);
}
// --- Guard 13: per-fragment deletion metadata is exposed without a scan -------
//
// The deferred artifact's delete-correctness coverage model needs to detect,
// cheaply (O(fragments), no row scan), that a covered fragment acquired new
// deletions. That hinges on Lance tracking deletions at fragment-metadata level.
// This pins that a delete populates `fragment.deletion_file`, and probes whether
// the deleted-row COUNT is available as metadata (`num_deleted_rows`) — the
// difference between an O(fragments) coverage check and an O(|E|) scan.
#[tokio::test]
async fn fragment_deletion_metadata_is_available() {
let dir = tempfile::tempdir().unwrap();
let uri = dir.path().join("guard13.lance");
let ds = fresh_dataset(uri.to_str().unwrap()).await; // 2 rows: alice, bob
let deleted: DeleteResult = {
let mut ds = ds;
ds.delete("id = 'alice'").await.unwrap()
};
assert_eq!(deleted.num_deleted_rows, 1, "one row deleted");
let ds = deleted.new_dataset;
// A delete must be tracked at fragment-metadata level (not only in data).
let with_deletion = ds
.fragments()
.iter()
.find(|f| f.deletion_file.is_some())
.expect(
"after a delete, some fragment must carry a deletion_file — if not, \
Lance changed deletion tracking; the artifact coverage model's \
cheap delete-detection assumption is invalid.",
);
// Probe: is the deleted-row count available as metadata (cheap), or must the
// deletion vector be read? Pin whichever holds so the artifact plan knows.
let count: Option<usize> = with_deletion
.deletion_file
.as_ref()
.and_then(|df| df.num_deleted_rows);
assert_eq!(
count,
Some(1),
"PROBE: deletion_file.num_deleted_rows is not a populated metadata count \
(got {count:?}); the artifact coverage model cannot cheaply detect \
per-fragment deletions and would need to read the deletion vector.",
);
}