mirror of
https://github.com/ModernRelay/omnigraph.git
synced 2026-06-30 02:49:39 +02:00
test(engine): pin Lance scalar-index coverage + system-column/deletion-metadata surface
Add three Lance surface guards de-risking a future persisted-adjacency cache: - a compile-only guard pinning the fragment physical_rows + index-detail surface that key_column_index_coverage mirrors (the C6 fallback); - a runtime probe confirming a scalar BTREE on the system column _row_last_updated_at_version is not buildable via the normal create-index path (the column is not in the user schema), so a version-column range delta is not viable as drafted; - a runtime probe confirming per-fragment deletion metadata (deletion_file.num_deleted_rows) is available as cheap O(fragments) metadata, the primitive a fragment-coverage delete model would rely on. The probes turn the two largest substrate assumptions into green/red CI facts before any cache work begins.
This commit is contained in:
parent
787d41ec5f
commit
505303f93e
1 changed files with 130 additions and 0 deletions
|
|
@ -32,7 +32,10 @@ use lance::dataset::builder::DatasetBuilder;
|
||||||
use lance::dataset::optimize::{CompactionOptions, compact_files};
|
use lance::dataset::optimize::{CompactionOptions, compact_files};
|
||||||
use lance::dataset::write::delete::DeleteResult;
|
use lance::dataset::write::delete::DeleteResult;
|
||||||
use lance::dataset::{MergeInsertBuilder, WhenMatched, WhenNotMatched, WriteMode, WriteParams};
|
use lance::dataset::{MergeInsertBuilder, WhenMatched, WhenNotMatched, WriteMode, WriteParams};
|
||||||
|
use lance::index::DatasetIndexExt;
|
||||||
use lance_file::version::LanceFileVersion;
|
use lance_file::version::LanceFileVersion;
|
||||||
|
use lance_index::IndexType;
|
||||||
|
use lance_index::scalar::ScalarIndexParams;
|
||||||
use lance_namespace::LanceNamespace;
|
use lance_namespace::LanceNamespace;
|
||||||
use lance_table::io::commit::ManifestNamingScheme;
|
use lance_table::io::commit::ManifestNamingScheme;
|
||||||
|
|
||||||
|
|
@ -375,3 +378,130 @@ async fn compact_files_still_fails_on_blob_columns() {
|
||||||
shifted): {err}"
|
shifted): {err}"
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// --- Guard 11: scalar-index coverage surface (physical_rows + index details) ---
|
||||||
|
//
|
||||||
|
// `table_store.rs::key_column_index_coverage` mirrors Lance's `create_filter_plan`
|
||||||
|
// C6 fallback: it reads `fragment.physical_rows` (the field whose absence on ANY
|
||||||
|
// fragment disables the scalar index for the whole scan) and sniffs the BTREE via
|
||||||
|
// `load_indices()` → `index.fields` / `index.index_details.type_url`. This is the
|
||||||
|
// one real Lance-internal coupling on the indexed-traversal read path. If any of
|
||||||
|
// these surfaces renames or changes type, the coverage check (and the cost-based
|
||||||
|
// traversal chooser that consumes it) silently misclassifies. Compile-only.
|
||||||
|
|
||||||
|
#[allow(
|
||||||
|
dead_code,
|
||||||
|
unreachable_code,
|
||||||
|
unused_variables,
|
||||||
|
unused_mut,
|
||||||
|
clippy::diverging_sub_expression
|
||||||
|
)]
|
||||||
|
async fn _compile_scalar_index_coverage_surface() -> lance::Result<()> {
|
||||||
|
let ds: Dataset = unimplemented!();
|
||||||
|
// The create_filter_plan coupling: a fragment lacking `physical_rows`
|
||||||
|
// disables the scalar index for the entire scan.
|
||||||
|
for frag in ds.fragments().iter() {
|
||||||
|
let _physical_rows: Option<usize> = frag.physical_rows;
|
||||||
|
}
|
||||||
|
// The index sniff: BTREE presence is detected by single-field index whose
|
||||||
|
// details type_url ends with "BTreeIndexDetails".
|
||||||
|
let indices = ds.load_indices().await?;
|
||||||
|
for index in indices.iter() {
|
||||||
|
let _fields: &Vec<i32> = &index.fields;
|
||||||
|
if let Some(details) = index.index_details.as_ref() {
|
||||||
|
let _type_url: &str = details.type_url.as_str();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
// --- Guard 12: can a scalar BTREE be built on a system version column? --------
|
||||||
|
//
|
||||||
|
// The deferred persisted-adjacency artifact plan assumed a cheap delta read of
|
||||||
|
// `_row_last_updated_at_version > V` could be a BTREE range lookup. Lance resolves
|
||||||
|
// index columns from the dataset schema, and the version columns are system
|
||||||
|
// metadata — so this probe documents whether the assumption holds. The outcome is
|
||||||
|
// the load-bearing fact, not a pass/fail of intent: if this starts SUCCEEDING when
|
||||||
|
// it currently errors (or vice versa), the artifact's delta-cost story changes.
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn scalar_index_on_system_version_column_probe() {
|
||||||
|
let dir = tempfile::tempdir().unwrap();
|
||||||
|
let uri = dir.path().join("guard12.lance");
|
||||||
|
let mut ds = fresh_dataset(uri.to_str().unwrap()).await;
|
||||||
|
|
||||||
|
// Sanity: the system version column is present (stable row ids + V2_2).
|
||||||
|
assert!(
|
||||||
|
ds.schema().field("_row_last_updated_at_version").is_none(),
|
||||||
|
"PROBE NOTE: `_row_last_updated_at_version` is NOT in the user schema \
|
||||||
|
(it is system metadata); indexing it resolves through a different path."
|
||||||
|
);
|
||||||
|
|
||||||
|
let result = ds
|
||||||
|
.create_index_builder(
|
||||||
|
&["_row_last_updated_at_version"],
|
||||||
|
IndexType::BTree,
|
||||||
|
&ScalarIndexParams::default(),
|
||||||
|
)
|
||||||
|
.replace(true)
|
||||||
|
.await;
|
||||||
|
|
||||||
|
// Pin the observed behavior: a scalar index on the system version column is
|
||||||
|
// NOT buildable via the normal create-index path in this Lance. If this turns
|
||||||
|
// green (Ok), the artifact delta CAN use a version-column BTREE — revisit the
|
||||||
|
// deferred plan's Phase-2 delta-cost note in docs/dev/traversal handoff.
|
||||||
|
assert!(
|
||||||
|
result.is_err(),
|
||||||
|
"create_index on `_row_last_updated_at_version` unexpectedly SUCCEEDED — \
|
||||||
|
a system-column scalar index is now buildable; the persisted-artifact \
|
||||||
|
delta read could use it. Update the deferred-design notes."
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
// --- Guard 13: per-fragment deletion metadata is exposed without a scan -------
|
||||||
|
//
|
||||||
|
// The deferred artifact's delete-correctness coverage model needs to detect,
|
||||||
|
// cheaply (O(fragments), no row scan), that a covered fragment acquired new
|
||||||
|
// deletions. That hinges on Lance tracking deletions at fragment-metadata level.
|
||||||
|
// This pins that a delete populates `fragment.deletion_file`, and probes whether
|
||||||
|
// the deleted-row COUNT is available as metadata (`num_deleted_rows`) — the
|
||||||
|
// difference between an O(fragments) coverage check and an O(|E|) scan.
|
||||||
|
|
||||||
|
#[tokio::test]
|
||||||
|
async fn fragment_deletion_metadata_is_available() {
|
||||||
|
let dir = tempfile::tempdir().unwrap();
|
||||||
|
let uri = dir.path().join("guard13.lance");
|
||||||
|
let ds = fresh_dataset(uri.to_str().unwrap()).await; // 2 rows: alice, bob
|
||||||
|
|
||||||
|
let deleted: DeleteResult = {
|
||||||
|
let mut ds = ds;
|
||||||
|
ds.delete("id = 'alice'").await.unwrap()
|
||||||
|
};
|
||||||
|
assert_eq!(deleted.num_deleted_rows, 1, "one row deleted");
|
||||||
|
let ds = deleted.new_dataset;
|
||||||
|
|
||||||
|
// A delete must be tracked at fragment-metadata level (not only in data).
|
||||||
|
let with_deletion = ds
|
||||||
|
.fragments()
|
||||||
|
.iter()
|
||||||
|
.find(|f| f.deletion_file.is_some())
|
||||||
|
.expect(
|
||||||
|
"after a delete, some fragment must carry a deletion_file — if not, \
|
||||||
|
Lance changed deletion tracking; the artifact coverage model's \
|
||||||
|
cheap delete-detection assumption is invalid.",
|
||||||
|
);
|
||||||
|
|
||||||
|
// Probe: is the deleted-row count available as metadata (cheap), or must the
|
||||||
|
// deletion vector be read? Pin whichever holds so the artifact plan knows.
|
||||||
|
let count: Option<usize> = with_deletion
|
||||||
|
.deletion_file
|
||||||
|
.as_ref()
|
||||||
|
.and_then(|df| df.num_deleted_rows);
|
||||||
|
assert_eq!(
|
||||||
|
count,
|
||||||
|
Some(1),
|
||||||
|
"PROBE: deletion_file.num_deleted_rows is not a populated metadata count \
|
||||||
|
(got {count:?}); the artifact coverage model cannot cheaply detect \
|
||||||
|
per-fragment deletions and would need to read the deletion vector.",
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue