mirror of
https://github.com/ModernRelay/omnigraph.git
synced 2026-06-21 02:28:07 +02:00
test(engine): pin Lance scalar-index coverage + system-column/deletion-metadata surface
Add three Lance surface guards de-risking a future persisted-adjacency cache: - a compile-only guard pinning the fragment physical_rows + index-detail surface that key_column_index_coverage mirrors (the C6 fallback); - a runtime probe confirming a scalar BTREE on the system column _row_last_updated_at_version is not buildable via the normal create-index path (the column is not in the user schema), so a version-column range delta is not viable as drafted; - a runtime probe confirming per-fragment deletion metadata (deletion_file.num_deleted_rows) is available as cheap O(fragments) metadata, the primitive a fragment-coverage delete model would rely on. The probes turn the two largest substrate assumptions into green/red CI facts before any cache work begins.
This commit is contained in:
parent
787d41ec5f
commit
505303f93e
1 changed files with 130 additions and 0 deletions
|
|
@ -32,7 +32,10 @@ use lance::dataset::builder::DatasetBuilder;
|
|||
use lance::dataset::optimize::{CompactionOptions, compact_files};
|
||||
use lance::dataset::write::delete::DeleteResult;
|
||||
use lance::dataset::{MergeInsertBuilder, WhenMatched, WhenNotMatched, WriteMode, WriteParams};
|
||||
use lance::index::DatasetIndexExt;
|
||||
use lance_file::version::LanceFileVersion;
|
||||
use lance_index::IndexType;
|
||||
use lance_index::scalar::ScalarIndexParams;
|
||||
use lance_namespace::LanceNamespace;
|
||||
use lance_table::io::commit::ManifestNamingScheme;
|
||||
|
||||
|
|
@ -375,3 +378,130 @@ async fn compact_files_still_fails_on_blob_columns() {
|
|||
shifted): {err}"
|
||||
);
|
||||
}
|
||||
|
||||
// --- Guard 11: scalar-index coverage surface (physical_rows + index details) ---
|
||||
//
|
||||
// `table_store.rs::key_column_index_coverage` mirrors Lance's `create_filter_plan`
|
||||
// C6 fallback: it reads `fragment.physical_rows` (the field whose absence on ANY
|
||||
// fragment disables the scalar index for the whole scan) and sniffs the BTREE via
|
||||
// `load_indices()` → `index.fields` / `index.index_details.type_url`. This is the
|
||||
// one real Lance-internal coupling on the indexed-traversal read path. If any of
|
||||
// these surfaces renames or changes type, the coverage check (and the cost-based
|
||||
// traversal chooser that consumes it) silently misclassifies. Compile-only.
|
||||
|
||||
#[allow(
|
||||
dead_code,
|
||||
unreachable_code,
|
||||
unused_variables,
|
||||
unused_mut,
|
||||
clippy::diverging_sub_expression
|
||||
)]
|
||||
async fn _compile_scalar_index_coverage_surface() -> lance::Result<()> {
|
||||
let ds: Dataset = unimplemented!();
|
||||
// The create_filter_plan coupling: a fragment lacking `physical_rows`
|
||||
// disables the scalar index for the entire scan.
|
||||
for frag in ds.fragments().iter() {
|
||||
let _physical_rows: Option<usize> = frag.physical_rows;
|
||||
}
|
||||
// The index sniff: BTREE presence is detected by single-field index whose
|
||||
// details type_url ends with "BTreeIndexDetails".
|
||||
let indices = ds.load_indices().await?;
|
||||
for index in indices.iter() {
|
||||
let _fields: &Vec<i32> = &index.fields;
|
||||
if let Some(details) = index.index_details.as_ref() {
|
||||
let _type_url: &str = details.type_url.as_str();
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
// --- Guard 12: can a scalar BTREE be built on a system version column? --------
|
||||
//
|
||||
// The deferred persisted-adjacency artifact plan assumed a cheap delta read of
|
||||
// `_row_last_updated_at_version > V` could be a BTREE range lookup. Lance resolves
|
||||
// index columns from the dataset schema, and the version columns are system
|
||||
// metadata — so this probe documents whether the assumption holds. The outcome is
|
||||
// the load-bearing fact, not a pass/fail of intent: if this starts SUCCEEDING when
|
||||
// it currently errors (or vice versa), the artifact's delta-cost story changes.
|
||||
|
||||
#[tokio::test]
|
||||
async fn scalar_index_on_system_version_column_probe() {
|
||||
let dir = tempfile::tempdir().unwrap();
|
||||
let uri = dir.path().join("guard12.lance");
|
||||
let mut ds = fresh_dataset(uri.to_str().unwrap()).await;
|
||||
|
||||
// Sanity: the system version column is present (stable row ids + V2_2).
|
||||
assert!(
|
||||
ds.schema().field("_row_last_updated_at_version").is_none(),
|
||||
"PROBE NOTE: `_row_last_updated_at_version` is NOT in the user schema \
|
||||
(it is system metadata); indexing it resolves through a different path."
|
||||
);
|
||||
|
||||
let result = ds
|
||||
.create_index_builder(
|
||||
&["_row_last_updated_at_version"],
|
||||
IndexType::BTree,
|
||||
&ScalarIndexParams::default(),
|
||||
)
|
||||
.replace(true)
|
||||
.await;
|
||||
|
||||
// Pin the observed behavior: a scalar index on the system version column is
|
||||
// NOT buildable via the normal create-index path in this Lance. If this turns
|
||||
// green (Ok), the artifact delta CAN use a version-column BTREE — revisit the
|
||||
// deferred plan's Phase-2 delta-cost note in docs/dev/traversal handoff.
|
||||
assert!(
|
||||
result.is_err(),
|
||||
"create_index on `_row_last_updated_at_version` unexpectedly SUCCEEDED — \
|
||||
a system-column scalar index is now buildable; the persisted-artifact \
|
||||
delta read could use it. Update the deferred-design notes."
|
||||
);
|
||||
}
|
||||
|
||||
// --- Guard 13: per-fragment deletion metadata is exposed without a scan -------
|
||||
//
|
||||
// The deferred artifact's delete-correctness coverage model needs to detect,
|
||||
// cheaply (O(fragments), no row scan), that a covered fragment acquired new
|
||||
// deletions. That hinges on Lance tracking deletions at fragment-metadata level.
|
||||
// This pins that a delete populates `fragment.deletion_file`, and probes whether
|
||||
// the deleted-row COUNT is available as metadata (`num_deleted_rows`) — the
|
||||
// difference between an O(fragments) coverage check and an O(|E|) scan.
|
||||
|
||||
#[tokio::test]
|
||||
async fn fragment_deletion_metadata_is_available() {
|
||||
let dir = tempfile::tempdir().unwrap();
|
||||
let uri = dir.path().join("guard13.lance");
|
||||
let ds = fresh_dataset(uri.to_str().unwrap()).await; // 2 rows: alice, bob
|
||||
|
||||
let deleted: DeleteResult = {
|
||||
let mut ds = ds;
|
||||
ds.delete("id = 'alice'").await.unwrap()
|
||||
};
|
||||
assert_eq!(deleted.num_deleted_rows, 1, "one row deleted");
|
||||
let ds = deleted.new_dataset;
|
||||
|
||||
// A delete must be tracked at fragment-metadata level (not only in data).
|
||||
let with_deletion = ds
|
||||
.fragments()
|
||||
.iter()
|
||||
.find(|f| f.deletion_file.is_some())
|
||||
.expect(
|
||||
"after a delete, some fragment must carry a deletion_file — if not, \
|
||||
Lance changed deletion tracking; the artifact coverage model's \
|
||||
cheap delete-detection assumption is invalid.",
|
||||
);
|
||||
|
||||
// Probe: is the deleted-row count available as metadata (cheap), or must the
|
||||
// deletion vector be read? Pin whichever holds so the artifact plan knows.
|
||||
let count: Option<usize> = with_deletion
|
||||
.deletion_file
|
||||
.as_ref()
|
||||
.and_then(|df| df.num_deleted_rows);
|
||||
assert_eq!(
|
||||
count,
|
||||
Some(1),
|
||||
"PROBE: deletion_file.num_deleted_rows is not a populated metadata count \
|
||||
(got {count:?}); the artifact coverage model cannot cheaply detect \
|
||||
per-fragment deletions and would need to read the deletion vector.",
|
||||
);
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue