diff --git a/crates/omnigraph/src/exec/query.rs b/crates/omnigraph/src/exec/query.rs index 63343e7..5509b43 100644 --- a/crates/omnigraph/src/exec/query.rs +++ b/crates/omnigraph/src/exec/query.rs @@ -870,6 +870,27 @@ async fn execute_expand_indexed( }; let edge_table_key = format!("edge:{}", edge_type); let edge_ds = snapshot.open(&edge_table_key).await?; + + // C6 guard: surface the silent scalar-index fallback. If Lance won't route + // the per-hop `key_col IN (...)` through the BTREE (no index, or a fragment + // missing physical_rows), the scan is a correct but O(|E|) full scan. + // Detection-only (metadata, no IO); never fails the query. + match crate::table_store::TableStore::key_column_index_coverage(&edge_ds, key_col).await { + Ok(crate::table_store::IndexCoverage::Degraded { reason }) => tracing::warn!( + target: "omnigraph::traverse", + edge = %edge_type, + key_col = key_col, + reason = %reason, + "indexed traversal falls back to a full edge scan (results correct, perf degraded)" + ), + Ok(crate::table_store::IndexCoverage::Indexed) => {} + Err(e) => tracing::debug!( + target: "omnigraph::traverse", + error = %e, + "index-coverage check failed; proceeding with traversal" + ), + } + let max = max_hops.unwrap_or(min_hops.max(1)); // Per-source BFS state in string-id space (no dense interning needed). diff --git a/crates/omnigraph/src/table_store.rs b/crates/omnigraph/src/table_store.rs index 19f1ddf..ca2cbd0 100644 --- a/crates/omnigraph/src/table_store.rs +++ b/crates/omnigraph/src/table_store.rs @@ -43,6 +43,19 @@ pub struct DeleteState { pub(crate) version_metadata: TableVersionMetadata, } +/// Whether a `key_col IN (...)` scan on a dataset will be served by the +/// persisted scalar (BTREE) index, or silently fall back to a full filtered +/// scan. Detection-only (metadata, no IO); the scan returns the correct rows +/// either way. Surfaced by the indexed traversal path so the silent perf +/// fallback is observable, and available to a future cost-based planner. +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum IndexCoverage { + /// The column has a usable BTREE and every fragment records `physical_rows`. + Indexed, + /// Lance will not use the scalar index for this scan (correct, full scan). + Degraded { reason: String }, +} + /// A Lance write that has produced fragment files on object storage but is /// not yet committed to the dataset's manifest. The staged-write primitives /// are consumed by `MutationStaging` (`exec/staging.rs`, @@ -628,6 +641,50 @@ impl TableStore { .map_err(|e| OmniError::Lance(e.to_string())) } + /// Metadata-only check (no IO) of whether `scan_edges_by_endpoint` — a + /// `key_col IN (...)` filter — on `ds` will be served by the persisted BTREE + /// on `column`, or silently fall back to a full filtered scan. Mirrors + /// Lance's own decision: scalar indices are disabled for the whole scan if + /// ANY fragment lacks `physical_rows` (lance `dataset/scanner.rs` + /// `create_filter_plan`), and are obviously unused if no BTREE on the + /// column exists. The scan is correct (returns all rows) either way — this + /// only surfaces the perf cliff so the indexed traversal can warn on it. + pub async fn key_column_index_coverage(ds: &Dataset, column: &str) -> Result { + let Some(field_id) = ds.schema().field(column).map(|field| field.id) else { + return Ok(IndexCoverage::Degraded { + reason: format!("column '{}' not in schema", column), + }); + }; + let indices = ds + .load_indices() + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + let has_btree = indices + .iter() + .filter(|index| !is_system_index(index)) + .filter(|index| index.fields.len() == 1 && index.fields[0] == field_id) + .any(|index| { + index + .index_details + .as_ref() + .map(|details| details.type_url.ends_with("BTreeIndexDetails")) + .unwrap_or(false) + }); + if !has_btree { + return Ok(IndexCoverage::Degraded { + reason: format!("no BTREE index on '{}'", column), + }); + } + // Same check Lance runs: a fragment missing physical_rows disables + // scalar indices for the entire scan (all-or-nothing). + if ds.fragments().iter().any(|f| f.physical_rows.is_none()) { + return Ok(IndexCoverage::Degraded { + reason: "a fragment is missing physical_rows".to_string(), + }); + } + Ok(IndexCoverage::Indexed) + } + pub async fn count_rows(&self, ds: &Dataset, filter: Option) -> Result { ds.count_rows(filter) .await diff --git a/crates/omnigraph/tests/traversal_indexed.rs b/crates/omnigraph/tests/traversal_indexed.rs index 7808655..d7acb92 100644 --- a/crates/omnigraph/tests/traversal_indexed.rs +++ b/crates/omnigraph/tests/traversal_indexed.rs @@ -14,6 +14,7 @@ mod helpers; use arrow_array::{Array, StringArray}; use omnigraph::db::Omnigraph; +use omnigraph::table_store::{IndexCoverage, TableStore}; use omnigraph_compiler::ir::ParamMap; use serial_test::serial; @@ -61,6 +62,33 @@ async fn both_modes(db: &mut Omnigraph, queries: &str, name: &str, params: &Para indexed } +// The C6 index-coverage guard: `key_column_index_coverage` must report whether +// a `key_col IN (...)` scan will use the persisted BTREE or silently full-scan. +// Not #[serial] — it calls the helper directly and reads no env. +#[tokio::test] +async fn key_column_index_coverage_detects_btree_presence() { + let dir = tempfile::tempdir().unwrap(); + let db = init_and_load(&dir).await; + let snap = snapshot_main(&db).await.unwrap(); + + // Edge `src` gets a BTREE from ensure_indices on load → Indexed. + let edge_ds = snap.open("edge:Knows").await.unwrap(); + let src_cov = TableStore::key_column_index_coverage(&edge_ds, "src") + .await + .unwrap(); + assert_eq!(src_cov, IndexCoverage::Indexed, "edge src is BTREE-indexed"); + + // A node property column with no scalar index → Degraded (the warn path). + let node_ds = snap.open("node:Person").await.unwrap(); + let age_cov = TableStore::key_column_index_coverage(&node_ds, "age") + .await + .unwrap(); + assert!( + matches!(age_cov, IndexCoverage::Degraded { .. }), + "non-indexed column should be Degraded, got {age_cov:?}" + ); +} + #[tokio::test] #[serial] async fn indexed_matches_csr_one_hop_same_type() {