diff --git a/crates/omnigraph/src/db/omnigraph.rs b/crates/omnigraph/src/db/omnigraph.rs index 779a2e0..6c80117 100644 --- a/crates/omnigraph/src/db/omnigraph.rs +++ b/crates/omnigraph/src/db/omnigraph.rs @@ -16,7 +16,7 @@ use lance::dataset::scanner::ColumnOrdering; use lance::datatypes::BlobKind; use omnigraph_compiler::catalog::{Catalog, EdgeType, NodeType}; use omnigraph_compiler::schema::parser::parse_schema; -use omnigraph_compiler::types::ScalarType; +use omnigraph_compiler::types::{PropType, ScalarType}; use omnigraph_compiler::{ DropMode, SchemaIR, SchemaMigrationPlan, SchemaMigrationStep, SchemaTypeKind, build_catalog_from_ir, build_schema_ir, plan_schema_migration, diff --git a/crates/omnigraph/src/db/omnigraph/table_ops.rs b/crates/omnigraph/src/db/omnigraph/table_ops.rs index f7a365a..3f40c1d 100644 --- a/crates/omnigraph/src/db/omnigraph/table_ops.rs +++ b/crates/omnigraph/src/db/omnigraph/table_ops.rs @@ -310,6 +310,48 @@ pub(super) async fn ensure_indices_for_branch(db: &Omnigraph, branch: Option<&st Ok(()) } +/// The single scalar/vector index a node property receives from a one-column +/// `@index`/`@key` declaration, or `None` when the property type is not +/// indexable here (a list column or `Blob`). +/// +/// Shared by `build_indices_on_dataset_for_catalog` (which builds the index) +/// and `needs_index_work_node` (which checks coverage to decide recovery- +/// sidecar pinning) so the two cannot drift: an enum or orderable scalar the +/// builder gives a BTREE must also be reported as "needs work" until that +/// BTREE exists, or the HEAD-advancing build would run without sidecar cover. 
+#[derive(Clone, Copy, PartialEq, Eq, Debug)]
+enum NodePropIndexKind {
+    Btree,
+    Fts,
+    Vector,
+}
+
+fn node_prop_index_kind(prop_type: &PropType) -> Option<NodePropIndexKind> {
+    if prop_type.list {
+        return None;
+    }
+    // Enums are physically `String` but filtered by equality, so they take a
+    // scalar BTREE, not an FTS inverted index (Lance never consults an inverted
+    // index for `=`/range). Free-text Strings keep FTS for
+    // `search()`/`match_text`/`bm25`.
+    let is_enum = prop_type.enum_values.is_some();
+    match prop_type.scalar {
+        ScalarType::String if !is_enum => Some(NodePropIndexKind::Fts),
+        ScalarType::Vector(_) => Some(NodePropIndexKind::Vector),
+        ScalarType::String
+        | ScalarType::DateTime
+        | ScalarType::Date
+        | ScalarType::I32
+        | ScalarType::I64
+        | ScalarType::U32
+        | ScalarType::U64
+        | ScalarType::F32
+        | ScalarType::F64
+        | ScalarType::Bool => Some(NodePropIndexKind::Btree),
+        ScalarType::Blob => None,
+    }
+}
+
 /// Returns true if the node table is missing at least one declared
 /// scalar/vector index that `build_indices_on_dataset_for_catalog` would
 /// build AND has at least one row (the ensure_indices loop has
@@ -318,11 +360,12 @@ pub(super) async fn ensure_indices_for_branch(db: &Omnigraph, branch: Option<&st
 /// would force `NoMovement` classification on recovery and trigger the
 /// all-or-nothing rollback of sibling tables' legitimate index work).
 ///
-/// Per the actual `build_indices_on_dataset_for_catalog` implementation
-/// (this file, ~line 419-491), nodes get BTree (id) + per-prop FTS
-/// (@search String) + per-prop Vector indices; edges get BTree only
-/// (id, src, dst). The two helpers mirror that asymmetry — see the
-/// `needs_index_work_edge` doc comment.
+/// Per `build_indices_on_dataset_for_catalog`, nodes get BTree (id) plus, for +/// each one-column `@index`/`@key` property, the index `node_prop_index_kind` +/// assigns: a scalar BTREE for enums and orderable scalars +/// (DateTime/Date/numeric/Bool), FTS for free-text Strings, or a Vector index. +/// Edges get BTree only (id, src, dst). This helper and the builder share +/// `node_prop_index_kind` so they cannot drift — see its doc comment. async fn needs_index_work_node( db: &Omnigraph, type_name: &str, @@ -359,14 +402,23 @@ async fn needs_index_work_node( let Some(prop_type) = node_type.properties.get(prop_name) else { continue; }; - if matches!(prop_type.scalar, ScalarType::String) && !prop_type.list { - if !db.storage().has_fts_index(&ds, prop_name).await? { - return Ok(true); + match node_prop_index_kind(prop_type) { + Some(NodePropIndexKind::Fts) => { + if !db.storage().has_fts_index(&ds, prop_name).await? { + return Ok(true); + } } - } else if matches!(prop_type.scalar, ScalarType::Vector(_)) && !prop_type.list { - if !db.storage().has_vector_index(&ds, prop_name).await? { - return Ok(true); + Some(NodePropIndexKind::Vector) => { + if !db.storage().has_vector_index(&ds, prop_name).await? { + return Ok(true); + } } + Some(NodePropIndexKind::Btree) => { + if !db.storage().has_btree_index(&ds, prop_name).await? { + return Ok(true); + } + } + None => {} } } Ok(false) @@ -615,30 +667,44 @@ pub(super) async fn build_indices_on_dataset_for_catalog( } let prop_name = &index_cols[0]; if let Some(prop_type) = node_type.properties.get(prop_name) { - if matches!(prop_type.scalar, ScalarType::String) && !prop_type.list { - if !db.storage().has_fts_index(ds, prop_name).await? { - stage_and_commit_inverted(db, table_key, ds, prop_name.as_str()) - .await?; + match node_prop_index_kind(prop_type) { + Some(NodePropIndexKind::Fts) => { + if !db.storage().has_fts_index(ds, prop_name).await? 
{ + stage_and_commit_inverted(db, table_key, ds, prop_name.as_str()) + .await?; + } } - } else if matches!(prop_type.scalar, ScalarType::Vector(_)) && !prop_type.list { - if !db.storage().has_vector_index(ds, prop_name).await? { - // Inline-commit residual: lance-6.0.1 does not - // expose `build_index_metadata_from_segments` as - // `pub`, so vector indices cannot be staged from - // outside the lance crate. Document at the call - // site; companion ticket to lance-format/lance#6658. - let new_snap = db - .storage_inline_residual() - .create_vector_index(ds.clone(), prop_name.as_str()) - .await - .map_err(|e| { - OmniError::Lance(format!( - "create Vector index on {}({}): {}", - table_key, prop_name, e - )) - })?; - *ds = new_snap; + Some(NodePropIndexKind::Vector) => { + if !db.storage().has_vector_index(ds, prop_name).await? { + // Inline-commit residual: lance-6.0.1 does not + // expose `build_index_metadata_from_segments` as + // `pub`, so vector indices cannot be staged from + // outside the lance crate. Document at the call + // site; companion ticket to lance-format/lance#6658. + let new_snap = db + .storage_inline_residual() + .create_vector_index(ds.clone(), prop_name.as_str()) + .await + .map_err(|e| { + OmniError::Lance(format!( + "create Vector index on {}({}): {}", + table_key, prop_name, e + )) + })?; + *ds = new_snap; + } } + // Enum + orderable scalars (DateTime/Date/numeric/Bool) + // get a BTREE so `=`, range, IN, and IS NULL are index- + // accelerated instead of degrading to a full scan. + Some(NodePropIndexKind::Btree) => { + if !db.storage().has_btree_index(ds, prop_name).await? { + stage_and_commit_btree(db, table_key, ds, &[prop_name.as_str()]) + .await?; + } + } + // List or Blob column: not indexable as a scalar here. 
+ None => {} } } } diff --git a/crates/omnigraph/tests/scalar_indexes.rs b/crates/omnigraph/tests/scalar_indexes.rs new file mode 100644 index 0000000..8d8a3f0 --- /dev/null +++ b/crates/omnigraph/tests/scalar_indexes.rs @@ -0,0 +1,74 @@ +//! Coverage for `build_indices_on_dataset_for_catalog`'s per-property index +//! dispatch: which scalar/vector index each `@index`/`@key` column gets. +//! +//! The observable signal is `TableStore::key_column_index_coverage`, which +//! reports `Indexed` only when a BTREE covers the column (the same helper the +//! traversal chooser uses). Enums and orderable scalars must get a BTREE so +//! `=`/range/IN/IS NULL are index-accelerated; free-text Strings keep FTS +//! (which `key_column_index_coverage` does not count as a BTREE, by design). + +mod helpers; + +use omnigraph::db::Omnigraph; +use omnigraph::loader::{LoadMode, load_jsonl}; +use omnigraph::table_store::{IndexCoverage, TableStore}; + +use helpers::*; + +const SCHEMA: &str = r#" +node Item { + slug: String @key + status: enum(active, archived) @index + published: DateTime @index + rank: I32 @index + title: String @index + note: String? +} +"#; + +const DATA: &str = r#"{"type":"Item","data":{"slug":"a","status":"active","published":"2024-06-01T00:00:00Z","rank":1,"title":"alpha","note":"n1"}} +{"type":"Item","data":{"slug":"b","status":"archived","published":"2023-01-01T00:00:00Z","rank":2,"title":"beta","note":"n2"}} +{"type":"Item","data":{"slug":"c","status":"active","published":"2025-02-02T00:00:00Z","rank":3,"title":"gamma","note":"n3"}}"#; + +// Enums and orderable scalars (DateTime, numeric) get a BTREE from load's +// build-indices pass, so a `=`/range filter on them uses the index. Free-text +// String `@index` keeps FTS (no BTREE), and an un-annotated column has no +// scalar index — both report `Degraded`, which is the negative control that +// keeps this test from being vacuously green. 
+#[tokio::test] +async fn node_scalar_and_enum_index_columns_get_btree() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, SCHEMA).await.unwrap(); + load_jsonl(&mut db, DATA, LoadMode::Overwrite).await.unwrap(); + + let snap = snapshot_main(&db).await.unwrap(); + let ds = snap.open("node:Item").await.unwrap(); + + for col in ["status", "published", "rank"] { + let cov = TableStore::key_column_index_coverage(&ds, col).await.unwrap(); + assert_eq!( + cov, + IndexCoverage::Indexed, + "column '{col}' (enum/DateTime/numeric @index) must get a BTREE, got {cov:?}" + ); + } + + // Free-text String @index -> FTS, which is not a BTREE -> Degraded. + let title_cov = TableStore::key_column_index_coverage(&ds, "title") + .await + .unwrap(); + assert!( + matches!(title_cov, IndexCoverage::Degraded { .. }), + "free-text String @index should keep FTS (no BTREE), got {title_cov:?}" + ); + + // No @index annotation -> no scalar index at all -> Degraded. + let note_cov = TableStore::key_column_index_coverage(&ds, "note") + .await + .unwrap(); + assert!( + matches!(note_cov, IndexCoverage::Degraded { .. 
}), + "un-annotated column should have no scalar index, got {note_cov:?}" + ); +} diff --git a/docs/user/indexes.md b/docs/user/indexes.md index df898c4..734d6a9 100644 --- a/docs/user/indexes.md +++ b/docs/user/indexes.md @@ -4,10 +4,27 @@ | Index | Use | Notes | |---|---|---| -| **BTREE scalar** | range / equality on any scalar | created on `@key`, `@index(...)`, and on key columns by `ensure_indices()` | -| **Inverted (FTS)** | `search`, `fuzzy`, `match_text`, `bm25` | created on text columns referenced by FTS queries | +| **BTREE scalar** | `=` / range / `IN` / `IS NULL` on a scalar | always on the node `id` and edge `id`/`src`/`dst`; and on each one-column `@index`/`@key` property that is an **enum** or an **orderable scalar** (`DateTime`/`Date`/`I32`/`I64`/`U32`/`U64`/`F32`/`F64`/`Bool`) | +| **Inverted (FTS)** | `search`, `fuzzy`, `match_text`, `bm25` | created on **free-text** (non-enum) `String` `@index`/`@key` columns | | **Vector** | `nearest()` k-NN | Lance picks IVF_PQ vs HNSW family by configuration; OmniGraph stores as FixedSizeList(Float32, dim) | +The per-property index a column gets is decided by `node_prop_index_kind` (shared +by the builder and the sidecar-pinning coverage check so they cannot drift): +enums and orderable scalars → BTREE, free-text Strings → FTS, `Vector` → vector, +list/`Blob` columns → none. + +> **Free-text Strings are not equality-indexed.** A non-enum `String` column +> (including a `String @key` slug) gets an FTS inverted index, which Lance does +> **not** consult for `=`/range — only for `search`/`match_text`/`bm25`. So an +> equality filter on a free-text String falls back to a full scan. If you filter +> a String identifier by equality on a large table, model it so the value is the +> node id; a BTREE on such columns is not built today (possible follow-up). + +> **Coverage and cost.** Each indexed column adds index files and build time, and +> an index only covers the fragments it was built over. 
Rows appended after the +> index was built (e.g. by `ingest --mode merge`) are scanned unindexed until a +> reindex extends coverage; see [maintenance](maintenance.md) → `optimize`. + ## L2 — OmniGraph orchestration - `ensure_indices()` / `ensure_indices_on(branch)` — idempotent build of BTREE + inverted indexes for the current head; safe to re-run.