diff --git a/crates/omnigraph/src/db/manifest/tests.rs b/crates/omnigraph/src/db/manifest/tests.rs index b396866..0e00505 100644 --- a/crates/omnigraph/src/db/manifest/tests.rs +++ b/crates/omnigraph/src/db/manifest/tests.rs @@ -337,12 +337,23 @@ async fn test_directory_namespace_direct_publish_cannot_replace_native_omnigraph .unwrap(); // Lance 7: the native `DirectoryNamespace` no longer recognizes omnigraph's - // manifest-tracked tables. `check_table_status` reports the table absent - // (`lance-namespace-impls` dir.rs), so list / describe / create_table_version - // all return `TableNotFound`. The native path is fully decoupled from - // omnigraph's manifest authority: it cannot enumerate, inspect, or publish - // over omnigraph's tables. (Pre-v7 the native namespace could still *see* the - // published version but never replace the write path; v7 only widens the gap.) + // manifest-tracked tables, so list / describe / create_table_version all + // return `TableNotFound`. The mechanism is *contingent on omnigraph's legacy + // boolean PK key*, not an unconditional v7 property: v7's namespace eagerly + // rewrites any `__manifest` whose `object_id` lacks the new + // `lance-schema:unenforced-primary-key:position` key, omnigraph declares the + // PK with the legacy boolean key, and v7 forbids changing a PK once set — so + // `ensure_manifest_table_up_to_date` errors, the namespace silently falls + // back to directory listing (disabled here), and `check_table_status` reports + // the table absent. omnigraph keeps the boolean key deliberately: Lance + // honors it permanently (it maps to PK position 0) and one uniform on-disk + // format beats a new-vs-old split, since existing graphs can't be re-keyed to + // the position key under that same immutability rule. The decoupling is + // therefore an accepted, production-irrelevant tradeoff (omnigraph never uses + // the native namespace — its publisher writes `__manifest` via merge_insert + // and its reads go through its own `LanceNamespace` impls), and it only + // strengthens this guard's thesis: native tooling cannot enumerate, inspect, + // or publish over omnigraph's tables, let alone replace the write path. let assert_table_not_found = |what: &str, dbg: String| { assert!( dbg.contains("TableNotFound") && dbg.contains("node:Person"), diff --git a/crates/omnigraph/tests/lance_surface_guards.rs b/crates/omnigraph/tests/lance_surface_guards.rs index d473142..9a0c6bd 100644 --- a/crates/omnigraph/tests/lance_surface_guards.rs +++ b/crates/omnigraph/tests/lance_surface_guards.rs @@ -918,3 +918,76 @@ async fn skip_auto_cleanup_suppresses_version_gc() { __manifest-pinned versions on upgraded (pre-bump) graphs." ); } + +// --- Guard 19: unenforced primary key is immutable once set (lance v7) ------ +// +// Lance 7 (`lance::dataset::transaction`) makes the unenforced PK reserved: +// once `lance-schema:unenforced-primary-key` is set on a field, any later write +// that touches that reserved key — even re-applying the SAME value — errors +// "the unenforced primary key is a reserved key and cannot be changed once set". +// +// This is the upstream behavior that broke +// `db/manifest/migrations.rs::migrate_v1_to_v2`'s crash-idempotency: a +// pre-v0.4.0 graph that crashed after the field-set but before the stamp bump +// re-enters the migration with the PK already present, and on Lance 6 the +// re-apply was a no-op. The migration now guards the set on the manifest's +// unenforced-PK field (`["object_id"]` → no-op, `[]` → set, anything else → +// loud refusal). If Lance ever relaxes immutability (a re-set becomes a no-op +// again), this guard goes red — revisit whether that field-guard is still +// needed, and re-pin docs/dev/lance.md. +#[tokio::test] +async fn unenforced_primary_key_is_immutable_once_set() { + use lance::datatypes::LANCE_UNENFORCED_PRIMARY_KEY; + + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().join("g19.lance"); + let mut ds = fresh_dataset(uri.to_str().unwrap()).await; + + // Precondition: no unenforced PK yet (mirrors a genuine pre-v0.4.0 manifest). + assert!( + ds.schema().unenforced_primary_key().is_empty(), + "fresh dataset should carry no unenforced primary key" + ); + + // First set succeeds — the genuine pre-v0.4.0 migration path. (Discard the + // returned &Schema so the &mut borrow ends before the next call.) + ds.update_field_metadata() + .update( + "id", + [(LANCE_UNENFORCED_PRIMARY_KEY.to_string(), "true".to_string())], + ) + .unwrap() + .await + .unwrap(); + let pk: Vec = ds + .schema() + .unenforced_primary_key() + .iter() + .map(|field| field.name.clone()) + .collect(); + assert_eq!( + pk, + ["id"], + "first set should install `id` as the unenforced PK" + ); + + // Re-applying the SAME reserved key must still error. Normalize the sync + // validation stage (`.update()`) and the async commit stage (`.await`) into + // one Result so the actionable diagnostic below fires whichever stage Lance + // enforces immutability at — and even if a future Lance relaxes it to `Ok`. + // Bare `.unwrap()` / `.unwrap_err()` would instead panic with a generic + // message in those cases, defeating the guard's purpose. + let outcome: lance::Result<()> = match ds.update_field_metadata().update( + "id", + [(LANCE_UNENFORCED_PRIMARY_KEY.to_string(), "true".to_string())], + ) { + Ok(builder) => builder.await.map(|_| ()), + Err(e) => Err(e), + }; + assert!( + matches!(&outcome, Err(e) if e.to_string().contains("cannot be changed once set")), + "Lance no longer rejects re-setting the unenforced PK as immutable \ + (got: {outcome:?}); immutability relaxed or moved off the commit path \ + — revisit migrate_v1_to_v2's field-guard and re-pin docs/dev/lance.md." + ); +} diff --git a/docs/dev/lance.md b/docs/dev/lance.md index daa0435..4c624b3 100644 --- a/docs/dev/lance.md +++ b/docs/dev/lance.md @@ -166,8 +166,8 @@ Migration from Lance 6.0.1 → 7.0.0 landed in this cycle. **Arrow stayed 58, Da - **BTREE range-query bound inclusiveness fixed** (PR #6796, issue #6792): `x <= hi AND x > lo` returned the wrong boundary row on 6.0.1. omnigraph today builds BTREE only on string `@key` columns (`id`/`src`/`dst`) and queries them by equality/IN, not range, so its *current* query patterns almost certainly never hit this bug — but the corrected boundary semantics are a contract we rely on the moment a BTREE-range path appears (BTREE-on-properties via the index-type tickets, or a range-on-key query). Pinned by `lance_surface_guards.rs::btree_range_query_boundary_is_correct` (reproduces #6792's 5-row + BTREE shape). - **`WriteParams::auto_cleanup` default flipped from on (every-20-commits) to `None`** (PR #6755). On 6.0.1 the on-by-default hook could GC versions the `__manifest` pins for snapshots/time-travel. omnigraph owns cleanup explicitly (`optimize.rs::cleanup_all_tables`). Two parts to the fix, because `auto_cleanup` is **create-time config only and has no effect on existing datasets** (Lance `write.rs` docs): (1) `auto_cleanup: None` at all 11 `WriteParams` sites so *new* datasets store no cleanup config; (2) — the load-bearing half — `skip_auto_cleanup: true` on every commit path, because graphs created **before** the bump still carry the on-config in their datasets, and Lance's hook fires off the *dataset's stored* config at commit time (`io/commit.rs`: `if !commit_config.skip_auto_cleanup`). So the staged commit path (`commit_staged` → `CommitBuilder::with_skip_auto_cleanup(true)`), the `__manifest` publisher (`MergeInsertBuilder::skip_auto_cleanup(true)`), and the direct `WriteParams` paths all skip the hook. Without this, an upgraded graph would still auto-cleanup and delete `__manifest`-pinned versions. Pinned by `lance_surface_guards.rs::skip_auto_cleanup_suppresses_version_gc` (negative control + with-skip survival). - **Lance #6658 SHIPPED in 7.0.0** (`DeleteBuilder::execute_uncommitted`, exposed via PR #6781) → MR-A (migrate `delete_where` to the staged two-phase API, retire the parse-time D2 rule) is now **unblocked**, tracked separately (dev-graph `iss-950`). The bump itself keeps `delete_where` inline; the `_compile_delete_result_field_shape` guard is left untouched until MR-A. -- **The unenforced primary key is now immutable once set** (`lance::dataset::transaction`, ~L2472–2480: `if !primary_key_before.is_empty() && (writes_primary_key || primary_key_after != primary_key_before) → "the unenforced primary key is a reserved key and cannot be changed once set"`). omnigraph marks `__manifest.object_id` as the unenforced PK (`lance-schema:unenforced-primary-key`) for merge-insert row-level CAS — baked into `manifest_schema()` at init, and added by the `migrate_v1_to_v2` internal-schema migration for pre-v0.4.0 graphs. The migration relied on Lance 6's idempotent re-apply for crash-recovery (a crash after the field-set but before the stamp bump re-enters the migration with the PK already present); under v7 that re-apply errors, so a real v1 graph could never finish migrating. Fixed by guarding the set with `schema().unenforced_primary_key().is_empty()` (`db/manifest/migrations.rs`). Regression: `db::manifest::tests::test_publish_migrates_pre_stamp_manifest_to_current_version` (was red under v7). -- **Native `DirectoryNamespace` no longer recognizes omnigraph's manifest-tracked tables** (`lance-namespace-impls` dir.rs ~L1310): `list/describe/create_table_version` route through `check_table_status`, which reports an omnigraph table absent → `TableNotFound`. omnigraph production never uses Lance's native namespace (its publisher writes `__manifest` directly via merge_insert; its own `namespace.rs` impls are custom), so this is test-only — the `test_directory_namespace_direct_publish_cannot_replace_native_omnigraph_write_path` surface guard was realigned to the v7 behavior (it now asserts the native namespace is fully decoupled, which only strengthens the guard's thesis). +- **The unenforced primary key is now immutable once set** (`lance::dataset::transaction`, ~L2472–2480: `if !primary_key_before.is_empty() && (writes_primary_key || primary_key_after != primary_key_before) → "the unenforced primary key is a reserved key and cannot be changed once set"`). omnigraph marks `__manifest.object_id` as the unenforced PK (`lance-schema:unenforced-primary-key`) for merge-insert row-level CAS — baked into `manifest_schema()` at init, and added by the `migrate_v1_to_v2` internal-schema migration for pre-v0.4.0 graphs. The migration relied on Lance 6's idempotent re-apply for crash-recovery (a crash after the field-set but before the stamp bump re-enters the migration with the PK already present); under v7 that re-apply errors, so a real v1 graph could never finish migrating. Fixed by guarding the set on the manifest's unenforced-PK field (`db/manifest/migrations.rs::migrate_v1_to_v2`): `["object_id"]` → no-op, `[]` → set, any other PK field → loud refusal (the wrong CAS key, unchangeable under v7). Pinned by `lance_surface_guards.rs::unenforced_primary_key_is_immutable_once_set` (red if Lance relaxes immutability); regression: `db::manifest::tests::test_publish_migrates_pre_stamp_manifest_to_current_version` (was red under v7). +- **Native `DirectoryNamespace` no longer recognizes omnigraph's manifest-tracked tables** (`lance-namespace-impls` dir.rs ~L1310): `list/describe/create_table_version` route through `check_table_status`, which reports an omnigraph table absent → `TableNotFound`. The decoupling is *contingent on omnigraph's legacy boolean PK key*, not an unconditional v7 property: v7's namespace eagerly adds the new `lance-schema:unenforced-primary-key:position` key to any `__manifest` lacking it; that write hits the immutable-PK rule above (the boolean key already set the PK), so `ensure_manifest_table_up_to_date` errors and the namespace silently falls back to directory listing. omnigraph keeps the boolean key deliberately — Lance honors it permanently (maps to PK position 0), and one uniform on-disk format beats a new-vs-old split (existing graphs can't be re-keyed to the position key under that same immutability rule). omnigraph production never uses Lance's native namespace (its publisher writes `__manifest` directly via merge_insert; its own `namespace.rs` impls are custom), so this is test-only — the `test_directory_namespace_direct_publish_cannot_replace_native_omnigraph_write_path` surface guard was realigned to the v7 behavior (it now asserts the native namespace is fully decoupled, which only strengthens the guard's thesis). - **Still NOT fixed in 7.0.0:** vector-index two-phase (Lance #6666 open) — `create_vector_index` inline residual retained; blob-column compaction — `compact_files_still_fails_on_blob_columns` guard still red on a fix, `optimize` still skips blob tables behind `LANCE_SUPPORTS_BLOB_COMPACTION`. - **No Lance API surface omnigraph uses changed at *compile* time** (the only compile break was object_store) — but **two runtime behaviors did** (the unenforced-PK immutability and the native-namespace `TableNotFound`, above), each caught by the full engine test suite rather than the build. `CleanupPolicy`, `WriteParams` (apart from the `auto_cleanup` default), `CompactionOptions`, the namespace models (resolved via `lance-namespace-reqwest-client` 0.7.7, unchanged across the bump), `Operation`, `ManifestLocation`, and `MergeInsertBuilder` shapes are all stable. Lesson: a clean build is not a clean alignment — run `cargo test --workspace` before declaring a Lance bump done.