From 2d34d7c4323d8a6bc70ad936728b8fbded815803 Mon Sep 17 00:00:00 2001 From: Andrew Altshuler Date: Fri, 19 Jun 2026 21:06:13 +0300 Subject: [PATCH 1/8] =?UTF-8?q?test(engine):=20pin=20camelCase=20@index=20?= =?UTF-8?q?=E2=86=92=20scalar-index=20routing=20(#283=20follow-up)=20(#286?= =?UTF-8?q?)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Greptile P2 on #285: the #283 tests prove correctness (right rows, type-level coercion) but not that a camelCase @index equality actually reaches the scalar-index path — a result-only test passes on a silent full-scan fallback, exactly the gap testing.md warns about and the bug-case-fix.md validation checklist (step 5) promised to close. Add lance-surface Guard 20 (mirrors Guard 16): build a BTREE on a camelCase column and assert the scan plan contains `ScalarIndexQuery` under the fix's case-preserving `ident()` expr, and that the pre-fix `col()` expr fails to plan (it normalizes `repoName` → a nonexistent `reponame`). A regression that breaks camelCase index routing — or reverts to `col()` — turns this red instead of degrading to a full scan. The existing e2e (`camelcase_property_filter_executes`) already guards the engine call-site (a `col()` revert errors there). Claude-Session: https://claude.ai/code/session_01FQ1Hf4eXLsJmeLUkTYBEw7 Co-authored-by: Claude Opus 4.8 (1M context) --- .../omnigraph/tests/lance_surface_guards.rs | 77 +++++++++++++++++++ 1 file changed, 77 insertions(+) diff --git a/crates/omnigraph/tests/lance_surface_guards.rs b/crates/omnigraph/tests/lance_surface_guards.rs index 9a0c6bd..6e7b891 100644 --- a/crates/omnigraph/tests/lance_surface_guards.rs +++ b/crates/omnigraph/tests/lance_surface_guards.rs @@ -991,3 +991,80 @@ async fn unenforced_primary_key_is_immutable_once_set() { — revisit migrate_v1_to_v2's field-guard and re-pin docs/dev/lance.md." 
); } + +// --- Guard 20: camelCase @index equality routes to the scalar index (#283) ---- +// +// The #283 read-pushdown fix builds the filter column with datafusion `ident()` +// (case-preserving) instead of `col()` (SQL identifier normalization, which +// lowercases an unquoted name). The correctness tests in literal_filters.rs / +// writes.rs prove the right rows come back, but a result-only assertion also +// passes on a full-scan fallback — exactly the gap testing.md warns about. This +// guard pins the *plan*: an equality on a camelCase BTREE column must compile to +// a `ScalarIndexQuery` under the fix's expr shape, and must NOT under the old +// `col()` shape (which lowercases `repoName` → a nonexistent `reponame`). A +// regression that breaks camelCase index routing — or a revert to `col()` — +// turns this red instead of silently degrading to a full scan. +#[tokio::test] +async fn camelcase_index_equality_routes_to_scalar_index() { + use datafusion::physical_plan::displayable; + use datafusion::prelude::{col, ident, lit}; + + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().join("camelcase_index.lance"); + let uri = uri.to_str().unwrap(); + + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Utf8, false), + Field::new("repoName", DataType::Utf8, false), + ])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(StringArray::from(vec!["a", "b", "c", "d"])), + Arc::new(StringArray::from(vec![ + "acme", "globex", "initech", "umbrella", + ])), + ], + ) + .unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema); + let params = WriteParams { + mode: WriteMode::Create, + enable_stable_row_ids: true, + data_storage_version: Some(LanceFileVersion::V2_2), + ..Default::default() + }; + let mut ds = Dataset::write(reader, uri, Some(params)).await.unwrap(); + ds.create_index_builder(&["repoName"], IndexType::BTree, &ScalarIndexParams::default()) + .replace(true) + .await + .unwrap(); + + async 
fn plan_str(ds: &Dataset, filter: datafusion::prelude::Expr) -> lance::Result<String> { + let mut scanner = ds.scan(); + scanner.filter_expr(filter); + let plan = scanner.create_plan().await?; + Ok(format!("{}", displayable(plan.as_ref()).indent(true))) + } + + // The fix's shape: ident() preserves case → resolves `repoName` → index. + let used = plan_str(&ds, ident("repoName").eq(lit("acme"))) + .await + .expect("ident(\"repoName\") must plan against the case-preserved schema"); + assert!( + used.contains("ScalarIndexQuery"), + "camelCase @index equality must route to the scalar index (not full scan), got:\n{used}" + ); + + // The pre-fix shape: col() normalizes `repoName` → `reponame`, which does not + // exist in the case-sensitive schema, so planning fails. This is precisely + // why `col()` could never reach the index and surfaced the #283 runtime error + // — it could not silently full-scan past the index either. + let err = plan_str(&ds, col("repoName").eq(lit("acme"))).await; + assert!( + err.is_err(), + "col() lowercases repoName→reponame against a case-sensitive schema; \ + planning must fail rather than resolve, confirming ident() is required \ + for camelCase index routing. got plan:\n{err:?}" + ); +} From b38b36e48f8103d64de18b9039ded3ebcee76a82 Mon Sep 17 00:00:00 2001 From: Andrew Altshuler Date: Fri, 19 Jun 2026 23:12:44 +0300 Subject: [PATCH 2/8] release: v0.7.1 (#290) Patch release over v0.7.0: three correctness fixes (#283 camelCase filters, #284 cluster-apply crash-loop, #277 branch-merge OOM on embedding tables), the #280 queries-list catalog-metadata improvement, and the #268 warm-read perf fix. No breaking changes, no on-disk format change, no migration. Version coherence: all 7 crate manifests + path-dep constraints, Cargo.lock, openapi.json, and AGENTS.md surveyed version bumped 0.7.0 -> 0.7.1. Full workspace gate green (1472 tests). 
Claude-Session: https://claude.ai/code/session_01FQ1Hf4eXLsJmeLUkTYBEw7 Co-authored-by: Claude Opus 4.8 (1M context) --- AGENTS.md | 2 +- Cargo.lock | 14 +++--- crates/omnigraph-api-types/Cargo.toml | 6 +-- crates/omnigraph-cli/Cargo.toml | 14 +++--- crates/omnigraph-cluster/Cargo.toml | 6 +-- crates/omnigraph-compiler/Cargo.toml | 2 +- crates/omnigraph-policy/Cargo.toml | 2 +- crates/omnigraph-server/Cargo.toml | 12 ++--- crates/omnigraph/Cargo.toml | 8 ++-- docs/releases/v0.7.1.md | 67 +++++++++++++++++++++++++++ openapi.json | 2 +- 11 files changed, 101 insertions(+), 34 deletions(-) create mode 100644 docs/releases/v0.7.1.md diff --git a/AGENTS.md b/AGENTS.md index 1772f77..bf60a57 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -16,7 +16,7 @@ Tools that support `@`-imports (Claude Code) auto-include all three files via th `CLAUDE.md` is a symlink to this file — there is exactly one source of truth. Edit `AGENTS.md`. -**Version surveyed:** 0.7.0 +**Version surveyed:** 0.7.1 **Workspace crates:** `omnigraph-compiler`, `omnigraph` (engine), `omnigraph-policy`, `omnigraph-api-types` (shared HTTP wire DTOs), `omnigraph-cluster`, `omnigraph-cli`, `omnigraph-server` **Storage substrate:** Lance 7.x (columnar, versioned, branchable) **License:** MIT diff --git a/Cargo.lock b/Cargo.lock index 3963da1..7033173 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4851,7 +4851,7 @@ dependencies = [ [[package]] name = "omnigraph-api-types" -version = "0.7.0" +version = "0.7.1" dependencies = [ "omnigraph-compiler", "omnigraph-engine", @@ -4862,7 +4862,7 @@ dependencies = [ [[package]] name = "omnigraph-cli" -version = "0.7.0" +version = "0.7.1" dependencies = [ "assert_cmd", "clap", @@ -4886,7 +4886,7 @@ dependencies = [ [[package]] name = "omnigraph-cluster" -version = "0.7.0" +version = "0.7.1" dependencies = [ "fail", "omnigraph-compiler", @@ -4904,7 +4904,7 @@ dependencies = [ [[package]] name = "omnigraph-compiler" -version = "0.7.0" +version = "0.7.1" dependencies = [ "ahash", 
"arrow-array", @@ -4923,7 +4923,7 @@ dependencies = [ [[package]] name = "omnigraph-engine" -version = "0.7.0" +version = "0.7.1" dependencies = [ "arc-swap", "arrow-array", @@ -4967,7 +4967,7 @@ dependencies = [ [[package]] name = "omnigraph-policy" -version = "0.7.0" +version = "0.7.1" dependencies = [ "cedar-policy", "clap", @@ -4980,7 +4980,7 @@ dependencies = [ [[package]] name = "omnigraph-server" -version = "0.7.0" +version = "0.7.1" dependencies = [ "arc-swap", "async-trait", diff --git a/crates/omnigraph-api-types/Cargo.toml b/crates/omnigraph-api-types/Cargo.toml index d69d4fe..037edba 100644 --- a/crates/omnigraph-api-types/Cargo.toml +++ b/crates/omnigraph-api-types/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "omnigraph-api-types" -version = "0.7.0" +version = "0.7.1" edition = "2024" description = "Shared HTTP wire DTOs for Omnigraph — request/response types and engine-result → DTO mappings used by both omnigraph-server and omnigraph-cli (RFC-009). Plain serde/utoipa types; no transport or server internals." license = "MIT" @@ -9,8 +9,8 @@ homepage = "https://github.com/ModernRelay/omnigraph" documentation = "https://docs.rs/omnigraph-api-types" [dependencies] -omnigraph = { package = "omnigraph-engine", path = "../omnigraph", version = "0.7.0" } -omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.7.0" } +omnigraph = { package = "omnigraph-engine", path = "../omnigraph", version = "0.7.1" } +omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.7.1" } serde = { workspace = true } serde_json = { workspace = true } utoipa = { workspace = true } diff --git a/crates/omnigraph-cli/Cargo.toml b/crates/omnigraph-cli/Cargo.toml index e21b21e..87c42aa 100644 --- a/crates/omnigraph-cli/Cargo.toml +++ b/crates/omnigraph-cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "omnigraph-cli" -version = "0.7.0" +version = "0.7.1" edition = "2024" description = "CLI for the Omnigraph graph database." 
license = "MIT" @@ -13,12 +13,12 @@ name = "omnigraph" path = "src/main.rs" [dependencies] -omnigraph = { package = "omnigraph-engine", path = "../omnigraph", version = "0.7.0" } -omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.7.0" } -omnigraph-api-types = { path = "../omnigraph-api-types", version = "0.7.0" } -omnigraph-cluster = { path = "../omnigraph-cluster", version = "0.7.0" } -omnigraph-policy = { path = "../omnigraph-policy", version = "0.7.0" } -omnigraph-server = { path = "../omnigraph-server", version = "0.7.0" } +omnigraph = { package = "omnigraph-engine", path = "../omnigraph", version = "0.7.1" } +omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.7.1" } +omnigraph-api-types = { path = "../omnigraph-api-types", version = "0.7.1" } +omnigraph-cluster = { path = "../omnigraph-cluster", version = "0.7.1" } +omnigraph-policy = { path = "../omnigraph-policy", version = "0.7.1" } +omnigraph-server = { path = "../omnigraph-server", version = "0.7.1" } clap = { workspace = true } color-eyre = { workspace = true } serde = { workspace = true } diff --git a/crates/omnigraph-cluster/Cargo.toml b/crates/omnigraph-cluster/Cargo.toml index f0a3a22..119545e 100644 --- a/crates/omnigraph-cluster/Cargo.toml +++ b/crates/omnigraph-cluster/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "omnigraph-cluster" -version = "0.7.0" +version = "0.7.1" edition = "2024" description = "Cluster configuration validation, planning, and config-only apply for Omnigraph." 
license = "MIT" @@ -14,8 +14,8 @@ documentation = "https://docs.rs/omnigraph-cluster" failpoints = ["dep:fail", "fail/failpoints", "omnigraph/failpoints"] [dependencies] -omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.7.0" } -omnigraph = { package = "omnigraph-engine", path = "../omnigraph", version = "0.7.0" } +omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.7.1" } +omnigraph = { package = "omnigraph-engine", path = "../omnigraph", version = "0.7.1" } fail = { workspace = true, optional = true } serde = { workspace = true } serde_json = { workspace = true } diff --git a/crates/omnigraph-compiler/Cargo.toml b/crates/omnigraph-compiler/Cargo.toml index 4645b81..13c3bbf 100644 --- a/crates/omnigraph-compiler/Cargo.toml +++ b/crates/omnigraph-compiler/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "omnigraph-compiler" -version = "0.7.0" +version = "0.7.1" edition = "2024" description = "Schema/query compiler for Omnigraph. Zero Lance dependency." license = "MIT" diff --git a/crates/omnigraph-policy/Cargo.toml b/crates/omnigraph-policy/Cargo.toml index 907ce07..25bedd1 100644 --- a/crates/omnigraph-policy/Cargo.toml +++ b/crates/omnigraph-policy/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "omnigraph-policy" -version = "0.7.0" +version = "0.7.1" edition = "2024" description = "Policy / authorization layer for Omnigraph — Cedar-backed PolicyEngine, PolicyChecker trait, ResourceScope enum." license = "MIT" diff --git a/crates/omnigraph-server/Cargo.toml b/crates/omnigraph-server/Cargo.toml index a6a0717..0f6bcfc 100644 --- a/crates/omnigraph-server/Cargo.toml +++ b/crates/omnigraph-server/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "omnigraph-server" -version = "0.7.0" +version = "0.7.1" edition = "2024" description = "HTTP server for the Omnigraph graph database." 
license = "MIT" @@ -19,11 +19,11 @@ default = [] aws = ["dep:aws-config", "dep:aws-sdk-secretsmanager"] [dependencies] -omnigraph = { package = "omnigraph-engine", path = "../omnigraph", version = "0.7.0" } -omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.7.0" } -omnigraph-policy = { path = "../omnigraph-policy", version = "0.7.0" } -omnigraph-api-types = { path = "../omnigraph-api-types", version = "0.7.0" } -omnigraph-cluster = { path = "../omnigraph-cluster", version = "0.7.0" } +omnigraph = { package = "omnigraph-engine", path = "../omnigraph", version = "0.7.1" } +omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.7.1" } +omnigraph-policy = { path = "../omnigraph-policy", version = "0.7.1" } +omnigraph-api-types = { path = "../omnigraph-api-types", version = "0.7.1" } +omnigraph-cluster = { path = "../omnigraph-cluster", version = "0.7.1" } axum = { workspace = true } clap = { workspace = true } color-eyre = { workspace = true } diff --git a/crates/omnigraph/Cargo.toml b/crates/omnigraph/Cargo.toml index 55d3008..c830367 100644 --- a/crates/omnigraph/Cargo.toml +++ b/crates/omnigraph/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "omnigraph-engine" -version = "0.7.0" +version = "0.7.1" edition = "2024" description = "Runtime engine for the Omnigraph graph database." 
license = "MIT" @@ -16,8 +16,8 @@ default = [] failpoints = ["dep:fail", "fail/failpoints"] [dependencies] -omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.7.0" } -omnigraph-policy = { path = "../omnigraph-policy", version = "0.7.0" } +omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.7.1" } +omnigraph-policy = { path = "../omnigraph-policy", version = "0.7.1" } lance = { workspace = true } lance-datafusion = { workspace = true } datafusion = { workspace = true } @@ -52,7 +52,7 @@ chrono = { workspace = true } arc-swap = { workspace = true } [dev-dependencies] -omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.7.0" } +omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.7.1" } tokio = { workspace = true } lance-namespace-impls = { workspace = true } lance-io = "7.0.0" diff --git a/docs/releases/v0.7.1.md b/docs/releases/v0.7.1.md new file mode 100644 index 0000000..3497261 --- /dev/null +++ b/docs/releases/v0.7.1.md @@ -0,0 +1,67 @@ +# Omnigraph v0.7.1 + +A patch release on top of v0.7.0: three correctness fixes (camelCase filters, +cluster-apply crash loops, branch-merge OOM on embedding tables), one CLI +catalog-metadata improvement, and a warm-read performance fix. No breaking +changes, no on-disk format change, and no migration — drop-in over v0.7.0. + +## Fixes + +- **camelCase property filters now execute (#283).** A query — or a chained + mutation — that filtered on a camelCase schema field (e.g. `repoName`) linted + and planned cleanly but failed at run time with `No field named reponame. + Column names are case sensitive.` The identifier's case was destroyed at two + engine→Lance boundaries: the read-filter pushdown built the column with a + case-normalizing constructor, and the pending-batch mutation scan re-parsed + the predicate through a normalizing SQL context. 
Both now preserve case (the + read path uses a case-preserving column reference; the pending scan disables + SQL identifier normalization), so camelCase fields work consistently in read + and write predicates and a camelCase `@index` equality still routes to the + scalar index. The fix is correct-by-construction rather than a per-query + guard; a regression test pins index routing so a silent full-scan fallback + can't slip back in. + +- **`cluster apply` no longer crash-loops a booting server (#284).** Applying a + schema change while a graph had non-main (agent/review) branches, or a + migration that needed a backfill, could throw a freshly-booting + `omnigraph-server --cluster` into an unescapable crash loop. Neither input is + an engine bug — the engine rejects both cleanly and before moving any graph + state — but `cluster apply` wrote a recovery sidecar before calling the + engine and left it in place on the clean rejection, and the server refuses to + boot while a sidecar is pending. The asymmetric-cleanup path is fixed so a + pre-movement rejection leaves no stale sidecar, breaking the loop. + +- **Branch-merge fast-forward no longer OOMs on embedding tables (#277).** A + branch→main fast-forward merge of a forked, embedding-bearing table + re-derived the whole branch through a single Lance `merge_insert` — a + full-outer hash join over the entire delta — which exhausted the DataFusion + memory pool on high-dimensional embeddings (e.g. 8k rows × 3072-dim) and hung + or failed the merge. New rows now stream through `stage_append` (no hash + join), only genuinely-changed rows are upserted, embeddings are no longer + stringified to diff them, and index coverage defers to the reconciler, so a + fast-forward merge completes in bounded work. The three-way merge path is + unchanged. 
+ +## Improvements + +- **`omnigraph queries list` surfaces stored-query `@description` / + `@instruction` (#280).** The CLI now shows a stored query's catalog metadata — + what it does and how to invoke it — in both human and `--json` output, + matching what `GET /queries` already returned. Previously both fields were + silently dropped on the CLI side. + +- **Warm reads no longer pay an O(history) metadata tax (#268).** Warm reads + used to re-derive per-query metadata (coordinator re-open, `__manifest` + + commit-graph re-scans, per-table re-open, double schema validation) at a cost + that scaled with commit history and never warmed up. A warm same-branch read + now does one cheap version probe, one schema read, and zero table opens on a + warm repeat (warm coordinator reuse, open-by-location+version, validate-once, + held `Dataset` handles + one shared Lance `Session` per graph). This also + closes a commit-DAG fork where a same-branch write after an external commit + could append off a stale cached head. + +## Upgrade notes + +Drop-in over v0.7.0 — no configuration, schema, or data changes. Upgrade the +server and CLI together as usual. Graphs created on v0.7.0 read and write +identically on v0.7.1. 
diff --git a/openapi.json b/openapi.json index fb76fae..225a959 100644 --- a/openapi.json +++ b/openapi.json @@ -7,7 +7,7 @@ "name": "MIT", "identifier": "MIT" }, - "version": "0.7.0" + "version": "0.7.1" }, "paths": { "/graphs": { From f6d2cc03e379ed38f87abbc2ebeed101d515717d Mon Sep 17 00:00:00 2001 From: Ragnor Comerford Date: Sat, 20 Jun 2026 13:31:15 +0200 Subject: [PATCH 3/8] write-path cost gate + opener bypass (#288) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * docs(rfc): RFC-013 write-path latency design + index link * perf(engine): open write-path tables directly, bypassing the namespace builder Write opens routed through DatasetBuilder::from_namespace, whose describe_table opened the whole dataset just to return a location and then re-resolved the latest version — an O(commit-depth) double latest-resolution per table open that missed Lance's O(1) version-hint fast path. On an object store this dominated write latency (~70%, RFC-013 section 2.4). TableStore::open_dataset_head_for_write now delegates to the direct opener (open_dataset_head: Dataset::open by URI + checkout_branch, routed through the tracked opener so cost tests can count it; a no-op in production). The manifest already holds every sub-table's location, so the namespace catalog lookup was redundant; ensure_expected_version still validates head == pinned for strict ops. This completes PR #268's open-by-location migration on the write side. With both reads (PR #268) and now writes bypassing it, nothing in production routes through the per-table Lance namespace. The dead open chain (load_table_from_namespace, open_table_head_for_write) is deleted and the StagedTableNamespace contract apparatus is gated #[cfg(test)], mirroring the already-test-only read namespace; __manifest commit coordination (GraphNamespacePublisher) is a separate component and is unaffected. See docs/dev/rfc-013-write-path-latency.md sections 2.4 and 9 (step 3a). 
* test(engine): write-path cost-budget gate on a shared harness Adds tests/helpers/cost.rs, a store-agnostic cost harness (IoCounts/StagedCounts, measure/measure_with_staged, assert_flat, local_graph/s3_graph) that the read-side warm_read_cost.rs, write_cost.rs, and write_cost_s3.rs share, so the IOTracker / task-local plumbing lives in exactly one place instead of duplicated per test. write_cost.rs (local, every-PR) gates the internal-table scan term flat in commit-history depth (a RED #[ignore]'d LOCK, the acceptance for bringing the internal tables into compaction) plus green guards: a single insert's data writes are bounded, a per-write read-op ceiling fails the moment a round-trip is added, and a keyed insert routes through stage_merge_insert once with no stage_append or vector-index build. write_cost_s3.rs (bucket-gated, rustfs CI) gates the data-table opener term flat across depth — the object-store-RPC phenomenon local FS cannot reproduce, and the red->green proof of the opener bypass. Wired into the rustfs_integration CI job and its path filter. Guards the "hot-path cost is bounded by work, not history" invariant on writes. See docs/dev/rfc-013-write-path-latency.md section 5.1, docs/dev/testing.md. * docs(rfc): RFC-013 step 3a landed; write-skew coupling; cost-gate test map - Section 9: mark step 1 (gate + harness) and step 3a (opener bypass) landed; record the per-table namespace retirement to test-only and the corrected measurement note (the opener win is S3-only; the local data-table growth is the merge-insert/RI fragment scan, a compaction term, not the opener). - Sections 7.1/6/11/5.5/10: correct the cross-table write-skew analysis after a prototype proved the scoped expected-set fix is a no-op against the per-object_id manifest (disjoint writers never share a row, so Lance never conflicts, the publisher never retries, and the expected check is a non-atomic pre-check evaluated once against stale state). 
The fix needs a shared contention row (Phase-7 graph_head / a minimal head row / commit-time re-validation), so it is coupled to that row, not standalone; that contention is load-bearing for correctness, not a drawback. Split the concurrent face (read-set + head) from the sequential face (inbound-RI validation on node removal) -- two different fixes. - testing.md: add write_cost.rs / helpers/cost.rs / write_cost_s3.rs to the test map; document the local-vs-S3 backend split; extend the cost-budget checklist item to the write/open path and point at the shared harness. * test(engine): isolate the opener in the S3 cost gate; fail loud on S3 setup errors Addresses two PR review findings on the bucket-gated write_cost_s3 gate: - The data-table opener was not isolated: `data_reads` also counts the merge-insert/RI scan, which reads O(fragment-count) and so grows with history for a different reason (compaction's domain, not the opener) -- the same term that made the local data-table count grow. The flat assertion would false-RED or misattribute scan growth to the opener on rustfs. Fix: compact (db.optimize) before each measurement so the table holds ~1 fragment, bounding the scan and leaving the opener's latest-version resolution as the only history-varying term. Compaction preserves version history, so the opener still faces a deep _versions/ chain -- the thing under test. - s3_graph used `.ok()?`, so when OMNIGRAPH_S3_TEST_BUCKET was set but the store was down/misconfigured, init/seed failures collapsed to None and the gate skipped + passed vacuously. Fix: skip only when the bucket env var is absent; once it is set, init/seed failures panic (mirrors tests/s3_storage.rs). 
* test(engine): isolate the S3 opener with a per-prefix IO probe (correct-by-design) Replaces the fixture-bounded isolation (compact-before-measure) from the prior commit with the root fix: a path-classifying ObjectStore wrapper (PrefixCounter) that attributes each data-table read to the opener term (_versions/.manifest) vs the scan term (data/*.lance). IoCounts now exposes data_opener_reads / data_scan_reads, so write_cost_s3 asserts the opener flat *directly* -- no compaction or fixture massaging, and the assertion measures the opener, not the conflated total. Closes the "harness conflates two IO terms" class: any cost test (read or write) can now isolate the opener. PrefixCounter implements only the object_store 0.13 core ObjectStore methods; the convenience surface (get/put/head/...) routes through get_opts/put_opts via ObjectStoreExt's blanket impl, so every read/write is still counted. Validated locally (every-PR) by write_cost::data_table_reads_split_into_flat_opener_and_growing_scan: opener stays flat (7 -> 3) while scan grows (11 -> 91) and opener + scan == data_total exactly -- proving the classifier and confirming the local data-table growth is the fragment scan, not the opener. warm_read_cost (12 tests) stays green under the shared-harness change. * refactor(tests): remove cost-harness duplication and namespace cfg(test) noise Branch self-review (no behavior change) — pay down three liabilities the write-path work left: - warm_read_cost.rs kept its own probes() (three IOTrackers + a QueryIoProbes + a probe counter) and read raw .stats().read_iops — duplicating the shared helpers::cost harness this branch introduced. Migrated all 12 tests onto measure()/IoCounts; deleted the local probes(). (This also makes IoCounts' version_probes field used rather than dead.) - insert_cost was copy-pasted verbatim into write_cost.rs and write_cost_s3.rs. Hoisted to helpers::cost::measure_insert so the measured write is defined once. 
- The per-table Lance namespace (namespace.rs) became entirely test-only after step 3a, but was gated with ~22 per-item #[cfg(test)] attributes. Collapsed to a single `#[cfg(test)] mod namespace;` and stripped the per-item attributes; merged the import groups the gating had split. Verified: lib in-source 162 passed; write_cost 4 + warm_read_cost 12 passed; forbidden_apis passed. --- .github/workflows/ci.yml | 6 +- crates/omnigraph/src/db/manifest.rs | 5 +- crates/omnigraph/src/db/manifest/layout.rs | 1 + crates/omnigraph/src/db/manifest/metadata.rs | 5 +- crates/omnigraph/src/db/manifest/namespace.rs | 65 +- crates/omnigraph/src/table_store.rs | 24 +- crates/omnigraph/tests/helpers/cost.rs | 360 +++++ crates/omnigraph/tests/helpers/mod.rs | 1 + crates/omnigraph/tests/warm_read_cost.rs | 317 ++--- crates/omnigraph/tests/write_cost.rs | 159 +++ crates/omnigraph/tests/write_cost_s3.rs | 71 + docs/dev/index.md | 1 + docs/dev/rfc-013-write-path-latency.md | 1203 +++++++++++++++++ docs/dev/testing.md | 8 +- 14 files changed, 1959 insertions(+), 267 deletions(-) create mode 100644 crates/omnigraph/tests/helpers/cost.rs create mode 100644 crates/omnigraph/tests/write_cost.rs create mode 100644 crates/omnigraph/tests/write_cost_s3.rs create mode 100644 docs/dev/rfc-013-write-path-latency.md diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index fca08da..1e9249f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -88,7 +88,8 @@ jobs: .github/workflows/ci.yml|Cargo.toml|Cargo.lock|crates/*/Cargo.toml) run_rustfs_ci=true ;; crates/omnigraph/src/storage.rs) run_rustfs_ci=true ;; crates/omnigraph/src/db/manifest.rs|crates/omnigraph/src/db/manifest/*) run_rustfs_ci=true ;; - crates/omnigraph/tests/s3_storage.rs|crates/omnigraph/tests/helpers/*) run_rustfs_ci=true ;; + crates/omnigraph/tests/s3_storage.rs|crates/omnigraph/tests/write_cost_s3.rs|crates/omnigraph/tests/helpers/*) run_rustfs_ci=true ;; + 
crates/omnigraph/src/table_store.rs|crates/omnigraph/src/instrumentation.rs) run_rustfs_ci=true ;; crates/omnigraph-cluster/src/store.rs|crates/omnigraph-cluster/src/serve.rs) run_rustfs_ci=true ;; crates/omnigraph-cluster/tests/s3_cluster.rs) run_rustfs_ci=true ;; crates/omnigraph-server/tests/s3.rs|crates/omnigraph-server/tests/support/*) run_rustfs_ci=true ;; @@ -372,6 +373,9 @@ jobs: - name: Run RustFS storage tests run: cargo test --locked -p omnigraph-engine --test s3_storage -- --nocapture + - name: Run RustFS write-path cost gate (RFC-013 step 3a opener) + run: cargo test --locked -p omnigraph-engine --test write_cost_s3 -- --nocapture + - name: Run RustFS server smoke # No name filter: every test in the s3 target is bucket-gated, and a # filter matching nothing passes vacuously (which silently ran zero diff --git a/crates/omnigraph/src/db/manifest.rs b/crates/omnigraph/src/db/manifest.rs index 19f25a3..4c6410b 100644 --- a/crates/omnigraph/src/db/manifest.rs +++ b/crates/omnigraph/src/db/manifest.rs @@ -14,6 +14,10 @@ mod layout; mod metadata; #[path = "manifest/migrations.rs"] mod migrations; +// Entirely test-only since RFC-013 step 3a: with both reads (Fix 2) and writes +// bypassing the Lance namespace, nothing in production routes through it; the +// `LanceNamespace` impls are retained only to validate the contract in unit tests. 
+#[cfg(test)] #[path = "manifest/namespace.rs"] mod namespace; #[path = "manifest/publisher.rs"] @@ -28,7 +32,6 @@ use layout::{manifest_uri, open_manifest_dataset, table_uri_for_path, type_name_ pub(crate) use metadata::TableVersionMetadata; #[cfg(test)] use metadata::{OMNIGRAPH_ROW_COUNT_KEY, table_version_metadata_for_state}; -pub(crate) use namespace::open_table_head_for_write; #[cfg(test)] use namespace::{branch_manifest_namespace, staged_table_namespace}; use publisher::{GraphNamespacePublisher, ManifestBatchPublisher}; diff --git a/crates/omnigraph/src/db/manifest/layout.rs b/crates/omnigraph/src/db/manifest/layout.rs index 08fe043..f4ac09b 100644 --- a/crates/omnigraph/src/db/manifest/layout.rs +++ b/crates/omnigraph/src/db/manifest/layout.rs @@ -76,6 +76,7 @@ pub(super) fn table_uri_for_path(root_uri: &str, table_path: &str, branch: Optio } } +#[cfg(test)] pub(super) fn namespace_internal_error(message: impl Into) -> LanceNamespaceError { LanceNamespaceError::namespace_source(Box::new(std::io::Error::other(message.into()))) } diff --git a/crates/omnigraph/src/db/manifest/metadata.rs b/crates/omnigraph/src/db/manifest/metadata.rs index 7cd6436..d84db34 100644 --- a/crates/omnigraph/src/db/manifest/metadata.rs +++ b/crates/omnigraph/src/db/manifest/metadata.rs @@ -2,7 +2,9 @@ use std::collections::HashMap; use lance::Dataset; use lance_namespace::Error as LanceNamespaceError; -use lance_namespace::models::{CreateTableVersionRequest, TableVersion}; +use lance_namespace::models::CreateTableVersionRequest; +#[cfg(test)] +use lance_namespace::models::TableVersion; use serde::{Deserialize, Serialize}; use crate::error::{OmniError, Result}; @@ -142,6 +144,7 @@ impl TableVersionMetadata { self.to_namespace_version_with_details(version, None, None) } + #[cfg(test)] pub(super) fn to_namespace_version_with_details( &self, version: u64, diff --git a/crates/omnigraph/src/db/manifest/namespace.rs b/crates/omnigraph/src/db/manifest/namespace.rs index 0d567e0..a684b4d 
100644 --- a/crates/omnigraph/src/db/manifest/namespace.rs +++ b/crates/omnigraph/src/db/manifest/namespace.rs @@ -1,3 +1,10 @@ +// Both the read namespace (BranchManifestNamespace) and the write namespace +// (StagedTableNamespace) are now test-only contract validation. Reads open +// sub-tables directly by location+version (SubTableEntry::open, Fix 2), and +// writes open the table head directly by URI (TableStore::open_dataset_head, +// RFC-013 step 3a), so nothing in production routes through the Lance namespace +// anymore. These impls are retained only to validate the LanceNamespace +// contract in unit tests. use std::sync::Arc; use async_trait::async_trait; @@ -16,30 +23,21 @@ use object_store::{ use crate::error::{OmniError, Result}; -use super::layout::{namespace_internal_error, table_uri_for_path}; -#[cfg(test)] -use super::layout::{open_manifest_dataset, table_id_to_key}; -use super::metadata::TableVersionMetadata; -#[cfg(test)] -use super::metadata::{namespace_version_metadata, parse_namespace_version_request}; -#[cfg(test)] +use super::layout::{ + namespace_internal_error, open_manifest_dataset, table_id_to_key, table_uri_for_path, +}; +use super::metadata::{ + TableVersionMetadata, namespace_version_metadata, parse_namespace_version_request, +}; use super::publisher::GraphNamespacePublisher; -// The read namespace (BranchManifestNamespace) is test-only since Fix 2: reads -// open sub-tables directly by location+version (SubTableEntry::open), so nothing -// in production routes a read through the Lance namespace. The writes path uses -// StagedTableNamespace. These items are retained to validate the namespace -// contract in unit tests. 
-#[cfg(test)] use super::state::{ManifestState, SubTableEntry, read_manifest_entries, read_manifest_state}; -#[cfg(test)] #[derive(Debug, Clone)] struct BranchManifestNamespace { root_uri: String, branch: Option, } -#[cfg(test)] impl BranchManifestNamespace { fn new(root_uri: &str, branch: Option<&str>) -> Self { Self { @@ -146,7 +144,6 @@ impl StagedTableNamespace { } } -#[cfg(test)] pub(crate) fn branch_manifest_namespace( root_uri: &str, branch: Option<&str>, @@ -165,27 +162,6 @@ pub(crate) fn staged_table_namespace( )) } -async fn load_table_from_namespace( - namespace: Arc, - table_key: &str, - branch: Option<&str>, - version: Option, -) -> Result { - let builder = DatasetBuilder::from_namespace(namespace, vec![table_key.to_string()]) - .await - .map_err(|e| OmniError::Lance(e.to_string()))?; - let builder = match (branch, version) { - (Some(branch), version) => builder.with_branch(branch, version), - (None, Some(version)) => builder.with_version(version), - (None, None) => builder, - }; - builder - .load() - .await - .map_err(|e| OmniError::Lance(e.to_string())) -} - -#[cfg(test)] #[async_trait] impl LanceNamespace for BranchManifestNamespace { fn namespace_id(&self) -> String { @@ -540,18 +516,3 @@ impl LanceNamespace for StagedTableNamespace { Ok(response) } } - -pub(crate) async fn open_table_head_for_write( - root_uri: &str, - table_key: &str, - table_path: &str, - branch: Option<&str>, -) -> Result { - load_table_from_namespace( - staged_table_namespace(root_uri, table_key, table_path, branch), - table_key, - branch, - None, - ) - .await -} diff --git a/crates/omnigraph/src/table_store.rs b/crates/omnigraph/src/table_store.rs index 511508f..96e6196 100644 --- a/crates/omnigraph/src/table_store.rs +++ b/crates/omnigraph/src/table_store.rs @@ -24,7 +24,7 @@ use lance_table::format::{Fragment, IndexMetadata, RowIdMeta}; use lance_table::rowids::{RowIdSequence, write_row_ids}; use std::sync::Arc; -use crate::db::manifest::{TableVersionMetadata, 
open_table_head_for_write}; +use crate::db::manifest::TableVersionMetadata; use crate::db::{Snapshot, SubTableEntry}; use crate::error::{OmniError, Result}; use crate::storage_layer::ForkOutcome; @@ -160,9 +160,15 @@ impl TableStore { dataset_uri: &str, branch: Option<&str>, ) -> Result { - let ds = Dataset::open(dataset_uri) - .await - .map_err(|e| OmniError::Lance(e.to_string()))?; + // Direct open by URI (O(1) latest-resolution). Routed through the tracked + // opener so a cost test counts it via the per-query `table_wrapper` + // (no-op in production — the task-local is unset, so this is exactly + // `Dataset::open(uri)`). + let ds = crate::instrumentation::open_dataset_tracked( + dataset_uri, + crate::instrumentation::table_wrapper(), + ) + .await?; match branch { Some(branch) if branch != "main" => ds .checkout_branch(branch) @@ -178,8 +184,14 @@ impl TableStore { dataset_uri: &str, branch: Option<&str>, ) -> Result { - let table_path = self.table_path_from_dataset_uri(dataset_uri)?; - open_table_head_for_write(&self.root_uri, table_key, &table_path, branch).await + // RFC-013 step 3a: open writes via the direct opener (O(1)) instead of the + // lance-namespace builder, which re-resolved the table's version chain + // O(depth) per write. The namespace is a catalog/discovery layer, not a + // per-open hot-path component (RFC §2.4); the manifest already holds the + // location, and `ensure_expected_version` still validates head == pinned + // for strict ops. `table_key` retained for signature stability. + let _ = table_key; + self.open_dataset_head(dataset_uri, branch).await } pub async fn delete_branch(&self, dataset_uri: &str, branch: &str) -> Result<()> { diff --git a/crates/omnigraph/tests/helpers/cost.rs b/crates/omnigraph/tests/helpers/cost.rs new file mode 100644 index 0000000..4be9ee6 --- /dev/null +++ b/crates/omnigraph/tests/helpers/cost.rs @@ -0,0 +1,360 @@ +//! Shared cost-budget test harness (RFC-013) — the single place the IO-counting +//! 
plumbing lives, so `warm_read_cost.rs`, `write_cost.rs`, and the S3 variant +//! assert in one vocabulary instead of duplicating `probes()` + raw `IOTracker` +//! reads. Three clean abstractions: structured counts, a `measure` primitive, a +//! named flat-assertion, plus store-agnostic backend fixtures. +//! +//! The data-table wrapper is a **path-classifying** counter (`PrefixCounter`), not a +//! plain `IOTracker`: it splits each read into the **opener** term (latest-version +//! resolution — reads of `_versions/`/`.manifest` objects) vs the **scan** term +//! (data-fragment reads, `data/`/`*.lance`). That lets a cost test isolate the +//! opener (RFC-013 step 3a's target, O(1) after the bypass) from the merge-insert/RI +//! scan (O(fragment-count), compaction's domain) even though both ride the same +//! `Dataset` — without controlling the fixture (no compaction needed). `__manifest` +//! and `_graph_commits` keep the plain `IOTracker` (no sub-prefixes worth splitting). +#![allow(dead_code)] + +use std::fmt; +use std::future::Future; +use std::sync::atomic::{AtomicU64, Ordering}; +use std::sync::{Arc, Mutex}; + +use async_trait::async_trait; +use futures::stream::BoxStream; +use lance::io::WrappingObjectStore; +use lance_io::utils::tracking_store::IOTracker; +use object_store::path::Path; +use object_store::{ + CopyOptions, GetOptions, GetResult, ListResult, MultipartUpload, ObjectMeta, ObjectStore, + PutMultipartOptions, PutOptions, PutPayload, PutResult, Result as OSResult, +}; + +use omnigraph::db::Omnigraph; +use omnigraph::instrumentation::{ + MergeWriteProbes, QueryIoProbes, with_merge_write_probes, with_query_io_probes, +}; +use omnigraph::loader::{LoadMode, load_jsonl}; + +use super::{MUTATION_QUERIES, TEST_DATA, TEST_SCHEMA, init_and_load, mixed_params}; + +/// Object-store op counts for one measured operation, by table class — the +/// vocabulary cost tests assert in (vs raw `IOTracker::stats().read_iops`). 
+#[derive(Debug, Clone, Copy, Default)] +pub struct IoCounts { + /// Per-table DATA opens (node/edge tables). The dominant write-path term. + pub data_reads: u64, + pub data_writes: u64, + /// DATA-table reads attributed to latest-version resolution (`_versions/`, + /// `.manifest`). This is the **opener** term step 3a flattened — isolated from + /// the scan, so it can be gated directly without compacting the fixture. + pub data_opener_reads: u64, + /// DATA-table reads attributed to data fragments (`data/`, `*.lance`) — the + /// merge-insert/RI **scan**, which grows with fragment count (compaction's + /// domain, not the opener). + pub data_scan_reads: u64, + /// `__manifest` registry scans (publish state). + pub manifest_reads: u64, + /// `_graph_commits` lineage scans. + pub commit_graph_reads: u64, + /// Version-probe invocations (the cheap freshness check). + pub version_probes: u64, +} + +impl IoCounts { + pub fn total_reads(&self) -> u64 { + self.data_reads + self.manifest_reads + self.commit_graph_reads + } +} + +/// Which staged-write primitives an operation invoked (from `MergeWriteProbes`). +#[derive(Debug, Clone, Copy, Default)] +pub struct StagedCounts { + pub stage_append: u64, + pub stage_merge_insert: u64, + pub create_vector_index: u64, + pub scan_staged_combined: u64, +} + +// ── Path-classifying data-table read counter ── + +/// How a data-table object read is attributed. +enum ReadClass { + /// Latest-version resolution: `_versions/`, `.manifest`, `_latest`. + Opener, + /// Data fragments: `data/`, `*.lance`. + Scan, + /// Anything else (indices, deletion files, …) — counted in the total only. + Other, +} + +/// Classify a Lance object path by its role in a write open. Lance's on-object +/// layout is identical on local FS and S3, so this split is backend-independent. 
+fn classify(path: &Path) -> ReadClass { + let p = path.as_ref(); + if p.contains("_versions") || p.ends_with(".manifest") || p.contains("_latest") { + ReadClass::Opener + } else if p.contains("/data/") || p.starts_with("data/") || p.ends_with(".lance") { + ReadClass::Scan + } else { + ReadClass::Other + } +} + +#[derive(Debug, Default, Clone, Copy)] +struct PrefixCounts { + reads: u64, + writes: u64, + opener_reads: u64, + scan_reads: u64, +} + +/// A `WrappingObjectStore` that counts reads/writes and attributes each read to the +/// opener vs scan term by object-key prefix. Shares its tally via `Arc<Mutex<PrefixCounts>>` so +/// the wrapped store (handed to Lance) and the test read the same counters. +#[derive(Debug, Default, Clone)] +struct PrefixCounter(Arc<Mutex<PrefixCounts>>); + +impl PrefixCounter { + fn record_read(&self, location: &Path) { + let mut c = self.0.lock().unwrap(); + c.reads += 1; + match classify(location) { + ReadClass::Opener => c.opener_reads += 1, + ReadClass::Scan => c.scan_reads += 1, + ReadClass::Other => {} + } + } + + fn record_write(&self) { + self.0.lock().unwrap().writes += 1; + } + + fn snapshot(&self) -> PrefixCounts { + *self.0.lock().unwrap() + } +} + +impl WrappingObjectStore for PrefixCounter { + fn wrap(&self, _store_prefix: &str, target: Arc<dyn ObjectStore>) -> Arc<dyn ObjectStore> { + Arc::new(PrefixCountingStore { + target, + counter: self.clone(), + }) + } +} + +/// The wrapped `ObjectStore` that records each call into a [`PrefixCounter`]. +/// Implements only the required core `ObjectStore` methods (object_store 0.13: the +/// convenience surface — `get`/`put`/`head`/`get_range`/… — lives on +/// `ObjectStoreExt` and is provided by a blanket impl that routes through `get_opts` +/// / `put_opts`, so every read/write is still counted here). Per-read path +/// classification is the only addition over a plain pass-through.
+#[derive(Debug)] +struct PrefixCountingStore { + target: Arc<dyn ObjectStore>, + counter: PrefixCounter, +} + +impl fmt::Display for PrefixCountingStore { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "PrefixCountingStore({})", self.target) + } +} + +#[async_trait] +impl ObjectStore for PrefixCountingStore { + async fn put_opts( + &self, + location: &Path, + payload: PutPayload, + opts: PutOptions, + ) -> OSResult<PutResult> { + self.counter.record_write(); + self.target.put_opts(location, payload, opts).await + } + + async fn put_multipart_opts( + &self, + location: &Path, + opts: PutMultipartOptions, + ) -> OSResult<Box<dyn MultipartUpload>> { + self.counter.record_write(); + self.target.put_multipart_opts(location, opts).await + } + + async fn get_opts(&self, location: &Path, options: GetOptions) -> OSResult<GetResult> { + self.counter.record_read(location); + self.target.get_opts(location, options).await + } + + fn delete_stream( + &self, + locations: BoxStream<'static, OSResult<Path>>, + ) -> BoxStream<'static, OSResult<Path>> { + self.target.delete_stream(locations) + } + + fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, OSResult<ObjectMeta>> { + self.counter.record_read(&prefix.cloned().unwrap_or_default()); + self.target.list(prefix) + } + + fn list_with_offset( + &self, + prefix: Option<&Path>, + offset: &Path, + ) -> BoxStream<'static, OSResult<ObjectMeta>> { + self.counter.record_read(&prefix.cloned().unwrap_or_default()); + self.target.list_with_offset(prefix, offset) + } + + async fn list_with_delimiter(&self, prefix: Option<&Path>) -> OSResult<ListResult> { + self.counter.record_read(&prefix.cloned().unwrap_or_default()); + self.target.list_with_delimiter(prefix).await + } + + async fn copy_opts(&self, from: &Path, to: &Path, options: CopyOptions) -> OSResult<()> { + self.counter.record_write(); + self.target.copy_opts(from, to, options).await + } +} + +/// The tracker handles backing one measurement; read once into [`IoCounts`].
+struct ProbeHandles { + manifest: IOTracker, + commit_graph: IOTracker, + table: PrefixCounter, + probe_count: Arc<AtomicU64>, +} + +impl ProbeHandles { + fn install() -> (QueryIoProbes, Self) { + let h = ProbeHandles { + manifest: IOTracker::default(), + commit_graph: IOTracker::default(), + table: PrefixCounter::default(), + probe_count: Arc::new(AtomicU64::new(0)), + }; + let probes = QueryIoProbes { + manifest_wrapper: Some(Arc::new(h.manifest.clone()) as Arc<dyn WrappingObjectStore>), + commit_graph_wrapper: Some( + Arc::new(h.commit_graph.clone()) as Arc<dyn WrappingObjectStore> + ), + table_wrapper: Some(Arc::new(h.table.clone()) as Arc<dyn WrappingObjectStore>), + probe_count: Arc::clone(&h.probe_count), + }; + (probes, h) + } + + fn counts(&self) -> IoCounts { + let t = self.table.snapshot(); + IoCounts { + data_reads: t.reads, + data_writes: t.writes, + data_opener_reads: t.opener_reads, + data_scan_reads: t.scan_reads, + manifest_reads: self.manifest.stats().read_iops, + commit_graph_reads: self.commit_graph.stats().read_iops, + version_probes: self.probe_count.load(Ordering::Relaxed), + } + } +} + +/// Run `op` under object-store IO counting; return its output + the counts. +/// The only place the `QueryIoProbes` task-local + tracker wiring lives. +pub async fn measure<F: Future>(op: F) -> (F::Output, IoCounts) { + let (probes, handles) = ProbeHandles::install(); + let out = with_query_io_probes(probes, op).await; + (out, handles.counts()) +} + +/// Like [`measure`], but also capture which staged-write primitives ran +/// (composes the two task-locals cleanly).
+pub async fn measure_with_staged<F: Future>(op: F) -> (F::Output, IoCounts, StagedCounts) { + let (probes, handles) = ProbeHandles::install(); + let merge = MergeWriteProbes::default(); + let out = with_merge_write_probes(merge.clone(), with_query_io_probes(probes, op)).await; + let staged = StagedCounts { + stage_append: merge.stage_append_calls(), + stage_merge_insert: merge.stage_merge_insert_calls(), + create_vector_index: merge.create_vector_index_calls(), + scan_staged_combined: merge.scan_staged_combined_calls(), + }; + (out, handles.counts(), staged) +} + +/// Assert a per-depth metric is flat: the deepest sample must not exceed the +/// shallowest by more than `slack`. `select` picks the field; `what` names it in +/// the failure message. The shape every depth-swept cost gate uses. +pub fn assert_flat( + curve: &[(u64, IoCounts)], + select: impl Fn(&IoCounts) -> u64, + slack: u64, + what: &str, +) { + assert!(curve.len() >= 2, "assert_flat needs >= 2 depth points"); + let (d_lo, lo) = (curve[0].0, select(&curve[0].1)); + let (d_hi, hi) = (curve[curve.len() - 1].0, select(&curve[curve.len() - 1].1)); + assert!( + hi <= lo + slack, + "{what} grew with history: depth {d_lo} = {lo} -> depth {d_hi} = {hi} (slack {slack})" + ); +} + +/// Assert a per-depth metric *does* grow with history by at least `min_delta` — the +/// dual of [`assert_flat`], used to prove a term is genuinely history-dependent (so a +/// flat sibling term isn't flat merely because nothing was measured).
+pub fn assert_grows( + curve: &[(u64, IoCounts)], + select: impl Fn(&IoCounts) -> u64, + min_delta: u64, + what: &str, +) { + assert!(curve.len() >= 2, "assert_grows needs >= 2 depth points"); + let (d_lo, lo) = (curve[0].0, select(&curve[0].1)); + let (d_hi, hi) = (curve[curve.len() - 1].0, select(&curve[curve.len() - 1].1)); + assert!( + hi >= lo + min_delta, + "{what} did not grow as expected: depth {d_lo} = {lo} -> depth {d_hi} = {hi} (min delta {min_delta})" + ); +} + +/// Measure one committing `insert_person` to `main` — the canonical write the cost +/// gates sweep over commit-history depth. Shared by `write_cost.rs` and +/// `write_cost_s3.rs` so the measured write is defined once. +pub async fn measure_insert(db: &mut Omnigraph, tag: &str) -> IoCounts { + let (res, io) = measure(db.mutate( + "main", + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", tag)], &[("$age", 30)]), + )) + .await; + res.unwrap(); + io +} + +// ── Backend fixtures — one knob, store-agnostic body ── + +/// Local tempdir graph (default; deterministic, every-PR). +pub async fn local_graph(dir: &tempfile::TempDir) -> Omnigraph { + init_and_load(dir).await +} + +/// Emulated-S3 graph, bucket-gated. Returns `None` **only** when +/// `OMNIGRAPH_S3_TEST_BUCKET` is unset, so the caller logs + skips — the +/// `tests/s3_storage.rs` graceful-skip pattern. Once the bucket *is* configured +/// (the rustfs CI job), any `init`/seed failure is a real failure and panics +/// rather than silently skipping — otherwise a down/misconfigured store would let +/// a bucket-gated gate pass vacuously. `name` disambiguates the prefix. 
+pub async fn s3_graph(name: &str) -> Option<Omnigraph> { + let bucket = std::env::var("OMNIGRAPH_S3_TEST_BUCKET").ok()?; + let uri = format!("s3://{bucket}/cost-tests/{name}-{}", std::process::id()); + let mut db = Omnigraph::init(&uri, TEST_SCHEMA) + .await + .expect("OMNIGRAPH_S3_TEST_BUCKET is set but S3 graph init failed"); + load_jsonl(&mut db, TEST_DATA, LoadMode::Overwrite) + .await + .expect("OMNIGRAPH_S3_TEST_BUCKET is set but S3 seed load failed"); + Some(db) +} diff --git a/crates/omnigraph/tests/helpers/mod.rs b/crates/omnigraph/tests/helpers/mod.rs index e690839..131f91b 100644 --- a/crates/omnigraph/tests/helpers/mod.rs +++ b/crates/omnigraph/tests/helpers/mod.rs @@ -1,5 +1,6 @@ #![allow(dead_code)] +pub mod cost; pub mod recovery; use arrow_array::{Array, RecordBatch, StringArray}; diff --git a/crates/omnigraph/tests/warm_read_cost.rs b/crates/omnigraph/tests/warm_read_cost.rs index d7fc52a..b3f5446 100644 --- a/crates/omnigraph/tests/warm_read_cost.rs +++ b/crates/omnigraph/tests/warm_read_cost.rs @@ -1,51 +1,22 @@ //! Cost-budget tests for the warm read path (Fix 1): a warm same-branch read -//! must perform no manifest or commit-graph opens, measured with Lance's -//! `IOTracker` at the object-store boundary (the LanceDB IO-counted-test -//! pattern; see docs/dev/testing.md). Guards invariant 15 (read cost bounded by -//! work, not history) for snapshot resolution, and invariant 6 (a warm reader -//! still observes external commits). +//! must perform no manifest or commit-graph opens, measured via the shared +//! `helpers::cost` harness at the object-store boundary (the LanceDB +//! IO-counted-test pattern; see docs/dev/testing.md). Guards invariant 15 (read +//! cost bounded by work, not history) for snapshot resolution, and invariant 6 +//! (a warm reader still observes external commits).
mod helpers; -use std::sync::Arc; -use std::sync::atomic::{AtomicU64, Ordering}; - use arrow_array::{Array, StringArray}; -use lance::io::WrappingObjectStore; -use lance_io::utils::tracking_store::IOTracker; use omnigraph::db::{Omnigraph, ReadTarget}; -use omnigraph::instrumentation::{QueryIoProbes, with_query_io_probes}; use omnigraph_compiler::result::QueryResult; +use helpers::cost::measure; use helpers::{ MUTATION_QUERIES, TEST_QUERIES, commit_many, count_rows, init_and_load, mixed_params, mutate_branch, mutate_main, params, }; -/// IO probes plus the tracker handles to read `read_iops` after the query. -/// Returns `(probes, manifest, commit_graph, table, probe_count)` — `table` -/// counts per-table data opens (the cache-miss path), so a cost test can assert -/// N opens on a cold read and 0 on a warm repeat (Fix 3). -fn probes() -> ( - QueryIoProbes, - IOTracker, - IOTracker, - IOTracker, - Arc, -) { - let manifest = IOTracker::default(); - let commit_graph = IOTracker::default(); - let table = IOTracker::default(); - let probe_count = Arc::new(AtomicU64::new(0)); - let probes = QueryIoProbes { - manifest_wrapper: Some(Arc::new(manifest.clone()) as Arc), - commit_graph_wrapper: Some(Arc::new(commit_graph.clone()) as Arc), - table_wrapper: Some(Arc::new(table.clone()) as Arc), - probe_count: Arc::clone(&probe_count), - }; - (probes, manifest, commit_graph, table, probe_count) -} - fn first_column_strings(result: &QueryResult) -> Vec { if result.num_rows() == 0 { return Vec::new(); @@ -75,18 +46,14 @@ async fn warm_same_branch_read_does_no_resolution_opens() { // Deep history: warm-read resolution cost must be flat in commit count. 
commit_many(&mut db, 20).await; - let (probes_in, manifest, commit_graph, _table, probe_count) = probes(); - with_query_io_probes( - probes_in, - db.query( - ReadTarget::branch("main"), - TEST_QUERIES, - "total_people", - &params(&[]), - ), - ) - .await - .unwrap(); + let (out, io) = measure(db.query( + ReadTarget::branch("main"), + TEST_QUERIES, + "total_people", + &params(&[]), + )) + .await; + out.unwrap(); // A warm same-branch read opens nothing from the internal tables, even at // commit-history depth. Fix 1 reuses the coordinator (no re-open: 0 @@ -95,18 +62,15 @@ async fn warm_same_branch_read_does_no_resolution_opens() { // per-table __manifest scan is gone too. Pre-fix, each of these is a deep scan // of an internal table that grows with commit count. assert_eq!( - manifest.stats().read_iops, - 0, + io.manifest_reads, 0, "warm same-branch read must not scan __manifest (resolution or per-table)" ); assert_eq!( - commit_graph.stats().read_iops, - 0, + io.commit_graph_reads, 0, "warm same-branch read must not open the commit graph (no coordinator re-open)" ); assert_eq!( - probe_count.load(Ordering::Relaxed), - 1, + io.version_probes, 1, "warm same-branch read performs exactly one version probe" ); } @@ -121,22 +85,17 @@ async fn multi_table_query_does_no_manifest_scans() { let dir = tempfile::tempdir().unwrap(); let db = init_and_load(&dir).await; - let (probes_in, manifest, _commit_graph, _table, _probe) = probes(); - with_query_io_probes( - probes_in, - db.query( - ReadTarget::branch("main"), - TEST_QUERIES, - "age_stats", - &params(&[]), - ), - ) - .await - .unwrap(); + let (out, io) = measure(db.query( + ReadTarget::branch("main"), + TEST_QUERIES, + "age_stats", + &params(&[]), + )) + .await; + out.unwrap(); assert_eq!( - manifest.stats().read_iops, - 0, + io.manifest_reads, 0, "a multi-table read must not scan __manifest once per touched table" ); } @@ -278,32 +237,25 @@ async fn warm_branch_read_does_no_manifest_scans() { // Bind the handle's coordinator to the
branch so reads of it take the warm path. db.sync_branch("feature").await.unwrap(); - let (probes_in, manifest, commit_graph, _table, probe_count) = probes(); - with_query_io_probes( - probes_in, - db.query( - ReadTarget::branch("feature"), - TEST_QUERIES, - "total_people", - &params(&[]), - ), - ) - .await - .unwrap(); + let (out, io) = measure(db.query( + ReadTarget::branch("feature"), + TEST_QUERIES, + "total_people", + &params(&[]), + )) + .await; + out.unwrap(); assert_eq!( - manifest.stats().read_iops, - 0, + io.manifest_reads, 0, "warm branch read must not scan __manifest (branch-owned table opened by location)" ); assert_eq!( - commit_graph.stats().read_iops, - 0, + io.commit_graph_reads, 0, "warm branch read must not open the commit graph" ); assert_eq!( - probe_count.load(Ordering::Relaxed), - 1, + io.version_probes, 1, "warm branch read performs exactly one version probe" ); } @@ -369,18 +321,14 @@ async fn warm_read_on_recreated_branch_observes_new_incarnation() { "test setup must exercise branch incarnation reuse at one Lance version" ); - let (probes_in, manifest, commit_graph, _table, probe_count) = probes(); - let new_feature = with_query_io_probes( - probes_in, - reader.query( - ReadTarget::branch("feature"), - TEST_QUERIES, - "get_person", - &params(&[("$name", "MainOnly")]), - ), - ) - .await - .unwrap(); + let (new_feature, io) = measure(reader.query( + ReadTarget::branch("feature"), + TEST_QUERIES, + "get_person", + &params(&[("$name", "MainOnly")]), + )) + .await; + let new_feature = new_feature.unwrap(); assert_eq!( new_feature.num_rows(), @@ -388,17 +336,15 @@ async fn warm_read_on_recreated_branch_observes_new_incarnation() { "warm reader must refresh to the recreated branch incarnation" ); assert!( - manifest.stats().read_iops > 0, + io.manifest_reads > 0, "recreated branch must re-read the manifest after the incarnation probe" ); assert_eq!( - commit_graph.stats().read_iops, - 0, + io.commit_graph_reads, 0, "same-branch incarnation refresh must be
manifest-only" ); assert_eq!( - probe_count.load(Ordering::Relaxed), - 2, + io.version_probes, 2, "stale same-branch read probes once under the read lock and once under the write lock" ); } @@ -469,39 +415,33 @@ async fn recreated_branch_owned_table_handle_uses_table_etag() { "test setup must force table handle identity to differ only by e_tag" ); - let (probes_in, manifest, commit_graph, table, probe_count) = probes(); - let new_person = with_query_io_probes( - probes_in, - reader.query( - ReadTarget::branch("feature"), - TEST_QUERIES, - "get_person", - &params(&[("$name", "NewOnly")]), - ), - ) - .await - .unwrap(); + let (new_person, io) = measure(reader.query( + ReadTarget::branch("feature"), + TEST_QUERIES, + "get_person", + &params(&[("$name", "NewOnly")]), + )) + .await; + let new_person = new_person.unwrap(); assert_eq!( new_person.num_rows(), 1, "warm reader must open the recreated branch-owned table incarnation" ); assert!( - table.stats().read_iops > 0, + io.data_reads > 0, "table e_tag must force a held-handle cache miss for the recreated table" ); assert!( - manifest.stats().read_iops > 0, + io.manifest_reads > 0, "recreated branch must refresh the manifest" ); assert_eq!( - commit_graph.stats().read_iops, - 0, + io.commit_graph_reads, 0, "same-branch table-incarnation refresh must be manifest-only" ); assert_eq!( - probe_count.load(Ordering::Relaxed), - 2, + io.version_probes, 2, "stale same-branch read probes once under each lock" ); @@ -594,35 +534,29 @@ async fn recreated_branch_traversal_uses_graph_index_incarnation() { "test setup must force graph-index identity to differ only by snapshot incarnation" ); - let (probes_in, manifest, commit_graph, _table, probe_count) = probes(); - let new_friends = with_query_io_probes( - probes_in, - reader.query( - ReadTarget::branch("feature"), - TEST_QUERIES, - "friends_of", - &params(&[("$name", "NewWalker")]), - ), - ) - .await - .unwrap(); + let (new_friends, io) = measure(reader.query( + ReadTarget::branch("feature"),
+ TEST_QUERIES, + "friends_of", + &params(&[("$name", "NewWalker")]), + )) + .await; + let new_friends = new_friends.unwrap(); assert_eq!( first_column_strings(&new_friends), vec!["Bob"], "traversal must use the recreated branch's topology, not stale cached graph index" ); assert!( - manifest.stats().read_iops > 0, + io.manifest_reads > 0, "recreated branch traversal must refresh the manifest" ); assert_eq!( - commit_graph.stats().read_iops, - 0, + io.commit_graph_reads, 0, "same-branch traversal incarnation refresh must be manifest-only" ); assert_eq!( - probe_count.load(Ordering::Relaxed), - 2, + io.version_probes, 2, "stale same-branch read probes once under each lock" ); @@ -673,31 +607,25 @@ async fn stale_read_refreshes_manifest_only() { .await .unwrap(); - let (probes_in, manifest, commit_graph, _table, probe_count) = probes(); - with_query_io_probes( - probes_in, - reader.query( - ReadTarget::branch("main"), - TEST_QUERIES, - "total_people", - &params(&[]), - ), - ) - .await - .unwrap(); + let (out, io) = measure(reader.query( + ReadTarget::branch("main"), + TEST_QUERIES, + "total_people", + &params(&[]), + )) + .await; + out.unwrap(); assert!( - manifest.stats().read_iops > 0, + io.manifest_reads > 0, "stale read must re-read the manifest" ); assert_eq!( - commit_graph.stats().read_iops, - 0, + io.commit_graph_reads, 0, "stale refresh must be manifest-only (no commit-graph scan)" ); assert_eq!( - probe_count.load(Ordering::Relaxed), - 2, + io.version_probes, 2, "stale same-branch read probes once under the read lock and once under the write lock" ); } @@ -721,55 +649,40 @@ async fn repeat_warm_read_reuses_table_handles() { commit_many(&mut db, 10).await; // Cold first read: opens the touched table.
- let (p1, _m1, _c1, table1, _pr1) = probes(); - with_query_io_probes( - p1, - db.query( - ReadTarget::branch("main"), - TEST_QUERIES, - "total_people", - &params(&[]), - ), - ) - .await - .unwrap(); + let (cold_out, cold) = measure(db.query( + ReadTarget::branch("main"), + TEST_QUERIES, + "total_people", + &params(&[]), + )) + .await; + cold_out.unwrap(); assert!( - table1.stats().read_iops > 0, + cold.data_reads > 0, "the cold first read must open the table" ); // Warm repeat: the held handle is reused, so no open happens through this - // query's table wrapper. - let (p2, manifest2, commit_graph2, table2, probe2) = probes(); - with_query_io_probes( - p2, - db.query( - ReadTarget::branch("main"), - TEST_QUERIES, - "total_people", - &params(&[]), - ), - ) - .await - .unwrap(); + // query's table wrapper. A fresh `measure()` isolates the warm repeat's cost. + let (warm_out, warm) = measure(db.query( + ReadTarget::branch("main"), + TEST_QUERIES, + "total_people", + &params(&[]), + )) + .await; + warm_out.unwrap(); assert_eq!( - table2.stats().read_iops, - 0, + warm.data_reads, 0, "a warm repeat read must reuse the held handle (0 table opens)" ); + assert_eq!(warm.manifest_reads, 0, "warm repeat read: 0 manifest opens"); assert_eq!( - manifest2.stats().read_iops, - 0, - "warm repeat read: 0 manifest opens" - ); - assert_eq!( - commit_graph2.stats().read_iops, - 0, + warm.commit_graph_reads, 0, "warm repeat read: 0 commit-graph opens" ); assert_eq!( - probe2.load(Ordering::Relaxed), - 1, + warm.version_probes, 1, "warm repeat read: exactly one version probe" ); } @@ -807,20 +720,16 @@ async fn write_invalidates_table_cache_for_changed_table() { .unwrap(); // The next read re-opens Person at the new version (cache miss).
- let (p, _m, _c, table, _pr) = probes(); - with_query_io_probes( - p, - db.query( - ReadTarget::branch("main"), - TEST_QUERIES, - "total_people", - &params(&[]), - ), - ) - .await - .unwrap(); + let (out, io) = measure(db.query( + ReadTarget::branch("main"), + TEST_QUERIES, + "total_people", + &params(&[]), + )) + .await; + out.unwrap(); assert!( - table.stats().read_iops > 0, + io.data_reads > 0, "a read after a write to the table must re-open it (version-keyed miss)" ); diff --git a/crates/omnigraph/tests/write_cost.rs b/crates/omnigraph/tests/write_cost.rs new file mode 100644 index 0000000..5f753d7 --- /dev/null +++ b/crates/omnigraph/tests/write_cost.rs @@ -0,0 +1,159 @@ +//! Cost-budget tests for the WRITE path (RFC-013 step 1) — the safety/latency +//! twin of `warm_read_cost.rs`, on the shared `helpers::cost` harness. A +//! committing write's per-table opens and internal-table scans must be bounded +//! and **flat across commit-history depth**, measured at the object-store +//! boundary. Guards invariant 15 (cost bounded by work, not history) on writes. +//! +//! **Backend split (see docs/dev/testing.md / RFC-013).** This file runs on +//! **local FS** and gates the **internal-table** term (`__manifest`/`_graph_commits` +//! fragment scans, ~+18/depth — O(fragments) on any backend, step 2's target). +//! +//! The **data-table opener** term (step 3a's win) is a per-object-store-RPC +//! phenomenon and is NOT gated here: local-FS latest-resolution is cheap whether +//! the open goes through the namespace builder or direct-by-URI, so the +//! namespace→direct switch is invisible on local. Measured: the local data-table +//! read count grows with depth too (~+0.9/depth), but that is a *different* term — +//! the merge-insert/RI scan reading O(depth) **fragments**, unchanged by the +//! opener switch (depth-100 = 92 ops both before and after step 3a, same slope) +//! and reduced only by compaction. The opener term shows up only on a real object
store (per-version GETs, ~+12/depth → flat after step 3a), so it is gated in +//! `write_cost_s3.rs` (bucket-gated). Same `measure`/`IoCounts` harness, different +//! backend; each term gated where it actually manifests. +#![recursion_limit = "512"] + +mod helpers; + +use helpers::cost::{ + IoCounts, assert_flat, assert_grows, local_graph, measure_insert, measure_with_staged, +}; +use helpers::{MUTATION_QUERIES, commit_many, mixed_params}; + +// ── (A) The internal-table LOCK — RED today, the acceptance test for step 2 ── +// +// `__manifest` / `_graph_commits` scans must be O(1) in commit-history depth. +// RED today (O(fragments), uncompacted). Un-ignore when step 2 (internal-table +// compaction) lands — it must go green flat. (The data-table term is the S3 +// gate's, `write_cost_s3.rs`; local-FS hides it.) +#[tokio::test] +#[ignore = "RED until step 2 (internal-table compaction): __manifest/_graph_commits scans are O(fragments) today — RFC-013 §0/§2.2. Un-ignore there as the red→green acceptance test."] +async fn internal_table_scans_are_flat_in_history() { + let dir = tempfile::tempdir().unwrap(); + let mut db = local_graph(&dir).await; + + let mut curve: Vec<(u64, IoCounts)> = Vec::new(); + let mut current = 0u64; + for d in [10u64, 100] { + if d > current { + commit_many(&mut db, (d - current) as usize).await; + current = d; + } + let io = measure_insert(&mut db, &format!("lock_{d}")).await; + current += 1; // the measured write advanced depth by one + eprintln!( + "depth~{d}: data={} __manifest={} _graph_commits={}", + io.data_reads, io.manifest_reads, io.commit_graph_reads + ); + curve.push((d, io)); + } + + assert_flat(&curve, |c| c.manifest_reads, 4, "__manifest scan"); + assert_flat(&curve, |c| c.commit_graph_reads, 4, "_graph_commits scan"); +} + +// The data-table OPENER history-gate (opener flat across depth) lives in +// `write_cost_s3.rs` — its history-dependence is an S3-only phenomenon. 
But the +// *probe that isolates* the opener (the `PrefixCounter` split) is validated here, +// every-PR, on local FS: + +/// Proves the `PrefixCounter` opener/scan split: a committing write's data-table +/// reads divide into a **flat opener** term and a **growing scan** term. This pins +/// (a) the classifier actually attributes reads to the opener bucket (non-zero, so a +/// flat assertion isn't vacuously flat-at-zero), and (b) the local data-table growth +/// is the merge-insert/RI fragment scan, not the opener — which is *why* the S3 +/// gate asserts `data_opener_reads`, not total `data_reads`. (On local FS the opener +/// is O(1) regardless of step 3a; the opener's history-dependence is gated on S3.) +#[tokio::test] +async fn data_table_reads_split_into_flat_opener_and_growing_scan() { + let dir = tempfile::tempdir().unwrap(); + let mut db = local_graph(&dir).await; + + let mut curve: Vec<(u64, IoCounts)> = Vec::new(); + let mut current = 0u64; + for d in [10u64, 100] { + if d > current { + commit_many(&mut db, (d - current) as usize).await; + current = d; + } + let io = measure_insert(&mut db, &format!("split_{d}")).await; + current += 1; + eprintln!( + "depth~{d}: opener={} scan={} data_total={}", + io.data_opener_reads, io.data_scan_reads, io.data_reads + ); + curve.push((d, io)); + } + + assert!( + curve[0].1.data_opener_reads > 0, + "opener reads must be > 0 — the classifier missed version-resolution reads, \ + so a flat opener assertion would be vacuous" + ); + assert_flat(&curve, |c| c.data_opener_reads, 4, "local data-table opener"); + assert_grows(&curve, |c| c.data_scan_reads, 20, "local data-table scan"); +} + +// ── (B) Green-today regression guards — run on every PR ── + +/// A single insert's *data-table* write cost is O(1): the table commit is a small +/// constant number of writes, independent of history. 
+#[tokio::test] +async fn single_insert_data_write_is_bounded() { + let dir = tempfile::tempdir().unwrap(); + let mut db = local_graph(&dir).await; + commit_many(&mut db, 5).await; + let io = measure_insert(&mut db, "w").await; + eprintln!("single insert: data_writes={}", io.data_writes); + assert!(io.data_writes <= 4, "data-table write_iops should be a small constant, got {}", io.data_writes); +} + +/// At a fixed shallow depth, the per-write object-store read count is below a +/// documented ceiling. Fails the moment a change *adds* a round-trip on the write +/// path — the "no new round-trip" guard (calibrated: ~50 at depth ~5). +#[tokio::test] +async fn write_op_count_ceiling_at_shallow_depth() { + let dir = tempfile::tempdir().unwrap(); + let mut db = local_graph(&dir).await; + commit_many(&mut db, 5).await; + let io = measure_insert(&mut db, "ceil").await; + eprintln!( + "depth~5: data={} __manifest={} _graph_commits={} total_reads={}", + io.data_reads, io.manifest_reads, io.commit_graph_reads, io.total_reads() + ); + const CEILING: u64 = 80; + assert!( + io.total_reads() <= CEILING, + "per-write read ops {} exceeded ceiling {CEILING} — a new round-trip was added", + io.total_reads() + ); +} + +// ── (C) Fitness assert via the staged-write probes ── + +/// A keyed `Person` insert routes through `stage_merge_insert` exactly once, does +/// no `stage_append`, and no inline vector-index build. Pins the structural shape. 
+#[tokio::test] +async fn keyed_insert_routes_through_merge_insert_only() { + let dir = tempfile::tempdir().unwrap(); + let mut db = local_graph(&dir).await; + let (res, _io, staged) = measure_with_staged(db.mutate( + "main", + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "fit")], &[("$age", 30)]), + )) + .await; + res.unwrap(); + assert_eq!(staged.stage_merge_insert, 1, "keyed Person insert stages one merge-insert"); + assert_eq!(staged.stage_append, 0, "keyed insert must not stage_append"); + assert_eq!(staged.create_vector_index, 0, "no inline vector-index build on a plain insert"); +} diff --git a/crates/omnigraph/tests/write_cost_s3.rs b/crates/omnigraph/tests/write_cost_s3.rs new file mode 100644 index 0000000..d8ffd4f --- /dev/null +++ b/crates/omnigraph/tests/write_cost_s3.rs @@ -0,0 +1,71 @@ +//! S3 (object-store) cost-budget gate for the WRITE path — the bucket-gated twin of +//! `write_cost.rs` that proves RFC-013 **step 3a's data-table opener win**. On the +//! shared `helpers::cost` harness (`measure`/`IoCounts`/`assert_flat`/`s3_graph`). +//! +//! The opener term is an **object-store-RPC phenomenon**: latest-version resolution +//! costs per-version GETs/HEADs on S3 (O(depth) before step 3a, when writes routed +//! through the lance-namespace builder), which local FS cannot reproduce (one cheap +//! `read_dir` regardless). After step 3a (direct-by-URI opens), the per-write +//! **data-table read count is FLAT across commit-history depth** — the measured 70% +//! win. This file is the red→green acceptance for that term (it would be RED on the +//! pre-3a `from_namespace` opener); `write_cost.rs` gates the internal-table term on +//! local every-PR. +//! +//! **Isolating the opener (important):** total `data_reads` is not opener-only — the +//! same wrapped `Dataset` backs the merge-insert/RI **scan**, which reads +//! O(fragment-count) and grows with history for a *different* reason (compaction's +//! 
domain, not the opener; this is the term that made the *local* data-table count +//! grow). The shared harness's `PrefixCounter` attributes each read by object-key +//! prefix, so this gate asserts `data_opener_reads` (reads of `_versions/`/`.manifest`) +//! **directly** — no compaction or fixture massaging needed. After step 3a the opener +//! is O(1) regardless of version-history depth; before it grew ~+12/depth (RFC §2.4 +//! [M]). (See `write_cost.rs` for the local test that proves the split itself — +//! opener flat, scan growing.) +//! +//! Skips gracefully without `OMNIGRAPH_S3_TEST_BUCKET` (the `tests/s3_storage.rs` +//! pattern); runs for real in the rustfs CI job (`.github/workflows/ci.yml`). +#![recursion_limit = "512"] + +mod helpers; + +use helpers::cost::{IoCounts, assert_flat, measure_insert, s3_graph}; +use helpers::commit_many; + +/// After step 3a the data-table opener term is flat across depth on a real object +/// store (the measured win). RED on the pre-3a namespace-builder opener (O(depth) +/// per-version resolution). 
+#[tokio::test] +async fn data_table_opener_is_flat_in_history_on_s3() { + let Some(mut db) = s3_graph("write-cost-opener").await else { + eprintln!( + "SKIP data_table_opener_is_flat_in_history_on_s3: OMNIGRAPH_S3_TEST_BUCKET \ + unset (or store unreachable) — the S3 opener gate needs an object store" + ); + return; + }; + + let mut curve: Vec<(u64, IoCounts)> = Vec::new(); + let mut current = 0u64; + for d in [10u64, 50] { + if d > current { + commit_many(&mut db, (d - current) as usize).await; + current = d; + } + let io = measure_insert(&mut db, &format!("s3_{d}")).await; + current += 1; + eprintln!( + "depth~{d}: opener={} scan={} data_total={} __manifest={} _graph_commits={}", + io.data_opener_reads, + io.data_scan_reads, + io.data_reads, + io.manifest_reads, + io.commit_graph_reads + ); + curve.push((d, io)); + } + + // The opener (latest-version resolution) is O(1) after step 3a (direct-by-URI), + // isolated from the scan by the PrefixCounter. Slack absorbs object-store variance; + // the pre-3a builder grew this ~+12/depth (RFC §2.4 [M]). + assert_flat(&curve, |c| c.data_opener_reads, 8, "S3 data-table opener"); +} diff --git a/docs/dev/index.md b/docs/dev/index.md index 91f108b..9fe743f 100644 --- a/docs/dev/index.md +++ b/docs/dev/index.md @@ -91,6 +91,7 @@ Working documents for in-flight feature work. Removed when the work lands. 
| Restructure the CLI around explicit planes — one graph-addressing model, declared capability surface, plane-grouped help (expands RFC-009 Phase 4) | [rfc-010-cli-planes-restructure.md](rfc-010-cli-planes-restructure.md) | | CLI refactoring — one addressing & config model post-`omnigraph.yaml`: scope + `--graph` + derived access path, served-default / privileged-direct, profiles, named queries, capability classifier (completes RFC-008) | [rfc-011-cli-refactoring.md](rfc-011-cli-refactoring.md) | | Provider-independent embedding configuration — one resolved `EmbeddingConfig` + sealed provider enum (Gemini/OpenAI/Mock), identity recorded in the schema IR, query-time same-space validation, NFR floor | [rfc-012-embedding-provider-config.md](rfc-012-embedding-provider-config.md) | +| Write-path latency — capture-once `WriteTxn`, version-pinned opens, one `GraphPublishAuthority` fed declarative `PublishPlan`s, manifest-authoritative lineage, epoch fence, bounded history (compaction + cleanup), and an IO-counted cost contract (`iss-write-s3-roundtrip-amplification`, `iss-991`) | [rfc-013-write-path-latency.md](rfc-013-write-path-latency.md) | ## Boundary diff --git a/docs/dev/rfc-013-write-path-latency.md b/docs/dev/rfc-013-write-path-latency.md new file mode 100644 index 0000000..fa4abf3 --- /dev/null +++ b/docs/dev/rfc-013-write-path-latency.md @@ -0,0 +1,1203 @@ +# RFC-013: Write-path latency — capture-once `WriteTxn`, manifest-authoritative publish, bounded history, and a measured cost contract + +**Status:** Proposed +**Author(s):** write-path latency investigation (handoff + multi-agent validation) +**Date:** 2026-06-19 +**Audience:** engine / storage maintainers +**Builds on:** +[rfc-009-unify-access-paths.md](rfc-009-unify-access-paths.md) (`GraphClient` — embedded ≡ remote), +the query-latency work (PR #268, read-path warm-up — the read-side twin of this change), +the iss-991 handoff (manifest-authoritative graph lineage / Phase 7), +[writes.md](writes.md), 
[execution.md](execution.md), [invariants.md](invariants.md). +**Tracking (dev graph `modernrelay`):** primary `iss-write-s3-roundtrip-amplification`; depth term `iss-991`; substrate seam `iss-863`/`iss-864`; branch-create `iss-691`; recovery `iss-856`/`iss-recovery-sweep-live-writer-rollback`/`iss-merge-recovery-partial-rollforward`; MemWAL `iss-681`; read twin `gap-read-path-rederivation`. + +> Status maintained by maintainers: `Proposed` while open, `Accepted` on merge. + +--- + +## Summary + +On object-store-backed clusters a single trivial write (one edge, one branch op) +issues **hundreds of mostly-sequential object-store round-trips**, and that count +**grows without bound with the graph's commit history**, so a long-lived graph +degrades to minutes per edge. The cost is invisible on a local filesystem +(µs/call) and to correctness tests (results are right, just slow), and it was +never measured because nothing in the suite counts *object-store round-trips per +logical operation*. + +This RFC specifies the optimal write path from first principles — **a write is a +pure function of one version-pinned snapshot, published in a single +manifest-atomic CAS** — and the **cost contract that makes its O(1)-in-history +guarantee provable and non-regressable** (deterministic IO-counted tests on every +PR). It collapses four hand-rolled writers into one `GraphPublishAuthority`, +moves graph lineage into the manifest (so the per-write `_graph_commits` scan +disappears), brings the internal metadata tables into compaction (so the +per-write `__manifest` scan stops growing), takes recovery off the hot path, and +adds an epoch fence for multi-writer safety. None of it is a substrate rewrite — +the manifest-CAS model is already correct and is exactly what Lance native +multi-table transactions (lance#7264) will later formalize; this RFC builds the +seam to that future and pays down the write path onto it. 
+ +**The dominant fix is demonstrated, not proposed:** a one-line opener-bypass +prototype (open writes direct-by-URI instead of through the lance-namespace builder) +flattens the depth-dominant term `31 + 12·depth → flat 4` and cuts a depth-80 edge +**2.7×** (1618 → 593 ops), measured end-to-end and functionally correct on +main/branch/node paths (§2.4). It is shippable as a standalone PR first (§9 step +3a); the rest of the RFC is the constant-factor + correctness + internal-residual +work layered on the same seam. + +--- + +## 0. Validation ledger (read this first) + +Every claim is tagged: **[M]** measured by me this cycle, **[S]** verified in +v0.7.0 source (`file:line` given), **[U]** verified against upstream +Lance/LanceDB/SlateDB source or docs, **[G]** tracked in the dev graph (slug +given), **[I]** inferred/reasoned. + +A correction from the originating handoff: it hypothesized that **Cloudflare R2 +walks the full manifest listing on every open** (a prod-only amplifier absent +from AWS). **This is false for the pinned Lance 7.0.0 [U].** R2 is treated as +lexically ordered (`list_is_lexically_ordered = !is_s3_express`, +`lance-io/.../providers/aws.rs:183`), so R2 gets the O(1) head-only manifest fast +path, same as AWS; only S3-Express buckets are excluded, and even those are O(1) +via the v7 `latest_version_hint.json`. There is no R2-list config fix because +there is no R2-list problem. + +**The depth term — corrected attribution.** Two measurements, one +instrumentation-blind, one complete: + +*(a) IOTracker probe [M] — internal tables only.* A throwaway probe (the +`warm_read_cost` harness applied to a single insert to `main`, swept across +commit depth) counted the two internal tables: `__manifest` ≈ 14 + 2·depth, +`_graph_commits` ≈ 9 + 2·depth → ≈ 23 + 4·depth, `write_iops = 1`. 
**But this +probe is structurally blind to the write path's per-table *data* opens** — they +bypass the instrumented opener (`table_wrapper`), so it reports `probes=0` for the +data tables. It measured the *minority* of the cost. + +*(b) Network-proxy measurement [M] — all RPCs, fresh graph.* A counting proxy in +front of `rustfs` (sees every object-store RPC, under `--mode merge` — the +production path), on a brand-new graph (400 seed nodes, one committing merge per +checkpoint), classified by S3 key: + +| commit depth | data `_versions` | `__manifest` | `_graph_commits` | node (RI) | schema | TOTAL | `write_iops` | +|---:|---:|---:|---:|---:|---:|---:|---:| +| 0 | 31 | 29 | 13 | 6 | 46 | 156 | 1 | +| 5 | 121 | 44 | 23 | 6 | 46 | 268 | 1 | +| 10 | 181 | 59 | 33 | 6 | 46 | 358 | 1 | +| 20 | 301 | 89 | 53 | 6 | 46 | 538 | 1 | +| 40 | 541 | 149 | 93 | 6 | 46 | 898 | 1 | +| 80 | 1021 | 269 | 173 | 6 | 46 | 1618 | 1 | + +Slopes: **data table +12/depth (~67%)**, `__manifest` +3/depth, `_graph_commits` ++2/depth → **TOTAL ≈ 156 + 18·depth**, `write_iops` flat at 1. The IOTracker probe +(a) saw only the +4/depth internal subset — blind to the data-table opens, the +dominant ~67%. + +**Constant-factor finding [M]: the schema contract is a flat 46 reads/write** — not +depth-scaling, but **29% of the depth-0 cost (46/156)**, from +`validate_schema_contract` re-running uncached on every resolve (`omnigraph.rs:561`). +A depth-slope gate will *not* catch it; WriteTxn's resolve/validate-once kills it, +and the §5.1 fitness assert (`validate_schema_contract_calls == 1`) is what pins it +(constant-factor delta, §6). + +The dominant term is **the written table's open routed through the lance-namespace +builder ~13× per write** — now source-traced. The **write** path opens via +`DatasetBuilder::from_namespace` (`namespace.rs:174`, from `open_table_head_for_write` +`table_store.rs:181` / `namespace.rs:544`). 
Lance's builder calls the namespace's +`describe_table` once and uses only `response.location` (`lance` `builder.rs:130-178`) +— but omnigraph's `describe_table` **opens the whole dataset** just to produce that +location (`open_head` → `Dataset::open`, `namespace.rs:362`/`:112`), and `.load()` +then **resolves the latest version again** — a **double latest-resolution per +open**, ~13× per write, nothing cached. Crucially, latest-resolution is **not +inherently O(depth)**: the namespace path is O(depth) because it **misses the V2 +lexical / `latest_version_hint.json` fast path** that the direct opener engages +(most likely because `load_table_from_namespace` attaches no shared `Session`/store +params, `namespace.rs:174` — inferred, not traced). The **read** path skips all of +it — `from_uri(location).with_version(N)`, one HEAD, O(1) — which is why reads are +flat (versus the write path's +12/depth on the data table, §0(b)). **Proven on omnigraph's own table [M]:** +a direct `Dataset::open` of the *same physical* 85-version edge table = **2 ops +(O(1))**, the `from_namespace` open of that identical table = the O(depth) sweep — +same bytes, two open paths. `checkout_version` is also O(1) — **exonerated**, not a +back-walk. So `from_uri().with_version(N)` is the O(1) primitive and step 3 makes +each open O(1) *intrinsically* (cleanup then becomes hygiene/interim, not +load-bearing for read cost — §2.3). **Mode-independent [U]:** `append` ≡ `merge` ≡ +12/depth, so §0(a) +measuring a single insert was *not* the defect — the defect is the namespace open +path, not the verb. **Using `from_namespace` per-open is a misuse of Lance's +design** (the namespace is a catalog/discovery layer — resolve once, then open the +dataset directly, `lance-namespace` `operations/index.md` **[U]**); the read path +already bypasses it (PR #268 Fix 2 — see §2.4). 
+ +**Corrected conclusion.** The depth blow-up is in omnigraph's DB layer and is +**data-table-dominated**: the redundant per-table opens (fixed by §9 step 3 — +WriteTxn open-once-by-pinned-version — plus scheduled *version cleanup* of the +node/edge tables) are ~70% of it; the uncompacted internal tables (§9 step 2) are +the secondary ~30%. Both the originating R2 hypothesis and the earlier "entirely +the internal tables" framing are wrong. The exact Lance call doing the data-table +chain re-read (`checkout_version` back-walk vs merge-insert conflict replay) is the +one unpinned item — see §12. Reads, by contrast, are flat in depth +(`warm_read_cost.rs`, PR #268). This is the O(history)-per-write → +O(N²)-cumulative behavior the production incident hit. + +--- + +## 1. Problem & measurements + +On object storage every call is a 10–100 ms RPC, there is no cheap stat, and +sequential RPCs serialize. A long-lived production graph on R2, originating handoff +**[M]**: + +| operation | prod (R2) | local `file://` | +|---|---|---| +| one-edge `load --mode merge` → main | ~3 min (90 s workflow timeout) | <1 s | +| `branch create --from main` | 120 s | <1 s | +| one-row `load` → a branch | 204 s | <1 s | +| `branch delete` | 216 s | <1 s | +| warm read / `/healthz` | fast (0.2–2 s) | fast | + +`iss-write-s3-roundtrip-amplification` **[G]** independently records the same: +cross-region single insert ~46 s, 5-node mutation ~110 s, vs ~390 ms for a +no-storage `/healthz`. Its acceptance criteria are this RFC's goal: *"a single +insert issues O(1)-to-few S3 round-trips, not O(number of tables); bulk mutations +amortize the manifest commit."* + +The cost decomposes into terms; the dominant one scales with history (§0): + +1. 
**Per-table opens through the O(depth) lance-namespace builder (DOMINANT, + O(tables × depth)).** Each stage opens via `DatasetBuilder::from_namespace` + (`namespace.rs:174`); its `describe_table` opens the whole dataset just to return + a location (`open_head` → `Dataset::open`, `namespace.rs:362`/`:112`) and + `.load()` resolves latest **again** — a double latest-resolution per open, + O(depth) on the repro store, ~13× per write with nothing caching it **[S]** + (§2.2). The read path's direct `from_uri().with_version(N)` is O(1). → + **+12 reads/depth, ~70% of the slope [M]**. Fixed by opening once, by pinned + version via the direct opener (§9 step 3); node/edge version *cleanup* bounds it + further. +2. **Per-write `__manifest` scan (O(history), secondary).** Every publish + full-scans the uncompacted `__manifest` (`load_publish_state` → + `read_manifest_scan`, `state.rs:133-141`) **[S]**; the internal tables are + never compacted/cleaned (`optimize` iterates node/edge only, + `optimize.rs:895-904`) **[S]**. +3.1 reads/depth **[M]**. +3. **Per-write `_graph_commits` refresh (O(history), secondary).** + `record_graph_commit` reloads the entire commit cache before each append + (`commit_graph.rs:136-164`) **[S]**; never compacted/cleaned. +2.1 reads/depth + **[M]**. The "read-path anti-pattern, now on writes" (`iss-991` handoff **[G]**). + +Terms 2+3 are the secondary ~30%; term 1 dominates. Plus per-write fixed taxes: a `list_dir("__recovery/")` (`loader/mod.rs:197`, +`exec/mutation.rs:725`, `exec/merge.rs:1090`) **[S]**, and the publisher CAS +retry budget (`PUBLISHER_RETRY_BUDGET = 5`, `publisher.rs:51`) **[S]**. + +Branch ops compound it: `branch create` is a per-table sequential fork loop +(`fork_branch_from_state`, `table_store.rs:282`); `branch delete` opens a +snapshot per *other* branch (`ensure_branch_delete_safe`, `omnigraph.rs:1317`) +and force-deletes per forked table sequentially (`cleanup_deleted_branch_tables`, +`omnigraph.rs:1359`) **[S]**. 
+ +--- + +## 2. Root cause (validated) + +### 2.1 The write re-derives its world from storage every stage + +`loader/mod.rs:400` captures a `snapshot` once, but downstream stages **ignore +it** and re-resolve **[S]**: + +- `open_for_mutation_on_branch` (`table_ops.rs:505`) re-calls + `resolved_branch_target` **per table** (`:512`), which runs + `ensure_schema_state_valid` (a full schema-contract storage read with no cache, + `omnigraph.rs:561-568`) and then opens **by head** via + `open_dataset_head_for_write` (`:522`/`:559`), asserting head == pinned only + *after* the open. +- `fresh_snapshot_for_branch` (`omnigraph.rs:771`) always does fresh I/O; the + fork authority path re-reads the live manifest (`table_ops.rs:574`). +- The captured snapshot is used only for membership/fork checks, never for the + actual opens. + +The drift guards, CAS retries, and recovery scans are **compensating machinery** +for the staleness this self-inflicts. The `Snapshot`/coordinator primitive +already exists; it is treated as cheap-to-reacquire rather than as the +operation's authoritative identity. + +### 2.2 The depth terms — data-table re-reads dominate, internal tables secondary + +Confirmed in code and measurement (§0). The **dominant** term is §2.1's per-table +opens: ~13 opens per write through the lance-namespace builder +(`DatasetBuilder::from_namespace`, `namespace.rs:174`). The builder calls the +namespace's `describe_table` (`lance` `builder.rs:130-178`), and omnigraph's +`describe_table` opens the whole dataset just to return a location (`open_head` → +`Dataset::open`, `namespace.rs:362`/`:112`); `.load()` then resolves latest again — +a **double latest-resolution per open**, O(depth) on the repro store — so cost +grows with the table's version count (+12 reads/depth, ~70%). 
The **read** path +opens direct `from_uri().with_version(N)` (`namespace.rs:112` / `SubTableEntry::open`) +— O(1) — and native pylance is flat 6 ops at any depth **[U]**, so this is +omnigraph's *namespace-open* pattern, not Lance; `checkout_version` is O(1) and not +implicated. (The heavier `list_table_versions` — `versions()` + a checkout per +version, `namespace.rs:395-427` — is **not** on this path; it is test-only today, a +separate latent O(depth): §10 follow-up.) The **secondary** terms are the two +internal tables: `load_publish_state` and +`commit_graph.refresh` each full-scan a table that gains a fragment per write and +is never compacted (+5 reads/depth, ~30%). This is the `gap-read-path-rederivation` +**[G]** failure mode — "cost grows with fragment count" — on the *write* path, +where PR #268 never reached. `invariants.md` documents the internal-table half: +*"the internal metadata tables (`__manifest`, `_graph_commits`) are still not +compacted, so the probe and refresh cost still grows with fragment count."* + +### 2.3 The `skip_auto_cleanup` interaction — and compaction ≠ cleanup + +v0.7.0 sets `skip_auto_cleanup: true` deliberately (`table_store.rs` 10 sites + +`publisher.rs:392`) **[S]** — load-bearing, because Lance 7's on-by-default +`auto_cleanup` would GC `__manifest`-pinned snapshot versions (`lance.md` audit) +**[U]**. Two distinct levers were turned off and must be replaced *separately*: +**compaction** (`compact_files`) rewrites small fragments into fewer larger ones +but does **not** prune old versions; **cleanup** (`cleanup_old_versions`) prunes +old versions. Measured on a ~85-version graph **[M]**: `optimize`/compaction +*added a version* (data-table reads 1035 → 1083, frags 81→1 — **no help** against +the depth term); `cleanup --keep 3` dropped it 1035 → 63 (89 versions pruned across +7 tables, **16×**). So only *cleanup* bounds the version-chain length. 
Note today's +`cleanup`/`optimize` cover **node/edge tables only** (the "7 tables"; internal +`__manifest`/`_graph_commits` are excluded, `optimize.rs:895-904` **[M]**) — so +bounding the internal +5/depth residual needs them **added** to the key set (§9 step +2's code change). Operationally: `cleanup` aborts on a remote store without `--yes` +(the +scheduled job must pass it). Relation to step 3: while the namespace open is still +on the write path, cleanup **caps** the dominant term — a real interim mitigation; +once step 3 opens direct-by-version (O(1) regardless of version count, §2.4), +cleanup is **storage hygiene + internal-table sprawl**, not load-bearing for read +cost. The correct replacement is *scheduled* compaction **and** version cleanup +(§9 step 2), **not** re-enabling `auto_cleanup`. Without it, version history (and +per-write cost) grows forever. + +### 2.4 Lance namespace: proper use (why the fix is bypass, not patch) + +The upstream Lance Namespace is a **catalog / discovery layer** — "table +discovery, resolving table locations, and coordinating commits" — whose intended +division of labor is *"the namespace provides basic information about the table, +[then] the Lance SDK … fulfill[s] the other operations"* (`lance-namespace` +`namespace/index.md`, `operations/index.md`) **[U]**. It is meant to be consulted +to *resolve a table once*, after which you operate on the `Dataset` directly — **not +consulted on every per-table open on a hot path.** `DatasetBuilder::from_namespace` +itself reflects this: it calls `describe_table` only to extract `location`, then +reduces to a `from_uri` builder (`lance` `builder.rs:130-178`). 
For a system that +*already holds* each table's location + version (omnigraph's `__manifest` does, via +`SubTableEntry`), routing per-open resolution back through the namespace is the +anti-pattern — and it aligns with this project's invariant 1 ("resolve latest state +through the substrate's cheap primitive instead of re-scanning") and the deny-list +"cold re-derivation on the hot path." + +So the fix is **bypass, not patch**: open writes by URI + pinned version +(`from_uri(location).with_version(N)`) — exactly what the **read** path already does +(PR #268 Fix 2; the read path's own comment notes the namespace open "would +full-scan `__manifest` twice per open (`describe_table` + `describe_table_version`)"), +so this completes #268's open-by-location migration on the write side (§9 step 3). +The **custom namespace impl stays** — it is still the right home for legitimate +*catalog* operations (`describe_table` / `table_exists` / `list_table_versions` / +`create_table_version` / managed-versioning commit coordination); only the +per-open *resolution* leaves it. Two Lance facts make this safe and final: opening +by explicit version is `default_resolve_version` = a single HEAD, O(1) (`lance` +`commit.rs:939-981`), and Lance's own latest-resolution cost work (version-hint, PR +#6752) confirms the latest path is the expensive one to avoid. **Proven on +omnigraph's own table [M]:** a direct `Dataset::open` of the *same physical* +85-version edge table is 2 ops (O(1)), while the `from_namespace` open of that +identical table is the O(depth) sweep — so latest-resolution is not inherently +O(depth); the namespace path is O(depth) only because it misses the fast path the +direct opener engages (likely the un-threaded `Session`). Step 3 therefore makes +each write open O(1) on its own — so node/edge `cleanup` (§2.3) is an **interim +mitigation + storage hygiene**, not load-bearing for read cost once step 3 ships. 
+ +**End-to-end proof [M] — the one-line opener bypass, measured.** A prototype +patched `open_dataset_head_for_write` (`table_store.rs:174`) to open directly by URI +(bypassing `from_namespace` — exactly step 3 / Alternative B), rebuilt v0.7.0, and +re-ran the depth sweep on a fresh graph: + +| depth | data `edgeVER` baseline | data patched | TOTAL baseline | TOTAL patched | +|---:|---:|---:|---:|---:| +| 0 | 31 | **4** | 156 | 121 | +| 10 | 181 | **4** | 358 | 173 | +| 20 | 301 | **4** | 538 | 233 | +| 40 | 541 | **4** | 898 | 353 | +| 80 | 1021 | **4** | 1618 | **593** | + +The dominant data-table term collapses `31 + 12·depth → flat 4` (O(1) in history), +the total slope drops `+18/depth → +5/depth` (the residual +5 is exactly the two +internal tables — step 2's scope), and at depth 80 a single edge drops **1618 → 593 +ops (2.7×)** from this one change alone, before step 2 / Phase 7. Functional +correctness verified on the hot paths: main edge merge, branch create + write + +read-back, node merge (managed-versioning still correct) — the direct opener already +handles `checkout_branch` for non-main, so the namespace layer was not load-bearing +for write correctness on these paths. **Caveat:** the prototype did **not** exercise +schema-apply, branch merge, fork-on-first-write to a new table on a branch, overwrite +mode, or concurrent writers — a production step 3 must pass the full +`merge_truth_table`/recovery/failpoint suite (the namespace may do +managed-versioning work that matters there). It proves the thesis + hot-path +correctness, not drop-in completeness. + +**Step 2 also proven [M].** On the step-3-patched binary at depth ~87, compacting +the internal tables to 1 fragment each (content-preserving) collapsed their scans: +`__manifest` 285 → 32 (8.9×), `_graph_commits` 177 → 11 (16×); the step-3 data term +stayed flat at 4. 
So **both depth terms are now empirically eliminated** — a depth-87 +single edge drops **~1720 → 198 ops (~8.7×; ≈258 s → ≈30 s at 150 ms/RTT)** with +both fixes. The internal term is **fragment-scan growth** (`read_manifest_scan` / +`commit_graph.refresh` read all fragments of the *latest* version), so the fix is +**compaction** (merge fragments) — distinct from the data table's version-chain term +that step 3 / version-cleanup handle. `optimize`'s `all_table_keys` +(`optimize.rs:895-904`) excludes the internal tables, so step 2 is a real code +change, not just scheduling. + +--- + +## 3. First principles + +On object storage the only objective function is **minimize the number of +*sequential* round-trips per logical operation, and make that number invariant to +graph age, history depth, and table count** — under the hard floor of SI, +durability, atomicity, and loud integrity. Three generating principles fall out, +each mapped to a validated failure: + +1. **Pin once, derive the rest (MVCC / invariant 15).** A write is a pure + function of one immutable, fully-pinned snapshot + `{branch, manifest_version, per-table (location, version, e_tag), schema_hash, + writer_epoch}`, resolved exactly once; every stage reads only from it + (open-by-pinned-version, O(1), cacheable); the only contact with "current" is + the final CAS. → fixes §2.1. +2. **One source of truth, one commit (invariant 2).** Visibility + lineage + + version bumps are **one atomic manifest write**; the commit graph, indexes, + and topology are *projections* of the manifest, never second authorities to + keep in sync. → fixes the §2.2 `_graph_commits` term (iss-991 Phase 7). +3. **The plan is the contract (correct-by-construction recovery).** The writer + serializes its *complete* publish intent **before any HEAD moves**; the live + commit and crash-recovery execute the *identical* plan, so they cannot + diverge. 
→ fixes the partial-publish bug class structurally + (`iss-merge-recovery-partial-rollforward`, PR #277). + +The optimal single-edge write under these: **~2–3 sequential hops, O(1) in size** +— 1 warm probe (0 if the coordinator is unchanged), 1 parallel stage of fragment +writes, 1 manifest CAS — regardless of 5 tables or 500, 10 commits or 10M. +Lance's own `test_commit_iops` (read 1 / write 2 / stages 3) **[U]** proves the +per-table primitive already hits this; the job is to make the *graph* write +inherit it. + +This is not speculative: it is exactly what the two reference object-storage +databases do. **LanceDB** threads a pinned `Arc<Dataset>` + shared `Session` and +commits with one CAS off a captured `read_version`, never re-resolving "latest" +under default consistency **[U]**. **SlateDB** captures a snapshot, treats a +monotonic-ID manifest (no pointer file) as the *sole* authority, commits with one +conditional-PUT, recovers on open (never per-write), fences with a monotonic +`writer_epoch`, and compacts on a schedule **[U]**. + +--- + +## 4. Reference-level design + +### 4.1 The interface — one publish authority, one declarative plan + +The deepest structural flaw is **four hand-rolled writers** (`load_as`, +`mutate_as`, `apply_schema_as`, `branch_merge_as`), each re-implementing open → +stage → commit → sidecar → lineage. There is **one publish machine**; the verbs +are different declarative plans fed to it. + +```rust +// The pinned, immutable operation identity — resolved ONCE. +struct WriteTxn { + branch: BranchRef, + base: PinnedSnapshot, // {manifest_version, per-table (loc,version,e_tag), schema_hash, writer_epoch} + session: Arc<Session>, // shared per-graph; warms metadata/index caches across opens + handles: HandleCache, // open-by-version; each table opened once, reused across stages +} + +// A typed, declarative publish plan — the COMPLETE "what", built before any HEAD moves. 
+enum TableAction { + Append(Stream), Upsert(Batch), Overwrite(Image), Delete(Pred), + Fork { from_version: u64 }, Register(Schema), Tombstone, +} +struct PublishPlan { + base: PinnedSnapshot, + actions: Map<TableKey, Vec<TableAction>>, + lineage: GraphCommitIntent, // parent = base.head; rides the SAME manifest CAS (Phase 7) + expected: Expectations, // per-table versions + graph_head + writer_epoch +} + +impl GraphPublishAuthority { + async fn open_txn(&self, branch: BranchRef) -> WriteTxn; // 1 warm probe + async fn publish(&self, txn: &WriteTxn, plan: PublishPlan) -> PublishedSnapshot; // stage∥ → 1 CAS +} +``` + +Properties that make it optimal: + +- **Stages take `&WriteTxn`/`&PublishPlan`, never storage** — re-resolution and + open-latest are *unrepresentable*. Invariants 2/3/15 hold by construction. +- **The recovery sidecar *is* the serialized `PublishPlan`.** Phase C and + recovery both call `plan.apply()` — a merge that bumps tables A+B can never + roll A forward and silently drop B. The + `iss-merge-recovery-partial-rollforward` bug class is gone by design. +- **One CAS.** `publish` issues exactly one conditional `__manifest` + merge-insert carrying every touched-table version + the `graph_commit` / + `graph_head` lineage rows + the `writer_epoch` check. +- **Verbs are thin lowerings.** `load`/`mutate`/`schema apply`/`branch merge` + each build a `PublishPlan` and call `publish`. Four copies → one machine; the + public `load_as`/`mutate_as` API is unchanged (it lowers internally). 
+ +The cost contract becomes part of `publish`'s documented API: + +> `publish(txn, plan)` costs `opens ≤ |plan.touched_tables|` (0 warm), +> `stages ≤ 3`, `manifest_ops = O(1)` — **invariant to history depth and table +> count.** + +### 4.2 Supporting mechanics (each validated this cycle) + +| Mechanic | Design | Validation | +|---|---|---| +| Open by pinned version | `from_uri(location).with_version(N)` + shared `Session` + warm handle cache — the O(1) opener *reads* already use (`instrumentation::open_table_dataset:112`, `SubTableEntry::open` `db/manifest.rs:200`). **NOT** the write path's `from_namespace` builder (`namespace.rs:174`), whose `describe_table` + `.load()` do an O(depth) double latest-resolution (§2.2 — the dominant cost), and **NOT** `open_dataset_at_state` (opens head then checks out, `table_store.rs:232`, not O(1)). | #0 **[S]** | +| Strict-op SI | Update/Delete/SchemaRewrite open by pinned version (consistent read base) and the publish CAS rejects a *same-table* advance. Insert/Merge rely on Lance's natural rebase. **Do not remove the open guards wholesale** — that is a silent lost-update. | #5 **[S]** | +| Fork × pinned-version | Fork already opens source at the pinned version and creates the target from it; the live-manifest authority re-read before fork stays (not defeated by the pin). | #6 **[S]** | +| Open-once via the direct opener (**THE dominant depth fix**) | Reuse is **intra-transaction** (open each table once, by pinned version, thread it — kills the ~13 namespace-builder opens, the O(depth) double latest-resolution / +12/depth term, §0/§2.2). A commit invalidates its own entry, so no cross-write warm cache. Thread the shared per-graph `Session` through write opens (it is *not* today — `load_table_from_namespace` attaches no session, `namespace.rs:174`). 
| #9 **[S]** | +| Lineage in the manifest (Phase 7) | Publish `graph_commit` + mutable `graph_head:<branch>` rows in the same `__manifest` merge-insert with a branch-head CAS; `_graph_commits` becomes a projection. Removes the per-write `commit_graph.refresh` and closes the "manifest→commit-graph atomicity" + "commit-graph parent under concurrency" gaps. | `iss-991` **[G]**, **[S]** | +| Bounded history (compaction **and** cleanup) | Bring the internal table(s) into the `optimize` loop AND schedule version *cleanup* of node/edge tables — compaction rewrites fragments, only cleanup prunes the version chain that §2.2's dominant term re-reads (§2.3). No blob/PK/CAS blocker (`__manifest` has no blob column, `state.rs:44-72`; the unenforced PK is orthogonal to a content-preserving Rewrite). Post-Phase-7 there is only **one** internal table to compact. | #8 **[S]** | +| Recovery off the hot path | Move the per-write `list_dir("__recovery/")` to coordinator-open + the CAS-conflict path, guarded by a sidecar-age grace window (the sidecar carries `created_at` micros + a ULID, `recovery.rs:762`/`:1522`). | #4, `iss-856`/`iss-recovery-sweep-live-writer-rollback` **[G][S]** | +| Epoch fence | Monotonic `writer_epoch` in `__manifest`, CAS-claimed at writer init, checked on every publish. Fences a whole zombie *writer* deterministically (no TTL); closes the multi-process exposure and the Lance-MTT TTL-lease gap. | SlateDB `FenceableTransactionalObject` **[U]** | +| Branch create | Lance `Clone` instead of the per-table fork loop (O(tables)→O(1) sequential). | `iss-691` **[G]** | +| Branch delete | Run the per-other-branch safety check and the per-table reclaim loops concurrently (`buffer_unordered`); read branch sets from in-memory coordinator state. | **[S]** | + +--- + +## 5. The cost contract — measurement & enforcement + +The bug class is invisible to correctness tests, to local-FS tests, and to +wall-clock benches. 
You can only prevent a regression in a quantity you **define +precisely, measure deterministically, and bound on every PR.** The quantity is +*sequential object-store round-trips per logical operation, as a function of +history depth and table count.* OmniGraph already has the correct pattern for +**reads** (`warm_read_cost.rs`, `IOTracker`, swept to depth 20); this RFC extends +it across the write/branch/open surface. This is exactly how Lance and SlateDB +enforce it **[U]**. + +### 5.1 Tier 1 — deterministic IO-counted gate (every PR) + +Ordinary `cargo test`, hermetic (in-memory / tempdir + `IOTracker`), no S3, no +wall-clock. Two shapes: + +```rust +// (A) cost-invariant-to-HISTORY — the load-bearing gate. Gate the MERGE verb (the prod path). +for depth in [10, 100, 1000] { // REAL commit history, not row count + build_history(depth); + reset_counters(); + let s = measured_merge(); // --mode merge, the read-modify production path + // PRIMARY — the dominant term (§0): the written table's data opens/reads, flat in depth. + assert!(s.data_table_opens <= touched_tables); // open each table ONCE, by pinned version + assert!(s.data_table_reads_per_open <= K_OPEN); // each open O(1) in the table's version count + // SECONDARY — internal-table scans flat in depth (compaction + cleanup). + assert!(s.manifest_ops <= K_MANIFEST); // small CONSTANT, NOT a function of depth + assert!(s.lineage_ops <= K_LINEAGE); + assert!(s.stages <= 3); // bounded sequential hops +} +assert_flat_across_depths(); // ALL terms — esp. 
data-table opens — flat in N + +// (B) fitness functions — architectural invariants AS tests +assert_eq!(validate_schema_contract_calls(write), 1); // resolve-once +assert_eq!(coordinator_resolutions(write), 1); // O(1) resolution +assert_eq!(recovery_listdir_calls(steady_state_write), 0); +``` + +**Prerequisite, not a follow-up: route ALL opens (read + write) through the one +instrumented opener BEFORE the gate is meaningful.** Today the write path's data +opens bypass `table_wrapper` (the §0(a) blind spot), so a gate that asserts only +`manifest_ops`/`lineage_ops` would **pass a still-broken build** — one that +compacts the internal tables (§9 step 2) but keeps the dominant ~13× namespace-open +sweep (§2.2). The gate MUST count data-table opens/reads (the dominant term), which +requires the routing change first. The data term is **mode-independent** (append ≡ +merge ≡ +12/depth **[U]**), so either verb exercises it; gate the **merge** verb +as the production path. **Fixture caveat [U]:** use *valid* edge endpoints — a +write to a non-existent endpoint fails RI validation and rolls back at ~192 ops +with **zero chain reads**, so a bad-endpoint fixture silently measures the rollback +path and would pass falsely. + +The load-bearing rule both Lance and SlateDB mostly miss: **assert the constant is +flat across N, not just small at one N.** A shallow fixture cannot catch an +O(history) cost (the §0(b) table is the red baseline). Add a `num_stages` +(sequential-hop) assertion via a `ThrottledStore` wrapper (Lance's +`test_commit_iops` setup) so an O(N) listing also blows a wall-time budget. + +### 5.2 Tier 2 — wall-clock trend (post-merge / nightly, never a PR gate) + +A `ThrottledStore` criterion bench injecting cross-region RTT (50/150 ms/op — the +incident's regime) for single-insert and branch-op latency, with a threshold +alert (Bencher.dev `--err` / github-action-benchmark `fail-on-alert`). 
Both +reference DBs keep wall-clock out of the PR gate (too noisy on shared runners) +and use it only as a trend. + +### 5.3 Close the loop — production metric + +Emit `storage.ops` and `storage.stages` per logical operation as a span/counter +(cheap always-on atomics; the heavy per-table attributing wrapper stays +test-only behind a `test-util`-style feature, zero release cost). The number +asserted in CI is the number observed in prod — `iss-write-s3-roundtrip-amplification`'s +cross-region signal becomes a direct readout. + +### 5.4 Process discipline — test-first for performance + +Write the depth-sweep cost-budget test **first**: it goes **red today** (§0), the +WriteTxn + Phase-7 + compaction work turns it **green** (flat in N), and the +red→green is the proof. This is CLAUDE.md rule 12 applied to cost, and the +originating handoff's sequencing (§8/§9: land the tests before the fix so the win +is measured and locked). Add the policy (extend invariant 15 + testing.md "Cost +budget tests"): *any change touching the read/write/branch/open path MUST add or +extend a cost-budget test asserting the metric is flat at history depth.* + +### 5.5 The correctness contract — concurrency tests (the safety twin of the cost gate) + +The cost gate proves *fast*; these prove *safe*. §6.5's multi-writer cliff slipped +the suite for the same structural reason the latency bug did — **nothing runs the +schedule that triggers it**: the suite is single-process with the in-process queue +(the bug is cross-process), uses local/in-memory stores (no object-store +cross-process CAS), and its recovery tests cover restart-time sweep, not +live-writer rollback. **These four must land before `PublishPlan`/epoch merge +(steps 5):** + +1. 
**Cross-process multi-writer on a real/emulated object store** (the *corruption* + case) — N independent engine **processes** writing the same `(table, branch)`; + assert all commit-or-cleanly-retry (no lost updates, no stuck "needs recovery," + no HEAD-ahead-of-manifest). **A single-process failpoint test cannot reproduce + the corruption** (in-process degrades to clean OCC, §6.5) — this genuinely needs + a multi-process harness (empirically 1/12 today). State that so nobody writes a + single-process test expecting it to fail. +2. **Deterministic in-process interleaving (failpoint) — WRITTEN, passes [M].** Two→ + eight handles, sleep failpoint at the `commit_staged`→publish window + (`loader/mod.rs:605`); resume losers and assert they retry cleanly. This + demonstrates the **benign** path (N=8 → 2 commit, 6 clean OCC retries) — it is the + regression guard for "in-process stays clean," *not* a reproduction of the + cross-process cliff. +3. **Live-writer recovery** (`iss-recovery-sweep-live-writer-rollback`) — a + concurrent open must not roll back a live in-flight publish (the grace window). +4. **Formal model** — a Quint/TLA+ model of `{two writers, interleave commit_staged + and manifest-CAS}` (`iss-934`); it finds the §6.5 cliff immediately. +5. **Cross-table write-skew — WRITTEN, red, and driven red→green in-process [M].** + Failpoint `loader.post_ri_pre_stage` (between RI-validation and staging): writer B + validates "Bob exists" and parks; writer A `overwrite`s `node:Person` dropping Bob + (non-cascading); B commits `Knows(Bob→Alice)` → committed orphan. The red test for + the §7.1 fix. **Acceptance is a single-process gate** — unlike the §6.5 HEAD-ahead + corruption (which genuinely needs the multi-process harness), this skew reproduces + *deterministically in one process*: the parked edge writer's snapshot really does + pin `edge:Knows:1` before the overwrite commits, so the overlap is real with two + in-process handles. 
The fix went red→green in-process behind a shared head row + (§7.1). Only #1–#4 (HEAD-ahead/epoch corruption) need cross-process scheduling. + +Plus one **disambiguating run** owed (§6.5 confound): separate-handles in-process +on S3 — to confirm the corruption is the process boundary, not the store. + +This mirrors the cost gate's discipline (assert across the dimension the suite +otherwise never exercises) — there, history depth; here, concurrent cross-process +schedules. + +--- + +## 6. What is already right vs. the deltas + +**Already correct — do not rewrite.** The in-memory `MutationStaging` accumulator, +the recovery sidecar mechanism, the per-(table,branch) write queue, D2, the sealed +`TableStorage` trait, and the read-path warm-up (PR #268) all stay. This is **not** +a substrate rewrite. + +**One claim to soften — manifest-CAS is atomic *per publish*, not unconditionally +cross-table-serializing [M].** The manifest CAS (the reference impl of the +lance#7264 "Alternative A") makes each publish atomic and serializes any two writers +whose write-sets **share a `__manifest` row** — overlapping or same-table, which is +exactly why §6.5's same-table cases and the cascading-delete case retry cleanly. But +two writers touching **disjoint** tables write disjoint per-`object_id` rows, so Lance +sees no conflict and **both commit** (proven [M], §7.1). The genuinely-atomic +cross-table commit §13 contrasts with Delta is the **target** (§4.1's single +merge-insert over a shared head row), **not current state**. So "do not rewrite the +CAS" holds for the *commit primitive*, but the cross-table-serialization §7.1 needs +is a real addition (the shared `graph_head` row), not something the current CAS +already provides. 
+ +**The deltas (each a validated, localized gap):** + +| # | Delta | Mechanism | Tracking | +|---|---|---|---| +| 1 | Snapshot re-derived per stage | capture-once `WriteTxn`, thread by ref | `iss-write-s3-roundtrip-amplification` | +| 2 | Write opens via `from_namespace` re-resolve the data-table ~13×/write, missing the fast path (**DOMINANT, +12/depth**) | open each table **once, direct `from_uri().with_version(N)`** (bypass namespace, §2.4) + shared Session | `iss-write-s3-roundtrip-amplification`, #0 | +| 3 | Lineage = 2nd authority, O(history) refresh (secondary) | Phase 7: lineage into `__manifest` | `iss-991` | +| 4 | `__manifest`/`_graph_commits` excluded from optimize/cleanup (`optimize.rs:895-904`; prototype pruned "7 tables" = node/edge only **[M]**) — the +5/depth residual after step 3 | **add them to `all_table_keys`** (a code change) + scheduled compaction/cleanup | `gap-read-path-rederivation` (write twin) | +| 5 | `list_dir("__recovery/")` per write | move to open + conflict, grace window | `iss-856`, `iss-recovery-sweep-live-writer-rollback` | +| 6 | 4 hand-rolled writers, commit↔recovery drift | one `PublishPlan` executed by both | `iss-merge-recovery-partial-rollforward` (PR #277) | +| 7 | No writer epoch (multi-process exposure) | `writer_epoch` in `__manifest` | — (new) | +| 8 | branch create = O(tables) fork loop | Lance `Clone` | `iss-691` | +| 9 | branch delete = sequential loops | concurrent `buffer_unordered` | — (new) | +| 10 | No write/branch cost gate (must count **data-table** opens; route all opens through the instrumented opener first) | Tier-1 IO-counted tests, merge verb | — (new) | +| 11 | Schema contract re-validated uncached per resolve (**flat 46 reads/write — 29% of depth-0 cost; constant, not depth**) | resolve/validate-once in `WriteTxn`; §5.1 `validate_schema_contract_calls==1` (the depth gate misses it) | `iss-write-s3-roundtrip-amplification` | + +--- + +## 6.5 Concurrency correctness — the multi-writer cliff (proven [M]) 
+ +The latency fixes are about *speed*; a separate, proven finding is about *safety*. +A multi-writer experiment **[M]** shows concurrent same-branch writers behave very +differently by topology: + +| topology | concurrency | outcome | +|---|---|---| +| single server (shared in-proc queue, `loader/mod.rs:426`) | 12 | **12 / 12 commit** (clean) | +| in-process, separate handles, interleave failpoint at `commit_staged`→publish (`loader/mod.rs:605`) | 8 | **2 / 8 commit; the other 6 are clean retryable OCC** | +| multi-process (separate CLIs / S3, no shared queue) | 2 / 3 / 5 / 12 | **1 / N commit; the rest CORRUPT** | + +**Two distinct failure modes — and the corruption is strictly cross-process:** + +- **In-process → benign.** Even with *separate handles, no shared queue, high + contention*, losers fail with `stale view of 'edge:Knows': expected manifest table + version 5 but current is 7 — refresh and retry` — a **clean, retryable OCC + conflict; graph state stays consistent.** The publisher CAS is doing its job. +- **Cross-process → corruption.** `Lance HEAD version N+1 ahead of manifest version + N; a pending recovery sidecar requires rollback`. **Mechanism:** a losing writer + advances the table's Lance HEAD (`commit_staged`) *before* the manifest CAS; when + the CAS loses, HEAD is ahead of the manifest — a partial commit the per-write heal + **defers** (`recovery.rs:978-988`; only the open-time sweep rolls back), so a + *live* writer hitting it **fails instead of healing**. Self-heals on the next + read-write reopen (not permanently bricked), but during a burst throughput + collapses to one survivor. Reachable at **concurrency = 2** cross-process. + +So in-process safety **already comes from the publisher CAS** (clean OCC); the +corruption needs the process boundary. 
*(Confound, stated honestly: the in-process +interleave ran on local-FS and the cross-process on S3-via-proxy — but +single-server-on-S3 was also clean (12/12), giving two independent "in-process +clean" points vs one "cross-process corrupt," triangulating on the process +boundary, not the store. One disambiguating run — separate-handles in-process on S3 +— would move this from triangulated to proven; §5.5.)* + +**Scoping (matters for urgency):** **single-server prod is serialized-correct, just +slow** — the in-process `(table,branch)` queue serializes same-branch writes (all 12 +commit, no lost updates); the production incident was the *latency* (serialized +O(depth) writes → 90 s timeout), **not** corruption. The corruption hazard is +**latent**: it appears the moment a second writer exists (server replica, +CLI-alongside-server, multi-writer scale-out). **So: single-server today = +serialized-correct (slow; fixed by steps 2/3); multi-writer = UNSAFE until +`writer_epoch` lands.** + +**The fix is the existing RFC, no new design.** The `A`-before-`B` window +(Lance HEAD moves before the manifest references it) is inherent to Lance's +per-table-lineage model — you cannot eliminate it, only fence and recover it: the +**`writer_epoch`** (delta #7) is a leader-lease via cross-process CAS so two writers +are never in the `commit_staged`→manifest-CAS window across processes (it removes +the concurrent-race dimension); the **`PublishPlan`=sidecar** (delta #6) makes a +single crashed writer roll forward/back deterministically (the crash dimension); and +**recovery off the hot path + grace window** (delta #5, Q2) is the exact reason the +live writers failed rather than self-healed (`iss-recovery-sweep-live-writer-rollback`). +This is the standard WAL-replay + leader-lease shape (confirmed against SlateDB's +`FenceableTransactionalObject` and Kleppmann's fencing-token canon, §10). 
**This +finding promotes #6/#7 from "nice correctness work" to the load-bearing guard that +gates multi-writer topologies — and it is the motivating case for them.** + +--- + +## 7. Invariants & deny-list check + +Touches and *strengthens* (does not weaken) invariants in +[invariants.md](invariants.md): + +- **§2 (manifest-atomic visibility):** preserved; lineage now rides the same CAS + (strengthens — closes the "manifest→commit-graph atomicity" gap). +- **§3 (one snapshot per op):** enforced *by construction* via `&WriteTxn`. +- **§4 (publish at one boundary):** unchanged — still one manifest publish. +- **§5 (recovery part of the commit protocol):** preserved; the sidecar *is* the + `PublishPlan` (strengthens — commit and recovery cannot diverge). The grace + window addresses the documented "recovery serialized against live writers + in-process only" gap. +- **§7 (indexes derived) / §15 (one source of truth, cheaply derived):** this RFC + is the write-side application of §15 — bound cost to the working set, not + history. The commit graph becomes derived (strengthens). +- **§5 strict-op SI:** preserved (#5 validation — open guards kept for + read-modify-write). + +**Deny-list:** does *not* hit "cold re-derivation on the hot path" (it removes +two instances), "state that drifts" (lineage stops being a second authority), or +"acks before durable persistence." The `writer_epoch` is the closing move on the +"local `write_text_if_match` is not a cross-process CAS" / multi-process gaps — +add it before admitting multi-process write topologies. + +No invariant is weakened. Two Known Gaps **close** (manifest→commit-graph +atomicity; commit-graph parent under concurrency, via Phase 7); one +(read-path-rederivation) gets its **write twin** filed and addressed. 
+ +### 7.1 Scope of the correctness claims (literature review, §13) + +The "correct by construction" framing (§3, §4.1) is **precise but bounded** — the +DB-canon review flags three places not to over-claim: + +- **Per-table serializability, not graph-wide — but the gap is narrow and now + measured [M].** Three deterministic cases (failpoint `loader.post_ri_pre_stage`, + placed between RI-validation and staging; red test in `tests/failpoints.rs`): + - **Cross-table *disjoint* → genuine skew, VIOLATED.** A **non-cascading endpoint + removal** — `node:Person` *overwrite* dropping Bob, touching only the node table + — concurrent with an edge insert `Knows(Bob→Alice)`: both commit (write-set-only + CAS, RI validated once pre-commit and never re-checked at publish) → **committed + orphan**. (= `iss-ri-write-skew-dangling-edges` + the concurrent face of + `iss-overwrite-orphans-committed-edges`.) + - **Cross-table *overlapping* → incidentally protected.** `delete`-based removal + **cascades** into `edge:Knows`, so the write-sets overlap, the per-table CAS + engages, and the loser fails **cleanly** (stale-view OCC retry); invariant held. + - **Same-table → NOT a separate skew.** Cardinality / `@unique(src)` have + overlapping write-sets, so the per-table CAS holds the constraint; the loser's + failure is the **HEAD-ahead corruption already scoped to #6/#7** (epoch + + PublishPlan), not a consistency hole. *(This corrects an earlier + over-generalization: cardinality/uniqueness do not share the read-set gap.)* + + So the skew is **reachable only for the non-cascading-overwrite × disjoint-edge-insert + shape** — operation-specific, not constraint-specific. 
+ + **The scoped fix alone is a no-op — proven [M], and the reason is mechanical.** + Feeding the endpoint node-table versions into the edge's publish *expected* set + (`check_expected_table_versions`, `publisher.rs:353`) was prototyped exactly; debug + confirmed the pins reach the check, **and both writers still committed — the orphan + persisted.** Every publish writes a *unique per-`object_id` row* into `__manifest` + (merge key `object_id = version_object_id(table, version)`). Two disjoint-table + writers (`node:Person` vs `edge:Knows`) touch **no common row**, so Lance's + row-level merge-insert CAS commits both with **no conflict**, the publisher's retry + loop **never fires**, and `check_expected_table_versions` — a **non-atomic + pre-check, not part of the CAS** — is evaluated exactly once against the stale + pre-both manifest and passes for both. The read-set pin only bites if the loser is + **forced to retry and re-evaluate against fresh state**, which requires a *shared + contention row* every publish touches. Adding a stand-in global head row + (`UpdateAll`-touched by every publish) makes the disjoint writers overlap → Lance + conflict → publisher retry → the reloaded pin (`edge:Knows:1` vs current `5`) + rejects the stale writer → no orphan (red→green, failpoint suite 52/52). **That + shared row is exactly Phase-7's `graph_head:<branch>`.** + + **Consequence — §7.1 is NOT a standalone single-server PR** (this corrects earlier text + that called it "single-server-live, not deferrable" — it *is* urgent and + epoch-independent, but it cannot ship against today's per-`object_id` manifest + without a contention point). Land it one of three ways: **(a)** with Phase 7 + (step 4), reusing `graph_head:<branch>` as the contention row; **(b)** behind a + minimal per-branch head row ahead of Phase 7 (~15 lines, as prototyped); or + **(c)** as commit-time re-validation — still must win a serialization point first. 
+ **Recommended: (c) behind a per-branch head row.** The CAS-map approach carries the + two costs §11 anticipated — *table-granularity false conflicts* (any `Person` + overwrite conflicts with any concurrent `edge:Knows` insert, even different rows — + needs a row-granularity read-set) and *scope* (a global head serializes the whole + graph; per-branch `graph_head` is the right granularity). Commit-time re-validation + is precise (no false positives) **and** reuses the same serialization point, so once + the head row exists it strictly dominates the CAS-map. Either way the head row + imposes an inherent trade — same-branch writers serialize cross-process (throughput + ceiling 1/branch, bounded by `PUBLISHER_RETRY_BUDGET`) — **now a correctness + requirement, not just a Phase-7 side effect** (§11). + + **Two faces, two fixes — do not bundle them.** The above addresses only the + *concurrent* face (overlapping snapshots, `iss-ri-write-skew-dangling-edges`). The + *sequential* face (`iss-overwrite-orphans-committed-edges`) — an overwrite drops a + node that **already has a committed inbound edge**, with *zero* concurrency — + **cannot** be caught by read-set-in-CAS: the later writer's snapshot legitimately + post-dates the edge, so its pin matches and it commits. That is a pure + **inbound-RI-validation** gap: when an overwrite/delete removes node endpoints, + re-check that no live edge references them. A validation concern, not a CAS one; + it needs no contention row and ships independently. + *(Note: `iss-984` is a different bug — remote branch-merge idempotency — not this.)* +- **Recovery: roll-forward is by-construction; roll-back is not.** "Commit and + recovery replay the identical plan" holds for the **redo** direction (shared + `plan.apply()`). 
The undo classifier (NoMovement / UnexpectedAtP1 / + UnexpectedMultistep / IncompletePhaseB) lives *outside* the shared executor, only + at open-time — that's where ARIES-style divergence risk concentrates and where the + §5.5 failpoint coverage is owed. +- **The fence and the cross-file atomicity rest on a linearizable conditional-put.** + Kleppmann's fencing-token guarantee, the manifest CAS, and the epoch all require a + linearizable register — true on S3/R2 (If-Match) but **not** on the local-FS path + (`write_text_if_match` is content-token compare-then-replace, ABA-prone — + `invariants.md` Known Gap). **Precondition to state up front: every "deterministic + fence" / "atomic CAS" claim holds *on a store with linearizable conditional-put*; + the epoch must not use the local-FS path.** Delta Lake §3.2.2 treats the + object-store consistency model (read-after-write + put-if-absent) as a first-class + design parameter; so should this RFC. + +--- + +## 8. Relationship to Lance MTT (the seam, not a dependency) + +`GraphPublishAuthority.publish(txn, plan)` is exactly the adapter to a future +Lance `catalog.transaction()`. lance#7264 ("Multi-Table Transactions via +Branching") is real and OmniGraph is its reference "Alternative A" +(fast-forward-main + WAL + roll-forward recovery) **[U]**, but it is a 5-day-old +discussion with two unbuilt dependencies (lance#7263 branch merge/rebase, +lance#7185 UUID branch paths), an unresolved central choice (it *favors* +pointer-swap — the opposite identity model from OmniGraph), and an open soundness +question (TTL lease needs an epoch). **Build the seam now on its own merits; do +not schedule around MTT landing.** When it ships, `publish`'s *body* swaps +(stage→CAS→sidecar → `catalog.transaction()`) while `WriteTxn`/`PublishPlan` and +every verb lowering stay. `iss-863`/`iss-864` **[G]** already scope this spike. 
+ +The MemWAL/LSM ingest tier (`iss-681` **[G]**, `dec-adopt-lance-v7-memwal`) is +**complementary, not competing, and not in flight** (the `memwal-benefit-analysis` +branch is an empty placeholder; the real analysis is commit `c9a81266`). MemWAL +sits *below* the manifest publisher (per-table durability, opt-in, intra-table); +`WriteTxn` owns the cross-table CAS. Build `WriteTxn` first. + +--- + +## 9. Sequencing + +Ordered by leverage and dependency. **The dominant depth term is the redundant +data-table opens (step 3), not the internal tables (step 2)** — §0; both must land +to flatten the curve. + +1. **Measure first (Tier-1 gate). ✅ LANDED (gate + harness).** *Prerequisite (1a):* + the write opener (`open_dataset_head`) is routed through the instrumented + `open_dataset_tracked` so the gate can count data-table opens (§5.1). The + write cost-budget tests live in `crates/omnigraph/tests/write_cost.rs` on a + **shared, store-agnostic harness** (`tests/helpers/cost.rs`: `measure`/`IoCounts`/ + `assert_flat`/`local_graph`/`s3_graph`) that `warm_read_cost.rs` and + `write_cost_s3.rs` also consume — one vocabulary, no duplicated `IOTracker` + plumbing. The local gate ships green every-PR guards + the RED `#[ignore]`'d + internal-table LOCK (step 2's red→green acceptance). *Still owed:* the prod + `storage.ops` span metric (§5.3) and the bucket-gated `write_cost_s3.rs` opener + LOCK (step 3a's red→green, S3-only per the §9-3a measurement note). +2. **Bound history — bring the INTERNAL tables into optimize/cleanup (a code + change, not just scheduling).** Today `optimize`/`cleanup` iterate **node/edge + keys only** (`optimize.rs:895-904`) — confirmed: the prototype's `cleanup --keep 3` + pruned "7 tables" = the node/edge data tables; `__manifest`/`_graph_commits` were + untouched **[M]**. 
So the residual +5/depth internal slope (§0b) is **not** fixed + by today's tooling — step 2 is a real `all_table_keys` change to add the internal + tables, then schedule compaction+cleanup (pass `--yes`; cleanup aborts on remote + otherwise). The pruning mechanism is proven on a data table (1035→63, 16× **[M]**); + the internal tables need the same inclusion. **Proven [M]:** compacting the + internal tables collapsed their scans `__manifest` 285→32, `_graph_commits` + 177→11; with step 3 a depth-87 edge drops **~1720 → 198 ops** (§2.4). (Separately, + node/edge cleanup **caps** the dominant data-table term as an interim *before* + step 3 — after step 3 that term is flat regardless.) **HARD PREREQUISITE:** the + Q8 boundary watermark must land **with** this step — Lance's version CAS is + confirmed vulnerable to cleanup-resurrection (§12 Q8, a silent lost write on + R2/S3), so scheduling cleanup without the watermark trades a latency bug for a + correctness bug. (`gap-read-path-rederivation` write twin.) +3. **The opener fix — a shippable lead + the structural follow-on.** + - **3a. Opener bypass (standalone PR, THE dominant fix — [M] proven). ✅ LANDED.** + `TableStore::open_dataset_head_for_write` now delegates to the direct + `open_dataset_head` opener (`Dataset::open` by URI + `checkout_branch`, routed + through `instrumentation::open_dataset_tracked` so the cost gate can count it; + no-op in prod) instead of the `from_namespace` builder. Measured end-to-end on + the prototype: data term `31 + 12·depth → flat 4`, total `+18 → +5/depth`, + depth-80 **2.7×** (§2.4), functionally correct on main/branch/node. + **Acceptance:** the full `cargo test --workspace --locked` suite passes under the + bypass (the `tests/` integration + `merge_truth_table` + recovery/failpoint + suites the prototype's `--lib` run didn't cover — schema-apply, branch merge, + fork-on-first-write, overwrite). 
**Namespace retired to test-only:** with both + reads (Fix 2) and now writes bypassing it, *nothing in production routes through + the Lance namespace* — confirming §2.4's premise. The dead per-table open chain + (`load_table_from_namespace`, `open_table_head_for_write`) was deleted and the + `StagedTableNamespace` contract apparatus gated `#[cfg(test)]`, mirroring the + already-`#[cfg(test)]` read namespace (`BranchManifestNamespace`). **Measurement + note (corrected):** the opener win is **S3-only** — local FS resolves latest with + one cheap `read_dir` regardless of opener, so the namespace-vs-direct difference + is invisible there (the local data-table read count *does* grow with depth, but + that is the merge-insert/RI scan over O(depth) *fragments*, a compaction term, + not the opener; depth-100 = 92 ops identically before and after the bypass). The + opener LOCK therefore lives in the bucket-gated `write_cost_s3.rs`, not the local + `write_cost.rs`. + - **3b. Full `WriteTxn` (capture-once + intra-txn handle reuse + shared Session).** + Formalize 3a's open-once into the pinned, threaded `WriteTxn` (re-resolution + *unrepresentable*, invariant 3) and kill the flat-46 schema-read constant + (resolve/validate-once, §0/§6). (`iss-write-s3-roundtrip-amplification`.) +4. **Phase 7 — lineage into the manifest.** Removes the per-write + `commit_graph.refresh`; commit graph becomes a projection. (`iss-991`.) + **Hard dependency: step 2 must land first (Q1, §12)** — each publisher retry + re-runs the O(history) `load_publish_state` scan, so the `graph_head` CAS + contention Phase 7 introduces is acceptable only once compaction bounds that + scan. Acceptance includes the Q1 concurrent-same-branch-writer gate. + **Carries the §7.1 concurrent write-skew fix.** The per-branch `graph_head` row is + the shared contention point the cross-table read-set-in-CAS needs — proven [M] + that the read-set fix is a no-op without it (§7.1).
So the concurrent face of the + write-skew lands *with* this step (or, if §7.1 must ship earlier, behind a minimal + per-branch head row — ~15 lines — or as commit-time re-validation). The + *sequential* face (`iss-overwrite-orphans-committed-edges`) is independent: + inbound-RI validation on node removal, no head row, ships anytime. +5. **`PublishPlan` unification + recovery off the hot path + epoch fence — the + multi-writer safety guard.** Collapse the four writers; move the `__recovery` list + to open/conflict; add the `writer_epoch` leader-lease. **Motivated by the proven + §6.5 cliff** (multi-process same-branch writers corrupt at concurrency = 2) — this + is the guard that makes multi-writer topologies safe, not optional polish. + **Gated by the §5.5 correctness contract** (the four concurrency tests must land + with it). `writer_epoch` must be a true cross-process conditional CAS — **not** + the local-FS `write_text_if_match` path (§7.1). (`iss-856`, + `iss-merge-recovery-partial-rollforward`, `iss-recovery-sweep-live-writer-rollback`, + `iss-934`.) +6. **Branch ops.** Lance `Clone` for create (`iss-691`); concurrent delete loops. +7. **Freeze** investment in publisher/sidecar/fork internals; pursue the MTT + seam (`iss-863`/`iss-864`) as the strategic exit. + +**Land PR #277 first** — it closes `iss-merge-recovery-partial-rollforward` and is +the producer-side half of the `PublishPlan` discipline; the heal-relocation in +step 5 must preserve its merge pre-snapshot heal (`exec/merge.rs:1084-1090`) and +its open-time `IncompletePhaseB → RollBack` (which the per-write heal never +performed anyway). + +--- + +## 10. Cross-reference map (the ties) + +**Dev-graph items (modernrelay) — what this RFC ties together:** + +- Primary: `iss-write-s3-roundtrip-amplification` (the bug). +- Depth term / Phase 7 (commit graph → manifest-derived projection): `iss-991` + (related: `iss-707` structured commit-graph lineage; `iss-934` Quint + multi-table-publish verification). 
Read twin: `gap-read-path-rederivation`. +- Substrate seam: `iss-863`, `iss-864`. Decision: `dec-adopt-lance-v7-memwal` + (`iss-681`). +- Recovery: `iss-856`, `iss-recovery-sweep-live-writer-rollback`, + `iss-merge-recovery-partial-rollforward`, `iss-903`, `iss-load-not-crash-safe`. +- Residual migration: `iss-950` (MR-A staged delete, retires D2), `iss-848` + (index-coverage reconciler, owns `create_vector_index`). +- Branch/load: `iss-691`, `iss-677`, `iss-895`, `iss-topology-cross-branch-cache`, + `iss-841`, `iss-982`, `iss-423`, `iss-989`. +- Concurrency correctness (survives MTT) — **two faces, two different fixes [M]** + (§7.1): `iss-ri-write-skew-dangling-edges` (the *concurrent* face; fix = + read-set-in-CAS **+ a shared `graph_head` contention row**, so it's coupled to + step 4 / a minimal head row / commit-time re-validation — NOT a standalone PR) and + `iss-overwrite-orphans-committed-edges` (the *sequential* face; fix = + **inbound-RI validation on node removal**, ships independently, no contention row). + *(`iss-984` — remote branch-merge idempotency — is unrelated; not a write-skew.)* +- Blockers: `blk-lance-6658` (shipped 7.0.0), `blk-lance-6666` (open, vector + index two-phase), `blk-lance-blob-compaction`. +- Epics: `epc-bulk-data-plane`, `epc-lance-v7-migration`, `epc-783` (reliability + harness), `epc-929` (Quint verification). + +**Proposed new dev-graph wiring (not yet written):** + +- New **Epic** `epc-write-path-latency` — owns the cluster of orphaned issues + above (none currently has an epic). +- New **Gap** `gap-write-path-rederivation` — the write twin of + `gap-read-path-rederivation` (current: write re-derives snapshot + scans + uncompacted internal tables per write; target: capture-once + bounded history). 
+- New **Issues**: write-side cost-budget gate + prod metric (step 1; prereq 1a + routes all opens through the instrumented opener); **opener bypass — open writes + direct-by-URI, standalone (step 3a, [M] the dominant fix, completes PR #268 Fix 2 + on the write path, §2.4)**; full `WriteTxn` capture-once (step 3b); **add + `__manifest`/`_graph_commits` to `all_table_keys`** for compaction+cleanup (step 2 + — a code change, `optimize.rs:895-904`); `PublishPlan` unification + epoch + (step 5); branch-delete concurrency (step 6). +- **Per-table namespace retired to test-only (step 3a landed).** With reads (Fix 2) + and now writes (step 3a) both opening direct-by-URI, *nothing in production routes + through the per-table `StagedTableNamespace`*. The dead open chain + (`load_table_from_namespace`, `open_table_head_for_write`) was deleted; the + `StagedTableNamespace` struct/impl/factory are now `#[cfg(test)]`, mirroring the + already-`#[cfg(test)]` read namespace (`BranchManifestNamespace`). Both are retained + only to validate the `LanceNamespace` contract in unit tests. *Production catalog / + managed-versioning commit coordination for `__manifest` itself goes through a + **separate** namespace (`GraphNamespacePublisher`), unaffected by this change.* The + former follow-up to harden `StagedTableNamespace::list_table_versions` + (`checkout_version` per version, O(depth)) is now purely a test-hygiene note — no + prod caller can hit it; if any future version-list / time-travel feature needs + per-table version enumeration, build `TableVersion`s from `versions()` metadata + directly rather than resurrecting the namespace open path. +- New **Decision** `dec-writetxn-manifest-authoritative-publish` — records this + RFC's design choice and the MTT-seam stance. 
+ +**Key source locations (v0.7.0):** +`omnigraph.rs:561-568,739-779,1317-1389`; `table_ops.rs:505-609`; +`table_store.rs:157-280,282-341,797`; `loader/mod.rs:197,400,485,557`; +`exec/mutation.rs:725`; `exec/merge.rs:1084-1090`; +`db/manifest/publisher.rs:51,93-124,356-371,385,432-440,448-490`; +`exec/mutation.rs:640-673` (D2 rule); `db/manifest/state.rs:44-72,133-141`; `db/manifest/layout.rs:22-26`; +`db/manifest/namespace.rs:111-112` (read open, O(1)),`:357-385`/`:362` (`describe_table` → redundant `Dataset::open` — the write-path double-open),`:158-186,544-550` (write open via `from_namespace`),`:395-427` (`list_table_versions` per-version checkout — test-only O(depth), the §10 follow-up); +`db/manifest/recovery.rs:762,978-988,1522`; `db/commit_graph.rs:136-164,213-272`; +`db/omnigraph/optimize.rs:240,517,895-904`; `instrumentation.rs:37,112-131`; +`runtime_cache.rs:202-283`; `tests/warm_read_cost.rs` (the read-side gate to mirror). + +**Upstream:** lance#7264/#7263/#7185 (MTT); Lance `with_version` O(1) open +(`from_namespace` → `describe_table`, `builder.rs:130-178`; `default_resolve_version` += one HEAD, `commit.rs:939-981`; version-hint PR #6752), +`list_is_lexically_ordered = !is_s3_express` (`aws.rs:183`), +`IOTracker`/`assert_io_*`/`num_stages`, `test_commit_iops`, +`test_commit_uses_version_hint_on_non_lexical_store`; **lance-namespace** design +(`namespace/index.md`, `operations/index.md` — catalog/discovery layer, resolve +once); LanceDB `io_tracking.rs`, `test_reload_resets_consistency_timer`; SlateDB +`FenceableTransactionalObject` (epoch fence), `InstrumentedObjectStore`, +monotonic-ID manifest. + +**Reproduce the §0(b) network measurement:** `rustfs` (S3-compat) on `:9000` +behind a ~90-LoC Go counting proxy on `:9100` (adds `LATENCY_MS`, preserves the +SigV4 `Host` header, `/__ctl/reset` + `/__ctl/stat`); an omnigraph cluster on +`s3://…/cluster` through the proxy. 
Single-write breakdown: reset the proxy log, +`load --mode merge` one edge, classify by S3 key. Depth slope: write N× to main, +diff the per-write log at depth D vs D+20 by table. Native baseline: pylance 7.0.0 +`write_dataset(mode="append")` in a loop → flat 6 ops/append at any depth. + +--- + +## 11. Drawbacks, alternatives, reversibility + +**Drawbacks.** Phase 7 makes disjoint-table same-branch writers contend on the +per-branch `graph_head` row (they don't today) — bounded by the publisher retry budget, +inherent to a linear per-branch DAG, gated on a measured concurrency test and on +step 2 landing first (§12 Q1, resolved). **Reframe [M]: this contention is +load-bearing for correctness, not merely a throughput tax.** The §7.1 write-skew is +*unreachable only because* the shared head row forces disjoint cross-table writers to +overlap, conflict, retry, and re-evaluate their read-set pins against fresh state +(proven — without it the scoped CAS fix is a no-op). So §7.1 and the head row are +**coupled**: the "drawback" is exactly what buys the cross-table invariant, and the +throughput ceiling (1 writer/branch, bounded by `PUBLISHER_RETRY_BUDGET`) is a +**correctness requirement** the moment §7.1 ships, not an optional Phase-7 side +effect. `PublishPlan` is a non-trivial refactor of four writers; it must land behind +the cost gate and the `merge_truth_table`/recovery/failpoint suites. + +**Alternatives.** (A) *Caching band-aid only* — memoize schema validation, cache +opens within a request: ~30–50% fewer round-trips but leaves open-by-latest and +the O(history) terms. Mitigation, not a fix. (B) *Opener bypass only* (open +direct-by-URI+version, no full txn) — **kills the dominant depth term, now measured +[M]**: a one-line patch flattened the data term `31+12·depth → flat 4` and cut a +depth-80 edge **2.7×** (§2.4), leaving only the secondary internal-table term and +the writer unification. (C) *Full design (this RFC)* — correctness by construction.
+(D) *Wait for Lance MTT* — future exit, not a current dependency (§8). +**Recommend: ship B as a standalone PR first (behind the step-1 gate), then C for +the constant-factor + correctness, then step 2 for the internal residual; D as the +strategic end-state.** B is the demonstrated dominant fix, not a partial one. + +**Reversibility.** The interface (`WriteTxn`/`PublishPlan`) is internal and +reversible. Phase 7's new `__manifest` object types (`graph_commit`, +`graph_head`) are an **on-disk format addition** — additive (old binaries skip +unknown `object_type`s) but near-permanent; it earns its own validation pass +(forward/back-compat, the validation checklist in the `iss-991` handoff). The +`writer_epoch` is likewise a durable manifest field. Everything else (compaction +scheduling, recovery relocation, branch concurrency, the cost gate) is cheap to +undo. + +--- + +## 12. Resolved questions (was: unresolved) + +All five original open questions were investigated read-only against post-#277/#284 +`origin/main`, upstream Lance 7.0.0, and the dev graph; each is resolved below. Two +further items (Q6, Q7) were resolved after peer review; one newly surfaced item (Q8) remains genuinely open. + +1. **`graph_head` CAS contention → RESOLVED, gated on step 2 + a concurrency test.** + Retry is publisher-owned; Lance's internal rebase-retry is disabled + (`conflict_retries(0)`, `publisher.rs:385`) → no double-retry. Row-CAS is true + one-winner (`TooMuchWriteContention` → retryable, `publisher.rs:432-440`), + bounded by `PUBLISHER_RETRY_BUDGET = 5`. **But each retry re-runs the O(history) + `load_publish_state` scan (`publisher.rs:455`)**, so `graph_head` contention + multiplies the manifest term — **step 2 (compaction) is a hard prerequisite for + step 4 (Phase 7)**. Same-branch is the real workload (the incident is concurrent + `main` writes). Residual: a measured gate before Phase 7 — N≈100 concurrent + same-branch writers, assert bounded retry + O(working-set) re-scan + P99 within + SLA.
Fallback: batched-lineage, or Alternative B (defer lineage-in-manifest). +2. **Recovery grace-window → RESOLVED.** PR #284 is **unrelated** (cluster-apply + trap; zero `recovery.rs` changes). The dangerous rollback classifications + (NoMovement / UnexpectedAtP1 / UnexpectedMultistep / #277's IncompletePhaseB) + fire only at the open-time Full sweep; the per-write heal defers all rollback + (`recovery.rs:978-988`), so moving the heal off the hot path doesn't break #277. + A sidecar-age grace window (defer sidecars younger than T_grace, loud typed + skip, `repair` override) on the existing `created_at`/ULID + (`recovery.rs:762`/`:1522`) is the sound interim; the permanent fix is the + in-process background reconciler `iss-856`. Lands step 5 with a failpoint test. +3. **Epoch fence × publisher CAS → RESOLVED (by construction).** With Lance retry + off (Q1), the publisher loop is the only retry layer. Model `writer_epoch` as a + **pre-publish hard-fail gate** beside `check_expected_table_versions` + (`publisher.rs:462`) but non-retryable (a stale epoch is a protocol violation, + not a race). No double-retry; the epoch gate and the row-CAS loop are + sequential. SlateDB `FenceableTransactionalObject` is the precedent. +4. **Compaction cadence → RESOLVED.** Not `auto_cleanup` (GCs pinned versions). + Not foreground every-N-writes (deny-list job-queue + invariant 7 + cost cliff). + Minimum (step 2): extend `optimize`/`cleanup` to the internal tables AND node/edge + version cleanup — no special-casing (`SidecarKind::Optimize` already covers a + mid-compaction crash). Follow-up: an `iss-856`-shaped background reconciler + triggered by a cheap fragment-count probe (work off the hot path; a reconciler, + not a job queue — deny-list-clean; SlateDB's epoch-coordinated compactor is the + precedent). CLI `omnigraph optimize` stays the operator override. +5. 
**`PublishPlan` residuals → RESOLVED.** Both `delete_where` and + `create_vector_index` are representable as `TableAction` variants with existing + sidecar coverage (`SidecarKind::Mutation`/`EnsureIndices`) and are + content-preserving (roll-forward safe). `TableAction::Delete` migrates to staged + two-phase via MR-A / `iss-950` (now unblocked — `blk-lance-6658` shipped); **D2 + retires then** (`enforce_no_mixed_destructive_constructive`, + `exec/mutation.rs:640-673`). `TableAction::CreateVectorIndex` stays inline until + `blk-lance-6666` ships (`iss-848` reconciler path). + +**Resolved post-review:** + +6. **The exact mechanism of the data-table chain re-read → RESOLVED (§0, §2.4).** + Pinned by Lance-source trace + proxy + pylance isolation **[U]**: it is **not** + `checkout_version` (O(1)) and **not** merge-insert conflict replay. The write + open goes through `DatasetBuilder::from_namespace` (`namespace.rs:174`), whose + `describe_table` opens the whole dataset just to return a location + (`namespace.rs:362`/`:112`) and whose `.load()` resolves latest **again** — a + double latest-resolution per open, ~13× per write, nothing cached. The open + resolves latest **without the V2 lexical / version-hint fast path** the direct + opener uses (likely the un-threaded `Session`/store params, + `load_table_from_namespace` `namespace.rs:174` — inferred, not traced), so it is + O(depth) where a direct `from_uri().with_version(N)` is O(1). **The mechanism + question is now academic for the fix:** bypassing `from_namespace` makes the open + flat regardless of the precise sub-mechanism (un-threaded `Session` / double + resolve / missed hint) — the bypass is the answer. (`list_table_versions` is + **not** on this path — test-only; §10 follow-up.) `checkout_version` stays + exonerated. + +**Resolved end-to-end [M]:** + +7. 
**End-to-end prototype of step 3 → DONE, measured (§2.4 before/after).** A + prototype patched the opener (`open_dataset_head_for_write`, `table_store.rs:174`) + to bypass `from_namespace` and open direct-by-URI, rebuilt v0.7.0, and re-ran the + sweep: the data term collapsed `31 + 12·depth → flat 4`, total `+18/depth → + +5/depth` (residual = the two internal tables, step 2), depth-80 **1618 → 593 ops + (2.7×)** — functionally correct on main edge merge, branch create+write+read, and + node merge. So step 3's "closes the dominant term outright" is **measured, not + inferred**, and the opener bypass is **shippable standalone** (§9 step 3a). + **Remaining (not blockers for step 3a's thesis):** the prototype did not cover + schema-apply / branch merge / fork-on-first-write / overwrite / concurrent — a + production opener change must pass the full `merge_truth_table`/recovery/failpoint + suite; and the internal-table cleanup demo (step 2) + the concurrency + fault-injection harness (steps 4/5) are still owed. + +**Newly surfaced (open):** + +8. **CAS-resurrection after cleanup → CONFIRMED VULNERABLE [S]; boundary watermark + is a HARD PREREQUISITE for step 2.** SlateDB found this race (RFC-0026 / issue + #352): a writer that stalls between computing manifest id `N+1` and creating it + can, *after GC deletes `N+1`*, re-create it and observe **false success**. + Lance 7.0.0 was traced directly and is **not immune**: version creation is a plain + `put_opts(naming_scheme.manifest_path(base, version), PutMode::Create)` / + `rename_if_not_exists` (`lance-table commit.rs:1421-1437`, `:1358`) on a + version-numbered, **pruneable** path, with **no monotonic/boundary/watermark + guard** anywhere in the manifest/commit/dataset path; `cleanup_old_versions` is + **timestamp-based** (`cleanup.rs:1086`), so it deletes the very file the only + guard (AlreadyExists→rebase) relies on. 
A stalled publisher whose target version + was pruned by step-2 cleanup gets a `PutMode::Create` **success on a non-existent + version → false success.** Severity by store: **R2/S3 (lexical, prod) = a silent + lost write** (the resurrected version doesn't win V2 latest-resolution, so data + lands on a dead branch while the publisher believes it committed); non-lexical = + the version hint (`commit.rs:1439`) is overwritten to the stale version and + trusted (worse, but not the prod path). **Action:** step 2 ships **only with** a + durable **boundary/floor watermark** (GC advances it before deleting; every writer + rejects `id <= boundary` after a "successful" create — SlateDB's fix), which also + bounds any list-then-read-latest fallback. This was "lowest-risk earliest item"; + it is now gated (§9 step 2). + +--- + +## 13. External validation (subagents + literature) + +Validated read-only against OSS prior art and the DB/distributed-systems canon: + +- **SlateDB** (canonical object-store LSM) — tenet-by-tenet ✅ on capture-once + snapshot, monotonic-ID manifest (no pointer file — *explicitly rejected* in their + RFC-0001), the **epoch fence** (exact match: `FenceableTransactionalObject`, + hard-fail, TTL-lease *explicitly rejected* — adopt as specified), background + epoch-coordinated compaction/GC, and recovery-on-open. **Adopt-items OmniGraph is + missing / under-specifies:** (1) the **boundary-file** CAS-resurrection guard (Q8); + (2) **group-commit batching** — coalesce pending `PublishPlan`s into one manifest + CAS, directly mitigating the Q1 / §6.5 contention; (3) SlateDB *peels* compaction + state *out* of the manifest (RFC-0013) — the **opposite** of Phase 7's fold-*in*; + §11 should defend "fold-in (lineage must be atomic with visibility) beats peel-out + for us"; (4) **write back-pressure** when cleanup lags (`l0_max_ssts`). 
**Citation + correction:** SlateDB has the per-RPC counter (`InstrumentedObjectStore`) but + **not** the flatness-across-history gate — the depth-swept Tier-1 gate (§5.1) is + OmniGraph-novel; cite it that way. +- **Literature** — OCC/MVCC (Kung-Robinson 1981; DDIA ch.7), ARIES redo/undo, the + fencing-token canon (Kleppmann — whose motivating example *is* OmniGraph's + S3-read-modify-write-paused-past-lease scenario), and the lakehouse genre (Delta + Lake VLDB 2020, Iceberg spec, Neon). The spine — OCC-over-MVCC + one atomic + manifest CAS + WAL-of-intent recovery + monotonic-epoch fence — is canon-blessed, + and OmniGraph **exceeds** Delta/Iceberg on the axis that matters (both are + explicitly *single-table*-transactional; the manifest CAS delivers the atomic + *cross-table* commit Delta only speculates about). The three scoping caveats are in + §7.1. +- **HelixDB** (embedded LMDB graph DB) — too different a substrate to validate the + object-store machinery (LMDB's `commit()` subsumes tenets #2–#8 for free), but it + **corroborates tenet #1** (capture-once, thread-by-reference, re-resolution + unrepresentable — its `&mut RwTxn`-threaded traversal is the embedded twin of + `WriteTxn`) and confirms the bug class is **substrate-induced**. Portable idea for + the roadmapped traversal work: adjacency as a *persisted, sorted, + label-partitioned projection* keyed by `(node, label)` (vs the cold-rebuilt + `TypeIndex`). 
diff --git a/docs/dev/testing.md b/docs/dev/testing.md index 6a62580..941cec6 100644 --- a/docs/dev/testing.md +++ b/docs/dev/testing.md @@ -26,6 +26,8 @@ The engine's `tests/` is the principal coverage surface; most graph-shaped behav | `forbidden_apis.rs` | Defense-in-depth source-walk guard: engine code (`exec/`, `db/omnigraph/`, `loader/`, `changes/`) must not reach around the sealed storage trait to Lance inline-commit APIs, nor open datasets directly (`Dataset::open` / `DatasetBuilder::from_uri`/`from_namespace`) — reads route through `Snapshot::open` and the held-handle cache; `// forbidden-api-allow: ` sentinel exempts reviewed lines | | `lance_surface_guards.rs` | Pins the Lance API surfaces omnigraph depends on (named runtime + compile-only guards; see [lance.md](lance.md)) — the first smoke check on any Lance version bump; e.g. `compact_files_still_fails_on_blob_columns` turns red when the upstream blob-compaction fix lands | | `warm_read_cost.rs` | Cost-budget tests for the warm read path (query-latency work), measured at the object-store boundary with Lance `IOTracker` (the LanceDB IO-counted pattern): a warm same-branch read does 0 manifest opens, 0 commit-graph opens, 1 version probe, validates the schema once (Fix 1 / finding A / Fix 2 at commit-history depth); stale same-branch reads perform exactly 2 probes and refresh manifest-only; recreated non-main branches with the same Lance version refresh by incarnation; recreated branch-owned table handles are distinguished by table e_tag or refresh-time cache clearing; recreated traversal topology is protected by synthetic snapshot-id incarnation or refresh-time cache clearing; a warm *repeat* read does 0 table opens via the held-handle cache and a write re-opens only the changed table at its new version/e_tag (Fix 3/6A). 
See "Cost-budget tests" below | +| `write_cost.rs` | Cost-budget tests for the WRITE path (RFC-013), the latency twin of `warm_read_cost.rs` on the **shared `helpers::cost` harness** (`measure`/`IoCounts`/`assert_flat`/`local_graph`). Runs on **local FS**; gates the **internal-table** term (`__manifest`/`_graph_commits` scans flat in commit-history depth — the RED `internal_table_scans_are_flat_in_history` LOCK, `#[ignore]`'d until internal-table compaction lands) plus green every-PR guards (single-insert `data_writes` bounded, a per-write read-op ceiling that fails the moment a round-trip is added, and a `measure_with_staged` fitness assert that a keyed insert routes through `stage_merge_insert` once with no `stage_append`/vector-index build). The **data-table opener** term is S3-only — see `write_cost_s3.rs` and the backend-split note in "Cost-budget tests" below | +| `helpers/cost.rs` | The shared cost-budget harness (not a test): `IoCounts`/`StagedCounts` (counts by table class), `measure`/`measure_with_staged` (the one place the `with_query_io_probes` + `MergeWriteProbes` task-local + `IOTracker` wiring lives), `assert_flat(curve, select, slack, what)`, and store-agnostic `local_graph`/`s3_graph` fixtures. 
`warm_read_cost.rs`, `write_cost.rs`, and `write_cost_s3.rs` all consume it so a cost test body is written once and reads in one vocabulary | | `lifecycle.rs` | Graph lifecycle, schema state | | `point_in_time.rs` | Snapshots, time travel (`snapshot_at_version`, `entity_at`) | | `changes.rs` | `diff_between` / `diff_commits` | @@ -69,9 +71,10 @@ The engine's `tests/` is the principal coverage surface; most graph-shaped behav ## RustFS / S3 integration -CI runs three S3-backed tests against a containerized RustFS server (`.github/workflows/ci.yml` → `rustfs_integration` job): +CI runs these S3-backed tests against a containerized RustFS server (`.github/workflows/ci.yml` → `rustfs_integration` job): - `cargo test -p omnigraph-engine --test s3_storage` +- `cargo test -p omnigraph-engine --test write_cost_s3` (RFC-013 step 3a's data-table opener cost gate — flat across commit depth on S3; the term local FS can't reproduce) - `cargo test -p omnigraph-server --test s3` (single-graph serving + config-free `--cluster s3://` boot) - `cargo test -p omnigraph-cluster --test s3_cluster` (full control-plane lifecycle on the bucket) - `cargo test -p omnigraph-cli --test system_local local_cli_s3_end_to_end_init_load_read_flow` @@ -126,7 +129,7 @@ When you pick up any change, walk through this: 6. **For substrate-touching changes** (Lance behavior), reach for `failpoints` or fixture-driven scenarios, not stubbed-out mocks. 7. **For server / API changes**, confirm the OpenAPI regeneration happens in `openapi.rs` and that the diff lands in `openapi.json`. 8. **Verify your change makes an existing test fail before it makes the new one pass.** If you can break the code without breaking a test, your coverage gap is the problem to fix first. -9. **Bound hot-path cost at history depth.** If the change touches a read or open path, add or extend a test that asserts a *bounded* cost (e.g. 
a warm same-branch read performs zero `Dataset::open`, or a fixed object-op count) against a fixture with realistic *commit-history depth*, not just realistic row counts. Cost that scales with history is invisible on a shallow fixture and only bites in production. See "Cost-budget tests" below. +9. **Bound hot-path cost at history depth.** If the change touches a read, **write**, or open path, add or extend a test that asserts a *bounded* cost (e.g. a warm same-branch read performs zero `Dataset::open`, or a per-write read-op count flat across commit depth) against a fixture with realistic *commit-history depth*, not just realistic row counts. Reuse the shared `helpers::cost` harness (`measure`/`IoCounts`/`assert_flat`) — don't hand-roll `IOTracker` wiring. Cost that scales with history is invisible on a shallow fixture and only bites in production. See "Cost-budget tests" below. ## Cost-budget tests: bound hot-path cost at history depth @@ -134,6 +137,7 @@ Correctness bugs fail loudly in tests; cost-scaling bugs pass every test and deg - **Assert a cost budget, not just a result.** For a read/open path, assert the number of `Dataset::open` calls (or object-store ops) a warm query performs, and that it does not grow with commit count. The reference is LanceDB's IO-counted tests, which assert a cached read costs 0-1 IO and carry a named regression test against "a list call on every subsequent query." - **Test at history depth.** Build a fixture with many *commits* (not many rows) and assert warm-read cost is flat across depths. A shallow fixture cannot catch an O(commits) cost. +- **Use the shared harness, and gate each term on the backend where it manifests.** `helpers::cost` (`measure`/`IoCounts`/`assert_flat`/`local_graph`/`s3_graph`) is the one place the `IOTracker`/task-local plumbing lives — consume it, don't duplicate it. 
The write path has *two distinct* depth terms that split cleanly across backends, and conflating them is a real trap (the local data-table read count grows with depth too, but for a different reason — the merge-insert/RI scan reading O(depth) *fragments*, reduced by compaction, not by the opener): (1) the **internal-table** scan term (`__manifest`/`_graph_commits` fragment scans) reproduces on **any** backend including local FS, so `write_cost.rs` gates it on local every-PR; (2) the **data-table opener** term (latest-version resolution) is a per-object-store-RPC phenomenon — local-FS resolves latest with one cheap `read_dir` regardless of the opener used, so the namespace-vs-direct difference is **invisible on local** and only shows on a real object store (per-version GETs), gated by the bucket-gated `write_cost_s3.rs`. Same harness, different fixture; each term asserted where it actually appears. - This is the testing companion to invariant 15 in [docs/dev/invariants.md](invariants.md) (hot-path cost is bounded by work, not history). When in doubt, re-read [docs/dev/invariants.md](invariants.md) — quality gates apply to every change. From 9c792649e2f58d5abe934d277a0aa7773fbcf03a Mon Sep 17 00:00:00 2001 From: Andrew Altshuler Date: Sun, 21 Jun 2026 00:02:34 +0300 Subject: [PATCH 4/8] docs(user): coherence cleanup aligned with 0.7.1 (#293) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * docs(cli): fix cluster apply semantics — converges graphs+schema, not config-only `cluster apply` creates graphs, applies schema updates (soft drops), writes stored-query/policy catalog resources, and executes approved graph deletes in one ordered run. Both the user docs and the shipped CLI help text still described it as a "Stage 3A" config-only (query/policy) subset that defers graph/schema changes "to a later stage" — wrong since the graph/schema executor landed. 
- docs/user/cli/reference.md: rewrite the cluster paragraph to describe apply's actual converge behavior; keep deferred for the genuinely-unsupported case (standalone schema deletes); drop the stale "Stage 3A" / "reserved for later stages" framing. - crates/omnigraph-cli/src/cli.rs: fix the `cluster apply` help text to match. Part of the docs/user coherence cleanup (docs/dev/docs-issues.md, P1). Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01FQ1Hf4eXLsJmeLUkTYBEw7 * docs(server): align stored-query exposure with cluster-only behavior server.md documented a per-query expose knob ("`mcp.expose` defaults to true; set `mcp: { expose: false }` to hide from the catalog") that does not exist in the only deployment mode. Cluster-only serving lists every stored query: the cluster registry has no expose field (`QueryConfig { file }`) and the boot bridge hardcodes `expose: true` for all cluster queries (omnigraph-server settings), and there is no GQ-level expose annotation. This contradicted clusters/config.md, which already states the correct behavior. Replace the knob bullet with the cluster truth (every applied query is listed; per-query exposure may become a Cedar-policy decision later) and drop the "`mcp.expose` stored queries" phrasing from the catalog description, the endpoint table, and the intro. The `mcp_expose` JSON catalog field is unchanged (still emitted, always true in cluster mode). Part of the docs/user coherence cleanup (docs/dev/docs-issues.md, P1). Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01FQ1Hf4eXLsJmeLUkTYBEw7 * docs(schema): split direct/embedded vs cluster-managed schema apply schema/index.md claimed `allow_data_loss` is "honored uniformly across transports" and listed HTTP `POST /schema/apply` among them. 
But that route is 409-disabled for cluster-backed serving (already documented in server.md), and cluster-managed graphs evolve only through `cluster apply` with soft drops — there is no cluster HTTP data-loss path. Scope the data-loss flag to the direct/embedded path (`schema apply --store`, SDK), and add a paragraph: cluster-managed graphs use `cluster apply` (soft drops only); HTTP `POST /schema/apply` is 409 for cluster serving; direct apply against a cluster-managed path is refused. Cross-refs server + cluster docs. Part of the docs/user coherence cleanup (docs/dev/docs-issues.md, P2). Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01FQ1Hf4eXLsJmeLUkTYBEw7 * docs(server): document /load as canonical in limits + admission prose The endpoint table already listed both `/load` (canonical) and `/ingest` (deprecated alias) at 32 MB, but the admission-control, body-limit, rate-limit, and manifest-conflict prose named only `/ingest` — and the constants page called the limit "Ingest body limit". Add `/load` alongside (or ahead of) `/ingest` everywhere, and rename the constant to "Load (bulk-write) body limit" noting the `/ingest` alias shares it. Part of the docs/user coherence cleanup (docs/dev/docs-issues.md, P2). Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01FQ1Hf4eXLsJmeLUkTYBEw7 * docs(cli): drop stale bearer-token keys + fix version string The "Bearer token resolution (CLI)" section still listed removed omnigraph.yaml keys (`graphs..bearer_token_env`, `auth.env_file`) — config surfaces that no longer exist and that implied plaintext tokens in config. Replace it with a pointer to the keyed-credential model documented above (`OMNIGRAPH_TOKEN_` → `~/.omnigraph/credentials` → `OMNIGRAPH_BEARER_TOKEN`). Also fix the `version` row: the CLI prints 0.7.x, not 0.3.x. Part of the docs/user coherence cleanup (docs/dev/docs-issues.md, P2 + smaller). 
Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01FQ1Hf4eXLsJmeLUkTYBEw7 * docs: route-spelling note + drop stale stage/deferred crumbs - server.md: add a one-line note that the per-graph subsections name routes in shorthand (`GET /queries`, `POST /query`, `POST /mutate`, `POST /queries/{name}`) but every one is served under `/graphs/{id}/…` — the endpoint table is already fully-qualified. - clusters/config.md: redefine the `deferred` plan disposition as an unsupported change (e.g. a standalone schema delete) instead of "graph/schema change, later phase" (graph creates and schema updates apply now); drop the "Stage 2C" label from the lock-recovery note. - search/indexes.md: `ingest --mode merge` → canonical `load --mode merge`. Part of the docs/user coherence cleanup (docs/dev/docs-issues.md, P2 + smaller). Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01FQ1Hf4eXLsJmeLUkTYBEw7 * docs(dev): track user-docs coherence ledger; mark 2026-06-20 findings resolved Convert the scratch review notes into a tracked living ledger and link it from the dev index. All ten findings from the 2026-06-20 docs/user sweep are validated and fixed in this branch (P1 cluster-apply semantics + stored-query exposure; P2 schema-apply paths, /load canonical, bearer-token keys, route shorthand; plus version/ingest/deferred/stage crumbs). The verification grep checklist is retained for future audits. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01FQ1Hf4eXLsJmeLUkTYBEw7 * docs(api): align GET /queries OpenAPI contract with cluster-only behavior Greptile P1 on #293: the prose fix in server.md left the OpenAPI surface stale. 
The utoipa annotations (handlers.rs, omnigraph-api-types QueriesCatalogOutput) still described the catalog as "the `mcp.expose == true` subset", and those drive the checked-in openapi.json — so SDK consumers read a contract the cluster-only server does not honor (it lists every stored query). Update the three Rust doc-comment/annotation strings to "every stored query" and regenerate openapi.json (OMNIGRAPH_UPDATE_OPENAPI=1; drift test green) in the same change, per AGENTS.md rule 4. Ledger updated: this finding resolved, plus the cross-repo drift it surfaced (omnigraph-ts generated spec/types and omnigraph-cookbooks best-practices bearer_token_env) tracked as open follow-ups. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01FQ1Hf4eXLsJmeLUkTYBEw7 --------- Co-authored-by: Claude Opus 4.8 (1M context) --- crates/omnigraph-api-types/src/lib.rs | 4 +- crates/omnigraph-cli/src/cli.rs | 6 ++- crates/omnigraph-server/src/handlers.rs | 7 +-- docs/dev/docs-issues.md | 68 +++++++++++++++++++++++++ docs/dev/index.md | 1 + docs/user/cli/reference.md | 41 ++++++++------- docs/user/clusters/config.md | 13 ++--- docs/user/operations/server.md | 24 +++++---- docs/user/reference/constants.md | 2 +- docs/user/schema/index.md | 6 ++- docs/user/search/indexes.md | 2 +- openapi.json | 6 +-- 12 files changed, 134 insertions(+), 46 deletions(-) create mode 100644 docs/dev/docs-issues.md diff --git a/crates/omnigraph-api-types/src/lib.rs b/crates/omnigraph-api-types/src/lib.rs index 2814602..32bc753 100644 --- a/crates/omnigraph-api-types/src/lib.rs +++ b/crates/omnigraph-api-types/src/lib.rs @@ -401,8 +401,8 @@ pub struct QueryCatalogEntry { pub params: Vec, } -/// Response for `GET /queries`: the `mcp.expose` subset of a graph's -/// stored-query registry, each with typed parameters. +/// Response for `GET /queries`: every stored query in a graph's +/// registry, each with typed parameters. 
#[derive(Debug, Clone, Serialize, Deserialize, ToSchema)] pub struct QueriesCatalogOutput { pub queries: Vec, diff --git a/crates/omnigraph-cli/src/cli.rs b/crates/omnigraph-cli/src/cli.rs index 94bec5a..2b1a861 100644 --- a/crates/omnigraph-cli/src/cli.rs +++ b/crates/omnigraph-cli/src/cli.rs @@ -397,8 +397,10 @@ pub(crate) enum ClusterCommand { #[arg(long)] json: bool, }, - /// Apply the config-only (query/policy) subset of the plan to the local - /// cluster catalog. Graph/schema changes are deferred to a later stage. + /// Converge the cluster to its config: create graphs, apply schema updates + /// (soft drops), write stored-query/policy catalog resources, and execute + /// approved graph deletes, in one ordered run. Serving picks up the applied + /// revision after an `omnigraph-server --cluster` restart. Apply { /// Cluster config directory containing cluster.yaml. #[arg(long, default_value = ".")] diff --git a/crates/omnigraph-server/src/handlers.rs b/crates/omnigraph-server/src/handlers.rs index 7de38d2..1571164 100644 --- a/crates/omnigraph-server/src/handlers.rs +++ b/crates/omnigraph-server/src/handlers.rs @@ -1026,7 +1026,7 @@ pub(crate) async fn server_invoke_query( tag = "queries", operation_id = "list_queries", responses( - (status = 200, description = "Stored-query catalog (the mcp.expose subset, with typed params)", body = QueriesCatalogOutput), + (status = 200, description = "Stored-query catalog (every stored query, with typed params)", body = QueriesCatalogOutput), (status = 401, description = "Unauthorized", body = ErrorOutput), (status = 403, description = "Forbidden", body = ErrorOutput), ), @@ -1034,10 +1034,11 @@ pub(crate) async fn server_invoke_query( )] /// List the graph's exposed stored queries as a typed tool catalog. 
/// -/// Returns the `mcp.expose == true` subset of the `queries:` registry, each +/// Returns every stored query in the `queries:` registry, each /// with its MCP tool name, read/mutate flag, description/instruction, and /// typed parameters — enough for a client to register them as tools without -/// fetching `.gq` source. Read-gated; the catalog is graph-wide (branch +/// fetching `.gq` source. Cluster-served graphs have no per-query expose flag, +/// so the catalog lists them all. Read-gated; the catalog is graph-wide (branch /// independent — `read` is authorized against `main`). **Not** Cedar-filtered /// per query yet, so it can list a query whose `invoke_query` the caller /// lacks (a known gap until per-query authorization lands). diff --git a/docs/dev/docs-issues.md b/docs/dev/docs-issues.md new file mode 100644 index 0000000..d045f4b --- /dev/null +++ b/docs/dev/docs-issues.md @@ -0,0 +1,68 @@ +# User Docs Coherence Ledger + +**Last review:** 2026-06-20 (against 0.7.1) +**Status:** all open findings resolved — living ledger for future audits. + +This page tracks stale or incoherent user-doc claims found during broad docs +reviews. Findings are validated against current **code/behavior**, not just +cross-doc consistency. Record new findings as they surface; mark them resolved +(with the fixing commit) once the public pages are corrected. + +## Resolved — 2026-06-20 docs/user coherence sweep + +Every finding from the 2026-06-20 review was validated (all reproduced) and +fixed. Branch `docs/user-coherence-0-7-1`. + +| Pri | Finding | Resolution | +|---|---|---| +| P1 | `cluster apply` documented as catalog-only / "Stage 3A" with graph+schema deferred — in both `cli/reference.md` and the shipped CLI help (`cli.rs`) | Rewrote both to describe the real converge behavior (creates graphs, applies schema with soft drops, writes catalog, executes approved deletes in one ordered run); `deferred` now means the genuinely-unsupported case (standalone schema delete). 
| +| P1 | Stored-query exposure had two contracts: `server.md` documented a per-query `mcp:{expose:false}` knob; cluster docs said all queries are listed | Confirmed in code: cluster registry has no expose field (`QueryConfig`), boot bridge hardcodes `expose: true` (`omnigraph-server` settings), no GQ-level annotation. Removed the knob from `server.md`; documented "every applied query is listed; per-query exposure may become a Cedar-policy decision later". | +| P1 | The same stale "`mcp.expose == true` subset" contract lived in the **OpenAPI surface**: utoipa annotations (`handlers.rs:1029,1037`, `omnigraph-api-types/src/lib.rs:404`) drove `openapi.json` (Greptile catch on #293) | Updated the three Rust doc-comment/annotation strings to "every stored query" and regenerated `openapi.json` (`OMNIGRAPH_UPDATE_OPENAPI=1`); drift test green. Same-change per AGENTS.md rule 4. | +| P2 | `schema/index.md` claimed `allow_data_loss` honored "uniformly across transports" incl. HTTP `POST /schema/apply` | Scoped to the direct/embedded path; added that cluster-managed graphs evolve via `cluster apply` (soft drops only) and the HTTP route is 409-disabled for cluster serving. | +| P2 | `/load` missing from admission / body-limit / rate-limit / manifest-conflict prose (named `/ingest` only); constants called it "Ingest body limit" | Documented `/load` as canonical everywhere with `/ingest` as the deprecated alias; renamed the constant to "Load (bulk-write) body limit". | +| P2 | CLI "Bearer token resolution" section listed removed `omnigraph.yaml` keys (`graphs..bearer_token_env`, `auth.env_file`) | Replaced with a pointer to the keyed-credential model (`OMNIGRAPH_TOKEN_` → `~/.omnigraph/credentials` → `OMNIGRAPH_BEARER_TOKEN`); no plaintext-in-config path. 
| +| P2 | Flat route names in a cluster-only server (`POST /query`, `POST /mutate`, `GET /queries`, `POST /queries/{name}`) | Added a one-line note that the per-graph subsections use shorthand under `/graphs/{id}/…`; the endpoint table is already fully qualified. | +| — | `version` printed `omnigraph 0.3.x` | → `0.7.x`. | +| — | `search/indexes.md` used deprecated `ingest --mode merge` | → `load --mode merge`. | +| — | `config.md` `deferred` disposition described as "graph/schema change, later phase" | → "an unsupported change (e.g. standalone schema delete)". | +| — | Stale stage labels (`Stage 3A`, `Stage 2C`, `Stage 1`) in active reference docs | Removed / reworded to plain language; release notes keep history. | + +## Open — surfaced 2026-06-20, not yet fixed + +- **Stale "config-only apply" / "Stage 3A" comments in `omnigraph-cluster` + source** (internal rustdoc, not user docs — out of scope for the docs sweep + above): `src/types.rs:147` ("Applied changes execute (config-only query/policy + catalog writes)"), `src/types.rs:265` ("Output of config-only cluster apply"), + `src/diff.rs:256`, and `src/tests.rs:1129` ("config-only apply (Stage 3A)"). + Apply now also runs graph creates, schema applies, and approved deletes + (`diff.rs:411` `GraphCreate` / `SchemaApply`; the Stage-4 create/schema/delete + executors + tests `apply_creates_graph_and_unblocks_dependents`, + `apply_schema_update_and_dependent_query_in_one_run`, + `apply_blocks_graph_delete_without_approval`). Update these comments in a + cluster-crate change. +- **Cross-repo drift from this sweep** (separate repos — track here, fix in a + follow-up in each repo): + - `omnigraph-ts` SDK ships a stale generated `spec/openapi.json` + + `packages/sdk/src/generated/types.gen.ts` still describing the `GET /queries` + catalog as the `mcp.expose` subset. Regenerate from this repo's + `openapi.json` once the SDK's deferred refresh happens (the SDK is known to + lag the API by design). 
+ - `omnigraph-cookbooks/docs/best-practices.md` (~line 372) still describes + client-side auth as resolving through the removed `bearer_token_env` chain. + Update to the keyed-credential model (`OMNIGRAPH_TOKEN_` → + credentials file → `OMNIGRAPH_BEARER_TOKEN`). + +## Verification checklist (re-run on the next docs audit) + +```bash +rg -n "Stage [0-9]|graph/schema changes are deferred|reserved for later stages" docs/user crates/omnigraph-cli/src/cli.rs +rg -n "POST /query|POST /mutate|GET /queries|POST /queries/\{name\}|POST /schema/apply" docs/user +rg -n "ingest --mode|Ingest body limit|/ingest" docs/user +rg -n "0\.3\.x|bearer_token_env|auth\.env_file" docs/user +rg -n "expose: false|mcp\.expose" docs/user +``` + +Expected: active user docs have no matches for stale phrases, or the remaining +matches are explicitly marked as deprecated aliases, "no longer exist" notes, or +route shorthand disclaimed relative to `/graphs/{id}`. Release notes are allowed +to preserve historical behavior. diff --git a/docs/dev/index.md b/docs/dev/index.md index 9fe743f..23f0610 100644 --- a/docs/dev/index.md +++ b/docs/dev/index.md @@ -41,6 +41,7 @@ constraints. User-facing behavior should still be documented through | Error taxonomy and serialization | [errors.md](../user/operations/errors.md) | | Constants and tunables | [constants.md](../user/reference/constants.md) | | Transaction model public contract | [transactions.md](../user/branching/transactions.md) | +| User-doc coherence cleanup ledger | [docs-issues.md](docs-issues.md) | ## Project Operations diff --git a/docs/user/cli/reference.md b/docs/user/cli/reference.md index 1709226..3b97800 100644 --- a/docs/user/cli/reference.md +++ b/docs/user/cli/reference.md @@ -28,7 +28,7 @@ Top-level command families and subcommands. Graph-targeting commands accept a po | `policy validate \| test \| explain` | Cedar tooling against a cluster's applied policies (`--cluster `; `--graph ` picks a graph's bundle when several apply). 
`test` takes `--tests `; `explain` takes `--actor`/`--action`/`--branch`/`--target-branch` | | `queries list \| validate` | inspect a cluster's applied stored-query registry (`--cluster `; `--graph ` to scope one graph). `list` prints each query's kind (read/mutation), name, typed params, and `[mcp: …]` exposure; a query's `@description`/`@instruction` are shown as indented `description:` / `instruction:` lines when declared (omitted otherwise). `--json` emits `{name, mcp_expose, tool_name, mutation, params}` plus `description`/`instruction` **only when present** — matching the HTTP `GET /queries` catalog ([server.md](../operations/server.md)). `validate` type-checks the registry and exits non-zero on a broken query | | `profile list \| show []` | read-only inspection of `~/.omnigraph/config.yaml` profiles. `list` shows each profile's binding (server/cluster/store) + default graph and marks the `$OMNIGRAPH_PROFILE`-active one; JSON keeps `binding` and adds `scope_kind`, `target`, `valid`, and `error`; `show` resolves one profile's scope (endpoint + default graph), defaulting to the active profile, else the flat operator defaults | -| `version` / `-v` | print `omnigraph 0.3.x` | +| `version` / `-v` | print `omnigraph 0.7.x` | ## Command capabilities @@ -189,22 +189,26 @@ omnigraph cluster import --config company-brain --json omnigraph cluster force-unlock --config company-brain --json ``` -`--config` is a directory containing `cluster.yaml`; it defaults to `.`. -Stage 3A accepts graphs, schemas, stored queries, and policy bundle file +`--config` is a directory containing `cluster.yaml`; it defaults to `.`. The +config declares graphs, schemas, stored queries, and policy bundle file references. `cluster plan` reads local JSON state from `/__cluster/state.json`; a missing file means empty state. Plan, apply, refresh, and import acquire `__cluster/lock.json` by default and release -it before returning. 
`cluster apply` executes only stored-query/policy catalog -writes (content-addressed under `__cluster/resources/`) and requires an -existing `state.json`; graph/schema changes are deferred with warnings, and -applied resources do not serve traffic until an `omnigraph-server --cluster -` restart picks them up. `cluster status` reads state only and reports any existing -lock metadata. `force-unlock` removes a lock only when the supplied id exactly -matches the lock file. `refresh` requires an existing `state.json`; `import` -creates one only when it is missing. Both observe declared graphs read-only at -`/graphs/.omni`. External state backends, graph/schema -apply, automatic stale-lock breaking, `plan --refresh`, pipelines, UI specs, -embeddings, aliases, and bindings are reserved for later stages. See +it before returning. `cluster apply` converges the cluster to its config in one +ordered run: it creates declared graphs, applies schema updates (soft drops +only — see [schema](../schema/index.md)), writes stored-query/policy catalog +resources (content-addressed under `__cluster/resources/`), and executes +approved graph deletes; it requires an existing `state.json` (run `import` +first). Applied state does not serve traffic until an `omnigraph-server +--cluster ` restart picks up the new revision. Standalone schema deletes +remain unsupported and are reported as `deferred` with a warning. `cluster +status` reads state only and reports any existing lock metadata. `force-unlock` +removes a lock only when the supplied id exactly matches the lock file. +`refresh` requires an existing `state.json`; `import` creates one only when it +is missing. Both observe declared graphs read-only at +`/graphs/.omni`. External state backends, automatic +stale-lock breaking, `plan --refresh`, pipelines, UI specs, embeddings, +aliases, and bindings are not yet supported. See [cluster-config.md](../clusters/config.md). 
## Output formats (`query` command, alias: `read`) @@ -221,9 +225,12 @@ Precedence (high to low): explicit `--params` / `--params-file`, alias positiona ## Bearer token resolution (CLI) -1. `graphs..bearer_token_env` -2. `OMNIGRAPH_BEARER_TOKEN` global env -3. `auth.env_file` referenced `.env` +See **Credentials keyed by server name** above: a remote command resolves its +token via `OMNIGRAPH_TOKEN_` env → the `[]` section in +`~/.omnigraph/credentials` → the default `OMNIGRAPH_BEARER_TOKEN` env, and a +keyed token is only ever sent to the server it is keyed to. Plaintext tokens are +never stored in operator config; the removed `omnigraph.yaml` keys +(`graphs..bearer_token_env`, `auth.env_file`) no longer exist. ## Duration parsing (cleanup) diff --git a/docs/user/clusters/config.md b/docs/user/clusters/config.md index 04811ec..d9fdc1a 100644 --- a/docs/user/clusters/config.md +++ b/docs/user/clusters/config.md @@ -212,7 +212,7 @@ resource is planned as a create. If present, the file must use this shape: ``` `state_revision`, `resource_statuses`, `approval_records`, `recovery_records`, -and `observations` are optional so older Stage 1 state fixtures keep working. +and `observations` are optional so earlier state fixtures keep working. Missing `state_revision` is treated as `0`. Resource status values are `pending`, `planned`, `applying`, `applied`, `drifted`, `blocked`, or `error`. @@ -238,9 +238,10 @@ profile in the ledger; pre-profile ledgers are backfilled by an Update with catalog changes and count toward convergence. 
Each plan change carries a `disposition` field — an honest preview of what -`cluster apply` will do with it in this stage: `applied` (executes), `derived` -(a `graph.` composite-digest update that converges automatically once its -query digests land), `deferred` (graph/schema change, later phase), or +`cluster apply` will do with it: `applied` (executes — graph creates, schema +updates, catalog writes, approved deletes), `derived` (a `graph.` +composite-digest update that converges automatically once its query digests +land), `deferred` (an unsupported change, e.g. a standalone schema delete), or `blocked` (query/policy gated by an unapplied or missing dependency, with the condition in `reason`). @@ -496,5 +497,5 @@ matches the argument. A wrong id, missing lock, invalid lock JSON, or unsupporte lock version exits non-zero and leaves the file untouched. This is manual recovery for abandoned local locks. OmniGraph does not perform -PID-liveness checks, TTL expiry, stale-lock breaking, or automatic unlock in -Stage 2C. +PID-liveness checks, TTL expiry, stale-lock breaking, or automatic unlock +today. diff --git a/docs/user/operations/server.md b/docs/user/operations/server.md index 18032e9..3f6bcd0 100644 --- a/docs/user/operations/server.md +++ b/docs/user/operations/server.md @@ -40,7 +40,7 @@ storage root, with no local config directory. `--bind`, ### Stored-query validation at startup -If a graph declares a `queries:` registry (see [cli-reference](../cli/reference.md)), the server **loads and type-checks every stored query against that graph's live schema at startup**. Query parse/type failures quarantine that graph; if no graph remains healthy, startup refuses. Two MCP-exposed queries claiming the same tool name are likewise graph-local startup failures. Non-blocking advisories (e.g. an MCP-exposed query with a vector parameter an agent cannot supply) are logged. Validate offline before deploying with `omnigraph queries validate`. 
Discover the exposed queries as a typed tool catalog with `GET /queries`, and invoke one over HTTP with `POST /queries/{name}` (both below). +If a graph declares a `queries:` registry (see [cli-reference](../cli/reference.md)), the server **loads and type-checks every stored query against that graph's live schema at startup**. Query parse/type failures quarantine that graph; if no graph remains healthy, startup refuses. Two MCP-exposed queries claiming the same tool name are likewise graph-local startup failures. Non-blocking advisories (e.g. an MCP-exposed query with a vector parameter an agent cannot supply) are logged. Validate offline before deploying with `omnigraph queries validate`. Discover the stored queries as a typed tool catalog with `GET /queries`, and invoke one over HTTP with `POST /queries/{name}` (both below). ## Endpoint inventory @@ -57,7 +57,7 @@ graph id from the cluster's applied revision: | POST | `/graphs/{id}/export` | bearer + `export` | NDJSON stream | | POST | `/graphs/{id}/mutate` | bearer + `change` | mutation (canonical; `query`/`name`; accepts legacy `query_source`/`query_name` as serde aliases) | | POST | `/graphs/{id}/change` | bearer + `change` | **deprecated** alias of `/mutate` (carries `Deprecation: true` + `Link: ; rel="successor-version"`) | -| GET | `/graphs/{id}/queries` | bearer + `read` | list the `mcp.expose` stored queries as a typed tool catalog | +| GET | `/graphs/{id}/queries` | bearer + `read` | list the graph's stored queries as a typed tool catalog | | POST | `/graphs/{id}/queries/{name}` | bearer + `invoke_query` (+ `change` for a stored mutation) | invoke a named query from the `queries:` registry; deny == 404 | | GET | `/graphs/{id}/schema` | bearer + `read` | get current `.pg` source | | POST | `/graphs/{id}/schema/apply` | bearer + `schema_apply` (target=`main`) | disabled for cluster-backed serving; returns 409 and points operators at `omnigraph cluster apply` + restart | @@ -76,12 +76,17 @@ Server-level 
management endpoints: |---|---|---|---| | GET | `/graphs` | bearer + `graph_list` on `Server::"root"` | list ready/served graphs | +> The per-graph subsections below name routes in shorthand (`GET /queries`, +> `POST /query`, `POST /mutate`, `POST /queries/{name}`); every one is served +> under the `/graphs/{id}/…` prefix shown in the table — only `/graphs` and +> `/healthz` are flat. + ### Stored-query catalog (`GET /queries`) -List the graph's **`mcp.expose`** stored queries as a typed tool catalog — enough for a client (e.g. an MCP server) to register each as a tool without fetching `.gq` source. Each entry: `{ name, tool_name, description, instruction, mutation, params }`, where each param is `{ name, kind, item_kind?, vector_dim?, nullable }`. `kind` is one of `string | bool | int | bigint | float | date | datetime | blob | vector | list` (decomposed so a consumer maps it with a closed `switch`, never re-parsing GQ type spelling). `bigint` (I64/U64), `date`, `datetime`, and `blob` are carried as JSON **strings** — a 64-bit integer loses precision as a JSON number, dates are ISO strings, and a blob is a URI string. +List the graph's stored queries as a typed tool catalog — enough for a client (e.g. an MCP server) to register each as a tool without fetching `.gq` source. Each entry: `{ name, tool_name, description, instruction, mutation, params }`, where each param is `{ name, kind, item_kind?, vector_dim?, nullable }`. `kind` is one of `string | bool | int | bigint | float | date | datetime | blob | vector | list` (decomposed so a consumer maps it with a closed `switch`, never re-parsing GQ type spelling). `bigint` (I64/U64), `date`, `datetime`, and `blob` are carried as JSON **strings** — a 64-bit integer loses precision as a JSON number, dates are ISO strings, and a blob is a URI string. - **Read-gated** (works in default-deny mode). The catalog is **graph-wide** (branch-independent; `read` is authorized against `main`). 
-- **`mcp.expose` defaults to `true`** — declaring a query in `queries:` lists it; set `mcp: { expose: false }` to keep it HTTP/service-callable but hidden from the catalog. +- **Every stored query in the applied registry is listed.** Cluster-served graphs have no per-query expose flag today — every query in the cluster `queries:` registry appears in the catalog. (Per-query exposure may become a Cedar-policy decision in a later release; see [cluster-config](../clusters/config.md).) - **Not Cedar-filtered per query (yet).** A caller with `read` but not `invoke_query` can *list* a query they can't *invoke* (which would 404). Closing that gap is future per-query authorization; for now the catalog is a discovery surface and `invoke_query` remains the invocation gate. ### Stored-query invocation (`POST /queries/{name}`) @@ -163,8 +168,8 @@ Uniform `ErrorOutput { error, code?, merge_conflicts[], manifest_conflict? }` wi caller's pre-write view of one table's manifest version was stale. `ManifestConflictOutput { table_key, expected, actual }` tells the client which table to refresh and retry. This is the conflict shape produced by -concurrent `/mutate` (or its `/change` alias) or `/ingest` calls landing -the same `(table, branch)` race. +concurrent `/mutate` (or its `/change` alias) or `/load` (or its deprecated +`/ingest` alias) calls landing the same `(table, branch)` race. HTTP status codes used: 200, 400, 401, 403, 404, 409, 429, 500. @@ -191,7 +196,8 @@ Cedar policy authorization runs **before** admission accounting so denied requests don't consume admission slots. Today admission gates every mutating handler: `/mutate` (and its -deprecated alias `/change`), `/ingest`, `/branches/{create,delete,merge}`, +deprecated alias `/change`), `/load` (and its deprecated alias `/ingest`), +`/branches/{create,delete,merge}`, and `/schema/apply`. Read-only endpoints (`/snapshot`, `/query`, `/read`, `/export`, `/branches` GET, `/commits`, `/schema` GET) are not admission-gated.
@@ -199,7 +205,7 @@ admission-gated. ## Body limits - Default: 1 MB -- `/ingest`: 32 MB +- `/load` (and its deprecated `/ingest` alias): 32 MB ## Auth model (`bearer + SHA-256`) @@ -227,7 +233,7 @@ See [deployment.md](../deployment.md) for token-source operational details. - CORS — not configured; add `tower_http::cors` if needed. - Rate limiting — per-actor admission control gates `/mutate` (alias - `/change`), `/ingest`, `/branches/{create,delete,merge}`, + `/change`), `/load` (alias `/ingest`), `/branches/{create,delete,merge}`, `/schema/apply` (see "Per-actor admission control" above). No global rate limiter is configured; add `tower_http::limit` if a graph-wide cap is needed. diff --git a/docs/user/reference/constants.md b/docs/user/reference/constants.md index ec19f4d..3da9a2b 100644 --- a/docs/user/reference/constants.md +++ b/docs/user/reference/constants.md @@ -18,7 +18,7 @@ | Expand CSR-build cost factor | `CSR_BUILD_FACTOR = 1.5` | traversal | | Expand mode override | `OMNIGRAPH_TRAVERSAL_MODE` (`indexed`\|`csr`; unset = cost-based auto) | traversal | | Default body limit | `1 MB` | HTTP server | -| Ingest body limit | `32 MB` | HTTP server | +| Load (bulk-write) body limit | `32 MB` | HTTP server (`/load`; shared by the deprecated `/ingest` alias) | | Default embed provider/model | `openai-compatible` / `openai/text-embedding-3-large` | engine embedding | | OpenAI-direct embed model | `text-embedding-3-large` | engine embedding | | Gemini-direct embed model | `gemini-embedding-2` | engine embedding | diff --git a/docs/user/schema/index.md b/docs/user/schema/index.md index 105281c..df82a23 100644 --- a/docs/user/schema/index.md +++ b/docs/user/schema/index.md @@ -72,6 +72,8 @@ Applying a plan reports whether it was supported, the steps applied, and the res `DropProperty` and `DropType` steps default to `Soft` mode: the catalog tombstones the entry but the prior column / dataset remains time-travel-reachable via `snapshot_at_version(prev)` until 
`omnigraph cleanup` runs. Soft drops are reversible. -Pass `--allow-data-loss` (CLI) or `allow_data_loss: true` (HTTP `POST /schema/apply` body, SDK `SchemaApplyOptions`) to promote every drop in the plan to `Hard` mode. Hard drops run `cleanup_old_versions` on the affected dataset immediately after the manifest publish, making the prior column / dataset unreachable. **Irreversible.** +Pass `--allow-data-loss` (CLI `schema apply`) or `allow_data_loss: true` (SDK `SchemaApplyOptions`) to promote every drop in the plan to `Hard` mode. Hard drops run `cleanup_old_versions` on the affected dataset immediately after the manifest publish, making the prior column / dataset unreachable. **Irreversible.** -The flag is honored uniformly across transports — `omnigraph schema apply --allow-data-loss`, `POST /schema/apply { schema_source, allow_data_loss: true }`, and `apply_schema_with_options(.., SchemaApplyOptions { allow_data_loss: true })` produce identical plans and identical effects. +This is the **direct/embedded** schema-apply path — `omnigraph schema apply --store …` and the embedded SDK `apply_schema_with_options(.., SchemaApplyOptions { allow_data_loss: true })` produce identical plans and identical effects. + +**Cluster-managed graphs are different.** A graph served from a cluster evolves only through `omnigraph cluster apply`, which performs **soft drops only** (no `allow_data_loss` path), and the HTTP `POST /schema/apply` route is **disabled (returns 409) for cluster-backed serving** — see [server](../operations/server.md) and [cluster-config](../clusters/config.md). Direct `schema apply` against a cluster-managed storage path is likewise refused. diff --git a/docs/user/search/indexes.md b/docs/user/search/indexes.md index 57935cd..af8c128 100644 --- a/docs/user/search/indexes.md +++ b/docs/user/search/indexes.md @@ -22,7 +22,7 @@ list/`Blob` columns → none. 
> **Coverage and cost.** Each indexed column adds index files and build time, and > an index only covers the fragments it was built over. Rows appended after the -> index was built (e.g. by `ingest --mode merge`) are scanned unindexed until a +> index was built (e.g. by `load --mode merge`) are scanned unindexed until a > reindex extends coverage; see [maintenance](../operations/maintenance.md) → `optimize`. ## L2 — OmniGraph orchestration diff --git a/openapi.json b/openapi.json index 225a959..0308087 100644 --- a/openapi.json +++ b/openapi.json @@ -1006,7 +1006,7 @@ "queries" ], "summary": "List the graph's exposed stored queries as a typed tool catalog.", - "description": "Returns the `mcp.expose == true` subset of the `queries:` registry, each\nwith its MCP tool name, read/mutate flag, description/instruction, and\ntyped parameters — enough for a client to register them as tools without\nfetching `.gq` source. Read-gated; the catalog is graph-wide (branch\nindependent — `read` is authorized against `main`). **Not** Cedar-filtered\nper query yet, so it can list a query whose `invoke_query` the caller\nlacks (a known gap until per-query authorization lands).", + "description": "Returns every stored query in the `queries:` registry, each\nwith its MCP tool name, read/mutate flag, description/instruction, and\ntyped parameters — enough for a client to register them as tools without\nfetching `.gq` source. Cluster-served graphs have no per-query expose flag,\nso the catalog lists them all. Read-gated; the catalog is graph-wide (branch\nindependent — `read` is authorized against `main`). 
**Not** Cedar-filtered\nper query yet, so it can list a query whose `invoke_query` the caller\nlacks (a known gap until per-query authorization lands).", "operationId": "cluster_list_queries", "parameters": [ { @@ -1021,7 +1021,7 @@ ], "responses": { "200": { - "description": "Stored-query catalog (the mcp.expose subset, with typed params)", + "description": "Stored-query catalog (every stored query, with typed params)", "content": { "application/json": { "schema": { @@ -2248,7 +2248,7 @@ }, "QueriesCatalogOutput": { "type": "object", - "description": "Response for `GET /queries`: the `mcp.expose` subset of a graph's\nstored-query registry, each with typed parameters.", + "description": "Response for `GET /queries`: every stored query in a graph's\nregistry, each with typed parameters.", "required": [ "queries" ], From fff441196cebc63e61e7dc96f7c623324d198660 Mon Sep 17 00:00:00 2001 From: Andrew Altshuler Date: Sun, 21 Jun 2026 00:11:48 +0300 Subject: [PATCH 5/8] =?UTF-8?q?docs(dev):=20update=20coherence=20ledger=20?= =?UTF-8?q?=E2=80=94=20cookbooks=20drift=20resolved,=20omnigraph-ts=20mech?= =?UTF-8?q?anism=20(#294)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - omnigraph-cookbooks `bearer_token_env` chain: RESOLVED by cookbooks PR #26 (deleted docs/best-practices.md in the 0.7 restructure). - omnigraph-ts catalog `mcp.expose` description: documented why there is no hand-fix — the SDK syncs openapi.json from a *tagged* omnigraph release, and the fix landed on main after the v0.7.1 tag, so it flows in on the next SDK version bump (v0.7.2+) rather than an out-of-band patch. 
Claude-Session: https://claude.ai/code/session_01FQ1Hf4eXLsJmeLUkTYBEw7 Co-authored-by: Claude Opus 4.8 (1M context) --- docs/dev/docs-issues.md | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/docs/dev/docs-issues.md b/docs/dev/docs-issues.md index d045f4b..c0a4fdb 100644 --- a/docs/dev/docs-issues.md +++ b/docs/dev/docs-issues.md @@ -40,17 +40,19 @@ fixed. Branch `docs/user-coherence-0-7-1`. `apply_schema_update_and_dependent_query_in_one_run`, `apply_blocks_graph_delete_without_approval`). Update these comments in a cluster-crate change. -- **Cross-repo drift from this sweep** (separate repos — track here, fix in a - follow-up in each repo): - - `omnigraph-ts` SDK ships a stale generated `spec/openapi.json` + - `packages/sdk/src/generated/types.gen.ts` still describing the `GET /queries` - catalog as the `mcp.expose` subset. Regenerate from this repo's - `openapi.json` once the SDK's deferred refresh happens (the SDK is known to - lag the API by design). - - `omnigraph-cookbooks/docs/best-practices.md` (~line 372) still describes - client-side auth as resolving through the removed `bearer_token_env` chain. - Update to the keyed-credential model (`OMNIGRAPH_TOKEN_` → - credentials file → `OMNIGRAPH_BEARER_TOKEN`). +- **Cross-repo drift from this sweep** (separate repos): + - `omnigraph-ts` SDK — its generated `spec/openapi.json` + + `packages/sdk/src/generated/types.gen.ts` still describe the `GET /queries` + catalog as the `mcp.expose` subset. **No hand-fix:** the SDK's + `scripts/sync-spec.ts` pulls openapi.json from a *tagged* omnigraph release + (`/omnigraph/v{version}/openapi.json`), and the catalog fix landed on main + *after* the v0.7.1 tag — so it is in no tag yet and a hand-edit would be + overwritten on the next sync. It flows in automatically when the SDK bumps + to a tag containing the fix (v0.7.2+). Tracked, not actioned. 
+ - `omnigraph-cookbooks/docs/best-practices.md` `bearer_token_env` chain — + **RESOLVED** by omnigraph-cookbooks PR #26 (2026-06-21), which deleted + `docs/best-practices.md` as part of the 0.7 restructure; the stale chain + survives nowhere on `main`. ## Verification checklist (re-run on the next docs audit) From f2b792e0aebdd735850f7a36b544e32683dd18a1 Mon Sep 17 00:00:00 2001 From: Ragnor Comerford Date: Sun, 21 Jun 2026 16:38:20 +0200 Subject: [PATCH 6/8] (feat): compact the internal manifest/commit-graph tables in optimize (#291) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat(engine): compact the internal __manifest/_graph_commits tables in optimize `optimize` iterated node/edge catalog tables only, so the two internal system tables (`__manifest`, `_graph_commits`) accumulated one fragment per commit and were never compacted -- making every write's metadata scan O(fragments), which grows forever on a long-lived graph (RFC-013 step 2). `optimize_all_tables` now also compacts both internal tables via a new `compact_internal_table`. They are not catalog-tracked (readers open them at their latest Lance HEAD), so it is a much simpler path than `optimize_one_table`: compact in place, no manifest publish (nothing to publish to), no recovery sidecar (a single atomic Lance commit -- no HEAD-before-publish gap), and no optimize_indices (they carry no Lance index, only object_id's unenforced-PK metadata). No application lock: Lance's compact_files auto-retries its Rewrite against any concurrent writer (the canonical LanceDB pattern; Rewrite vs Append is compatible, vs Update a retryable same-fragment conflict Lance rebases), and a coordinator refresh afterwards makes the warm handle observe the compacted HEAD. 
Compacts both tables even though Phase 7 (iss-991) will later fold _graph_commits into __manifest -- a one-call throwaway for the full interim win; __manifest compaction is also the prerequisite for Phase 7's graph_head contention. Cleanup (version GC) of the internal tables is deliberately NOT included here: it needs the Q8 cleanup-resurrection watermark first (deferred). maintenance.rs: optimize now returns 6 stats (4 data + 2 internal); adds optimize_compacts_internal_tables (sheds fragments, leaks no recovery sidecar, graph coherent for reads + strict writes after). * test(engine): un-ignore the internal-table scan LOCK (step 2 acceptance) `internal_table_scans_are_flat_in_history` was the RED, #[ignore]'d acceptance gate staged in PR #288. With internal-table compaction landed, a write's __manifest/_graph_commits scan is flat in commit-history depth on a compacted graph (measured __manifest 4->2, _graph_commits 7->3 across depth 10->100, vs the pre-step-2 RED 34->214 / 29->207). The test now compacts at each depth before measuring and runs green every-PR. * docs: RFC-013 step 2 internal-table compaction landed - invariants.md: close the compaction half of the read-path-rederivation known gap (optimize now compacts the internal tables; cleanup half still deferred). - maintenance.md: optimize covers __manifest/_graph_commits (no publish, no sidecar); not yet in cleanup. - rfc-013 §9: split step 2 into 2a (compaction, landed) and 2b (cleanup + Q8 watermark, deferred — debated; MTT-overlap + hot-path liability). - testing.md: the internal-table LOCK is now green every-PR. * fix(engine): guard absent _graph_commits + always compact internal tables Addresses PR #291 review findings: - Greptile (P1): optimize unconditionally opened `_graph_commits` for compaction, but a graph can validly have none (the coordinator opens it as `Option`, gated on `storage.exists`, for graphs predating the commit graph). 
`Dataset::open` on the absent table errored and failed the whole optimize. Guard the `_graph_commits` compaction with the same `storage_adapter().exists()` check the coordinator uses; `__manifest` always exists so it stays unguarded. Regression test `optimize_tolerates_absent_graph_commits_table` (empty graph so no publish recreates the table before the guard). - Cursor (low): the `table_tasks.is_empty()` early return skipped internal-table compaction for a schema with no node/edge types. Removed it so the internal tables are compacted regardless of the data-table set. - Codex (auto-cleanup, P1): documented — `compact_files` commits with a default `CommitConfig` (no skip_auto_cleanup) and `CompactionOptions` exposes no override, so on a graph storing an *on* auto_cleanup config the commit would fire version GC. Both internal tables are created with `auto_cleanup: None`, so new graphs are safe; the only exposure is pre-fix upgraded graphs, identical to the existing data-table optimize path, with step 2b's watermark as the comprehensive guard. Added a comment in `compact_internal_table` recording this. * fix(engine): retry publish on RetryableCommitConflict (compaction vs publish) Step 2 compacts `__manifest` with no app-level lock (Lance OCC arbitrates, validated against LanceDB + the lance-7.0.0 conflict resolver). compact_files' `Operation::Rewrite` auto-retries 20x (CommitConfig default num_retries=20), so a live publish usually wins the race and the compaction rebases. But the publish runs its merge-insert with conflict_retries(0) = one rebase attempt; if the compaction commits first AND the merge touched a fragment the Rewrite rewrote, Lance preempts the publish with `Error::RetryableCommitConflict` — a DIFFERENT variant from the row-level `TooMuchWriteContention` the publisher already retries. Left unhandled, that surfaces a transient error to the caller, i.e. a maintenance compaction (physical op) failing a live write (logical op) — invariant 7. 
Map `LanceError::RetryableCommitConflict` to a new `ManifestConflictDetails::RetryableCommitConflict` and treat it as retryable in the publisher's outer loop (reload fresh state + re-merge), alongside RowLevelCasContention. `ExpectedVersionMismatch` still propagates (a genuine expectation break must not be blindly retried). This also hardens multi-process concurrent writers generally, not just compaction. Normal publishes are insert-only (new object_ids -> new fragments, disjoint from rewritten old ones), so the conflict is rare; the guard covers the same-fragment-update edge and multi-process writers. Unit tests in publisher.rs pin the mapping + the retry-predicate contract. * revert: publisher RetryableCommitConflict handling (it was the wrong side) Reverts d138902e. Validated against lance-7.0.0: the publisher's merge-insert runs with conflict_retries(0), and execute_with_retry converts an exhausted retryable commit conflict to TooMuchWriteContention before the caller sees it (write/retry.rs ~95-130). So map_lance_publish_error NEVER receives RetryableCommitConflict from merge_rows — it receives TooMuchWriteContention, which the publisher already maps to RowLevelCasContention and retries. The reverted mapping was therefore dead on the real path and its unit test was synthetic. The actual exposure is the *compaction* side: compact_files -> commit_compaction -> apply_commit directly (no execute_with_retry), so a Rewrite-vs-Merge check_txn conflict propagates raw and optimize can fail on a live graph. That is fixed app-side in compact_internal_table in the following commit. * fix(engine): make internal-table compaction correct by construction Address three findings from review of the step-2 internal-table compaction: - Non-destructive by construction: before compacting an internal table, strip any stored `lance.auto_cleanup.*` config off it. 
`compact_files` commits with a default `CommitConfig` (skip_auto_cleanup=false) and `CompactionOptions` exposes no override, so on a graph created by an older binary (on-by-default GC hook) the compaction commit would fire Lance's auto-cleanup and silently prune `__manifest`-pinned versions. Current binaries store no such config; the strip is the upgrade-path safety net so `optimize` can never GC versions. - App-level compaction retry: `compact_files` does NOT auto-retry a semantic conflict against a concurrent live writer (Rewrite vs Update/Merge/Delete propagates raw from apply_commit; Lance prescribes app-rerun). Wrap the internal-table compaction in a bounded retry loop that reopens fresh and replans on a retryable Lance conflict, so a maintenance compaction (a physical op) never fails a live write (a logical op) — invariant 7. - Compact all three internal tables, not two: `_graph_commit_actors` grows one fragment per commit on the authenticated write path, the same O(depth) scan as `__manifest`/`_graph_commits`. Drive the sweep from one source-of-truth list with per-table existence guards (the two commit-graph tables are optional). Make `graph_commit_actors_uri` pub(crate). Tests: the `internal_table_scans_are_flat_in_history` LOCK now runs the authenticated (actorful) write path so it covers `_graph_commit_actors` via the shared commit-graph IO wrapper (new `commit_many_as`/`measure_insert_as` helpers); `optimize_clears_stale_auto_cleanup_and_preserves_versions` pins the non-destructive guarantee (config cleared + no version GC); a unit test pins the retryable-conflict classifier; the empty-graph stats count is 7 (the actor table is created at init). 
* docs: internal-table compaction covers all 3 tables, non-destructive, retried Sync the RFC-013 step-2a section and the maintenance guide with the correctness-by-design refinements: - optimize compacts `__manifest`, `_graph_commits`, AND `_graph_commit_actors` (the actor table grows on the authenticated write path). - optimize is non-destructive by construction — it never GCs versions, and strips stale `lance.auto_cleanup.*` config so an upgraded graph's commit-time GC hook cannot fire during compaction. - internal-table compaction rebases and retries against concurrent live writers rather than failing the operator's optimize or the live write. - the cost LOCK is the authenticated-path acceptance test. * fix(engine): refresh coordinator after a config-strip with no compaction work `compact_internal_table` returns early when `plan_compaction` finds no work, but `clear_stale_auto_cleanup_config` may have already committed a config-strip that advanced Lance HEAD. The early return skipped the coordinator refresh that the successful-compaction path performs, leaving warm `__manifest`/commit-graph handles pinned to the pre-strip version until the next read's version probe healed them. No correctness bug (the probe self-heals, and a stale-handle write would retry via publisher CAS), but the refresh makes coherence deterministic rather than probe-dependent. Refresh iff the config-strip actually committed. * docs(engine): correct compact_internal_table doc — compact_files does not auto-retry The function doc claimed "Lance's compact_files auto-retries its Operation::Rewrite against any concurrent writer" — wrong, and contradicting the is_retryable_lance_conflict doc just below it and the explicit retry loop that exists precisely because compact_files does NOT auto-retry semantic conflicts (Rewrite vs Update/Merge/Delete propagates raw through apply_commit). 
Also move the orphaned description from above the retry-budget const onto the function, and include the third internal table. * test(engine): optimize must clear stale auto_cleanup on DATA tables too (red) Regression test for a destructive bug on the data-table optimize path: on an upgraded graph whose node/edge table still carries pre-v7 lance.auto_cleanup.* config, `optimize`'s compact_files/optimize_indices commits fire Lance's version GC and prune __manifest-pinned data-table versions. Mirrors the internal-table auto_cleanup test on a Person table (force-repair realigns the config-induced drift so optimize doesn't skip the table). Red against the current code: the data-table path does not strip the config. The fix lands in the next commit. * fix(engine): clear stale auto_cleanup on the data-table optimize path too The auto_cleanup scrub previously only protected the internal tables; the data-table path (optimize_one_table) ran compact_files/optimize_indices with a default CommitConfig (skip_auto_cleanup=false) and no override, so on an upgraded graph those commits could fire Lance's version-GC hook and prune __manifest-pinned node/edge versions — making the "non-destructive" contract false for data tables. Strip the config before the HEAD-advancing commits, capturing version_before first so the strip's own commit still triggers the Phase-C manifest publish (no uncovered drift). No retry loop needed: the data-table path holds the per-table write queue. Covered by the existing Optimize recovery sidecar. Turns the prior commit's test green. Also: switch clear_stale_auto_cleanup_config off the deprecated delete_config_keys to update_config(None values), and correct two now-inaccurate doc comments — compaction is "one or more content-preserving commits" (compact_files can emit a ReserveFragments before the Rewrite), not "a single atomic commit"; the sidecar-free property rests on content-preservation + read-at-HEAD, not single-commit atomicity. 
* docs: optimize is non-destructive on all tables; correct atomicity/retry claims - non-destructive guarantee now spans data + internal tables (the auto_cleanup strip runs on both paths), not just the internal ones. - "single atomic Lance commit" was inaccurate: compaction can emit a ReserveFragments commit before the Rewrite; the no-sidecar property rests on content-preservation + read-at-HEAD, not single-commit atomicity. - "retries rather than failing" softened to the truth: a *bounded* retry on the internal path; sustained contention surfaces a loud conflict error (bounded + observable, not an infinite loop). The data path holds the per-table queue and never contends. --- crates/omnigraph/src/db/commit_graph.rs | 2 +- crates/omnigraph/src/db/manifest.rs | 3 +- crates/omnigraph/src/db/manifest/layout.rs | 2 +- crates/omnigraph/src/db/omnigraph/optimize.rs | 248 +++++++++++++++++- crates/omnigraph/tests/helpers/cost.rs | 17 ++ crates/omnigraph/tests/helpers/mod.rs | 16 ++ crates/omnigraph/tests/maintenance.rs | 234 ++++++++++++++++- crates/omnigraph/tests/write_cost.rs | 36 ++- docs/dev/invariants.md | 13 +- docs/dev/rfc-013-write-path-latency.md | 71 +++-- docs/dev/testing.md | 2 +- docs/user/operations/maintenance.md | 2 + 12 files changed, 600 insertions(+), 46 deletions(-) diff --git a/crates/omnigraph/src/db/commit_graph.rs b/crates/omnigraph/src/db/commit_graph.rs index 572bdf5..181d1d8 100644 --- a/crates/omnigraph/src/db/commit_graph.rs +++ b/crates/omnigraph/src/db/commit_graph.rs @@ -396,7 +396,7 @@ pub(crate) fn graph_commits_uri(root_uri: &str) -> String { format!("{}/{}", root_uri.trim_end_matches('/'), GRAPH_COMMITS_DIR) } -fn graph_commit_actors_uri(root_uri: &str) -> String { +pub(crate) fn graph_commit_actors_uri(root_uri: &str) -> String { format!( "{}/{}", root_uri.trim_end_matches('/'), diff --git a/crates/omnigraph/src/db/manifest.rs b/crates/omnigraph/src/db/manifest.rs index 4c6410b..fa05b49 100644 --- a/crates/omnigraph/src/db/manifest.rs 
+++ b/crates/omnigraph/src/db/manifest.rs @@ -28,7 +28,8 @@ mod recovery; mod state; use graph::{init_manifest_graph, open_manifest_graph, snapshot_state_at}; -use layout::{manifest_uri, open_manifest_dataset, table_uri_for_path, type_name_hash}; +use layout::{open_manifest_dataset, table_uri_for_path, type_name_hash}; +pub(crate) use layout::manifest_uri; pub(crate) use metadata::TableVersionMetadata; #[cfg(test)] use metadata::{OMNIGRAPH_ROW_COUNT_KEY, table_version_metadata_for_state}; diff --git a/crates/omnigraph/src/db/manifest/layout.rs b/crates/omnigraph/src/db/manifest/layout.rs index f4ac09b..12894a7 100644 --- a/crates/omnigraph/src/db/manifest/layout.rs +++ b/crates/omnigraph/src/db/manifest/layout.rs @@ -15,7 +15,7 @@ pub(super) fn type_name_hash(name: &str) -> String { format!("{:016x}", h) } -pub(super) fn manifest_uri(root: &str) -> String { +pub(crate) fn manifest_uri(root: &str) -> String { format!("{}/{}", root.trim_end_matches('/'), MANIFEST_DIR) } diff --git a/crates/omnigraph/src/db/omnigraph/optimize.rs b/crates/omnigraph/src/db/omnigraph/optimize.rs index 9a0a17f..29bf2b6 100644 --- a/crates/omnigraph/src/db/omnigraph/optimize.rs +++ b/crates/omnigraph/src/db/omnigraph/optimize.rs @@ -248,10 +248,8 @@ pub async fn optimize_all_tables(db: &Omnigraph) -> Result> = futures::stream::iter(table_tasks.into_iter()) @@ -279,7 +277,42 @@ pub async fn optimize_all_tables(db: &Omnigraph) -> Resultmanifest drift). let version_before = ds.version().version; + + // Keep optimize non-destructive on upgraded graphs (same guarantee the + // internal-table path makes — see `clear_stale_auto_cleanup_config`). + // `compact_files` / `optimize_indices` commit with a default `CommitConfig` + // (`skip_auto_cleanup = false`) and expose no skip override, so on a graph + // created by a pre-v7 binary (auto_cleanup ON) those commits would fire + // Lance's version-GC hook and prune `__manifest`-pinned data-table versions. + // Strip the stale config first. 
We hold the per-table queue, so no concurrent + // writer can race this (no retry loop needed, unlike the internal-table path); + // any commit it makes is content-preserving and covered by the Optimize + // sidecar's loose `post_commit_pin` like the other Phase-B commits. + clear_stale_auto_cleanup_config(&mut ds) + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + let metrics: CompactionMetrics = if will_compact { compact_files(&mut ds, options, None) .await @@ -514,6 +565,173 @@ async fn optimize_one_table( Ok(stat) } +/// Bound on the app-level retry of an internal-table compaction against a +/// concurrent live writer (see [`is_retryable_lance_conflict`]). +const INTERNAL_COMPACTION_RETRY_BUDGET: u32 = 5; + +/// A Lance commit error that means "a concurrent writer preempted us; reload the +/// dataset and rerun." `compact_files` commits via `commit_compaction` -> +/// `apply_commit` *directly* — unlike the merge-insert path it is NOT wrapped in +/// `execute_with_retry`, so a `Rewrite`-vs-`Merge`/`Update`/`Delete` `check_txn` +/// conflict propagates raw instead of being rebased or converted to +/// `TooMuchWriteContention`. Lance's transaction spec prescribes that the +/// *application* reruns these, which is what `compact_internal_table` does — so a +/// maintenance compaction (a physical op) never fails a live write (a logical op), +/// invariant 7. (`TooMuchWriteContention` is included for the exhausted-retry form +/// some commit paths surface.) +fn is_retryable_lance_conflict(err: &lance::Error) -> bool { + matches!( + err, + lance::Error::RetryableCommitConflict { .. } + | lance::Error::CommitConflict { .. } + | lance::Error::TooMuchWriteContention { .. } + ) +} + +/// Remove any stored `lance.auto_cleanup.*` config from a table so compaction +/// stays **non-destructive by construction**. Used by both the internal-table +/// path ([`compact_internal_table`]) and the data-table path +/// ([`optimize_one_table`]). 
+/// +/// `compact_files` / `optimize_indices` commit with a default `CommitConfig` +/// (`skip_auto_cleanup = false`) and `CompactionOptions` exposes no override, so on +/// a dataset whose stored config has `lance.auto_cleanup.interval` set, the +/// compaction/reindex commit would fire Lance's auto-cleanup hook (version GC) — +/// deletion of old versions, including ones `__manifest` pins for snapshots / +/// time-travel (data tables) or that hold lineage/time-travel state (internal +/// tables). New graphs create tables with `auto_cleanup: None` (`manifest/graph.rs`, +/// `commit_graph.rs`, and the data-table create path) so there is nothing to clear; +/// only pre-`auto_cleanup`-fix *upgraded* graphs carry the config. OmniGraph owns +/// version cleanup explicitly (`cleanup`), so Lance's hook is unwanted regardless — +/// clearing it both makes `optimize` non-destructive and aligns the table with the +/// new-graph posture. The config-strip commit itself does not GC: the +/// resulting manifest no longer has the `interval` key, so the post-commit hook is a +/// no-op. Returns whether any config was cleared (it advances Lance HEAD iff so). +/// Recovery coverage differs by caller: the data-table path runs this inside the +/// Optimize sidecar window; the internal-table path needs none (it commits at HEAD +/// and is read at HEAD — the strip is a content-preserving config commit, so a crash +/// leaves the table readable and content-identical, see [`compact_internal_table`]). +async fn clear_stale_auto_cleanup_config( + ds: &mut lance::Dataset, +) -> std::result::Result<bool, lance::Error> { + let keys: Vec<String> = ds + .config() + .keys() + .filter(|k| k.starts_with("lance.auto_cleanup.")) + .cloned() + .collect(); + if keys.is_empty() { + return Ok(false); + } + // Merge-update with `None` values to delete the keys — the non-deprecated + // replacement for `delete_config_keys` (awaiting the builder merges rather + // than replacing the whole config map).
+ let entries: Vec<(&str, Option<&str>)> = keys.iter().map(|k| (k.as_str(), None)).collect(); + ds.update_config(entries).await?; + Ok(true) +} + +/// Compact one INTERNAL system table (`__manifest` / `_graph_commits` / +/// `_graph_commit_actors`) in place. +/// +/// Unlike catalog data tables, the internal tables are not tracked in the +/// `__manifest` (they ARE the manifest / the lineage DAG): readers open them at +/// their latest Lance HEAD, so compaction just advances that HEAD and the next +/// reader transparently observes the compacted version. That makes this path much +/// simpler than [`optimize_one_table`] — no manifest publish (nothing to publish +/// to), and no recovery sidecar. The sidecar-free claim does NOT rest on +/// single-commit atomicity: `compact_files` can emit a `ReserveFragments` commit +/// before the final `Rewrite` (and the config strip is a separate commit before +/// both), so this advances HEAD over one or more commits. It needs no sidecar +/// because every one of those commits is content-preserving and the table is read +/// at HEAD — a crash at any point leaves the table readable and content-identical, +/// and the next `optimize` re-plans. Internal tables carry no Lance index (only +/// `object_id`'s unenforced-PK schema metadata), so no `optimize_indices`. +/// +/// Concurrency: no application lock, but `compact_files` does NOT auto-retry a +/// semantic conflict — its `Operation::Rewrite` commits through `apply_commit` +/// directly (not the merge-insert `execute_with_retry` path), so a `Rewrite` +/// vs concurrent `Update`/`Merge`/`Delete` `check_txn` conflict propagates raw. +/// We own the retry here (see [`is_retryable_lance_conflict`]): on a retryable +/// conflict, reopen at the new HEAD and rerun. A follow-up coordinator `refresh` +/// makes the warm internal-table handles observe the compacted HEAD +/// deterministically (the version probe would also self-heal on the next read). 
+async fn compact_internal_table( + db: &Omnigraph, + table_key: &str, + uri: String, +) -> Result<TableOptimizeStats> { + // App-level retry against concurrent live writers. compact_files does NOT + // auto-retry a Rewrite-vs-live-write conflict (see is_retryable_lance_conflict), + // so optimize would otherwise fail spuriously on a live graph. On a retryable + // conflict we re-open at the new HEAD and rerun — the canonical Lance-consumer + // pattern. Each attempt opens fresh because the conflict means the version moved. + for attempt in 0..INTERNAL_COMPACTION_RETRY_BUDGET { + let handle = db + .storage() + .open_dataset_head_for_write(table_key, &uri, None) + .await?; + let mut ds = handle.into_dataset(); + + // Keep optimize non-destructive by construction (see clear_stale_auto_cleanup_config). + // Returns whether it committed a config-strip (which advances Lance HEAD). + let cleared_config = match clear_stale_auto_cleanup_config(&mut ds).await { + Ok(cleared) => cleared, + Err(e) => { + if attempt + 1 < INTERNAL_COMPACTION_RETRY_BUDGET && is_retryable_lance_conflict(&e) + { + continue; + } + return Err(OmniError::Lance(e.to_string())); + } + }; + + let options = CompactionOptions::default(); + let plan = plan_compaction(&ds, &options) + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + if plan.num_tasks() == 0 { + // No compaction work, but a config-strip still advanced HEAD — refresh + // the warm coordinator handles so they observe it deterministically + // (same cache-coherence step the successful-compaction path takes + // below; otherwise they stay pinned until the next version probe).
+ if cleared_config { + db.coordinator.write().await.refresh().await?; + } + return Ok(TableOptimizeStats::compacted( + table_key.to_string(), + &CompactionMetrics::default(), + false, + )); + } + + match compact_files(&mut ds, options, None).await { + Ok(metrics) => { + // Cache coherence: re-open the warm coordinator's internal-table + // handles at the compacted HEAD (they live in `db.coordinator`, not + // the data-table `runtime_cache`). + db.coordinator.write().await.refresh().await?; + return Ok(TableOptimizeStats::compacted( + table_key.to_string(), + &metrics, + true, + )); + } + Err(e) + if attempt + 1 < INTERNAL_COMPACTION_RETRY_BUDGET + && is_retryable_lance_conflict(&e) => + { + continue; + } + Err(e) => return Err(OmniError::Lance(e.to_string())), + } + } + Err(OmniError::manifest_conflict(format!( + "internal-table compaction of {table_key} exhausted {INTERNAL_COMPACTION_RETRY_BUDGET} \ + retries against concurrent writers" + ))) +} + /// Run Lance `cleanup_old_versions` on every node + edge table on `main`, /// using [`CleanupPolicyOptions`]. The latest manifest is always preserved /// regardless (Lance invariant). @@ -912,6 +1130,26 @@ mod tests { use crate::failpoints::ScopedFailPoint; use crate::loader::{LoadMode, load_jsonl}; + /// The internal-table compaction retry classifier: a concurrent live writer + /// preempting our `Rewrite` is retryable (Lance prescribes app-rerun, and + /// compact_files does not auto-retry it); a non-conflict error is not (must not + /// be masked by a blind retry). 
+ #[test] + fn retryable_lance_conflicts_are_classified() { + assert!(is_retryable_lance_conflict( + &lance::Error::retryable_commit_conflict_source( + 1, + Box::new(std::io::Error::other("preempted by concurrent write")), + ) + )); + assert!(is_retryable_lance_conflict( + &lance::Error::too_much_write_contention("contended") + )); + assert!(!is_retryable_lance_conflict(&lance::Error::invalid_input( + "not a conflict" + ))); + } + fn node_table_uri(root: &str, type_name: &str) -> String { let mut hash: u64 = 0xcbf2_9ce4_8422_2325; for &b in type_name.as_bytes() { diff --git a/crates/omnigraph/tests/helpers/cost.rs b/crates/omnigraph/tests/helpers/cost.rs index 4be9ee6..2114f23 100644 --- a/crates/omnigraph/tests/helpers/cost.rs +++ b/crates/omnigraph/tests/helpers/cost.rs @@ -334,6 +334,23 @@ pub async fn measure_insert(db: &mut Omnigraph, tag: &str) -> IoCounts { io } +/// Like [`measure_insert`] but carries an actor, so the write appends to and reads +/// `_graph_commit_actors.lance` — the authenticated (server/CLI) write path. The +/// commit-graph IO wrapper covers both `_graph_commits` and `_graph_commit_actors`, +/// so `IoCounts::commit_graph_reads` includes the actor-table scan on this path. +pub async fn measure_insert_as(db: &mut Omnigraph, tag: &str, actor: &str) -> IoCounts { + let (res, io) = measure(db.mutate_as( + "main", + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", tag)], &[("$age", 30)]), + Some(actor), + )) + .await; + res.unwrap(); + io +} + // ── Backend fixtures — one knob, store-agnostic body ── /// Local tempdir graph (default; deterministic, every-PR). 
diff --git a/crates/omnigraph/tests/helpers/mod.rs b/crates/omnigraph/tests/helpers/mod.rs index 131f91b..d89227f 100644 --- a/crates/omnigraph/tests/helpers/mod.rs +++ b/crates/omnigraph/tests/helpers/mod.rs @@ -182,6 +182,22 @@ pub async fn commit_many(db: &mut Omnigraph, n: usize) { } } +/// Like [`commit_many`] but every commit carries an actor, so it grows +/// `_graph_commit_actors.lance` too — the authenticated (server/CLI) write path. +pub async fn commit_many_as(db: &mut Omnigraph, n: usize, actor: &str) { + for i in 0..n { + db.mutate_as( + "main", + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", &format!("commit_many_as_{i}"))], &[("$age", 30)]), + Some(actor), + ) + .await + .unwrap(); + } +} + pub async fn snapshot_main(db: &Omnigraph) -> Result { db.snapshot_of(ReadTarget::branch("main")).await } diff --git a/crates/omnigraph/tests/maintenance.rs b/crates/omnigraph/tests/maintenance.rs index 78e31fa..ca9026d 100644 --- a/crates/omnigraph/tests/maintenance.rs +++ b/crates/omnigraph/tests/maintenance.rs @@ -94,13 +94,23 @@ async fn optimize_on_empty_graph_returns_stats_per_table_with_no_changes() { let stats = db.optimize().await.unwrap(); - // Schema declares 2 nodes + 2 edges = 4 tables. Compaction should run on - // each but find nothing to merge. - assert_eq!(stats.len(), 4); + // Schema declares 2 nodes + 2 edges = 4 data tables, plus the 3 internal + // system tables (`__manifest`, `_graph_commits`, `_graph_commit_actors`) optimize + // also compacts (RFC-013 step 2) = 7. Compaction should run on each but find + // nothing to merge. + assert_eq!(stats.len(), 7); for s in &stats { assert_eq!(s.fragments_removed, 0, "{} should not remove", s.table_key); assert_eq!(s.fragments_added, 0, "{} should not add", s.table_key); } + // The internal tables are present and reported as no-ops on an empty graph. 
+ for key in ["__manifest", "_graph_commits", "_graph_commit_actors"] { + let s = stats + .iter() + .find(|s| s.table_key == key) + .unwrap_or_else(|| panic!("optimize stats missing internal table {key}")); + assert!(!s.committed, "{key} should be a no-op on an empty graph"); + } } #[tokio::test] @@ -133,6 +143,224 @@ async fn optimize_after_load_then_again_is_idempotent() { } } +/// RFC-013 step 2: `optimize` compacts the internal system tables +/// (`__manifest`, `_graph_commits`), which accumulate one fragment per commit. +/// After compaction they shed fragments, write no recovery sidecar (every commit +/// is content-preserving and read at HEAD — no HEAD-before-publish gap), and the +/// graph stays coherent for subsequent reads + strict writes. +#[tokio::test] +async fn optimize_compacts_internal_tables() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + + // Build version-history depth so the internal tables accumulate fragments. + for i in 0..20 { + mutate_main( + &mut db, + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", &format!("p{i}"))], &[("$age", 30)]), + ) + .await + .unwrap(); + } + + let stats = db.optimize().await.unwrap(); + + for key in ["__manifest", "_graph_commits"] { + let s = stats + .iter() + .find(|s| s.table_key == key) + .unwrap_or_else(|| panic!("optimize stats missing internal table {key}")); + assert!(s.committed, "{key} should compact after 20 commits"); + assert!( + s.fragments_removed > 0, + "{key} should shed fragments, removed {}", + s.fragments_removed + ); + } + + // Internal compaction leaks no recovery sidecar. + let recovery_dir = dir.path().join("__recovery"); + if recovery_dir.exists() { + let leftover: Vec<_> = std::fs::read_dir(&recovery_dir) + .unwrap() + .filter_map(|e| e.ok()) + .map(|e| e.file_name()) + .collect(); + assert!( + leftover.is_empty(), + "optimize leaked recovery sidecars: {leftover:?}" + ); + } + + // Coherent after internal compaction: reads + a strict write still work.
+ assert!(count_rows(&db, "node:Person").await > 0); + mutate_main( + &mut db, + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "after_compact")], &[("$age", 40)]), + ) + .await + .unwrap(); +} + +/// `optimize` must not fail on a graph that has no `_graph_commits.lance` — a valid +/// state the coordinator opens as `commit_graph = None` (graphs predating the commit +/// graph). Without the existence guard, `Dataset::open` on the absent table errors +/// and fails the whole optimize. Regression for the missing-existence-guard. +/// +/// Uses an EMPTY graph deliberately: a graph with data would publish during +/// optimize, and a publish records a graph commit that recreates `_graph_commits` +/// before the guard runs — masking the bug. With no data, nothing recreates it, so +/// the table stays absent through the guard. +#[tokio::test] +async fn optimize_tolerates_absent_graph_commits_table() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + Omnigraph::init(uri, TEST_SCHEMA).await.unwrap(); + + // Simulate a graph with no commit-graph dataset. + std::fs::remove_dir_all(dir.path().join("_graph_commits.lance")).unwrap(); + + // Coordinator tolerates the absence; optimize must succeed (the guard skips the + // absent table rather than letting `Dataset::open` error) and omit its stat. + let db = Omnigraph::open(uri).await.unwrap(); + let stats = db.optimize().await.unwrap(); + assert!( + stats.iter().any(|s| s.table_key == "__manifest"), + "__manifest must still be compacted" + ); + assert!( + !stats.iter().any(|s| s.table_key == "_graph_commits"), + "absent _graph_commits must be skipped, not opened (would error)" + ); +} + +/// `optimize` must stay NON-DESTRUCTIVE on a pre-`auto_cleanup`-fix upgraded graph: +/// `compact_files` would otherwise fire the dataset's stored `lance.auto_cleanup.*` +/// hook (version GC) during the compaction commit. 
Internal-table compaction clears +/// that stale config first, so no versions are deleted. Without the clear, the +/// aggressive policy below GCs old versions and the count drops. +#[tokio::test] +async fn optimize_clears_stale_auto_cleanup_and_preserves_versions() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + for i in 0..5 { + mutate_main( + &mut db, + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", &format!("v{i}"))], &[("$age", 30)]), + ) + .await + .unwrap(); + } + let manifest_uri = format!("{}/__manifest", dir.path().to_str().unwrap()); + + // Simulate an upgraded graph: an aggressive stored auto_cleanup config that, if + // it fired during compaction, would GC old versions. + { + let mut ds = Dataset::open(&manifest_uri).await.unwrap(); + ds.update_config([ + ("lance.auto_cleanup.interval", Some("1")), + ("lance.auto_cleanup.older_than", Some("0s")), + ]) + .await + .unwrap(); + } + let versions_before = Dataset::open(&manifest_uri) + .await + .unwrap() + .versions() + .await + .unwrap() + .len(); + + db.optimize().await.unwrap(); + + let ds = Dataset::open(&manifest_uri).await.unwrap(); + // (a) the stale auto_cleanup config was cleared (non-destructive by construction). + assert!( + !ds.config().keys().any(|k| k.starts_with("lance.auto_cleanup.")), + "optimize must clear stale auto_cleanup config; config = {:?}", + ds.config() + ); + // (b) no version GC: every pre-optimize version survives (compaction + the + // config-clear each add versions, so the count only grows). + let versions_after = ds.versions().await.unwrap().len(); + assert!( + versions_after >= versions_before, + "optimize must not GC __manifest versions: before={versions_before} after={versions_after}" + ); +} + +/// The same non-destructive guarantee on a DATA (node/edge) table, not just the +/// internal tables. 
`optimize_one_table` runs `compact_files` / `optimize_indices` +/// with a default `CommitConfig` (`skip_auto_cleanup = false`); on an upgraded +/// graph whose Person table still carries the pre-v7 `lance.auto_cleanup.*` config, +/// those commits would fire Lance's version-GC hook and prune `__manifest`-pinned +/// data-table versions. The path must strip that config first. Without the strip, +/// the aggressive policy below GCs old versions and the config survives the run. +#[tokio::test] +async fn optimize_clears_stale_auto_cleanup_on_data_tables_too() { + let dir = tempfile::tempdir().unwrap(); + let root = dir.path().to_str().unwrap().trim_end_matches('/').to_string(); + let mut db = init_and_load(&dir).await; + add_person_fragments(&mut db).await; // multiple fragments → will_compact + + // Simulate an upgraded graph: set an aggressive stored auto_cleanup config on + // the Person table. This is an out-of-band Lance commit (an `UpdateConfig` that + // advances HEAD past the manifest), so realign the manifest with a forced repair + // first — otherwise optimize skips the table as uncovered drift and never + // reaches the scrub. (Forced because UpdateConfig is not verified maintenance.) + let (_, _, person_full) = person_manifest_and_head(&db, &root).await; + { + let mut ds = Dataset::open(&person_full).await.unwrap(); + ds.update_config([ + ("lance.auto_cleanup.interval", Some("1")), + ("lance.auto_cleanup.older_than", Some("0s")), + ]) + .await + .unwrap(); + } + db.repair(RepairOptions { + confirm: true, + force: true, + }) + .await + .unwrap(); + + let versions_before = Dataset::open(&person_full) + .await + .unwrap() + .versions() + .await + .unwrap() + .len(); + let rows_before = count_rows(&db, "node:Person").await; + + db.optimize().await.unwrap(); + + let ds = Dataset::open(&person_full).await.unwrap(); + // (a) the stale auto_cleanup config was cleared (non-destructive by construction). 
+ assert!( + !ds.config().keys().any(|k| k.starts_with("lance.auto_cleanup.")), + "optimize must clear stale auto_cleanup config on data tables; config = {:?}", + ds.config() + ); + // (b) no version GC: every pre-optimize version survives (compaction + the + // config-clear each add versions, so the count only grows). + let versions_after = ds.versions().await.unwrap().len(); + assert!( + versions_after >= versions_before, + "optimize must not GC Person versions: before={versions_before} after={versions_after}" + ); + // (c) data is intact — the run rewrote fragments, it did not drop rows. + assert_eq!(count_rows(&db, "node:Person").await, rows_before); +} + // PR3 (Workstream B): an existing scalar index does not cover fragments // appended after it was built (build_indices is existence-gated), so those // rows are scanned unindexed. `optimize` must fold them back in via Lance's diff --git a/crates/omnigraph/tests/write_cost.rs b/crates/omnigraph/tests/write_cost.rs index 5f753d7..c7e8528 100644 --- a/crates/omnigraph/tests/write_cost.rs +++ b/crates/omnigraph/tests/write_cost.rs @@ -24,19 +24,26 @@ mod helpers; use helpers::cost::{ - IoCounts, assert_flat, assert_grows, local_graph, measure_insert, measure_with_staged, + IoCounts, assert_flat, assert_grows, local_graph, measure_insert, measure_insert_as, + measure_with_staged, }; -use helpers::{MUTATION_QUERIES, commit_many, mixed_params}; +use helpers::{MUTATION_QUERIES, commit_many, commit_many_as, mixed_params}; -// ── (A) The internal-table LOCK — RED today, the acceptance test for step 2 ── +// ── (A) The internal-table LOCK — the acceptance test for step 2 (compaction) ── // -// `__manifest` / `_graph_commits` scans must be O(1) in commit-history depth. -// RED today (O(fragments), uncompacted). Un-ignore when step 2 (internal-table -// compaction) lands — it must go green flat. (The data-table term is the S3 -// gate's, `write_cost_s3.rs`; local-FS hides it.) 
+// `__manifest` / `_graph_commits` / `_graph_commit_actors` scans on a write must be +// O(1) in commit-history depth **on a compacted graph**. Without internal-table +// compaction these scans are O(fragments) and grow forever; step 2 brings all three +// internal tables into `db.optimize()`, so after compaction the per-write scan is +// flat. The test runs the **authenticated (actorful) write path** — every commit +// carries an actor, so it grows `_graph_commit_actors.lance` too (the production +// server/CLI path); the commit-graph IO wrapper covers both that and `_graph_commits`, +// so `commit_graph_reads` includes the actor-table scan. It compacts at each depth +// checkpoint before measuring — pinning the production invariant "a periodically- +// compacted graph's write cost does not grow with version history." #[tokio::test] -#[ignore = "RED until step 2 (internal-table compaction): __manifest/_graph_commits scans are O(fragments) today — RFC-013 §0/§2.2. Un-ignore there as the red→green acceptance test."] async fn internal_table_scans_are_flat_in_history() { + const ACTOR: &str = "act-cost-gate"; let dir = tempfile::tempdir().unwrap(); let mut db = local_graph(&dir).await; @@ -44,20 +51,25 @@ async fn internal_table_scans_are_flat_in_history() { let mut current = 0u64; for d in [10u64, 100] { if d > current { - commit_many(&mut db, (d - current) as usize).await; + commit_many_as(&mut db, (d - current) as usize, ACTOR).await; current = d; } - let io = measure_insert(&mut db, &format!("lock_{d}")).await; + // Step 2: compaction folds all three internal tables' O(depth) fragments back + // to a small constant, so the following write's scan of them is flat. 
+ db.optimize().await.unwrap(); + let io = measure_insert_as(&mut db, &format!("lock_{d}"), ACTOR).await; current += 1; // the measured write advanced depth by one eprintln!( - "depth~{d}: data={} __manifest={} _graph_commits={}", + "depth~{d}: data={} __manifest={} _graph_commits+actors={}", io.data_reads, io.manifest_reads, io.commit_graph_reads ); curve.push((d, io)); } assert_flat(&curve, |c| c.manifest_reads, 4, "__manifest scan"); - assert_flat(&curve, |c| c.commit_graph_reads, 4, "_graph_commits scan"); + // commit_graph_reads covers BOTH _graph_commits and _graph_commit_actors (shared + // wrapper), so this also gates the actor table on the authenticated path. + assert_flat(&curve, |c| c.commit_graph_reads, 4, "_graph_commits + _graph_commit_actors scan"); } // The data-table OPENER history-gate (opener flat across depth) lives in diff --git a/docs/dev/invariants.md b/docs/dev/invariants.md index eb6821a..3195bd0 100644 --- a/docs/dev/invariants.md +++ b/docs/dev/invariants.md @@ -285,11 +285,14 @@ them explicit. because Lance branch names can be deleted/recreated at the same version number; the manifest e_tag is carried into synthetic snapshot ids when available, and a detected same-branch manifest refresh clears read caches as the fallback for - e_tag-less table locations/topology. Remaining: the internal metadata tables - (`__manifest`, `_graph_commits`) are still not compacted, so the probe and - refresh cost still grows with fragment count on a long-lived graph (the - `optimize`-covers-internal-tables follow-up); the commit graph is not yet - reconcilable from the manifest; and the traversal id-map is still rebuilt. + e_tag-less table locations/topology. 
`optimize` now compacts the + internal metadata tables (`__manifest`, `_graph_commits`, `_graph_commit_actors`) too (RFC-013 step 2), + so a *periodically-optimized* graph keeps the probe/refresh/per-write scan flat + in history. Remaining: they are not yet brought into `cleanup` (version GC), so the + `_versions/` chain still grows until an explicit cleanup (the cleanup half is + deferred — it needs the Q8 cleanup-resurrection watermark first); the commit + graph is not yet reconcilable from the manifest; and the traversal id-map is + still rebuilt. - **Commit-graph parent under concurrency:** `record_graph_commit` now refreshes the commit-graph head from storage before appending, so a same-branch write after an external commit no longer forks the commit DAG by parenting off a diff --git a/docs/dev/rfc-013-write-path-latency.md b/docs/dev/rfc-013-write-path-latency.md index fa4abf3..37e6a8a 100644 --- a/docs/dev/rfc-013-write-path-latency.md +++ b/docs/dev/rfc-013-write-path-latency.md @@ -846,23 +846,60 @@ to flatten the curve. internal-table LOCK (step 2's red→green acceptance). *Still owed:* the prod `storage.ops` span metric (§5.3) and the bucket-gated `write_cost_s3.rs` opener LOCK (step 3a's red→green, S3-only per the §9-3a measurement note). -2. **Bound history — bring the INTERNAL tables into optimize/cleanup (a code - change, not just scheduling).** Today `optimize`/`cleanup` iterate **node/edge - keys only** (`optimize.rs:895-904`) — confirmed: the prototype's `cleanup --keep 3` - pruned "7 tables" = the node/edge data tables; `__manifest`/`_graph_commits` were - untouched **[M]**. So the residual +5/depth internal slope (§0b) is **not** fixed - by today's tooling — step 2 is a real `all_table_keys` change to add the internal - tables, then schedule compaction+cleanup (pass `--yes`; cleanup aborts on remote - otherwise). The pruning mechanism is proven on a data table (1035→63, 16× **[M]**); - the internal tables need the same inclusion.
**Proven [M]:** compacting the - internal tables collapsed their scans `__manifest` 285→32, `_graph_commits` - 177→11; with step 3 a depth-87 edge drops **~1720 → 198 ops** (§2.4). (Separately, - node/edge cleanup **caps** the dominant data-table term as an interim *before* - step 3 — after step 3 that term is flat regardless.) **HARD PREREQUISITE:** the - Q8 boundary watermark must land **with** this step — Lance's version CAS is - confirmed vulnerable to cleanup-resurrection (§12 Q8, a silent lost write on - R2/S3), so scheduling cleanup without the watermark trades a latency bug for a - correctness bug. (`gap-read-path-rederivation` write twin.) +2. **Bound history — bring the INTERNAL tables into optimize/cleanup.** Split into + a compaction half (the latency win, safe) and a cleanup half (version GC, needs + the Q8 watermark). Validated (Lance docs + source): compaction *preserves* + versions and is the only term needed to flatten the per-write metadata scan; + cleanup is the separate version-deleting op that opens the Q8 hole. + - **2a. Internal-table compaction. ✅ LANDED.** `optimize` now compacts all + three internal tables — `__manifest`, `_graph_commits`, **and + `_graph_commit_actors`** (the actor table grows one fragment per commit on the + authenticated write path, so it carries the same O(depth) scan as the other + two and is compacted from one source-of-truth list with per-table existence + guards). `compact_internal_table` is a separate simpler path than + `optimize_one_table`: no manifest publish, no recovery sidecar. The sidecar-free + property does **not** rest on single-commit atomicity (`compact_files` can emit a + `ReserveFragments` commit before the `Rewrite`, and the auto-cleanup strip is a + further commit) — it holds because each of those commits is content-preserving + and the table is read at HEAD, so a crash leaves it readable and content-identical + and the next `optimize` re-plans. 
**Non-destructive by construction:** compaction + preserves versions, and before compacting it strips any stale `lance.auto_cleanup.*` + config off the table, so a graph created by an older binary (on-by-default GC + hook) cannot have the commit-time hook silently prune `__manifest`-pinned + versions during an `optimize` (current binaries store no such config; the + strip is the upgrade-path safety net). **The same strip now also runs on the + data-table path** (`optimize_one_table`), inside the Optimize sidecar window — + so `optimize` is non-destructive on node/edge tables too, not just the internal + ones (the data-table path was a pre-existing gap, since `compact_files`/ + `optimize_indices` there also commit with the auto-cleanup hook enabled). **Concurrency:** + no app lock on the internal path — and `compact_files` does *not* auto-retry a + semantic conflict against a concurrent live writer (Lance prescribes app-rerun for + `Rewrite` vs `Update`/`Merge`), so `compact_internal_table` runs a *bounded* + retry loop that reopens fresh and replans on a retryable Lance conflict (the + canonical Lance-consumer pattern); transient contention does not fail the live + publisher or the operator's `optimize`, but sustained contention past the budget + surfaces a loud conflict error (bounded + observable, not an infinite loop). The + data-table path instead holds the per-table write queue, so it never contends. A + coordinator `refresh` after the compaction restores cache coherence. The + `internal_table_scans_are_flat_in_history` LOCK is now green on the + **authenticated** write path: on a compacted graph a write's + `__manifest`/`_graph_commits`+`_graph_commit_actors` scan is flat in history + (measured `__manifest` 4→2, commit-graph+actors 10→2 across depth 10→100). + Compacts all three tables even though Phase 7 (`iss-991`) will later fold + `_graph_commits` into `__manifest` (one-call throwaway; full interim win until + then). 
**2a is also the hard prerequisite for Phase 7** (its `graph_head` CAS + contention is only acceptable once `__manifest` compaction bounds the + publisher's `load_publish_state` scan). + - **2b. Internal-table cleanup + Q8 watermark — DEFERRED** (debated; not bundled + with 2a). Cleanup is the version-deleting op that hits cleanup-resurrection + (§12 Q8: Lance's version CAS has no monotonic guard), so it must land **with** + a durable monotonic watermark (a Lance boundary tag — durable across cleanup, + `cleanup.rs` `is_tagged`). Deferred because it touches the read/open path + (a tag-floor clamp on every coordinator open), is the MTT-redundant part (MTT + may replace `__manifest`), and only buys the secondary version-count/space term + — whereas 2a delivers the dominant per-write scan win with zero resurrection + risk. Land it when the version-count cost bites or the Lance MTT timeline + clarifies. (`gap-read-path-rederivation` write twin.) 3. **The opener fix — a shippable lead + the structural follow-on.** - **3a. Opener bypass (standalone PR, THE dominant fix — [M] proven). 
✅ LANDED.** `TableStore::open_dataset_head_for_write` now delegates to the direct diff --git a/docs/dev/testing.md b/docs/dev/testing.md index 941cec6..ac5d4f0 100644 --- a/docs/dev/testing.md +++ b/docs/dev/testing.md @@ -26,7 +26,7 @@ The engine's `tests/` is the principal coverage surface; most graph-shaped behav | `forbidden_apis.rs` | Defense-in-depth source-walk guard: engine code (`exec/`, `db/omnigraph/`, `loader/`, `changes/`) must not reach around the sealed storage trait to Lance inline-commit APIs, nor open datasets directly (`Dataset::open` / `DatasetBuilder::from_uri`/`from_namespace`) — reads route through `Snapshot::open` and the held-handle cache; `// forbidden-api-allow: ` sentinel exempts reviewed lines | | `lance_surface_guards.rs` | Pins the Lance API surfaces omnigraph depends on (named runtime + compile-only guards; see [lance.md](lance.md)) — the first smoke check on any Lance version bump; e.g. `compact_files_still_fails_on_blob_columns` turns red when the upstream blob-compaction fix lands | | `warm_read_cost.rs` | Cost-budget tests for the warm read path (query-latency work), measured at the object-store boundary with Lance `IOTracker` (the LanceDB IO-counted pattern): a warm same-branch read does 0 manifest opens, 0 commit-graph opens, 1 version probe, validates the schema once (Fix 1 / finding A / Fix 2 at commit-history depth); stale same-branch reads perform exactly 2 probes and refresh manifest-only; recreated non-main branches with the same Lance version refresh by incarnation; recreated branch-owned table handles are distinguished by table e_tag or refresh-time cache clearing; recreated traversal topology is protected by synthetic snapshot-id incarnation or refresh-time cache clearing; a warm *repeat* read does 0 table opens via the held-handle cache and a write re-opens only the changed table at its new version/e_tag (Fix 3/6A). 
See "Cost-budget tests" below | -| `write_cost.rs` | Cost-budget tests for the WRITE path (RFC-013), the latency twin of `warm_read_cost.rs` on the **shared `helpers::cost` harness** (`measure`/`IoCounts`/`assert_flat`/`local_graph`). Runs on **local FS**; gates the **internal-table** term (`__manifest`/`_graph_commits` scans flat in commit-history depth — the RED `internal_table_scans_are_flat_in_history` LOCK, `#[ignore]`'d until internal-table compaction lands) plus green every-PR guards (single-insert `data_writes` bounded, a per-write read-op ceiling that fails the moment a round-trip is added, and a `measure_with_staged` fitness assert that a keyed insert routes through `stage_merge_insert` once with no `stage_append`/vector-index build). The **data-table opener** term is S3-only — see `write_cost_s3.rs` and the backend-split note in "Cost-budget tests" below | +| `write_cost.rs` | Cost-budget tests for the WRITE path (RFC-013), the latency twin of `warm_read_cost.rs` on the **shared `helpers::cost` harness** (`measure`/`IoCounts`/`assert_flat`/`local_graph`). Runs on **local FS**; gates the **internal-table** term (`__manifest`/`_graph_commits` scans flat in commit-history depth — `internal_table_scans_are_flat_in_history`, now **green every-PR** since RFC-013 step 2 brought the internal tables into `optimize`; the test compacts at each depth before measuring) plus green every-PR guards (single-insert `data_writes` bounded, a per-write read-op ceiling that fails the moment a round-trip is added, and a `measure_with_staged` fitness assert that a keyed insert routes through `stage_merge_insert` once with no `stage_append`/vector-index build). 
The **data-table opener** term is S3-only — see `write_cost_s3.rs` and the backend-split note in "Cost-budget tests" below | | `helpers/cost.rs` | The shared cost-budget harness (not a test): `IoCounts`/`StagedCounts` (counts by table class), `measure`/`measure_with_staged` (the one place the `with_query_io_probes` + `MergeWriteProbes` task-local + `IOTracker` wiring lives), `assert_flat(curve, select, slack, what)`, and store-agnostic `local_graph`/`s3_graph` fixtures. `warm_read_cost.rs`, `write_cost.rs`, and `write_cost_s3.rs` all consume it so a cost test body is written once and reads in one vocabulary | | `lifecycle.rs` | Graph lifecycle, schema state | | `point_in_time.rs` | Snapshots, time travel (`snapshot_at_version`, `entity_at`) | diff --git a/docs/user/operations/maintenance.md b/docs/user/operations/maintenance.md index e2a88eb..d8df950 100644 --- a/docs/user/operations/maintenance.md +++ b/docs/user/operations/maintenance.md @@ -6,6 +6,8 @@ - Compacts every node + edge table on `main`, then reindexes them, then **publishes the resulting version to the `__manifest`** so the manifest's recorded version tracks the compacted-and-reindexed state. Reads pin the manifest version, so without this publish the work would be invisible to readers *and* would break the version precondition of the next schema apply / strict update/delete ("stale view … refresh and retry"). The publish advances the graph version (a system-attributed commit) only for tables that actually changed. - Rewrites small fragments into fewer large ones; old fragments remain reachable via older versions until `cleanup` runs. +- **Also compacts the internal system tables** `__manifest`, `_graph_commits`, and `_graph_commit_actors` (RFC-013 step 2), which accumulate one fragment per commit (the actor table only on the authenticated write path, where every commit carries an actor) and otherwise make every write's metadata scan grow with history. 
These take a simpler path than data tables: they are not `__manifest`-tracked (readers open them at their latest version), so compaction just advances their version in place — **no manifest publish and no recovery sidecar**. (The sidecar-free property is not because it is one commit — `compact_files` can emit a `ReserveFragments` commit before the `Rewrite`, and the auto-cleanup strip below is a further commit — but because every one of those commits is content-preserving and the table is read at its latest version, so a crash at any point leaves it readable and content-identical and the next `optimize` re-plans.) They appear in the returned stats under `table_key` `"__manifest"` / `"_graph_commits"` / `"_graph_commit_actors"` (the latter two only when present). They are **not yet covered by `cleanup`**, so their version chain still grows until the cleanup half lands (it requires a cleanup-resurrection safeguard first); run `optimize` on a cadence to keep per-write metadata scans flat. +- **`optimize` is non-destructive by construction — it never garbage-collects versions, on any table (data or internal).** Compaction rewrites fragments and advances the version; old versions stay reachable until you run `cleanup`. This holds even for a graph created by an older binary that stored an on-by-default Lance `auto_cleanup` hook: `compact_files` / `optimize_indices` commit with the hook enabled and expose no skip override, so before compacting **any** table `optimize` strips its stale `lance.auto_cleanup.*` config first, so Lance's commit-time GC hook cannot fire and silently prune `__manifest`-pinned versions. (Graphs created by current binaries store no such config; the strip is the upgrade-path safety net.) 
The internal-table path additionally tolerates a concurrent live writer: it runs a **bounded** rebase-and-retry, so transient contention does not fail the operator's `optimize` or the live write — but sustained contention past the retry budget surfaces a loud conflict error rather than looping forever (bounded and observable, not a silent give-up). The data-table path holds the per-table write queue while it compacts, so it does not contend with mutations on that table in the first place. - **Reindex (index coverage maintenance).** A scalar/FTS/vector index only covers the fragments it was built over. Rows appended after the index was built (e.g. by `load --mode merge`, whose commit does not rebuild an already-existing index) are scanned unindexed, and compaction itself rewrites fragments out of an index's coverage. `optimize` runs Lance's incremental `optimize_indices` after compaction to fold those fragments back in (a delta merge, not a full retrain), restoring full coverage so equality/range/traversal predicates stay index-accelerated. This is why a table with **no compaction work but stale index coverage still commits** a new version under `optimize`. Run `optimize` on a cadence at least as frequent as your freshness window so recently-loaded rows do not linger in the unindexed flat-scan tail. - **Create declared-but-missing indexes (the index reconciler).** `@index`/`@key` declares intent; `schema apply` records it but builds nothing, and `load`/`mutate` defer a column that cannot be built yet (a `Vector` column with no trainable vectors). `optimize` materializes any such declared-but-unbuilt index over the compacted layout — so it is the convergence path for an `@index` added after data exists, or a vector index whose embeddings arrived via a later `embed`. A column still not buildable (no vectors yet) is reported on the table's stat as `pending_indexes` (visible in `--json`), not treated as a failure; the next `optimize` retries. 
So `optimize` is the single operator-facing index reconciler: it compacts, restores coverage, **and** builds declared-but-missing indexes. - Each table's compact→reindex→publish serializes with concurrent mutations on the same table. A crash mid-operation is recovered automatically on the next open (both compaction and reindex are content-preserving, so roll-forward is always safe). From 5cfae9acc1bf0d63b37e67d17aa53fe0ddef3ed5 Mon Sep 17 00:00:00 2001 From: Ragnor Comerford Date: Sun, 21 Jun 2026 21:54:59 +0200 Subject: [PATCH 7/8] =?UTF-8?q?docs(rfc-013):=20latency=20=3D=20(serial=5F?= =?UTF-8?q?hops=20+=20ops/concurrency)=C2=B7RTT=20=E2=80=94=20concurrency-?= =?UTF-8?q?cap=20correction=20+=20Lance-metadata=20comparison=20(#292)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat(engine): compact the internal __manifest/_graph_commits tables in optimize `optimize` iterated node/edge catalog tables only, so the two internal system tables (`__manifest`, `_graph_commits`) accumulated one fragment per commit and were never compacted -- making every write's metadata scan O(fragments), which grows forever on a long-lived graph (RFC-013 step 2). `optimize_all_tables` now also compacts both internal tables via a new `compact_internal_table`. They are not catalog-tracked (readers open them at their latest Lance HEAD), so it is a much simpler path than `optimize_one_table`: compact in place, no manifest publish (nothing to publish to), no recovery sidecar (a single atomic Lance commit -- no HEAD-before-publish gap), and no optimize_indices (they carry no Lance index, only object_id's unenforced-PK metadata). No application lock: Lance's compact_files auto-retries its Rewrite against any concurrent writer (the canonical LanceDB pattern; Rewrite vs Append is compatible, vs Update a retryable same-fragment conflict Lance rebases), and a coordinator refresh afterwards makes the warm handle observe the compacted HEAD. 
Compacts both tables even though Phase 7 (iss-991) will later fold _graph_commits into __manifest -- a one-call throwaway for the full interim win; __manifest compaction is also the prerequisite for Phase 7's graph_head contention. Cleanup (version GC) of the internal tables is deliberately NOT included here: it needs the Q8 cleanup-resurrection watermark first (deferred). maintenance.rs: optimize now returns 6 stats (4 data + 2 internal); adds optimize_compacts_internal_tables (sheds fragments, leaks no recovery sidecar, graph coherent for reads + strict writes after). * test(engine): un-ignore the internal-table scan LOCK (step 2 acceptance) `internal_table_scans_are_flat_in_history` was the RED, #[ignore]'d acceptance gate staged in PR #288. With internal-table compaction landed, a write's __manifest/_graph_commits scan is flat in commit-history depth on a compacted graph (measured __manifest 4->2, _graph_commits 7->3 across depth 10->100, vs the pre-step-2 RED 34->214 / 29->207). The test now compacts at each depth before measuring and runs green every-PR. * docs: RFC-013 step 2 internal-table compaction landed - invariants.md: close the compaction half of the read-path-rederivation known gap (optimize now compacts the internal tables; cleanup half still deferred). - maintenance.md: optimize covers __manifest/_graph_commits (no publish, no sidecar); not yet in cleanup. - rfc-013 §9: split step 2 into 2a (compaction, landed) and 2b (cleanup + Q8 watermark, deferred — debated; MTT-overlap + hot-path liability). - testing.md: the internal-table LOCK is now green every-PR. * fix(engine): guard absent _graph_commits + always compact internal tables Addresses PR #291 review findings: - Greptile (P1): optimize unconditionally opened `_graph_commits` for compaction, but a graph can validly have none (the coordinator opens it as `Option`, gated on `storage.exists`, for graphs predating the commit graph). 
`Dataset::open` on the absent table errored and failed the whole optimize. Guard the `_graph_commits` compaction with the same `storage_adapter().exists()` check the coordinator uses; `__manifest` always exists so it stays unguarded. Regression test `optimize_tolerates_absent_graph_commits_table` (empty graph so no publish recreates the table before the guard). - Cursor (low): the `table_tasks.is_empty()` early return skipped internal-table compaction for a schema with no node/edge types. Removed it so the internal tables are compacted regardless of the data-table set. - Codex (auto-cleanup, P1): documented — `compact_files` commits with a default `CommitConfig` (no skip_auto_cleanup) and `CompactionOptions` exposes no override, so on a graph storing an *on* auto_cleanup config the commit would fire version GC. Both internal tables are created with `auto_cleanup: None`, so new graphs are safe; the only exposure is pre-fix upgraded graphs, identical to the existing data-table optimize path, with step 2b's watermark as the comprehensive guard. Added a comment in `compact_internal_table` recording this. * docs(rfc-013): serial-hop correction — wall-clock is the ~110-hop backbone, not op count Latency-slope measurement on the deployed edge binary (f6d2cc03, steps 1+3a landed; rustfs + per-op latency proxy, depth 1..85) shows wall-clock is set by a ~110-hop SERIAL backbone that is depth-invariant. Total ops grow +~7/depth but PARALLELIZE (parallelism 1->6), so the depth term adds little wall-clock. - New §0(c): the serial-hop vs total-op finding + branch-op backbones (create ~77, delete ~87, branch-write ~258/1777-ops/21s floor = fork-on-first-write). - §2.4: correct the '1720->198 ops => 258s->30s' op-count->wall-clock conversion. - §5.1: promote serial-hop/num_stages to the PRIMARY latency LOCK; op-count flatness demoted to a cost/compute-floor gate. 
- §9 step 2: reprioritized as Phase-7 prerequisite + compute-floor/space, NOT the wall-clock fix; step 3b (parallel capture-once WriteTxn) is the headline latency lever; branch-write moved under step 3b + fork seam. - Summary: serial-backbone correction up front. Vindicates the §3/§4.1 design; corrects the op-count latency framing. * docs(rfc-013): concurrency-cap correction + Lance-metadata comparison Fold in two measured findings from the deployed edge binary (f6d2cc03) on rustfs behind a latency+concurrency proxy: - §0(d): concurrency-cap A/B. Under unlimited concurrency the internal-table scan parallelizes (backbone ~110); under an R2-realistic cap (8) it serializes and an UNCOMPACTED graph runs away (per-write ops 1273->3505, wall 6->16s), while #291's internal compaction cuts it ~6x and bounds it (137->1 frag). The latency model is (serial_hops + ops/effective_concurrency)*RTT + compute. - Reframe step 2 across Summary/§2.4/§9: NOT de-ranked — on R2 (capped) it is a primary latency lever + the anti-runaway fix + Phase-7 prereq. The earlier 'step 2 is parallel, irrelevant to latency' was an unlimited-concurrency artifact. Deployed f6d2cc03 optimize is node/edge-only; #291 (undeployed) is the prod win. - §5.1: the cost-gate ThrottledStore must cap concurrency AND inject latency; assert serial_hops flat AND ops flat in history. - §2.3 + §8: Lance/LanceDB comparison from 7.0.0 source — Lance metadata is a single-file per-version manifest read O(1) (latest_version_hint), pruned by default; omnigraph's __manifest-as-Lance-dataset scan is self-inflicted by the cross-table-atomicity choice. Adds explicit defense of Lance-dataset __manifest (MTT seam) vs a flat-file CAS'd manifest (cheaper, off the MTT path). Design (§3/§4.1) unchanged and vindicated; corrections are measurement framing, step sizing, and one design-choice that was implicit. 
--- docs/dev/rfc-013-write-path-latency.md | 195 +++++++++++++++++++++++-- 1 file changed, 184 insertions(+), 11 deletions(-) diff --git a/docs/dev/rfc-013-write-path-latency.md b/docs/dev/rfc-013-write-path-latency.md index 37e6a8a..d955a9d 100644 --- a/docs/dev/rfc-013-write-path-latency.md +++ b/docs/dev/rfc-013-write-path-latency.md @@ -46,6 +46,24 @@ main/branch/node paths (§2.4). It is shippable as a standalone PR first (§9 st 3a); the rest of the RFC is the constant-factor + correctness + internal-residual work layered on the same seam. +**Correction (2026-06-20/21) — the latency metric is `(serial_hops + ops / +effective_concurrency) · RTT + compute`, measured [M].** Two findings, both from the +deployed edge binary (steps 1+3a landed) on rustfs behind a latency+concurrency proxy: +**(i)** under *unlimited* concurrency, wall-clock is a **~110-hop serial backbone, +depth-invariant** — the depth-driven ops parallelize away (§0(c)); but **(ii)** under +an **R2-realistic concurrency cap (8)**, the internal-table fragment scan can no longer +fan out, so **op count re-enters wall-clock** and an uncompacted graph *runs away* +(per-write ops 1273→3505, wall 6→16s and climbing) — while #291's internal-table +compaction cuts it ~6× and bounds it (§0(d) A/B). So the design is **vindicated and +unchanged** (§3/§4.1: capture-once `WriteTxn` + parallel stages → "~2–3 hops" is the +**serial-backbone** lever, step 3b; bounded history is the **op-count** lever, step 2a) +— what's corrected is the *measurement framing and step sizing*: op count was the wrong +latency proxy **only because the harness had unlimited concurrency**; on a capped store +both `serial_hops` (→ step 3b) and `ops` (→ step 2a) are on the critical path, and +which dominates is set by `effective_concurrency × fragment_count`. The cost gate +(§5.1) is corrected to inject a **concurrency cap *and* latency**, and to assert serial +hops *and* op-count-flat-in-history. + --- ## 0. 
Validation ledger (read this first) @@ -139,6 +157,72 @@ one unpinned item — see §12. Reads, by contrast, are flat in depth (`warm_read_cost.rs`, PR #268). This is the O(history)-per-write → O(N²)-cumulative behavior the production incident hit. +**(c) Serial-hop measurement [M] — wall-clock is set by the serial backbone, not +the op count.** §0(b) counts *total* object-store ops; wall-clock is set by the ops +on the *critical path*. Measured on the **deployed edge binary `f6d2cc03`** (steps +1+3a landed) via rustfs + a per-op latency proxy, sweeping injected per-op latency `L` +and reading the slope of `wall = compute + serial_hops · L` (the slope **is** the +critical-path hop count; the proxy also reports request overlap → parallelism): + +| depth | total ops | parallelism | **serial backbone (slope)** | `L=0` wall (compute floor) | +|---:|---:|---:|---:|---:| +| ~1 | 107 | 1.0–1.2 | **~109** | 2.15s | +| ~33 | 338 | 3.4–4.0 | **~108** | 2.45s | +| ~85 | 716 | 6.0–7.1 | **~113** | 4.27s | + +The serial backbone is **~110 hops and depth-INVARIANT**, while total ops grow +`+~7/depth` (107→716, the §0(b) term) **and parallelize** (parallelism 1→6, +`max_inflight` up to 65) — so the depth-driven ops add almost nothing to wall-clock. +`wall ≈ 110·RTT + compute`; the prod 35s direct-main write ≈ 110 hops × ~280ms +cross-region RTT. Branch ops measured the same way (4-table graph; prod = 217 tables, +≈50× worse): **branch-create serial ~77, branch-delete ~87** (op counts scale with +table count → §9 step 6), and **branch-WRITE is worst — 1777 ops, serial ~258, 21s +compute floor even at `L=0`** = fork-on-first-write (the path 3a did *not* cover; §9 +step 3b + the fork seam), matching prod's 103–138s. 
+ +**The methodological correction this forces.** *Op count is a cost/space/compute-floor +metric; the serial-hop count (latency slope / `num_stages`) is the wall-clock metric.* +3a's real 90s→35s win (≈2.6×, matching its measured 2.7× op cut) is genuine **because +it removed *serial* hops** (the per-table data opens were on the critical path). But +the wall-clock predictor is not serial-hops *alone* — it is +**`(serial_hops + ops / effective_concurrency) · RTT + compute`**: total op count +re-enters wall-clock whenever the store **caps concurrency**, because the parallel +tail can no longer fan out. + +**(d) The concurrency-cap A/B [M] — proves op count *is* wall-clock on a capped store, +and that step 2a is a primary latency lever (not a parallel afterthought).** §0(c) was +measured on **rustfs with unlimited concurrency** (`max_inflight` reached **129**) — a +poor proxy for R2, which is connection-capped and rate-limited. Re-running the same +write through a proxy capped at **8 concurrent** (R2-realistic), with internal-table +**fragment count as the only variable** (edge binary for writes; the unmerged #291 +binary only to run `optimize`), depth ~130, `__manifest`≈137 fragments: + +| state | per-write ops | wall (cap=8, L=20) | trend | +|---|---:|---:|---| +| **uncompacted** (`__manifest` 137 frags) | 1273 → 1487 → **3505** | 5.9 → 8.4 → **16.4 s** | **runaway** — each write reads all frags **and appends one more** | +| **after #291 `optimize`** (137→1 frag) | 275 → 250 → **197** | 6.2 → 5.4 → **3.8 s** | **bounded** | + +`optimize` collapsed `__manifest` 137→1, `_graph_commits` 140→1 frags → **~6× fewer +ops/write and the runaway stopped.** Under unlimited concurrency this delta vanishes +(the frags fan out); under the cap it is the dominant term. 
**This is the actual +mechanism of the prod 35s and its degradation over time** (the `O(N²)` of §0/§2.2): +on a capped store, every uncompacted write scans all `__manifest`/`_graph_commits` +fragments *and adds one*, so latency climbs with graph age — exactly what prod shows, +and exactly what step 2a halts. Prod confirms the scale: `__manifest` 1,739 obj / +59 MiB, `_graph_commits` 1,848 obj / 23.5 MiB, read per write, **uncompacted** (the +deployed `f6d2cc03` optimize is node/edge-only — §9 step 2 — so an operator `optimize` +run on prod cannot touch them; only #291 can). + +**Corrected conclusion.** The §2.4 op-count math (`1720→198 ⇒ 258s→30s`) is still +wrong *as stated* (it assumes full serialization), but the opposite over-correction — +"step 2 is parallel, so irrelevant to latency" — is **also wrong**, and an artifact of +the unlimited-concurrency harness. The truth is **concurrency-dependent**: on a capped +store (R2) the internal-scan op count *is* on the critical path and **step 2a is a +primary latency lever and the anti-runaway fix**; the residual after compaction +(~4 s here, mostly compute + the serial backbone) is then **step 3b**'s. Both are +load-bearing; which dominates is set by `effective_concurrency × fragment_count`. So +the cost gate (§5.1) must inject a **concurrency cap**, not just latency. + --- ## 1. Problem & measurements @@ -191,7 +275,11 @@ Branch ops compound it: `branch create` is a per-table sequential fork loop (`fork_branch_from_state`, `table_store.rs:282`); `branch delete` opens a snapshot per *other* branch (`ensure_branch_delete_safe`, `omnigraph.rs:1317`) and force-deletes per forked table sequentially (`cleanup_deleted_branch_tables`, -`omnigraph.rs:1359`) **[S]**. +`omnigraph.rs:1359`) **[S]**. 
Measured serial backbones (§0(c), edge binary): branch +create **~77 hops**, delete **~87** (op counts scale with table count → §9 step 6); +**branch *write* is the worst — 1777 ops, ~258-hop serial backbone, a 21s compute +floor even at zero RTT** = fork-on-first-write (the path step 3a did not cover; §9 step +3b + the fork seam), which is why prod branch-load (103–138s) ≫ direct-main (35s). --- @@ -267,6 +355,31 @@ cost. The correct replacement is *scheduled* compaction **and** version cleanup (§9 step 2), **not** re-enabling `auto_cleanup`. Without it, version history (and per-write cost) grows forever. +**Why Lance/LanceDB don't have this cost — the internal-table scan is self-inflicted +[U].** Verified in Lance 7.0.0 source (cargo registry): a Lance dataset's metadata is a +**per-version manifest *file*** — one self-contained protobuf +(`format/manifest.rs:35`, `struct Manifest { fragments: Arc<Vec<Fragment>>, … }`) — +and the current version is resolved **O(1)** via `latest_version_hint.json` +("O(1)/O(k) latest-version lookup via HEAD", `io/commit.rs:75-79`) or the V2 lexical +name. Reading current state is **one file read, never a scan over accumulated +metadata**; old manifests + `_transactions` files are reclaimed by **timestamp GC** +(`dataset/cleanup.rs`, on by default), and manifest *size* is bounded by data +compaction. **LanceDB** is multi-table but each table is an *independent* Lance +dataset; its catalog is a directory/namespace lookup (or a cloud catalog service), not +a mutable dataset read per write — it does **no cross-table atomic commit**, so it +needs no coordinating meta-table. Omnigraph's `__manifest`/`_graph_commits` are +therefore **not a Lance pattern** — they exist only because omnigraph layers a +**mutable catalog *as a Lance dataset*** over 217 independent tables to get a +cross-table atomic commit (the lance#7264 "Alternative A").
The whole §2.2 internal +term is the price of that choice: omnigraph reads its catalog as an **O(fragments) +dataset scan and appends a fragment per write**, where Lance reads its own metadata +**O(1)** and prunes by default. Step 2a (compact → 1 fragment) ≈ Lance's single-file +manifest read; step 2b (cleanup) ≈ Lance's `cleanup_old_versions`; the design simply +re-derives, on a Lance-dataset catalog, the hygiene Lance treats as table stakes — and +§8/lance#7264 MTT is the path to delete the catalog and inherit Lance's O(1) metadata +outright. *(This also raises a design question — should the catalog be a Lance dataset +at all, vs a single flat CAS'd manifest file? — addressed in §8.)* + ### 2.4 Lance namespace: proper use (why the fix is bypass, not patch) The upstream Lance Namespace is a **catalog / discovery layer** — "table @@ -334,9 +447,18 @@ correctness, not drop-in completeness. **Step 2 also proven [M].** On the step-3-patched binary at depth ~87, compacting the internal tables to 1 fragment each (content-preserving) collapsed their scans: `__manifest` 285 → 32 (8.9×), `_graph_commits` 177 → 11 (16×); the step-3 data term -stayed flat at 4. So **both depth terms are now empirically eliminated** — a depth-87 -single edge drops **~1720 → 198 ops (~8.7×; ≈258 s → ≈30 s at 150 ms/RTT)** with -both fixes. The internal term is **fragment-scan growth** (`read_manifest_scan` / +stayed flat at 4. So **both depth *op-count* terms are now empirically eliminated** — +a depth-87 single edge drops **~1720 → 198 ops (~8.7× in op count)** with both fixes. +**Wall-clock correction (§0(c)/(d)):** the `≈258 s → ≈30 s` figure was wrong (it +multiplied *total* ops by RTT as if serial); but the win is **concurrency-dependent**, +not zero. 
Under *unlimited* concurrency the depth-driven ops parallelize and this +op-count cut barely moves wall-clock (the backbone is ~110 hops); **under an +R2-realistic concurrency cap the same op-count cut is a primary latency win** — the +§0(d) A/B shows the uncompacted internal scan *runs away* (6→16 s) and #291's +compaction cuts it ~6× and bounds it. So step 2a is a **latency lever on a capped store +(R2) and the anti-runaway fix**, *and* a compute-floor / Phase-7-prerequisite / space +win; step 3b is the lever for the residual serial backbone. The internal term is +**fragment-scan growth** (`read_manifest_scan` / `commit_graph.refresh` read all fragments of the *latest* version), so the fix is **compaction** (merge fragments) — distinct from the data table's version-chain term that step 3 / version-cleanup handle. `optimize`'s `all_table_keys` @@ -513,9 +635,24 @@ path and would pass falsely. The load-bearing rule both Lance and SlateDB mostly miss: **assert the constant is flat across N, not just small at one N.** A shallow fixture cannot catch an -O(history) cost (the §0(b) table is the red baseline). Add a `num_stages` -(sequential-hop) assertion via a `ThrottledStore` wrapper (Lance's -`test_commit_iops` setup) so an O(N) listing also blows a wall-time budget. +O(history) cost (the §0(b) table is the red baseline). + +**Two latency LOCKs, and the `ThrottledStore` must cap concurrency *and* inject +latency (corrected per §0(c)/(d)).** The wall-clock model is +`(serial_hops + ops/effective_concurrency)·RTT + compute`, so the gate needs **both** +terms, and an unlimited-concurrency harness measures neither honestly: +(1) **serial-hop LOCK** (`serial_hops ≤ K`, flat in depth) — read off the +`wall = compute + serial_hops·L` slope (Lance's `test_commit_iops` setup); catches the +~110-hop backbone (step 3b's target). (2) **op-count-flat-in-history LOCK** under a +**capped-concurrency** `ThrottledStore` (e.g. 
`MAXCONC=8`) — catches the internal-scan +runaway (§0(d)) that step 2a fixes; *without the cap this LOCK is invisible* because +the ops fan out (the §0(d) trap). Both are load-bearing: a build can pass the serial-hop +LOCK and still run away on a capped store if its per-write op count grows with history. +Run the depth sweep through a `ThrottledStore` that **both** throttles per-op latency +**and** bounds in-flight concurrency to an R2-realistic value; assert `serial_hops` flat +*and* `ops` flat in history. (A pure op-count gate under unlimited concurrency would +*fail a correct build* whose parallel scans grow yet cost no wall-clock, and *pass a +slow one* — which is why the cap is the load-bearing addition.) ### 5.2 Tier 2 — wall-clock trend (post-merge / nightly, never a PR gate) @@ -821,6 +958,25 @@ not schedule around MTT landing.** When it ships, `publish`'s *body* swaps (stage→CAS→sidecar → `catalog.transaction()`) while `WriteTxn`/`PublishPlan` and every verb lowering stay. `iss-863`/`iss-864` **[G]** already scope this spike. +**Why keep `__manifest` as a Lance *dataset* (and compact it) rather than a single flat +CAS'd manifest file?** The Lance-source comparison (§2.3) makes this an explicit choice +to defend, not assume. Both reference designs the RFC cites store cross-version metadata +as **one flat file** read O(1): Lance's per-version manifest (`format/manifest.rs`) and +SlateDB's monotonic-ID manifest (§13). A flat `graph_manifest.json` updated by +conditional-PUT would give omnigraph O(1) catalog reads and a natural one-writer CAS +**with no fragment-scan / compaction / cleanup treadmill** — structurally cheaper than +the Lance-dataset `__manifest` whose hygiene §9 step 2 exists to maintain. 
The reason to +keep the Lance-dataset form is the **MTT seam**: `__manifest` is deliberately shaped so +`publish` swaps to Lance `catalog.transaction()` when lance#7264 lands, at which point +Lance owns the cross-table manifest and omnigraph **deletes `__manifest` entirely** — +inheriting Lance's O(1) metadata rather than maintaining its own. A flat-file rewrite +would be a detour *away* from that seam, replaced again by MTT. So the trade is +**"Lance-dataset catalog (compacted, MTT-aligned) over flat-file manifest (locally +cheaper, off the MTT path)"** — defensible, but it means step 2's compaction/cleanup +work is a *bridge cost*, justified only by the MTT endgame; if MTT slips materially, the +flat-file manifest becomes the better target and step 2 stops being a bridge and starts +being permanent overhead. Worth a revisit checkpoint tied to the lance#7264 timeline. + The MemWAL/LSM ingest tier (`iss-681` **[G]**, `dec-adopt-lance-v7-memwal`) is **complementary, not competing, and not in flight** (the `memwal-benefit-analysis` branch is an empty placeholder; the real analysis is commit `c9a81266`). MemWAL @@ -847,10 +1003,21 @@ to flatten the curve. `storage.ops` span metric (§5.3) and the bucket-gated `write_cost_s3.rs` opener LOCK (step 3a's red→green, S3-only per the §9-3a measurement note). 2. **Bound history — bring the INTERNAL tables into optimize/cleanup.** Split into - a compaction half (the latency win, safe) and a cleanup half (version GC, needs - the Q8 watermark). Validated (Lance docs + source): compaction *preserves* - versions and is the only term needed to flatten the per-write metadata scan; - cleanup is the separate version-deleting op that opens the Q8 hole. + a compaction half (safe) and a cleanup half (version GC, needs the Q8 watermark). + Validated (Lance docs + source): compaction *preserves* versions and flattens the + per-write metadata *op-count* scan; cleanup is the separate version-deleting op that + opens the Q8 hole. 
**Latency role — concurrency-dependent, MEASURED (§0(d)):** the + internal fragment scan parallelizes only on a store with free concurrency; under an + R2-realistic cap (8) it serializes and an uncompacted graph *runs away* (per-write + ops 1273→3505, wall 6→16 s), which #291's compaction cuts ~6× and bounds. So on R2 + step 2a is **both a primary latency lever and the anti-runaway fix**, *and* the + **hard prerequisite for Phase 7 / step 4** (the `graph_head` CAS retry re-runs + `load_publish_state`, only acceptable once `__manifest` is compacted), *and* a + compute-floor/space win. (On an unlimited-concurrency store the latency component + alone vanishes — the depth ops fan out — but R2 is not that store.) **#291 is merged + to main but undeployed; the deployed `f6d2cc03` optimize is node/edge-only, so an + operator `optimize` on prod cannot compact these tables — deploying #291 + running + optimize is the immediate prod win.** - **2a. Internal-table compaction. ✅ LANDED.** `optimize` now compacts all three internal tables — `__manifest`, `_graph_commits`, **and `_graph_commit_actors`** (the actor table grows one fragment per commit on the @@ -952,6 +1119,12 @@ to flatten the curve. `iss-merge-recovery-partial-rollforward`, `iss-recovery-sweep-live-writer-rollback`, `iss-934`.) 6. **Branch ops.** Lance `Clone` for create (`iss-691`); concurrent delete loops. + Measured backbones (§0(c)): create ~77, delete ~87 — op counts scale with table + count, so `Clone` (O(tables)→O(1)) + `buffer_unordered` delete are the fix. + **Note: branch *write* (1777 ops, ~258-hop backbone, 21s compute floor) is NOT a + step-6 item** — it is fork-on-first-write stacked on the main backbone, owned by + **step 3b + the fork seam** (the path 3a skipped); it is the single worst write + shape and should be a named acceptance case for step 3b. 7. **Freeze** investment in publisher/sidecar/fork internals; pursue the MTT seam (`iss-863`/`iss-864`) as the strategic exit. 
From 6d4606a830a3a6404ac8ee6b72f70eb24e35052f Mon Sep 17 00:00:00 2001 From: Ragnor Comerford Date: Mon, 22 Jun 2026 13:05:28 +0200 Subject: [PATCH 8/8] fix(engine): optimize survives a cross-process write race on the same table (#297) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * test(engine): cross-process optimize-vs-write race — RED Two regression tests for the prod bug: a direct `optimize` process racing a served write on the same table fails, because the in-process write queue does not serialize across processes and the data-table optimize path has no retry. - optimize_survives_concurrent_insert_advancing_manifest: a concurrent insert advances the manifest while optimize is paused between compact and publish; optimize's equality-CAS publish then fails "expected X but current Y". - optimize_survives_concurrent_delete_before_compaction: a concurrent delete commits before optimize compacts; Lance rebases the compaction past it cleanly, so optimize again fails the publish CAS (the genuine Lance Rewrite-vs-Rewrite overlap is rarer and shares the internal path's retry). Both fail today with ExpectedVersionMismatch. Adds the `optimize.before_compact` failpoint seam + a wait_for_sidecar helper; serializes the optimize failpoint tests (shared failpoint name). The fix lands next. * fix(engine): optimize survives a cross-process write race on the same table The data-table optimize path trusted the in-process write queue and skipped a retry, so a CLI `optimize` racing a served write (separate processes = separate queues) failed: either the Lance Rewrite lost ("preempted by concurrent Update") or the manifest publish lost the strict equality CAS ("expected X but current Y"). 
Unify both compaction paths on the internal path's reopen+replan shape, with a two-level retry that matches the two failure points: - Outer loop (reopen+replan): a genuine Lance Rewrite-vs-Update/Delete same-fragment conflict means our compaction did not commit — reopen at the new HEAD and re-plan. Lance rebases the common disjoint case (a concurrent insert/delete on other fragments) for free, so this fires only on real overlap. - Inner loop (Phase C, monotonic publish): the manifest advanced between our compaction and our publish. The compaction is already committed at Lance HEAD N, so we must NOT reopen (that trips the HEAD>manifest drift guard on our own work). Re-read the current manifest version C: if C >= N the manifest already includes our compaction (versions are linear) — no-op; else fast-forward to N. Monotonic, not the strict equality CAS that manufactured the conflict. The Phase-A sidecar is written once and reused across reopen attempts (every Phase-B commit is content-preserving, so recovery rolls the observed HEAD forward or safely rolls the compaction back). The in-process queue is kept — it is now an in-process contention reducer, not the cross-process correctness guard. Shares the COMPACTION_RETRY_BUDGET constant + is_retryable_lance_conflict with the internal path; adds is_retryable_manifest_conflict for the publish loop. No writer_epoch. Turns the prior commit's two race tests green. * docs(rfc-013): two-op-class principle + the found+fixed optimize-vs-write race §6.6 records the maintenance vs logical op-class distinction (maintenance commutes → Lance rebase + reopen/replan + monotonic manifest fast-forward, no writer_epoch; logical → strict cross-process OCC + epoch) and the prod optimize-vs-served-write race that motivated it, now landed. Adds the matching mechanic row to §4.2.
* fix(engine): retry must not misclassify optimize's own HEAD drift Review catch on the cross-process optimize fix: the outer retry loop re-ran the `lance_head > manifest` drift guard every iteration. After a partial Phase-B commit (the auto_cleanup strip or compaction commits, then a later op hits a retryable conflict), the reopened attempt saw HEAD ahead of the manifest — from OUR own sidecar-covered work, not an external writer — and deleted the sidecar + returned `skipped_for_drift`, stranding uncovered drift that then needs `repair`. Track `head_advanced` (did one of our Phase-B ops already commit). The drift guard now fires only when `!head_advanced` (genuine pre-existing external drift); once we have advanced HEAD, a reopened HEAD>manifest is our work that the monotonic publish fast-forwards. The no-op early-return likewise publishes prior committed work instead of dropping it when `head_advanced`. Regression test `optimize_retry_does_not_misclassify_own_head_drift` injects one retryable reindex conflict after the compaction commits (new `optimize.inject_reindex_conflict` seam); red→green verified by negative control (reverting the gate reproduces `skipped_for_drift: Some(DriftNeedsRepair)`). Also de-flake `optimize_survives_concurrent_insert_advancing_manifest`: pause at `before_compact` (not post-compact) so the concurrent insert lands while HEAD == manifest — otherwise it could race optimize's committed-but-unpublished compaction and hit the write-path "HEAD ahead of manifest" guard. * fix(engine): optimize publish converges on retry-budget exhaustion Review catch (greptile): the monotonic Phase-C publish loop returned an error on its final iteration's retryable manifest conflict, even though that conflict can itself mean a concurrent writer published a version that already includes our (content-preserving) compaction — i.e. the postcondition ("the manifest reflects our compaction") is already met. 
Recovery covered it (no data loss), but the operator saw a spurious error and had to re-run. Restructure the loop to re-read `current` on every retryable conflict and, on budget exhaustion, do a final `current >= state.version` convergence check before surfacing the error — the §6.6 "postcondition is the state, not winning the CAS" principle. Factor the repeated current-version read into `current_manifest_version`. --- crates/omnigraph/src/db/omnigraph/optimize.rs | 500 +++++++++++------- crates/omnigraph/tests/failpoints.rs | 237 +++++++++ docs/dev/rfc-013-write-path-latency.md | 54 ++ 3 files changed, 603 insertions(+), 188 deletions(-) diff --git a/crates/omnigraph/src/db/omnigraph/optimize.rs b/crates/omnigraph/src/db/omnigraph/optimize.rs index 29bf2b6..e3aed3d 100644 --- a/crates/omnigraph/src/db/omnigraph/optimize.rs +++ b/crates/omnigraph/src/db/omnigraph/optimize.rs @@ -359,205 +359,306 @@ async fn optimize_one_table( .acquire_many(&[(table_key.clone(), None)]) .await; - // `compact_files` is a Lance-only maintenance API that needs `&mut Dataset`. - // The `TableStorage` trait deliberately does not surface it (the staged-write - // invariant covers writes; compaction is a separate concern). Unwrap the - // opaque `SnapshotHandle` via `into_dataset()` (`pub(crate)`, gated to the - // maintenance path). - let handle = db - .storage() - .open_dataset_head_for_write(&table_key, &full_path, None) - .await?; - let mut ds = handle.into_dataset(); + // Survive a CROSS-PROCESS race (a CLI `optimize` vs the served server): the + // in-process write queue above serializes only same-process writers, so we also + // retry. Two failure modes, two retry levels: + // * Outer loop — a genuine Lance `Rewrite`-vs-`Update/Delete` same-fragment + // conflict (compaction did NOT commit). Reopen at the new HEAD and re-plan, + // exactly as the internal-table path does. 
(Lance rebases the common disjoint + // case — a concurrent insert/delete on other fragments — for free, so this + // fires only on real overlap.) + // * Inner loop (Phase C) — the manifest advanced under us between our + // compaction and our publish. The compaction IS committed at Lance HEAD, so + // we must NOT reopen (that would trip the HEAD>manifest drift guard on our + // own work); instead re-read the current manifest version and either no-op + // (the manifest already moved past our version — being linear, it descends + // from and includes our compaction) or fast-forward to it. Monotonic, never + // the strict equality CAS that manufactured the bug. + // + // The Phase-A sidecar is written ONCE on the first productive attempt and reused + // across reopen attempts: every Phase-B commit is content-preserving, so a crash + // mid-retry leaves the table readable and recovery either rolls the observed HEAD + // forward (pin still matches the manifest) or safely rolls the compaction back. + let mut sidecar: Option = None; - // CAS baseline: the table's current manifest version, read under the queue - // (in-memory coordinator snapshot, no storage I/O — stable for this section). - let expected_version = db - .fresh_snapshot_for_branch(None) - .await? - .entry(&table_key) - .map(|e| e.table_version) - .ok_or_else(|| OmniError::manifest(format!("no manifest entry for {}", table_key)))?; + // Tracks whether one of OUR Phase-B ops (auto-cleanup strip / compact / reindex) + // already committed and advanced Lance HEAD past the manifest in a prior attempt. + // Once true, a reopened `lance_head > manifest` is our own sidecar-covered work, + // NOT external drift — so the drift guard and the no-op early-return must not treat + // it as such (that would drop our committed work as uncovered drift). 
+ let mut head_advanced = false; - let lance_head_version = ds.version().version; - if lance_head_version < expected_version { - return Err(OmniError::manifest_internal(format!( - "table '{}' Lance HEAD version {} is behind manifest version {}", - table_key, lance_head_version, expected_version - ))); - } - if lance_head_version > expected_version { - tracing::warn!( - target: "omnigraph::optimize", - table = %table_key, - manifest_version = expected_version, - lance_head_version, - "skipping compaction: Lance HEAD is ahead of the manifest; run `omnigraph repair` \ - to classify and publish covered maintenance drift explicitly", - ); - return Ok(TableOptimizeStats::skipped_for_drift( - table_key, - expected_version, - lance_head_version, - )); - } + // Outer loop: open → plan → Phase B, reopening + re-planning on a retryable + // Lance conflict. Breaks with the committed snapshot once Phase B succeeds. + let mut attempt: u32 = 0; + let (snapshot, metrics, pending_indexes, committed) = loop { + attempt += 1; - // Precise "will it compact?" check — `plan_compaction` also accounts for - // deletion materialization (which can rewrite even a single fragment). - let options = CompactionOptions::default(); - let plan = plan_compaction(&ds, &options) - .await - .map_err(|e| OmniError::Lance(e.to_string()))?; - let will_compact = plan.num_tasks() > 0; - // Even when there is nothing to compact, the table may still have index - // work: rows appended since the index was built (e.g. via `ingest --mode - // merge`) are scanned unindexed until folded in (needs_reindex), OR a - // declared `@index` was never built — schema apply records the intent but - // defers the physical build (iss-848), so optimize is the operator-facing - // reconciler that materializes it (needs_index_create). Any of the three is - // enough to enter the publish path. 
If NONE, this table is a no-op and must - // NOT be pinned in a sidecar — a zero-commit pin classifies NoMovement on - // recovery and forces an all-or-nothing rollback of sibling tables' - // legitimate work. Uncovered pre-existing manifest/HEAD drift is skipped - // above and goes through explicit repair, so this only runs on a healthy - // table under the per-table queue + sidecar. - let needs_reindex = TableStore::has_unindexed_fragments(&ds).await?; - // needs_index_work_* checks "a declared index is missing AND row_count > 0", - // so empty tables stay no-ops (never pinned). It re-reads the head under the - // queue we already hold, so it is consistent with `ds`. - let needs_index_create = if let Some(type_name) = table_key.strip_prefix("node:") { - super::table_ops::needs_index_work_node(db, type_name, &table_key, &full_path, None).await? - } else { - super::table_ops::needs_index_work_edge(db, &table_key, &full_path, None).await? - }; - if !will_compact && !needs_reindex && !needs_index_create { - return Ok(TableOptimizeStats::compacted( - table_key, - &CompactionMetrics::default(), - false, - )); - } + // `compact_files` is a Lance-only maintenance API that needs `&mut Dataset`. + // The `TableStorage` trait deliberately does not surface it; unwrap the + // opaque `SnapshotHandle` via `into_dataset()` (gated to the maintenance path). + let mut ds = db + .storage() + .open_dataset_head_for_write(&table_key, &full_path, None) + .await? + .into_dataset(); - // Phase A: recovery sidecar BEFORE any HEAD-advancing op (compaction or - // index optimize), so a crash before the manifest publish rolls forward on - // next open. - let sidecar = crate::db::manifest::new_sidecar( - crate::db::manifest::SidecarKind::Optimize, - None, - // optimize is system-attributed (no `optimize_as` actor API today). 
- None, - vec![crate::db::manifest::SidecarTablePin { - table_key: table_key.clone(), - table_path: full_path.clone(), - expected_version, - // Lower bound — compaction commits N≥1 versions (reserve + rewrite); - // the classifier loose-matches SidecarKind::Optimize. - post_commit_pin: expected_version + 1, - // Optimize uses the loose match (drift is derived state), not - // BranchMerge's Phase-B confirmation — left None. - confirmed_version: None, - table_branch: None, - }], - ); - let handle = - crate::db::manifest::write_sidecar(db.root_uri(), db.storage_adapter(), &sidecar).await?; + // CAS baseline: the table's current manifest version, re-read each attempt + // (a reopen means the manifest may have advanced). + let expected_version = db + .fresh_snapshot_for_branch(None) + .await? + .entry(&table_key) + .map(|e| e.table_version) + .ok_or_else(|| OmniError::manifest(format!("no manifest entry for {}", table_key)))?; - // Phase B: compaction (if any) then incremental index optimize — both - // advance Lance HEAD inside the sidecar window. `compact_files` rewrites - // fragments and drops them from existing index segments' coverage; - // `optimize_indices` folds the rewritten and any previously-unindexed - // fragments back in (Lance's incremental merge, not a full retrain). This - // is the same compact -> optimize_indices sequencing LanceDB's `optimize()` - // uses. `optimize_indices` is an inline-commit residual: lance-6.0.1 - // exposes no uncommitted variant, so like `compact_files` it commits - // directly and relies on the sidecar for recovery. - // Capture the baseline BEFORE the auto-cleanup scrub below, so that if the - // scrub is the only thing that commits, `committed` is still true and Phase C - // publishes the advanced HEAD (no uncovered HEAD>manifest drift). 
- let version_before = ds.version().version; + let lance_head_version = ds.version().version; + if lance_head_version < expected_version { + return Err(OmniError::manifest_internal(format!( + "table '{}' Lance HEAD version {} is behind manifest version {}", + table_key, lance_head_version, expected_version + ))); + } + if !head_advanced && lance_head_version > expected_version { + // Pre-existing EXTERNAL uncovered drift (we have not advanced HEAD yet) — + // go through explicit repair. Once `head_advanced` is set, a reopened + // `lance_head > manifest` is our own prior Phase-B commit (sidecar-covered) + // that the publish below fast-forwards, NOT external drift, so this guard is + // skipped on those retries. + if let Some(h) = sidecar.take() { + let _ = crate::db::manifest::delete_sidecar(&h, db.storage_adapter()).await; + } + tracing::warn!( + target: "omnigraph::optimize", + table = %table_key, + manifest_version = expected_version, + lance_head_version, + "skipping compaction: Lance HEAD is ahead of the manifest; run `omnigraph repair` \ + to classify and publish covered maintenance drift explicitly", + ); + return Ok(TableOptimizeStats::skipped_for_drift( + table_key.clone(), + expected_version, + lance_head_version, + )); + } - // Keep optimize non-destructive on upgraded graphs (same guarantee the - // internal-table path makes — see `clear_stale_auto_cleanup_config`). - // `compact_files` / `optimize_indices` commit with a default `CommitConfig` - // (`skip_auto_cleanup = false`) and expose no skip override, so on a graph - // created by a pre-v7 binary (auto_cleanup ON) those commits would fire - // Lance's version-GC hook and prune `__manifest`-pinned data-table versions. - // Strip the stale config first. 
We hold the per-table queue, so no concurrent - // writer can race this (no retry loop needed, unlike the internal-table path); - // any commit it makes is content-preserving and covered by the Optimize - // sidecar's loose `post_commit_pin` like the other Phase-B commits. - clear_stale_auto_cleanup_config(&mut ds) - .await - .map_err(|e| OmniError::Lance(e.to_string()))?; - - let metrics: CompactionMetrics = if will_compact { - compact_files(&mut ds, options, None) + // Precise "will it compact?" check — `plan_compaction` also accounts for + // deletion materialization (which can rewrite even a single fragment). + let options = CompactionOptions::default(); + let plan = plan_compaction(&ds, &options) .await - .map_err(|e| OmniError::Lance(e.to_string()))? - } else { - CompactionMetrics::default() + .map_err(|e| OmniError::Lance(e.to_string()))?; + let will_compact = plan.num_tasks() > 0; + // Even with nothing to compact, the table may still have index work + // (needs_reindex: rows appended since the index was built; needs_index_create: + // a declared `@index` whose physical build schema apply deferred, iss-848). + // Any of the three enters the publish path. If NONE, this is a no-op and must + // NOT be pinned in a sidecar (a zero-commit pin classifies NoMovement on + // recovery and rolls back siblings). + let needs_reindex = TableStore::has_unindexed_fragments(&ds).await?; + let needs_index_create = if let Some(type_name) = table_key.strip_prefix("node:") { + super::table_ops::needs_index_work_node(db, type_name, &table_key, &full_path, None) + .await? + } else { + super::table_ops::needs_index_work_edge(db, &table_key, &full_path, None).await? + }; + if !will_compact && !needs_reindex && !needs_index_create { + if head_advanced { + // Nothing left to compact, but a prior attempt already advanced HEAD + // (e.g. the strip committed, then compaction conflicted, and the reopen + // is now already compacted). 
Publish that committed work instead of + // dropping it as uncovered drift. + break ( + crate::storage_layer::SnapshotHandle::new(ds), + CompactionMetrics::default(), + Vec::new(), + true, + ); + } + if let Some(h) = sidecar.take() { + let _ = crate::db::manifest::delete_sidecar(&h, db.storage_adapter()).await; + } + return Ok(TableOptimizeStats::compacted( + table_key.clone(), + &CompactionMetrics::default(), + false, + )); + } + + // Phase A: recovery sidecar BEFORE any HEAD-advancing op, written once and + // reused across reopen attempts. + if sidecar.is_none() { + let sc = crate::db::manifest::new_sidecar( + crate::db::manifest::SidecarKind::Optimize, + None, + // optimize is system-attributed (no `optimize_as` actor API today). + None, + vec![crate::db::manifest::SidecarTablePin { + table_key: table_key.clone(), + table_path: full_path.clone(), + expected_version, + // Lower bound — compaction commits N≥1 versions (reserve + rewrite); + // the classifier loose-matches SidecarKind::Optimize. + post_commit_pin: expected_version + 1, + confirmed_version: None, + table_branch: None, + }], + ); + sidecar = Some( + crate::db::manifest::write_sidecar(db.root_uri(), db.storage_adapter(), &sc).await?, + ); + } + + // Test seam: a concurrent (cross-process) writer can interleave here, before + // any Phase-B commit lands, to exercise the reopen+replan path. + crate::failpoints::maybe_fail("optimize.before_compact")?; + + // Phase B: scrub stale auto_cleanup (keeps optimize non-destructive on a + // graph upgraded from a pre-v7 binary whose `compact_files`/`optimize_indices` + // commits would otherwise fire Lance's auto-cleanup GC hook), compact, + // incremental reindex, then materialize declared-but-missing indexes. Each is + // an inline-commit residual covered by the sidecar. A retryable Lance conflict + // here means a concurrent writer preempted an overlapping fragment → reopen at + // the new HEAD and re-plan. 
Baseline captured BEFORE the scrub so that if the + // scrub is the only commit, `committed` still triggers the Phase-C publish. + let version_before = ds.version().version; + match clear_stale_auto_cleanup_config(&mut ds).await { + // `true` ⇒ the strip committed and advanced HEAD past the manifest. + Ok(stripped) => head_advanced |= stripped, + Err(e) if attempt < COMPACTION_RETRY_BUDGET && is_retryable_lance_conflict(&e) => { + continue; + } + Err(e) => return Err(OmniError::Lance(e.to_string())), + } + let metrics: CompactionMetrics = if will_compact { + match compact_files(&mut ds, options, None).await { + Ok(m) => { + head_advanced = true; + m + } + Err(e) if attempt < COMPACTION_RETRY_BUDGET && is_retryable_lance_conflict(&e) => { + continue; + } + Err(e) => return Err(OmniError::Lance(e.to_string())), + } + } else { + CompactionMetrics::default() + }; + // Test seam: inject one retryable reindex conflict AFTER compaction has + // committed (so HEAD is already ahead of the manifest from our own work), + // exercising the own-HEAD (not external) drift classification on the next + // reopened attempt. + if crate::failpoints::maybe_fail("optimize.inject_reindex_conflict").is_err() + && attempt < COMPACTION_RETRY_BUDGET + { + continue; + } + match ds.optimize_indices(&OptimizeOptions::default()).await { + Ok(()) => {} + Err(e) if attempt < COMPACTION_RETRY_BUDGET && is_retryable_lance_conflict(&e) => { + continue; + } + Err(e) => { + return Err(OmniError::Lance(format!("optimize_indices on {}: {}", table_key, e))); + } + } + + let catalog = db.catalog(); + let mut snapshot = crate::storage_layer::SnapshotHandle::new(ds); + let pending_indexes: Vec = + super::table_ops::build_indices_on_dataset_for_catalog( + db, + &catalog, + &table_key, + &mut snapshot, + ) + .await?; + // optimize_indices / index build may also have committed (folded fragments, + // built a deferred index). Any HEAD advance this attempt counts too. 
+ let version_after = snapshot.dataset().version().version; + head_advanced |= version_after != version_before; + + break (snapshot, metrics, pending_indexes, head_advanced); }; - ds.optimize_indices(&OptimizeOptions::default()) - .await - .map_err(|e| OmniError::Lance(format!("optimize_indices on {}: {}", table_key, e)))?; - // Materialize any declared-but-missing index over the just-compacted layout, - // reusing the build chokepoint (idempotent: skips existing indexes; fault- - // isolates an untrainable vector column into `pending` rather than failing). - // Run it UNCONDITIONALLY now that we are past the no-op gate — not only when - // `needs_index_create`. A table can enter this path for compaction or - // reindex while its sole missing index is an untrainable Vector column - // (which `needs_index_work_*` does not count as buildable work); calling the - // build here is what surfaces that column in `pending_indexes`, so optimize - // can't compact a table yet silently drop the deferred-index signal. - // Idempotent + cheap when there is nothing to build. Vector index creation - // is an inline-commit residual; the Optimize sidecar's loose post_commit_pin - // covers the extra commits. - let catalog = db.catalog(); - let mut snapshot = crate::storage_layer::SnapshotHandle::new(ds); - let pending_indexes: Vec = - super::table_ops::build_indices_on_dataset_for_catalog( - db, - &catalog, - &table_key, - &mut snapshot, - ) - .await?; - let version_after = snapshot.dataset().version().version; - let committed = version_after != version_before; - - // Pin the per-writer Phase B → Phase C residual for optimize: Lance HEAD has - // advanced but the manifest publish below hasn't run. + // Pin the per-writer Phase B → Phase C residual: Lance HEAD has advanced but the + // manifest publish below hasn't run. 
crate::failpoints::maybe_fail("optimize.post_phase_b_pre_manifest_commit")?; - // Phase C: publish the compacted version to the manifest (one CAS commit, - // expected = the version observed under the queue). On failure the sidecar - // is intentionally left for the open-time recovery sweep to roll forward. + // Phase C: monotonic fast-forward publish. The compaction is committed at Lance + // HEAD `N`; publish a manifest pointer that includes it. If a concurrent writer + // already advanced the manifest to ≥ N (it built on our compaction), there is + // nothing to do. Otherwise advance to N; a concurrent advance during this window + // is a retryable manifest conflict — re-read the current version and re-evaluate + // (NOT a reopen: the compaction is already committed). if committed { let state = db.storage().table_state(&full_path, &snapshot).await?; - let update = crate::db::SubTableUpdate { - table_key: table_key.clone(), - table_version: state.version, - table_branch: None, - row_count: state.row_count, - version_metadata: state.version_metadata, - }; - let mut expected = std::collections::HashMap::new(); - expected.insert(table_key.clone(), expected_version); - db.coordinator - .write() - .await - .commit_updates_with_actor_with_expected(&[update], &expected, None) - .await?; + let mut published = false; + let mut last_conflict: Option = None; + for _ in 0..COMPACTION_RETRY_BUDGET { + let current = current_manifest_version(db, &table_key).await?; + if current >= state.version { + // The manifest already points at a version that includes our + // compaction (Lance versions are linear). Nothing to publish. 
+ published = true; + break; + } + let update = crate::db::SubTableUpdate { + table_key: table_key.clone(), + table_version: state.version, + table_branch: None, + row_count: state.row_count, + version_metadata: state.version_metadata.clone(), + }; + let mut expected = std::collections::HashMap::new(); + expected.insert(table_key.clone(), current); + match db + .coordinator + .write() + .await + .commit_updates_with_actor_with_expected(&[update], &expected, None) + .await + { + Ok(_) => { + published = true; + break; + } + // A retryable manifest conflict means the manifest moved under us — + // loop and re-read `current` (the top check converges if it now + // already includes our compaction). Record it for the exhaustion path. + Err(e) if is_retryable_manifest_conflict(&e) => last_conflict = Some(e), + // Leave the sidecar for the open-time recovery sweep to roll forward. + Err(e) => return Err(e), + } + } + if !published { + // Budget exhausted under sustained contention. The final conflict may + // itself mean a concurrent writer published a version that already + // includes our (content-preserving) compaction — the postcondition is + // "the manifest reflects our compaction," not "we won the CAS" — so + // re-check before surfacing an error (§6.6). + let current = current_manifest_version(db, &table_key).await?; + if current < state.version { + return Err(last_conflict.unwrap_or_else(|| { + OmniError::manifest_conflict(format!( + "optimize publish of {table_key} exhausted {COMPACTION_RETRY_BUDGET} \ + retries against concurrent writers" + )) + })); + } + } } // Phase D: delete the sidecar (best-effort; recovery resolves a leftover). 
- if let Err(err) = crate::db::manifest::delete_sidecar(&handle, db.storage_adapter()).await { - tracing::warn!( - error = %err, - operation_id = handle.operation_id.as_str(), - "optimize recovery sidecar cleanup failed; next open's recovery sweep will resolve it" - ); + if let Some(h) = sidecar.take() { + if let Err(err) = crate::db::manifest::delete_sidecar(&h, db.storage_adapter()).await { + tracing::warn!( + error = %err, + operation_id = h.operation_id.as_str(), + "optimize recovery sidecar cleanup failed; next open's recovery sweep will resolve it" + ); + } } let mut stat = TableOptimizeStats::compacted(table_key, &metrics, committed); @@ -567,7 +668,7 @@ async fn optimize_one_table( /// Bound on the app-level retry of an internal-table compaction against a /// concurrent live writer (see [`is_retryable_lance_conflict`]). -const INTERNAL_COMPACTION_RETRY_BUDGET: u32 = 5; +const COMPACTION_RETRY_BUDGET: u32 = 5; /// A Lance commit error that means "a concurrent writer preempted us; reload the /// dataset and rerun." `compact_files` commits via `commit_compaction` -> @@ -588,6 +689,29 @@ fn is_retryable_lance_conflict(err: &lance::Error) -> bool { ) } +/// A manifest publish conflict that optimize's monotonic Phase-C loop re-evaluates +/// (re-read the current version, then no-op or fast-forward). Both shapes that reach +/// here are `Conflict`-kind and mean "the manifest moved under us; reconsider," never +/// a lost update: the typed `ExpectedVersionMismatch` (a concurrent writer advanced +/// the table) and the publisher's exhausted row-level CAS (`manifest_conflict`). +fn is_retryable_manifest_conflict(err: &OmniError) -> bool { + matches!( + err, + OmniError::Manifest(m) if m.kind == crate::error::ManifestErrorKind::Conflict + ) +} + +/// The table's current manifest version on `main` (0 if absent), read fresh. Used by +/// optimize's monotonic publish loop to decide no-op (`current >= N`) vs fast-forward. 
+async fn current_manifest_version(db: &Omnigraph, table_key: &str) -> Result { + Ok(db + .fresh_snapshot_for_branch(None) + .await? + .entry(table_key) + .map(|e| e.table_version) + .unwrap_or(0)) +} + /// Remove any stored `lance.auto_cleanup.*` config from a table so compaction /// stays **non-destructive by construction**. Used by both the internal-table /// path ([`compact_internal_table`]) and the data-table path @@ -666,7 +790,7 @@ async fn compact_internal_table( // so optimize would otherwise fail spuriously on a live graph. On a retryable // conflict we re-open at the new HEAD and rerun — the canonical Lance-consumer // pattern. Each attempt opens fresh because the conflict means the version moved. - for attempt in 0..INTERNAL_COMPACTION_RETRY_BUDGET { + for attempt in 0..COMPACTION_RETRY_BUDGET { let handle = db .storage() .open_dataset_head_for_write(table_key, &uri, None) @@ -678,7 +802,7 @@ async fn compact_internal_table( let cleared_config = match clear_stale_auto_cleanup_config(&mut ds).await { Ok(cleared) => cleared, Err(e) => { - if attempt + 1 < INTERNAL_COMPACTION_RETRY_BUDGET && is_retryable_lance_conflict(&e) + if attempt + 1 < COMPACTION_RETRY_BUDGET && is_retryable_lance_conflict(&e) { continue; } @@ -718,7 +842,7 @@ async fn compact_internal_table( )); } Err(e) - if attempt + 1 < INTERNAL_COMPACTION_RETRY_BUDGET + if attempt + 1 < COMPACTION_RETRY_BUDGET && is_retryable_lance_conflict(&e) => { continue; @@ -727,7 +851,7 @@ async fn compact_internal_table( } } Err(OmniError::manifest_conflict(format!( - "internal-table compaction of {table_key} exhausted {INTERNAL_COMPACTION_RETRY_BUDGET} \ + "internal-table compaction of {table_key} exhausted {COMPACTION_RETRY_BUDGET} \ retries against concurrent writers" ))) } diff --git a/crates/omnigraph/tests/failpoints.rs b/crates/omnigraph/tests/failpoints.rs index 9d65bc1..85c056d 100644 --- a/crates/omnigraph/tests/failpoints.rs +++ b/crates/omnigraph/tests/failpoints.rs @@ -3089,6 +3089,7 @@ edge 
WorksAt: Person -> Company /// forward on next open so the manifest tracks the Lance HEAD — and the healed /// table must then accept a schema apply (the original bug's victim). #[tokio::test] +#[serial(optimize)] async fn optimize_phase_b_failure_recovered_on_next_open() { let _scenario = FailScenario::setup(); let dir = tempfile::tempdir().unwrap(); @@ -3178,6 +3179,242 @@ async fn optimize_phase_b_failure_recovered_on_next_open() { .expect("schema apply after optimize recovery must succeed"); } +/// Cross-process race (the prod bug): a served write advances the manifest on the +/// same table while a SEPARATE `optimize` process is paused between its compaction +/// and its manifest publish. The in-process write queue does NOT serialize across +/// processes, so optimize's equality-CAS publish (expected = its pre-compaction +/// version) finds the manifest already advanced. optimize must CONVERGE — the +/// concurrent write built on top of the compacted HEAD, so the compaction is +/// already reflected — not fail with "expected X but current Y". RED before the +/// monotonic-publish fix. 
+#[tokio::test(flavor = "multi_thread", worker_threads = 4)] +#[serial(optimize)] +async fn optimize_survives_concurrent_insert_advancing_manifest() { + let _scenario = FailScenario::setup(); + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap().to_string(); + + { + let db = Omnigraph::init(&uri, helpers::TEST_SCHEMA).await.unwrap(); + for (name, age) in [("alice", 30), ("bob", 31), ("carol", 32), ("dave", 33)] { + db.mutate( + "main", + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", name)], &[("$age", age)]), + ) + .await + .unwrap(); + } + } + + // Pause optimize BEFORE it compacts, so the concurrent insert lands while + // HEAD == manifest (no in-flight optimize drift for the writer to trip on); the + // insert advances the manifest, then optimize compacts on top and must converge + // its publish over the advanced manifest rather than fail the equality CAS. + let failpoint = ScopedFailPoint::new("optimize.before_compact", "pause"); + + let uri_opt = uri.clone(); + let optimize = tokio::spawn(async move { + let db = Omnigraph::open(&uri_opt).await.unwrap(); + db.optimize().await + }); + + // Wait until optimize reaches the pause (its Optimize sidecar is on disk). + assert!( + wait_for_sidecar(dir.path()).await, + "optimize never reached the pre-compact pause", + ); + + // Concurrent insert on the SAME table via a SEPARATE handle (= separate + // in-process write queue = a different process) advances the manifest. 
+ { + let db_b = Omnigraph::open(&uri).await.unwrap(); + db_b.mutate( + "main", + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "eve")], &[("$age", 34)]), + ) + .await + .unwrap(); + } + + drop(failpoint); // release optimize + let result = tokio::time::timeout(std::time::Duration::from_secs(20), optimize) + .await + .expect("optimize task hung") + .unwrap(); + result.expect("optimize must survive a concurrent same-table write (cross-process)"); + + // No lost write: 4 seed + eve all present; graph remains re-optimizable. + let db = Omnigraph::open(&uri).await.unwrap(); + assert_eq!( + helpers::count_rows(&db, "node:Person").await, + 5, + "concurrent insert must not be lost", + ); + db.optimize() + .await + .expect("graph must remain healthy / re-optimizable"); +} + +/// Cross-process race: a served DELETE commits on the same table while a SEPARATE +/// `optimize` process is parked just before its compaction. Lance rebases the +/// compaction past the delete cleanly (so this surfaces as a manifest-CAS mismatch +/// at publish, not a Lance `Rewrite` conflict — the genuine `Rewrite`-vs-`Rewrite` +/// overlap is the rarer many-fragment/concurrent-compaction case, covered by the +/// shared `is_retryable_lance_conflict` retry the internal-table path already +/// exercises). optimize must converge its publish over the advanced manifest and +/// preserve the delete. RED before the fix. 
+#[tokio::test(flavor = "multi_thread", worker_threads = 4)] +#[serial(optimize)] +async fn optimize_survives_concurrent_delete_before_compaction() { + let _scenario = FailScenario::setup(); + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap().to_string(); + + { + let db = Omnigraph::init(&uri, helpers::TEST_SCHEMA).await.unwrap(); + for (name, age) in [("alice", 30), ("bob", 31), ("carol", 32), ("dave", 33)] { + db.mutate( + "main", + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", name)], &[("$age", age)]), + ) + .await + .unwrap(); + } + } + + // Pause optimize BEFORE its compaction commits. + let failpoint = ScopedFailPoint::new("optimize.before_compact", "pause"); + + let uri_opt = uri.clone(); + let optimize = tokio::spawn(async move { + let db = Omnigraph::open(&uri_opt).await.unwrap(); + db.optimize().await + }); + + assert!( + wait_for_sidecar(dir.path()).await, + "optimize never reached the pre-compact pause", + ); + + // Concurrent DELETE of an existing row commits on the same table while optimize is + // parked. NOTE(review): this comment used to claim a Lance-level Rewrite conflict, but + // the test's doc comment says the failure surfaces at manifest publish — confirm. + { + let db_b = Omnigraph::open(&uri).await.unwrap(); + db_b.mutate( + "main", + MUTATION_QUERIES, + "remove_person", + &mixed_params(&[("$name", "alice")], &[]), + ) + .await + .unwrap(); + } + + drop(failpoint); // release optimize + let result = tokio::time::timeout(std::time::Duration::from_secs(20), optimize) + .await + .expect("optimize task hung") + .unwrap(); + result.expect("optimize must reopen+replan past a concurrent overlapping delete"); + + // No lost write: alice's delete persisted (3 rows); graph remains re-optimizable. 
+ let db = Omnigraph::open(&uri).await.unwrap(); + assert_eq!( + helpers::count_rows(&db, "node:Person").await, + 3, + "the concurrent delete must persist (alice removed)", + ); + db.optimize() + .await + .expect("graph must remain healthy / re-optimizable"); +} + +/// Regression: the outer compaction retry loop must NOT misclassify optimize's OWN +/// committed Phase-B work as external drift. Attempt 1 compacts (HEAD → V+1); if a +/// LATER Phase-B op (reindex) then hits a retryable conflict, the reopened attempt +/// sees Lance HEAD ahead of the manifest — from OUR compaction, not an external +/// writer. The drift guard must skip it (we hold the sidecar) and converge, not +/// delete the sidecar and return `skipped_for_drift` (which would strand uncovered +/// drift). Reproduced by injecting one retryable reindex conflict after the compact. +#[tokio::test] +#[serial(optimize)] +async fn optimize_retry_does_not_misclassify_own_head_drift() { + let _scenario = FailScenario::setup(); + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap().to_string(); + + { + let db = Omnigraph::init(&uri, helpers::TEST_SCHEMA).await.unwrap(); + for (name, age) in [("alice", 30), ("bob", 31), ("carol", 32), ("dave", 33)] { + db.mutate( + "main", + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", name)], &[("$age", age)]), + ) + .await + .unwrap(); + } + } + + // Inject exactly one retryable reindex conflict: attempt 1 compacts (HEAD+1) then + // "conflicts" on reindex → retry; attempt 2 reopens with HEAD ahead of the manifest + // from our own compaction — the misclassification trigger. 
+ let _failpoint = ScopedFailPoint::new("optimize.inject_reindex_conflict", "1*return"); + + let db = Omnigraph::open(&uri).await.unwrap(); + let stats = db + .optimize() + .await + .expect("optimize must converge, not misclassify its own HEAD drift"); + let person = stats + .iter() + .find(|s| s.table_key == "node:Person") + .expect("node:Person stat present"); + assert!( + person.skipped.is_none(), + "node:Person must converge, not skipped_for_drift: {:?}", + person.skipped, + ); + + // No uncovered drift stranded: a follow-up optimize is clean and all rows read. + let stats2 = db.optimize().await.unwrap(); + let person2 = stats2 + .iter() + .find(|s| s.table_key == "node:Person") + .unwrap(); + assert!( + person2.skipped.is_none(), + "follow-up optimize must be clean (no stranded drift): {:?}", + person2.skipped, + ); + assert_eq!(helpers::count_rows(&db, "node:Person").await, 4); +} + +/// Poll until `optimize` has written its recovery sidecar (i.e. reached Phase B +/// and is about to / has compacted), signalling it is parked at its failpoint. Returns `true` once `<root>/__recovery` holds at least one entry; gives up and returns `false` after 1000 polls spaced 10 ms apart. +async fn wait_for_sidecar(root: &std::path::Path) -> bool { + let recovery_dir = root.join("__recovery"); + for _ in 0..1000 { + if recovery_dir.exists() + && std::fs::read_dir(&recovery_dir) + .map(|d| d.count() > 0) + .unwrap_or(false) + { + return true; + } + tokio::time::sleep(std::time::Duration::from_millis(10)).await; + } + false +} + #[tokio::test] #[serial(branch_merge_phase_b)] async fn branch_merge_phase_b_failure_recovered_on_next_open() { diff --git a/docs/dev/rfc-013-write-path-latency.md b/docs/dev/rfc-013-write-path-latency.md index d955a9d..1954b01 100644 --- a/docs/dev/rfc-013-write-path-latency.md +++ b/docs/dev/rfc-013-write-path-latency.md @@ -579,6 +579,7 @@ The cost contract becomes part of `publish`'s documented API: | Epoch fence | Monotonic `writer_epoch` in `__manifest`, CAS-claimed at writer init, checked on every publish. 
Fences a whole zombie *writer* deterministically (no TTL); closes the multi-process exposure and the Lance-MTT TTL-lease gap. | SlateDB `FenceableTransactionalObject` **[U]** | | Branch create | Lance `Clone` instead of the per-table fork loop (O(tables)→O(1) sequential). | `iss-691` **[G]** | | Branch delete | Run the per-other-branch safety check and the per-table reclaim loops concurrently (`buffer_unordered`); read branch sets from in-memory coordinator state. | **[S]** | +| Maintenance-class commit (compaction) | Commutative/content-preserving ops do NOT use the logical class's strict OCC: Lance rebases the disjoint case, the app reopens+replans on a real overlap, and the manifest publish is a **monotonic fast-forward** (advance or no-op, never equality CAS) — no `writer_epoch`. The two-op-class rule + the found+fixed optimize-vs-write race: §6.6. | §6.6 **[M]**, **LANDED** | --- @@ -823,6 +824,59 @@ This is the standard WAL-replay + leader-lease shape (confirmed against SlateDB' finding promotes #6/#7 from "nice correctness work" to the load-bearing guard that gates multi-writer topologies — and it is the motivating case for them.** +## 6.6 The two op classes — and a found+fixed maintenance race (LANDED) + +§6.5 is about the **logical** write class. A prod run surfaced the same +process-boundary flaw in the **maintenance** class: a direct CLI `optimize` racing +a served write on the same table **failed** — either the Lance `Rewrite` lost +("preempted by concurrent Update") or the manifest publish lost the strict equality +CAS ("expected 14 but current 15"). Same root cause as §6.5 (the in-process write +queue does not serialize across processes), but the right fix is the **opposite** of +the logical class, because the two classes commute differently: + +| class | examples | commutes? 
| correct commit model | +|---|---|---|---| +| **maintenance** | compaction (`Rewrite`), `optimize_indices` | **yes** (content-preserving) | Lance native rebase + app reopen/replan on real overlap + **monotonic manifest fast-forward** — no epoch, no read-set | +| **logical mutation** | load / mutate / merge / delete | **no** (lost-update, write-skew) | strict cross-process OCC: read-set + write-set CAS under the `writer_epoch` fence (§6.5, #7) | + +Applying strict OCC + equality-CAS uniformly is the mistake: **too strong for +maintenance** (it manufactures a false conflict against a commutative op — this +bug) and **too weak for logical writes cross-process** (§6.5). The maintenance fix +needs **no `writer_epoch`** — that is the tell that it is a different class. + +**Validated against Lance 7.0.0 source + reproduced [M].** Lance has no compaction +re-plan retry (the semantic `RetryableCommitConflict` escapes `commit_transaction`'s +loop at `io/commit.rs:979`; only the OCC manifest race is retried), so the +application must reopen + re-plan — exactly what the internal-table path already +did. Notably, Lance **rebases the common case for free**: a concurrent +insert/update/delete on *other* fragments is disjoint, so the data-table compaction +commits cleanly and the conflict surfaces only at the manifest publish, as the strict equality-CAS mismatch +(the genuine `Rewrite`-vs-`Rewrite` overlap is the rarer many-fragment / +concurrent-compaction case). + +**Fix (LANDED).** Both compaction paths now share one reopen+replan shape with a +two-level retry: an outer loop reopens+replans on a real Lance overlap conflict; an +inner Phase-C loop makes the manifest publish a **monotonic fast-forward** +(advance to the compacted version `N`, or no-op when the manifest already moved to +`≥ N` — being linear, it descends from and includes the compaction), never the +equality CAS. The `Optimize` recovery sidecar is written once and reused across +attempts (every commit is content-preserving). 
The in-process queue is kept as an +in-process contention reducer, not the cross-process guard. No `writer_epoch`. +(`db/omnigraph/optimize.rs`; regression tests in `tests/failpoints.rs`: +`optimize_survives_concurrent_insert_advancing_manifest`, +`optimize_survives_concurrent_delete_before_compaction`.) + +**Relationship to step 5 (the unification).** This is the first correct *instance* of +the maintenance-class commit model, not a parallel band-aid: when step 5 collapses the +writers into the single `publish(txn, plan)` authority, it **relocates** this — a +`TableAction::Rewrite` carries the monotonic-fast-forward + reopen/replan commit model +into the unified publisher — rather than reinventing it. What step 5 deletes is the +*location* (the hand-rolled loop in `optimize_one_table`), not the *semantics*; the two +regression tests above are the contract that must stay green across that refactor. It +also makes step 5 easier, not harder: it already unified the two compaction paths onto +one retry shape and drew the op-class line (logical writers keep equality CAS; only +compaction is monotonic), so there is one fewer pattern for the unification to absorb. + --- ## 7. Invariants & deny-list check