fix(engine): guard absent _graph_commits + always compact internal tables

Addresses PR #291 review findings:

- Greptile (P1): optimize unconditionally opened `_graph_commits` for compaction,
  but a graph can validly have none (the coordinator opens it as `Option`, gated on
  `storage.exists`, for graphs predating the commit graph). `Dataset::open` on the
  absent table errored and failed the whole optimize. Guard the `_graph_commits`
  compaction with the same `storage_adapter().exists()` check the coordinator uses;
  `__manifest` always exists so it stays unguarded. Regression test
  `optimize_tolerates_absent_graph_commits_table` (empty graph so no publish
  recreates the table before the guard).

- Cursor (low): the `table_tasks.is_empty()` early return skipped internal-table
  compaction for a schema with no node/edge types. Removed it so the internal
  tables are compacted regardless of the data-table set.

- Codex (auto-cleanup, P1): documented — `compact_files` commits with a default
  `CommitConfig` (no skip_auto_cleanup) and `CompactionOptions` exposes no override,
  so on a graph storing an *on* auto_cleanup config the commit would fire version
  GC. Both internal tables are created with `auto_cleanup: None`, so new graphs are
  safe; the only exposure is pre-fix upgraded graphs, identical to the existing
  data-table optimize path, with step 2b's watermark as the comprehensive guard.
  Added a comment in `compact_internal_table` recording this.
This commit is contained in:
Ragnor Comerford 2026-06-20 19:13:40 +02:00
parent 8db8937a6a
commit b2f65062c8
2 changed files with 53 additions and 12 deletions

View file

@ -248,10 +248,8 @@ pub async fn optimize_all_tables(db: &Omnigraph) -> Result<Vec<TableOptimizeStat
tasks
};
if table_tasks.is_empty() {
return Ok(Vec::new());
}
// NB: do NOT early-return when `table_tasks` is empty (a schema with no
// node/edge types) — the internal system tables below must still be compacted.
let concurrency = maint_concurrency().min(table_tasks.len()).max(1);
let stats: Vec<Result<TableOptimizeStats>> = futures::stream::iter(table_tasks.into_iter())
@ -286,18 +284,18 @@ pub async fn optimize_all_tables(db: &Omnigraph) -> Result<Vec<TableOptimizeStat
// data-table stats only; each internal compaction does its own coordinator
// refresh for cache coherence.
let mut all = stats;
// `__manifest` always exists (created at init). `_graph_commits` may be absent
// (the coordinator opens it as `Option`, gated on existence — graphs predating
// the commit graph have none), so guard it with the same existence check rather
// than letting `Dataset::open` error and fail the whole optimize.
all.push(
compact_internal_table(db, "__manifest", crate::db::manifest::manifest_uri(db.root_uri()))
.await,
);
all.push(
compact_internal_table(
db,
"_graph_commits",
crate::db::commit_graph::graph_commits_uri(db.root_uri()),
)
.await,
);
let graph_commits_uri = crate::db::commit_graph::graph_commits_uri(db.root_uri());
if db.storage_adapter().exists(&graph_commits_uri).await? {
all.push(compact_internal_table(db, "_graph_commits", graph_commits_uri).await);
}
all.into_iter().collect()
}
@ -575,6 +573,17 @@ async fn compact_internal_table(
));
}
// `compact_files` commits with a default `CommitConfig` (skip_auto_cleanup =
// false) and `CompactionOptions` exposes no override, so on a graph whose
// dataset stores an *on* auto_cleanup config the commit would fire Lance's
// auto-cleanup (version GC). For the internal tables that GC would be version
// deletion on `__manifest`/`_graph_commits` — the cleanup-resurrection surface
// this compaction-only step deliberately avoids (RFC-013 step 2b / Q8). Both
// internal tables are created with `auto_cleanup: None` (see
// `manifest/graph.rs`, `commit_graph.rs`), so a NEW graph stores no config and
// nothing fires here. Pre-auto_cleanup-fix upgraded graphs are the only
// exposure — identical to the existing data-table `optimize_one_table` path —
// and the comprehensive guard is step 2b's watermark.
let metrics = compact_files(&mut ds, options, None)
.await
.map_err(|e| OmniError::Lance(e.to_string()))?;

View file

@ -205,6 +205,38 @@ async fn optimize_compacts_internal_tables() {
.unwrap();
}
/// `optimize` must not fail on a graph that has no `_graph_commits.lance` — a valid
/// state the coordinator opens as `commit_graph = None` (graphs predating the commit
/// graph). Without the existence guard, `Dataset::open` on the absent table errors
/// and fails the whole optimize. Regression for the missing-existence-guard.
///
/// Uses an EMPTY graph deliberately: a graph with data would publish during
/// optimize, and a publish records a graph commit that recreates `_graph_commits`
/// before the guard runs — masking the bug. With no data, nothing recreates it, so
/// the table stays absent through the guard.
#[tokio::test]
async fn optimize_tolerates_absent_graph_commits_table() {
let dir = tempfile::tempdir().unwrap();
let uri = dir.path().to_str().unwrap();
Omnigraph::init(uri, TEST_SCHEMA).await.unwrap();
// Simulate a graph with no commit-graph dataset.
std::fs::remove_dir_all(dir.path().join("_graph_commits.lance")).unwrap();
// Coordinator tolerates the absence; optimize must succeed (the guard skips the
// absent table rather than letting `Dataset::open` error) and omit its stat.
let db = Omnigraph::open(uri).await.unwrap();
let stats = db.optimize().await.unwrap();
assert!(
stats.iter().any(|s| s.table_key == "__manifest"),
"__manifest must still be compacted"
);
assert!(
!stats.iter().any(|s| s.table_key == "_graph_commits"),
"absent _graph_commits must be skipped, not opened (would error)"
);
}
// PR3 (Workstream B): an existing scalar index does not cover fragments
// appended after it was built (build_indices is existence-gated), so those
// rows are scanned unindexed. `optimize` must fold them back in via Lance's