diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS
deleted file mode 100644
index ce8510c..0000000
--- a/.github/CODEOWNERS
+++ /dev/null
@@ -1,18 +0,0 @@
-# AUTOGENERATED from .github/codeowners-roles.yml. Do not edit by hand.
-#
-# To change role membership or path assignments:
-# 1. Edit .github/codeowners-roles.yml
-# 2. Run `python3 .github/scripts/render-codeowners.py`
-# 3. Commit both files together
-#
-# CI fails if this file drifts from its source, and rejects PRs that
-# edit this file directly without also editing the yml.
-
-* @aaltshuler @ragnorc
-
-crates/** @aaltshuler @ragnorc
-docs/** @aaltshuler @ragnorc
-README.md @aaltshuler @ragnorc
-AGENTS.md @aaltshuler @ragnorc
-CLAUDE.md @aaltshuler @ragnorc
-SECURITY.md @aaltshuler @ragnorc
diff --git a/.github/branch-protection.json b/.github/branch-protection.json
index aa1ab19..fa1d57f 100644
--- a/.github/branch-protection.json
+++ b/.github/branch-protection.json
@@ -1,27 +1,19 @@
{
- "_comment": "Branch protection policy for main. Applied via scripts/apply-branch-protection.sh. See docs/branch-protection.md for rationale. NOTE: bypass_pull_request_allowances.users must mirror the engineering owners in .github/codeowners-roles.yml — code owners merge their own PRs without a second review; non-owners still need a code-owner approval. (render-codeowners.py does NOT generate this list; keep it in sync by hand.)",
+ "_comment": "Branch protection policy for main. Applied via scripts/apply-branch-protection.sh. See docs/dev/branch-protection.md for rationale. CODEOWNERS was removed (2-person team where both maintainers own everything, so code-owner review added friction without value). Review is no longer code-owner-scoped and no approvals are required; the required CI status checks are the gate. Maintainers merge their own PRs once checks pass.",
"required_status_checks": {
"strict": true,
"contexts": [
"Classify Changes",
"Check AGENTS.md Links",
- "Test omnigraph-server --features aws",
- "CODEOWNERS matches source",
- "CODEOWNERS not hand-edited"
+ "Test omnigraph-server --features aws"
]
},
"enforce_admins": false,
"required_pull_request_reviews": {
- "dismissal_restrictions": {},
- "dismiss_stale_reviews": true,
- "require_code_owner_reviews": true,
- "required_approving_review_count": 1,
- "require_last_push_approval": false,
- "bypass_pull_request_allowances": {
- "users": ["ragnorc", "aaltshuler"],
- "teams": [],
- "apps": []
- }
+ "dismiss_stale_reviews": false,
+ "require_code_owner_reviews": false,
+ "required_approving_review_count": 0,
+ "require_last_push_approval": false
},
"restrictions": null,
"required_linear_history": true,
diff --git a/.github/codeowners-roles.yml b/.github/codeowners-roles.yml
deleted file mode 100644
index 65f2400..0000000
--- a/.github/codeowners-roles.yml
+++ /dev/null
@@ -1,56 +0,0 @@
-# Source of truth for .github/CODEOWNERS.
-#
-# How to change role membership or path assignments:
-# 1. Edit this file.
-# 2. Run `python3 .github/scripts/render-codeowners.py` to regenerate
-# .github/CODEOWNERS.
-# 3. Commit both files in the same PR.
-#
-# CI fails on drift between this source and the generated CODEOWNERS
-# (see .github/workflows/codeowners.yml). CI also rejects direct edits
-# to .github/CODEOWNERS that don't accompany a change here.
-#
-# Why a generator instead of editing CODEOWNERS directly?
-# The yml is the audit trail: `git log .github/codeowners-roles.yml`
-# shows every role change with a reviewable diff and a merge commit.
-# The rendered CODEOWNERS is what GitHub reads at PR time.
-
-roles:
- engineering:
- description: >
- All production code under crates/**. Engine, CLI, server,
- compiler.
- members:
- - aaltshuler
- - ragnorc
-
- docs:
- description: >
- Documentation under docs/**, plus repo-level docs (README.md,
- AGENTS.md, CLAUDE.md symlink, SECURITY.md).
- members:
- - aaltshuler
- - ragnorc
-
-# Path → role mapping. GitHub CODEOWNERS uses "last match wins"
-# semantics — when multiple patterns match a file, only the last
-# matching pattern's owners apply. The generator handles this by
-# emitting `default` as the first `*` line and the specific patterns
-# below afterward, so specific paths override the catch-all.
-#
-# Within this list, order matters only between overlapping specific
-# patterns (the later one wins). Today nothing overlaps; future
-# additions should keep more-specific patterns later.
-paths:
- "crates/**": [engineering]
- "docs/**": [docs]
- "README.md": [docs]
- "AGENTS.md": [docs]
- "CLAUDE.md": [docs]
- "SECURITY.md": [docs]
-
-# Catch-all for paths not explicitly mapped (.github/, scripts/,
-# Cargo.toml, Cargo.lock, openapi.json, LICENSE, etc.). Defaults to
-# engineering — every change to repo infrastructure needs the
-# engineering owner's review.
-default: [engineering]
diff --git a/.github/scripts/render-codeowners.py b/.github/scripts/render-codeowners.py
deleted file mode 100755
index 5e96545..0000000
--- a/.github/scripts/render-codeowners.py
+++ /dev/null
@@ -1,205 +0,0 @@
-#!/usr/bin/env python3
-"""Render .github/CODEOWNERS and the ownership tables in
-docs/dev/codeowners.md from .github/codeowners-roles.yml.
-
-The yml is the source of truth. This script expands the role-based yml
-into (1) the flat path→owners format GitHub expects in
-`.github/CODEOWNERS`, and (2) the "who owns what" markdown tables spliced
-between the generated-region markers in `docs/dev/codeowners.md`. Both are
-derived artifacts; CI re-renders them on every PR (see
-.github/workflows/codeowners.yml) and auto-commits the result on same-repo
-PRs, so the source of truth and the human-readable view never drift.
-
-Usage:
- python3 .github/scripts/render-codeowners.py
-
-Exits non-zero on:
- - Missing PyYAML.
- - Unknown role referenced in `paths` or `default`.
- - Role with no members (a role must always resolve to at least
- one owner; otherwise CODEOWNERS would assign nobody and GitHub
- would silently fall back to "no required reviewer", which
- defeats the purpose).
- - Missing generated-region markers in docs/dev/codeowners.md.
-"""
-
-from __future__ import annotations
-
-import sys
-from pathlib import Path
-
-try:
- import yaml
-except ImportError:
- sys.exit(
- "error: PyYAML is required. Install with `pip install pyyaml` "
- "or `python3 -m pip install pyyaml`."
- )
-
-REPO_ROOT = Path(__file__).resolve().parents[2]
-SOURCE = REPO_ROOT / ".github" / "codeowners-roles.yml"
-OUTPUT = REPO_ROOT / ".github" / "CODEOWNERS"
-DOCS = REPO_ROOT / "docs" / "dev" / "codeowners.md"
-
-# The "who owns what" tables in docs/dev/codeowners.md are spliced between
-# these markers so the human-readable view never drifts from the source of
-# truth. Edit codeowners-roles.yml and re-render — never the table by hand.
-DOCS_BEGIN = ""
-DOCS_END = ""
-
-BANNER = """\
-# AUTOGENERATED from .github/codeowners-roles.yml. Do not edit by hand.
-#
-# To change role membership or path assignments:
-# 1. Edit .github/codeowners-roles.yml
-# 2. Run `python3 .github/scripts/render-codeowners.py`
-# 3. Commit both files together
-#
-# CI fails if this file drifts from its source, and rejects PRs that
-# edit this file directly without also editing the yml.
-"""
-
-
-def resolve(role_name: str, roles: dict) -> list[str]:
- role = roles.get(role_name)
- if role is None:
- sys.exit(
- f"error: unknown role '{role_name}'. "
- f"Known roles: {sorted(roles.keys())}"
- )
- members = role.get("members") or []
- if not members:
- sys.exit(
- f"error: role '{role_name}' has no members. "
- f"A role must resolve to at least one owner."
- )
- return members
-
-
-def owners_for(role_names: list[str], roles: dict) -> list[str]:
- """Return @-prefixed GitHub handles, deduped, preserving order."""
- seen: list[str] = []
- for role_name in role_names:
- for member in resolve(role_name, roles):
- handle = f"@{member}"
- if handle not in seen:
- seen.append(handle)
- return seen
-
-
-def _oneline(text: str) -> str:
- """Collapse a folded/multi-line YAML description into one cell of text."""
- return " ".join((text or "").split())
-
-
-def ownership_tables(spec: dict, roles: dict) -> str:
- """Render the human-readable "who owns what" markdown — a path→owners
- table (the operative view at PR time, in last-match-wins order with the
- catch-all first) plus a role→members table. Spliced into the docs between
- the markers so it is always current with the source of truth."""
- out: list[str] = []
-
- out.append("**Path → owners** (GitHub applies *last match wins*; the `*` "
- "catch-all is listed first and is overridden by the specific "
- "patterns below it):")
- out.append("")
- out.append("| Path | Owners | Role(s) |")
- out.append("|---|---|---|")
- if "default" in spec:
- owners = " ".join(owners_for(spec["default"], roles))
- out.append(f"| `*` | {owners} | {', '.join(spec['default'])} |")
- for pattern, role_names in (spec.get("paths") or {}).items():
- owners = " ".join(owners_for(role_names, roles))
- out.append(f"| `{pattern}` | {owners} | {', '.join(role_names)} |")
- out.append("")
-
- out.append("**Roles**:")
- out.append("")
- out.append("| Role | Members | Description |")
- out.append("|---|---|---|")
- for name, role in roles.items():
- members = " ".join(f"@{m}" for m in (role.get("members") or []))
- out.append(f"| `{name}` | {members} | {_oneline(role.get('description', ''))} |")
- out.append("")
-
- return "\n".join(out)
-
-
-def splice_docs(table_md: str) -> None:
- """Replace the region between DOCS_BEGIN/DOCS_END in the docs file with the
- freshly generated tables, leaving surrounding prose untouched."""
- if not DOCS.exists():
- sys.exit(f"error: docs file not found: {DOCS}")
- text = DOCS.read_text()
- if DOCS_BEGIN not in text or DOCS_END not in text:
- sys.exit(
- f"error: ownership markers not found in {DOCS.relative_to(REPO_ROOT)}. "
- f"Add the lines:\n {DOCS_BEGIN}\n {DOCS_END}\n"
- f"around the generated table region."
- )
- head, rest = text.split(DOCS_BEGIN, 1)
- _, tail = rest.split(DOCS_END, 1)
- new = f"{head}{DOCS_BEGIN}\n\n{table_md}\n{DOCS_END}{tail}"
- DOCS.write_text(new)
-
-
-def main() -> int:
- if not SOURCE.exists():
- sys.exit(f"error: source file not found: {SOURCE}")
- spec = yaml.safe_load(SOURCE.read_text())
-
- roles = spec.get("roles") or {}
- if not roles:
- sys.exit("error: codeowners-roles.yml declares no roles")
-
- paths = spec.get("paths") or {}
- if not paths:
- sys.exit("error: codeowners-roles.yml declares no paths")
-
- lines: list[str] = [BANNER]
-
- # Pad the path column for alignment. Width is the longest pattern
- # plus a small margin.
- width = max(len(p) for p in paths) + 2
-
- # GitHub CODEOWNERS uses "last match wins" semantics. Emit the
- # default catch-all `*` FIRST so specific patterns below override
- # it for the paths they cover. If we emitted `*` last, every file
- # would resolve to the default owners regardless of more-specific
- # rules — which would silently nullify any role distinction.
- if "default" in spec:
- default_owners = owners_for(spec["default"], roles)
- lines.append(f"{'*':<{width}} {' '.join(default_owners)}")
- lines.append("")
-
- for pattern, role_names in paths.items():
- owners = owners_for(role_names, roles)
- lines.append(f"{pattern:<{width}} {' '.join(owners)}")
-
- lines.append("") # trailing newline so the file ends cleanly
-
- rendered = "\n".join(lines)
-
- # Regression check: the catch-all `*` line (if any) must precede
- # every specific-path line. Failure here means the generator is
- # silently nullifying specific rules.
- if "default" in spec:
- non_comment = [ln for ln in rendered.splitlines() if ln and not ln.startswith("#")]
- first_pattern = non_comment[0].split()[0] if non_comment else None
- if first_pattern != "*":
- sys.exit(
- f"error: generator invariant violated — first emitted pattern is "
- f"{first_pattern!r}, expected '*'. CODEOWNERS uses last-match-wins; "
- f"the catch-all must come first."
- )
-
- OUTPUT.write_text(rendered)
- print(f"wrote {OUTPUT.relative_to(REPO_ROOT)}")
-
- splice_docs(ownership_tables(spec, roles))
- print(f"updated {DOCS.relative_to(REPO_ROOT)}")
- return 0
-
-
-if __name__ == "__main__":
- sys.exit(main())
diff --git a/.github/workflows/codeowners.yml b/.github/workflows/codeowners.yml
deleted file mode 100644
index 75b3515..0000000
--- a/.github/workflows/codeowners.yml
+++ /dev/null
@@ -1,110 +0,0 @@
-name: CODEOWNERS
-
-# Runs on EVERY pull request (no paths filter). The two jobs below are
-# required status checks on `main`; a path-filtered required check never
-# reports for PRs outside the filter and leaves them permanently "pending"
-# (the trap that forced admin-override merges). Always-run + cheap
-# short-circuit is what keeps them honest.
-on:
- pull_request:
- workflow_dispatch:
-
-# `drift` auto-commits the regenerated artifacts back to same-repo PR
-# branches, so it needs write access.
-permissions:
- contents: write
-
-jobs:
- # NOTE: the job `name:` values below ("CODEOWNERS matches source" /
- # "CODEOWNERS not hand-edited") ARE the status-check contexts that
- # .github/branch-protection.json must list verbatim. Renaming a job here
- # is a branch-protection change — update the JSON and re-apply.
- drift:
- name: CODEOWNERS matches source
- runs-on: ubuntu-latest
- steps:
- - uses: actions/checkout@v5.0.1
-
- - name: Set up Python
- uses: actions/setup-python@v5.4.0
- with:
- python-version: '3.13'
-
- - name: Install PyYAML
- run: pip install pyyaml
-
- - name: Re-render CODEOWNERS + ownership docs
- run: python3 .github/scripts/render-codeowners.py
-
- # Same-repo PR: push the regenerated artifacts back so contributors
- # never have to run the script locally. Mirrors the openapi.json
- # auto-commit in ci.yml (separate shallow clone of the head branch so
- # the pushed commit carries only the regenerated files).
- - name: Commit regenerated artifacts to PR branch
- if: |
- github.event_name == 'pull_request' &&
- github.event.pull_request.head.repo.full_name == github.repository
- env:
- GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- run: |
- if git diff --quiet -- .github/CODEOWNERS docs/dev/codeowners.md; then
- echo "CODEOWNERS and ownership docs already in sync."
- exit 0
- fi
- tmp=$(mktemp -d)
- git clone --depth 1 --branch "${{ github.head_ref }}" \
- "https://x-access-token:${GITHUB_TOKEN}@github.com/${{ github.repository }}.git" \
- "$tmp"
- cp .github/CODEOWNERS "$tmp/.github/CODEOWNERS"
- cp docs/dev/codeowners.md "$tmp/docs/dev/codeowners.md"
- cd "$tmp"
- if git diff --quiet -- .github/CODEOWNERS docs/dev/codeowners.md; then
- echo "Head branch already matches; nothing to push."
- exit 0
- fi
- git config user.name "github-actions[bot]"
- git config user.email "41898282+github-actions[bot]@users.noreply.github.com"
- git add .github/CODEOWNERS docs/dev/codeowners.md
- git commit -m "chore: regenerate CODEOWNERS + ownership docs"
- git push
-
- # Fork PR / workflow_dispatch: cannot push back, so enforce drift
- # strictly. The contributor runs the script and commits the result.
- - name: Verify in sync (forks / manual runs)
- if: |
- !(github.event_name == 'pull_request' &&
- github.event.pull_request.head.repo.full_name == github.repository)
- run: |
- if ! git diff --quiet -- .github/CODEOWNERS docs/dev/codeowners.md; then
- echo "::error::Generated CODEOWNERS / ownership docs are out of sync with .github/codeowners-roles.yml."
- echo "::error::Run \`python3 .github/scripts/render-codeowners.py\` and commit the result."
- echo "--- diff ---"
- git --no-pager diff -- .github/CODEOWNERS docs/dev/codeowners.md
- exit 1
- fi
- echo "Generated artifacts are in sync with their source."
-
- noedit:
- name: CODEOWNERS not hand-edited
- runs-on: ubuntu-latest
- steps:
- - uses: actions/checkout@v5.0.1
- with:
- # Need history so we can diff against the PR base.
- fetch-depth: 0
-
- - name: Reject hand-edits to generated file
- # Only meaningful for PRs (needs a base to diff against).
- if: github.event_name == 'pull_request'
- run: |
- base="origin/${{ github.base_ref }}"
- git fetch origin "${{ github.base_ref }}" --quiet
- changed=$(git diff --name-only "$base" HEAD)
- edited_generated=$(echo "$changed" | grep -E '^\.github/CODEOWNERS$' || true)
- edited_source=$(echo "$changed" | grep -E '^\.github/codeowners-roles\.yml$' || true)
- if [ -n "$edited_generated" ] && [ -z "$edited_source" ]; then
- echo "::error::This PR edits .github/CODEOWNERS but not its source .github/codeowners-roles.yml."
- echo "::error::Edit the yml and regenerate via \`python3 .github/scripts/render-codeowners.py\`."
- exit 1
- fi
- echo "CODEOWNERS edits accompany source edits (or no CODEOWNERS edits in this PR)."
diff --git a/AGENTS.md b/AGENTS.md
index e231d45..843423f 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -103,7 +103,6 @@ Full diagram and concurrency model: [docs/dev/architecture.md](docs/dev/architec
| Install (binary / Homebrew / source / channels) | [docs/user/install.md](docs/user/install.md) |
| Deployment (binary / container / S3-local testing / auth / build variants) | [docs/user/deployment.md](docs/user/deployment.md) |
| CI / release workflows | [docs/dev/ci.md](docs/dev/ci.md) |
-| Code ownership (CODEOWNERS source of truth, roles, regeneration) | [docs/dev/codeowners.md](docs/dev/codeowners.md) |
| Branch protection policy (declarative, applied via `scripts/apply-branch-protection.sh`) | [docs/dev/branch-protection.md](docs/dev/branch-protection.md) |
| Constants & tunables cheat sheet | [docs/user/reference/constants.md](docs/user/reference/constants.md) |
| Per-version release notes | [docs/releases/](docs/releases/) |
@@ -126,6 +125,8 @@ This is a decision lens, not a code-size rule. It cuts both ways. Sometimes the
When evaluating a design, ask: *"what does this look like after 5 more changes like it?"* If the answer is "this converges to one shape", cost is bounded. If it's "this forks every time", the option is mortgaging the future for present convenience — pick differently.
+The same lens has a structural corollary: **one source of truth, cheaply derived.** Lance and the manifest are the source of truth; everything else is a derived view. Maintaining a parallel copy invites drift that compounds over time, and re-deriving a view from the full source on every call makes its cost grow with history. Both are liabilities integrated over time, so both are ruled out the same way: hold a warm derived view and refresh it with a cheap probe, never shadow the source or rebuild from it cold. Invariant 15 in [docs/dev/invariants.md](docs/dev/invariants.md) states this; invariants 1 (respect the substrate) and 7 (indexes are derived state) are instances.
+
### Tiebreakers when liability alone is silent
- **Correctness > simplicity > performance.** Lexicographic — give up performance for simpler code; give up simplicity for correct code; never give up correctness. The deny-list ("no silent failures," "no acks before durable persistence," "no reads of partial commits") is this rule's hard floor.
@@ -146,6 +147,7 @@ These are architectural rules that need to be in scope on every change. They're
5. **Reads always see the current index state for the branch they're reading.** Indexes track the branch head, not historical snapshots. If you change index lifecycle, preserve this guarantee.
6. **Stable type IDs survive renames.** Schema migration relies on identity that's stable across rename — don't mint new IDs on rename.
7. **Logical contract over physical state.** Physical state (index coverage, fragment layout, compaction versions, staged writes) is derived and rebuildable; it must never fail a logical operation. Check preconditions against logical state and let reconciliation converge the physical state idempotently — genuine logical conflicts still fail loudly. This is the rule rules 1–6 instantiate; full statement and applications in [docs/dev/invariants.md](docs/dev/invariants.md).
+8. **One source of truth, cheaply derived.** Lance and the manifest are the source of truth; runtime state is a derived view of them. Don't maintain a parallel copy that can drift, and don't re-derive a view from cold storage on every call (that makes cost grow with history). Hold it warm, refresh with a cheap probe.
### Deny-list (fast-pass review filter — full reasoning in [docs/dev/invariants.md](docs/dev/invariants.md))
@@ -167,6 +169,7 @@ If a proposal fits one of these, the burden is on the proposer to justify why th
- Cloud-only correctness fixes — correctness is always OSS.
- Forking the codebase for Cloud — trait-extension only.
- Hand-rolling something Lance already does — check the spec first.
+- Shadowing the source of truth with a maintained parallel copy, or re-deriving a derived view from cold storage per call (cost then scales with history). Hold it warm and refresh cheaply.
- Mutating in place state that should be immutable (Lance fragments, index segments) — new segments instead.
- Silent failures — OOM, timeout, partial result must all be surfaced and bounded.
- Shipping observable behavior as if it weren't part of the contract — output ordering, error-message text, timestamp precision, default-flag values, latency profile. Per Hyrum's Law, every observable behavior gets depended on once shipped; don't expose what you don't want to commit to.
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 2d77ef0..1029b2f 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -22,8 +22,8 @@ that turns out to be non-trivial will be redirected — that's about process, no
the merit of the change.
> **Maintainers (ModernRelay team)** follow a separate internal process and are
-> not bound by the intake rules above. Everyone is bound by review, CODEOWNERS,
-> branch protection, and CI.
+> not bound by the intake rules above. Everyone is bound by review, branch
+> protection, and CI.
## Development
diff --git a/Cargo.lock b/Cargo.lock
index 92279d2..9c77f87 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4942,6 +4942,7 @@ dependencies = [
"lance-datafusion",
"lance-file",
"lance-index",
+ "lance-io",
"lance-linalg",
"lance-namespace",
"lance-namespace-impls",
diff --git a/GOVERNANCE.md b/GOVERNANCE.md
index 5878f1f..2768e5b 100644
--- a/GOVERNANCE.md
+++ b/GOVERNANCE.md
@@ -9,19 +9,18 @@ to happen before code lands?*
> Discussions, RFCs, and pull requests from people outside the ModernRelay
> team. **Maintainers operate under a separate internal process** and are not
> bound by the intake gates below. Everyone, maintainer or not, is still bound
-> by the universal gates: branch protection on `main` and CODEOWNERS review
-> (see [docs/dev/branch-protection.md](docs/dev/branch-protection.md) and
-> [docs/dev/codeowners.md](docs/dev/codeowners.md)).
+> by the universal gates: branch protection on `main` and CI
+> (see [docs/dev/branch-protection.md](docs/dev/branch-protection.md)).
## Roles
| Role | Who | Authority |
|---|---|---|
-| **Maintainer** | The code owners in [`.github/CODEOWNERS`](.github/CODEOWNERS) (generated from [`.github/codeowners-roles.yml`](.github/codeowners-roles.yml)) | Validate issues, accept/reject RFCs, review and merge PRs, set direction. Final decision authority. |
+| **Maintainer** | The ModernRelay team (repository admins) | Validate issues, accept/reject RFCs, review and merge PRs, set direction. Final decision authority. |
| **Contributor** | Anyone else | Report problems (Issues), propose ideas (Discussions), author RFCs, and open pull requests. |
-Decision authority rests with the maintainers. CODEOWNERS is the single source
-of truth for who that is; this document does not duplicate the list.
+Decision authority rests with the maintainers (the ModernRelay team holding
+repository-admin access).
## The three channels
@@ -51,7 +50,7 @@ contribution.
Pull request ◀──────────┴──────────│── merged == accepted │
(links the issue or the accepted RFC) ◀───────┘ (implementation PRs reference it) │
│
- review + CODEOWNERS + branch protection
+ review + branch protection + CI
▼
merged
```
@@ -91,8 +90,8 @@ where it's reviewable.
## What maintainers do *not* gate
Maintainers' own changes do not pass through the intake gates above — the team
-runs a separate internal process. The universal gates (review, CODEOWNERS,
-branch protection, CI) apply to everyone. Enforcement of the intake rules is, to
+runs a separate internal process. The universal gates (review, branch
+protection, CI) apply to everyone. Enforcement of the intake rules is, to
start, **by convention and review** (PR template + labels); an automated check
keyed to author association may be added later if volume warrants.
diff --git a/README.md b/README.md
index 9c4f8bc..deaea8b 100644
--- a/README.md
+++ b/README.md
@@ -1,200 +1,233 @@
-# Omnigraph
+
-Omnigraph acts as operational state & coordination layer for agents.
-Hundreds of agents can enrich the graph on parallel isolated branches and changes can be reviewed and merged safely.
+
+
+
+
+
-- Git-style versioning & branching
-- Multimodal retrieval (graph+vector/fts+filters) optimized for context assembly
-- Runs on the local filesystem or any S3-compatible object store (AWS S3, R2, MinIO, RustFS)
-- Native blob-as-data support (docs, images, videos, etc)
-- VPC, On-prem, hybrid deployment
-- [`Lance`](https://github.com/lance-format/lance) format as open storage layer
+
-| AS CODE | What it means |
+Omnigraph is the operational state and coordination layer for fleets of agents.\
+Run it as a server, declared as code; hundreds of agents operate and enrich the graph on parallel isolated branches, and every change is reviewed and merged safely.
+
+## Key capabilities
+
+| Capability | What it gives you |
|---|---|
-| **Schema AS CODE** | Typed `.pg` schemas, planned, applied, enforced |
-| **Context AS CODE** | Linted queries & agentic nudges, versioned and reusable |
-| **Security AS CODE** | Cedar policies enforced server-side on every mutation |
-| **Dashboards AS CODE** | Declarative views & controls over the graph *(coming)* |
+| **Declared as code** | A `cluster.yaml` declares graphs, schemas, stored queries, embedding providers, and policies; `cluster apply` converges it and `omnigraph-server` brings every graph online at `/graphs/{id}/…`. |
+| **Built for fleets of agents** | Hundreds of agents enrich the graph on **parallel isolated branches**; changes are reviewed and merged safely, Git-style, across the whole graph. |
+| **Multimodal retrieval** | Graph traversal + vector ANN + full-text + Reciprocal Rank Fusion in **one** query runtime, for context assembly. |
+| **Security as code** | Cedar policy enforced **server-side on every mutation**, per-graph and server-wide; bearer auth; actor/audit tracking. |
+| **Runs on your infrastructure** | Any S3-compatible object store: **on-prem via RustFS / MinIO**, or AWS S3 / R2 / GCS. VPC, on-prem, hybrid; your data never leaves your store. |
+| **Open, versioned storage** | [`Lance`](https://github.com/lance-format/lance) columnar format: branchable, time-travelable, with native blob-as-data (docs, images, video). |
-## Core Use Cases
+## What you can build
-| Use case | What it's for
+| Use case | What it's for |
|---|---|
-| **Company brain** | Org knowledge unified into one queryable graph |
-| **Context graph** | Decision traces and codified tribal knowledge |
-| **Agentic memory** | Durable, versioned memory for long-running agents |
-| **Dev graph** | Issues & dependency model for coding agents |
-| **R&D data layer** | Experiments & trials data written into branches |
-| **ML workflows** | Versioned, branchable graphs for training & eval |
-| **Karpathy's LLM wiki** | A living, agent-updatable knowledge base |
+| **Company brain** | Org knowledge unified into one graph every agent can query |
+| **Agentic memory** | Durable, versioned memory: a branch per agent or per task, merged on review |
+| **Context graph** | Decision traces and codified tribal knowledge for retrieval |
+| **Dev graph** | Issues & dependency model that coding agents read and write |
+| **R&D / ML data layer** | Experiments and trials written into branches, versioned for training & eval |
-## Quick Install
+## Install
```bash
curl -fsSL https://raw.githubusercontent.com/ModernRelay/omnigraph/main/scripts/install.sh | bash
```
-This installs `omnigraph` and `omnigraph-server` into `~/.local/bin` from
-published release binaries.
-
-Or install with Homebrew:
+This installs `omnigraph` (CLI) and `omnigraph-server` into `~/.local/bin` from
+published release binaries. Or with Homebrew:
```bash
brew tap ModernRelay/tap
brew install ModernRelay/tap/omnigraph
```
-## Quick start
-
-The fastest path is an **embedded, local file-backed graph** — no server, no
-object store, no Docker:
-
-```bash
-# A schema and one row of data
-cat > schema.pg <<'PG'
-node Person {
- slug: String @key
- name: String
- title: String?
-}
-PG
-echo '{"type":"Person","data":{"slug":"alice","name":"Alice","title":"Engineer"}}' > people.jsonl
-
-# Create → load (--mode is required) → query
-omnigraph init --schema schema.pg ./graph.omni
-omnigraph load --data people.jsonl --mode overwrite --store ./graph.omni
-omnigraph query find_people --store ./graph.omni --params '{"t":"Engineer"}' \
- -e 'query find_people($t: String) { match { $p: Person { title: $t } } return { $p.name } }'
-
-# Branch, write in isolation, merge — Git-style across the whole graph
-omnigraph branch create --from main review/new-hires --store ./graph.omni
-omnigraph branch merge review/new-hires --into main --store ./graph.omni
-```
-
-**Storage backends** — the same flow runs on any backend; only the graph address changes:
-
-| Backend | Use it for | Graph address |
-|---|---|---|
-| **Embedded** (local filesystem) | dev, demos, single machine — the default | `./graph.omni` |
-| **Object storage** (AWS S3, R2, GCS-S3) | shared, multi-host, durable | `s3://bucket/graph.omni` (+ the `AWS_*` env) |
-| **RustFS / MinIO** | rehearse the S3 path locally, no cloud account | `s3://…` against a local endpoint → [deployment guide](docs/user/deployment.md#testing-against-s3-locally) |
-
-`init` takes the address as its positional argument (`omnigraph init --schema schema.pg `); `load`, `query`, and `branch` take it via `--store `.
-
-For a **served, multi-graph deployment** (the cluster model), see [Common Commands](#common-commands) below.
-
## Set it up with an AI agent
-Omnigraph is built to be set up by coding agents. Paste this into Claude Code,
-Cursor, or any agent that can read a URL, install a package, and run a shell
-command — it installs the skill, reads the docs, and walks you through setup for
-your use case:
+Omnigraph is built to be run by coding agents. Two ways in:
-```text
-Help me set up Omnigraph (a lakehouse-native graph engine for agents).
-
-1. Install the Omnigraph skill so you operate it correctly:
- npx skills add ModernRelay/omnigraph@omnigraph
-2. Read the docs at https://github.com/ModernRelay/omnigraph — start with
- docs/user/quickstart.md, then docs/user/clusters/index.md.
-3. Skim the starter graphs and seed data in the cookbooks:
- https://github.com/ModernRelay/omnigraph-cookbooks
-4. Ask me what I want to build (company brain, agent memory, dev graph,
- research / R&D layer, …). Then install the CLI, stand up a first graph for
- that use case, load a little data, and run a query so I can see it working.
-```
-
-Works with any agent that can browse a URL, install a package, and run a shell.
-
-## Agent skill & starter graphs
-
-This repo ships the [**`omnigraph` agent skill**](skills/omnigraph) — the
-operational playbook (cluster mode, the two config surfaces, schema evolution,
-query linting, data writes, branches, Cedar policy, and common gotchas) that
-teaches a coding agent to drive Omnigraph correctly. Install it with:
+**Teach your agent the playbook.** This repo ships the
+[**`omnigraph` agent skill**](skills/omnigraph): the operational playbook
+covering cluster mode, the two config surfaces, schema evolution, query linting,
+data writes, branches, Cedar policy, and the common gotchas.
```bash
npx skills add ModernRelay/omnigraph@omnigraph
```
+**Or have an agent set it up from scratch.** Paste this into Claude Code,
+Codex, or any agent that can read a URL and run a shell command:
+
+```text
+Help me set up Omnigraph
+
+1. Read the docs at https://github.com/ModernRelay/omnigraph, starting with
+ docs/user/clusters/index.md, then docs/user/deployment.md.
+2. Skim the starter graphs and seed data in the cookbooks:
+ https://github.com/ModernRelay/omnigraph-cookbooks
+3. Ask me what I want to build (company brain, agent memory, dev graph,
+ research / R&D layer, …). Then stand up a cluster for it, load a little
+ data, and run a query so I can see it working.
+```
+
For ready-to-run graphs with real seed data (company brain, VC operating system,
pharma & industry intel),
[`ModernRelay/omnigraph-cookbooks`](https://github.com/ModernRelay/omnigraph-cookbooks)
-is the fastest way to see Omnigraph shaped to a real domain. To rehearse the S3
-path locally, see [deployment.md → Testing against S3 locally](docs/user/deployment.md#testing-against-s3-locally).
+is the fastest way to see Omnigraph shaped to a real domain.
-## Common Commands
+## Deploy
-A deployment is a **cluster**. A `cluster.yaml` declares its graphs, schemas,
-stored queries, and policies; you converge it with `cluster apply` and serve it.
-The server is cluster-first — it boots only from a cluster and serves every graph
-under `/graphs/{id}/…`. Day-to-day work goes through that server: graphs are
-addressed with `--server ` (+ `--graph `), and `query`/`mutate`
-invoke a stored query from the catalog **by name**.
+A deployment is a **cluster**: a **multigraph** config directory that declares
+its graphs, schemas, stored queries, and policies as code. You manage it
+**Terraform-style**: `cluster plan` previews the diff, `cluster apply` converges
+it. `omnigraph-server` then boots from the cluster and brings every graph online
+at `/graphs/{id}/…`, each behind its own policy.
-```bash
-# 1. Converge the declared cluster, then serve it (--as attributes the apply)
-omnigraph cluster apply --config ./company-brain --as you
-omnigraph-server --cluster ./company-brain --bind 0.0.0.0:8080
-# or config-free from object storage — the bucket IS the deployment:
-# omnigraph-server --cluster s3://my-bucket/company-brain --bind 0.0.0.0:8080
+**1. Declare the cluster.**
-# 2. Work against the served graph — stored queries invoked by name
-omnigraph query find_people --server prod --graph knowledge --params '{"q":"AI safety"}'
-omnigraph mutate add_person --server prod --graph knowledge --params '{"name":"Mina"}'
-omnigraph load --data ./data.jsonl --mode merge --server prod --graph knowledge
-
-# 3. Branch and merge, Git-style across the whole graph
-omnigraph branch create --from main review/2026-06 --server prod --graph knowledge
-omnigraph branch merge review/2026-06 --into main --server prod --graph knowledge
+```
+company-brain/
+├── cluster.yaml
+├── people.pg # schema for the "knowledge" graph
+├── queries/ # stored queries: the .gq files ARE the declaration
+│ └── people.gq
+└── base.policy.yaml # a Cedar policy bundle
```
-Set a default scope (or a `--profile`) in `~/.omnigraph/config.yaml` — operator
-identity, named servers/clusters, credentials — and the `--server`/`--graph`
-flags drop away (`omnigraph query find_people --params …`).
-
-**Local / ad-hoc.** For quick iteration on a standalone graph (no cluster, no
-server), address storage directly with `--store` (or a positional `file://` /
-`s3://` URI) and run ad-hoc `.gq` with `--query` (the positional then selects
-which query in the file):
-
-```bash
-omnigraph init --schema ./schema.pg ./graph.omni
-omnigraph load --data ./data.jsonl --mode merge --store ./graph.omni
-omnigraph query --query ./queries.gq get_person --params '{"name":"Alice"}' --store ./graph.omni
+```yaml
+# cluster.yaml
+version: 1
+metadata:
+ name: company-brain
+storage: s3://company/clusters/company-brain # ledger, catalog, and graph data live here
+graphs:
+ knowledge:
+ schema: people.pg
+ queries: queries/ # every `query ` in queries/*.gq registers
+policies:
+ base:
+ file: base.policy.yaml
+ applies_to: [knowledge] # graph-bound; use [cluster] for server-level
```
-See [docs/user/cli/index.md](docs/user/cli/index.md), the
-[CLI reference](docs/user/cli/reference.md), the
-[cluster guide](docs/user/clusters/index.md), and the
-[deployment guide](docs/user/deployment.md) for schema apply, snapshots, commits,
-profiles, and policy/queries tooling.
+**2. Stand up your object store.** On-prem, run RustFS (or MinIO); Omnigraph
+writes [Lance](https://github.com/lance-format/lance) to it over the standard S3
+API. In the cloud, point the same `AWS_*` env at S3 / R2 / GCS instead.
-## Clients
+**3. Converge and run.** `apply` creates each graph, applies its schema, and
+publishes queries and policies into the content-addressed catalog. It is
+idempotent; re-running is always safe.
-For programmatic access to a running `omnigraph-server`:
+```bash
+omnigraph cluster validate # parse + typecheck everything
+omnigraph cluster plan # preview what apply would do
+omnigraph cluster apply # converge
-- **TypeScript SDK + MCP server** — [`@modernrelay/omnigraph`](https://www.npmjs.com/package/@modernrelay/omnigraph) and [`@modernrelay/omnigraph-mcp`](https://www.npmjs.com/package/@modernrelay/omnigraph-mcp), versioned in lockstep with `omnigraph-server`. Source, docs, and examples: [`ModernRelay/omnigraph-ts`](https://github.com/ModernRelay/omnigraph-ts).
-- **Python SDK** — coming soon.
+# Boot the server from the cluster dir; storage resolves through cluster.yaml
+omnigraph-server --cluster company-brain --bind 0.0.0.0:8080
+```
+
+See the [cluster guide](docs/user/clusters/index.md) for the day-2 loop
+(edit → plan → apply → restart), approval gates for destructive changes, drift
+inspection, and recovery; the [deployment guide](docs/user/deployment.md) for
+containers, AWS/Railway, auth, and the full `AWS_*` contract.
+
+## Query and mutate
+
+Set a default server and graph once in `~/.omnigraph/config.yaml`, and the
+everyday commands stay short. Stored queries and mutations run **by name**:
+
+```bash
+omnigraph query search_docs --params '{"q":"AI safety"}'
+omnigraph mutate add_person --params '{"name":"Mina"}'
+
+# Branch, review, merge across the whole graph; agents write in isolation
+omnigraph branch create --from main agent/ingest-42
+omnigraph branch merge agent/ingest-42 --into main
+```
+
+An **alias** is shorter still: bind a server, graph, and stored query to one
+name, then `omnigraph alias triage` runs it. For an ad-hoc target, any command
+still takes `--server --graph ` (or `--store ` for a local
+graph). See the [CLI reference](docs/user/cli/reference.md).
+
+## Security & governance
+
+- **Engine-wide enforcement:** every write path goes through the same Cedar gate, so the HTTP server, the CLI, and the embedded SDK obey identical rules.
+- **Declared in the cluster:** a policy bundle is bound to graphs (or the whole server) via `policies:` → `applies_to`.
+- **Scoped:** rules apply per graph, per branch, or server-wide.
+- **No plaintext tokens:** bearer tokens are hashed at startup and compared in constant time.
+- **Forge-proof identity:** the actor is resolved server-side from the token; clients can't set it.
+
+See the [policy guide](docs/user/operations/policy.md).
+
+## Clients & SDKs
+
+| Client | Use it for | Where |
+|---|---|---|
+| **TypeScript SDK** | typed access from Node / TS | [`@modernrelay/omnigraph`](https://www.npmjs.com/package/@modernrelay/omnigraph) · [source](https://github.com/ModernRelay/omnigraph-ts) |
+| **MCP server** | bridge Omnigraph to LLM hosts (Claude, Codex, …) | [`@modernrelay/omnigraph-mcp`](https://www.npmjs.com/package/@modernrelay/omnigraph-mcp) |
+| **HTTP / OpenAPI** | any language, the wire contract | the server's OpenAPI spec |
+| **Python SDK** | typed access from Python | *coming soon* |
+
+Both npm packages are versioned in lockstep with `omnigraph-server`.
+
+## Local quick test (no server)
+
+1-min setup to try it: an **embedded, local file-backed graph** (no server, no
+object store). For dev and experiments; production is the deployed cluster above.
+
+```bash
+cat > schema.pg <<'PG'
+node Signal { slug: String @key, title: String }
+node Pattern { slug: String @key, name: String }
+edge Indicates: Signal -> Pattern
+PG
+printf '%s\n' \
+ '{"type":"Signal","data":{"slug":"s1","title":"OSS model adoption surging"}}' \
+ '{"type":"Pattern","data":{"slug":"p1","name":"adoption"}}' \
+ '{"edge":"Indicates","from":"s1","to":"p1"}' > data.jsonl
+
+omnigraph init --schema schema.pg ./graph.omni
+omnigraph load --data data.jsonl --mode overwrite --store ./graph.omni
+
+# "What pattern does signal s1 indicate?"
+omnigraph query --store ./graph.omni \
+ -e 'query indicates() { match { $s: Signal { slug: "s1" } $s indicates $p } return { $p.name } }'
+# → adoption
+```
## Docs
-- [Install guide](docs/user/install.md)
-- [Deployment guide](docs/user/deployment.md)
+- [Cluster guide](docs/user/clusters/index.md) · [Deployment guide](docs/user/deployment.md) · [CLI reference](docs/user/cli/reference.md)
+- [Schema](docs/user/schema/index.md) · [Queries](docs/user/queries/index.md) · [Search](docs/user/search/index.md) · [Policy](docs/user/operations/policy.md)
## Build And Test
```bash
cargo build --workspace
-cargo check --workspace
-cargo test --workspace
+cargo test --workspace
```
Notes:
@@ -211,8 +244,8 @@ Notes:
- `crates/omnigraph-policy`: Cedar policy compilation and enforcement
- `crates/omnigraph-api-types`: shared HTTP wire DTOs used by both the server and the CLI
- `crates/omnigraph-cluster`: cluster config validation, planning, and apply (the control plane)
-- `crates/omnigraph-server`: Axum HTTP server — cluster-first, serving N graphs under `/graphs/{id}/…`
-- `crates/omnigraph-cli`: CLI for graph lifecycle (init/load), query/mutate, branch/commit/merge, schema/lint, snapshot/export, cluster control, policy/queries, profiles, and maintenance (optimize/repair/cleanup)
+- `crates/omnigraph-server`: Axum HTTP server, cluster-first, runs N graphs under `/graphs/{id}/…`
+- `crates/omnigraph-cli`: CLI for graph lifecycle, query/mutate, branch/commit/merge, schema/lint, snapshot/export, cluster control, policy/queries, profiles, and maintenance
## Contributing
diff --git a/assets/omnigraph-wordmark-dark.svg b/assets/omnigraph-wordmark-dark.svg
new file mode 100644
index 0000000..47b2033
--- /dev/null
+++ b/assets/omnigraph-wordmark-dark.svg
@@ -0,0 +1,152 @@
+
diff --git a/assets/omnigraph-wordmark.svg b/assets/omnigraph-wordmark.svg
new file mode 100644
index 0000000..45778dc
--- /dev/null
+++ b/assets/omnigraph-wordmark.svg
@@ -0,0 +1,152 @@
+
diff --git a/crates/omnigraph-cli/src/helpers.rs b/crates/omnigraph-cli/src/helpers.rs
index 0bd2cca..59da9c3 100644
--- a/crates/omnigraph-cli/src/helpers.rs
+++ b/crates/omnigraph-cli/src/helpers.rs
@@ -873,6 +873,25 @@ pub(crate) async fn execute_queries_validate(
Ok(())
}
+/// Print a stored-query annotation under its `queries list` entry. A
+/// `@description`/`@instruction` value may be multiline (GQ string literals
+/// admit newlines); continuation lines are indented to align under the first
+/// so the catalog stays readable instead of breaking the left margin.
+fn print_query_annotation(label: &str, value: &str) {
+ let prefix = format!(" {label}: ");
+ let continuation = " ".repeat(prefix.len());
+ let mut lines = value.split('\n');
+ match lines.next() {
+ Some(first) => {
+ println!("{prefix}{first}");
+ for line in lines {
+ println!("{continuation}{line}");
+ }
+ }
+ None => println!("{prefix}"),
+ }
+}
+
/// `queries list --cluster ` (RFC-011): list the catalog's stored queries.
/// With `--graph`, scope to one graph.
pub(crate) async fn execute_queries_list(
@@ -891,6 +910,8 @@ pub(crate) async fn execute_queries_list(
mcp_expose: q.is_exposed(),
tool_name: q.decl.mcp.tool_name.clone(),
mutation: q.is_mutation(),
+ description: q.decl.description.clone(),
+ instruction: q.decl.instruction.clone(),
params: q
.decl
.params
@@ -931,6 +952,12 @@ pub(crate) async fn execute_queries_list(
String::new()
};
println!("{kind} {}({params}){mcp}", q.name);
+ if let Some(description) = &q.description {
+ print_query_annotation("description", description);
+ }
+ if let Some(instruction) = &q.instruction {
+ print_query_annotation("instruction", instruction);
+ }
}
}
Ok(())
diff --git a/crates/omnigraph-cli/src/main.rs b/crates/omnigraph-cli/src/main.rs
index bb3b062..fa6f4db 100644
--- a/crates/omnigraph-cli/src/main.rs
+++ b/crates/omnigraph-cli/src/main.rs
@@ -1050,7 +1050,7 @@ async fn main() -> Result<()> {
// The actor attributes graph-moving operations (sidecars,
// audit entries, engine schema-apply commits). Cluster FACTS
// stay unlayered; the operator's identity resolves --as flag
- // first, then the per-operator omnigraph.yaml `cli.actor`.
+ // first, then per-operator config `operator.actor`.
let actor = resolve_cluster_actor(cli.as_actor.as_deref())?;
let output = apply_config_dir_with_options(config, ApplyOptions { actor }).await;
finish_cluster_apply(&output, json)?;
@@ -1062,7 +1062,7 @@ async fn main() -> Result<()> {
} => {
let Some(approver) = resolve_cluster_actor(cli.as_actor.as_deref())? else {
bail!(
- "`cluster approve` requires an approver: pass the global --as flag or set `cli.actor` in your omnigraph.yaml — an approval without an approver is meaningless"
+ "`cluster approve` requires an approver: pass the global --as flag or set `operator.actor` in ~/.omnigraph/config.yaml — an approval without an approver is meaningless"
);
};
let output = approve_config_dir(config, &resource, &approver).await;
diff --git a/crates/omnigraph-cli/src/output.rs b/crates/omnigraph-cli/src/output.rs
index d6903f4..80de625 100644
--- a/crates/omnigraph-cli/src/output.rs
+++ b/crates/omnigraph-cli/src/output.rs
@@ -849,6 +849,13 @@ pub(crate) struct QueriesListItem {
pub(crate) mcp_expose: bool,
pub(crate) tool_name: Option,
pub(crate) mutation: bool,
+ /// `@description` from the query declaration — what the query is for.
+ /// Carried so the CLI catalog matches the HTTP `GET /queries` surface.
+ #[serde(skip_serializing_if = "Option::is_none")]
+ pub(crate) description: Option,
+ /// `@instruction` from the query declaration — how/when to invoke it.
+ #[serde(skip_serializing_if = "Option::is_none")]
+ pub(crate) instruction: Option,
pub(crate) params: Vec,
}
diff --git a/crates/omnigraph-cli/tests/cli_cluster.rs b/crates/omnigraph-cli/tests/cli_cluster.rs
index e35a54d..d2b6d13 100644
--- a/crates/omnigraph-cli/tests/cli_cluster.rs
+++ b/crates/omnigraph-cli/tests/cli_cluster.rs
@@ -796,6 +796,10 @@ fn cluster_approve_uses_operator_actor_fallback() {
);
let stderr = String::from_utf8_lossy(&output.stderr);
assert!(stderr.contains("--as"), "{stderr}");
+ assert!(stderr.contains("operator.actor"), "{stderr}");
+ assert!(stderr.contains("config.yaml"), "{stderr}");
+ assert!(!stderr.contains("cli.actor"), "{stderr}");
+ assert!(!stderr.contains("omnigraph.yaml"), "{stderr}");
}
#[test]
diff --git a/crates/omnigraph-cli/tests/cli_queries.rs b/crates/omnigraph-cli/tests/cli_queries.rs
index 92f7879..b51018e 100644
--- a/crates/omnigraph-cli/tests/cli_queries.rs
+++ b/crates/omnigraph-cli/tests/cli_queries.rs
@@ -231,6 +231,125 @@ fn queries_list_prints_registered_query() {
);
}
+#[test]
+fn queries_list_surfaces_description_and_instruction() {
+ // `@description`/`@instruction` are the whole point of a stored query in a
+ // catalog — they tell an agent/operator what it does and how to invoke it.
+ // The CLI catalog must surface them in both human and --json output, to
+ // match the HTTP `GET /queries` surface.
+ let cluster = converged_cluster_with_query(
+ "described.gq",
+ "query described($name: String) \
+ @description(\"Find a person by exact name.\") \
+ @instruction(\"Use for exact lookups; prefer search for fuzzy matches.\") \
+ { match { $p: Person { name: $name } } return { $p.age } }",
+ " described:\n file: ./described.gq\n",
+ );
+
+ // Human output.
+ let output = output_success(
+ cli().arg("queries").arg("list").arg("--cluster").arg(cluster.path()),
+ );
+ let stdout = stdout_string(&output);
+ assert!(
+ stdout.contains("description: Find a person by exact name."),
+ "human list must show @description; stdout:\n{stdout}"
+ );
+ assert!(
+ stdout.contains("instruction: Use for exact lookups; prefer search for fuzzy matches."),
+ "human list must show @instruction; stdout:\n{stdout}"
+ );
+
+ // --json output.
+ let output = output_success(
+ cli()
+ .arg("queries")
+ .arg("list")
+ .arg("--cluster")
+ .arg(cluster.path())
+ .arg("--json"),
+ );
+ let body: serde_json::Value = serde_json::from_slice(&output.stdout).unwrap();
+ let entry = body["queries"]
+ .as_array()
+ .unwrap()
+ .iter()
+ .find(|q| q["name"] == "described")
+ .unwrap();
+ assert_eq!(entry["description"], "Find a person by exact name.");
+ assert_eq!(
+ entry["instruction"],
+ "Use for exact lookups; prefer search for fuzzy matches."
+ );
+}
+
+#[test]
+fn queries_list_indents_multiline_annotation_continuation() {
+ // GQ string literals admit newlines, so a `@description`/`@instruction`
+ // can be multiline. Human output must indent continuation lines to align
+ // under the first rather than breaking back to the left margin.
+ let cluster = converged_cluster_with_query(
+ "multi.gq",
+ "query multi($name: String) \
+ @description(\"line one\\nline two\") \
+ { match { $p: Person { name: $name } } return { $p.age } }",
+ " multi:\n file: ./multi.gq\n",
+ );
+ let output = output_success(
+ cli().arg("queries").arg("list").arg("--cluster").arg(cluster.path()),
+ );
+ let stdout = stdout_string(&output);
+ // " description: " is 17 chars wide; the continuation aligns under it.
+ assert!(
+ stdout.contains(" description: line one\n line two"),
+ "multiline annotation must indent the continuation; stdout:\n{stdout}"
+ );
+}
+
+#[test]
+fn queries_list_omits_annotations_when_absent() {
+ // The other half of the contract: a query that declares neither annotation
+ // prints no extra lines and omits both JSON fields entirely. This keeps the
+ // catalog clean rather than echoing empty `description:`/`instruction:`.
+ let cluster = converged_cluster_with_query(
+ "bare.gq",
+ "query bare() { match { $p: Person } return { $p.name } }",
+ " bare:\n file: ./bare.gq\n",
+ );
+
+ // Human output: the query is listed, but no annotation lines.
+ let output = output_success(
+ cli().arg("queries").arg("list").arg("--cluster").arg(cluster.path()),
+ );
+ let stdout = stdout_string(&output);
+ assert!(stdout.contains("bare()"), "stdout:\n{stdout}");
+ assert!(
+ !stdout.contains("description:") && !stdout.contains("instruction:"),
+ "a query without annotations prints no annotation lines; stdout:\n{stdout}"
+ );
+
+ // --json output: both fields omitted (not present as null).
+ let output = output_success(
+ cli()
+ .arg("queries")
+ .arg("list")
+ .arg("--cluster")
+ .arg(cluster.path())
+ .arg("--json"),
+ );
+ let body: serde_json::Value = serde_json::from_slice(&output.stdout).unwrap();
+ let entry = body["queries"]
+ .as_array()
+ .unwrap()
+ .iter()
+ .find(|q| q["name"] == "bare")
+ .unwrap();
+ assert!(
+ entry.get("description").is_none() && entry.get("instruction").is_none(),
+ "a query without annotations omits both JSON fields: {entry}"
+ );
+}
+
#[test]
fn queries_validate_requires_a_cluster() {
// RFC-011: with no --cluster (and no cluster profile), the command errors
diff --git a/crates/omnigraph-cluster/Cargo.toml b/crates/omnigraph-cluster/Cargo.toml
index 05a9308..f0a3a22 100644
--- a/crates/omnigraph-cluster/Cargo.toml
+++ b/crates/omnigraph-cluster/Cargo.toml
@@ -10,8 +10,8 @@ documentation = "https://docs.rs/omnigraph-cluster"
[features]
# Fault-injection hooks for the apply protocol (crash-mid-apply, CAS-race
-# tests). Deliberately does NOT enable omnigraph/failpoints.
-failpoints = ["dep:fail", "fail/failpoints"]
+# tests), including cluster/engine boundary failures.
+failpoints = ["dep:fail", "fail/failpoints", "omnigraph/failpoints"]
[dependencies]
omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.7.0" }
diff --git a/crates/omnigraph-cluster/src/diff.rs b/crates/omnigraph-cluster/src/diff.rs
index 516a86e..ce29a45 100644
--- a/crates/omnigraph-cluster/src/diff.rs
+++ b/crates/omnigraph-cluster/src/diff.rs
@@ -18,6 +18,7 @@ pub(crate) fn diff_resources(
disposition: None,
reason: None,
binding_change: false,
+ metadata_change: None,
migration: None,
}),
Some(before) if before != after => changes.push(PlanChange {
@@ -28,6 +29,7 @@ pub(crate) fn diff_resources(
disposition: None,
reason: None,
binding_change: false,
+ metadata_change: None,
migration: None,
}),
Some(_) => {}
@@ -43,6 +45,7 @@ pub(crate) fn diff_resources(
disposition: None,
reason: None,
binding_change: false,
+ metadata_change: None,
migration: None,
});
}
@@ -82,6 +85,47 @@ pub(crate) fn append_policy_binding_changes(
disposition: None,
reason: None,
binding_change: true,
+ metadata_change: Some(PlanMetadataChange::PolicyBindings),
+ migration: None,
+ });
+ }
+ changes.sort_by(|a, b| a.resource.cmp(&b.resource));
+}
+
+/// Metadata-only embedding provider changes: the provider digest is unchanged
+/// but the applied state predates storing the profile body needed by
+/// config-free serving. This mirrors policy binding backfill instead of
+/// hiding a serving-time failure behind a no-op plan.
+pub(crate) fn append_embedding_profile_changes(
+ changes: &mut Vec,
+ prior_state: Option<&ClusterState>,
+ desired: &DesiredCluster,
+) {
+ let Some(state) = prior_state else {
+ return; // no state: provider Creates carry profiles already
+ };
+ for (address, desired_profile) in &desired.embedding_providers {
+ if changes
+ .iter()
+ .any(|change| change.resource.as_str() == address.as_str())
+ {
+ continue; // content change already covers it
+ }
+ let Some(entry) = state.applied_revision.resources.get(address) else {
+ continue; // not applied yet: the Create covers it
+ };
+ if entry.embedding_profile.as_ref() == Some(desired_profile) {
+ continue;
+ }
+ changes.push(PlanChange {
+ resource: address.clone(),
+ operation: PlanOperation::Update,
+ before_digest: Some(entry.digest.clone()),
+ after_digest: Some(entry.digest.clone()),
+ disposition: None,
+ reason: None,
+ binding_change: false,
+ metadata_change: Some(PlanMetadataChange::EmbeddingProfile),
migration: None,
});
}
diff --git a/crates/omnigraph-cluster/src/lib.rs b/crates/omnigraph-cluster/src/lib.rs
index 1c4e4fc..bed27c8 100644
--- a/crates/omnigraph-cluster/src/lib.rs
+++ b/crates/omnigraph-cluster/src/lib.rs
@@ -33,9 +33,9 @@ use config::{
validate_id, validate_query_source,
};
use diff::{
- FailedGraphOrigin, ResourceKind, append_policy_binding_changes, approved_resources,
- classify_changes, compute_approvals, compute_blast_radius, demote_dependents_of_failed_graphs,
- diff_resources, resource_kind,
+ FailedGraphOrigin, ResourceKind, append_embedding_profile_changes,
+ append_policy_binding_changes, approved_resources, classify_changes, compute_approvals,
+ compute_blast_radius, demote_dependents_of_failed_graphs, diff_resources, resource_kind,
};
pub use serve::{
ServingGraph, ServingPolicy, ServingQuery, ServingSnapshot, cluster_graph_ids,
@@ -160,7 +160,7 @@ pub async fn plan_config_dir(config_dir: impl AsRef) -> PlanOutput {
// Plan is read-only: pending sidecars are reported, never acted on
// (RFC-004 open question 3 keeps read-only commands warn-only).
- warn_pending_recovery_sidecars(&desired.config_dir, &mut diagnostics);
+ warn_pending_recovery_sidecars(&backend, &mut diagnostics).await;
let mut prior_resources = BTreeMap::new();
let mut prior_state: Option = None;
@@ -183,6 +183,7 @@ pub async fn plan_config_dir(config_dir: impl AsRef) -> PlanOutput {
};
if !has_errors(&diagnostics) {
append_policy_binding_changes(&mut changes, prior_state.as_ref(), &desired);
+ append_embedding_profile_changes(&mut changes, prior_state.as_ref(), &desired);
}
// Plan previews dispositions without sweeping; a pending recovery is
// surfaced as the cluster_recovery_pending warning above instead.
@@ -404,6 +405,7 @@ pub async fn apply_config_dir_with_options(
let prior_resources = state_resource_digests(&state);
let mut changes = diff_resources(&prior_resources, &desired.resource_digests);
append_policy_binding_changes(&mut changes, Some(&state), &desired);
+ append_embedding_profile_changes(&mut changes, Some(&state), &desired);
let approval_artifacts = backend.list_approval_artifacts(&mut diagnostics).await;
let approved = approved_resources(
&approval_artifacts,
@@ -639,42 +641,9 @@ pub async fn apply_config_dir_with_options(
continue;
}
};
- let observed_manifest_version = match db.snapshot_of(ReadTarget::branch("main")).await {
- Ok(snapshot) => Some(snapshot.version()),
- Err(_) => None,
- };
- let mut sidecar = RecoverySidecar {
- schema_version: 1,
- operation_id: Ulid::new().to_string(),
- started_at: now_rfc3339(),
- actor: options.actor.clone(),
- kind: RecoverySidecarKind::SchemaApply,
- graph_id: graph_id.clone(),
- graph_uri: graph_uri.clone(),
- observed_manifest_version,
- expected_manifest_version: None,
- desired_schema_digest: desired_graph.schema_digest.clone(),
- state_cas_base: expected_cas.clone(),
- approval_id: None,
- };
- let sidecar_path = match backend.write_recovery_sidecar(&sidecar).await {
- Ok(path) => path,
- Err(diagnostic) => {
- diagnostics.push(diagnostic);
- failed_graphs.insert(graph_id.clone(), FailedGraphOrigin::SchemaApply);
- graph_moving_aborted = true;
- continue;
- }
- };
- if let Err(diagnostic) = failpoints::maybe_fail("cluster_apply.before_schema_apply") {
- // Simulated crash before the engine call: the sidecar stays; the
- // sweep retires it next run (ledger still consistent with live).
- diagnostics.push(diagnostic);
- failed_graphs.insert(graph_id.clone(), FailedGraphOrigin::SchemaApply);
- graph_moving_aborted = true;
- continue;
- }
- // Re-read + digest-verify the desired schema source under the lock.
+ // Re-read + digest-verify the desired schema source before the
+ // cluster sidecar exists. Parser/planner rejections cannot have
+ // moved graph state, so they must not leave recovery work behind.
let schema_source = source_paths
.get(schema_address(graph_id).as_str())
.ok_or_else(|| {
@@ -708,12 +677,64 @@ pub async fn apply_config_dir_with_options(
Ok(source) => source,
Err(diagnostic) => {
diagnostics.push(diagnostic);
- backend.delete_object(&sidecar_path).await; // nothing moved
failed_graphs.insert(graph_id.clone(), FailedGraphOrigin::SchemaApply);
graph_moving_aborted = true;
continue;
}
};
+ if let Err(err) = db
+ .preview_schema_apply_with_options(&schema_source, SchemaApplyOptions::default())
+ .await
+ {
+ diagnostics.push(Diagnostic::error(
+ "schema_apply_failed",
+ schema_address(graph_id),
+ format!("schema apply is not supported on '{graph_uri}': {err}"),
+ ));
+ failed_graphs.insert(graph_id.clone(), FailedGraphOrigin::SchemaApply);
+ graph_moving_aborted = true;
+ continue;
+ }
+ let observed_manifest_version = match db.snapshot_of(ReadTarget::branch("main")).await {
+ Ok(snapshot) => Some(snapshot.version()),
+ Err(_) => None,
+ };
+ let recorded_schema_digest = state
+ .applied_revision
+ .resources
+ .get(&schema_address(graph_id))
+ .map(|entry| entry.digest.clone());
+ let mut sidecar = RecoverySidecar {
+ schema_version: 1,
+ operation_id: Ulid::new().to_string(),
+ started_at: now_rfc3339(),
+ actor: options.actor.clone(),
+ kind: RecoverySidecarKind::SchemaApply,
+ graph_id: graph_id.clone(),
+ graph_uri: graph_uri.clone(),
+ observed_manifest_version,
+ expected_manifest_version: None,
+ desired_schema_digest: desired_graph.schema_digest.clone(),
+ state_cas_base: expected_cas.clone(),
+ approval_id: None,
+ };
+ let sidecar_path = match backend.write_recovery_sidecar(&sidecar).await {
+ Ok(path) => path,
+ Err(diagnostic) => {
+ diagnostics.push(diagnostic);
+ failed_graphs.insert(graph_id.clone(), FailedGraphOrigin::SchemaApply);
+ graph_moving_aborted = true;
+ continue;
+ }
+ };
+ if let Err(diagnostic) = failpoints::maybe_fail("cluster_apply.before_schema_apply") {
+ // Simulated crash before the engine call: the sidecar stays; the
+ // sweep retires it next run (ledger still consistent with live).
+ diagnostics.push(diagnostic);
+ failed_graphs.insert(graph_id.clone(), FailedGraphOrigin::SchemaApply);
+ graph_moving_aborted = true;
+ continue;
+ }
// Soft drops only: allow_data_loss stays false until the approval
// artifacts of stage 4C exist (RFC-004 §D4).
match db
@@ -736,8 +757,29 @@ pub async fn apply_config_dir_with_options(
schema_address(graph_id),
format!("schema apply failed on '{graph_uri}': {err}"),
));
- // Sidecar stays; the sweep retires it (live digest unchanged
- // == ledger consistent) or flags real movement.
+ if live_schema_matches_recorded_digest(
+ &graph_uri,
+ recorded_schema_digest.as_deref(),
+ observed_manifest_version,
+ )
+ .await
+ {
+ // Pre-movement rejection: nothing moved, so retire the
+ // sidecar eagerly. A delete failure leaves it safe (the
+ // graph is quarantined until the next sweep), but surface
+ // it so an operator isn't left debugging a silent stick.
+ if let Err(err) = backend.try_delete_object(&sidecar_path).await {
+ diagnostics.push(Diagnostic::warning(
+ "recovery_sidecar_cleanup_failed",
+ sidecar_path.clone(),
+ format!(
+ "could not delete the stale recovery sidecar after a pre-movement \
+ schema-apply rejection; graph `{graph_id}` stays quarantined until \
+ a state-mutating cluster command sweeps it: {err}"
+ ),
+ ));
+ }
+ }
failed_graphs.insert(graph_id.clone(), FailedGraphOrigin::SchemaApply);
graph_moving_aborted = true;
continue;
@@ -1022,6 +1064,7 @@ pub async fn apply_config_dir_with_options(
&desired.resource_digests,
);
append_policy_binding_changes(&mut residual, Some(&new_state), &desired);
+ append_embedding_profile_changes(&mut residual, Some(&new_state), &desired);
let converged = residual.is_empty();
if converged {
new_state.applied_revision.config_digest = Some(desired.config_digest.clone());
@@ -1260,7 +1303,7 @@ pub async fn status_config_dir(config_dir: impl AsRef) -> StatusOutput {
backend
.observe_lock(&mut observations, &mut diagnostics)
.await;
- warn_pending_recovery_sidecars(&parsed.config_dir, &mut diagnostics);
+ warn_pending_recovery_sidecars(&backend, &mut diagnostics).await;
let mut resource_digests = BTreeMap::new();
let mut resource_statuses = BTreeMap::new();
@@ -1939,6 +1982,29 @@ fn embedding_provider_digest(profile: &EmbeddingProviderConfig) -> String {
sha256_hex(input.as_bytes())
}
+async fn live_schema_matches_recorded_digest(
+ graph_uri: &str,
+ recorded_schema_digest: Option<&str>,
+ observed_manifest_version: Option,
+) -> bool {
+ let Some(recorded_schema_digest) = recorded_schema_digest else {
+ return false;
+ };
+ let Some(observed_manifest_version) = observed_manifest_version else {
+ return false;
+ };
+ let Ok(db) = Omnigraph::open_read_only(graph_uri).await else {
+ return false;
+ };
+ let Ok(snapshot) = db.snapshot_of(ReadTarget::branch("main")).await else {
+ return false;
+ };
+ if snapshot.version() != observed_manifest_version {
+ return false;
+ }
+ sha256_hex(db.schema_source().as_bytes()) == recorded_schema_digest
+}
+
fn desired_config_digest(
raw: &RawClusterConfig,
resource_digests: &BTreeMap,
diff --git a/crates/omnigraph-cluster/src/serve.rs b/crates/omnigraph-cluster/src/serve.rs
index 6f89e2d..54d3017 100644
--- a/crates/omnigraph-cluster/src/serve.rs
+++ b/crates/omnigraph-cluster/src/serve.rs
@@ -37,11 +37,14 @@ pub struct ServingSnapshot {
pub graphs: Vec,
pub queries: Vec,
pub policies: Vec,
+ pub diagnostics: Vec,
}
/// Read the applied revision as a serving snapshot — the read-only loader for
-/// the Phase-5 server boot. All-or-nothing per RFC-005 §D4: every readiness
-/// failure is collected and the whole snapshot refused; no partial serving.
+/// the Phase-5 server boot. Cluster-global readiness failures are still
+/// all-or-nothing, but graph-attributed pending recovery sidecars quarantine
+/// only that graph so healthy graphs can continue serving. This loader never
+/// runs a recovery sweep.
/// Takes no lock: the state file is replaced atomically, so this reads a
/// consistent point-in-time ledger.
pub async fn read_serving_snapshot(
@@ -190,19 +193,44 @@ async fn read_snapshot_with_store(
backend: ClusterStore,
) -> Result> {
let mut diagnostics: Vec = Vec::new();
+ let mut startup_diagnostics: Vec = Vec::new();
+ let mut quarantined_graphs: BTreeSet = BTreeSet::new();
- // A ledger a sweep is about to rewrite must not start serving.
+ // Do not sweep at serve time. Valid graph-attributed sidecars quarantine
+ // that graph; malformed/unattributable sidecars remain cluster-fatal
+ // because serving cannot prove their blast radius.
+ let sidecar_diag_start = diagnostics.len();
let sidecars = backend.list_recovery_sidecars(&mut diagnostics).await;
- if !sidecars.is_empty() {
- diagnostics.push(Diagnostic::error(
+ // Every diagnostic `list_recovery_sidecars` appends is a genuine
+ // read/parse/version failure (emitted as a warning by `store::list_json_dir`)
+ // whose blast radius serving cannot prove — promote each to a cluster-fatal
+ // error. This depends on that listing only ever emitting failure diagnostics;
+ // if it grows a benign/informational one, promote by code instead.
+ for diagnostic in diagnostics.iter_mut().skip(sidecar_diag_start) {
+ diagnostic.severity = DiagnosticSeverity::Error;
+ }
+ for (path, sidecar) in sidecars {
+ if sidecar.graph_id.trim().is_empty() {
+ diagnostics.push(Diagnostic::error(
+ "cluster_recovery_unattributed",
+ path,
+ "recovery sidecar has no graph id; run a state-mutating cluster command to sweep it before serving",
+ ));
+ continue;
+ }
+ quarantined_graphs.insert(sidecar.graph_id.clone());
+ startup_diagnostics.push(Diagnostic::warning(
"cluster_recovery_pending",
- CLUSTER_RECOVERIES_DIR,
+ graph_address(&sidecar.graph_id),
format!(
- "{} interrupted operation(s) await recovery; run any state-mutating cluster command (e.g. `cluster apply`) to sweep, then retry",
- sidecars.len()
+ "graph `{}` is quarantined because interrupted operation `{}` awaits recovery; run any state-mutating cluster command (e.g. `cluster apply`) to sweep",
+ sidecar.graph_id, sidecar.operation_id
),
));
}
+ if has_errors(&diagnostics) {
+ return Err(diagnostics);
+ }
let mut observations = backend.observations();
let state = match backend.read_state(&mut observations).await {
@@ -223,14 +251,29 @@ async fn read_snapshot_with_store(
}
};
let Some(state) = state else {
+ diagnostics.extend(startup_diagnostics);
return Err(diagnostics);
};
+ let required_embedding_providers: BTreeSet = state
+ .applied_revision
+ .resources
+ .iter()
+ .filter_map(|(address, entry)| match resource_kind(address) {
+ ResourceKind::Graph(graph_id) if !quarantined_graphs.contains(&graph_id) => {
+ entry.embedding_provider.clone()
+ }
+ _ => None,
+ })
+ .collect();
let mut embedding_profiles: BTreeMap = BTreeMap::new();
for (address, entry) in &state.applied_revision.resources {
if !matches!(resource_kind(address), ResourceKind::EmbeddingProvider(_)) {
continue;
}
+ if !required_embedding_providers.contains(address) {
+ continue;
+ }
let Some(profile) = entry.embedding_profile.clone() else {
diagnostics.push(Diagnostic::error(
"embedding_provider_profile_missing",
@@ -256,9 +299,14 @@ async fn read_snapshot_with_store(
let mut graphs = Vec::new();
let mut queries = Vec::new();
let mut policies = Vec::new();
+ let mut saw_applied_graph = false;
for (address, entry) in &state.applied_revision.resources {
match resource_kind(address) {
ResourceKind::Graph(graph_id) => {
+ saw_applied_graph = true;
+ if quarantined_graphs.contains(&graph_id) {
+ continue;
+ }
let embedding = match entry.embedding_provider.as_deref() {
Some(provider_address) => match resource_kind(provider_address) {
ResourceKind::EmbeddingProvider(_) => {
@@ -300,6 +348,9 @@ async fn read_snapshot_with_store(
let ResourceKind::Query { graph, name } = &kind else {
unreachable!()
};
+ if quarantined_graphs.contains(graph) {
+ continue;
+ }
match backend
.read_verified_payload(&kind, &entry.digest, address)
.await
@@ -324,6 +375,17 @@ async fn read_snapshot_with_store(
));
continue;
};
+ let applies_to: Vec = applies_to
+ .into_iter()
+ .filter(|binding| {
+ binding
+ .strip_prefix("graph.")
+ .is_none_or(|graph| !quarantined_graphs.contains(graph))
+ })
+ .collect();
+ if applies_to.is_empty() {
+ continue;
+ }
match backend
.read_verified_payload(&kind, &entry.digest, address)
.await
@@ -342,19 +404,29 @@ async fn read_snapshot_with_store(
}
if graphs.is_empty() {
- diagnostics.push(Diagnostic::error(
- "cluster_empty",
- CLUSTER_STATE_FILE,
- "the applied revision records no graphs; apply a cluster with at least one graph before serving from it",
- ));
+ if saw_applied_graph && !quarantined_graphs.is_empty() {
+ diagnostics.push(Diagnostic::error(
+ "cluster_no_healthy_graphs",
+ CLUSTER_RECOVERIES_DIR,
+ "all applied graphs are quarantined by pending recovery sidecars; run any state-mutating cluster command (e.g. `cluster apply`) to sweep, then retry",
+ ));
+ } else {
+ diagnostics.push(Diagnostic::error(
+ "cluster_empty",
+ CLUSTER_STATE_FILE,
+ "the applied revision records no graphs; apply a cluster with at least one graph before serving from it",
+ ));
+ }
}
if has_errors(&diagnostics) {
+ diagnostics.extend(startup_diagnostics);
return Err(diagnostics);
}
Ok(ServingSnapshot {
graphs,
queries,
policies,
+ diagnostics: startup_diagnostics,
})
}
diff --git a/crates/omnigraph-cluster/src/store.rs b/crates/omnigraph-cluster/src/store.rs
index 5129397..a156d78 100644
--- a/crates/omnigraph-cluster/src/store.rs
+++ b/crates/omnigraph-cluster/src/store.rs
@@ -250,7 +250,14 @@ impl ClusterStore {
/// Best-effort object removal (sidecar retirement after a CAS lands,
/// lock cleanup) — failures are recoverable by the next sweep.
pub(crate) async fn delete_object(&self, uri: &str) {
- let _ = self.adapter.delete(uri).await;
+ let _ = self.try_delete_object(uri).await;
+ }
+
+ /// Like `delete_object` but surfaces the failure, so a caller that depends
+ /// on the deletion (e.g. the pre-movement sidecar cleanup fast-path) can
+ /// report it as a diagnostic instead of silently leaving stale state.
+ pub(crate) async fn try_delete_object(&self, uri: &str) -> Result<(), String> {
+ self.adapter.delete(uri).await.map_err(|err| err.to_string())
}
/// Recursive prefix delete for graph roots (approved deletes). Idempotent;
@@ -321,6 +328,32 @@ impl ClusterStore {
// ---- recovery sidecars ----
+ pub(crate) async fn list_recovery_sidecar_locations(
+ &self,
+ diagnostics: &mut Vec,
+ ) -> Vec {
+ let dir_uri = self.uri(CLUSTER_RECOVERIES_DIR);
+ let mut uris = match self.adapter.list_dir(&dir_uri).await {
+ Ok(uris) => uris,
+ Err(err) => {
+ diagnostics.push(Diagnostic::warning(
+ "recovery_sidecar_read_error",
+ CLUSTER_RECOVERIES_DIR,
+ format!("could not list '{CLUSTER_RECOVERIES_DIR}': {err}"),
+ ));
+ return Vec::new();
+ }
+ };
+ uris.retain(|uri| uri.ends_with(".json"));
+ uris.sort();
+ uris.into_iter()
+ .map(|uri| {
+ let name = uri.rsplit_once('/').map_or(uri.as_str(), |(_, name)| name);
+ format!("{}/{name}", self.display(CLUSTER_RECOVERIES_DIR))
+ })
+ .collect()
+ }
+
pub(crate) async fn list_recovery_sidecars(
&self,
diagnostics: &mut Vec,
diff --git a/crates/omnigraph-cluster/src/sweep.rs b/crates/omnigraph-cluster/src/sweep.rs
index 6539cae..27e6e9c 100644
--- a/crates/omnigraph-cluster/src/sweep.rs
+++ b/crates/omnigraph-cluster/src/sweep.rs
@@ -427,21 +427,14 @@ pub(crate) async fn mark_approvals_consumed(backend: &ClusterStore, approval_ids
}
/// Read-only commands report pending sidecars without acting on them.
-pub(crate) fn warn_pending_recovery_sidecars(config_dir: &Path, diagnostics: &mut Vec) {
- let recoveries_dir = config_dir.join(CLUSTER_RECOVERIES_DIR);
- let Ok(entries) = fs::read_dir(&recoveries_dir) else {
- return;
- };
- let mut names: Vec = entries
- .flatten()
- .filter(|entry| entry.path().extension().is_some_and(|ext| ext == "json"))
- .map(|entry| entry.file_name().to_string_lossy().into_owned())
- .collect();
- names.sort();
- for name in names {
+pub(crate) async fn warn_pending_recovery_sidecars(
+ backend: &ClusterStore,
+ diagnostics: &mut Vec,
+) {
+ for location in backend.list_recovery_sidecar_locations(diagnostics).await {
diagnostics.push(Diagnostic::warning(
"cluster_recovery_pending",
- format!("{CLUSTER_RECOVERIES_DIR}/{name}"),
+ location,
"a recovery sidecar from an interrupted apply is pending; the next state-mutating command will classify it",
));
}
diff --git a/crates/omnigraph-cluster/src/tests.rs b/crates/omnigraph-cluster/src/tests.rs
index ac448cf..7eae69f 100644
--- a/crates/omnigraph-cluster/src/tests.rs
+++ b/crates/omnigraph-cluster/src/tests.rs
@@ -1174,6 +1174,19 @@ graphs:
.unwrap()
}
+ fn recovery_sidecars(config_dir: &Path) -> Vec {
+ let dir = config_dir.join(CLUSTER_RECOVERIES_DIR);
+ if !dir.exists() {
+ return Vec::new();
+ }
+ let mut sidecars: Vec<_> = fs::read_dir(dir)
+ .unwrap()
+ .map(|entry| entry.unwrap().path())
+ .collect();
+ sidecars.sort();
+ sidecars
+ }
+
fn query_payload_path(config_dir: &Path, digest: &str) -> std::path::PathBuf {
config_dir
.join(CLUSTER_RESOURCES_DIR)
@@ -1586,8 +1599,17 @@ graphs:
state["applied_revision"]["resources"]["schema.knowledge"]["digest"],
desired.resource_digests["schema.knowledge"]
);
- // Second run: the sweep retires the stale sidecar (ledger consistent)
- // and the run fails just as loudly — idempotent loudness.
+ let db = Omnigraph::open_read_only(&derived_graph_uri(dir.path(), "knowledge"))
+ .await
+ .unwrap();
+ assert_eq!(db.schema_source().as_str(), SCHEMA);
+ assert!(
+ recovery_sidecars(dir.path()).is_empty(),
+ "{:?}",
+ recovery_sidecars(dir.path())
+ );
+ // Second run fails just as loudly and still leaves no sidecar because
+ // the engine preview rejects before graph state can move.
let second = apply_config_dir(dir.path()).await;
assert!(!second.ok);
assert!(
@@ -1596,6 +1618,45 @@ graphs:
.iter()
.any(|diagnostic| diagnostic.code == "schema_apply_failed")
);
+ assert!(
+ recovery_sidecars(dir.path()).is_empty(),
+ "{:?}",
+ recovery_sidecars(dir.path())
+ );
+ }
+
+ #[tokio::test]
+ async fn apply_schema_update_blocked_by_non_main_branch_leaves_no_sidecar() {
+ let dir = fixture();
+ init_derived_graph(dir.path()).await;
+ write_applyable_state(dir.path());
+ let graph_uri = derived_graph_uri(dir.path(), "knowledge");
+ let db = Omnigraph::open(&graph_uri).await.unwrap();
+ db.branch_create("feature").await.unwrap();
+ drop(db);
+ let before_state = read_state_json(dir.path());
+ fs::write(dir.path().join("people.pg"), SCHEMA_V2).unwrap();
+
+ let out = apply_config_dir(dir.path()).await;
+ assert!(!out.ok);
+ assert!(out.diagnostics.iter().any(|diagnostic| {
+ diagnostic.code == "schema_apply_failed"
+ && diagnostic
+ .message
+ .contains("schema apply requires a graph with only main")
+ }));
+ assert!(
+ recovery_sidecars(dir.path()).is_empty(),
+ "{:?}",
+ recovery_sidecars(dir.path())
+ );
+ let after_state = read_state_json(dir.path());
+ assert_eq!(
+ after_state["applied_revision"]["resources"],
+ before_state["applied_revision"]["resources"]
+ );
+ let reopened = Omnigraph::open_read_only(&graph_uri).await.unwrap();
+ assert_eq!(reopened.schema_source().as_str(), SCHEMA);
}
#[tokio::test]
@@ -2964,6 +3025,10 @@ policies:
.find(|change| change.resource == "policy.base")
.expect("binding change must be visible in plan");
assert!(change.binding_change);
+ assert_eq!(
+ change.metadata_change,
+ Some(PlanMetadataChange::PolicyBindings)
+ );
assert_eq!(change.operation, PlanOperation::Update);
assert_eq!(change.before_digest, change.after_digest);
@@ -3002,9 +3067,9 @@ policies:
let plan = plan_config_dir(dir.path()).await;
assert!(
- plan.changes
- .iter()
- .any(|change| change.resource == "policy.base" && change.binding_change),
+ plan.changes.iter().any(|change| change.resource == "policy.base"
+ && change.binding_change
+ && change.metadata_change == Some(PlanMetadataChange::PolicyBindings)),
"{plan:?}"
);
let out = apply_config_dir(dir.path()).await;
@@ -3016,6 +3081,52 @@ policies:
);
}
+ #[tokio::test]
+ async fn pre_5a_state_backfills_embedding_profile() {
+ let dir = fixture();
+ init_derived_graph(dir.path()).await;
+ write_mock_embedding_cluster(dir.path(), "recorded-x");
+ write_applyable_state(dir.path());
+ let converge = apply_config_dir(dir.path()).await;
+ assert!(converge.converged, "{converge:?}");
+
+ let mut state = read_state_json(dir.path());
+ state["applied_revision"]["resources"]["provider.embedding.default"]
+ .as_object_mut()
+ .unwrap()
+ .remove("embedding_profile");
+ fs::write(
+ dir.path().join(CLUSTER_STATE_FILE),
+ serde_json::to_string_pretty(&state).unwrap(),
+ )
+ .unwrap();
+
+ let plan = plan_config_dir(dir.path()).await;
+ let change = plan
+ .changes
+ .iter()
+ .find(|change| change.resource == "provider.embedding.default")
+ .expect("embedding profile backfill must be visible in plan");
+ assert_eq!(change.operation, PlanOperation::Update);
+ assert_eq!(change.before_digest, change.after_digest);
+ assert_eq!(
+ change.metadata_change,
+ Some(PlanMetadataChange::EmbeddingProfile)
+ );
+
+ let out = apply_config_dir(dir.path()).await;
+ assert!(out.ok && out.converged, "{out:?}");
+ let healed = read_state_json(dir.path());
+ assert_eq!(
+ healed["applied_revision"]["resources"]["provider.embedding.default"]
+ ["embedding_profile"]["model"],
+ serde_json::json!("recorded-x")
+ );
+ let snapshot = read_serving_snapshot(dir.path()).await.unwrap();
+ let profile = snapshot.graphs[0].embedding.as_ref().unwrap();
+ assert_eq!(profile.model.as_deref(), Some("recorded-x"));
+ }
+
#[tokio::test]
async fn bindings_survive_refresh() {
let dir = fixture();
@@ -3189,9 +3300,92 @@ policies:
let err = read_serving_snapshot(dir.path()).await.unwrap_err();
assert!(
- err.iter().any(|diagnostic| diagnostic.code == "cluster_recovery_pending"),
+ err.iter()
+ .any(|diagnostic| diagnostic.code == "cluster_no_healthy_graphs"),
"{err:?}"
);
+ assert!(
+ err.iter().any(|diagnostic| {
+ diagnostic.code == "cluster_recovery_pending"
+ && diagnostic.path == "graph.knowledge"
+ }),
+ "{err:?}"
+ );
+ }
+
+ #[tokio::test]
+ async fn serving_snapshot_quarantines_one_graph_with_pending_recovery() {
+ let dir = fixture();
+ fs::write(
+ dir.path().join(CLUSTER_CONFIG_FILE),
+ r#"
+version: 1
+metadata:
+ name: test
+state:
+ backend: cluster
+ lock: true
+graphs:
+ knowledge:
+ schema: ./people.pg
+ archive:
+ schema: ./people.pg
+"#,
+ )
+ .unwrap();
+ let graph_dir = dir.path().join(CLUSTER_GRAPHS_DIR);
+ fs::create_dir_all(&graph_dir).unwrap();
+ Omnigraph::init(
+ graph_dir.join("knowledge.omni").to_string_lossy().as_ref(),
+ SCHEMA,
+ )
+ .await
+ .unwrap();
+ Omnigraph::init(
+ graph_dir.join("archive.omni").to_string_lossy().as_ref(),
+ SCHEMA,
+ )
+ .await
+ .unwrap();
+ let desired = validate_config_dir(dir.path());
+ assert!(desired.ok, "{:?}", desired.diagnostics);
+ let schema_digest = desired.resource_digests["schema.knowledge"].clone();
+ let empty_queries = BTreeMap::new();
+ let knowledge_digest = graph_digest(
+ "knowledge",
+ Some(&schema_digest),
+ Some(&empty_queries),
+ None,
+ None,
+ );
+ let archive_digest = graph_digest(
+ "archive",
+ Some(&schema_digest),
+ Some(&empty_queries),
+ None,
+ None,
+ );
+ write_state_resources(
+ dir.path(),
+ &[
+ ("graph.knowledge", knowledge_digest.as_str()),
+ ("schema.knowledge", schema_digest.as_str()),
+ ("graph.archive", archive_digest.as_str()),
+ ("schema.archive", schema_digest.as_str()),
+ ],
+ );
+ write_schema_apply_sidecar(dir.path(), "knowledge", "whatever", "01SERVE2");
+
+ let snapshot = read_serving_snapshot(dir.path()).await.unwrap();
+ assert_eq!(snapshot.graphs.len(), 1);
+ assert_eq!(snapshot.graphs[0].graph_id, "archive");
+ assert!(snapshot.queries.is_empty());
+ assert!(snapshot.policies.is_empty());
+ assert!(snapshot.diagnostics.iter().any(|diagnostic| {
+ diagnostic.code == "cluster_recovery_pending"
+ && diagnostic.path == "graph.knowledge"
+ && diagnostic.severity == DiagnosticSeverity::Warning
+ }));
}
#[tokio::test]
@@ -3375,6 +3569,96 @@ policies:
);
}
+ #[tokio::test]
+ async fn read_only_commands_ignore_missing_recovery_sidecar_dir() {
+ let dir = fixture();
+ write_applyable_state(dir.path());
+ assert!(!dir.path().join(CLUSTER_RECOVERIES_DIR).exists());
+
+ let status = status_config_dir(dir.path()).await;
+ assert!(status.ok, "{:?}", status.diagnostics);
+ assert!(
+ !status.diagnostics.iter().any(|diagnostic| matches!(
+ diagnostic.code.as_str(),
+ "recovery_sidecar_read_error" | "cluster_recovery_pending"
+ )),
+ "{:?}",
+ status.diagnostics
+ );
+
+ let plan = plan_config_dir(dir.path()).await;
+ assert!(plan.ok, "{:?}", plan.diagnostics);
+ assert!(
+ !plan.diagnostics.iter().any(|diagnostic| matches!(
+ diagnostic.code.as_str(),
+ "recovery_sidecar_read_error" | "cluster_recovery_pending"
+ )),
+ "{:?}",
+ plan.diagnostics
+ );
+ }
+
+ #[tokio::test]
+ async fn read_only_commands_warn_on_pending_recovery_sidecar_in_storage_root() {
+ let dir = fixture();
+ let storage = tempfile::tempdir().unwrap();
+ let storage_path = storage.path().to_string_lossy().to_string();
+ let mut config = fs::read_to_string(dir.path().join(CLUSTER_CONFIG_FILE)).unwrap();
+ config = config.replace(
+ "version: 1\n",
+ &format!("version: 1\nstorage: {storage_path}\n"),
+ );
+ fs::write(dir.path().join(CLUSTER_CONFIG_FILE), config).unwrap();
+
+ let desired = validate_config_dir(dir.path());
+ assert!(desired.ok, "{:?}", desired.diagnostics);
+ let schema_digest = desired
+ .resource_digests
+ .get("schema.knowledge")
+ .unwrap()
+ .clone();
+ let graph_composite = graph_digest(
+ "knowledge",
+ Some(&schema_digest),
+ Some(&BTreeMap::new()),
+ None,
+ None,
+ );
+ write_state_resources(
+ storage.path(),
+ &[
+ ("graph.knowledge", graph_composite.as_str()),
+ ("schema.knowledge", schema_digest.as_str()),
+ ],
+ );
+ write_create_sidecar(storage.path(), "knowledge", "irrelevant", "01STORAGE");
+
+ let status = status_config_dir(dir.path()).await;
+ assert!(status.ok, "{:?}", status.diagnostics);
+ assert!(
+ status
+ .diagnostics
+ .iter()
+ .any(|diagnostic| diagnostic.code == "cluster_recovery_pending"
+ && diagnostic.path.contains("01STORAGE.json")),
+ "{:?}",
+ status.diagnostics
+ );
+
+ let plan = plan_config_dir(dir.path()).await;
+ assert!(plan.ok, "{:?}", plan.diagnostics);
+ assert!(
+ plan.diagnostics
+ .iter()
+ .any(|diagnostic| diagnostic.code == "cluster_recovery_pending"
+ && diagnostic.path.contains("01STORAGE.json")),
+ "{:?}",
+ plan.diagnostics
+ );
+
+ assert!(!dir.path().join(CLUSTER_RECOVERIES_DIR).exists());
+ }
+
#[tokio::test]
async fn plan_annotates_apply_dispositions() {
let dir = fixture();
diff --git a/crates/omnigraph-cluster/src/types.rs b/crates/omnigraph-cluster/src/types.rs
index 97ad406..7687575 100644
--- a/crates/omnigraph-cluster/src/types.rs
+++ b/crates/omnigraph-cluster/src/types.rs
@@ -176,6 +176,10 @@ pub struct PlanChange {
/// pre-5A backfill case).
#[serde(default, skip_serializing_if = "std::ops::Not::not")]
pub binding_change: bool,
+ /// Metadata-only updates whose resource content digest is unchanged but
+ /// whose applied ledger metadata needs to converge.
+ #[serde(skip_serializing_if = "Option::is_none")]
+ pub metadata_change: Option,
/// For schema updates: the engine's migration plan against the live
/// graph (RFC-004 §D7's data-aware preview). Absent when the preview is
/// unavailable (warning `schema_preview_unavailable`).
@@ -183,6 +187,13 @@ pub struct PlanChange {
pub migration: Option,
}
+#[derive(Debug, Clone, Copy, Serialize, PartialEq, Eq)]
+#[serde(rename_all = "snake_case")]
+pub enum PlanMetadataChange {
+ PolicyBindings,
+ EmbeddingProfile,
+}
+
#[derive(Debug, Clone, Serialize, PartialEq, Eq)]
pub struct BlastRadius {
pub resource: String,
diff --git a/crates/omnigraph-cluster/tests/failpoints.rs b/crates/omnigraph-cluster/tests/failpoints.rs
index 5cdf2d4..51997ce 100644
--- a/crates/omnigraph-cluster/tests/failpoints.rs
+++ b/crates/omnigraph-cluster/tests/failpoints.rs
@@ -13,8 +13,9 @@ use std::fs;
use std::path::{Path, PathBuf};
use fail::FailScenario;
-use omnigraph_cluster::failpoints::ScopedFailPoint;
use omnigraph::db::Omnigraph;
+use omnigraph::failpoints::ScopedFailPoint as EngineScopedFailPoint;
+use omnigraph_cluster::failpoints::ScopedFailPoint;
use omnigraph_cluster::{
ApplyOptions, apply_config_dir, apply_config_dir_with_options, approve_config_dir,
validate_config_dir,
@@ -178,13 +179,12 @@ async fn apply_cas_race_surfaces_state_cas_mismatch() {
// after apply read it but before apply writes. RAII-guarded so a panic
// inside apply cannot leak the callback into the global registry.
let race_path = state_path(dir.path());
- let failpoint =
- ScopedFailPoint::with_callback("cluster_apply.before_state_write", move || {
- let mut state: serde_json::Value =
- serde_json::from_str(&fs::read_to_string(&race_path).unwrap()).unwrap();
- state["state_revision"] = serde_json::json!(99);
- fs::write(&race_path, serde_json::to_string_pretty(&state).unwrap()).unwrap();
- });
+ let failpoint = ScopedFailPoint::with_callback("cluster_apply.before_state_write", move || {
+ let mut state: serde_json::Value =
+ serde_json::from_str(&fs::read_to_string(&race_path).unwrap()).unwrap();
+ state["state_revision"] = serde_json::json!(99);
+ fs::write(&race_path, serde_json::to_string_pretty(&state).unwrap()).unwrap();
+ });
let out = apply_config_dir(dir.path()).await;
drop(failpoint);
@@ -336,10 +336,9 @@ async fn create_crash_after_init_rolls_state_forward() {
);
assert!(recovered.converged);
assert!(recovery_sidecars(dir.path()).is_empty());
- let state: serde_json::Value = serde_json::from_str(
- &fs::read_to_string(dir.path().join("__cluster/state.json")).unwrap(),
- )
- .unwrap();
+ let state: serde_json::Value =
+ serde_json::from_str(&fs::read_to_string(dir.path().join("__cluster/state.json")).unwrap())
+ .unwrap();
assert!(
state["recovery_records"]
.as_object()
@@ -422,6 +421,105 @@ async fn schema_crash_before_apply_recovers_via_sweep() {
scenario.teardown();
}
+/// Engine apply fails after cluster preview and sidecar creation, but before
+/// the graph manifest moves. The defensive cleanup proof should remove the
+/// cluster sidecar immediately so a pre-movement error cannot brick boot.
+#[tokio::test]
+async fn schema_apply_error_before_graph_movement_removes_sidecar() {
+ let scenario = FailScenario::setup();
+ let dir = fixture();
+ converge_with_live_graph(dir.path()).await;
+ let pre_digest = live_schema_digest(dir.path()).await;
+ fs::write(dir.path().join("people.pg"), SCHEMA_V2).unwrap();
+
+ {
+ let _failpoint = EngineScopedFailPoint::new("schema_apply.before_staging_write", "return");
+ let out = apply_config_dir(dir.path()).await;
+ assert!(!out.ok);
+ assert!(
+ out.diagnostics
+ .iter()
+ .any(|diagnostic| diagnostic.code == "schema_apply_failed"),
+ "{:?}",
+ out.diagnostics
+ );
+ assert_eq!(live_schema_digest(dir.path()).await, pre_digest);
+ assert!(
+ recovery_sidecars(dir.path()).is_empty(),
+ "{:?}",
+ recovery_sidecars(dir.path())
+ );
+ }
+
+ let recovered = apply_config_dir(dir.path()).await;
+ assert!(recovered.ok && recovered.converged, "{recovered:?}");
+ assert!(recovery_sidecars(dir.path()).is_empty());
+ assert_ne!(live_schema_digest(dir.path()).await, pre_digest);
+ scenario.teardown();
+}
+
+/// Engine apply fails after the graph manifest moved. The cluster cannot
+/// prove this is a pre-movement failure, so the sidecar must survive for
+/// explicit recovery/quarantine instead of being cleaned up defensively.
+#[tokio::test]
+async fn schema_apply_error_after_graph_movement_keeps_sidecar() {
+ let scenario = FailScenario::setup();
+ let dir = fixture();
+ converge_with_live_graph(dir.path()).await;
+ let pre_digest = live_schema_digest(dir.path()).await;
+ fs::write(dir.path().join("people.pg"), SCHEMA_V2).unwrap();
+ let desired = validate_config_dir(dir.path());
+ let v2_digest = desired.resource_digests["schema.knowledge"].clone();
+
+ {
+ let _failpoint = EngineScopedFailPoint::new("schema_apply.after_manifest_commit", "return");
+ let out = apply_config_dir(dir.path()).await;
+ assert!(!out.ok);
+ assert!(
+ out.diagnostics
+ .iter()
+ .any(|diagnostic| diagnostic.code == "schema_apply_failed"),
+ "{:?}",
+ out.diagnostics
+ );
+ // Read-only opens do not run engine schema-state recovery, so the
+ // schema file still reads as the old digest even though the manifest
+ // has moved. The cluster sidecar must remain because movement was
+ // detected by the fallback manifest-version proof.
+ assert_eq!(live_schema_digest(dir.path()).await, pre_digest);
+ let sidecars = recovery_sidecars(dir.path());
+ assert_eq!(sidecars.len(), 1, "{sidecars:?}");
+ let sidecar: serde_json::Value =
+ serde_json::from_str(&fs::read_to_string(&sidecars[0]).unwrap()).unwrap();
+ assert_eq!(sidecar["kind"], "schema_apply");
+ assert!(sidecar["expected_manifest_version"].is_null(), "{sidecar}");
+ }
+
+ let uri = dir.path().join("graphs/knowledge.omni");
+ let db = Omnigraph::open(uri.to_string_lossy().as_ref())
+ .await
+ .unwrap();
+ assert_eq!(
+ db.schema_source().as_str(),
+ SCHEMA_V2,
+ "read-write open should complete engine schema-state recovery"
+ );
+ drop(db);
+ assert_eq!(live_schema_digest(dir.path()).await, v2_digest);
+
+ let recovered = apply_config_dir(dir.path()).await;
+ assert!(recovered.ok, "{:?}", recovered.diagnostics);
+ assert!(
+ recovered
+ .diagnostics
+ .iter()
+ .any(|diagnostic| diagnostic.code == "cluster_recovery_rolled_forward")
+ );
+ assert!(recovered.converged);
+ assert!(recovery_sidecars(dir.path()).is_empty());
+ scenario.teardown();
+}
+
/// Crash after the engine schema apply, before the state CAS: the manifest
/// moved, the ledger is stale, nothing acknowledged; the next run's sweep
/// rolls the ledger forward with an audit entry and the run converges.
@@ -447,7 +545,10 @@ async fn schema_crash_after_apply_rolls_state_forward() {
assert_eq!(sidecars.len(), 1);
let sidecar: serde_json::Value =
serde_json::from_str(&fs::read_to_string(&sidecars[0]).unwrap()).unwrap();
- assert!(sidecar["expected_manifest_version"].is_number(), "{sidecar}");
+ assert!(
+ sidecar["expected_manifest_version"].is_number(),
+ "{sidecar}"
+ );
}
let recovered = apply_config_dir(dir.path()).await;
diff --git a/crates/omnigraph-compiler/src/catalog/mod.rs b/crates/omnigraph-compiler/src/catalog/mod.rs
index 93f8d89..2287c3b 100644
--- a/crates/omnigraph-compiler/src/catalog/mod.rs
+++ b/crates/omnigraph-compiler/src/catalog/mod.rs
@@ -6,7 +6,7 @@ use std::sync::Arc;
use arrow_schema::{DataType, Field, Schema, SchemaRef};
-use crate::error::{NanoError, Result};
+use crate::error::{CompilerError, Result};
use crate::schema::ast::{Cardinality, Constraint, ConstraintBound, SchemaDecl, SchemaFile};
use crate::types::{PropType, ScalarType};
@@ -151,7 +151,7 @@ pub fn build_catalog(schema: &SchemaFile) -> Result {
for decl in &schema.declarations {
if let SchemaDecl::Node(node) = decl {
if node_types.contains_key(&node.name) {
- return Err(NanoError::Catalog(format!(
+ return Err(CompilerError::Catalog(format!(
"duplicate node type: {}",
node.name
)));
@@ -250,19 +250,19 @@ pub fn build_catalog(schema: &SchemaFile) -> Result {
for decl in &schema.declarations {
if let SchemaDecl::Edge(edge) = decl {
if edge_types.contains_key(&edge.name) {
- return Err(NanoError::Catalog(format!(
+ return Err(CompilerError::Catalog(format!(
"duplicate edge type: {}",
edge.name
)));
}
if !node_types.contains_key(&edge.from_type) {
- return Err(NanoError::Catalog(format!(
+ return Err(CompilerError::Catalog(format!(
"edge {} references unknown source type: {}",
edge.name, edge.from_type
)));
}
if !node_types.contains_key(&edge.to_type) {
- return Err(NanoError::Catalog(format!(
+ return Err(CompilerError::Catalog(format!(
"edge {} references unknown target type: {}",
edge.name, edge.to_type
)));
@@ -302,7 +302,7 @@ pub fn build_catalog(schema: &SchemaFile) -> Result {
if let Some(existing) = edge_name_index.get(&normalized_name)
&& existing != &edge.name
{
- return Err(NanoError::Catalog(format!(
+ return Err(CompilerError::Catalog(format!(
"edge name collision after case folding: '{}' conflicts with '{}'",
edge.name, existing
)));
diff --git a/crates/omnigraph-compiler/src/catalog/schema_ir.rs b/crates/omnigraph-compiler/src/catalog/schema_ir.rs
index d90539e..4a56ffa 100644
--- a/crates/omnigraph-compiler/src/catalog/schema_ir.rs
+++ b/crates/omnigraph-compiler/src/catalog/schema_ir.rs
@@ -4,7 +4,7 @@ use serde::{Deserialize, Serialize};
use sha2::{Digest, Sha256};
use crate::catalog::{Catalog, build_catalog};
-use crate::error::{NanoError, Result};
+use crate::error::{CompilerError, Result};
use crate::schema::ast::{Annotation, Cardinality, Constraint, PropDecl, SchemaDecl, SchemaFile};
use crate::types::PropType;
@@ -119,7 +119,7 @@ pub fn build_schema_ir(schema: &SchemaFile) -> Result {
pub fn build_catalog_from_ir(ir: &SchemaIR) -> Result {
if ir.ir_version != SCHEMA_IR_VERSION {
- return Err(NanoError::Catalog(format!(
+ return Err(CompilerError::Catalog(format!(
"unsupported schema ir_version {} (expected {})",
ir.ir_version, SCHEMA_IR_VERSION
)));
@@ -167,12 +167,12 @@ pub fn build_catalog_from_ir(ir: &SchemaIR) -> Result {
pub fn schema_ir_json(ir: &SchemaIR) -> Result {
serde_json::to_string(ir)
- .map_err(|err| NanoError::Catalog(format!("serialize schema ir error: {}", err)))
+ .map_err(|err| CompilerError::Catalog(format!("serialize schema ir error: {}", err)))
}
pub fn schema_ir_pretty_json(ir: &SchemaIR) -> Result {
serde_json::to_string_pretty(ir)
- .map_err(|err| NanoError::Catalog(format!("serialize schema ir error: {}", err)))
+ .map_err(|err| CompilerError::Catalog(format!("serialize schema ir error: {}", err)))
}
pub fn schema_ir_hash(ir: &SchemaIR) -> Result {
@@ -228,7 +228,7 @@ fn canonical_properties(
.map(|property| {
let prop_id = stable_prop_id(&owner_key, &property.name);
if let Some(previous) = seen_prop_ids.insert(prop_id, property.name.clone()) {
- return Err(NanoError::Catalog(format!(
+ return Err(CompilerError::Catalog(format!(
"property id collision on {}: '{}' and '{}' both hash to {}",
owner_name, previous, property.name, prop_id
)));
@@ -308,7 +308,7 @@ fn check_type_id_collision(
name: &str,
) -> Result<()> {
if let Some(previous) = seen_type_ids.insert(type_id, name.to_string()) {
- return Err(NanoError::Catalog(format!(
+ return Err(CompilerError::Catalog(format!(
"type id collision: '{}' and '{}' both hash to {}",
previous, name, type_id
)));
diff --git a/crates/omnigraph-compiler/src/error.rs b/crates/omnigraph-compiler/src/error.rs
index ea48759..0c642c2 100644
--- a/crates/omnigraph-compiler/src/error.rs
+++ b/crates/omnigraph-compiler/src/error.rs
@@ -55,7 +55,7 @@ pub fn decode_string_literal(raw: &str) -> Result {
let escaped = chars
.next()
- .ok_or_else(|| NanoError::Parse("unterminated escape sequence".to_string()))?;
+ .ok_or_else(|| CompilerError::Parse("unterminated escape sequence".to_string()))?;
match escaped {
'"' => decoded.push('"'),
'\\' => decoded.push('\\'),
@@ -63,7 +63,7 @@ pub fn decode_string_literal(raw: &str) -> Result {
'r' => decoded.push('\r'),
't' => decoded.push('\t'),
other => {
- return Err(NanoError::Parse(format!(
+ return Err(CompilerError::Parse(format!(
"unsupported escape sequence: \\{}",
other
)));
@@ -75,7 +75,7 @@ pub fn decode_string_literal(raw: &str) -> Result {
}
#[derive(Debug, Error)]
-pub enum NanoError {
+pub enum CompilerError {
#[error("parse error: {0}")]
Parse(String),
@@ -118,11 +118,16 @@ pub enum NanoError {
Manifest(String),
}
-pub type Result = std::result::Result;
+#[deprecated(note = "use CompilerError")]
+pub type NanoError = CompilerError;
+
+pub type Result = std::result::Result;
#[cfg(test)]
mod tests {
- use super::{SourceSpan, decode_string_literal, render_span};
+ use std::path::Path;
+
+ use super::{CompilerError, SourceSpan, decode_string_literal, render_span};
#[test]
fn source_span_preserves_zero_width() {
@@ -143,4 +148,77 @@ mod tests {
let decoded = decode_string_literal("\"a\\n\\r\\t\\\\\\\"b\"").unwrap();
assert_eq!(decoded, "a\n\r\t\\\"b");
}
+
+ #[test]
+ fn compiler_error_parse_display_is_stable() {
+ let err = CompilerError::Parse("bad token".to_string());
+ assert_eq!(err.to_string(), "parse error: bad token");
+ }
+
+ #[allow(deprecated)]
+ #[test]
+ fn legacy_nano_error_alias_constructs_variants() {
+ let err = super::NanoError::Parse("bad token".to_string());
+ assert_eq!(err.to_string(), "parse error: bad token");
+ }
+
+ #[test]
+ fn legacy_name_is_confined_to_alias_and_compatibility_test() {
+ let legacy_name = ["Nano", "Error"].concat();
+ let workspace_root = Path::new(env!("CARGO_MANIFEST_DIR"))
+ .parent()
+ .and_then(Path::parent)
+ .expect("compiler crate should live under crates/");
+ let allowed_file = workspace_root.join("crates/omnigraph-compiler/src/error.rs");
+ let mut offenders = Vec::new();
+
+ visit_rs_files(workspace_root, &mut |path| {
+ let text = std::fs::read_to_string(path).expect("source file should be readable");
+ let count = text.matches(&legacy_name).count();
+ if path == allowed_file {
+ if count != 2 {
+ offenders.push(format!(
+ "{} contains {count} legacy-name occurrences; expected exactly 2",
+ display_path(workspace_root, path)
+ ));
+ }
+ } else if count > 0 {
+ offenders.push(format!(
+ "{} contains {count} legacy-name occurrence(s)",
+ display_path(workspace_root, path)
+ ));
+ }
+ });
+
+ assert!(
+ offenders.is_empty(),
+ "legacy compiler error name should stay compatibility-only:\n{}",
+ offenders.join("\n")
+ );
+ }
+
+ fn visit_rs_files(dir: &Path, visit: &mut impl FnMut(&Path)) {
+ for entry in std::fs::read_dir(dir).expect("source directory should be readable") {
+ let entry = entry.expect("source entry should be readable");
+ let path = entry.path();
+ if path.is_dir() {
+ if matches!(
+ path.file_name().and_then(|name| name.to_str()),
+ Some(".git" | "target")
+ ) {
+ continue;
+ }
+ visit_rs_files(&path, visit);
+ } else if path.extension().and_then(|ext| ext.to_str()) == Some("rs") {
+ visit(&path);
+ }
+ }
+ }
+
+ fn display_path(root: &Path, path: &Path) -> String {
+ path.strip_prefix(root)
+ .unwrap_or(path)
+ .to_string_lossy()
+ .into_owned()
+ }
}
diff --git a/crates/omnigraph-compiler/src/ir/lower.rs b/crates/omnigraph-compiler/src/ir/lower.rs
index 6999d69..9427e27 100644
--- a/crates/omnigraph-compiler/src/ir/lower.rs
+++ b/crates/omnigraph-compiler/src/ir/lower.rs
@@ -14,7 +14,7 @@ pub fn lower_query(
type_ctx: &TypeContext,
) -> Result {
if !query.mutations.is_empty() {
- return Err(crate::error::NanoError::Plan(
+ return Err(crate::error::CompilerError::Plan(
"cannot lower mutation query with read-query lowerer".to_string(),
));
}
@@ -62,7 +62,7 @@ pub fn lower_query(
pub fn lower_mutation_query(query: &QueryDecl) -> Result {
if query.mutations.is_empty() {
- return Err(crate::error::NanoError::Plan(
+ return Err(crate::error::CompilerError::Plan(
"query does not contain a mutation body".to_string(),
));
}
@@ -261,7 +261,7 @@ fn lower_clauses(
let edge = catalog
.lookup_edge_by_name(&traversal.edge_name)
.ok_or_else(|| {
- crate::error::NanoError::Plan(format!(
+ crate::error::CompilerError::Plan(format!(
"lowering traversal referenced missing edge '{}' after typecheck",
traversal.edge_name
))
diff --git a/crates/omnigraph-compiler/src/query/parser.rs b/crates/omnigraph-compiler/src/query/parser.rs
index 29066c0..3d8f014 100644
--- a/crates/omnigraph-compiler/src/query/parser.rs
+++ b/crates/omnigraph-compiler/src/query/parser.rs
@@ -3,7 +3,7 @@ use pest::error::InputLocation;
use pest_derive::Parser;
use crate::error::{
- NanoError, ParseDiagnostic, Result, SourceSpan, decode_string_literal, render_span,
+ CompilerError, ParseDiagnostic, Result, SourceSpan, decode_string_literal, render_span,
};
use super::ast::*;
@@ -13,7 +13,7 @@ use super::ast::*;
struct QueryParser;
pub fn parse_query(input: &str) -> Result {
- parse_query_diagnostic(input).map_err(|e| NanoError::Parse(e.to_string()))
+ parse_query_diagnostic(input).map_err(|e| CompilerError::Parse(e.to_string()))
}
pub fn parse_query_diagnostic(input: &str) -> std::result::Result {
@@ -24,7 +24,7 @@ pub fn parse_query_diagnostic(input: &str) -> std::result::Result) -> ParseDiagnostic {
ParseDiagnostic::new(err.to_string(), span)
}
-fn nano_error_to_diagnostic(err: NanoError) -> ParseDiagnostic {
+fn compiler_error_to_diagnostic(err: CompilerError) -> ParseDiagnostic {
ParseDiagnostic::new(err.to_string(), None)
}
@@ -71,7 +71,7 @@ fn parse_query_decl(pair: pest::iterators::Pair) -> Result {
Rule::query_annotation => match parse_query_annotation(item)? {
ParsedAnnotation::Description(value) => {
if description.replace(value).is_some() {
- return Err(NanoError::Parse(format!(
+ return Err(CompilerError::Parse(format!(
"query `{}` cannot include duplicate @description annotations",
name
)));
@@ -79,7 +79,7 @@ fn parse_query_decl(pair: pest::iterators::Pair) -> Result {
}
ParsedAnnotation::Instruction(value) => {
if instruction.replace(value).is_some() {
- return Err(NanoError::Parse(format!(
+ return Err(CompilerError::Parse(format!(
"query `{}` cannot include duplicate @instruction annotations",
name
)));
@@ -87,7 +87,7 @@ fn parse_query_decl(pair: pest::iterators::Pair) -> Result {
}
ParsedAnnotation::Mcp(value) => {
if mcp_seen {
- return Err(NanoError::Parse(format!(
+ return Err(CompilerError::Parse(format!(
"query `{}` cannot include duplicate @mcp annotations",
name
)));
@@ -100,7 +100,7 @@ fn parse_query_decl(pair: pest::iterators::Pair) -> Result {
let body = item
.into_inner()
.next()
- .ok_or_else(|| NanoError::Parse("query body cannot be empty".to_string()))?;
+ .ok_or_else(|| CompilerError::Parse("query body cannot be empty".to_string()))?;
match body.as_rule() {
Rule::read_query_body => {
for section in body.into_inner() {
@@ -130,7 +130,7 @@ fn parse_query_decl(pair: pest::iterators::Pair) -> Result {
let int_pair = section.into_inner().next().unwrap();
limit =
Some(int_pair.as_str().parse::().map_err(|e| {
- NanoError::Parse(format!("invalid limit: {}", e))
+ CompilerError::Parse(format!("invalid limit: {}", e))
})?);
}
_ => {}
@@ -141,7 +141,7 @@ fn parse_query_decl(pair: pest::iterators::Pair) -> Result {
for mutation_pair in body.into_inner() {
if let Rule::mutation_stmt = mutation_pair.as_rule() {
let stmt = mutation_pair.into_inner().next().ok_or_else(|| {
- NanoError::Parse(
+ CompilerError::Parse(
"mutation statement cannot be empty".to_string(),
)
})?;
@@ -181,7 +181,7 @@ enum ParsedAnnotation {
fn annotation_string(pair: pest::iterators::Pair, what: &str) -> Result {
pair.into_inner()
.next()
- .ok_or_else(|| NanoError::Parse(format!("{what} requires a string literal")))
+ .ok_or_else(|| CompilerError::Parse(format!("{what} requires a string literal")))
.map(|value| parse_string_lit(value.as_str()))?
}
@@ -189,7 +189,7 @@ fn parse_query_annotation(pair: pest::iterators::Pair) -> Result Ok(ParsedAnnotation::Description(annotation_string(
inner,
@@ -200,7 +200,7 @@ fn parse_query_annotation(pair: pest::iterators::Pair) -> Result Ok(ParsedAnnotation::Mcp(parse_mcp_annotation(inner)?)),
- other => Err(NanoError::Parse(format!(
+ other => Err(CompilerError::Parse(format!(
"unexpected query annotation rule: {:?}",
other
))),
@@ -215,19 +215,19 @@ fn parse_mcp_annotation(pair: pest::iterators::Pair) -> Result {
let value = kv
.into_inner()
.next()
.ok_or_else(|| {
- NanoError::Parse("@mcp expose requires a boolean".to_string())
+ CompilerError::Parse("@mcp expose requires a boolean".to_string())
})?
.as_str()
== "true";
if meta.expose.replace(value).is_some() {
- return Err(NanoError::Parse(
+ return Err(CompilerError::Parse(
"@mcp cannot include duplicate `expose` arguments".to_string(),
));
}
@@ -237,17 +237,17 @@ fn parse_mcp_annotation(pair: pest::iterators::Pair) -> Result {
- return Err(NanoError::Parse(format!(
+ return Err(CompilerError::Parse(format!(
"unexpected @mcp argument rule: {:?}",
other
)));
@@ -261,14 +261,14 @@ fn parse_param(pair: pest::iterators::Pair) -> Result {
let mut inner = pair.into_inner();
let mut next = inner
.next()
- .ok_or_else(|| NanoError::Parse("parameter is missing a variable".to_string()))?;
+ .ok_or_else(|| CompilerError::Parse("parameter is missing a variable".to_string()))?;
// Optional leading `@description("…")` documents the parameter.
let mut description = None;
if next.as_rule() == Rule::description_annotation {
description = Some(annotation_string(next, "@description")?);
next = inner
.next()
- .ok_or_else(|| NanoError::Parse("parameter is missing a variable".to_string()))?;
+ .ok_or_else(|| CompilerError::Parse("parameter is missing a variable".to_string()))?;
}
let var = next.as_str();
let name = var.strip_prefix('$').unwrap_or(var).to_string();
@@ -277,25 +277,25 @@ fn parse_param(pair: pest::iterators::Pair) -> Result {
let mut type_inner = type_ref.into_inner();
let core = type_inner
.next()
- .ok_or_else(|| NanoError::Parse("parameter type is missing".to_string()))?;
+ .ok_or_else(|| CompilerError::Parse("parameter type is missing".to_string()))?;
let base = match core.as_rule() {
Rule::base_type => core.as_str().to_string(),
Rule::list_type => {
let inner = core
.into_inner()
.next()
- .ok_or_else(|| NanoError::Parse("list type missing item type".to_string()))?;
+ .ok_or_else(|| CompilerError::Parse("list type missing item type".to_string()))?;
format!("[{}]", inner.as_str().trim())
}
Rule::vector_type => {
let vector = core
.into_inner()
.next()
- .ok_or_else(|| NanoError::Parse("Vector type missing dimension".to_string()))?;
+ .ok_or_else(|| CompilerError::Parse("Vector type missing dimension".to_string()))?;
format!("Vector({})", vector.as_str().trim())
}
other => {
- return Err(NanoError::Parse(format!(
+ return Err(CompilerError::Parse(format!(
"unexpected param type rule: {:?}",
other
)));
@@ -326,7 +326,7 @@ fn parse_clause(pair: pest::iterators::Pair) -> Result {
}
Ok(Clause::Negation(clauses))
}
- _ => Err(NanoError::Parse(format!(
+ _ => Err(CompilerError::Parse(format!(
"unexpected clause rule: {:?}",
inner.as_rule()
))),
@@ -337,13 +337,13 @@ fn parse_text_search_clause(pair: pest::iterators::Pair) -> Result
let inner = pair
.into_inner()
.next()
- .ok_or_else(|| NanoError::Parse("text search clause cannot be empty".to_string()))?;
+ .ok_or_else(|| CompilerError::Parse("text search clause cannot be empty".to_string()))?;
let expr = match inner.as_rule() {
Rule::search_call => parse_search_call(inner)?,
Rule::fuzzy_call => parse_fuzzy_call(inner)?,
Rule::match_text_call => parse_match_text_call(inner)?,
other => {
- return Err(NanoError::Parse(format!(
+ return Err(CompilerError::Parse(format!(
"unexpected text search clause rule: {:?}",
other
)));
@@ -395,7 +395,7 @@ fn parse_mutation_stmt(pair: pest::iterators::Pair) -> Result {
Rule::insert_stmt => parse_insert_mutation(pair).map(Mutation::Insert),
Rule::update_stmt => parse_update_mutation(pair).map(Mutation::Update),
Rule::delete_stmt => parse_delete_mutation(pair).map(Mutation::Delete),
- other => Err(NanoError::Parse(format!(
+ other => Err(CompilerError::Parse(format!(
"unexpected mutation statement rule: {:?}",
other
))),
@@ -433,7 +433,7 @@ fn parse_update_mutation(pair: pest::iterators::Pair) -> Result) -> Result) -> Result {
}
Rule::now_call => Ok(MatchValue::Now),
Rule::literal => Ok(MatchValue::Literal(parse_literal(value_inner)?)),
- _ => Err(NanoError::Parse(format!(
+ _ => Err(CompilerError::Parse(format!(
"unexpected match value: {:?}",
value_inner.as_rule()
))),
@@ -508,7 +508,7 @@ fn parse_traversal(pair: pest::iterators::Pair) -> Result {
max_hops = max;
inner
.next()
- .ok_or_else(|| NanoError::Parse("traversal missing destination variable".to_string()))?
+ .ok_or_else(|| CompilerError::Parse("traversal missing destination variable".to_string()))?
} else {
next
};
@@ -529,16 +529,16 @@ fn parse_traversal_bounds(pair: pest::iterators::Pair) -> Result<(u32, Opt
let mut inner = pair.into_inner();
let min = inner
.next()
- .ok_or_else(|| NanoError::Parse("traversal bound missing min hop".to_string()))?
+ .ok_or_else(|| CompilerError::Parse("traversal bound missing min hop".to_string()))?
.as_str()
.parse::()
- .map_err(|e| NanoError::Parse(format!("invalid traversal min bound: {}", e)))?;
+ .map_err(|e| CompilerError::Parse(format!("invalid traversal min bound: {}", e)))?;
let max = inner
.next()
.map(|p| {
p.as_str()
.parse::()
- .map_err(|e| NanoError::Parse(format!("invalid traversal max bound: {}", e)))
+ .map_err(|e| CompilerError::Parse(format!("invalid traversal max bound: {}", e)))
})
.transpose()?;
Ok((min, max))
@@ -577,7 +577,7 @@ fn parse_expr(pair: pest::iterators::Pair) -> Result {
"avg" => AggFunc::Avg,
"min" => AggFunc::Min,
"max" => AggFunc::Max,
- other => return Err(NanoError::Parse(format!("unknown aggregate: {}", other))),
+ other => return Err(CompilerError::Parse(format!("unknown aggregate: {}", other))),
};
let arg = parse_expr(parts.next().unwrap())?;
Ok(Expr::Aggregate {
@@ -592,7 +592,7 @@ fn parse_expr(pair: pest::iterators::Pair) -> Result {
Rule::bm25_call => parse_bm25_call(inner),
Rule::rrf_call => parse_rrf_call(inner),
Rule::ident => Ok(Expr::AliasRef(inner.as_str().to_string())),
- _ => Err(NanoError::Parse(format!(
+ _ => Err(CompilerError::Parse(format!(
"unexpected expr rule: {:?}",
inner.as_rule()
))),
@@ -603,12 +603,12 @@ fn parse_search_call(pair: pest::iterators::Pair) -> Result {
let mut args = pair.into_inner();
let field = args
.next()
- .ok_or_else(|| NanoError::Parse("search() missing field argument".to_string()))?;
+ .ok_or_else(|| CompilerError::Parse("search() missing field argument".to_string()))?;
let query = args
.next()
- .ok_or_else(|| NanoError::Parse("search() missing query argument".to_string()))?;
+ .ok_or_else(|| CompilerError::Parse("search() missing query argument".to_string()))?;
if args.next().is_some() {
- return Err(NanoError::Parse(
+ return Err(CompilerError::Parse(
"search() accepts exactly 2 arguments".to_string(),
));
}
@@ -622,13 +622,13 @@ fn parse_fuzzy_call(pair: pest::iterators::Pair) -> Result {
let mut args = pair.into_inner();
let field = args
.next()
- .ok_or_else(|| NanoError::Parse("fuzzy() missing field argument".to_string()))?;
+ .ok_or_else(|| CompilerError::Parse("fuzzy() missing field argument".to_string()))?;
let query = args
.next()
- .ok_or_else(|| NanoError::Parse("fuzzy() missing query argument".to_string()))?;
+ .ok_or_else(|| CompilerError::Parse("fuzzy() missing query argument".to_string()))?;
let max_edits = args.next().map(parse_expr).transpose()?.map(Box::new);
if args.next().is_some() {
- return Err(NanoError::Parse(
+ return Err(CompilerError::Parse(
"fuzzy() accepts at most 3 arguments".to_string(),
));
}
@@ -643,12 +643,12 @@ fn parse_match_text_call(pair: pest::iterators::Pair) -> Result {
let mut args = pair.into_inner();
let field = args
.next()
- .ok_or_else(|| NanoError::Parse("match_text() missing field argument".to_string()))?;
+ .ok_or_else(|| CompilerError::Parse("match_text() missing field argument".to_string()))?;
let query = args
.next()
- .ok_or_else(|| NanoError::Parse("match_text() missing query argument".to_string()))?;
+ .ok_or_else(|| CompilerError::Parse("match_text() missing query argument".to_string()))?;
if args.next().is_some() {
- return Err(NanoError::Parse(
+ return Err(CompilerError::Parse(
"match_text() accepts exactly 2 arguments".to_string(),
));
}
@@ -662,12 +662,12 @@ fn parse_bm25_call(pair: pest::iterators::Pair) -> Result {
let mut args = pair.into_inner();
let field = args
.next()
- .ok_or_else(|| NanoError::Parse("bm25() missing field argument".to_string()))?;
+ .ok_or_else(|| CompilerError::Parse("bm25() missing field argument".to_string()))?;
let query = args
.next()
- .ok_or_else(|| NanoError::Parse("bm25() missing query argument".to_string()))?;
+ .ok_or_else(|| CompilerError::Parse("bm25() missing query argument".to_string()))?;
if args.next().is_some() {
- return Err(NanoError::Parse(
+ return Err(CompilerError::Parse(
"bm25() accepts exactly 2 arguments".to_string(),
));
}
@@ -681,14 +681,14 @@ fn parse_rank_expr(pair: pest::iterators::Pair) -> Result {
let inner = if pair.as_rule() == Rule::rank_expr {
pair.into_inner()
.next()
- .ok_or_else(|| NanoError::Parse("rank expression cannot be empty".to_string()))?
+ .ok_or_else(|| CompilerError::Parse("rank expression cannot be empty".to_string()))?
} else {
pair
};
match inner.as_rule() {
Rule::nearest_ordering => parse_nearest_ordering(inner),
Rule::bm25_call => parse_bm25_call(inner),
- other => Err(NanoError::Parse(format!(
+ other => Err(CompilerError::Parse(format!(
"rrf() rank expression must be nearest(...) or bm25(...), got {:?}",
other
))),
@@ -699,13 +699,13 @@ fn parse_rrf_call(pair: pest::iterators::Pair) -> Result {
let mut args = pair.into_inner();
let primary = args
.next()
- .ok_or_else(|| NanoError::Parse("rrf() missing primary rank expression".to_string()))?;
+ .ok_or_else(|| CompilerError::Parse("rrf() missing primary rank expression".to_string()))?;
let secondary = args
.next()
- .ok_or_else(|| NanoError::Parse("rrf() missing secondary rank expression".to_string()))?;
+ .ok_or_else(|| CompilerError::Parse("rrf() missing secondary rank expression".to_string()))?;
let k = args.next().map(parse_expr).transpose()?.map(Box::new);
if args.next().is_some() {
- return Err(NanoError::Parse(
+ return Err(CompilerError::Parse(
"rrf() accepts at most 3 arguments".to_string(),
));
}
@@ -724,7 +724,7 @@ fn parse_comp_op(pair: pest::iterators::Pair) -> Result {
"<" => Ok(CompOp::Lt),
">=" => Ok(CompOp::Ge),
"<=" => Ok(CompOp::Le),
- other => Err(NanoError::Parse(format!("unknown operator: {}", other))),
+ other => Err(CompilerError::Parse(format!("unknown operator: {}", other))),
}
}
@@ -743,14 +743,14 @@ fn parse_literal(pair: pest::iterators::Pair) -> Result {
let n: i64 = inner
.as_str()
.parse()
- .map_err(|e| NanoError::Parse(format!("invalid integer: {}", e)))?;
+ .map_err(|e| CompilerError::Parse(format!("invalid integer: {}", e)))?;
Ok(Literal::Integer(n))
}
Rule::float_lit => {
let f: f64 = inner
.as_str()
.parse()
- .map_err(|e| NanoError::Parse(format!("invalid float: {}", e)))?;
+ .map_err(|e| CompilerError::Parse(format!("invalid float: {}", e)))?;
Ok(Literal::Float(f))
}
Rule::bool_lit => {
@@ -758,7 +758,7 @@ fn parse_literal(pair: pest::iterators::Pair) -> Result {
"true" => true,
"false" => false,
other => {
- return Err(NanoError::Parse(format!(
+ return Err(CompilerError::Parse(format!(
"invalid boolean literal: {}",
other
)));
@@ -771,7 +771,7 @@ fn parse_literal(pair: pest::iterators::Pair) -> Result {
.into_inner()
.next()
.map(|s| parse_string_lit(s.as_str()))
- .ok_or_else(|| NanoError::Parse("date literal requires a string".to_string()))?;
+ .ok_or_else(|| CompilerError::Parse("date literal requires a string".to_string()))?;
Ok(Literal::Date(date_str?))
}
Rule::datetime_lit => {
@@ -780,7 +780,7 @@ fn parse_literal(pair: pest::iterators::Pair) -> Result {
.next()
.map(|s| parse_string_lit(s.as_str()))
.ok_or_else(|| {
- NanoError::Parse("datetime literal requires a string".to_string())
+ CompilerError::Parse("datetime literal requires a string".to_string())
})?;
Ok(Literal::DateTime(dt_str?))
}
@@ -793,7 +793,7 @@ fn parse_literal(pair: pest::iterators::Pair) -> Result {
}
Ok(Literal::List(items))
}
- _ => Err(NanoError::Parse(format!(
+ _ => Err(CompilerError::Parse(format!(
"unexpected literal: {:?}",
inner.as_rule()
))),
@@ -816,14 +816,14 @@ fn parse_ordering(pair: pest::iterators::Pair) -> Result {
let mut inner = pair.into_inner();
let first = inner
.next()
- .ok_or_else(|| NanoError::Parse("ordering cannot be empty".to_string()))?;
+ .ok_or_else(|| CompilerError::Parse("ordering cannot be empty".to_string()))?;
let (expr, descending) = match first.as_rule() {
Rule::nearest_ordering => (parse_nearest_ordering(first)?, false),
Rule::expr => {
let expr = parse_expr(first)?;
let direction = inner.next().map(|p| p.as_str().to_string());
if matches!(expr, Expr::Nearest { .. }) && direction.is_some() {
- return Err(NanoError::Parse(
+ return Err(CompilerError::Parse(
"nearest() ordering does not accept asc/desc modifiers".to_string(),
));
}
@@ -831,7 +831,7 @@ fn parse_ordering(pair: pest::iterators::Pair) -> Result {
(expr, descending)
}
other => {
- return Err(NanoError::Parse(format!(
+ return Err(CompilerError::Parse(format!(
"unexpected ordering rule: {:?}",
other
)));
@@ -845,22 +845,22 @@ fn parse_nearest_ordering(pair: pest::iterators::Pair) -> Result {
let mut inner = pair.into_inner();
let prop = inner
.next()
- .ok_or_else(|| NanoError::Parse("nearest() missing property".to_string()))?;
+ .ok_or_else(|| CompilerError::Parse("nearest() missing property".to_string()))?;
let mut prop_parts = prop.into_inner();
let var = prop_parts
.next()
- .ok_or_else(|| NanoError::Parse("nearest() missing variable".to_string()))?
+ .ok_or_else(|| CompilerError::Parse("nearest() missing variable".to_string()))?
.as_str();
let variable = var.strip_prefix('$').unwrap_or(var).to_string();
let property = prop_parts
.next()
- .ok_or_else(|| NanoError::Parse("nearest() missing property name".to_string()))?
+ .ok_or_else(|| CompilerError::Parse("nearest() missing property name".to_string()))?
.as_str()
.to_string();
let query = inner
.next()
- .ok_or_else(|| NanoError::Parse("nearest() missing query expression".to_string()))?;
+ .ok_or_else(|| CompilerError::Parse("nearest() missing query expression".to_string()))?;
Ok(Expr::Nearest {
variable,
property,
diff --git a/crates/omnigraph-compiler/src/query/typecheck.rs b/crates/omnigraph-compiler/src/query/typecheck.rs
index b2c235a..2ac1604 100644
--- a/crates/omnigraph-compiler/src/query/typecheck.rs
+++ b/crates/omnigraph-compiler/src/query/typecheck.rs
@@ -4,7 +4,7 @@ use std::sync::Arc;
use arrow_schema::{DataType, Field, Schema, SchemaRef};
use crate::catalog::Catalog;
-use crate::error::{NanoError, Result};
+use crate::error::{CompilerError, Result};
use crate::types::{Direction, PropType, ScalarType};
use super::ast::*;
@@ -82,7 +82,7 @@ pub fn typecheck_query_decl(catalog: &Catalog, query: &QueryDecl) -> Result Result {
if !query.mutations.is_empty() {
- return Err(NanoError::Type(
+ return Err(CompilerError::Type(
"mutation query cannot be typechecked with read-query API".to_string(),
));
}
@@ -115,14 +115,14 @@ fn parse_declared_param_types(params: &[Param]) -> Result Result Result Result {}
_ => {
- return Err(NanoError::Type(
+ return Err(CompilerError::Type(
"T9: non-aggregate expressions in an aggregate query must be \
property accesses or variables"
.to_string(),
@@ -221,7 +221,7 @@ fn typecheck_mutation(catalog: &Catalog, mutation: &Mutation, params: &[Param])
match mutation {
Mutation::Insert(insert) => {
if insert.assignments.is_empty() {
- return Err(NanoError::Type(
+ return Err(CompilerError::Type(
"T10: insert mutation requires at least one assignment".to_string(),
));
}
@@ -235,7 +235,7 @@ fn typecheck_mutation(catalog: &Catalog, mutation: &Mutation, params: &[Param])
.properties
.get(&assignment.property)
.ok_or_else(|| {
- NanoError::Type(format!(
+ CompilerError::Type(format!(
"T11: type `{}` has no property `{}`",
insert.type_name, assignment.property
))
@@ -265,13 +265,13 @@ fn typecheck_mutation(catalog: &Catalog, mutation: &Mutation, params: &[Param])
if assigned_props.contains(embed.source.as_str()) {
continue;
}
- return Err(NanoError::Type(format!(
+ return Err(CompilerError::Type(format!(
"T12: insert for `{}` must provide non-nullable property `{}` or @embed source `{}`",
insert.type_name, prop_name, embed.source
)));
}
- return Err(NanoError::Type(format!(
+ return Err(CompilerError::Type(format!(
"T12: insert for `{}` must provide non-nullable property `{}`",
insert.type_name, prop_name
)));
@@ -308,7 +308,7 @@ fn typecheck_mutation(catalog: &Catalog, mutation: &Mutation, params: &[Param])
.properties
.get(&assignment.property)
.ok_or_else(|| {
- NanoError::Type(format!(
+ CompilerError::Type(format!(
"T11: type `{}` has no property `{}`",
insert.type_name, assignment.property
))
@@ -324,13 +324,13 @@ fn typecheck_mutation(catalog: &Catalog, mutation: &Mutation, params: &[Param])
}
if !has_from {
- return Err(NanoError::Type(format!(
+ return Err(CompilerError::Type(format!(
"T12: insert for `{}` must provide required endpoint `from`",
insert.type_name
)));
}
if !has_to {
- return Err(NanoError::Type(format!(
+ return Err(CompilerError::Type(format!(
"T12: insert for `{}` must provide required endpoint `to`",
insert.type_name
)));
@@ -341,7 +341,7 @@ fn typecheck_mutation(catalog: &Catalog, mutation: &Mutation, params: &[Param])
continue;
}
if !insert.assignments.iter().any(|a| &a.property == prop_name) {
- return Err(NanoError::Type(format!(
+ return Err(CompilerError::Type(format!(
"T12: insert for `{}` must provide non-nullable property `{}`",
insert.type_name, prop_name
)));
@@ -350,7 +350,7 @@ fn typecheck_mutation(catalog: &Catalog, mutation: &Mutation, params: &[Param])
return Ok(insert.type_name.clone());
}
- Err(NanoError::Type(format!(
+ Err(CompilerError::Type(format!(
"T10: unknown node/edge type `{}`",
insert.type_name
)))
@@ -359,19 +359,19 @@ fn typecheck_mutation(catalog: &Catalog, mutation: &Mutation, params: &[Param])
let node_type = if let Some(node_type) = catalog.node_types.get(&update.type_name) {
node_type
} else if catalog.edge_types.contains_key(&update.type_name) {
- return Err(NanoError::Type(format!(
+ return Err(CompilerError::Type(format!(
"T16: update mutation for edge type `{}` is not supported",
update.type_name
)));
} else {
- return Err(NanoError::Type(format!(
+ return Err(CompilerError::Type(format!(
"T10: unknown node/edge type `{}`",
update.type_name
)));
};
if update.assignments.is_empty() {
- return Err(NanoError::Type(
+ return Err(CompilerError::Type(
"T10: update mutation requires at least one assignment".to_string(),
));
}
@@ -383,7 +383,7 @@ fn typecheck_mutation(catalog: &Catalog, mutation: &Mutation, params: &[Param])
.properties
.get(&assignment.property)
.ok_or_else(|| {
- NanoError::Type(format!(
+ CompilerError::Type(format!(
"T11: type `{}` has no property `{}`",
update.type_name, assignment.property
))
@@ -422,7 +422,7 @@ fn typecheck_mutation(catalog: &Catalog, mutation: &Mutation, params: &[Param])
)?;
Ok(delete.type_name.clone())
} else {
- Err(NanoError::Type(format!(
+ Err(CompilerError::Type(format!(
"T10: unknown node/edge type `{}`",
delete.type_name
)))
@@ -435,7 +435,7 @@ fn ensure_no_duplicate_assignment_names(assignments: &[MutationAssignment]) -> R
let mut seen = std::collections::HashSet::new();
for assignment in assignments {
if !seen.insert(&assignment.property) {
- return Err(NanoError::Type(format!(
+ return Err(CompilerError::Type(format!(
"T13: duplicate assignment for property `{}`",
assignment.property
)));
@@ -454,13 +454,13 @@ fn typecheck_mutation_predicate(
.properties
.get(&predicate.property)
.ok_or_else(|| {
- NanoError::Type(format!(
+ CompilerError::Type(format!(
"T11: type `{}` has no property `{}`",
type_name, predicate.property
))
})?;
if matches!(prop_type.scalar, ScalarType::Blob) {
- return Err(NanoError::Type(format!(
+ return Err(CompilerError::Type(format!(
"T11: blob property `{}` cannot be used in WHERE predicates",
predicate.property
)));
@@ -493,7 +493,7 @@ fn typecheck_edge_mutation_predicate(
.properties
.get(&predicate.property)
.ok_or_else(|| {
- NanoError::Type(format!(
+ CompilerError::Type(format!(
"T11: type `{}` has no property `{}`",
type_name, predicate.property
))
@@ -517,7 +517,7 @@ fn check_match_value_type(
MatchValue::Literal(lit) => check_literal_type(lit, expected, property),
MatchValue::Variable(v) => {
let Some(actual) = params.get(v) else {
- return Err(NanoError::Type(format!(
+ return Err(CompilerError::Type(format!(
"T14: mutation variable `${}` must be a declared query parameter",
v
)));
@@ -528,7 +528,7 @@ fn check_match_value_type(
&& matches!(actual.scalar, ScalarType::String)
&& !actual.list);
if !compatible {
- return Err(NanoError::Type(format!(
+ return Err(CompilerError::Type(format!(
"T7: cannot assign/compare {} with {} for property `{}`",
actual.display_name(),
expected.display_name(),
@@ -543,7 +543,7 @@ fn check_match_value_type(
fn check_now_match_value_type(expected: &PropType, property: &str) -> Result<()> {
if expected.list || expected.scalar != ScalarType::DateTime {
- return Err(NanoError::Type(format!(
+ return Err(CompilerError::Type(format!(
"T7: cannot assign/compare DateTime with {} for property `{}`",
expected.display_name(),
property
@@ -597,7 +597,7 @@ fn typecheck_clauses(
}
}
if !has_outer {
- return Err(NanoError::Type(
+ return Err(CompilerError::Type(
"T9: negation block must reference at least one outer-bound variable"
.to_string(),
));
@@ -616,7 +616,7 @@ fn typecheck_binding(
) -> Result<()> {
// T1: binding type must exist in catalog
if !catalog.node_types.contains_key(&binding.type_name) {
- return Err(NanoError::Type(format!(
+ return Err(CompilerError::Type(format!(
"T1: unknown node type `{}`",
binding.type_name
)));
@@ -627,14 +627,14 @@ fn typecheck_binding(
// T2 + T3: property match fields must exist and have correct types
for pm in &binding.prop_matches {
let prop = node_type.properties.get(&pm.prop_name).ok_or_else(|| {
- NanoError::Type(format!(
+ CompilerError::Type(format!(
"T2: type `{}` has no property `{}`",
binding.type_name, pm.prop_name
))
})?;
if matches!(prop.scalar, ScalarType::Blob) {
- return Err(NanoError::Type(format!(
+ return Err(CompilerError::Type(format!(
"T3: blob property `{}.{}` cannot be used in match patterns",
binding.type_name, pm.prop_name
)));
@@ -658,7 +658,7 @@ fn typecheck_binding(
if let Some(existing) = ctx.bindings.get(&binding.variable)
&& existing.type_name != binding.type_name
{
- return Err(NanoError::Type(format!(
+ return Err(CompilerError::Type(format!(
"variable `${}` already bound to type `{}`, cannot rebind to `{}`",
binding.variable, existing.type_name, binding.type_name
)));
@@ -680,7 +680,7 @@ fn check_binding_literal_type(lit: &Literal, expected: &PropType, property: &str
if expected.list {
let lit_type = literal_type(lit)?;
if lit_type.list {
- return Err(NanoError::Type(format!(
+ return Err(CompilerError::Type(format!(
"T3: list equality is not supported for property `{}`; use a scalar value to match list membership",
property
)));
@@ -688,7 +688,7 @@ fn check_binding_literal_type(lit: &Literal, expected: &PropType, property: &str
let expected_member = PropType::scalar(expected.scalar, expected.nullable);
if !types_compatible(&lit_type, &expected_member) {
- return Err(NanoError::Type(format!(
+ return Err(CompilerError::Type(format!(
"T3: property `{}` has type {} but membership match got {}",
property,
expected.display_name(),
@@ -708,7 +708,7 @@ fn check_binding_variable_type(
) -> Result<()> {
if expected.list {
if actual.list {
- return Err(NanoError::Type(format!(
+ return Err(CompilerError::Type(format!(
"T7: list equality is not supported for property `{}`; use a scalar parameter for membership matching",
property
)));
@@ -716,7 +716,7 @@ fn check_binding_variable_type(
let expected_member = PropType::scalar(expected.scalar, expected.nullable);
if !types_compatible(actual, &expected_member) {
- return Err(NanoError::Type(format!(
+ return Err(CompilerError::Type(format!(
"T7: cannot compare {} membership against {} for property `{}`",
actual.display_name(),
expected.display_name(),
@@ -727,7 +727,7 @@ fn check_binding_variable_type(
}
if !types_compatible(actual, expected) {
- return Err(NanoError::Type(format!(
+ return Err(CompilerError::Type(format!(
"T7: cannot assign/compare {} with {} for property `{}`",
actual.display_name(),
expected.display_name(),
@@ -746,23 +746,23 @@ fn typecheck_traversal(
let edge = catalog
.lookup_edge_by_name(&traversal.edge_name)
.ok_or_else(|| {
- NanoError::Type(format!("T4: unknown edge type `{}`", traversal.edge_name))
+ CompilerError::Type(format!("T4: unknown edge type `{}`", traversal.edge_name))
})?;
if traversal.min_hops == 0 {
- return Err(NanoError::Type(
+ return Err(CompilerError::Type(
"T15: traversal min hop bound must be >= 1".to_string(),
));
}
if let Some(max_hops) = traversal.max_hops {
if max_hops < traversal.min_hops {
- return Err(NanoError::Type(format!(
+ return Err(CompilerError::Type(format!(
"T15: invalid traversal bounds {{{},{}}}; max must be >= min",
traversal.min_hops, max_hops
)));
}
} else {
- return Err(NanoError::Type(
+ return Err(CompilerError::Type(
"T15: unbounded traversal is disabled; use bounded traversal {min,max}".to_string(),
));
}
@@ -784,7 +784,7 @@ fn typecheck_traversal(
// dst should be edge.from_type
bind_traversal_endpoint(ctx, &traversal.dst, &edge.from_type, edge)?;
} else {
- return Err(NanoError::Type(format!(
+ return Err(CompilerError::Type(format!(
"T5: variable `${}` has type `{}`, which is not an endpoint of edge `{}: {} -> {}`",
traversal.src, src_bv.type_name, edge.name, edge.from_type, edge.to_type
)));
@@ -798,7 +798,7 @@ fn typecheck_traversal(
direction = Direction::In;
bind_traversal_endpoint(ctx, &traversal.src, &edge.to_type, edge)?;
} else {
- return Err(NanoError::Type(format!(
+ return Err(CompilerError::Type(format!(
"T5: variable `${}` has type `{}`, which is not an endpoint of edge `{}: {} -> {}`",
traversal.dst, dst_bv.type_name, edge.name, edge.from_type, edge.to_type
)));
@@ -833,7 +833,7 @@ fn bind_traversal_endpoint(
}
if let Some(existing) = ctx.bindings.get(var) {
if existing.type_name != expected_type {
- return Err(NanoError::Type(format!(
+ return Err(CompilerError::Type(format!(
"T5: variable `${}` has type `{}` but edge `{}` expects `{}`",
var, existing.type_name, edge.name, expected_type
)));
@@ -863,27 +863,27 @@ fn typecheck_filter(
if let (ResolvedType::Scalar(l), ResolvedType::Scalar(r)) = (&left_type, &right_type) {
if filter.op == CompOp::Contains {
if !l.list {
- return Err(NanoError::Type(format!(
+ return Err(CompilerError::Type(format!(
"T7: contains requires a list property on the left, got {}",
l.display_name()
)));
}
if r.list {
- return Err(NanoError::Type(
+ return Err(CompilerError::Type(
"T7: contains requires a scalar right operand".to_string(),
));
}
if matches!(l.scalar, ScalarType::Vector(_))
|| matches!(r.scalar, ScalarType::Vector(_))
{
- return Err(NanoError::Type(
+ return Err(CompilerError::Type(
"T7: vector membership filters are not supported".to_string(),
));
}
let expected_member = PropType::scalar(l.scalar, l.nullable);
if !types_compatible(&expected_member, r) {
- return Err(NanoError::Type(format!(
+ return Err(CompilerError::Type(format!(
"T7: cannot test membership of {} in {}",
r.display_name(),
l.display_name()
@@ -894,29 +894,29 @@ fn typecheck_filter(
// T7: check type compatibility
if l.list || r.list {
- return Err(NanoError::Type(
+ return Err(CompilerError::Type(
"T7: list comparisons in filters are not supported; use `contains` for list membership".to_string(),
));
}
if matches!(l.scalar, ScalarType::Vector(_)) || matches!(r.scalar, ScalarType::Vector(_)) {
- return Err(NanoError::Type(
+ return Err(CompilerError::Type(
"T7: vector comparisons in filters are not supported".to_string(),
));
}
if matches!(l.scalar, ScalarType::Blob) || matches!(r.scalar, ScalarType::Blob) {
- return Err(NanoError::Type(
+ return Err(CompilerError::Type(
"T7: blob comparisons in filters are not supported".to_string(),
));
}
if !types_compatible(l, r) {
- return Err(NanoError::Type(format!(
+ return Err(CompilerError::Type(format!(
"T7: cannot compare {} with {}",
l.display_name(),
r.display_name()
)));
}
} else {
- return Err(NanoError::Type(format!(
+ return Err(CompilerError::Type(format!(
"T7: filter comparisons require scalar operands, got {} and {}",
left_type.display_name(),
right_type.display_name()
@@ -940,15 +940,15 @@ fn resolve_expr_type(
Expr::PropAccess { variable, property } => {
// T6: variable must be bound and property must exist
let bv = ctx.bindings.get(variable).ok_or_else(|| {
- NanoError::Type(format!("T6: variable `${}` is not bound", variable))
+ CompilerError::Type(format!("T6: variable `${}` is not bound", variable))
})?;
let node_type = catalog.node_types.get(&bv.type_name).ok_or_else(|| {
- NanoError::Type(format!("T6: type `{}` not found in catalog", bv.type_name))
+ CompilerError::Type(format!("T6: type `{}` not found in catalog", bv.type_name))
})?;
let prop = node_type.properties.get(property).ok_or_else(|| {
- NanoError::Type(format!(
+ CompilerError::Type(format!(
"T6: type `{}` has no property `{}`",
bv.type_name, property
))
@@ -962,19 +962,19 @@ fn resolve_expr_type(
query,
} => {
let node_binding = ctx.bindings.get(variable).ok_or_else(|| {
- NanoError::Type(format!("T15: variable `${}` is not bound", variable))
+ CompilerError::Type(format!("T15: variable `${}` is not bound", variable))
})?;
let node_type = catalog
.node_types
.get(&node_binding.type_name)
.ok_or_else(|| {
- NanoError::Type(format!(
+ CompilerError::Type(format!(
"T15: type `{}` not found in catalog",
node_binding.type_name
))
})?;
let prop_type = node_type.properties.get(property).ok_or_else(|| {
- NanoError::Type(format!(
+ CompilerError::Type(format!(
"T15: type `{}` has no property `{}`",
node_binding.type_name, property
))
@@ -982,7 +982,7 @@ fn resolve_expr_type(
let vector_dim = match prop_type.scalar {
ScalarType::Vector(dim) => dim,
_ => {
- return Err(NanoError::Type(format!(
+ return Err(CompilerError::Type(format!(
"T15: nearest requires a Vector property, got {}.{}: {}",
node_binding.type_name,
property,
@@ -991,7 +991,7 @@ fn resolve_expr_type(
}
};
if prop_type.list {
- return Err(NanoError::Type(
+ return Err(CompilerError::Type(
"T15: nearest does not support list-wrapped vectors".to_string(),
));
}
@@ -1000,7 +1000,7 @@ fn resolve_expr_type(
&& let Some(dim) = numeric_vector_literal_dim(lit)
{
if dim != vector_dim {
- return Err(NanoError::Type(format!(
+ return Err(CompilerError::Type(format!(
"T15: nearest vector dimension mismatch: property is Vector({}), query literal has {} elements",
vector_dim, dim
)));
@@ -1019,7 +1019,7 @@ fn resolve_expr_type(
_ => unreachable!(),
};
if qdim != vector_dim {
- return Err(NanoError::Type(format!(
+ return Err(CompilerError::Type(format!(
"T15: nearest vector dimension mismatch: property is Vector({}), query is Vector({})",
vector_dim, qdim
)));
@@ -1029,14 +1029,14 @@ fn resolve_expr_type(
// query-time string embedding is supported by the runtime executor
}
ResolvedType::Scalar(s) => {
- return Err(NanoError::Type(format!(
+ return Err(CompilerError::Type(format!(
"T15: nearest query must be Vector({}) or String, got {}",
vector_dim,
s.display_name()
)));
}
_ => {
- return Err(NanoError::Type(
+ return Err(CompilerError::Type(
"T15: nearest query must be a scalar expression".to_string(),
));
}
@@ -1052,13 +1052,13 @@ fn resolve_expr_type(
match field_type {
ResolvedType::Scalar(s) if s.scalar == ScalarType::String && !s.list => {}
ResolvedType::Scalar(s) => {
- return Err(NanoError::Type(format!(
+ return Err(CompilerError::Type(format!(
"T19: search field must be String, got {}",
s.display_name()
)));
}
_ => {
- return Err(NanoError::Type(
+ return Err(CompilerError::Type(
"T19: search field must be a scalar String expression".to_string(),
));
}
@@ -1068,13 +1068,13 @@ fn resolve_expr_type(
match query_type {
ResolvedType::Scalar(s) if s.scalar == ScalarType::String && !s.list => {}
ResolvedType::Scalar(s) => {
- return Err(NanoError::Type(format!(
+ return Err(CompilerError::Type(format!(
"T19: search query must be String, got {}",
s.display_name()
)));
}
_ => {
- return Err(NanoError::Type(
+ return Err(CompilerError::Type(
"T19: search query must be a scalar String expression".to_string(),
));
}
@@ -1094,13 +1094,13 @@ fn resolve_expr_type(
match field_type {
ResolvedType::Scalar(s) if s.scalar == ScalarType::String && !s.list => {}
ResolvedType::Scalar(s) => {
- return Err(NanoError::Type(format!(
+ return Err(CompilerError::Type(format!(
"T19: fuzzy field must be String, got {}",
s.display_name()
)));
}
_ => {
- return Err(NanoError::Type(
+ return Err(CompilerError::Type(
"T19: fuzzy field must be a scalar String expression".to_string(),
));
}
@@ -1110,13 +1110,13 @@ fn resolve_expr_type(
match query_type {
ResolvedType::Scalar(s) if s.scalar == ScalarType::String && !s.list => {}
ResolvedType::Scalar(s) => {
- return Err(NanoError::Type(format!(
+ return Err(CompilerError::Type(format!(
"T19: fuzzy query must be String, got {}",
s.display_name()
)));
}
_ => {
- return Err(NanoError::Type(
+ return Err(CompilerError::Type(
"T19: fuzzy query must be a scalar String expression".to_string(),
));
}
@@ -1135,13 +1135,13 @@ fn resolve_expr_type(
| ScalarType::U64
) => {}
ResolvedType::Scalar(s) => {
- return Err(NanoError::Type(format!(
+ return Err(CompilerError::Type(format!(
"T19: fuzzy max_edits must be an integer scalar, got {}",
s.display_name()
)));
}
_ => {
- return Err(NanoError::Type(
+ return Err(CompilerError::Type(
"T19: fuzzy max_edits must be an integer scalar expression".to_string(),
));
}
@@ -1158,13 +1158,13 @@ fn resolve_expr_type(
match field_type {
ResolvedType::Scalar(s) if s.scalar == ScalarType::String && !s.list => {}
ResolvedType::Scalar(s) => {
- return Err(NanoError::Type(format!(
+ return Err(CompilerError::Type(format!(
"T20: match_text field must be String, got {}",
s.display_name()
)));
}
_ => {
- return Err(NanoError::Type(
+ return Err(CompilerError::Type(
"T20: match_text field must be a scalar String expression".to_string(),
));
}
@@ -1174,13 +1174,13 @@ fn resolve_expr_type(
match query_type {
ResolvedType::Scalar(s) if s.scalar == ScalarType::String && !s.list => {}
ResolvedType::Scalar(s) => {
- return Err(NanoError::Type(format!(
+ return Err(CompilerError::Type(format!(
"T20: match_text query must be String, got {}",
s.display_name()
)));
}
_ => {
- return Err(NanoError::Type(
+ return Err(CompilerError::Type(
"T20: match_text query must be a scalar String expression".to_string(),
));
}
@@ -1196,13 +1196,13 @@ fn resolve_expr_type(
match field_type {
ResolvedType::Scalar(s) if s.scalar == ScalarType::String && !s.list => {}
ResolvedType::Scalar(s) => {
- return Err(NanoError::Type(format!(
+ return Err(CompilerError::Type(format!(
"T20: bm25 field must be String, got {}",
s.display_name()
)));
}
_ => {
- return Err(NanoError::Type(
+ return Err(CompilerError::Type(
"T20: bm25 field must be a scalar String expression".to_string(),
));
}
@@ -1212,13 +1212,13 @@ fn resolve_expr_type(
match query_type {
ResolvedType::Scalar(s) if s.scalar == ScalarType::String && !s.list => {}
ResolvedType::Scalar(s) => {
- return Err(NanoError::Type(format!(
+ return Err(CompilerError::Type(format!(
"T20: bm25 query must be String, got {}",
s.display_name()
)));
}
_ => {
- return Err(NanoError::Type(
+ return Err(CompilerError::Type(
"T20: bm25 query must be a scalar String expression".to_string(),
));
}
@@ -1235,12 +1235,12 @@ fn resolve_expr_type(
k,
} => {
if !matches!(primary.as_ref(), Expr::Nearest { .. } | Expr::Bm25 { .. }) {
- return Err(NanoError::Type(
+ return Err(CompilerError::Type(
"T21: rrf primary expression must be nearest(...) or bm25(...)".to_string(),
));
}
if !matches!(secondary.as_ref(), Expr::Nearest { .. } | Expr::Bm25 { .. }) {
- return Err(NanoError::Type(
+ return Err(CompilerError::Type(
"T21: rrf secondary expression must be nearest(...) or bm25(...)".to_string(),
));
}
@@ -1252,13 +1252,13 @@ fn resolve_expr_type(
match ty {
ResolvedType::Scalar(s) if s.scalar == ScalarType::F64 && !s.list => {}
ResolvedType::Scalar(s) => {
- return Err(NanoError::Type(format!(
+ return Err(CompilerError::Type(format!(
"T21: rrf rank expressions must evaluate to F64, got {}",
s.display_name()
)));
}
_ => {
- return Err(NanoError::Type(
+ return Err(CompilerError::Type(
"T21: rrf rank expressions must be scalar numeric expressions"
.to_string(),
));
@@ -1279,13 +1279,13 @@ fn resolve_expr_type(
| ScalarType::U64
) => {}
ResolvedType::Scalar(s) => {
- return Err(NanoError::Type(format!(
+ return Err(CompilerError::Type(format!(
"T21: rrf k must be an integer scalar, got {}",
s.display_name()
)));
}
_ => {
- return Err(NanoError::Type(
+ return Err(CompilerError::Type(
"T21: rrf k must be an integer scalar expression".to_string(),
));
}
@@ -1293,7 +1293,7 @@ fn resolve_expr_type(
if let Expr::Literal(Literal::Integer(v)) = k_expr.as_ref()
&& *v <= 0
{
- return Err(NanoError::Type(
+ return Err(CompilerError::Type(
"T21: rrf k must be greater than 0".to_string(),
));
}
@@ -1311,7 +1311,7 @@ fn resolve_expr_type(
} else if let Some(bv) = ctx.bindings.get(name) {
Ok(ResolvedType::Node(bv.type_name.clone()))
} else {
- Err(NanoError::Type(format!(
+ Err(CompilerError::Type(format!(
"variable `${}` is not bound",
name
)))
@@ -1327,7 +1327,7 @@ fn resolve_expr_type(
if let ResolvedType::Scalar(s) = &arg_type
&& (s.list || !s.scalar.is_numeric())
{
- return Err(NanoError::Type(format!(
+ return Err(CompilerError::Type(format!(
"T8: {} requires numeric type, got {}",
func,
s.display_name()
@@ -1338,7 +1338,7 @@ fn resolve_expr_type(
if let ResolvedType::Scalar(s) = &arg_type
&& (s.list || (!s.scalar.is_numeric() && s.scalar != ScalarType::String))
{
- return Err(NanoError::Type(format!(
+ return Err(CompilerError::Type(format!(
"T8: {} requires numeric or string type, got {}",
func,
s.display_name()
@@ -1420,7 +1420,7 @@ fn resolved_type_to_field_shape(
ResolvedType::Scalar(prop_type) => Ok((prop_type.to_arrow(), prop_type.nullable)),
ResolvedType::Node(type_name) => {
let node_type = catalog.node_types.get(type_name).ok_or_else(|| {
- NanoError::Type(format!("type `{}` not found in catalog", type_name))
+ CompilerError::Type(format!("type `{}` not found in catalog", type_name))
})?;
let fields: Vec = node_type
.arrow_schema
@@ -1450,14 +1450,14 @@ fn literal_type(lit: &Literal) -> Result {
}
let first = literal_type(&items[0])?;
if first.list {
- return Err(NanoError::Type(
+ return Err(CompilerError::Type(
"nested list literals are not supported".to_string(),
));
}
for item in items.iter().skip(1) {
let item_type = literal_type(item)?;
if item_type.list || !types_compatible(&first, &item_type) {
- return Err(NanoError::Type(
+ return Err(CompilerError::Type(
"list literal elements must share a compatible scalar type".to_string(),
));
}
@@ -1473,7 +1473,7 @@ fn check_literal_type(lit: &Literal, expected: &PropType, prop_name: &str) -> Re
return if expected.nullable {
Ok(())
} else {
- Err(NanoError::Type(format!(
+ Err(CompilerError::Type(format!(
"T3: property `{}` is non-nullable but got null",
prop_name
)))
@@ -1487,7 +1487,7 @@ fn check_literal_type(lit: &Literal, expected: &PropType, prop_name: &str) -> Re
if actual_dim == expected_dim {
return Ok(());
}
- return Err(NanoError::Type(format!(
+ return Err(CompilerError::Type(format!(
"T3: property `{}` has type Vector({}) but got vector literal with {} elements",
prop_name, expected_dim, actual_dim
)));
@@ -1495,7 +1495,7 @@ fn check_literal_type(lit: &Literal, expected: &PropType, prop_name: &str) -> Re
let lit_type = literal_type(lit)?;
if !types_compatible(&lit_type, expected) {
- return Err(NanoError::Type(format!(
+ return Err(CompilerError::Type(format!(
"T3: property `{}` has type {} but got {}",
prop_name,
expected.display_name(),
@@ -1507,7 +1507,7 @@ fn check_literal_type(lit: &Literal, expected: &PropType, prop_name: &str) -> Re
match lit {
Literal::String(v) => {
if !allowed.contains(v) {
- return Err(NanoError::Type(format!(
+ return Err(CompilerError::Type(format!(
"T3: property `{}` expects one of [{}], got '{}'",
prop_name,
allowed.join(", "),
@@ -1520,7 +1520,7 @@ fn check_literal_type(lit: &Literal, expected: &PropType, prop_name: &str) -> Re
match item {
Literal::String(v) if allowed.contains(v) => {}
Literal::String(v) => {
- return Err(NanoError::Type(format!(
+ return Err(CompilerError::Type(format!(
"T3: property `{}` expects one of [{}], got '{}'",
prop_name,
allowed.join(", "),
diff --git a/crates/omnigraph-compiler/src/query_input.rs b/crates/omnigraph-compiler/src/query_input.rs
index 5d48e53..77b569e 100644
--- a/crates/omnigraph-compiler/src/query_input.rs
+++ b/crates/omnigraph-compiler/src/query_input.rs
@@ -3,7 +3,7 @@ use std::fmt;
use serde_json::Value;
-use crate::error::NanoError;
+use crate::error::CompilerError;
use crate::ir::ParamMap;
use crate::json_output::{JS_MAX_SAFE_INTEGER_U64, is_js_safe_integer_i64};
use crate::query::ast::{Literal, Param, QueryDecl};
@@ -17,7 +17,7 @@ pub enum JsonParamMode {
#[derive(Debug)]
pub enum RunInputError {
- Core(NanoError),
+ Core(CompilerError),
Message(String),
}
@@ -45,8 +45,8 @@ impl Error for RunInputError {
}
}
-impl From for RunInputError {
- fn from(value: NanoError) -> Self {
+impl From for RunInputError {
+ fn from(value: CompilerError) -> Self {
Self::Core(value)
}
}
@@ -120,7 +120,7 @@ impl ToParam for i64 {
impl ToParam for isize {
fn to_param(self) -> crate::error::Result {
let value = i64::try_from(self).map_err(|_| {
- NanoError::Execution(format!(
+ CompilerError::Execution(format!(
"param value {} exceeds current engine range for numeric literals (max {})",
self,
i64::MAX
@@ -151,7 +151,7 @@ impl ToParam for u32 {
impl ToParam for u64 {
fn to_param(self) -> crate::error::Result {
let value = i64::try_from(self).map_err(|_| {
- NanoError::Execution(format!(
+ CompilerError::Execution(format!(
"param value {} exceeds current engine range for numeric literals (max {})",
self,
i64::MAX
@@ -164,7 +164,7 @@ impl ToParam for u64 {
impl ToParam for usize {
fn to_param(self) -> crate::error::Result {
let value = i64::try_from(self).map_err(|_| {
- NanoError::Execution(format!(
+ CompilerError::Execution(format!(
"param value {} exceeds current engine range for numeric literals (max {})",
self,
i64::MAX
@@ -177,7 +177,7 @@ impl ToParam for usize {
impl ToParam for f32 {
fn to_param(self) -> crate::error::Result {
if !self.is_finite() {
- return Err(NanoError::Execution(format!(
+ return Err(CompilerError::Execution(format!(
"invalid float parameter {}",
self
)));
@@ -189,7 +189,7 @@ impl ToParam for f32 {
impl ToParam for f64 {
fn to_param(self) -> crate::error::Result {
if !self.is_finite() {
- return Err(NanoError::Execution(format!(
+ return Err(CompilerError::Execution(format!(
"invalid float parameter {}",
self
)));
diff --git a/crates/omnigraph-compiler/src/result.rs b/crates/omnigraph-compiler/src/result.rs
index 7de77ac..d92dd1e 100644
--- a/crates/omnigraph-compiler/src/result.rs
+++ b/crates/omnigraph-compiler/src/result.rs
@@ -5,7 +5,7 @@ use arrow_ipc::writer::StreamWriter;
use arrow_schema::{DataType, Field, Schema, SchemaRef};
use serde::de::DeserializeOwned;
-use crate::error::{NanoError, Result};
+use crate::error::{CompilerError, Result};
use crate::json_output::{record_batches_to_json_rows, record_batches_to_rust_json_rows};
#[derive(Debug, Clone, Copy, Default)]
@@ -47,7 +47,7 @@ impl QueryResult {
}
arrow_select::concat::concat_batches(&self.schema, &self.batches)
- .map_err(|err| NanoError::Execution(err.to_string()))
+ .map_err(|err| CompilerError::Execution(err.to_string()))
}
pub fn to_sdk_json(&self) -> serde_json::Value {
@@ -60,7 +60,7 @@ impl QueryResult {
pub fn deserialize(&self) -> Result {
serde_json::from_value(self.to_rust_json()).map_err(|err| {
- NanoError::Execution(format!("failed to deserialize query result: {}", err))
+ CompilerError::Execution(format!("failed to deserialize query result: {}", err))
})
}
diff --git a/crates/omnigraph-compiler/src/schema/parser.rs b/crates/omnigraph-compiler/src/schema/parser.rs
index c5f4355..6e34e53 100644
--- a/crates/omnigraph-compiler/src/schema/parser.rs
+++ b/crates/omnigraph-compiler/src/schema/parser.rs
@@ -5,7 +5,7 @@ use pest::error::InputLocation;
use pest_derive::Parser;
use crate::error::{
- NanoError, ParseDiagnostic, Result, SourceSpan, decode_string_literal, render_span,
+ CompilerError, ParseDiagnostic, Result, SourceSpan, decode_string_literal, render_span,
};
use crate::types::{PropType, ScalarType};
@@ -16,7 +16,7 @@ use super::ast::*;
struct SchemaParser;
pub fn parse_schema(input: &str) -> Result {
- parse_schema_diagnostic(input).map_err(|e| NanoError::Parse(e.to_string()))
+ parse_schema_diagnostic(input).map_err(|e| CompilerError::Parse(e.to_string()))
}
pub fn parse_schema_diagnostic(input: &str) -> std::result::Result {
@@ -27,7 +27,8 @@ pub fn parse_schema_diagnostic(input: &str) -> std::result::Result std::result::Result = interfaces.iter().collect();
for decl in &mut declarations {
if let SchemaDecl::Node(node) = decl {
- resolve_interfaces(node, &iface_refs).map_err(nano_error_to_diagnostic)?;
+ resolve_interfaces(node, &iface_refs).map_err(compiler_error_to_diagnostic)?;
}
}
let schema = SchemaFile { declarations };
- validate_schema_annotations(&schema).map_err(nano_error_to_diagnostic)?;
- validate_constraints(&schema).map_err(nano_error_to_diagnostic)?;
+ validate_schema_annotations(&schema).map_err(compiler_error_to_diagnostic)?;
+ validate_constraints(&schema).map_err(compiler_error_to_diagnostic)?;
Ok(schema)
}
@@ -64,7 +65,7 @@ fn pest_error_to_diagnostic(err: pest::error::Error) -> ParseDiagnostic {
ParseDiagnostic::new(err.to_string(), span)
}
-fn nano_error_to_diagnostic(err: NanoError) -> ParseDiagnostic {
+fn compiler_error_to_diagnostic(err: CompilerError) -> ParseDiagnostic {
ParseDiagnostic::new(err.to_string(), None)
}
@@ -74,7 +75,7 @@ fn parse_schema_decl(pair: pest::iterators::Pair) -> Result {
Rule::interface_decl => Ok(SchemaDecl::Interface(parse_interface_decl(inner)?)),
Rule::node_decl => Ok(SchemaDecl::Node(parse_node_decl(inner)?)),
Rule::edge_decl => Ok(SchemaDecl::Edge(parse_edge_decl(inner)?)),
- _ => Err(NanoError::Parse(format!(
+ _ => Err(CompilerError::Parse(format!(
"unexpected rule: {:?}",
inner.as_rule()
))),
@@ -180,21 +181,20 @@ fn parse_cardinality(pair: pest::iterators::Pair) -> Result {
let min_str = inner.next().unwrap().as_str();
let min = min_str
.parse::()
- .map_err(|_| NanoError::Parse(format!("invalid cardinality min: {}", min_str)))?;
- let max = if let Some(max_pair) = inner.next() {
- let max_str = max_pair.as_str();
- Some(
- max_str
- .parse::()
- .map_err(|_| NanoError::Parse(format!("invalid cardinality max: {}", max_str)))?,
- )
- } else {
- None
- };
+ .map_err(|_| CompilerError::Parse(format!("invalid cardinality min: {}", min_str)))?;
+ let max =
+ if let Some(max_pair) = inner.next() {
+ let max_str = max_pair.as_str();
+ Some(max_str.parse::().map_err(|_| {
+ CompilerError::Parse(format!("invalid cardinality max: {}", max_str))
+ })?)
+ } else {
+ None
+ };
if let Some(max_val) = max {
if min > max_val {
- return Err(NanoError::Parse(format!(
+ return Err(CompilerError::Parse(format!(
"cardinality min ({}) exceeds max ({})",
min, max_val
)));
@@ -219,7 +219,7 @@ fn parse_body_constraint(pair: pest::iterators::Pair) -> Result>>()?;
if names.is_empty() {
- return Err(NanoError::Parse(
+ return Err(CompilerError::Parse(
"@key constraint requires at least one property name".to_string(),
));
}
@@ -228,7 +228,7 @@ fn parse_body_constraint(pair: pest::iterators::Pair) -> Result {
let names = extract_ident_list_from_args(args)?;
if names.is_empty() {
- return Err(NanoError::Parse(
+ return Err(CompilerError::Parse(
"@unique constraint requires at least one property name".to_string(),
));
}
@@ -237,7 +237,7 @@ fn parse_body_constraint(pair: pest::iterators::Pair) -> Result {
let names = extract_ident_list_from_args(args)?;
if names.is_empty() {
- return Err(NanoError::Parse(
+ return Err(CompilerError::Parse(
"@index constraint requires at least one property name".to_string(),
));
}
@@ -246,7 +246,7 @@ fn parse_body_constraint(pair: pest::iterators::Pair) -> Result {
// @range(prop, min..max)
if args.len() < 2 {
- return Err(NanoError::Parse(
+ return Err(CompilerError::Parse(
"@range requires property name and bounds: @range(prop, min..max)".to_string(),
));
}
@@ -258,7 +258,7 @@ fn parse_body_constraint(pair: pest::iterators::Pair) -> Result {
// @check(prop, "regex")
if args.len() < 2 {
- return Err(NanoError::Parse(
+ return Err(CompilerError::Parse(
"@check requires property name and pattern: @check(prop, \"regex\")"
.to_string(),
));
@@ -267,7 +267,10 @@ fn parse_body_constraint(pair: pest::iterators::Pair) -> Result Err(NanoError::Parse(format!("unknown constraint: @{}", other))),
+ other => Err(CompilerError::Parse(format!(
+ "unknown constraint: @{}",
+ other
+ ))),
}
}
@@ -281,7 +284,7 @@ fn extract_ident_from_constraint_arg(pair: pest::iterators::Pair) -> Resul
return Ok(inner.as_str().to_string());
}
}
- Err(NanoError::Parse(
+ Err(CompilerError::Parse(
"expected property name in constraint".to_string(),
))
}
@@ -309,7 +312,7 @@ fn extract_string_from_constraint_arg(pair: &pest::iterators::Pair) -> Res
}
find_string(pair)?
- .ok_or_else(|| NanoError::Parse("expected string argument in constraint".to_string()))
+ .ok_or_else(|| CompilerError::Parse("expected string argument in constraint".to_string()))
}
fn extract_range_bounds(
@@ -327,7 +330,9 @@ fn extract_range_bounds(
}
}
found.ok_or_else(|| {
- NanoError::Parse("expected range bounds (min..max) in @range constraint".to_string())
+ CompilerError::Parse(
+ "expected range bounds (min..max) in @range constraint".to_string(),
+ )
})?
};
@@ -378,7 +383,7 @@ fn parse_constraint_bound(pair: &pest::iterators::Pair) -> Result Res
for iface_name in &node.implements {
let iface = interface_map.get(iface_name.as_str()).ok_or_else(|| {
- NanoError::Parse(format!(
+ CompilerError::Parse(format!(
"node {} implements unknown interface '{}'",
node.name, iface_name
))
@@ -421,7 +426,7 @@ fn resolve_interfaces(node: &mut NodeDecl, interfaces: &[&InterfaceDecl]) -> Res
if let Some(existing) = node.properties.iter().find(|p| p.name == iface_prop.name) {
// Property exists — verify type compatibility
if existing.prop_type != iface_prop.prop_type {
- return Err(NanoError::Parse(format!(
+ return Err(CompilerError::Parse(format!(
"node {} property '{}' has type {} but interface {} declares it as {}",
node.name,
iface_prop.name,
@@ -472,36 +477,35 @@ fn parse_type_ref(pair: pest::iterators::Pair) -> Result {
let mut inner = pair
.into_inner()
.next()
- .ok_or_else(|| NanoError::Parse("type reference is missing core type".to_string()))?;
+ .ok_or_else(|| CompilerError::Parse("type reference is missing core type".to_string()))?;
if inner.as_rule() == Rule::core_type {
- inner = inner
- .into_inner()
- .next()
- .ok_or_else(|| NanoError::Parse("type reference is missing core type".to_string()))?;
+ inner = inner.into_inner().next().ok_or_else(|| {
+ CompilerError::Parse("type reference is missing core type".to_string())
+ })?;
}
match inner.as_rule() {
Rule::base_type => {
let scalar = ScalarType::from_str_name(inner.as_str())
- .ok_or_else(|| NanoError::Parse(format!("unknown type: {}", inner.as_str())))?;
+ .ok_or_else(|| CompilerError::Parse(format!("unknown type: {}", inner.as_str())))?;
Ok(PropType::scalar(scalar, nullable))
}
Rule::vector_type => {
let dim_text = inner
.into_inner()
.next()
- .ok_or_else(|| NanoError::Parse("Vector type missing dimension".to_string()))?
+ .ok_or_else(|| CompilerError::Parse("Vector type missing dimension".to_string()))?
.as_str();
let dim = dim_text
.parse::()
- .map_err(|e| NanoError::Parse(format!("invalid Vector dimension: {}", e)))?;
+ .map_err(|e| CompilerError::Parse(format!("invalid Vector dimension: {}", e)))?;
if dim == 0 {
- return Err(NanoError::Parse(
+ return Err(CompilerError::Parse(
"Vector dimension must be greater than zero".to_string(),
));
}
if dim > i32::MAX as u32 {
- return Err(NanoError::Parse(format!(
+ return Err(CompilerError::Parse(format!(
"Vector dimension {} exceeds maximum supported {}",
dim,
i32::MAX
@@ -510,15 +514,14 @@ fn parse_type_ref(pair: pest::iterators::Pair) -> Result {
Ok(PropType::scalar(ScalarType::Vector(dim), nullable))
}
Rule::list_type => {
- let element = inner
- .into_inner()
- .next()
- .ok_or_else(|| NanoError::Parse("list type missing element type".to_string()))?;
+ let element = inner.into_inner().next().ok_or_else(|| {
+ CompilerError::Parse("list type missing element type".to_string())
+ })?;
let scalar = ScalarType::from_str_name(element.as_str()).ok_or_else(|| {
- NanoError::Parse(format!("unknown list element type: {}", element.as_str()))
+ CompilerError::Parse(format!("unknown list element type: {}", element.as_str()))
})?;
if matches!(scalar, ScalarType::Blob) {
- return Err(NanoError::Parse(
+ return Err(CompilerError::Parse(
"list of Blob is not supported".to_string(),
));
}
@@ -532,7 +535,7 @@ fn parse_type_ref(pair: pest::iterators::Pair) -> Result {
}
}
if values.is_empty() {
- return Err(NanoError::Parse(
+ return Err(CompilerError::Parse(
"enum type must include at least one value".to_string(),
));
}
@@ -540,13 +543,13 @@ fn parse_type_ref(pair: pest::iterators::Pair) -> Result {
dedup.sort();
dedup.dedup();
if dedup.len() != values.len() {
- return Err(NanoError::Parse(
+ return Err(CompilerError::Parse(
"enum type cannot include duplicate values".to_string(),
));
}
Ok(PropType::enum_type(values, nullable))
}
- other => Err(NanoError::Parse(format!(
+ other => Err(CompilerError::Parse(format!(
"unexpected type rule: {:?}",
other
))),
@@ -595,19 +598,19 @@ fn validate_string_annotation(
continue;
}
if seen {
- return Err(NanoError::Parse(format!(
+ return Err(CompilerError::Parse(format!(
"{} declares @{} multiple times",
target, annotation
)));
}
let value = ann.value.as_deref().ok_or_else(|| {
- NanoError::Parse(format!(
+ CompilerError::Parse(format!(
"@{} on {} requires a non-empty value",
annotation, target
))
})?;
if value.trim().is_empty() {
- return Err(NanoError::Parse(format!(
+ return Err(CompilerError::Parse(format!(
"@{} on {} requires a non-empty value",
annotation, target
)));
@@ -631,7 +634,7 @@ fn validate_schema_annotations(schema: &SchemaFile) -> Result<()> {
|| ann.name == "index"
|| ann.name == "embed"
{
- return Err(NanoError::Parse(format!(
+ return Err(CompilerError::Parse(format!(
"@{} is only supported on node properties or as body constraint (node {})",
ann.name, node.name
)));
@@ -660,7 +663,7 @@ fn validate_schema_annotations(schema: &SchemaFile) -> Result<()> {
|| ann.name == "index"
|| ann.name == "embed"
{
- return Err(NanoError::Parse(format!(
+ return Err(CompilerError::Parse(format!(
"@{} is not supported on edges (edge {})",
ann.name, edge.name
)));
@@ -714,13 +717,13 @@ fn validate_property_annotations(
|| ann.name == "index"
|| ann.name == "embed")
{
- return Err(NanoError::Parse(format!(
+ return Err(CompilerError::Parse(format!(
"@{} is not supported on list property {}.{}",
ann.name, type_name, prop.name
)));
}
if is_vector && (ann.name == "key" || ann.name == "unique") {
- return Err(NanoError::Parse(format!(
+ return Err(CompilerError::Parse(format!(
"@{} is not supported on vector property {}.{}",
ann.name, type_name, prop.name
)));
@@ -731,13 +734,13 @@ fn validate_property_annotations(
|| ann.name == "index"
|| ann.name == "embed")
{
- return Err(NanoError::Parse(format!(
+ return Err(CompilerError::Parse(format!(
"@{} is not supported on blob property {}.{}",
ann.name, type_name, prop.name
)));
}
if ann.name == "instruction" {
- return Err(NanoError::Parse(format!(
+ return Err(CompilerError::Parse(format!(
"@instruction is only supported on node and edge types (property {}.{})",
type_name, prop.name
)));
@@ -745,7 +748,7 @@ fn validate_property_annotations(
// Edge-specific restrictions
if is_edge && (ann.name == "key" || ann.name == "embed") {
- return Err(NanoError::Parse(format!(
+ return Err(CompilerError::Parse(format!(
"@{} is not supported on edge properties (edge {}.{})",
ann.name, type_name, prop.name
)));
@@ -755,13 +758,13 @@ fn validate_property_annotations(
match ann.name.as_str() {
"key" => {
if ann.value.is_some() {
- return Err(NanoError::Parse(format!(
+ return Err(CompilerError::Parse(format!(
"@key on {}.{} does not accept a value",
type_name, prop.name
)));
}
if key_seen {
- return Err(NanoError::Parse(format!(
+ return Err(CompilerError::Parse(format!(
"property {}.{} declares @key multiple times",
type_name, prop.name
)));
@@ -770,13 +773,13 @@ fn validate_property_annotations(
}
"unique" => {
if ann.value.is_some() {
- return Err(NanoError::Parse(format!(
+ return Err(CompilerError::Parse(format!(
"@unique on {}.{} does not accept a value",
type_name, prop.name
)));
}
if unique_seen {
- return Err(NanoError::Parse(format!(
+ return Err(CompilerError::Parse(format!(
"property {}.{} declares @unique multiple times",
type_name, prop.name
)));
@@ -785,13 +788,13 @@ fn validate_property_annotations(
}
"index" => {
if ann.value.is_some() {
- return Err(NanoError::Parse(format!(
+ return Err(CompilerError::Parse(format!(
"@index on {}.{} does not accept a value",
type_name, prop.name
)));
}
if index_seen {
- return Err(NanoError::Parse(format!(
+ return Err(CompilerError::Parse(format!(
"property {}.{} declares @index multiple times",
type_name, prop.name
)));
@@ -800,7 +803,7 @@ fn validate_property_annotations(
}
"embed" => {
if embed_seen {
- return Err(NanoError::Parse(format!(
+ return Err(CompilerError::Parse(format!(
"property {}.{} declares @embed multiple times",
type_name, prop.name
)));
@@ -808,20 +811,20 @@ fn validate_property_annotations(
embed_seen = true;
if !is_vector {
- return Err(NanoError::Parse(format!(
+ return Err(CompilerError::Parse(format!(
"@embed is only supported on vector properties ({}.{})",
type_name, prop.name
)));
}
let source_prop = ann.value.as_deref().ok_or_else(|| {
- NanoError::Parse(format!(
+ CompilerError::Parse(format!(
"@embed on {}.{} requires a source property name",
type_name, prop.name
))
})?;
if source_prop.trim().is_empty() {
- return Err(NanoError::Parse(format!(
+ return Err(CompilerError::Parse(format!(
"@embed on {}.{} requires a non-empty source property name",
type_name, prop.name
)));
@@ -831,14 +834,14 @@ fn validate_property_annotations(
.iter()
.find(|p| p.name == source_prop)
.ok_or_else(|| {
- NanoError::Parse(format!(
+ CompilerError::Parse(format!(
"@embed on {}.{} references unknown source property {}",
type_name, prop.name, source_prop
))
})?;
if source_decl.prop_type.list || source_decl.prop_type.scalar != ScalarType::String
{
- return Err(NanoError::Parse(format!(
+ return Err(CompilerError::Parse(format!(
"@embed source property {}.{} must be String",
type_name, source_prop
)));
@@ -848,7 +851,7 @@ fn validate_property_annotations(
// a typo can't be silently ignored (it would never validate).
for key in ann.kwargs.keys() {
if key != "model" {
- return Err(NanoError::Parse(format!(
+ return Err(CompilerError::Parse(format!(
"@embed on {}.{} has unknown argument '{}=' (only 'model' is supported)",
type_name, prop.name, key
)));
@@ -893,45 +896,45 @@ fn validate_type_constraints(
match constraint {
Constraint::Key(cols) => {
if is_edge {
- return Err(NanoError::Parse(format!(
+ return Err(CompilerError::Parse(format!(
"@key constraint is not supported on edges (edge {})",
type_name
)));
}
key_count += 1;
if key_count > 1 {
- return Err(NanoError::Parse(format!(
+ return Err(CompilerError::Parse(format!(
"node type {} has multiple @key constraints; only one is supported",
type_name
)));
}
for col in cols {
let prop = prop_names.get(col.as_str()).ok_or_else(|| {
- NanoError::Parse(format!(
+ CompilerError::Parse(format!(
"@key on {} references unknown property '{}'",
type_name, col
))
})?;
if prop.prop_type.nullable {
- return Err(NanoError::Parse(format!(
+ return Err(CompilerError::Parse(format!(
"@key property {}.{} cannot be nullable",
type_name, col
)));
}
if prop.prop_type.list {
- return Err(NanoError::Parse(format!(
+ return Err(CompilerError::Parse(format!(
"@key is not supported on list property {}.{}",
type_name, col
)));
}
if matches!(prop.prop_type.scalar, ScalarType::Vector(_)) {
- return Err(NanoError::Parse(format!(
+ return Err(CompilerError::Parse(format!(
"@key is not supported on vector property {}.{}",
type_name, col
)));
}
if matches!(prop.prop_type.scalar, ScalarType::Blob) {
- return Err(NanoError::Parse(format!(
+ return Err(CompilerError::Parse(format!(
"@key is not supported on blob property {}.{}",
type_name, col
)));
@@ -945,7 +948,7 @@ fn validate_type_constraints(
continue;
}
if !prop_names.contains_key(col.as_str()) {
- return Err(NanoError::Parse(format!(
+ return Err(CompilerError::Parse(format!(
"@unique on {} references unknown property '{}'",
type_name, col
)));
@@ -958,13 +961,13 @@ fn validate_type_constraints(
continue;
}
let prop = prop_names.get(col.as_str()).ok_or_else(|| {
- NanoError::Parse(format!(
+ CompilerError::Parse(format!(
"@index on {} references unknown property '{}'",
type_name, col
))
})?;
if matches!(prop.prop_type.scalar, ScalarType::Blob) {
- return Err(NanoError::Parse(format!(
+ return Err(CompilerError::Parse(format!(
"@index is not supported on blob property {}.{}",
type_name, col
)));
@@ -973,19 +976,19 @@ fn validate_type_constraints(
}
Constraint::Range { property, .. } => {
if is_edge {
- return Err(NanoError::Parse(format!(
+ return Err(CompilerError::Parse(format!(
"@range constraint is not supported on edges (edge {})",
type_name
)));
}
let prop = prop_names.get(property.as_str()).ok_or_else(|| {
- NanoError::Parse(format!(
+ CompilerError::Parse(format!(
"@range on {} references unknown property '{}'",
type_name, property
))
})?;
if !prop.prop_type.scalar.is_numeric() {
- return Err(NanoError::Parse(format!(
+ return Err(CompilerError::Parse(format!(
"@range on {}.{} requires a numeric type, got {}",
type_name,
property,
@@ -995,19 +998,19 @@ fn validate_type_constraints(
}
Constraint::Check { property, .. } => {
if is_edge {
- return Err(NanoError::Parse(format!(
+ return Err(CompilerError::Parse(format!(
"@check constraint is not supported on edges (edge {})",
type_name
)));
}
let prop = prop_names.get(property.as_str()).ok_or_else(|| {
- NanoError::Parse(format!(
+ CompilerError::Parse(format!(
"@check on {} references unknown property '{}'",
type_name, property
))
})?;
if prop.prop_type.scalar != ScalarType::String {
- return Err(NanoError::Parse(format!(
+ return Err(CompilerError::Parse(format!(
"@check on {}.{} requires String type, got {}",
type_name,
property,
diff --git a/crates/omnigraph-server/src/lib.rs b/crates/omnigraph-server/src/lib.rs
index 80b24e6..523bc01 100644
--- a/crates/omnigraph-server/src/lib.rs
+++ b/crates/omnigraph-server/src/lib.rs
@@ -2,9 +2,9 @@ pub mod api;
mod handlers;
mod mcp;
mod settings;
-pub use settings::{load_server_settings, classify_server_runtime_state, ServerRuntimeState};
-use settings::*;
use handlers::*;
+use settings::*;
+pub use settings::{ServerRuntimeState, classify_server_runtime_state, load_server_settings};
pub mod auth;
pub mod graph_id;
pub mod identity;
@@ -30,10 +30,10 @@ use api::{
BranchCreateOutput, BranchCreateRequest, BranchDeleteOutput, BranchListOutput,
BranchMergeOutput, BranchMergeRequest, ChangeOutput, ChangeRequest, CommitListOutput,
CommitListQuery, ErrorCode, ErrorOutput, ExportRequest, GraphInfo, GraphListResponse,
- HealthOutput, IngestOutput, IngestRequest, InvokeStoredQueryRequest,
- InvokeStoredQueryResponse, QueriesCatalogOutput, QueryRequest, ReadOutput, ReadRequest,
- SchemaApplyOutput, SchemaApplyRequest, SchemaOutput, SnapshotQuery, ingest_output,
- schema_apply_output, snapshot_payload,
+ HealthOutput, IngestOutput, IngestRequest, InvokeStoredQueryRequest, InvokeStoredQueryResponse,
+ QueriesCatalogOutput, QueryRequest, ReadOutput, ReadRequest, SchemaApplyOutput,
+ SchemaApplyRequest, SchemaOutput, SnapshotQuery, ingest_output, schema_apply_output,
+ snapshot_payload,
};
pub use auth::{AWS_SECRET_ENV, EnvOrFileTokenSource, TokenSource, resolve_token_source};
use axum::body::{Body, Bytes};
@@ -167,6 +167,10 @@ pub struct ServerConfig {
/// who set up auth and forgot the policy file would otherwise ship
/// the illusion of protection.
pub allow_unauthenticated: bool,
+ /// Operator opt-in for fail-fast cluster boot. By default, graph-local
+ /// startup failures quarantine that graph and healthy graphs still serve.
+ /// When true, any quarantined or failed graph aborts startup.
+ pub require_all_graphs: bool,
}
/// What `load_server_settings` produces. RFC-011 cluster-only: the
@@ -316,7 +320,14 @@ impl AppState {
) -> Self {
let bearer_tokens = hash_bearer_tokens(bearer_tokens);
let per_graph_policy = policy_engine.map(Arc::new);
- Self::build_single_mode(uri, db, bearer_tokens, per_graph_policy, Arc::new(workload), None)
+ Self::build_single_mode(
+ uri,
+ db,
+ bearer_tokens,
+ per_graph_policy,
+ Arc::new(workload),
+ None,
+ )
}
/// Like `new_single`, but attaches a pre-validated stored-query
@@ -433,13 +444,8 @@ impl AppState {
bearer_tokens: Vec<(String, String)>,
policy_file: Option<&PathBuf>,
) -> Result {
- Self::open_single_with_queries(
- uri,
- bearer_tokens,
- policy_file,
- QueryRegistry::default(),
- )
- .await
+ Self::open_single_with_queries(uri, bearer_tokens, policy_file, QueryRegistry::default())
+ .await
}
/// Single-mode boot with a stored-query registry: open the engine,
@@ -522,8 +528,7 @@ impl AppState {
// reserved id `default` — both the registry key and the URL
// segment (`/graphs/default/...`).
let uri = normalize_root_uri(&uri).unwrap_or(uri);
- let graph_id =
- GraphId::try_from("default").expect("'default' is a valid GraphId");
+ let graph_id = GraphId::try_from("default").expect("'default' is a valid GraphId");
let key = GraphKey::cluster(graph_id);
let handle = Arc::new(GraphHandle {
key,
@@ -950,15 +955,21 @@ pub fn build_app(state: AppState) -> Router {
// flagged and their responses include RFC 9745 Deprecation +
// RFC 8288 Link headers. Suppress the call-site warning for the
// route registration itself.
- .route("/read", post({
- #[allow(deprecated)]
- server_read
- }))
+ .route(
+ "/read",
+ post({
+ #[allow(deprecated)]
+ server_read
+ }),
+ )
.route("/query", post(server_query))
- .route("/change", post({
- #[allow(deprecated)]
- server_change
- }))
+ .route(
+ "/change",
+ post({
+ #[allow(deprecated)]
+ server_change
+ }),
+ )
.route("/mutate", post(server_mutate))
.route("/queries", get(server_list_queries))
.route("/queries/{name}", post(server_invoke_query))
@@ -1080,7 +1091,14 @@ pub async fn serve(config: ServerConfig) -> Result<()> {
config = %config_path.display(),
"serving omnigraph"
);
- open_multi_graph_state(graphs, tokens, server_policy.as_ref(), config_path).await?
+ open_multi_graph_state(
+ graphs,
+ tokens,
+ server_policy.as_ref(),
+ config_path,
+ config.require_all_graphs,
+ )
+ .await?
}
};
@@ -1109,9 +1127,9 @@ fn load_graph_policy(source: &PolicySource, graph_id: &str) -> Result,
server_policy_source: Option<&PolicySource>,
config_path: PathBuf,
+ require_all_graphs: bool,
) -> Result {
- use futures::{StreamExt, TryStreamExt};
+ use futures::StreamExt;
if graphs.is_empty() {
bail!("multi-graph mode requires at least one graph in the `graphs:` map");
@@ -1134,21 +1153,48 @@ pub async fn open_multi_graph_state(
// `Omnigraph::Server::"root"` entity at evaluation time.
let server_policy = match server_policy_source {
Some(PolicySource::File(path)) => Some(PolicyEngine::load_server(path)?),
- Some(PolicySource::Inline(source)) => {
- Some(PolicyEngine::load_server_from_source(source)?)
- }
+ Some(PolicySource::Inline(source)) => Some(PolicyEngine::load_server_from_source(source)?),
None => None,
};
- // `try_collect` propagates the first error eagerly, dropping every
- // in-flight open. `buffer_unordered + collect::>` would drain
- // the stream before checking errors — incorrect for the docstring's
- // "fail-fast" claim and wasteful on S3-backed graphs.
- let handles: Vec> = futures::stream::iter(graphs.into_iter())
- .map(|cfg| async move { open_single_graph(cfg).await })
+ let configured_graphs = graphs.len();
+ let results = futures::stream::iter(graphs.into_iter())
+ .map(|cfg| async move {
+ let graph_id = cfg.graph_id.clone();
+ open_single_graph(cfg).await.map_err(|err| (graph_id, err))
+ })
.buffer_unordered(4)
- .try_collect()
- .await?;
+ .collect::>()
+ .await;
+ let mut handles = Vec::new();
+ let mut failed = 0usize;
+ for result in results {
+ match result {
+ Ok(handle) => handles.push(handle),
+ Err((graph_id, err)) => {
+ failed += 1;
+ warn!(
+ graph_id = %graph_id,
+ error = %err,
+ "graph quarantined during startup"
+ );
+ }
+ }
+ }
+ if require_all_graphs && failed > 0 {
+ bail!(
+ "strict multi-graph startup requires every graph to open ({} configured, {} failed)",
+ configured_graphs,
+ failed
+ );
+ }
+ if handles.is_empty() {
+ bail!(
+ "no healthy graphs opened from multi-graph startup config ({} configured, {} failed)",
+ configured_graphs,
+ failed
+ );
+ }
let workload = workload::WorkloadController::from_env();
let state = AppState::new_multi(handles, tokens, server_policy, workload, Some(config_path))
diff --git a/crates/omnigraph-server/src/main.rs b/crates/omnigraph-server/src/main.rs
index 482c9af..c45b77f 100644
--- a/crates/omnigraph-server/src/main.rs
+++ b/crates/omnigraph-server/src/main.rs
@@ -22,6 +22,11 @@ struct Cli {
/// Equivalent to setting `OMNIGRAPH_UNAUTHENTICATED=1`.
#[arg(long)]
unauthenticated: bool,
+ /// Fail startup if any applied graph is quarantined or fails to open.
+ /// By default, graph-local failures are logged and healthy graphs still
+ /// serve. Equivalent to setting `OMNIGRAPH_REQUIRE_ALL_GRAPHS=1`.
+ #[arg(long)]
+ require_all_graphs: bool,
}
#[tokio::main]
@@ -30,7 +35,12 @@ async fn main() -> Result<()> {
init_tracing();
let cli = Cli::parse();
- let settings: ServerConfig =
- load_server_settings(cli.cluster.as_ref(), cli.bind, cli.unauthenticated).await?;
+ let settings: ServerConfig = load_server_settings(
+ cli.cluster.as_ref(),
+ cli.bind,
+ cli.unauthenticated,
+ cli.require_all_graphs,
+ )
+ .await?;
serve(settings).await
}
diff --git a/crates/omnigraph-server/src/settings.rs b/crates/omnigraph-server/src/settings.rs
index 7c0e620..f17c8ad 100644
--- a/crates/omnigraph-server/src/settings.rs
+++ b/crates/omnigraph-server/src/settings.rs
@@ -12,6 +12,7 @@ pub(crate) async fn load_cluster_settings(
cluster_dir: &PathBuf,
cli_bind: Option,
cli_allow_unauthenticated: bool,
+ cli_require_all_graphs: bool,
) -> Result {
// `--cluster` accepts either a config directory (the ledger location is
// resolved through cluster.yaml's `storage:` key) or a storage-root URI
@@ -28,11 +29,45 @@ pub(crate) async fn load_cluster_settings(
.map_err(|diagnostics| {
let details = diagnostics
.iter()
- .map(|diagnostic| format!("[{}] {}: {}", diagnostic.code, diagnostic.path, diagnostic.message))
+ .map(|diagnostic| {
+ format!(
+ "[{}] {}: {}",
+ diagnostic.code, diagnostic.path, diagnostic.message
+ )
+ })
.collect::>()
.join("\n ");
- eyre!("the cluster at '{}' is not ready to serve:\n {details}", cluster_dir.display())
+ eyre!(
+ "the cluster at '{}' is not ready to serve:\n {details}",
+ cluster_dir.display()
+ )
})?;
+ for diagnostic in &snapshot.diagnostics {
+ warn!(
+ code = %diagnostic.code,
+ path = %diagnostic.path,
+ message = %diagnostic.message,
+ "cluster startup diagnostic"
+ );
+ }
+ let env_require_all_graphs = env_flag("OMNIGRAPH_REQUIRE_ALL_GRAPHS");
+ let require_all_graphs = cli_require_all_graphs || env_require_all_graphs;
+ if require_all_graphs && !snapshot.diagnostics.is_empty() {
+ let details = snapshot
+ .diagnostics
+ .iter()
+ .map(|diagnostic| {
+ format!(
+ "[{}] {}: {}",
+ diagnostic.code, diagnostic.path, diagnostic.message
+ )
+ })
+ .collect::>()
+ .join("\n ");
+ bail!(
+ "strict cluster boot requires every applied graph to be ready; startup diagnostics:\n {details}"
+ );
+ }
// Bindings -> Cedar slots. The serving pipeline loads one bundle per
// graph plus one server-level bundle; stacked bundles per scope are a
@@ -69,6 +104,7 @@ pub(crate) async fn load_cluster_settings(
}
let mut graphs = Vec::new();
+ let mut skipped_graphs = Vec::new();
for graph in &snapshot.graphs {
let specs: Vec = snapshot
.queries
@@ -82,40 +118,75 @@ pub(crate) async fn load_cluster_settings(
// spec carries only identity + source.
})
.collect();
- let registry = QueryRegistry::from_specs(specs).map_err(|errors| {
- let details = errors
- .iter()
- .map(|error| error.to_string())
- .collect::>()
- .join("\n ");
- eyre!(
- "stored queries in the applied revision failed to parse:\n {details}\nrun `cluster refresh` then `cluster apply`, and restart"
- )
- })?;
+ let registry = match QueryRegistry::from_specs(specs) {
+ Ok(registry) => registry,
+ Err(errors) => {
+ let details = errors
+ .iter()
+ .map(|error| error.to_string())
+ .collect::>()
+ .join("\n ");
+ warn!(
+ graph_id = %graph.graph_id,
+ errors = %details,
+ "graph quarantined because stored queries failed to parse"
+ );
+ skipped_graphs.push(format!(
+ "{}: stored queries failed to parse: {details}",
+ graph.graph_id
+ ));
+ continue;
+ }
+ };
+ let embedding = match graph
+ .embedding
+ .as_ref()
+ .map(|profile| {
+ profile.resolve().map_err(|err| {
+ eyre!("embedding provider for graph '{}': {err}", graph.graph_id)
+ })
+ })
+ .transpose()
+ {
+ Ok(embedding) => embedding,
+ Err(err) => {
+ warn!(
+ graph_id = %graph.graph_id,
+ error = %err,
+ "graph quarantined because embedding provider configuration failed"
+ );
+ skipped_graphs.push(format!("{}: {err}", graph.graph_id));
+ continue;
+ }
+ };
graphs.push(GraphStartupConfig {
graph_id: graph.graph_id.clone(),
uri: graph.root.to_string_lossy().to_string(),
policy: graph_policies.get(&graph.graph_id).cloned(),
- embedding: graph
- .embedding
- .as_ref()
- .map(|profile| {
- profile.resolve().map_err(|err| {
- eyre!("embedding provider for graph '{}': {err}", graph.graph_id)
- })
- })
- .transpose()?,
+ embedding,
queries: registry,
});
}
+ if graphs.is_empty() {
+ let skipped = skipped_graphs.join(", ");
+ bail!(
+ "the cluster at '{}' has no healthy graphs to serve{}",
+ cluster_dir.display(),
+ if skipped.is_empty() {
+ String::new()
+ } else {
+ format!(" (quarantined: {skipped})")
+ }
+ );
+ }
+ if require_all_graphs && !skipped_graphs.is_empty() {
+ bail!(
+ "strict cluster boot requires every graph to build startup settings (quarantined: {})",
+ skipped_graphs.join(", ")
+ );
+ }
- let env_unauth = std::env::var("OMNIGRAPH_UNAUTHENTICATED")
- .ok()
- .map(|v| {
- let trimmed = v.trim();
- !trimmed.is_empty() && trimmed != "0" && !trimmed.eq_ignore_ascii_case("false")
- })
- .unwrap_or(false);
+ let env_unauth = env_flag("OMNIGRAPH_UNAUTHENTICATED");
Ok(ServerConfig {
mode: ServerConfigMode::Multi {
@@ -125,6 +196,7 @@ pub(crate) async fn load_cluster_settings(
},
bind: cli_bind.unwrap_or_else(|| "127.0.0.1:8080".to_string()),
allow_unauthenticated: cli_allow_unauthenticated || env_unauth,
+ require_all_graphs,
})
}
@@ -136,6 +208,7 @@ pub async fn load_server_settings(
cli_cluster: Option<&PathBuf>,
cli_bind: Option,
cli_allow_unauthenticated: bool,
+ cli_require_all_graphs: bool,
) -> Result {
let Some(cluster_dir) = cli_cluster else {
bail!(
@@ -145,7 +218,23 @@ pub async fn load_server_settings(
was removed in RFC-011."
);
};
- load_cluster_settings(cluster_dir, cli_bind, cli_allow_unauthenticated).await
+ load_cluster_settings(
+ cluster_dir,
+ cli_bind,
+ cli_allow_unauthenticated,
+ cli_require_all_graphs,
+ )
+ .await
+}
+
+fn env_flag(name: &str) -> bool {
+ std::env::var(name)
+ .ok()
+ .map(|v| {
+ let trimmed = v.trim();
+ !trimmed.is_empty() && trimmed != "0" && !trimmed.eq_ignore_ascii_case("false")
+ })
+ .unwrap_or(false)
}
/// MR-723 server runtime state, classified from the three-state matrix
@@ -238,7 +327,9 @@ pub(crate) fn read_bearer_tokens_file(path: &str) -> Result) -> Result> {
+pub(crate) fn validate_bearer_tokens(
+ entries: Vec<(String, String)>,
+) -> Result> {
let mut seen_actors = HashSet::new();
let mut seen_tokens = HashSet::new();
let mut normalized = Vec::with_capacity(entries.len());
@@ -299,11 +390,18 @@ mod tests {
/// as 404 without also masking a 401/500. Pins each outcome.
#[test]
fn authorize_splits_decision_from_operational_error() {
- use super::{Authz, PolicyAction, PolicyCompiler, PolicyConfig, PolicyRequest, ResolvedActor, authorize};
+ use super::{
+ Authz, PolicyAction, PolicyCompiler, PolicyConfig, PolicyRequest, ResolvedActor,
+ authorize,
+ };
use std::sync::Arc;
fn req(action: PolicyAction) -> PolicyRequest {
- PolicyRequest { action, branch: None, target_branch: None }
+ PolicyRequest {
+ action,
+ branch: None,
+ target_branch: None,
+ }
}
let actor = ResolvedActor::cluster_static(Arc::from("act-alice"));
@@ -343,7 +441,11 @@ mod tests {
authorize(
Some(&actor),
Some(&engine),
- PolicyRequest { action: PolicyAction::Read, branch: Some("main".to_string()), target_branch: None },
+ PolicyRequest {
+ action: PolicyAction::Read,
+ branch: Some("main".to_string()),
+ target_branch: None
+ },
)
.unwrap(),
Authz::Allowed
@@ -352,11 +454,17 @@ mod tests {
match authorize(
Some(&actor),
Some(&engine),
- PolicyRequest { action: PolicyAction::Change, branch: Some("main".to_string()), target_branch: None },
+ PolicyRequest {
+ action: PolicyAction::Change,
+ branch: Some("main".to_string()),
+ target_branch: None,
+ },
)
.unwrap()
{
- Authz::Denied(message) => assert!(!message.is_empty(), "a deny carries its decision message"),
+ Authz::Denied(message) => {
+ assert!(!message.is_empty(), "a deny carries its decision message")
+ }
Authz::Allowed => panic!("change must be denied: only read is allowed"),
}
// Policy installed but no actor → operational failure (`Err`), NOT a
@@ -393,8 +501,7 @@ mod tests {
};
// Empty registry → nothing attached, no error.
- let empty =
- super::validate_and_attach(QueryRegistry::default(), &catalog, "g").unwrap();
+ let empty = super::validate_and_attach(QueryRegistry::default(), &catalog, "g").unwrap();
assert!(empty.is_none());
// A query that type-checks → attached.
@@ -403,7 +510,11 @@ mod tests {
"query find_user() { match { $u: User } return { $u.name } }",
)])
.unwrap();
- assert!(super::validate_and_attach(ok, &catalog, "g").unwrap().is_some());
+ assert!(
+ super::validate_and_attach(ok, &catalog, "g")
+ .unwrap()
+ .is_some()
+ );
// A query referencing a type the schema lacks → boot refusal that
// names both the graph label and the offending query.
@@ -416,7 +527,10 @@ mod tests {
let msg = err.to_string();
assert!(msg.contains("graph-x"), "labels the graph: {msg}");
assert!(msg.contains("ghost"), "names the query: {msg}");
- assert!(msg.contains("schema check"), "mentions the schema check: {msg}");
+ assert!(
+ msg.contains("schema check"),
+ "mentions the schema check: {msg}"
+ );
}
#[test]
@@ -447,7 +561,7 @@ mod tests {
async fn server_settings_require_cluster_boot_source() {
// RFC-011 cluster-only: with no --cluster the server refuses to
// start and names the cluster-required remedy.
- let error = super::load_server_settings(None, None, false)
+ let error = super::load_server_settings(None, None, false, false)
.await
.unwrap_err();
assert!(
@@ -530,6 +644,7 @@ mod tests {
},
bind: "127.0.0.1:0".to_string(),
allow_unauthenticated: false,
+ require_all_graphs: false,
};
let result = serve(config).await;
let err = result
@@ -582,6 +697,7 @@ mod tests {
},
bind: "127.0.0.1:0".to_string(),
allow_unauthenticated: false,
+ require_all_graphs: false,
};
let result = serve(config).await;
let err =
diff --git a/crates/omnigraph-server/tests/multi_graph.rs b/crates/omnigraph-server/tests/multi_graph.rs
index 617cc66..5679aef 100644
--- a/crates/omnigraph-server/tests/multi_graph.rs
+++ b/crates/omnigraph-server/tests/multi_graph.rs
@@ -13,7 +13,6 @@ use serde_json::Value;
use serial_test::serial;
use tower::ServiceExt;
-
mod support;
use support::*;
@@ -414,7 +413,7 @@ async fn cluster_boot_serves_applied_state() {
assert!(server_policy.is_none());
let state =
- omnigraph_server::open_multi_graph_state(graphs, Vec::new(), None, config_path)
+ omnigraph_server::open_multi_graph_state(graphs, Vec::new(), None, config_path, false)
.await
.unwrap();
let app = build_app(state);
@@ -424,7 +423,10 @@ async fn cluster_boot_serves_applied_state() {
// GET /graphs refuses even in cluster mode.
let (status, body) = json_response(
&app,
- Request::builder().uri("/graphs").body(Body::empty()).unwrap(),
+ Request::builder()
+ .uri("/graphs")
+ .body(Body::empty())
+ .unwrap(),
)
.await;
assert_eq!(status, StatusCode::FORBIDDEN, "{body}");
@@ -460,6 +462,115 @@ async fn cluster_boot_serves_applied_state() {
assert_eq!(status, StatusCode::OK, "{body}");
}
+#[tokio::test]
+async fn cluster_boot_quarantines_graph_open_failures() {
+ let temp = tempfile::tempdir().unwrap();
+ let schema = "\nnode Person {\n name: String @key\n}\n";
+ let good_uri = temp.path().join("good.omni");
+ Omnigraph::init(good_uri.to_string_lossy().as_ref(), schema)
+ .await
+ .unwrap();
+ let bad_uri = temp.path().join("missing.omni");
+ let server_policy = omnigraph_server::PolicySource::Inline(
+ r#"
+version: 1
+kind: server
+groups:
+ admins: [act-admin]
+rules:
+ - id: admins-list-graphs
+ allow:
+ actors: { group: admins }
+ actions: [graph_list]
+"#
+ .to_string(),
+ );
+ let graphs = vec![
+ omnigraph_server::GraphStartupConfig {
+ graph_id: "broken".to_string(),
+ uri: bad_uri.to_string_lossy().to_string(),
+ policy: None,
+ embedding: None,
+ queries: stored_query_registry(&[]),
+ },
+ omnigraph_server::GraphStartupConfig {
+ graph_id: "good".to_string(),
+ uri: good_uri.to_string_lossy().to_string(),
+ policy: None,
+ embedding: None,
+ queries: stored_query_registry(&[]),
+ },
+ ];
+ let strict_err = match omnigraph_server::open_multi_graph_state(
+ graphs.clone(),
+ vec![("act-admin".to_string(), "admin-token".to_string())],
+ Some(&server_policy),
+ temp.path().join("cluster.yaml"),
+ true,
+ )
+ .await
+ {
+ Ok(_) => panic!("strict startup should reject a failed graph open"),
+ Err(err) => err,
+ };
+ assert!(
+ strict_err
+ .to_string()
+ .contains("strict multi-graph startup requires every graph to open"),
+ "{strict_err}"
+ );
+ let state = omnigraph_server::open_multi_graph_state(
+ graphs,
+ vec![("act-admin".to_string(), "admin-token".to_string())],
+ Some(&server_policy),
+ temp.path().join("cluster.yaml"),
+ false,
+ )
+ .await
+ .unwrap();
+ let mut ready: Vec<_> = state
+ .routing()
+ .registry
+ .list()
+ .iter()
+ .map(|handle| handle.key.graph_id.as_str().to_string())
+ .collect();
+ ready.sort();
+ assert_eq!(ready, vec!["good"]);
+ let app = build_app(state);
+
+ let (status, body) = json_response(
+ &app,
+ Request::builder()
+ .uri("/graphs")
+ .header("authorization", "Bearer admin-token")
+ .body(Body::empty())
+ .unwrap(),
+ )
+ .await;
+ assert_eq!(status, StatusCode::OK, "{body}");
+ assert_eq!(
+ body["graphs"]
+ .as_array()
+ .unwrap()
+ .iter()
+ .map(|graph| graph["graph_id"].as_str().unwrap())
+ .collect::>(),
+ vec!["good"]
+ );
+
+ let (status, body) = json_response(
+ &app,
+ Request::builder()
+ .uri("/graphs/broken/queries")
+ .header("authorization", "Bearer admin-token")
+ .body(Body::empty())
+ .unwrap(),
+ )
+ .await;
+ assert_eq!(status, StatusCode::NOT_FOUND, "{body}");
+}
+
#[tokio::test(flavor = "multi_thread")]
#[serial]
async fn cluster_boot_injects_embedding_provider_config() {
@@ -555,6 +666,7 @@ graphs:
Vec::new(),
server_policy.as_ref(),
config_path,
+ false,
)
.await
.unwrap();
@@ -665,7 +777,10 @@ async fn cluster_boot_wires_policy_bindings_into_cedar_slots() {
.unwrap();
fs::write(
temp.path().join("cluster.policy.yaml"),
- permit_all_policy_yaml(&["default"]).replace("protected_branches: [main]\n", "protected_branches: [main]\nkind: server\n"),
+ permit_all_policy_yaml(&["default"]).replace(
+ "protected_branches: [main]\n",
+ "protected_branches: [main]\nkind: server\n",
+ ),
)
.unwrap();
fs::write(
@@ -719,7 +834,7 @@ graphs:
async fn cluster_boot_refusals() {
// RFC-011 cluster-only: with no --cluster, boot refuses with the
// cluster-required remedy.
- let err = omnigraph_server::load_server_settings(None, None, true)
+ let err = omnigraph_server::load_server_settings(None, None, true, false)
.await
.unwrap_err();
assert!(err.to_string().contains("boots from a cluster"), "{err}");
@@ -729,7 +844,12 @@ async fn cluster_boot_refusals() {
// Tampered catalog blob refuses boot with the remedy.
let blob_dir = dir.join("__cluster/resources/query/knowledge/find_person");
- let blob = fs::read_dir(&blob_dir).unwrap().next().unwrap().unwrap().path();
+ let blob = fs::read_dir(&blob_dir)
+ .unwrap()
+ .next()
+ .unwrap()
+ .unwrap()
+ .path();
fs::write(&blob, "tampered").unwrap();
let err = cluster_settings(&dir).await.unwrap_err();
assert!(
diff --git a/crates/omnigraph-server/tests/s3.rs b/crates/omnigraph-server/tests/s3.rs
index 99bf98d..793d79d 100644
--- a/crates/omnigraph-server/tests/s3.rs
+++ b/crates/omnigraph-server/tests/s3.rs
@@ -11,7 +11,6 @@ use omnigraph_server::api::ReadRequest;
use omnigraph_server::{AppState, build_app};
use serde_json::json;
-
mod support;
use support::*;
@@ -137,6 +136,7 @@ async fn server_boots_cluster_from_bare_storage_uri_and_serves_query() {
Some(&std::path::PathBuf::from(&root)),
None,
true,
+ false,
)
.await
.unwrap();
@@ -153,6 +153,7 @@ async fn server_boots_cluster_from_bare_storage_uri_and_serves_query() {
Vec::new(),
server_policy.as_ref(),
config_path,
+ false,
)
.await
.unwrap();
@@ -170,7 +171,9 @@ async fn server_boots_cluster_from_bare_storage_uri_and_serves_query() {
.await
.unwrap();
assert_eq!(response.status(), StatusCode::OK);
- let bytes = axum::body::to_bytes(response.into_body(), usize::MAX).await.unwrap();
+ let bytes = axum::body::to_bytes(response.into_body(), usize::MAX)
+ .await
+ .unwrap();
let value: serde_json::Value = serde_json::from_slice(&bytes).unwrap();
assert_eq!(value["rows"][0]["p.name"], "Ada", "{value}");
}
diff --git a/crates/omnigraph-server/tests/stored_queries.rs b/crates/omnigraph-server/tests/stored_queries.rs
index 1cf8e02..e3c32a4 100644
--- a/crates/omnigraph-server/tests/stored_queries.rs
+++ b/crates/omnigraph-server/tests/stored_queries.rs
@@ -370,6 +370,47 @@ async fn list_queries_requires_invoke_query() {
assert!(names.contains(&"find_person"), "invoker sees the exposed query: {names:?}");
}
+#[tokio::test(flavor = "multi_thread")]
+async fn list_queries_surfaces_query_description_and_instruction() {
+ // E2e for the query-level `.gq` surface: `@description`/`@instruction` on
+ // a stored query declaration are carried through to clients via the typed
+ // `QueryCatalogEntry` fields over `GET /queries`. A query without them
+ // omits both fields (serde `skip_serializing_if = "Option::is_none"`).
+ let described = "query described($name: String) \
+ @description(\"Find a person by exact name.\") \
+ @instruction(\"Use for exact lookups; prefer search for fuzzy matches.\") \
+ { match { $p: Person { name: $name } } return { $p.age } }";
+ let (_temp, app) = app_with_stored_queries(
+ &[
+ ("described", described, true),
+ ("bare", "query bare() { match { $p: Person } return { $p.name } }", true),
+ ],
+ &[("act-invoke", "t-invoke")],
+ INVOKE_POLICY_YAML,
+ )
+ .await;
+ let (status, body) = json_response(&app, get_request(&g("/queries"), "t-invoke")).await;
+ assert_eq!(status, StatusCode::OK, "body: {body}");
+ let entries = body["queries"].as_array().unwrap();
+
+ let described = entries.iter().find(|q| q["name"] == "described").unwrap();
+ assert_eq!(
+ described["description"], "Find a person by exact name.",
+ "query @description surfaces over GET /queries: {described}"
+ );
+ assert_eq!(
+ described["instruction"],
+ "Use for exact lookups; prefer search for fuzzy matches.",
+ "query @instruction surfaces over GET /queries: {described}"
+ );
+
+ let bare = entries.iter().find(|q| q["name"] == "bare").unwrap();
+ assert!(
+ bare.get("description").is_none() && bare.get("instruction").is_none(),
+ "a query without the annotations omits both fields: {bare}"
+ );
+}
+
#[tokio::test(flavor = "multi_thread")]
async fn list_queries_is_empty_when_no_registry() {
let (_temp, app) = app_for_loaded_graph_with_auth("demo-token").await;
diff --git a/crates/omnigraph-server/tests/support/mod.rs b/crates/omnigraph-server/tests/support/mod.rs
index 54efb80..4b972ee 100644
--- a/crates/omnigraph-server/tests/support/mod.rs
+++ b/crates/omnigraph-server/tests/support/mod.rs
@@ -15,15 +15,12 @@ use omnigraph::db::{Omnigraph, ReadTarget};
use omnigraph::error::OmniError;
use omnigraph::loader::{LoadMode, load_jsonl};
use omnigraph_policy::{PolicyChecker, PolicyEngine};
-use omnigraph_server::api::{
- BranchCreateRequest, BranchMergeRequest, ChangeRequest, ReadRequest,
-};
+use omnigraph_server::api::{BranchCreateRequest, BranchMergeRequest, ChangeRequest, ReadRequest};
use omnigraph_server::queries::{QueryRegistry, RegistrySpec};
use omnigraph_server::{AppState, build_app};
use serde_json::{Value, json};
use tower::ServiceExt;
-
pub const MUTATION_QUERIES: &str = r#"
query insert_person($name: String, $age: I32) {
insert Person { name: $name, age: $age }
@@ -1212,6 +1209,8 @@ graphs:
temp
}
-pub async fn cluster_settings(dir: &Path) -> color_eyre::eyre::Result {
- omnigraph_server::load_server_settings(Some(&dir.to_path_buf()), None, true).await
+pub async fn cluster_settings(
+ dir: &Path,
+) -> color_eyre::eyre::Result {
+ omnigraph_server::load_server_settings(Some(&dir.to_path_buf()), None, true, false).await
}
diff --git a/crates/omnigraph/Cargo.toml b/crates/omnigraph/Cargo.toml
index 7ee9bda..55d3008 100644
--- a/crates/omnigraph/Cargo.toml
+++ b/crates/omnigraph/Cargo.toml
@@ -55,5 +55,6 @@ arc-swap = { workspace = true }
omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.7.0" }
tokio = { workspace = true }
lance-namespace-impls = { workspace = true }
+lance-io = "7.0.0"
serial_test = "3"
proptest = "1"
diff --git a/crates/omnigraph/src/db/commit_graph.rs b/crates/omnigraph/src/db/commit_graph.rs
index 3d90e54..572bdf5 100644
--- a/crates/omnigraph/src/db/commit_graph.rs
+++ b/crates/omnigraph/src/db/commit_graph.rs
@@ -79,10 +79,14 @@ impl CommitGraph {
pub async fn open(root_uri: &str) -> Result {
let root = root_uri.trim_end_matches('/');
- let dataset = Dataset::open(&graph_commits_uri(root))
- .await
- .map_err(|e| OmniError::Lance(e.to_string()))?;
- let actor_dataset = Dataset::open(&graph_commit_actors_uri(root)).await.ok();
+ let wrapper = crate::instrumentation::commit_graph_wrapper();
+ let dataset =
+ crate::instrumentation::open_dataset_tracked(&graph_commits_uri(root), wrapper.clone())
+ .await?;
+ let actor_dataset =
+ crate::instrumentation::open_dataset_tracked(&graph_commit_actors_uri(root), wrapper)
+ .await
+ .ok();
let actor_by_commit_id = match &actor_dataset {
Some(dataset) => load_commit_actor_cache(dataset).await?,
None => HashMap::new(),
@@ -101,14 +105,18 @@ impl CommitGraph {
pub async fn open_at_branch(root_uri: &str, branch: &str) -> Result {
let root = root_uri.trim_end_matches('/');
- let dataset = Dataset::open(&graph_commits_uri(root))
- .await
- .map_err(|e| OmniError::Lance(e.to_string()))?;
+ let wrapper = crate::instrumentation::commit_graph_wrapper();
+ let dataset =
+ crate::instrumentation::open_dataset_tracked(&graph_commits_uri(root), wrapper.clone())
+ .await?;
let dataset = dataset
.checkout_branch(branch)
.await
.map_err(|e| OmniError::Lance(e.to_string()))?;
- let actor_dataset = Dataset::open(&graph_commit_actors_uri(root)).await.ok();
+ let actor_dataset =
+ crate::instrumentation::open_dataset_tracked(&graph_commit_actors_uri(root), wrapper)
+ .await
+ .ok();
let actor_by_commit_id = match &actor_dataset {
Some(dataset) => load_commit_actor_cache(dataset).await?,
None => HashMap::new(),
@@ -127,9 +135,12 @@ impl CommitGraph {
pub async fn refresh(&mut self) -> Result<()> {
let root = self.root_uri.clone();
- self.dataset = Dataset::open(&graph_commits_uri(&root))
- .await
- .map_err(|e| OmniError::Lance(e.to_string()))?;
+ let wrapper = crate::instrumentation::commit_graph_wrapper();
+ self.dataset = crate::instrumentation::open_dataset_tracked(
+ &graph_commits_uri(&root),
+ wrapper.clone(),
+ )
+ .await?;
if let Some(branch) = &self.active_branch {
self.dataset = self
.dataset
@@ -137,7 +148,10 @@ impl CommitGraph {
.await
.map_err(|e| OmniError::Lance(e.to_string()))?;
}
- self.actor_dataset = Dataset::open(&graph_commit_actors_uri(&root)).await.ok();
+ self.actor_dataset =
+ crate::instrumentation::open_dataset_tracked(&graph_commit_actors_uri(&root), wrapper)
+ .await
+ .ok();
self.actor_by_commit_id = match &self.actor_dataset {
Some(dataset) => load_commit_actor_cache(dataset).await?,
None => HashMap::new(),
diff --git a/crates/omnigraph/src/db/graph_coordinator.rs b/crates/omnigraph/src/db/graph_coordinator.rs
index dfe2767..b9bcb11 100644
--- a/crates/omnigraph/src/db/graph_coordinator.rs
+++ b/crates/omnigraph/src/db/graph_coordinator.rs
@@ -10,7 +10,9 @@ use crate::storage::{StorageAdapter, join_uri, normalize_root_uri};
use super::commit_graph::{CommitGraph, GraphCommit};
use super::is_internal_system_branch;
-use super::manifest::{ManifestChange, ManifestCoordinator, Snapshot, SubTableUpdate};
+use super::manifest::{
+ ManifestChange, ManifestCoordinator, ManifestIncarnation, Snapshot, SubTableUpdate,
+};
const GRAPH_COMMITS_DIR: &str = "_graph_commits.lance";
@@ -26,10 +28,11 @@ impl SnapshotId {
&self.0
}
- pub(crate) fn synthetic(branch: Option<&str>, version: u64) -> Self {
- match branch {
- Some(branch) => Self(format!("manifest:{}:v{}", branch, version)),
- None => Self(format!("manifest:main:v{}", version)),
+ pub(crate) fn synthetic(branch: Option<&str>, version: u64, e_tag: Option<&str>) -> Self {
+ let branch = branch.unwrap_or("main");
+ match e_tag {
+ Some(e_tag) => Self(format!("manifest:{}:v{}:etag:{}", branch, version, e_tag)),
+ None => Self(format!("manifest:{}:v{}", branch, version)),
}
}
}
@@ -166,6 +169,10 @@ impl GraphCoordinator {
self.manifest.version()
}
+ pub(crate) fn manifest_incarnation(&self) -> ManifestIncarnation {
+ self.manifest.incarnation()
+ }
+
pub fn snapshot(&self) -> Snapshot {
self.manifest.snapshot()
}
@@ -182,6 +189,19 @@ impl GraphCoordinator {
Ok(())
}
+ pub(crate) async fn probe_latest_incarnation(&self) -> Result {
+ crate::instrumentation::record_probe();
+ self.manifest.probe_latest_incarnation().await
+ }
+
+ /// Refresh only the manifest (not the commit graph). The read path uses this
+ /// on a stale same-branch probe: a read pins its snapshot by manifest version
+ /// and never needs the commit graph, so a full `refresh` (which also scans
+ /// the commit graph) would be wasted IO.
+ pub async fn refresh_manifest_only(&mut self) -> Result<()> {
+ self.manifest.refresh().await
+ }
+
pub async fn branch_list(&self) -> Result> {
self.manifest.list_branches().await.map(|branches| {
branches
@@ -315,10 +335,13 @@ impl GraphCoordinator {
None => GraphCoordinator::open(self.root_uri(), Arc::clone(&self.storage)).await?,
};
- Ok(other
- .head_commit_id()
- .await?
- .unwrap_or_else(|| SnapshotId::synthetic(other.current_branch(), other.version())))
+ Ok(other.head_commit_id().await?.unwrap_or_else(|| {
+ SnapshotId::synthetic(
+ other.current_branch(),
+ other.version(),
+ other.manifest_incarnation().e_tag.as_deref(),
+ )
+ }))
}
pub async fn resolve_target(&self, target: &ReadTarget) -> Result {
@@ -339,7 +362,11 @@ impl GraphCoordinator {
}
};
let snapshot_id = other.head_commit_id().await?.unwrap_or_else(|| {
- SnapshotId::synthetic(other.current_branch(), other.version())
+ SnapshotId::synthetic(
+ other.current_branch(),
+ other.version(),
+ other.manifest_incarnation().e_tag.as_deref(),
+ )
});
Ok(ResolvedTarget {
requested: target.clone(),
@@ -509,9 +536,23 @@ impl GraphCoordinator {
return Ok(SnapshotId::synthetic(
current_branch.as_deref(),
manifest_version,
+ self.manifest_incarnation().e_tag.as_deref(),
));
};
failpoints::maybe_fail("graph_publish.before_commit_append")?;
+ // Refresh the commit-graph head from storage before selecting the
+ // parent. `append_commit` parents the new commit on the IN-MEMORY head
+ // (`head_commit_id`, zero storage read), but the manifest was just
+ // committed against a freshly rebased pin (`commit_all` opens a fresh
+ // coordinator) while THIS coordinator's cached head may be stale because
+ // an external writer advanced the branch. Without this refresh a
+ // same-branch write after an external commit appends off the stale head
+ // and FORKS the commit DAG (the new commit and the external commit
+ // sharing a parent). Refreshing makes the parent the true current head;
+ // the just-committed manifest version has no commit-graph row yet, so the
+ // fresh head is exactly the prior commit. (record_merge_commit is
+ // unaffected — it passes explicit parents, never the cached head.)
+ commit_graph.refresh().await?;
let graph_commit_id = commit_graph
.append_commit(current_branch.as_deref(), manifest_version, actor_id)
.await?;
diff --git a/crates/omnigraph/src/db/manifest.rs b/crates/omnigraph/src/db/manifest.rs
index f130523..19f25a3 100644
--- a/crates/omnigraph/src/db/manifest.rs
+++ b/crates/omnigraph/src/db/manifest.rs
@@ -24,20 +24,19 @@ mod recovery;
mod state;
use graph::{init_manifest_graph, open_manifest_graph, snapshot_state_at};
-use layout::{manifest_uri, open_manifest_dataset, type_name_hash};
+use layout::{manifest_uri, open_manifest_dataset, table_uri_for_path, type_name_hash};
pub(crate) use metadata::TableVersionMetadata;
#[cfg(test)]
use metadata::{OMNIGRAPH_ROW_COUNT_KEY, table_version_metadata_for_state};
-use namespace::open_table_at_version_from_manifest;
pub(crate) use namespace::open_table_head_for_write;
#[cfg(test)]
use namespace::{branch_manifest_namespace, staged_table_namespace};
use publisher::{GraphNamespacePublisher, ManifestBatchPublisher};
pub(crate) use recovery::{
- RecoveryMode, RecoverySidecarHandle, SidecarKind, SidecarTablePin, SidecarTableRegistration,
- SidecarTombstone, delete_sidecar, has_schema_apply_sidecar, heal_pending_sidecars_roll_forward,
- list_sidecars, new_sidecar, recover_manifest_drift, schema_apply_serial_queue_key,
- write_sidecar,
+ RecoveryMode, RecoverySidecar, RecoverySidecarHandle, SidecarKind, SidecarTablePin,
+ SidecarTableRegistration, SidecarTombstone, confirm_sidecar_phase_b, delete_sidecar,
+ has_schema_apply_sidecar, heal_pending_sidecars_roll_forward, list_sidecars, new_sidecar,
+ recover_manifest_drift, schema_apply_serial_queue_key, write_sidecar,
};
pub use state::SubTableEntry;
#[cfg(test)]
@@ -74,16 +73,51 @@ pub struct Snapshot {
root_uri: String,
version: u64,
entries: HashMap,
+ /// Per-graph read caches (shared `Session` + held-handle cache), injected by
+ /// `Omnigraph::resolved_target` for live Branch reads so table opens reuse
+ /// handles (0 IO on a warm repeat) and one `Session`. `None` for write-prelude
+ /// snapshots, time-travel / Snapshot-id reads, and directly-built test
+ /// snapshots, which fall back to a plain open.
+ read_caches: Option>,
}
impl Snapshot {
- /// Open a sub-table dataset at its pinned version.
+ /// Open a sub-table dataset at its pinned version. With read caches present
+ /// (live Branch reads), reuse a held handle through the cache (0 open IO on a
+ /// warm repeat) and the shared `Session`; otherwise plain-open (Fix 2).
pub async fn open(&self, table_key: &str) -> Result {
let entry = self
.entries
.get(table_key)
.ok_or_else(|| OmniError::manifest(format!("no manifest entry for {}", table_key)))?;
- entry.open(&self.root_uri).await
+ match &self.read_caches {
+ Some(caches) => {
+ let location = table_uri_for_path(
+ &self.root_uri,
+ &entry.table_path,
+ entry.table_branch.as_deref(),
+ );
+ caches
+ .handles
+ .get_or_open(
+ &entry.table_path,
+ entry.table_branch.as_deref(),
+ entry.table_version,
+ entry.version_metadata.e_tag(),
+ &location,
+ Some(&caches.session),
+ )
+ .await
+ }
+ None => entry.open(&self.root_uri).await,
+ }
+ }
+
+ /// Attach per-graph read caches (shared `Session` + handle cache) so this
+ /// snapshot's table opens reuse handles and the session. Set by
+ /// `Omnigraph::resolved_target` for live Branch reads only.
+ pub(crate) fn set_read_caches(&mut self, caches: Arc) {
+ self.read_caches = Some(caches);
}
/// Manifest version this snapshot was taken from.
@@ -101,6 +135,31 @@ impl Snapshot {
}
}
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub(crate) struct ManifestIncarnation {
+ pub(crate) version: u64,
+ pub(crate) e_tag: Option,
+ timestamp_nanos: Option,
+}
+
+impl ManifestIncarnation {
+ pub(crate) fn matches(&self, held: &Self) -> bool {
+ if self.version != held.version {
+ return false;
+ }
+ match (&self.e_tag, &held.e_tag) {
+ (Some(latest), Some(current)) => latest == current,
+ _ => match (self.timestamp_nanos, held.timestamp_nanos) {
+ (Some(latest), Some(current)) => latest == current,
+ // Some object stores can omit both e_tag and manifest timestamp
+ // from the reachable API. In that narrow case the version-number
+ // probe is the strongest available identity.
+ _ => true,
+ },
+ }
+ }
+}
+
impl SubTableUpdate {
pub(crate) fn to_create_table_version_request(&self) -> CreateTableVersionRequest {
self.version_metadata.to_create_table_version_request(
@@ -132,14 +191,28 @@ pub(crate) enum ManifestChange {
}
impl SubTableEntry {
+ /// Open this sub-table at its pinned version directly by location (Fix 2),
+ /// without the Lance namespace — which would full-scan `__manifest` twice per
+ /// open (`describe_table` + `describe_table_version`). The resolved Snapshot
+ /// already holds the path, version, and branch. Branches are Lance native
+ /// branches, so `with_branch` resolves `{base}/tree/{branch}` from the base
+ /// URI; main uses `with_version`.
pub(crate) async fn open(&self, root_uri: &str) -> Result {
- open_table_at_version_from_manifest(
- root_uri,
- &self.table_key,
- self.table_branch.as_deref(),
- self.table_version,
- )
- .await
+ // The branch-qualified location is the dataset that physically holds this
+ // version: main at `{table_path}`, a branch at
+ // `{table_path}/tree/{branch}` (Lance native-branch storage). `with_version`
+ // then resolves the version within THAT dataset's `_versions` — a branch
+ // version lives under `tree/{branch}/_versions`, not the base. This
+ // matches the physical layout the namespace path resolved, without the
+ // per-open `__manifest` scan.
+ let location = table_uri_for_path(root_uri, &self.table_path, self.table_branch.as_deref());
+ // Route through the instrumented data-table opener (Fix 3). With no
+ // session this is exactly the Fix-2 `from_uri(location).with_version`.
+ // This is the uncached fallback (a snapshot with no read caches); the
+ // cached path (`Snapshot::open` → handle cache) calls the same opener on
+ // a miss with the shared session, so both paths count on the per-query
+ // `table_wrapper`.
+ crate::instrumentation::open_table_dataset(&location, self.table_version, None).await
}
}
@@ -223,6 +296,7 @@ impl ManifestCoordinator {
.into_iter()
.map(|entry| (entry.table_key.clone(), entry))
.collect(),
+ read_caches: None,
}
}
@@ -359,6 +433,48 @@ impl ManifestCoordinator {
self.dataset.version().version
}
+ /// Latest committed manifest version on disk (one object-store op, no row
+ /// scan). The freshness probe for warm reuse: compare against `version()`
+ /// (the held handle's pinned version) to decide whether to refresh.
+ pub async fn probe_latest_version(&self) -> Result {
+ self.dataset
+ .latest_version_id()
+ .await
+ .map_err(|e| OmniError::Lance(e.to_string()))
+ }
+
+ pub(crate) fn incarnation(&self) -> ManifestIncarnation {
+ ManifestIncarnation {
+ version: self.version(),
+ e_tag: self.dataset.manifest_location().e_tag.clone(),
+ timestamp_nanos: Some(self.dataset.manifest().timestamp_nanos),
+ }
+ }
+
+ /// Latest committed manifest identity. Main cannot be deleted/recreated, so
+ /// the cheap version-number probe is sufficient there. Non-main Lance
+ /// branches can be deleted and recreated with the same version number, so
+ /// load the latest manifest location and compare its e_tag / timestamp too.
+ pub(crate) async fn probe_latest_incarnation(&self) -> Result {
+ if self.active_branch.is_none() {
+ return Ok(ManifestIncarnation {
+ version: self.probe_latest_version().await?,
+ e_tag: self.dataset.manifest_location().e_tag.clone(),
+ timestamp_nanos: Some(self.dataset.manifest().timestamp_nanos),
+ });
+ }
+ let (manifest, location) = self
+ .dataset
+ .latest_manifest()
+ .await
+ .map_err(|e| OmniError::Lance(e.to_string()))?;
+ Ok(ManifestIncarnation {
+ version: manifest.version,
+ e_tag: location.e_tag,
+ timestamp_nanos: Some(manifest.timestamp_nanos),
+ })
+ }
+
pub fn active_branch(&self) -> Option<&str> {
self.active_branch.as_deref()
}
diff --git a/crates/omnigraph/src/db/manifest/layout.rs b/crates/omnigraph/src/db/manifest/layout.rs
index 9cfde9a..08fe043 100644
--- a/crates/omnigraph/src/db/manifest/layout.rs
+++ b/crates/omnigraph/src/db/manifest/layout.rs
@@ -20,9 +20,12 @@ pub(super) fn manifest_uri(root: &str) -> String {
}
pub(super) async fn open_manifest_dataset(root_uri: &str, branch: Option<&str>) -> Result {
- let dataset = Dataset::open(&manifest_uri(root_uri.trim_end_matches('/')))
- .await
- .map_err(|e| OmniError::Lance(e.to_string()))?;
+ let uri = manifest_uri(root_uri.trim_end_matches('/'));
+ let dataset = crate::instrumentation::open_dataset_tracked(
+ &uri,
+ crate::instrumentation::manifest_wrapper(),
+ )
+ .await?;
match branch {
Some(branch) if branch != "main" => dataset
.checkout_branch(branch)
diff --git a/crates/omnigraph/src/db/manifest/metadata.rs b/crates/omnigraph/src/db/manifest/metadata.rs
index 0bf14b6..7cd6436 100644
--- a/crates/omnigraph/src/db/manifest/metadata.rs
+++ b/crates/omnigraph/src/db/manifest/metadata.rs
@@ -111,7 +111,6 @@ impl TableVersionMetadata {
self.manifest_size
}
- #[cfg(test)]
pub(crate) fn e_tag(&self) -> Option<&str> {
self.e_tag.as_deref()
}
@@ -138,6 +137,7 @@ impl TableVersionMetadata {
request
}
+ #[cfg(test)]
pub(super) fn to_namespace_version(&self, version: u64) -> TableVersion {
self.to_namespace_version_with_details(version, None, None)
}
diff --git a/crates/omnigraph/src/db/manifest/namespace.rs b/crates/omnigraph/src/db/manifest/namespace.rs
index 5e907ba..0d567e0 100644
--- a/crates/omnigraph/src/db/manifest/namespace.rs
+++ b/crates/omnigraph/src/db/manifest/namespace.rs
@@ -16,21 +16,30 @@ use object_store::{
use crate::error::{OmniError, Result};
-use super::layout::{
- namespace_internal_error, open_manifest_dataset, table_id_to_key, table_uri_for_path,
-};
-use super::metadata::{
- TableVersionMetadata, namespace_version_metadata, parse_namespace_version_request,
-};
+use super::layout::{namespace_internal_error, table_uri_for_path};
+#[cfg(test)]
+use super::layout::{open_manifest_dataset, table_id_to_key};
+use super::metadata::TableVersionMetadata;
+#[cfg(test)]
+use super::metadata::{namespace_version_metadata, parse_namespace_version_request};
+#[cfg(test)]
use super::publisher::GraphNamespacePublisher;
+// The read namespace (BranchManifestNamespace) is test-only since Fix 2: reads
+// open sub-tables directly by location+version (SubTableEntry::open), so nothing
+// in production routes a read through the Lance namespace. The writes path uses
+// StagedTableNamespace. These items are retained to validate the namespace
+// contract in unit tests.
+#[cfg(test)]
use super::state::{ManifestState, SubTableEntry, read_manifest_entries, read_manifest_state};
+#[cfg(test)]
#[derive(Debug, Clone)]
struct BranchManifestNamespace {
root_uri: String,
branch: Option,
}
+#[cfg(test)]
impl BranchManifestNamespace {
fn new(root_uri: &str, branch: Option<&str>) -> Self {
Self {
@@ -137,6 +146,7 @@ impl StagedTableNamespace {
}
}
+#[cfg(test)]
pub(crate) fn branch_manifest_namespace(
root_uri: &str,
branch: Option<&str>,
@@ -175,21 +185,7 @@ async fn load_table_from_namespace(
.map_err(|e| OmniError::Lance(e.to_string()))
}
-pub(crate) async fn open_table_at_version_from_manifest(
- root_uri: &str,
- table_key: &str,
- branch: Option<&str>,
- version: u64,
-) -> Result {
- load_table_from_namespace(
- branch_manifest_namespace(root_uri, branch),
- table_key,
- branch,
- Some(version),
- )
- .await
-}
-
+#[cfg(test)]
#[async_trait]
impl LanceNamespace for BranchManifestNamespace {
fn namespace_id(&self) -> String {
diff --git a/crates/omnigraph/src/db/manifest/publisher.rs b/crates/omnigraph/src/db/manifest/publisher.rs
index 288f4be..ba1166d 100644
--- a/crates/omnigraph/src/db/manifest/publisher.rs
+++ b/crates/omnigraph/src/db/manifest/publisher.rs
@@ -24,10 +24,13 @@ use lance::Dataset;
use lance::Error as LanceError;
use lance::dataset::{MergeInsertBuilder, WhenMatched, WhenNotMatched};
use lance_namespace::NamespaceError;
+#[cfg(test)]
use lance_namespace::models::CreateTableVersionRequest;
use crate::error::{OmniError, Result};
+#[cfg(test)]
+use super::SubTableUpdate;
use super::layout::{open_manifest_dataset, tombstone_object_id, version_object_id};
use super::metadata::parse_namespace_version_request;
use super::migrations::migrate_internal_schema;
@@ -37,7 +40,7 @@ use super::state::{
};
use super::{
ManifestChange, OBJECT_TYPE_TABLE, OBJECT_TYPE_TABLE_TOMBSTONE, OBJECT_TYPE_TABLE_VERSION,
- SubTableEntry, SubTableUpdate, TableRegistration, TableTombstone,
+ SubTableEntry, TableRegistration, TableTombstone,
};
/// Bound on the publisher-level retry loop that wraps Lance's row-level CAS
@@ -396,6 +399,7 @@ impl GraphNamespacePublisher {
Ok(Arc::try_unwrap(new_dataset).unwrap_or_else(|arc| (*arc).clone()))
}
+ #[cfg(test)]
pub(super) async fn publish_requests(
&self,
requests: &[CreateTableVersionRequest],
diff --git a/crates/omnigraph/src/db/manifest/recovery.rs b/crates/omnigraph/src/db/manifest/recovery.rs
index 968d3f4..d21e0fd 100644
--- a/crates/omnigraph/src/db/manifest/recovery.rs
+++ b/crates/omnigraph/src/db/manifest/recovery.rs
@@ -62,10 +62,26 @@ pub(crate) const RECOVERY_ACTOR: &str = "omnigraph:recovery";
/// Subdirectory under the graph root holding sidecar files.
pub(crate) const RECOVERY_DIR_NAME: &str = "__recovery";
-/// Current sidecar JSON shape version. Bumping this is a breaking change:
-/// older binaries will refuse to interpret newer sidecars (intentional —
-/// see [`SidecarSchemaError`]).
-pub(crate) const SIDECAR_SCHEMA_VERSION: u32 = 1;
+/// Max sidecar JSON shape/semantics version this binary writes and understands.
+/// The reader accepts every version `<= ` this and refuses only versions ABOVE
+/// it (an older binary cannot guess semantics a newer writer baked in — see
+/// [`SidecarSchemaError`] and [`parse_sidecar`]). Bump this whenever a change
+/// alters how an existing field is *interpreted* (not merely adds an optional
+/// one), and add a fixed `*_SCHEMA_VERSION` floor like the one below so older
+/// generations keep their original semantics.
+///
+/// v1 → v2: Phase-B confirmation. A `BranchMerge` sidecar at v2 carries
+/// `confirmed_version` and is classified strictly (unconfirmed ⇒ partial ⇒ roll
+/// back); at v1 it predates confirmation and keeps the loose roll-forward. The
+/// reader must distinguish the two, so this is a real version bump, not an
+/// additive field.
+pub(crate) const SIDECAR_SCHEMA_VERSION: u32 = 2;
+
+/// The version at which Phase-B confirmation shipped. A `BranchMerge` sidecar is
+/// confirmation-aware (strict classification) iff `schema_version >=` this.
+/// FIXED at 2 — NOT derived from [`SIDECAR_SCHEMA_VERSION`] — so a future bump to
+/// v3+ still treats v2 sidecars as confirmation-aware.
+pub(crate) const CONFIRMATION_SCHEMA_VERSION: u32 = 2;
/// Selects which recovery actions are allowed in a sweep.
///
@@ -115,6 +131,54 @@ pub(crate) enum SidecarKind {
Optimize,
}
+/// Which recovery-classification semantics a sidecar's tables use. Resolved once
+/// from `(writer_kind, schema_version)` — see [`SidecarKind::classification_mode`]
+/// — so [`classify_table`] dispatches on the mode instead of re-deriving it from
+/// a kind×version match. Adding a writer kind or a version floor is then one arm
+/// in the resolver, not a guard threaded through `classify_table`.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub(crate) enum ClassificationMode {
+ /// Exactly one `commit_staged` per table (`Mutation`, `Load`): require
+ /// `lance_head == manifest_pinned + 1` and the pin to match.
+ Strict,
+ /// N ≥ 1 commits per table whose drift is content-preserving / derived
+ /// state (`SchemaApply`, `EnsureIndices`, `Optimize`, and pre-confirmation
+ /// `BranchMerge`): any `lance_head > manifest_pinned` rolls forward.
+ Loose,
+ /// Multi-commit publish of *distinct logical rows* with a recorded
+ /// `confirmed_version` (`BranchMerge` at `schema_version >=
+ /// CONFIRMATION_SCHEMA_VERSION`): roll forward ONLY to the confirmed
+ /// version; an unconfirmed moved HEAD is a partial publish and rolls back.
+ Confirmed,
+}
+
+impl SidecarKind {
+ /// Resolve the classification mode for this writer at a given sidecar
+ /// `schema_version`. Exhaustive over `SidecarKind`, so adding a variant is a
+ /// compile error here until its recovery semantics are declared.
+ pub(crate) fn classification_mode(self, schema_version: u32) -> ClassificationMode {
+ match self {
+ SidecarKind::Mutation | SidecarKind::Load => ClassificationMode::Strict,
+ // BranchMerge gained two-phase confirmation at
+ // `CONFIRMATION_SCHEMA_VERSION`. A sidecar written before that
+ // carries no `confirmed_version` and must keep the prior loose
+ // roll-forward — classifying it strictly would misread a *completed*
+ // pre-upgrade merge as a partial and roll it back. (The read gate
+ // already refused any version newer than this binary.)
+ SidecarKind::BranchMerge => {
+ if schema_version >= CONFIRMATION_SCHEMA_VERSION {
+ ClassificationMode::Confirmed
+ } else {
+ ClassificationMode::Loose
+ }
+ }
+ SidecarKind::SchemaApply | SidecarKind::EnsureIndices | SidecarKind::Optimize => {
+ ClassificationMode::Loose
+ }
+ }
+ }
+}
+
/// One table's contribution to a sidecar's intended commit. The classifier
/// uses these to decide per-table state at recovery time.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
@@ -126,8 +190,22 @@ pub(crate) struct SidecarTablePin {
/// Manifest-pinned version at writer start (CAS expectation).
pub expected_version: u64,
/// Lance HEAD that the writer's `commit_staged` would produce
- /// (typically `expected_version + 1`).
+ /// (typically `expected_version + 1`). For multi-commit writers this is
+ /// only a *lower bound* — see `confirmed_version`.
pub post_commit_pin: u64,
+ /// Phase-B confirmation: the exact Lance HEAD this table reached once the
+ /// writer's *entire* multi-commit publish for it finished, recorded by a
+ /// second sidecar write immediately before the manifest publish (Phase C).
+ /// `None` means Phase B did not complete (the writer crashed mid-publish),
+ /// so the on-disk drift is a *partial* commit and recovery must roll the
+ /// whole operation BACK rather than publish an incomplete state. Only the
+ /// `BranchMerge` writer records this today (its per-table publish is
+ /// append → upsert → delete, several HEAD advances that the manifest
+ /// publish makes atomic); other writers leave it `None` and keep their
+ /// existing loose roll-forward. Backward-compatible: absent on older
+ /// sidecars.
+ #[serde(default, skip_serializing_if = "Option::is_none")]
+ pub confirmed_version: Option,
/// Lance branch ref this table lives on (mirrors
/// `SubTableEntry::table_branch`). Required for the recovery sweep
/// to open the dataset at the correct ref — `Dataset::open(path)`
@@ -218,25 +296,27 @@ pub(crate) struct RecoverySidecarHandle {
pub(crate) sidecar_uri: String,
}
-/// Error returned when the sidecar's `schema_version` is unknown to this
-/// binary. We refuse-and-error rather than read-and-warn: an old binary
-/// cannot guess what semantics a newer writer baked into a future shape.
-/// Operator action is required (typically: upgrade the binary).
+/// Error returned when the sidecar's `schema_version` is NEWER than this binary
+/// understands. We refuse-and-error rather than read-and-warn: an old binary
+/// cannot guess what semantics a newer writer baked into a future shape. (Older
+/// versions are accepted and interpreted with their original semantics — see
+/// [`parse_sidecar`].) Operator action is required (typically: upgrade the
+/// binary).
#[derive(Debug)]
pub(crate) struct SidecarSchemaError {
pub sidecar_uri: String,
pub found_version: u32,
- pub supported_version: u32,
+ pub max_supported_version: u32,
}
impl std::fmt::Display for SidecarSchemaError {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
f,
- "recovery sidecar at '{}' declares schema_version={}, but this \
- binary supports only schema_version={}; refusing to interpret \
+ "recovery sidecar at '{}' declares schema_version={}, newer than the \
+ maximum this binary supports (schema_version={}); refusing to interpret \
— upgrade omnigraph or remove the sidecar with operator review",
- self.sidecar_uri, self.found_version, self.supported_version,
+ self.sidecar_uri, self.found_version, self.max_supported_version,
)
}
}
@@ -271,6 +351,14 @@ pub(crate) enum TableClassification {
/// previous restore attempt or an external mutation. Roll back to
/// the manifest pin.
UnexpectedMultistep,
+ /// A confirmation-using writer (`BranchMerge`) advanced this table's HEAD
+ /// (`lance_head > manifest_pinned`) but the sidecar carries no
+ /// `confirmed_version` — its multi-commit publish crashed mid-flight, so
+ /// the drift is a *partial* commit (e.g. an append without its sibling
+ /// upsert/delete). Roll back to the manifest pin; the whole operation is
+ /// re-run from scratch. Distinct from `UnexpectedMultistep` so the audit
+ /// records a partial Phase B, not a foreign mutation.
+ IncompletePhaseB,
/// `lance_head < manifest_pinned`. Should be impossible: the manifest
/// pin can only advance after a successful Lance commit. Surface
/// loudly and abort recovery.
@@ -341,6 +429,58 @@ pub(crate) async fn write_sidecar(
})
}
+/// Phase-B confirmation: stamp each pin with the exact Lance HEAD its publish
+/// reached, then re-write the sidecar in place (same object). Called once, after
+/// the writer's whole multi-commit publish completed and before the manifest
+/// publish (Phase C). Recovery then rolls forward ONLY to these confirmed
+/// versions; a sidecar still missing them is a partial Phase B that rolls back.
+///
+/// Overwriting the same object is atomic (same contract as [`write_sidecar`]):
+/// a torn rewrite is never observed, so recovery reads either the pre-confirm
+/// sidecar (→ roll back, safe) or the confirmed one (→ roll forward). A failure
+/// here leaves the pre-confirm sidecar, so the operation rolls back — correct.
+///
+/// SURVIVES the fragment-adopt work (unlike the row-level merge it currently
+/// serves — see `AdoptDelta` in `exec/merge.rs`). The recovery sidecar is the
+/// cross-table write-ahead log that makes a fast-forward-main commit
+/// all-or-nothing across N tables, which a fragment graft still needs. What
+/// narrows is the *within-table* reason for confirmation: once each table's
+/// merge is a single graft commit, the multi-step partial window shrinks to one
+/// commit, so the `BranchMerge` arm of `classify_table` could fold back into the
+/// strict single-commit path and `IncompletePhaseB` retire. Do NOT delete this
+/// with the row path — keep the sidecar; only simplify the classifier.
+pub(crate) async fn confirm_sidecar_phase_b(
+ root_uri: &str,
+ storage: &dyn StorageAdapter,
+ sidecar: &mut RecoverySidecar,
+ confirmed_versions: &HashMap,
+) -> Result<()> {
+ // Failpoint: models a storage failure on the confirmation write — the
+ // pre-confirm sidecar stays on disk, so recovery rolls the operation back.
+ crate::failpoints::maybe_fail("recovery.sidecar_confirm")?;
+ for pin in &mut sidecar.tables {
+ // Every pinned table MUST have an achieved version. A miss means the
+ // pin set and the publish `updates` diverged — fail loudly at the
+ // producer rather than leave the pin unconfirmed, which recovery would
+ // read as a partial Phase B and silently roll the whole (complete) merge
+ // back. Today the two are kept in lockstep by construction; this guards
+ // the invariant against a future edit to either filter.
+ let version = confirmed_versions.get(&pin.table_key).ok_or_else(|| {
+ OmniError::manifest_internal(format!(
+ "confirm_sidecar_phase_b: no achieved version for pinned table '{}' \
+ (pins and publish updates diverged)",
+ pin.table_key
+ ))
+ })?;
+ pin.confirmed_version = Some(*version);
+ }
+ let uri = sidecar_uri(root_uri, &sidecar.operation_id);
+ let json = serde_json::to_string_pretty(sidecar).map_err(|err| {
+ OmniError::manifest_internal(format!("failed to serialize recovery sidecar: {}", err))
+ })?;
+ storage.write_text(&uri, &json).await
+}
+
/// Delete a sidecar after Phase C succeeded. Idempotent (safe to retry).
pub(crate) async fn delete_sidecar(
handle: &RecoverySidecarHandle,
@@ -408,11 +548,15 @@ pub(crate) fn parse_sidecar(sidecar_uri: &str, body: &str) -> Result SIDECAR_SCHEMA_VERSION {
return Err(SidecarSchemaError {
sidecar_uri: sidecar_uri.to_string(),
found_version: peek.schema_version,
- supported_version: SIDECAR_SCHEMA_VERSION,
+ max_supported_version: SIDECAR_SCHEMA_VERSION,
}
.into());
}
@@ -427,26 +571,38 @@ pub(crate) fn parse_sidecar(sidecar_uri: &str, body: &str) -> Result manifest_pinned` is ambiguous — it may be a *complete*
+/// publish or a *partial* one crashed mid-sequence. The writer resolves
+/// the ambiguity by recording the exact achieved version
+/// (`confirmed_version`) only after the whole publish finished. So roll
+/// forward ONLY to that confirmed version; a missing confirmation is a
+/// partial commit (`IncompletePhaseB`) and rolls back. This is the safe
+/// form of the loose match for writers where a partial would publish an
+/// incomplete delta.
/// - **Strict** (`Mutation`, `Load`): exactly one `commit_staged` per
/// table, so `lance_head == manifest_pinned + 1` AND
/// `post_commit_pin == lance_head` is required.
-/// - **Loose** (`SchemaApply`, `EnsureIndices`, `BranchMerge`,
-/// `Optimize`): the writer advances the Lance HEAD by N ≥ 1 commits
-/// per table (one per index built + one for the overwrite, etc.;
-/// merge tables run merge_insert + delete_where + index rebuilds;
-/// `Optimize` runs `compact_files`, which commits reserve-fragments +
-/// rewrite) and the exact N is hard to compute at sidecar-write time.
-/// The loose match accepts
+/// - **Loose** (`SchemaApply`, `EnsureIndices`, `Optimize`): the writer
+/// advances the Lance HEAD by N ≥ 1 commits per table (one per index
+/// built + one for the overwrite, etc.; `Optimize` runs `compact_files`,
+/// which commits reserve-fragments + rewrite) and the exact N is hard to
+/// compute at sidecar-write time. The loose match accepts
/// any `lance_head > manifest_pinned` as `RolledPastExpected` when
/// `pin.expected_version == manifest_pinned` (the writer's CAS
-/// target matches what the manifest currently shows). The risk this
-/// admits — an external agent advancing HEAD between sidecar write
-/// and recovery — is out of scope for the single-coordinator model.
+/// target matches what the manifest currently shows). This is safe for
+/// these writers because their drift is derived state (index coverage,
+/// compaction) the reconciler reproduces — a partial roll-forward loses
+/// no logical rows. The risk it admits — an external agent advancing HEAD
+/// between sidecar write and recovery — is out of scope for the
+/// single-coordinator model.
pub(crate) fn classify_table(
pin: &SidecarTablePin,
lance_head: u64,
manifest_pinned: u64,
kind: SidecarKind,
+ schema_version: u32,
) -> TableClassification {
use TableClassification::*;
if lance_head < manifest_pinned {
@@ -457,27 +613,49 @@ pub(crate) fn classify_table(
if lance_head == manifest_pinned {
return NoMovement;
}
- // lance_head > manifest_pinned
- let strict = matches!(kind, SidecarKind::Mutation | SidecarKind::Load);
- if strict {
- if lance_head == manifest_pinned + 1 {
- if pin.expected_version == manifest_pinned && pin.post_commit_pin == lance_head {
- RolledPastExpected
- } else {
- UnexpectedAtP1
+ // lance_head > manifest_pinned. The "which semantics" decision is resolved
+ // once from (kind, schema_version); dispatch on it.
+ match kind.classification_mode(schema_version) {
+ ClassificationMode::Confirmed => {
+ // Two-phase confirmation: roll forward only to the exact version the
+ // writer recorded after its whole multi-commit publish completed. No
+ // confirmation ⇒ the publish crashed mid-sequence ⇒ partial ⇒ roll
+ // back. A confirmation that doesn't match the observed HEAD means a
+ // foreign writer advanced the table — don't roll a surprise forward.
+ match pin.confirmed_version {
+ Some(confirmed)
+ if lance_head == confirmed && pin.expected_version == manifest_pinned =>
+ {
+ RolledPastExpected
+ }
+ Some(_) => UnexpectedMultistep,
+ None => IncompletePhaseB,
}
- } else {
- // lance_head > manifest_pinned + 1
- UnexpectedMultistep
}
- } else {
- // Loose match for multi-commit writers (SchemaApply, EnsureIndices).
- if pin.expected_version == manifest_pinned {
- RolledPastExpected
- } else if lance_head == manifest_pinned + 1 {
- UnexpectedAtP1
- } else {
- UnexpectedMultistep
+ ClassificationMode::Strict => {
+ if lance_head == manifest_pinned + 1 {
+ if pin.expected_version == manifest_pinned && pin.post_commit_pin == lance_head {
+ RolledPastExpected
+ } else {
+ UnexpectedAtP1
+ }
+ } else {
+ // lance_head > manifest_pinned + 1
+ UnexpectedMultistep
+ }
+ }
+ ClassificationMode::Loose => {
+ // Multi-commit writers whose drift is content-preserving / derived
+ // state (and pre-confirmation BranchMerge sidecars): any
+ // `lance_head > manifest_pinned` rolls forward when the CAS target
+ // matches what the manifest currently shows.
+ if pin.expected_version == manifest_pinned {
+ RolledPastExpected
+ } else if lance_head == manifest_pinned + 1 {
+ UnexpectedAtP1
+ } else {
+ UnexpectedMultistep
+ }
}
}
}
@@ -496,7 +674,7 @@ pub(crate) fn decide(classifications: &[TableClassification]) -> SidecarDecision
}
if classifications
.iter()
- .any(|c| matches!(c, NoMovement | UnexpectedAtP1 | UnexpectedMultistep))
+ .any(|c| matches!(c, NoMovement | UnexpectedAtP1 | UnexpectedMultistep | IncompletePhaseB))
{
return RollBack;
}
@@ -830,7 +1008,12 @@ pub(crate) async fn recover_manifest_drift(
// write-entry heal: a deferred sidecar whose branch was
// deleted would otherwise fail every ReadWrite open.
coordinator.refresh().await?;
- if !coordinator.all_branches().await?.iter().any(|name| name == b) {
+ if !coordinator
+ .all_branches()
+ .await?
+ .iter()
+ .any(|name| name == b)
+ {
discard_orphaned_branch_sidecar(
root_uri,
storage.as_ref(),
@@ -886,7 +1069,13 @@ async fn process_sidecar(
.map(|e| e.table_version)
.unwrap_or(0);
states.push(ClassifiedTable {
- classification: classify_table(pin, lance_head, manifest_pinned, sidecar.writer_kind),
+ classification: classify_table(
+ pin,
+ lance_head,
+ manifest_pinned,
+ sidecar.writer_kind,
+ sidecar.schema_version,
+ ),
manifest_pinned,
lance_head,
});
@@ -1023,7 +1212,7 @@ async fn process_sidecar(
Phase C did not land)"
);
let (new_manifest_version, published_versions) =
- roll_forward_all(root_uri, sidecar, snapshot).await?;
+ roll_forward_all(root_uri, sidecar, &states, snapshot).await?;
// `to_version` records the ACTUAL Lance HEAD published for
// each table (not pin.post_commit_pin, which is a lower bound
// for loose-match writers like SchemaApply / EnsureIndices /
@@ -1107,6 +1296,7 @@ async fn roll_back_sidecar(
TableClassification::RolledPastExpected
| TableClassification::UnexpectedAtP1
| TableClassification::UnexpectedMultistep
+ | TableClassification::IncompletePhaseB
) {
restore_table_to_version(
&pin.table_path,
@@ -1114,14 +1304,17 @@ async fn roll_back_sidecar(
state.manifest_pinned,
)
.await?;
- // Publish the post-restore HEAD, CAS against the current (unmoved)
- // manifest pin — the same helper roll-forward uses.
- push_table_update_at_head(
+ // Publish the post-restore HEAD (the restore commit we just made),
+ // CAS against the current (unmoved) manifest pin — the same helper
+ // roll-forward uses. `None` target: there is no prior observation to
+ // pin to; the version to publish is the HEAD the restore produced.
+ push_table_update(
root_uri,
&pin.table_key,
&pin.table_path,
pin.table_branch.as_deref(),
state.manifest_pinned,
+ None,
&mut updates,
&mut expected,
)
@@ -1222,6 +1415,7 @@ async fn record_audit_recovery_rollforward(
async fn roll_forward_all(
root_uri: &str,
sidecar: &RecoverySidecar,
+ states: &[ClassifiedTable],
snapshot: &Snapshot,
) -> Result<(u64, HashMap)> {
let total_changes =
@@ -1231,22 +1425,25 @@ async fn roll_forward_all(
let mut published_versions: HashMap =
HashMap::with_capacity(sidecar.tables.len() + sidecar.additional_registrations.len());
- for pin in &sidecar.tables {
- // Publish to the table's CURRENT Lance HEAD on the pin's branch (not the
- // sidecar's `post_commit_pin`, a lower bound for loose-match writers that
- // run multiple commit_staged calls per table). CAS against the pin's
- // pre-write `expected_version`.
- let head_version = push_table_update_at_head(
+ for (pin, state) in sidecar.tables.iter().zip(states.iter()) {
+ // Publish the version classification OBSERVED (`state.lance_head`), not a
+ // fresh HEAD re-read. For a `Confirmed` pin classify already validated
+ // `lance_head == confirmed_version`, so this publishes the recorded WAL
+ // intent by construction; for loose/strict pins it's the multi-commit
+ // HEAD classify saw. Single observation, no classify→publish TOCTOU. CAS
+ // against the pin's pre-write `expected_version`.
+ let published = push_table_update(
root_uri,
&pin.table_key,
&pin.table_path,
pin.table_branch.as_deref(),
pin.expected_version,
+ Some(state.lance_head),
&mut updates,
&mut expected,
)
.await?;
- published_versions.insert(pin.table_key.clone(), head_version);
+ published_versions.insert(pin.table_key.clone(), published);
}
// SchemaApply-only: register added tables (and renamed targets) and
@@ -1346,45 +1543,61 @@ async fn roll_forward_all(
/// version the table was just restored to). The HEAD is read AFTER any restore
/// in the same single-threaded sweep, so no concurrent writer can have advanced
/// it.
-async fn push_table_update_at_head(
+/// Stage a manifest `Update` for one table.
+///
+/// `target_version` selects WHICH Lance version's state to publish:
+/// - `Some(v)` — pin the dataset at version `v` and publish it. Roll-FORWARD
+/// passes the version classification observed (and, for a `Confirmed` pin,
+/// validated equals `confirmed_version`), so recovery publishes the version it
+/// *decided* on rather than re-reading a HEAD a concurrent writer may have
+/// advanced since classification — one observation, used for both the decision
+/// and the publish (invariant 15).
+/// - `None` — publish the dataset's current HEAD. Roll-BACK uses this: it just
+/// created the restore commit, so HEAD *is* the version to publish.
+async fn push_table_update(
root_uri: &str,
table_key: &str,
table_path: &str,
branch: Option<&str>,
expected_version: u64,
+ target_version: Option,
updates: &mut Vec,
expected: &mut HashMap,
) -> Result {
- let head_ds = Dataset::open(table_path)
+ let ds = Dataset::open(table_path)
.await
.map_err(|e| OmniError::Lance(e.to_string()))?;
- let head_ds = match branch {
- Some(b) if b != "main" => head_ds
+ let ds = match branch {
+ Some(b) if b != "main" => ds
.checkout_branch(b)
.await
.map_err(|e| OmniError::Lance(e.to_string()))?,
- _ => head_ds,
+ _ => ds,
};
- let head_version = head_ds.version().version;
- let row_count = head_ds
+ let ds = match target_version {
+ Some(v) => ds
+ .checkout_version(v)
+ .await
+ .map_err(|e| OmniError::Lance(e.to_string()))?,
+ None => ds,
+ };
+ let published_version = ds.version().version;
+ let row_count = ds
.count_rows(None)
.await
.map_err(|e| OmniError::Lance(e.to_string()))? as u64;
let table_relative_path = super::table_path_for_table_key(table_key)?;
- let version_metadata = super::metadata::TableVersionMetadata::from_dataset(
- root_uri,
- &table_relative_path,
- &head_ds,
- )?;
+ let version_metadata =
+ super::metadata::TableVersionMetadata::from_dataset(root_uri, &table_relative_path, &ds)?;
updates.push(ManifestChange::Update(SubTableUpdate {
table_key: table_key.to_string(),
- table_version: head_version,
+ table_version: published_version,
table_branch: branch.map(str::to_string),
row_count,
version_metadata,
}));
expected.insert(table_key.to_string(), expected_version);
- Ok(head_version)
+ Ok(published_version)
}
/// Append the audit row describing this recovery action.
@@ -1568,6 +1781,7 @@ mod tests {
table_path: table_path.to_string(),
expected_version: expected,
post_commit_pin: post,
+ confirmed_version: None,
table_branch: None,
}
}
@@ -1592,30 +1806,39 @@ mod tests {
}
#[test]
- fn parse_sidecar_refuses_unknown_schema_version() {
- let body = r#"{
- "schema_version": 99,
- "operation_id": "01H000000000000000000000XX",
- "started_at": "0",
- "branch": null,
- "actor_id": null,
- "writer_kind": "Mutation",
- "tables": []
- }"#;
- let err = parse_sidecar("file:///tmp/__recovery/x.json", body).unwrap_err();
+ fn parse_sidecar_refuses_future_but_accepts_older_schema_version() {
+ let body = |version: u32| {
+ format!(
+ r#"{{
+ "schema_version": {version},
+ "operation_id": "01H000000000000000000000XX",
+ "started_at": "0",
+ "branch": null,
+ "actor_id": null,
+ "writer_kind": "BranchMerge",
+ "tables": []
+ }}"#
+ )
+ };
+ // A version NEWER than this binary's max → refuse (can't guess the future).
+ let err = parse_sidecar("file:///tmp/__recovery/x.json", &body(99)).unwrap_err();
let msg = err.to_string();
assert!(
- msg.contains("schema_version=99") && msg.contains("supports only schema_version=1"),
- "expected SidecarSchemaError mentioning the version mismatch, got: {}",
- msg,
+ msg.contains("schema_version=99") && msg.contains("newer than the maximum"),
+ "expected a future-version refusal, got: {msg}",
);
+ // An OLDER version (pre-confirmation v1) → accept and interpret with its
+ // original semantics; never refuse a version we were built to understand.
+ let parsed = parse_sidecar("file:///tmp/__recovery/x.json", &body(1))
+ .expect("a v1 (older) sidecar must parse, not be refused");
+ assert_eq!(parsed.schema_version, 1);
}
#[test]
fn classify_no_movement_when_head_equals_pinned() {
let pin = make_pin("node:Person", "irrelevant", 5, 6);
assert_eq!(
- classify_table(&pin, 5, 5, SidecarKind::Mutation),
+ classify_table(&pin, 5, 5, SidecarKind::Mutation, SIDECAR_SCHEMA_VERSION),
TableClassification::NoMovement,
);
}
@@ -1624,7 +1847,7 @@ mod tests {
fn classify_rolled_past_expected_when_sidecar_matches_strict() {
let pin = make_pin("node:Person", "irrelevant", 5, 6);
assert_eq!(
- classify_table(&pin, 6, 5, SidecarKind::Mutation),
+ classify_table(&pin, 6, 5, SidecarKind::Mutation, SIDECAR_SCHEMA_VERSION),
TableClassification::RolledPastExpected,
);
}
@@ -1634,7 +1857,7 @@ mod tests {
// Same +1 drift but post_commit_pin says it should be 7, not 6.
let pin = make_pin("node:Person", "irrelevant", 5, 7);
assert_eq!(
- classify_table(&pin, 6, 5, SidecarKind::Mutation),
+ classify_table(&pin, 6, 5, SidecarKind::Mutation, SIDECAR_SCHEMA_VERSION),
TableClassification::UnexpectedAtP1,
);
}
@@ -1643,7 +1866,7 @@ mod tests {
fn classify_unexpected_multistep_when_head_jumped_more_than_one_strict() {
let pin = make_pin("node:Person", "irrelevant", 5, 6);
assert_eq!(
- classify_table(&pin, 8, 5, SidecarKind::Mutation),
+ classify_table(&pin, 8, 5, SidecarKind::Mutation, SIDECAR_SCHEMA_VERSION),
TableClassification::UnexpectedMultistep,
);
}
@@ -1652,7 +1875,7 @@ mod tests {
fn classify_invariant_violation_when_head_below_pinned() {
let pin = make_pin("node:Person", "irrelevant", 5, 6);
assert_eq!(
- classify_table(&pin, 3, 5, SidecarKind::Mutation),
+ classify_table(&pin, 3, 5, SidecarKind::Mutation, SIDECAR_SCHEMA_VERSION),
TableClassification::InvariantViolation { observed: 3 },
);
}
@@ -1668,7 +1891,7 @@ mod tests {
// built two indices). Strict would say UnexpectedMultistep; loose
// accepts it as RolledPastExpected.
assert_eq!(
- classify_table(&pin, 8, 5, SidecarKind::SchemaApply),
+ classify_table(&pin, 8, 5, SidecarKind::SchemaApply, SIDECAR_SCHEMA_VERSION),
TableClassification::RolledPastExpected,
);
}
@@ -1677,7 +1900,7 @@ mod tests {
fn classify_loose_match_accepts_multi_commit_drift_for_ensure_indices() {
let pin = make_pin("node:Person", "irrelevant", 5, 6);
assert_eq!(
- classify_table(&pin, 9, 5, SidecarKind::EnsureIndices),
+ classify_table(&pin, 9, 5, SidecarKind::EnsureIndices, SIDECAR_SCHEMA_VERSION),
TableClassification::RolledPastExpected,
);
}
@@ -1686,7 +1909,7 @@ mod tests {
fn classify_loose_match_no_movement_unchanged() {
let pin = make_pin("node:Person", "irrelevant", 5, 6);
assert_eq!(
- classify_table(&pin, 5, 5, SidecarKind::SchemaApply),
+ classify_table(&pin, 5, 5, SidecarKind::SchemaApply, SIDECAR_SCHEMA_VERSION),
TableClassification::NoMovement,
);
}
@@ -1695,31 +1918,65 @@ mod tests {
fn classify_loose_match_invariant_violation_unchanged() {
let pin = make_pin("node:Person", "irrelevant", 5, 6);
assert_eq!(
- classify_table(&pin, 3, 5, SidecarKind::SchemaApply),
+ classify_table(&pin, 3, 5, SidecarKind::SchemaApply, SIDECAR_SCHEMA_VERSION),
TableClassification::InvariantViolation { observed: 3 },
);
}
- /// BranchMerge must be loose-matched, not strict: while the strict
- /// classifier expects exactly one `commit_staged` per table,
- /// `publish_rewritten_merge_table` runs multiple per table
- /// (merge_insert + delete_where + index rebuilds — the comment in
- /// `merge.rs` explicitly says so). Strict classification would roll
- /// back valid completed Phase B work as `UnexpectedMultistep`.
+ /// BranchMerge advances each table by several commits per table
+ /// (adopt: append + upsert + delete; three-way: merge_insert + delete +
+ /// index), so a bare "HEAD moved" is ambiguous between a complete and a
+ /// partial publish. At a confirmation-aware version the two-phase
+ /// confirmation resolves it: roll forward ONLY to the recorded
+ /// `confirmed_version`; an unconfirmed moved HEAD is a partial publish
+ /// (`IncompletePhaseB` ⇒ roll back), and a confirmed version that doesn't
+ /// match the observed HEAD is a foreign advance (`UnexpectedMultistep` ⇒
+ /// roll back). A *pre-confirmation* (v1) sidecar carries no confirmation and
+ /// must keep the original loose roll-forward — reading it as strict would
+ /// roll a completed pre-upgrade merge back (silent discard).
#[test]
- fn classify_loose_match_accepts_multi_commit_drift_for_branch_merge() {
- let pin = make_pin("node:Person", "irrelevant", 5, 6);
+ fn classify_branch_merge_requires_phase_b_confirmation() {
+ // Unconfirmed multi-commit drift at a confirmation-aware version →
+ // partial Phase B → roll back.
+ let unconfirmed = make_pin("node:Person", "irrelevant", 5, 6);
assert_eq!(
- classify_table(&pin, 8, 5, SidecarKind::BranchMerge),
+ classify_table(&unconfirmed, 8, 5, SidecarKind::BranchMerge, SIDECAR_SCHEMA_VERSION),
+ TableClassification::IncompletePhaseB,
+ );
+ // Backward-compat: the SAME unconfirmed pin in a PRE-confirmation (v1)
+ // sidecar → loose roll-forward (the regression fix — a completed
+ // pre-upgrade merge must not be discarded).
+ assert_eq!(
+ classify_table(
+ &unconfirmed,
+ 8,
+ 5,
+ SidecarKind::BranchMerge,
+ CONFIRMATION_SCHEMA_VERSION - 1,
+ ),
TableClassification::RolledPastExpected,
);
+ // Confirmed to the observed HEAD → complete Phase B → roll forward.
+ let confirmed = SidecarTablePin {
+ confirmed_version: Some(8),
+ ..make_pin("node:Person", "irrelevant", 5, 6)
+ };
+ assert_eq!(
+ classify_table(&confirmed, 8, 5, SidecarKind::BranchMerge, SIDECAR_SCHEMA_VERSION),
+ TableClassification::RolledPastExpected,
+ );
+ // Confirmed, but HEAD drifted past it (foreign writer) → roll back.
+ assert_eq!(
+ classify_table(&confirmed, 9, 5, SidecarKind::BranchMerge, SIDECAR_SCHEMA_VERSION),
+ TableClassification::UnexpectedMultistep,
+ );
}
#[test]
fn classify_loose_match_branch_merge_no_movement_unchanged() {
let pin = make_pin("node:Person", "irrelevant", 5, 6);
assert_eq!(
- classify_table(&pin, 5, 5, SidecarKind::BranchMerge),
+ classify_table(&pin, 5, 5, SidecarKind::BranchMerge, SIDECAR_SCHEMA_VERSION),
TableClassification::NoMovement,
);
}
@@ -1728,7 +1985,7 @@ mod tests {
fn classify_loose_match_branch_merge_invariant_violation_unchanged() {
let pin = make_pin("node:Person", "irrelevant", 5, 6);
assert_eq!(
- classify_table(&pin, 3, 5, SidecarKind::BranchMerge),
+ classify_table(&pin, 3, 5, SidecarKind::BranchMerge, SIDECAR_SCHEMA_VERSION),
TableClassification::InvariantViolation { observed: 3 },
);
}
@@ -1883,6 +2140,37 @@ mod tests {
assert!(after.is_empty());
}
+ #[tokio::test]
+ async fn confirm_sidecar_phase_b_errors_when_pin_missing_from_updates() {
+ // A pinned table with no achieved version in the publish `updates` must
+ // be a loud producer error, NOT a silent skip that leaves the pin
+ // unconfirmed (which recovery would read as a partial Phase B and roll
+ // the whole complete merge back). Guards the implicit `pins ⊆ updates`
+ // invariant against a future divergence between the two filters.
+ let dir = tempfile::tempdir().unwrap();
+ let storage = ObjectStorageAdapter::local();
+ let mut sidecar = new_sidecar(
+ SidecarKind::BranchMerge,
+ Some("main".to_string()),
+ None,
+ vec![make_pin("node:Person", "file:///tmp/x.lance", 5, 6)],
+ );
+ // The confirmed-versions map does NOT cover the pinned table.
+ let confirmed: HashMap = HashMap::new();
+ let err = confirm_sidecar_phase_b(
+ dir.path().to_str().unwrap(),
+ &storage,
+ &mut sidecar,
+ &confirmed,
+ )
+ .await
+ .expect_err("a pinned table with no achieved version must be a loud error");
+ assert!(
+ err.to_string().contains("pins and publish updates diverged"),
+ "expected a pin/updates divergence error, got: {err}",
+ );
+ }
+
#[tokio::test]
async fn list_sidecars_skips_non_json_files() {
let dir = tempfile::tempdir().unwrap();
diff --git a/crates/omnigraph/src/db/manifest/tests.rs b/crates/omnigraph/src/db/manifest/tests.rs
index 0e00505..3888bd4 100644
--- a/crates/omnigraph/src/db/manifest/tests.rs
+++ b/crates/omnigraph/src/db/manifest/tests.rs
@@ -1531,7 +1531,11 @@ async fn test_v2_to_v3_sweeps_legacy_run_branches_on_write_open() {
.await
.unwrap();
let post = open_manifest_dataset(uri, None).await.unwrap();
- assert_eq!(super::migrations::read_stamp(&post), 2, "stamp rewound to v2");
+ assert_eq!(
+ super::migrations::read_stamp(&post),
+ 2,
+ "stamp rewound to v2"
+ );
}
// A no-op publish forces the open-for-write path, which runs the migration.
@@ -1556,7 +1560,10 @@ async fn test_v2_to_v3_sweeps_legacy_run_branches_on_write_open() {
!after.iter().any(|b| b.starts_with("__run__")),
"legacy run branch must be swept; got {after:?}",
);
- assert!(after.iter().any(|b| b == "feature"), "user branch must survive");
+ assert!(
+ after.iter().any(|b| b == "feature"),
+ "user branch must survive"
+ );
assert!(after.iter().any(|b| b == "main"), "main must survive");
// Idempotent: a second write-open finds the stamp at current and does not
diff --git a/crates/omnigraph/src/db/omnigraph.rs b/crates/omnigraph/src/db/omnigraph.rs
index 48be274..e1d7acf 100644
--- a/crates/omnigraph/src/db/omnigraph.rs
+++ b/crates/omnigraph/src/db/omnigraph.rs
@@ -106,6 +106,12 @@ pub struct Omnigraph {
coordinator: Arc>,
table_store: TableStore,
runtime_cache: RuntimeCache,
+ /// Per-graph read caches: one shared Lance `Session` plus the held-`Dataset`
+ /// handle cache, handed to live-Branch-read snapshots (via
+ /// `resolved_target`) so table opens reuse handles (0 IO on a warm repeat)
+ /// and one session. Invalidated alongside `runtime_cache` on branch switch /
+ /// refresh — hygiene only; version-in-key carries correctness.
+ read_caches: Arc,
/// Read-heavy on every query, written only by `apply_schema`. ArcSwap
/// gives atomic pointer swap with zero-cost reads (`load()` returns a
/// `Guard>`), so concurrent queries on different actors
@@ -327,6 +333,14 @@ impl Omnigraph {
coordinator: Arc::new(tokio::sync::RwLock::new(coordinator)),
table_store: TableStore::new(&root),
runtime_cache: RuntimeCache::default(),
+ // One shared Session per graph (LanceDB's one-session-per-connection
+ // model) plus the held-handle cache, created once and reused across
+ // reads. Session::default() caps are lazy (6 GiB index / 1 GiB
+ // metadata); multi-graph cap/sharing is a deferred follow-up.
+ read_caches: Arc::new(crate::runtime_cache::ReadCaches {
+ session: Arc::new(lance::session::Session::default()),
+ handles: Arc::new(crate::runtime_cache::TableHandleCache::default()),
+ }),
catalog: Arc::new(ArcSwap::from_pointee(catalog)),
schema_source: Arc::new(ArcSwap::from_pointee(schema_source.to_string())),
write_queue: Arc::new(crate::db::write_queue::WriteQueueManager::new()),
@@ -351,12 +365,10 @@ impl Omnigraph {
Self::open_with_storage_and_mode(uri, storage_for_uri(uri)?, OpenMode::ReadOnly).await
}
- /// `open_with_storage` retained for existing callers (init/test paths).
- /// Defaults to `OpenMode::ReadWrite`.
- pub(crate) async fn open_with_storage(
- uri: &str,
- storage: Arc,
- ) -> Result {
+ /// Open with a caller-supplied [`StorageAdapter`]. Used by init/test paths
+ /// and by embedding/test consumers that wrap storage (e.g. a counting
+ /// decorator for IO-budget tests). Defaults to `OpenMode::ReadWrite`.
+ pub async fn open_with_storage(uri: &str, storage: Arc) -> Result {
Self::open_with_storage_and_mode(uri, storage, OpenMode::ReadWrite).await
}
@@ -428,6 +440,14 @@ impl Omnigraph {
coordinator: Arc::new(tokio::sync::RwLock::new(coordinator)),
table_store: TableStore::new(&root),
runtime_cache: RuntimeCache::default(),
+ // One shared Session per graph (LanceDB's one-session-per-connection
+ // model) plus the held-handle cache, created once and reused across
+ // reads. Session::default() caps are lazy (6 GiB index / 1 GiB
+ // metadata); multi-graph cap/sharing is a deferred follow-up.
+ read_caches: Arc::new(crate::runtime_cache::ReadCaches {
+ session: Arc::new(lance::session::Session::default()),
+ handles: Arc::new(crate::runtime_cache::TableHandleCache::default()),
+ }),
catalog: Arc::new(ArcSwap::from_pointee(catalog)),
schema_source: Arc::new(ArcSwap::from_pointee(schema_source)),
write_queue: Arc::new(crate::db::write_queue::WriteQueueManager::new()),
@@ -539,6 +559,12 @@ impl Omnigraph {
}
pub(crate) async fn ensure_schema_state_valid(&self) -> Result<()> {
+ // Full per-call validation is intentional: a long-lived handle must
+ // detect external drift of the schema source, IR, OR state on its next
+ // operation (see lifecycle::long_lived_handle_rejects_schema_* tests). A
+ // source-only fast path would miss IR/state drift when _schema.pg is
+ // unchanged, so the only safe latency win is not calling this twice per
+ // query (finding A removes the redundant caller in exec/query.rs).
validate_schema_contract(self.uri(), Arc::clone(&self.storage)).await
}
@@ -719,10 +745,13 @@ impl Omnigraph {
let normalized = normalize_branch_name(branch.unwrap_or("main"))?;
let coord = self.coordinator.read().await;
if normalized.as_deref() == coord.current_branch() {
- let snapshot_id = coord
- .head_commit_id()
- .await?
- .unwrap_or_else(|| SnapshotId::synthetic(coord.current_branch(), coord.version()));
+ let snapshot_id = coord.head_commit_id().await?.unwrap_or_else(|| {
+ SnapshotId::synthetic(
+ coord.current_branch(),
+ coord.version(),
+ coord.manifest_incarnation().e_tag.as_deref(),
+ )
+ });
return Ok(ResolvedTarget {
requested,
branch: coord.current_branch().map(str::to_string),
@@ -785,10 +814,15 @@ impl Omnigraph {
let branch = normalize_branch_name(branch)?;
let next = self.open_coordinator_for_branch(branch.as_deref()).await?;
*self.coordinator.write().await = next;
- self.runtime_cache.invalidate_all().await;
+ self.invalidate_read_caches().await;
Ok(())
}
+ async fn invalidate_read_caches(&self) {
+ self.runtime_cache.invalidate_all().await;
+ self.read_caches.handles.invalidate_all().await;
+ }
+
/// Re-read the handle-local coordinator state from storage AND run
/// in-process recovery. Closes the Phase B → Phase C residual (e.g.
/// `MutationStaging::finalize` crash mid-publish in a long-running
@@ -888,7 +922,7 @@ impl Omnigraph {
)
.await?;
self.reload_schema_if_source_changed().await?;
- self.runtime_cache.invalidate_all().await;
+ self.invalidate_read_caches().await;
Ok(())
}
@@ -920,7 +954,7 @@ impl Omnigraph {
// write that triggered the heal validates against the stale
// schema. Same post-heal step as `refresh`.
self.reload_schema_if_source_changed().await?;
- self.runtime_cache.invalidate_all().await;
+ self.invalidate_read_caches().await;
}
Ok(())
}
@@ -956,7 +990,7 @@ impl Omnigraph {
/// own publish path.
pub(crate) async fn refresh_coordinator_only(&self) -> Result<()> {
self.coordinator.write().await.refresh().await?;
- self.runtime_cache.invalidate_all().await;
+ self.invalidate_read_caches().await;
Ok(())
}
@@ -974,11 +1008,66 @@ impl Omnigraph {
target: impl Into,
) -> Result {
self.ensure_schema_state_valid().await?;
- self.coordinator
- .read()
- .await
- .resolve_target(&target.into())
- .await
+ let target = target.into();
+ let mut resolved = self.resolve_target_inner(&target).await?;
+ // Attach the read caches (shared Session + held-handle cache) for live
+ // Branch reads so table opens reuse handles (0 IO on a warm repeat).
+ // Snapshot-id reads are deliberately NOT cached: they pin a historical
+ // version `cleanup` may GC, so bypassing the cache sidesteps the
+ // cleanup-vs-cached-handle edge. Writes never reach here (they use
+ // `resolved_branch_target`), so they never receive a pinned handle.
+ if matches!(target, ReadTarget::Branch(_)) {
+ resolved
+ .snapshot
+ .set_read_caches(Arc::clone(&self.read_caches));
+ }
+ Ok(resolved)
+ }
+
+ /// Resolve a read target to its snapshot, without attaching read caches.
+ /// Same-branch reads reuse the warm coordinator, gated by a cheap version
+ /// probe (invariant 6: strong consistency, never a blind warm read). Reads do
+ /// not need the commit graph (the manifest version is the visibility
+ /// authority, invariant 2), so the id is synthetic and no commit-graph scan
+ /// happens on this path.
+ async fn resolve_target_inner(&self, target: &ReadTarget) -> Result {
+ if let ReadTarget::Branch(branch) = target {
+ let normalized = normalize_branch_name(branch)?;
+ {
+ let coord = self.coordinator.read().await;
+ if normalized.as_deref() != coord.current_branch() {
+ // Different branch: cold resolve (opens that branch).
+ return coord.resolve_target(target).await;
+ }
+ let held = coord.manifest_incarnation();
+ if coord.probe_latest_incarnation().await?.matches(&held) {
+ return Ok(warm_resolved_target(&coord, target));
+ }
+ // Stale: refresh under the write lock below.
+ }
+ let mut coord = self.coordinator.write().await;
+ if normalized.as_deref() == coord.current_branch() {
+ // Re-check after taking the write lock; another writer may have
+ // refreshed (tokio RwLock has no read->write upgrade).
+ let held = coord.manifest_incarnation();
+ let mut refreshed = false;
+ if !coord.probe_latest_incarnation().await?.matches(&held) {
+ coord.refresh_manifest_only().await?;
+ refreshed = true;
+ }
+ let resolved = warm_resolved_target(&coord, target);
+ drop(coord);
+ if refreshed {
+ self.invalidate_read_caches().await;
+ }
+ return Ok(resolved);
+ }
+ // Branch changed while waiting for the write lock: cold resolve.
+ return coord.resolve_target(target).await;
+ }
+
+ // Snapshot target: resolve through the commit graph as before.
+ self.coordinator.read().await.resolve_target(target).await
}
// ─── Change detection ────────────────────────────────────────────────
@@ -1673,6 +1762,24 @@ pub(crate) fn normalize_branch_name(branch: &str) -> Result