diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index d4ecfa5..e937724 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -8,9 +8,9 @@ # CI fails if this file drifts from its source, and rejects PRs that # edit this file directly without also editing the yml. -* @ragnorc +* @ragnorc @aaltshuler -crates/** @ragnorc +crates/** @ragnorc @aaltshuler docs/** @ragnorc README.md @ragnorc AGENTS.md @ragnorc diff --git a/.github/DISCUSSION_TEMPLATE/rfc.yml b/.github/DISCUSSION_TEMPLATE/rfc.yml new file mode 100644 index 0000000..2a63525 --- /dev/null +++ b/.github/DISCUSSION_TEMPLATE/rfc.yml @@ -0,0 +1,34 @@ +labels: ["rfc"] +body: + - type: markdown + attributes: + value: | + Use this to **incubate an RFC** β€” socialize a design and reach rough + consensus before writing the formal document. When it's ready, graduate + it into a pull request that adds `docs/rfcs/NNNN-title.md` + (see [docs/rfcs/README.md](../blob/main/docs/rfcs/README.md)); a + maintainer merging that PR is acceptance. + + For a plain feature request or open-ended idea, use the **Ideas** + category instead. For bugs, open an [Issue](../../issues/new/choose). + - type: textarea + id: problem + attributes: + label: Problem / motivation + description: What needs solving, and why is it worth the long-run cost? + validations: + required: true + - type: textarea + id: sketch + attributes: + label: Proposed direction (sketch) + description: A rough shape of the design. Detail comes later in the RFC document. + validations: + required: true + - type: textarea + id: invariants + attributes: + label: Invariants touched + description: Which items in docs/dev/invariants.md does this affect or risk? Any deny-list brush? 
+ validations: + required: false diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml new file mode 100644 index 0000000..8e19465 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -0,0 +1,55 @@ +name: Bug report +description: Report a reproducible problem or wrong behavior in OmniGraph. +title: "bug: " +labels: ["bug", "needs-triage"] +body: + - type: markdown + attributes: + value: | + Issues are for **reporting problems** — concrete, reproducible bugs. + For ideas, feature requests, or questions, please use + [Discussions](../../discussions) instead. + For a security vulnerability, follow [SECURITY.md](../../blob/main/SECURITY.md) — do **not** file it here. + + A maintainer will triage this; once labelled **`accepted`** it's open for a pull request + (see [GOVERNANCE.md](../../blob/main/GOVERNANCE.md)). + - type: textarea + id: what-happened + attributes: + label: What happened + description: What went wrong, and what you expected instead. + validations: + required: true + - type: textarea + id: repro + attributes: + label: Steps to reproduce + description: Minimal steps, commands, schema/query, or a failing snippet. + placeholder: | + 1. omnigraph init ... + 2. omnigraph ... + 3. observed: ... / expected: ... + validations: + required: true + - type: input + id: version + attributes: + label: Version + description: Output of `omnigraph --version` (or the engine/crate version) and how you installed it. + validations: + required: true + - type: input + id: environment + attributes: + label: Environment + description: OS, architecture, and storage backend (local FS / S3 / RustFS / MinIO). + validations: + required: false + - type: textarea + id: logs + attributes: + label: Logs / output + description: Relevant error text or logs. Will be rendered as code. 
+ render: shell + validations: + required: false diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 0000000..50720b8 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,13 @@ +# Issues are for problem reports only. Disable blank issues so everything is +# routed: bugs through the form, everything else to Discussions / SECURITY.md. +blank_issues_enabled: false +contact_links: + - name: 💡 Idea, feature request, or RFC + url: https://github.com/ModernRelay/omnigraph/discussions + about: Propose features and designs in Discussions. RFCs graduate from there into a docs/rfcs/ pull request. + - name: ❓ Question or help + url: https://github.com/ModernRelay/omnigraph/discussions + about: Ask in Discussions — questions are not tracked as Issues. + - name: 🔒 Security vulnerability + url: https://github.com/ModernRelay/omnigraph/blob/main/SECURITY.md + about: Report security issues privately per SECURITY.md — never as a public Issue. 
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000..2a548c7 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,29 @@ + + +## What & why + + + +## Backing issue / RFC + + + +- [ ] Fixes an **accepted** issue: Closes # +- [ ] Implements / is an **accepted** RFC: +- [ ] **Trivial fast-lane** (typo / docs / dependency bump / comment / one-line CI) β€” no issue/RFC required + +## Checklist + +- [ ] Change is focused (one logical change) +- [ ] Tests added/updated for behavior changes (or N/A) +- [ ] Public docs updated if user-facing surface changed (or N/A) +- [ ] Reviewed against [docs/dev/invariants.md](../blob/main/docs/dev/invariants.md) β€” no Hard Invariant weakened, no deny-list item hit (or justified) + +## Notes for reviewers + + diff --git a/.github/branch-protection.json b/.github/branch-protection.json index 61b7d33..7ca46b9 100644 --- a/.github/branch-protection.json +++ b/.github/branch-protection.json @@ -7,8 +7,8 @@ "Check AGENTS.md Links", "Test Workspace", "Test omnigraph-server --features aws", - "CODEOWNERS / drift", - "CODEOWNERS / noedit" + "CODEOWNERS matches source", + "CODEOWNERS not hand-edited" ] }, "enforce_admins": false, diff --git a/.github/codeowners-roles.yml b/.github/codeowners-roles.yml index c5e36a9..ce4014d 100644 --- a/.github/codeowners-roles.yml +++ b/.github/codeowners-roles.yml @@ -22,6 +22,7 @@ roles: compiler. members: - ragnorc + - aaltshuler docs: description: > diff --git a/.github/scripts/render-codeowners.py b/.github/scripts/render-codeowners.py index f243d0c..5e96545 100755 --- a/.github/scripts/render-codeowners.py +++ b/.github/scripts/render-codeowners.py @@ -1,10 +1,14 @@ #!/usr/bin/env python3 -"""Render .github/CODEOWNERS from .github/codeowners-roles.yml. +"""Render .github/CODEOWNERS and the ownership tables in +docs/dev/codeowners.md from .github/codeowners-roles.yml. 
-The yml is the source of truth β€” editing CODEOWNERS directly is -rejected by CI (see .github/workflows/codeowners.yml). This script -expands the role-based yml into the flat pathβ†’owners format GitHub -expects. +The yml is the source of truth. This script expands the role-based yml +into (1) the flat pathβ†’owners format GitHub expects in +`.github/CODEOWNERS`, and (2) the "who owns what" markdown tables spliced +between the generated-region markers in `docs/dev/codeowners.md`. Both are +derived artifacts; CI re-renders them on every PR (see +.github/workflows/codeowners.yml) and auto-commits the result on same-repo +PRs, so the source of truth and the human-readable view never drift. Usage: python3 .github/scripts/render-codeowners.py @@ -16,6 +20,7 @@ Exits non-zero on: one owner; otherwise CODEOWNERS would assign nobody and GitHub would silently fall back to "no required reviewer", which defeats the purpose). + - Missing generated-region markers in docs/dev/codeowners.md. """ from __future__ import annotations @@ -34,6 +39,13 @@ except ImportError: REPO_ROOT = Path(__file__).resolve().parents[2] SOURCE = REPO_ROOT / ".github" / "codeowners-roles.yml" OUTPUT = REPO_ROOT / ".github" / "CODEOWNERS" +DOCS = REPO_ROOT / "docs" / "dev" / "codeowners.md" + +# The "who owns what" tables in docs/dev/codeowners.md are spliced between +# these markers so the human-readable view never drifts from the source of +# truth. Edit codeowners-roles.yml and re-render β€” never the table by hand. +DOCS_BEGIN = "" +DOCS_END = "" BANNER = """\ # AUTOGENERATED from .github/codeowners-roles.yml. Do not edit by hand. 
@@ -75,6 +87,62 @@ def owners_for(role_names: list[str], roles: dict) -> list[str]: return seen +def _oneline(text: str) -> str: + """Collapse a folded/multi-line YAML description into one cell of text.""" + return " ".join((text or "").split()) + + +def ownership_tables(spec: dict, roles: dict) -> str: + """Render the human-readable "who owns what" markdown β€” a pathβ†’owners + table (the operative view at PR time, in last-match-wins order with the + catch-all first) plus a roleβ†’members table. Spliced into the docs between + the markers so it is always current with the source of truth.""" + out: list[str] = [] + + out.append("**Path β†’ owners** (GitHub applies *last match wins*; the `*` " + "catch-all is listed first and is overridden by the specific " + "patterns below it):") + out.append("") + out.append("| Path | Owners | Role(s) |") + out.append("|---|---|---|") + if "default" in spec: + owners = " ".join(owners_for(spec["default"], roles)) + out.append(f"| `*` | {owners} | {', '.join(spec['default'])} |") + for pattern, role_names in (spec.get("paths") or {}).items(): + owners = " ".join(owners_for(role_names, roles)) + out.append(f"| `{pattern}` | {owners} | {', '.join(role_names)} |") + out.append("") + + out.append("**Roles**:") + out.append("") + out.append("| Role | Members | Description |") + out.append("|---|---|---|") + for name, role in roles.items(): + members = " ".join(f"@{m}" for m in (role.get("members") or [])) + out.append(f"| `{name}` | {members} | {_oneline(role.get('description', ''))} |") + out.append("") + + return "\n".join(out) + + +def splice_docs(table_md: str) -> None: + """Replace the region between DOCS_BEGIN/DOCS_END in the docs file with the + freshly generated tables, leaving surrounding prose untouched.""" + if not DOCS.exists(): + sys.exit(f"error: docs file not found: {DOCS}") + text = DOCS.read_text() + if DOCS_BEGIN not in text or DOCS_END not in text: + sys.exit( + f"error: ownership markers not found in 
{DOCS.relative_to(REPO_ROOT)}. " + f"Add the lines:\n {DOCS_BEGIN}\n {DOCS_END}\n" + f"around the generated table region." + ) + head, rest = text.split(DOCS_BEGIN, 1) + _, tail = rest.split(DOCS_END, 1) + new = f"{head}{DOCS_BEGIN}\n\n{table_md}\n{DOCS_END}{tail}" + DOCS.write_text(new) + + def main() -> int: if not SOURCE.exists(): sys.exit(f"error: source file not found: {SOURCE}") @@ -127,6 +195,9 @@ def main() -> int: OUTPUT.write_text(rendered) print(f"wrote {OUTPUT.relative_to(REPO_ROOT)}") + + splice_docs(ownership_tables(spec, roles)) + print(f"updated {DOCS.relative_to(REPO_ROOT)}") return 0 diff --git a/.github/workflows/codeowners.yml b/.github/workflows/codeowners.yml index 19d5835..75b3515 100644 --- a/.github/workflows/codeowners.yml +++ b/.github/workflows/codeowners.yml @@ -1,19 +1,24 @@ name: CODEOWNERS +# Runs on EVERY pull request (no paths filter). The two jobs below are +# required status checks on `main`; a path-filtered required check never +# reports for PRs outside the filter and leaves them permanently "pending" +# (the trap that forced admin-override merges). Always-run + cheap +# short-circuit is what keeps them honest. on: pull_request: - paths: - - '.github/codeowners-roles.yml' - - '.github/CODEOWNERS' - - '.github/scripts/render-codeowners.py' - - '.github/workflows/codeowners.yml' workflow_dispatch: -# Read-only; we never push from this workflow. +# `drift` auto-commits the regenerated artifacts back to same-repo PR +# branches, so it needs write access. permissions: - contents: read + contents: write jobs: + # NOTE: the job `name:` values below ("CODEOWNERS matches source" / + # "CODEOWNERS not hand-edited") ARE the status-check contexts that + # .github/branch-protection.json must list verbatim. Renaming a job here + # is a branch-protection change β€” update the JSON and re-apply. 
drift: name: CODEOWNERS matches source runs-on: ubuntu-latest @@ -28,19 +33,56 @@ jobs: - name: Install PyYAML run: pip install pyyaml - - name: Re-render CODEOWNERS + - name: Re-render CODEOWNERS + ownership docs run: python3 .github/scripts/render-codeowners.py - - name: Reject drift + # Same-repo PR: push the regenerated artifacts back so contributors + # never have to run the script locally. Mirrors the openapi.json + # auto-commit in ci.yml (separate shallow clone of the head branch so + # the pushed commit carries only the regenerated files). + - name: Commit regenerated artifacts to PR branch + if: | + github.event_name == 'pull_request' && + github.event.pull_request.head.repo.full_name == github.repository + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | - if ! git diff --quiet .github/CODEOWNERS; then - echo "::error::.github/CODEOWNERS is out of sync with .github/codeowners-roles.yml." - echo "::error::Run \`python3 .github/scripts/render-codeowners.py\` locally and commit the result." + if git diff --quiet -- .github/CODEOWNERS docs/dev/codeowners.md; then + echo "CODEOWNERS and ownership docs already in sync." + exit 0 + fi + tmp=$(mktemp -d) + git clone --depth 1 --branch "${{ github.head_ref }}" \ + "https://x-access-token:${GITHUB_TOKEN}@github.com/${{ github.repository }}.git" \ + "$tmp" + cp .github/CODEOWNERS "$tmp/.github/CODEOWNERS" + cp docs/dev/codeowners.md "$tmp/docs/dev/codeowners.md" + cd "$tmp" + if git diff --quiet -- .github/CODEOWNERS docs/dev/codeowners.md; then + echo "Head branch already matches; nothing to push." + exit 0 + fi + git config user.name "github-actions[bot]" + git config user.email "41898282+github-actions[bot]@users.noreply.github.com" + git add .github/CODEOWNERS docs/dev/codeowners.md + git commit -m "chore: regenerate CODEOWNERS + ownership docs" + git push + + # Fork PR / workflow_dispatch: cannot push back, so enforce drift + # strictly. The contributor runs the script and commits the result. 
+ - name: Verify in sync (forks / manual runs) + if: | + !(github.event_name == 'pull_request' && + github.event.pull_request.head.repo.full_name == github.repository) + run: | + if ! git diff --quiet -- .github/CODEOWNERS docs/dev/codeowners.md; then + echo "::error::Generated CODEOWNERS / ownership docs are out of sync with .github/codeowners-roles.yml." + echo "::error::Run \`python3 .github/scripts/render-codeowners.py\` and commit the result." echo "--- diff ---" - git --no-pager diff .github/CODEOWNERS + git --no-pager diff -- .github/CODEOWNERS docs/dev/codeowners.md exit 1 fi - echo "CODEOWNERS is in sync with its source." + echo "Generated artifacts are in sync with their source." noedit: name: CODEOWNERS not hand-edited @@ -52,6 +94,8 @@ jobs: fetch-depth: 0 - name: Reject hand-edits to generated file + # Only meaningful for PRs (needs a base to diff against). + if: github.event_name == 'pull_request' run: | base="origin/${{ github.base_ref }}" git fetch origin "${{ github.base_ref }}" --quiet diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 48ab38c..a265c40 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -121,6 +121,31 @@ jobs: run: | ./scripts/update-homebrew-formula.sh "${GITHUB_REF_NAME}" homebrew-tap/Formula/omnigraph.rb + # Diagnostic only: brew is not on PATH on the ubuntu runner by default, so + # set it up explicitly. Both this setup and the audit below are best-effort + # canaries, not gates β€” continue-on-error on each keeps a failed/flaky brew + # (the action is pinned to a moving @master ref) from skipping the actual + # tap publish below. The formula is correct by construction + # (update-homebrew-formula.sh), so brew tooling must never block the push. 
+ - name: Set up Homebrew + if: env.HOMEBREW_TAP_SKIP != '1' + continue-on-error: true + uses: Homebrew/actions/setup-homebrew@master + + - name: Audit generated formula + if: env.HOMEBREW_TAP_SKIP != '1' + continue-on-error: true + run: | + # Audit the checked-out tap by name (brew audit rejects bare paths + # and needs tap context). Symlink the checkout into Homebrew's Taps + # tree so `modernrelay/tap/omnigraph` resolves to it. Offline audit + # (no --online) keeps it deterministic; it still catches the + # ComponentsOrder/structure class of problems. + tap_dir="$(brew --repository)/Library/Taps/modernrelay/homebrew-tap" + mkdir -p "$(dirname "$tap_dir")" + ln -sfn "$PWD/homebrew-tap" "$tap_dir" + brew audit --strict modernrelay/tap/omnigraph + - name: Commit and push formula update if: env.HOMEBREW_TAP_SKIP != '1' working-directory: homebrew-tap diff --git a/AGENTS.md b/AGENTS.md index 68de6b8..b876749 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -16,7 +16,7 @@ Tools that support `@`-imports (Claude Code) auto-include all three files via th `CLAUDE.md` is a symlink to this file — there is exactly one source of truth. Edit `AGENTS.md`. 
-**Version surveyed:** 0.6.0 +**Version surveyed:** 0.6.1 **Workspace crates:** `omnigraph-compiler`, `omnigraph` (engine), `omnigraph-policy`, `omnigraph-cli`, `omnigraph-server` **Storage substrate:** Lance 6.x (columnar, versioned, branchable) **License:** MIT @@ -237,7 +237,7 @@ omnigraph policy explain --actor act-alice --action change --branch main | Per-dataset versioning + time travel | βœ… | `snapshot_at_version`, `entity_at`, snapshot-pinned reads across many tables | | Per-dataset branches | βœ… | **Graph-level** branches (atomic across all sub-tables), lazy fork, system branch filtering | | Atomic single-dataset commits | βœ… | **Multi-table publish via three layers**, NOT a single Lance primitive: (1) per-table Lance `commit_staged` for the data write, (2) `__manifest` row-level CAS via `ManifestBatchPublisher` for cross-table ordering, (3) the open-time recovery sweep for the residual gap between (1) and (2). All three layers ship; the four migrated writers (`MutationStaging::finalize`, `schema_apply`, `branch_merge`, `ensure_indices`) write a `__recovery/{ulid}.json` sidecar before Phase B and delete it after Phase C. The next `Omnigraph::open` (gated on `OpenMode::ReadWrite`) runs the sweep in `db/manifest/recovery.rs`: classify, decide all-or-nothing per sidecar, roll forward via single `ManifestBatchPublisher::publish` or roll back via `Dataset::restore`, and record an audit row in `_graph_commit_recoveries.lance` (queryable via `omnigraph commit list --filter actor=omnigraph:recovery`). Continuous in-process recovery (no restart needed between Phase B failure and recovery) is the goal of a future background reconciler. 
Engine writes route through a sealed `TableStorage` trait exposing `stage_*` + `commit_staged` as the canonical staged-write surface; documented inline-commit residuals (`delete_where`, `create_vector_index`, plus legacy `append_batch` / `merge_insert_batches` / `overwrite_batch` / `create_*_index`) remain on the trait until upstream Lance ships a public two-phase API ([#6658](https://github.com/lance-format/lance/issues/6658), [#6666](https://github.com/lance-format/lance/issues/6666)) and the migration of every call site completes. | -| Compaction (`compact_files`) | βœ… | `omnigraph optimize` orchestrates over all node/edge tables, bounded concurrency | +| Compaction (`compact_files`) | βœ… | `omnigraph optimize` orchestrates over all node/edge tables, bounded concurrency; **skips blob-bearing tables** (reported via `TableOptimizeStats.skipped`, not silent), gated on `LANCE_SUPPORTS_BLOB_COMPACTION` until the upstream blob-v2 compaction-decode bug is fixed (see [docs/dev/invariants.md](docs/dev/invariants.md) Known Gaps) | | Cleanup (`cleanup_old_versions`) | βœ… | `omnigraph cleanup` with `--keep` / `--older-than` policy | | BTREE / inverted (FTS) / vector indexes | βœ… | `ensure_indices` builds them on every relevant column; idempotent; lazy across branches | | `merge_insert` upsert | βœ… | `LoadMode::Merge`, mutation `update`/`insert`/`delete` lowering | diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 8d9c687..2d77ef0 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,10 +1,29 @@ # Contributing -Small bug fixes and documentation improvements are welcome directly through pull -requests. +Thanks for your interest in OmniGraph. This page is the practical how-to; the +rules and decision authority behind it live in [GOVERNANCE.md](GOVERNANCE.md). -For larger changes, please open an issue or design discussion first so the -proposed direction is clear before implementation starts. 
+## Start in the right place + +| I want to… | Go to | Notes | +|---|---|---| +| **Report a bug** or wrong behavior | **[Open an Issue](../../issues/new/choose)** | Concrete and reproducible. A maintainer triages it; once labelled **`accepted`** it's open for a PR. | +| **Suggest a feature / share an idea / ask** | **[Start a Discussion](../../discussions)** | Ideas and questions live here, not in Issues. | +| **Propose a design / RFC** | **An RFC pull request** | Anyone can author one β€” see [docs/rfcs/README.md](docs/rfcs/README.md). A maintainer merging it is acceptance. | +| **Fix something / implement a change** | **A pull request** | Must link an `accepted` issue or an accepted RFC β€” unless it's trivial (below). | +| **Report a security vulnerability** | **[SECURITY.md](SECURITY.md)** | Do **not** open a public Issue. | + +### When can I just open a PR? +The **trivial fast-lane** β€” open directly, no prior issue/RFC needed: typo and +wording fixes, doc corrections, dependency bumps, comment fixes, obvious +one-line CI tweaks. Anything more substantial needs a backing `accepted` issue +or accepted RFC first, so the *why* is agreed before the *how* is reviewed. A PR +that turns out to be non-trivial will be redirected β€” that's about process, not +the merit of the change. + +> **Maintainers (ModernRelay team)** follow a separate internal process and are +> not bound by the intake rules above. Everyone is bound by review, CODEOWNERS, +> branch protection, and CI. ## Development @@ -49,6 +68,11 @@ CI runs both. ## Pull Requests -- keep changes focused -- include tests for behavior changes when practical -- update public docs when the user-facing surface changes +- **Link the backing issue or RFC** (`Closes #123`, or reference the RFC) β€” or + mark the PR as trivial per the fast-lane. +- Keep changes focused; one logical change per PR. +- Include tests for behavior changes when practical. +- Update public docs when the user-facing surface changes. 
+ +New to the codebase? Read [AGENTS.md](AGENTS.md) β€” the architecture map and the +always-on invariants every change is reviewed against. diff --git a/Cargo.lock b/Cargo.lock index a3d6d62..3223b9c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4543,7 +4543,7 @@ dependencies = [ [[package]] name = "omnigraph-cli" -version = "0.6.0" +version = "0.6.1" dependencies = [ "assert_cmd", "clap", @@ -4565,7 +4565,7 @@ dependencies = [ [[package]] name = "omnigraph-compiler" -version = "0.6.0" +version = "0.6.1" dependencies = [ "ahash", "arrow-array", @@ -4586,7 +4586,7 @@ dependencies = [ [[package]] name = "omnigraph-engine" -version = "0.6.0" +version = "0.6.1" dependencies = [ "arc-swap", "arrow-array", @@ -4627,7 +4627,7 @@ dependencies = [ [[package]] name = "omnigraph-policy" -version = "0.6.0" +version = "0.6.1" dependencies = [ "cedar-policy", "clap", @@ -4640,7 +4640,7 @@ dependencies = [ [[package]] name = "omnigraph-server" -version = "0.6.0" +version = "0.6.1" dependencies = [ "arc-swap", "async-trait", diff --git a/GOVERNANCE.md b/GOVERNANCE.md new file mode 100644 index 0000000..5878f1f --- /dev/null +++ b/GOVERNANCE.md @@ -0,0 +1,106 @@ +# Governance + +This document describes how **external contributions** to OmniGraph are +proposed, accepted, and merged. It exists so an outside contributor can answer, +without asking: *where does my report/idea/change go, who decides, and what has +to happen before code lands?* + +> **Scope.** This governs the public contribution surface β€” Issues, +> Discussions, RFCs, and pull requests from people outside the ModernRelay +> team. **Maintainers operate under a separate internal process** and are not +> bound by the intake gates below. Everyone, maintainer or not, is still bound +> by the universal gates: branch protection on `main` and CODEOWNERS review +> (see [docs/dev/branch-protection.md](docs/dev/branch-protection.md) and +> [docs/dev/codeowners.md](docs/dev/codeowners.md)). 
+ +## Roles + +| Role | Who | Authority | +|---|---|---| +| **Maintainer** | The code owners in [`.github/CODEOWNERS`](.github/CODEOWNERS) (generated from [`.github/codeowners-roles.yml`](.github/codeowners-roles.yml)) | Validate issues, accept/reject RFCs, review and merge PRs, set direction. Final decision authority. | +| **Contributor** | Anyone else | Report problems (Issues), propose ideas (Discussions), author RFCs, and open pull requests. | + +Decision authority rests with the maintainers. CODEOWNERS is the single source +of truth for who that is; this document does not duplicate the list. + +## The three channels + +Each channel has one job. Using the right one is the first thing we ask of a +contribution. + +| Channel | Purpose | Not for | +|---|---|---| +| **[Issues](../../issues)** | **Report a problem** — a bug, a regression, a documented behavior that's wrong. Something concrete and reproducible. | Feature requests, ideas, questions, or design proposals (→ Discussions). | +| **[Discussions](../../discussions)** | **Propose and explore** — new ideas, feature requests, questions, and the incubation of RFCs. | Bug reports (→ Issues). | +| **Pull requests** | **Land a sanctioned change** — a fix for a *validated* issue, an *accepted* RFC, or a trivial change (see fast-lane). | Substantive change with no backing issue/RFC — it will be redirected. 
| + +## How a change becomes mergeable + +``` + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ bug ───────────┐ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€ idea / feature ────────┐ + β–Ό β”‚ β–Ό β”‚ + Issue (problem report) β”‚ Discussion (idea / RFC incubation) β”‚ + β”‚ β”‚ β”‚ β”‚ + maintainer triage β”‚ rough consensus β”‚ + β”‚ β”‚ β”‚ graduate β”‚ + β–Ό β”‚ β–Ό β”‚ + label: accepted ──────────┐ β”‚ RFC PR (docs/rfcs/NNNN-*.md) β”‚ + β”‚ β”‚ β”‚ β”‚ β”‚ + β”‚ β”‚ β”‚ maintainer review β”‚ + β–Ό β–Ό β”‚ β–Ό β”‚ + Pull request ◀──────────┴──────────│── merged == accepted β”‚ + (links the issue or the accepted RFC) β—€β”€β”€β”€β”€β”€β”€β”€β”˜ (implementation PRs reference it) β”‚ + β”‚ + review + CODEOWNERS + branch protection + β–Ό + merged +``` + +### Issues β†’ validated +A new issue starts unlabeled. A maintainer triages it and, if it's a real, +in-scope problem, applies the **`accepted`** label. **Only `accepted` issues are +open for a contributor PR.** This prevents the "I fixed an issue you hadn't +agreed was a problem" rejection. Want to fix something? Get the issue accepted +first, or pick one already labelled `accepted` / `help wanted`. + +### Discussions β†’ RFCs β†’ accepted +Ideas and feature requests start in **Discussions**. Anyone β€” including external +contributors β€” may then **author an RFC** by opening a pull request that adds +`docs/rfcs/NNNN-title.md` (see [docs/rfcs/README.md](docs/rfcs/README.md)). The +RFC is reviewed as code; **a maintainer merging it is the act of acceptance** +(it becomes the durable decision record). Implementation PRs then reference the +accepted RFC. + +Authoring an RFC is open to everyone; **accepting one is a maintainer +decision.** Maintainers may also decline an RFC, with rationale, by closing it. + +### Pull requests β†’ sanctioned +A contributor PR must do one of: +1. link a maintainer-**`accepted`** issue it fixes, or +2. be (or reference) an **accepted RFC**, or +3. qualify for the **trivial fast-lane**. 
+ +**Trivial fast-lane** — these may be opened directly, no prior issue/RFC: +typo and wording fixes, documentation corrections, dependency bumps, comment +fixes, and obviously-correct one-line CI tweaks. When in doubt, open an Issue or +Discussion first; a PR that turns out to be non-trivial will be asked to. + +A substantive PR with no backing issue/RFC will be closed with a pointer to the +right channel — not as a judgment of the idea, but to keep design discussion +where it's reviewable. + +## What maintainers do *not* gate +Maintainers' own changes do not pass through the intake gates above — the team +runs a separate internal process. The universal gates (review, CODEOWNERS, +branch protection, CI) apply to everyone. Enforcement of the intake rules is, to +start, **by convention and review** (PR template + labels); an automated check +keyed to author association may be added later if volume warrants. + +## Code of conduct & security +- Conduct: [CODE_OF_CONDUCT.md](CODE_OF_CONDUCT.md). +- Security issues are **not** public Issues — see [SECURITY.md](SECURITY.md). + +## Changing this document +Governance changes the same way code does: a pull request, reviewed by +maintainers. This file describes the external surface; the internal maintainer +process is intentionally out of scope here. diff --git a/crates/omnigraph-cli/Cargo.toml b/crates/omnigraph-cli/Cargo.toml index 0d35ed8..641068e 100644 --- a/crates/omnigraph-cli/Cargo.toml +++ b/crates/omnigraph-cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "omnigraph-cli" -version = "0.6.0" +version = "0.6.1" edition = "2024" description = "CLI for the Omnigraph graph database." 
license = "MIT" @@ -13,10 +13,10 @@ name = "omnigraph" path = "src/main.rs" [dependencies] -omnigraph = { package = "omnigraph-engine", path = "../omnigraph", version = "0.6.0" } -omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.6.0" } -omnigraph-policy = { path = "../omnigraph-policy", version = "0.6.0" } -omnigraph-server = { path = "../omnigraph-server", version = "0.6.0" } +omnigraph = { package = "omnigraph-engine", path = "../omnigraph", version = "0.6.1" } +omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.6.1" } +omnigraph-policy = { path = "../omnigraph-policy", version = "0.6.1" } +omnigraph-server = { path = "../omnigraph-server", version = "0.6.1" } clap = { workspace = true } color-eyre = { workspace = true } serde = { workspace = true } diff --git a/crates/omnigraph-cli/src/main.rs b/crates/omnigraph-cli/src/main.rs index b7e3041..29b55c4 100644 --- a/crates/omnigraph-cli/src/main.rs +++ b/crates/omnigraph-cli/src/main.rs @@ -9,6 +9,7 @@ use clap::{Arg, ArgAction, Args, CommandFactory, FromArgMatches, Parser, Subcomm use color_eyre::eyre::{Result, bail}; use omnigraph::db::{Omnigraph, ReadTarget, SnapshotId}; use omnigraph::loader::LoadMode; +use omnigraph::storage::normalize_root_uri; use omnigraph_compiler::query::parser::parse_query; use omnigraph_compiler::schema::parser::parse_schema; use omnigraph_compiler::{ @@ -24,9 +25,10 @@ use omnigraph_server::api::{ SnapshotTableOutput, commit_output, ingest_output, read_output, schema_apply_output, snapshot_payload, }; +use omnigraph_server::queries::{QueryRegistry, check, format_check_breakages}; use omnigraph_server::{ AliasCommand, OmnigraphConfig, PolicyAction, PolicyDecision, PolicyEngine, PolicyRequest, - PolicyTestConfig, ReadOutputFormat, load_config, + PolicyTestConfig, ReadOutputFormat, graph_resource_id_for_selection, load_config, }; use reqwest::Method; use reqwest::header::AUTHORIZATION; @@ -153,6 +155,11 @@ enum Command { #[arg(long)] json: bool, }, + 
/// Operate on the server-side stored-query registry (`queries:`). + Queries { + #[command(subcommand)] + command: QueriesCommand, + }, /// Show graph snapshot Snapshot { /// Graph URI @@ -502,6 +509,35 @@ enum PolicyCommand { }, } +#[derive(Debug, Subcommand)] +enum QueriesCommand { + /// Type-check the stored-query registry against the live schema. + /// + /// Distinct from `omnigraph lint` (which lints one `.gq` file): + /// this validates the whole `queries:` registry β€” opening the graph + /// to read its schema and confirming every stored query still + /// type-checks. Exits non-zero on any breakage. + Validate { + /// Graph URI + uri: Option, + #[arg(long)] + target: Option, + #[arg(long)] + config: Option, + #[arg(long)] + json: bool, + }, + /// List the registered stored queries (name, MCP exposure, params). + List { + #[arg(long)] + target: Option, + #[arg(long)] + config: Option, + #[arg(long)] + json: bool, + }, +} + #[derive(Debug, Args, Clone)] struct ParamsArgs { #[arg(long, conflicts_with = "params_file")] @@ -743,25 +779,66 @@ fn load_cli_config(config_path: Option<&PathBuf>) -> Result { Ok(config) } -fn resolve_policy_engine(config: &OmnigraphConfig) -> Result { - let policy_file = config - .resolve_policy_file() - .ok_or_else(|| color_eyre::eyre::eyre!("policy.file must be set in omnigraph.yaml"))?; - PolicyEngine::load_graph(&policy_file, &policy_graph_id(config)) +#[derive(Debug, Clone)] +struct ResolvedCliGraph { + uri: String, + selected: Option, + graph_id: String, + policy_file: Option, + is_remote: bool, } -/// Open a local-URI graph and, when `policy.file` is configured in -/// `omnigraph.yaml`, install the resolved `PolicyEngine` on the engine -/// handle so every direct-engine write goes through -/// `Omnigraph::enforce(...)` (MR-722). Without a configured policy this -/// is identical to a bare `Omnigraph::open`. 
-/// -/// Returns owned `Omnigraph`; chained on top of `Omnigraph::open(...)`'s -/// existing future to keep call sites narrow. -async fn open_local_db_with_policy(uri: &str, config: &OmnigraphConfig) -> Result { - let db = Omnigraph::open(uri).await?; - if config.resolve_policy_file().is_some() { - let engine = Arc::new(resolve_policy_engine(config)?); +impl ResolvedCliGraph { + fn selected(&self) -> Option<&str> { + self.selected.as_deref() + } +} + +struct ResolvedPolicyContext { + policy_file: PathBuf, + graph_id: String, +} + +fn resolve_policy_context(config: &OmnigraphConfig) -> Result { + let selected = config.resolve_policy_tooling_graph_selection()?; + let policy_file = config + .resolve_policy_file_for(selected) + .ok_or_else(|| { + color_eyre::eyre::eyre!( + "policy.file or graphs..policy.file must be set in omnigraph.yaml" + ) + })?; + let graph_id = match selected { + Some(name) => graph_resource_id_for_selection(Some(name), ""), + None => graph_resource_id_for_selection(None, "default"), + }; + Ok(ResolvedPolicyContext { + policy_file, + graph_id, + }) +} + +fn resolve_policy_engine(context: &ResolvedPolicyContext) -> Result { + PolicyEngine::load_graph(&context.policy_file, &context.graph_id) +} + +fn resolve_policy_engine_for_graph(graph: &ResolvedCliGraph) -> Result { + let policy_file = graph.policy_file.as_ref().ok_or_else(|| { + color_eyre::eyre::eyre!( + "policy.file or graphs..policy.file must be set in omnigraph.yaml" + ) + })?; + PolicyEngine::load_graph(policy_file, &graph.graph_id) +} + +/// Open a local graph and install the policy resolved for the same graph +/// identity that produced the URI. A named graph uses +/// `graphs..policy.file`; an explicit positional URI is anonymous and +/// uses the legacy top-level `policy.file`. 
+async fn open_local_db_with_policy(graph: &ResolvedCliGraph) -> Result { + let db = Omnigraph::open(&graph.uri).await?; + if graph.policy_file.is_some() { + let engine = Arc::new(resolve_policy_engine_for_graph(graph)?); Ok(db.with_policy(engine as Arc)) } else { Ok(db) @@ -778,22 +855,16 @@ fn resolve_cli_actor<'a>(cli_as: Option<&'a str>, config: &'a OmnigraphConfig) - cli_as.or(config.cli.actor.as_deref()) } -fn resolve_policy_tests_path(config: &OmnigraphConfig) -> Result { - config.resolve_policy_tests_file().ok_or_else(|| { - color_eyre::eyre::eyre!( - "policy.tests.yaml requires policy.file to be set in omnigraph.yaml" - ) - }) +fn resolve_policy_tests_path(context: &ResolvedPolicyContext) -> PathBuf { + context.policy_file.with_file_name("policy.tests.yaml") } -fn policy_graph_id(config: &OmnigraphConfig) -> String { - if let Some(name) = &config.project.name { - return name.clone(); +fn normalize_policy_graph_uri(uri: &str) -> Result { + if is_remote_uri(uri) { + Ok(uri.trim_end_matches('/').to_string()) + } else { + Ok(normalize_root_uri(uri)?) 
} - config - .resolve_target_uri(None, None, config.server_graph_name()) - .or_else(|_| config.resolve_target_uri(None, None, config.cli_graph_name())) - .unwrap_or_else(|_| "default".to_string()) } fn resolve_remote_bearer_token( @@ -877,6 +948,47 @@ fn resolve_uri( config.resolve_target_uri(cli_uri, cli_target, config.cli_graph_name()) } +fn resolve_cli_graph( + config: &OmnigraphConfig, + cli_uri: Option, + cli_target: Option<&str>, +) -> Result { + let selected = if cli_uri.is_some() { + None + } else { + cli_target + .map(str::to_string) + .or_else(|| config.cli_graph_name().map(str::to_string)) + }; + config.resolve_graph_selection(selected.as_deref())?; + let uri = resolve_uri(config, cli_uri, cli_target)?; + let normalized_uri = normalize_policy_graph_uri(&uri)?; + let graph_id = graph_resource_id_for_selection(selected.as_deref(), &normalized_uri); + Ok(ResolvedCliGraph { + graph_id, + is_remote: is_remote_uri(&uri), + policy_file: config.resolve_policy_file_for(selected.as_deref()), + selected, + uri, + }) +} + +fn resolve_local_graph( + config: &OmnigraphConfig, + cli_uri: Option, + cli_target: Option<&str>, + operation: &str, +) -> Result { + let graph = resolve_cli_graph(config, cli_uri, cli_target)?; + if graph.is_remote { + bail!( + "{} is only supported against local graph URIs in this milestone", + operation + ); + } + Ok(graph) +} + /// Parse a Go-style compact duration: `7d`, `24h`, `30m`, `90s`, or a plain /// integer as seconds. Used by the `cleanup --older-than` flag. 
fn parse_duration_arg(s: &str) -> Result { @@ -915,14 +1027,7 @@ fn resolve_local_uri( cli_target: Option<&str>, operation: &str, ) -> Result { - let uri = resolve_uri(config, cli_uri, cli_target)?; - if is_remote_uri(&uri) { - bail!( - "{} is only supported against local graph URIs in this milestone", - operation - ); - } - Ok(uri) + Ok(resolve_local_graph(config, cli_uri, cli_target, operation)?.uri) } fn resolve_branch( @@ -1609,6 +1714,248 @@ async fn execute_query_lint( )) } +#[derive(serde::Serialize)] +struct QueriesIssue { + query: String, + message: String, +} + +#[derive(serde::Serialize)] +struct QueriesValidateOutput { + ok: bool, + breakages: Vec, + warnings: Vec, +} + +#[derive(serde::Serialize)] +struct QueriesParam { + name: String, + #[serde(rename = "type")] + type_name: String, + nullable: bool, +} + +#[derive(serde::Serialize)] +struct QueriesListItem { + name: String, + mcp_expose: bool, + tool_name: Option, + mutation: bool, + params: Vec, +} + +#[derive(serde::Serialize)] +struct QueriesListOutput { + queries: Vec, +} + +/// Resolve the selected graph to `(local URI, registry selection)` from one +/// precedence, so a command's schema and its stored-query registry can never +/// come from different graphs. A **positional URI is anonymous** (top-level +/// registry, ignoring the configured default graph); otherwise `--target` +/// or the configured `cli.graph` names the graph (its per-graph block). +/// Mirrors the server's single-mode identity rule. +fn resolve_selected_graph( + config: &OmnigraphConfig, + cli_uri: Option, + cli_target: Option<&str>, + operation: &str, +) -> Result<(String, Option)> { + let graph = resolve_local_graph(config, cli_uri, cli_target, operation)?; + Ok((graph.uri, graph.selected)) +} + +/// Load the stored-query registry for an already-resolved graph selection +/// (`None` = anonymous β†’ top-level; `Some(name)` = that graph's block). 
+fn load_registry_or_report( + config: &OmnigraphConfig, + selected: Option<&str>, +) -> Result { + QueryRegistry::load(config, config.query_entries_for(selected)).map_err(|errors| { + color_eyre::eyre::eyre!( + "stored-query registry failed to load:\n {}", + errors + .iter() + .map(|e| e.to_string()) + .collect::>() + .join("\n ") + ) + }) +} + +fn graph_query_registry_names(config: &OmnigraphConfig) -> Vec<&str> { + config + .graphs + .iter() + .filter_map(|(name, graph)| (!graph.queries.is_empty()).then_some(name.as_str())) + .collect() +} + +fn resolve_registry_selection_for_list( + config: &OmnigraphConfig, + target: Option<&str>, +) -> Result> { + let selected = target + .map(str::to_string) + .or_else(|| config.cli_graph_name().map(str::to_string)); + if let Some(name) = selected.as_deref() { + config.resolve_graph_selection(Some(name))?; + return Ok(selected); + } + + if !config.query_entries().is_empty() { + return Ok(None); + } + + let graph_names = graph_query_registry_names(config); + if graph_names.is_empty() { + return Ok(None); + } + + bail!( + "stored-query registries are configured for graph{} {} but no graph was selected. Pass `--target {}` or set `cli.graph`.", + if graph_names.len() == 1 { "" } else { "s" }, + graph_names.join(", "), + graph_names[0], + ) +} + +fn validate_registry_for_catalog( + registry: &QueryRegistry, + catalog: &omnigraph_compiler::catalog::Catalog, + label: &str, +) -> omnigraph::error::Result<()> { + let report = check(registry, catalog); + if report.has_breakages() { + return Err(omnigraph::error::OmniError::manifest( + format_check_breakages(label, &report), + )); + } + Ok(()) +} + +async fn execute_queries_validate( + uri: Option, + target: Option, + config_path: Option<&PathBuf>, + json: bool, +) -> Result<()> { + let config = load_cli_config(config_path)?; + // One selection drives both the schema URI and the registry, so a + // positional URI and a `--target` can't validate different graphs. 
+ let (uri, selected) = + resolve_selected_graph(&config, uri, target.as_deref(), "queries validate")?; + let registry = load_registry_or_report(&config, selected.as_deref())?; + let db = Omnigraph::open(&uri).await?; + let report = check(®istry, &db.catalog()); + + let output = QueriesValidateOutput { + ok: !report.has_breakages(), + breakages: report + .breakages + .iter() + .map(|b| QueriesIssue { + query: b.query.clone(), + message: b.message.clone(), + }) + .collect(), + warnings: report + .warnings + .iter() + .map(|w| QueriesIssue { + query: w.query.clone(), + message: w.message.clone(), + }) + .collect(), + }; + + if json { + print_json(&output)?; + } else { + if output.breakages.is_empty() { + println!( + "OK {} stored quer{} type-check against the schema", + registry.len(), + if registry.len() == 1 { "y" } else { "ies" } + ); + } + for issue in &output.breakages { + println!("ERROR query '{}': {}", issue.query, issue.message); + } + for issue in &output.warnings { + println!("WARN query '{}': {}", issue.query, issue.message); + } + } + + if report.has_breakages() { + io::stdout().flush()?; + std::process::exit(1); + } + Ok(()) +} + +fn execute_queries_list( + target: Option, + config_path: Option<&PathBuf>, + json: bool, +) -> Result<()> { + let config = load_cli_config(config_path)?; + let selected = resolve_registry_selection_for_list(&config, target.as_deref())?; + let registry = load_registry_or_report(&config, selected.as_deref())?; + + let output = QueriesListOutput { + queries: registry + .iter() + .map(|q| QueriesListItem { + name: q.name.clone(), + mcp_expose: q.expose, + tool_name: q.tool_name.clone(), + mutation: q.is_mutation(), + params: q + .decl + .params + .iter() + .map(|p| QueriesParam { + name: p.name.clone(), + type_name: p.type_name.clone(), + nullable: p.nullable, + }) + .collect(), + }) + .collect(), + }; + + if json { + print_json(&output)?; + } else if output.queries.is_empty() { + println!("(no stored queries registered)"); + } 
else { + for q in &output.queries { + let kind = if q.mutation { "mutation" } else { "read" }; + let params = q + .params + .iter() + .map(|p| { + format!( + "${}: {}{}", + p.name, + p.type_name, + if p.nullable { "?" } else { "" } + ) + }) + .collect::>() + .join(", "); + let mcp = if q.mcp_expose { + format!(" [mcp: {}]", q.tool_name.as_deref().unwrap_or(&q.name)) + } else { + String::new() + }; + println!("{kind} {}({params}){mcp}", q.name); + } + } + Ok(()) +} + async fn execute_read( uri: &str, query_source: &str, @@ -1655,7 +2002,7 @@ async fn execute_read_remote( } async fn execute_change( - uri: &str, + graph: &ResolvedCliGraph, query_source: &str, query_name: Option<&str>, branch: &str, @@ -1665,7 +2012,7 @@ async fn execute_change( ) -> Result { let (selected_name, query_params) = select_named_query(query_source, query_name)?; let params = query_params_from_json(&query_params, params_json)?; - let db = open_local_db_with_policy(uri, config).await?; + let db = open_local_db_with_policy(graph).await?; let actor = resolve_cli_actor(cli_as_actor, config); let result = db .mutate_as(branch, query_source, &selected_name, ¶ms, actor) @@ -1893,9 +2240,10 @@ async fn main() -> Result<()> { json, } => { let config = load_cli_config(config.as_ref())?; - let uri = resolve_local_uri(&config, uri, target.as_deref(), "load")?; + let graph = resolve_local_graph(&config, uri, target.as_deref(), "load")?; + let uri = graph.uri.clone(); let branch = resolve_branch(&config, branch, None, "main"); - let db = open_local_db_with_policy(&uri, &config).await?; + let db = open_local_db_with_policy(&graph).await?; let actor = resolve_cli_actor(cli.as_actor.as_deref(), &config); let result = db .load_file_as(&branch, &data.to_string_lossy(), mode.into(), actor) @@ -1936,10 +2284,11 @@ async fn main() -> Result<()> { let config = load_cli_config(config.as_ref())?; let bearer_token = resolve_remote_bearer_token(&config, uri.as_deref(), target.as_deref())?; - let uri = 
resolve_uri(&config, uri, target.as_deref())?; + let graph = resolve_cli_graph(&config, uri, target.as_deref())?; + let uri = graph.uri.clone(); let branch = resolve_branch(&config, branch, None, "main"); let from = resolve_branch(&config, from, None, "main"); - let payload = if is_remote_uri(&uri) { + let payload = if graph.is_remote { let data = fs::read_to_string(&data)?; remote_json::( &http_client, @@ -1955,7 +2304,7 @@ async fn main() -> Result<()> { ) .await? } else { - let db = open_local_db_with_policy(&uri, &config).await?; + let db = open_local_db_with_policy(&graph).await?; let actor = resolve_cli_actor(cli.as_actor.as_deref(), &config); let result = db .ingest_file_as( @@ -1986,9 +2335,10 @@ async fn main() -> Result<()> { let config = load_cli_config(config.as_ref())?; let bearer_token = resolve_remote_bearer_token(&config, uri.as_deref(), target.as_deref())?; - let uri = resolve_uri(&config, uri, target.as_deref())?; + let graph = resolve_cli_graph(&config, uri, target.as_deref())?; + let uri = graph.uri.clone(); let from = resolve_branch(&config, from, None, "main"); - let payload = if is_remote_uri(&uri) { + let payload = if graph.is_remote { remote_json::( &http_client, Method::POST, @@ -2001,7 +2351,7 @@ async fn main() -> Result<()> { ) .await? 
} else { - let db = open_local_db_with_policy(&uri, &config).await?; + let db = open_local_db_with_policy(&graph).await?; let actor = resolve_cli_actor(cli.as_actor.as_deref(), &config); db.branch_create_from_as(ReadTarget::branch(&from), &name, actor) .await?; @@ -2027,8 +2377,9 @@ async fn main() -> Result<()> { let config = load_cli_config(config.as_ref())?; let bearer_token = resolve_remote_bearer_token(&config, uri.as_deref(), target.as_deref())?; - let uri = resolve_uri(&config, uri, target.as_deref())?; - let payload = if is_remote_uri(&uri) { + let graph = resolve_cli_graph(&config, uri, target.as_deref())?; + let uri = graph.uri.clone(); + let payload = if graph.is_remote { remote_json::( &http_client, Method::GET, @@ -2061,8 +2412,9 @@ async fn main() -> Result<()> { let config = load_cli_config(config.as_ref())?; let bearer_token = resolve_remote_bearer_token(&config, uri.as_deref(), target.as_deref())?; - let uri = resolve_uri(&config, uri, target.as_deref())?; - let payload = if is_remote_uri(&uri) { + let graph = resolve_cli_graph(&config, uri, target.as_deref())?; + let uri = graph.uri.clone(); + let payload = if graph.is_remote { remote_json::( &http_client, Method::DELETE, @@ -2072,7 +2424,7 @@ async fn main() -> Result<()> { ) .await? 
} else { - let db = open_local_db_with_policy(&uri, &config).await?; + let db = open_local_db_with_policy(&graph).await?; let actor = resolve_cli_actor(cli.as_actor.as_deref(), &config); db.branch_delete_as(&name, actor).await?; BranchDeleteOutput { @@ -2098,9 +2450,10 @@ async fn main() -> Result<()> { let config = load_cli_config(config.as_ref())?; let bearer_token = resolve_remote_bearer_token(&config, uri.as_deref(), target.as_deref())?; - let uri = resolve_uri(&config, uri, target.as_deref())?; + let graph = resolve_cli_graph(&config, uri, target.as_deref())?; + let uri = graph.uri.clone(); let into = resolve_branch(&config, into, None, "main"); - let payload = if is_remote_uri(&uri) { + let payload = if graph.is_remote { remote_json::( &http_client, Method::POST, @@ -2113,7 +2466,7 @@ async fn main() -> Result<()> { ) .await? } else { - let db = open_local_db_with_policy(&uri, &config).await?; + let db = open_local_db_with_policy(&graph).await?; let actor = resolve_cli_actor(cli.as_actor.as_deref(), &config); let outcome = db.branch_merge_as(&source, &into, actor).await?; BranchMergeOutput { @@ -2248,9 +2601,10 @@ async fn main() -> Result<()> { let config = load_cli_config(config.as_ref())?; let bearer_token = resolve_remote_bearer_token(&config, uri.as_deref(), target.as_deref())?; - let uri = resolve_uri(&config, uri, target.as_deref())?; + let graph = resolve_cli_graph(&config, uri, target.as_deref())?; + let uri = graph.uri.clone(); let schema_source = fs::read_to_string(&schema)?; - let output = if is_remote_uri(&uri) { + let output = if graph.is_remote { // MR-694 PR B: SchemaApplyRequest gained an // allow_data_loss field so Hard-mode drops are no // longer CLI-only. The previous bail is gone; the @@ -2268,13 +2622,22 @@ async fn main() -> Result<()> { ) .await? 
} else { - let db = open_local_db_with_policy(&uri, &config).await?; + let db = open_local_db_with_policy(&graph).await?; let actor = resolve_cli_actor(cli.as_actor.as_deref(), &config); + let registry = load_registry_or_report(&config, graph.selected())?; + let registry = (!registry.is_empty()).then_some(registry); + let label = graph.selected().unwrap_or(&uri).to_string(); let result = db - .apply_schema_as( + .apply_schema_as_with_catalog_check( &schema_source, omnigraph::db::SchemaApplyOptions { allow_data_loss }, actor, + |catalog| { + if let Some(registry) = registry.as_ref() { + validate_registry_for_catalog(registry, catalog, &label)?; + } + Ok(()) + }, ) .await?; schema_apply_output(&uri, result) @@ -2331,6 +2694,23 @@ async fn main() -> Result<()> { .await?; finish_query_lint(&output, json)?; } + Command::Queries { command } => match command { + QueriesCommand::Validate { + uri, + target, + config, + json, + } => { + execute_queries_validate(uri, target, config.as_ref(), json).await?; + } + QueriesCommand::List { + target, + config, + json, + } => { + execute_queries_list(target, config.as_ref(), json)?; + } + }, Command::Snapshot { uri, target, @@ -2436,7 +2816,8 @@ async fn main() -> Result<()> { .as_deref() .or_else(|| alias_config.and_then(|alias| alias.graph.as_deref())); let bearer_token = resolve_remote_bearer_token(&config, uri.as_deref(), target_name)?; - let uri = resolve_uri(&config, uri, target_name)?; + let graph = resolve_cli_graph(&config, uri, target_name)?; + let uri = graph.uri.clone(); let query_source = resolve_query_source( &config, query.as_ref(), @@ -2458,7 +2839,7 @@ async fn main() -> Result<()> { alias_config.and_then(|alias| alias.branch.clone()), )?; let query_name = name.or_else(|| alias_config.and_then(|alias| alias.name.clone())); - let output = if is_remote_uri(&uri) { + let output = if graph.is_remote { execute_read_remote( &http_client, &uri, @@ -2521,7 +2902,8 @@ async fn main() -> Result<()> { .as_deref() .or_else(|| 
alias_config.and_then(|alias| alias.graph.as_deref())); let bearer_token = resolve_remote_bearer_token(&config, uri.as_deref(), target_name)?; - let uri = resolve_uri(&config, uri, target_name)?; + let graph = resolve_cli_graph(&config, uri, target_name)?; + let uri = graph.uri.clone(); let query_source = resolve_query_source( &config, query.as_ref(), @@ -2543,7 +2925,7 @@ async fn main() -> Result<()> { "main", ); let query_name = name.or_else(|| alias_config.and_then(|alias| alias.name.clone())); - let output = if is_remote_uri(&uri) { + let output = if graph.is_remote { execute_change_remote( &http_client, &uri, @@ -2556,7 +2938,7 @@ async fn main() -> Result<()> { .await? } else { execute_change( - &uri, + &graph, &query_source, query_name.as_deref(), &branch, @@ -2575,20 +2957,19 @@ async fn main() -> Result<()> { Command::Policy { command } => match command { PolicyCommand::Validate { config } => { let config = load_cli_config(config.as_ref())?; - let engine = resolve_policy_engine(&config)?; - let policy_file = config - .resolve_policy_file() - .expect("policy file should exist after resolve_policy_engine"); + let context = resolve_policy_context(&config)?; + let engine = resolve_policy_engine(&context)?; println!( "policy valid: {} [{} actors]", - policy_file.display(), + context.policy_file.display(), engine.known_actor_count() ); } PolicyCommand::Test { config } => { let config = load_cli_config(config.as_ref())?; - let engine = resolve_policy_engine(&config)?; - let tests_path = resolve_policy_tests_path(&config)?; + let context = resolve_policy_context(&config)?; + let engine = resolve_policy_engine(&context)?; + let tests_path = resolve_policy_tests_path(&context); let tests = PolicyTestConfig::load(&tests_path)?; engine.run_tests(&tests)?; println!("policy tests passed: {} cases", tests.cases.len()); @@ -2601,7 +2982,8 @@ async fn main() -> Result<()> { target_branch, } => { let config = load_cli_config(config.as_ref())?; - let engine = 
resolve_policy_engine(&config)?; + let context = resolve_policy_context(&config)?; + let engine = resolve_policy_engine(&context)?; let request = PolicyRequest { action, branch, @@ -2629,18 +3011,19 @@ async fn main() -> Result<()> { "fragments_removed": s.fragments_removed, "fragments_added": s.fragments_added, "committed": s.committed, + "skipped": s.skipped.map(|r| r.as_str()), })).collect::>(), }); print_json(&value)?; } else { println!("optimize {} β€” {} tables", uri, stats.len()); for s in &stats { - if s.committed { + if let Some(reason) = s.skipped { + println!(" {:<40} skipped ({reason})", s.table_key); + } else if s.committed { println!( " {:<40} frags {} β†’ {} βœ“", - s.table_key, - s.fragments_removed + s.fragments_added - s.fragments_added, - s.fragments_added + s.table_key, s.fragments_removed, s.fragments_added ); } else { println!(" {:<40} no-op", s.table_key); @@ -2699,20 +3082,33 @@ async fn main() -> Result<()> { "table_key": s.table_key, "bytes_removed": s.bytes_removed, "old_versions_removed": s.old_versions_removed, + "error": s.error, })).collect::>(), }); print_json(&value)?; } else { let total_bytes: u64 = stats.iter().map(|s| s.bytes_removed).sum(); let total_versions: u64 = stats.iter().map(|s| s.old_versions_removed).sum(); + let failed: Vec<&str> = stats + .iter() + .filter(|s| s.error.is_some()) + .map(|s| s.table_key.as_str()) + .collect(); println!( "cleanup {} ({}) β€” removed {} versions ({} bytes) across {} tables", uri, policy_desc, total_versions, total_bytes, - stats.len() + stats.len() - failed.len() ); + if !failed.is_empty() { + println!( + " {} table(s) failed and will be retried on the next cleanup: {}", + failed.len(), + failed.join(", ") + ); + } } } Command::Graphs { command } => match command { @@ -2761,7 +3157,8 @@ mod tests { use super::{ DEFAULT_BEARER_TOKEN_ENV, apply_bearer_token, bearer_token_from_env_file, legacy_change_request_body, load_cli_config, load_env_file_into_process, - normalize_bearer_token, 
parse_env_assignment, resolve_remote_bearer_token, + normalize_bearer_token, parse_env_assignment, resolve_policy_context, + resolve_cli_graph, resolve_remote_bearer_token, }; use omnigraph_server::load_config; use reqwest::header::AUTHORIZATION; @@ -3021,4 +3418,150 @@ graphs: } } } + + #[test] + fn graph_identity_resolve_policy_context_named_cli_graph_uses_graph_key_not_project_name_or_uri() { + let temp = tempdir().unwrap(); + let config_path = temp.path().join("omnigraph.yaml"); + fs::write( + &config_path, + r#" +project: + name: misleading-project +graphs: + local: + uri: /tmp/local-policy-graph.omni + policy: + file: ./policy.yaml +cli: + graph: local +"#, + ) + .unwrap(); + + let config = load_config(Some(&config_path)).unwrap(); + let context = resolve_policy_context(&config).unwrap(); + assert_eq!(context.graph_id, "local"); + } + + #[test] + fn graph_identity_resolve_policy_context_server_graph_uses_graph_key_when_cli_graph_absent() { + let temp = tempdir().unwrap(); + let config_path = temp.path().join("omnigraph.yaml"); + fs::write( + &config_path, + r#" +project: + name: misleading-project +graphs: + local: + uri: /tmp/local-policy-graph.omni + policy: + file: ./server-policy.yaml +server: + graph: local +"#, + ) + .unwrap(); + + let config = load_config(Some(&config_path)).unwrap(); + let context = resolve_policy_context(&config).unwrap(); + assert_eq!(context.graph_id, "local"); + assert!(context.policy_file.ends_with("server-policy.yaml")); + } + + #[test] + fn graph_identity_resolve_policy_context_anonymous_uses_top_level_default_identity() { + let temp = tempdir().unwrap(); + let config_path = temp.path().join("omnigraph.yaml"); + fs::write( + &config_path, + r#" +project: + name: misleading-project +graphs: + local: + uri: /tmp/local-policy-graph.omni +policy: + file: ./top-policy.yaml +"#, + ) + .unwrap(); + + let config = load_config(Some(&config_path)).unwrap(); + let context = resolve_policy_context(&config).unwrap(); + 
assert_eq!(context.graph_id, "default"); + assert!(context.policy_file.ends_with("top-policy.yaml")); + } + + #[test] + fn graph_identity_resolve_cli_graph_named_target_uses_graph_key_not_project_name_or_uri() { + let temp = tempdir().unwrap(); + let config_path = temp.path().join("omnigraph.yaml"); + fs::write( + &config_path, + r#" +project: + name: misleading-project +graphs: + prod: + uri: s3://bucket/prod-graph/ + policy: + file: ./prod-policy.yaml +"#, + ) + .unwrap(); + + let config = load_config(Some(&config_path)).unwrap(); + let graph = resolve_cli_graph(&config, None, Some("prod")).unwrap(); + assert_eq!(graph.selected(), Some("prod")); + assert_eq!(graph.graph_id, "prod"); + assert_eq!(graph.uri, "s3://bucket/prod-graph/"); + } + + #[test] + fn graph_identity_resolve_cli_graph_positional_uri_uses_anonymous_normalized_uri() { + let temp = tempdir().unwrap(); + let config_path = temp.path().join("omnigraph.yaml"); + fs::write( + &config_path, + r#" +project: + name: misleading-project +graphs: + local: + uri: /tmp/configured-graph.omni + policy: + file: ./policy.yaml +cli: + graph: local +"#, + ) + .unwrap(); + + let config = load_config(Some(&config_path)).unwrap(); + let local_graph_path = temp.path().join("explicit-graph.omni"); + let local_graph = resolve_cli_graph( + &config, + Some(format!("file://{}", local_graph_path.display())), + None, + ) + .unwrap(); + assert_eq!(local_graph.selected(), None); + assert_eq!( + local_graph.graph_id, + local_graph_path.to_string_lossy().as_ref() + ); + assert_eq!(local_graph.policy_file, None); + + let s3_graph = resolve_cli_graph( + &config, + Some("s3://bucket/anonymous-graph/".to_string()), + None, + ) + .unwrap(); + assert_eq!(s3_graph.selected(), None); + assert_eq!(s3_graph.graph_id, "s3://bucket/anonymous-graph"); + assert_eq!(s3_graph.policy_file, None); + } } diff --git a/crates/omnigraph-cli/tests/cli.rs b/crates/omnigraph-cli/tests/cli.rs index 6e5de37..9682d9a 100644 --- 
a/crates/omnigraph-cli/tests/cli.rs +++ b/crates/omnigraph-cli/tests/cli.rs @@ -2376,3 +2376,295 @@ fn graphs_list_against_local_uri_errors_with_remote_only_message() { "expected 'remote multi-graph server URL' rejection in stderr; got:\n{stderr}" ); } + +fn queries_test_config(graph_uri: &str, entry: &str, gq_file: &str) -> String { + format!( + "graphs:\n local:\n uri: '{}'\n queries:\n {entry}:\n file: ./{gq_file}\n\ + cli:\n graph: local\npolicy: {{}}\n", + graph_uri.replace('\'', "''") + ) +} + +#[test] +fn queries_validate_exits_zero_on_clean_registry() { + let graph = SystemGraph::loaded(); + graph.write_query( + "find_person.gq", + "query find_person($name: String) { match { $p: Person { name: $name } } return { $p.age } }", + ); + let config = graph.write_config( + "omnigraph.yaml", + &queries_test_config(&graph.path().to_string_lossy(), "find_person", "find_person.gq"), + ); + let output = output_success(cli().arg("queries").arg("validate").arg("--config").arg(&config)); + let stdout = stdout_string(&output); + assert!(stdout.contains("OK"), "stdout:\n{stdout}"); +} + +#[test] +fn queries_validate_exits_nonzero_on_type_broken_query() { + let graph = SystemGraph::loaded(); + // `Widget` is not in the fixture schema. 
+ graph.write_query("ghost.gq", "query ghost() { match { $w: Widget } return { $w.name } }"); + let config = graph.write_config( + "omnigraph.yaml", + &queries_test_config(&graph.path().to_string_lossy(), "ghost", "ghost.gq"), + ); + let output = output_failure(cli().arg("queries").arg("validate").arg("--config").arg(&config)); + let stdout = stdout_string(&output); + assert!( + stdout.contains("ghost"), + "validation should name the broken query; stdout:\n{stdout}" + ); +} + +#[test] +fn queries_list_prints_registered_query() { + let graph = SystemGraph::loaded(); + graph.write_query( + "find_person.gq", + "query find_person($name: String) { match { $p: Person { name: $name } } return { $p.age } }", + ); + // Exposed with an explicit tool name so the list shows the MCP suffix. + let config = graph.write_config( + "omnigraph.yaml", + &format!( + concat!( + "graphs:\n", + " local:\n", + " uri: '{}'\n", + " queries:\n", + " find_person:\n", + " file: ./find_person.gq\n", + " mcp: {{ expose: true, tool_name: lookup_person }}\n", + "cli:\n", + " graph: local\n", + "policy: {{}}\n", + ), + graph.path().to_string_lossy().replace('\'', "''") + ), + ); + let output = output_success(cli().arg("queries").arg("list").arg("--config").arg(&config)); + let stdout = stdout_string(&output); + assert!(stdout.contains("find_person"), "stdout:\n{stdout}"); + assert!( + stdout.contains("$name: String"), + "list should show typed params; stdout:\n{stdout}" + ); + assert!( + stdout.contains("[mcp: lookup_person]"), + "list should show the MCP tool name for exposed queries; stdout:\n{stdout}" + ); +} + +#[test] +fn queries_list_requires_graph_selection_for_per_graph_only_registries() { + let graph = SystemGraph::loaded(); + graph.write_query( + "find_person.gq", + "query find_person($name: String) { match { $p: Person { name: $name } } return { $p.age } }", + ); + let config = graph.write_config( + "omnigraph.yaml", + &format!( + concat!( + "graphs:\n", + " local:\n", + " uri: '{}'\n", + 
" queries:\n", + " find_person:\n", + " file: ./find_person.gq\n", + "policy: {{}}\n", + ), + graph.path().to_string_lossy().replace('\'', "''") + ), + ); + + let output = output_failure(cli().arg("queries").arg("list").arg("--config").arg(&config)); + let stderr = String::from_utf8_lossy(&output.stderr); + assert!( + stderr.contains("local") && stderr.contains("--target local"), + "error must name the graph and give a concrete selection hint; stderr:\n{stderr}" + ); +} + +#[test] +fn queries_list_without_graph_selection_lists_top_level_registry() { + let graph = SystemGraph::loaded(); + graph.write_query( + "top_find.gq", + "query top_find($name: String) { match { $p: Person { name: $name } } return { $p.age } }", + ); + let config = graph.write_config( + "omnigraph.yaml", + concat!( + "queries:\n", + " top_find:\n", + " file: ./top_find.gq\n", + "policy: {}\n", + ), + ); + + let output = output_success(cli().arg("queries").arg("list").arg("--config").arg(&config)); + let stdout = stdout_string(&output); + assert!(stdout.contains("top_find"), "stdout:\n{stdout}"); +} + +#[test] +fn queries_list_unknown_target_errors() { + // `queries list` opens no graph URI, so unknown-graph validation can't ride + // along on URI resolution the way it does for every other command. An + // unknown `--target` must still error (naming the graph) instead of + // silently falling back to the top-level registry and showing the wrong + // (or empty) catalog. 
+ let graph = SystemGraph::loaded(); + graph.write_query( + "find_person.gq", + "query find_person($name: String) { match { $p: Person { name: $name } } return { $p.age } }", + ); + let config = graph.write_config( + "omnigraph.yaml", + &queries_test_config(&graph.path().to_string_lossy(), "find_person", "find_person.gq"), + ); + let output = output_failure( + cli() + .arg("queries") + .arg("list") + .arg("--target") + .arg("nonexistent") + .arg("--config") + .arg(&config), + ); + let stderr = String::from_utf8_lossy(&output.stderr); + assert!( + stderr.contains("nonexistent"), + "error must name the unknown graph; stderr:\n{stderr}" + ); +} + +#[test] +fn queries_commands_reject_named_graph_with_populated_top_level_block() { + // A named graph (here via `cli.graph`) uses its own `graphs.` block, + // so a populated top-level `queries:` block would be silently ignored β€” a + // config the server REFUSES to boot. `queries validate`/`list` must reject + // it too (matching boot) instead of validating/listing the per-graph block + // and giving a false green. + let graph = SystemGraph::loaded(); + graph.write_query( + "find_person.gq", + "query find_person($name: String) { match { $p: Person { name: $name } } return { $p.age } }", + ); + let config = graph.write_config( + "omnigraph.yaml", + &format!( + concat!( + "graphs:\n", + " local:\n", + " uri: '{}'\n", + " queries:\n", + " find_person:\n", + " file: ./find_person.gq\n", + "cli:\n", + " graph: local\n", + "queries:\n", // populated top-level block: the coherence violation + " legacy:\n", + " file: ./legacy.gq\n", + "policy: {{}}\n", + ), + graph.path().to_string_lossy().replace('\'', "''") + ), + ); + // Both resolve `local` from cli.graph (no positional URI), so both must + // error and name the graph + the ignored block β€” like server boot does. 
+ for sub in ["validate", "list"] { + let output = output_failure(cli().arg("queries").arg(sub).arg("--config").arg(&config)); + let stderr = String::from_utf8_lossy(&output.stderr); + assert!( + stderr.contains("local") && stderr.contains("queries"), + "`queries {sub}` must reject a named graph with a populated top-level block; stderr:\n{stderr}" + ); + } +} + +#[test] +fn queries_validate_exits_nonzero_on_duplicate_tool_name() { + // Two exposed queries claiming one MCP tool name is a load-time + // collision β€” `queries validate` must fail (offline, before the engine + // opens) and name both queries plus the contested tool. + let graph = SystemGraph::loaded(); + graph.write_query("a.gq", "query a() { match { $p: Person } return { $p.name } }"); + graph.write_query("b.gq", "query b() { match { $p: Person } return { $p.name } }"); + let config = graph.write_config( + "omnigraph.yaml", + &format!( + concat!( + "graphs:\n", + " local:\n", + " uri: '{}'\n", + " queries:\n", + " a:\n", + " file: ./a.gq\n", + " mcp: {{ expose: true, tool_name: dup }}\n", + " b:\n", + " file: ./b.gq\n", + " mcp: {{ expose: true, tool_name: dup }}\n", + "cli:\n", + " graph: local\n", + "policy: {{}}\n", + ), + graph.path().to_string_lossy().replace('\'', "''") + ), + ); + let output = output_failure(cli().arg("queries").arg("validate").arg("--config").arg(&config)); + let stderr = String::from_utf8_lossy(&output.stderr); + assert!( + stderr.contains("dup") && stderr.contains("'a'") && stderr.contains("'b'"), + "duplicate tool name should be reported naming both queries; stderr:\n{stderr}" + ); +} + +#[test] +fn queries_validate_positional_uri_ignores_default_graph() { + // A positional URI is anonymous β†’ the schema AND the registry both come + // from top-level, even when `cli.graph` names a graph whose per-graph + // queries would fail. Pins that the URI and registry can't diverge. 
+ let graph = SystemGraph::loaded(); + graph.write_query( + "clean.gq", + "query clean($name: String) { match { $p: Person { name: $name } } return { $p.age } }", + ); + // `Widget` is not in the fixture schema β€” the default graph's per-graph + // query would break validate if it were (wrongly) selected. + graph.write_query("broken.gq", "query broken() { match { $w: Widget } return { $w.name } }"); + let config = graph.write_config( + "omnigraph.yaml", + concat!( + "cli:\n graph: prod\n", + "graphs:\n", + " prod:\n", + " uri: /nonexistent-prod.omni\n", + " queries:\n", + " broken:\n", + " file: ./broken.gq\n", + "queries:\n", + " clean:\n", + " file: ./clean.gq\n", + "policy: {}\n", + ), + ); + // Positional URI = the real loaded graph; selection is anonymous, so the + // CLEAN top-level registry validates (not prod's broken one). + let output = output_success( + cli() + .arg("queries") + .arg("validate") + .arg(graph.path()) + .arg("--config") + .arg(&config), + ); + let stdout = stdout_string(&output); + assert!( + stdout.contains("OK"), + "positional URI must validate the top-level registry, not the cli.graph default; stdout:\n{stdout}" + ); +} diff --git a/crates/omnigraph-cli/tests/system_local.rs b/crates/omnigraph-cli/tests/system_local.rs index 08f653d..4fc3e9a 100644 --- a/crates/omnigraph-cli/tests/system_local.rs +++ b/crates/omnigraph-cli/tests/system_local.rs @@ -74,14 +74,36 @@ project: graphs: local: uri: {} + policy: + file: ./policy.yaml cli: graph: local branch: main query: roots: - . -policy: - file: ./policy.yaml +", + yaml_string(&graph.path().to_string_lossy()) + ) +} + +fn local_policy_server_graph_config(graph: &SystemGraph) -> String { + format!( + "\ +project: + name: policy-e2e-local +graphs: + local: + uri: {} + policy: + file: ./policy.yaml +server: + graph: local +cli: + branch: main +query: + roots: + - . 
", yaml_string(&graph.path().to_string_lossy()) ) @@ -1000,49 +1022,55 @@ query vector_search($q: String) { #[test] fn local_cli_policy_tooling_is_end_to_end() { // Sanity check for the read-only policy CLI surfaces. These don't - // mutate the graph β€” they just parse and evaluate the policy file β€” - // so they don't depend on PR #4's engine-side enforcement. + // mutate the graph; they parse and evaluate the effective policy for + // named graph selections, including per-graph policy files. let graph = SystemGraph::loaded(); let config = graph.write_config("omnigraph-policy.yaml", &local_policy_config(&graph)); + let server_graph_config = graph.write_config( + "omnigraph-policy-server.yaml", + &local_policy_server_graph_config(&graph), + ); graph.write_config("policy.yaml", POLICY_E2E_YAML); graph.write_config("policy.tests.yaml", POLICY_E2E_TESTS_YAML); - let validate = output_success( - cli() - .arg("policy") - .arg("validate") - .arg("--config") - .arg(&config), - ); - assert!(stdout_string(&validate).contains("policy valid:")); + for config in [&config, &server_graph_config] { + let validate = output_success( + cli() + .arg("policy") + .arg("validate") + .arg("--config") + .arg(config), + ); + assert!(stdout_string(&validate).contains("policy valid:")); - let tests = output_success(cli().arg("policy").arg("test").arg("--config").arg(&config)); - assert!(stdout_string(&tests).contains("policy tests passed: 2 cases")); + let tests = output_success(cli().arg("policy").arg("test").arg("--config").arg(config)); + assert!(stdout_string(&tests).contains("policy tests passed: 2 cases")); - let explain = output_success( - cli() - .arg("policy") - .arg("explain") - .arg("--config") - .arg(&config) - .arg("--actor") - .arg("act-bruno") - .arg("--action") - .arg("change") - .arg("--branch") - .arg("main"), - ); - let explain_stdout = stdout_string(&explain); - assert!(explain_stdout.contains("decision: deny")); - assert!(explain_stdout.contains("branch: main")); + let 
explain = output_success( + cli() + .arg("policy") + .arg("explain") + .arg("--config") + .arg(config) + .arg("--actor") + .arg("act-bruno") + .arg("--action") + .arg("change") + .arg("--branch") + .arg("main"), + ); + let explain_stdout = stdout_string(&explain); + assert!(explain_stdout.contains("decision: deny")); + assert!(explain_stdout.contains("branch: main")); + } } #[test] fn local_cli_change_enforces_engine_layer_policy() { - // Asserts MR-722 PR #4: when `policy.file` is configured in - // `omnigraph.yaml`, the CLI loads PolicyEngine into Omnigraph and - // every direct-engine write hits `enforce(action, scope, actor)` β€” - // identical to what the HTTP server gets, regardless of transport. + // Asserts MR-722 PR #4: when the selected graph has a configured + // policy file, the CLI loads PolicyEngine into Omnigraph and every + // direct-engine write hits `enforce(action, scope, actor)` β€” identical + // to what the HTTP server gets, regardless of transport. // // Three cases, each discriminating: // @@ -1135,6 +1163,32 @@ fn local_cli_change_enforces_engine_layer_policy() { assert_eq!(verify["rows"][0]["p.name"], "RagnorOnMain"); } +#[test] +fn local_cli_positional_uri_does_not_inherit_default_graph_policy() { + let graph = SystemGraph::loaded(); + let config = graph.write_config("omnigraph-policy.yaml", &local_policy_config(&graph)); + graph.write_config("policy.yaml", POLICY_E2E_YAML); + let mutation_file = insert_person_query(&graph, "system-local-policy-positional.gq"); + + let allowed = parse_stdout_json(&output_success( + cli() + .arg("--as") + .arg("act-bruno") + .arg("change") + .arg("--config") + .arg(&config) + .arg("--uri") + .arg(graph.path()) + .arg("--query") + .arg(&mutation_file) + .arg("--params") + .arg(r#"{"name":"PositionalUriBruno","age":4}"#) + .arg("--json"), + )); + assert_eq!(allowed["affected_nodes"], 1); + assert_eq!(allowed["actor_id"], "act-bruno"); +} + // ─── MR-722 PR A: CLIΓ—writer matrix 
─────────────────────────────────────── // // The change writer is covered above by `local_cli_change_enforces_engine_layer_policy`. @@ -1293,6 +1347,62 @@ fn local_cli_schema_apply_enforces_engine_layer_policy() { assert_eq!(allowed["applied"], true); } +#[test] +fn local_cli_schema_apply_rejects_stored_query_breakage_before_publish() { + let graph = SystemGraph::loaded(); + graph.write_query( + "stored-find-person.gq", + "query find_person($name: String) { match { $p: Person { name: $name } } return { $p.age } }", + ); + let config = graph.write_config( + "omnigraph-stored-query-schema.yaml", + &format!( + "\ +graphs: + local: + uri: {} + queries: + find_person: + file: ./stored-find-person.gq +cli: + graph: local + branch: main +query: + roots: + - . +policy: {{}} +", + yaml_string(&graph.path().to_string_lossy()) + ), + ); + let renamed_schema = std::fs::read_to_string(fixture("test.pg")) + .unwrap() + .replace("age: I32?", "years: I32? @rename_from(\"age\")"); + let schema_path = graph.write_file("stored-query-breaks.pg", &renamed_schema); + + let rejected = output_failure( + cli() + .arg("schema") + .arg("apply") + .arg("--config") + .arg(&config) + .arg("--schema") + .arg(&schema_path) + .arg("--json"), + ); + let stderr = String::from_utf8_lossy(&rejected.stderr); + assert!( + stderr.contains("find_person") && stderr.contains("schema check"), + "schema apply should reject the stored-query breakage before publish; stderr: {stderr}" + ); + + let schema = stdout_string(&output_success( + cli().arg("schema").arg("show").arg("--config").arg(&config), + )); + assert!(schema.contains("age: I32?")); + assert!(!schema.contains("years: I32?")); +} + #[test] fn local_cli_branch_create_enforces_engine_layer_policy() { let graph = SystemGraph::loaded(); @@ -1448,6 +1558,8 @@ project: graphs: local: uri: {} + policy: + file: ./policy.yaml cli: graph: local branch: main @@ -1455,8 +1567,6 @@ cli: query: roots: - . 
-policy: - file: ./policy.yaml ", yaml_string(&graph.path().to_string_lossy()), actor, diff --git a/crates/omnigraph-cli/tests/system_remote.rs b/crates/omnigraph-cli/tests/system_remote.rs index c86e32e..45bf502 100644 --- a/crates/omnigraph-cli/tests/system_remote.rs +++ b/crates/omnigraph-cli/tests/system_remote.rs @@ -60,10 +60,10 @@ project: graphs: local: uri: {} + policy: + file: ./policy.yaml server: graph: local -policy: - file: ./policy.yaml ", yaml_string(&graph.path().to_string_lossy()) ) diff --git a/crates/omnigraph-compiler/Cargo.toml b/crates/omnigraph-compiler/Cargo.toml index 229b862..545db83 100644 --- a/crates/omnigraph-compiler/Cargo.toml +++ b/crates/omnigraph-compiler/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "omnigraph-compiler" -version = "0.6.0" +version = "0.6.1" edition = "2024" description = "Schema/query compiler for Omnigraph. Zero Lance dependency." license = "MIT" diff --git a/crates/omnigraph-policy/Cargo.toml b/crates/omnigraph-policy/Cargo.toml index dacda35..3d14fc5 100644 --- a/crates/omnigraph-policy/Cargo.toml +++ b/crates/omnigraph-policy/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "omnigraph-policy" -version = "0.6.0" +version = "0.6.1" edition = "2024" description = "Policy / authorization layer for Omnigraph β€” Cedar-backed PolicyEngine, PolicyChecker trait, ResourceScope enum." license = "MIT" diff --git a/crates/omnigraph-policy/src/lib.rs b/crates/omnigraph-policy/src/lib.rs index 6459fcd..cb59796 100644 --- a/crates/omnigraph-policy/src/lib.rs +++ b/crates/omnigraph-policy/src/lib.rs @@ -56,6 +56,21 @@ pub enum PolicyAction { /// from v0.6.0; operators add and remove graphs by editing /// `omnigraph.yaml` and restarting. GraphList, + /// Gates invoking a server-side stored query by name. 
Per-graph and + /// **graph-scoped** (no branch dimension, like `Admin`): the per-branch + /// access of the query body is enforced by the inner `Read`/`Change` + /// gate, so branch-scoping this outer gate would be redundant (and was + /// wrong for snapshot reads). A rule that sets `branch_scope` on + /// `invoke_query` is rejected by `validate()`. In this release it is + /// **coarse**: an `invoke_query` allow rule permits *any* stored query + /// on the graph (no per-query dimension yet); a future, additive + /// refinement adds an optional query-name scope. + /// + /// This gate sits at the HTTP boundary. The engine `_as` writers still + /// enforce `Read`/`Change` per the query body, so a stored *mutation* + /// is double-gated: `invoke_query` to reach the tool, plus `change` for + /// the write itself. + InvokeQuery, } impl PolicyAction { @@ -70,6 +85,7 @@ impl PolicyAction { Self::BranchMerge => "branch_merge", Self::Admin => "admin", Self::GraphList => "graph_list", + Self::InvokeQuery => "invoke_query", } } @@ -99,7 +115,8 @@ impl PolicyAction { | Self::BranchCreate | Self::BranchDelete | Self::BranchMerge - | Self::Admin => PolicyResourceKind::Graph, + | Self::Admin + | Self::InvokeQuery => PolicyResourceKind::Graph, } } } @@ -155,6 +172,7 @@ impl FromStr for PolicyAction { "branch_merge" => Ok(Self::BranchMerge), "admin" => Ok(Self::Admin), "graph_list" => Ok(Self::GraphList), + "invoke_query" => Ok(Self::InvokeQuery), other => bail!("unknown policy action '{other}'"), } } @@ -806,6 +824,7 @@ namespace Omnigraph { action "branch_delete" appliesTo { principal: Actor, resource: Graph, context: RequestContext }; action "branch_merge" appliesTo { principal: Actor, resource: Graph, context: RequestContext }; action "admin" appliesTo { principal: Actor, resource: Graph, context: RequestContext }; + action "invoke_query" appliesTo { principal: Actor, resource: Graph, context: RequestContext }; action "graph_list" appliesTo { principal: Actor, resource: Server, 
context: RequestContext }; } @@ -1264,6 +1283,80 @@ rules: assert!(!deny.allowed); } + #[test] + fn invoke_query_authorizes_per_graph() { + let policy: PolicyConfig = serde_yaml::from_str( + r#" +version: 1 +groups: + team: [act-alice] + others: [act-bruno] +rules: + - id: team-invoke-queries + allow: + actors: { group: team } + actions: [invoke_query] +"#, + ) + .unwrap(); + let engine = PolicyCompiler::compile(&policy, "graph").unwrap(); + + let allow = engine + .authorize( + "act-alice", + &PolicyRequest { + action: PolicyAction::InvokeQuery, + branch: None, + target_branch: None, + }, + ) + .unwrap(); + assert!(allow.allowed); + assert_eq!( + allow.matched_rule_id.as_deref(), + Some("team-invoke-queries") + ); + + // Actor outside the group β†’ deny. + let deny = engine + .authorize( + "act-bruno", + &PolicyRequest { + action: PolicyAction::InvokeQuery, + branch: None, + target_branch: None, + }, + ) + .unwrap(); + assert!(!deny.allowed); + } + + #[test] + fn invoke_query_rejects_branch_scope() { + // invoke_query is graph-scoped (like admin) β€” per-branch access is + // enforced by the inner read/change gate β€” so a rule that puts a + // `branch_scope` qualifier on it is rejected at validate(). 
+ let policy: PolicyConfig = serde_yaml::from_str( + r#" +version: 1 +groups: + team: [act-alice] +rules: + - id: team-invoke-any-branch + allow: + actors: { group: team } + actions: [invoke_query] + branch_scope: any +"#, + ) + .unwrap(); + let err = policy.validate().unwrap_err().to_string(); + assert!( + err.contains("branch_scope") && err.contains("invoke_query"), + "branch_scope on invoke_query must be rejected: {err}" + ); + } + #[test] fn server_scoped_rule_cannot_use_branch_scope() { let policy: PolicyConfig = serde_yaml::from_str( diff --git a/crates/omnigraph-server/Cargo.toml b/crates/omnigraph-server/Cargo.toml index e9a0e46..5994aa1 100644 --- a/crates/omnigraph-server/Cargo.toml +++ b/crates/omnigraph-server/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "omnigraph-server" -version = "0.6.0" +version = "0.6.1" edition = "2024" description = "HTTP server for the Omnigraph graph database." license = "MIT" @@ -19,9 +19,9 @@ default = [] aws = ["dep:aws-config", "dep:aws-sdk-secretsmanager"] [dependencies] -omnigraph = { package = "omnigraph-engine", path = "../omnigraph", version = "0.6.0" } -omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.6.0" } -omnigraph-policy = { path = "../omnigraph-policy", version = "0.6.0" } +omnigraph = { package = "omnigraph-engine", path = "../omnigraph", version = "0.6.1" } +omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.6.1" } +omnigraph-policy = { path = "../omnigraph-policy", version = "0.6.1" } axum = { workspace = true } clap = { workspace = true } color-eyre = { workspace = true } diff --git a/crates/omnigraph-server/src/api.rs b/crates/omnigraph-server/src/api.rs index 2c818ae..4a6024f 100644 --- a/crates/omnigraph-server/src/api.rs +++ b/crates/omnigraph-server/src/api.rs @@ -1,8 +1,11 @@ use omnigraph::db::{GraphCommit, MergeOutcome, ReadTarget, SchemaApplyResult, Snapshot}; use omnigraph::error::{MergeConflict, MergeConflictKind}; use omnigraph::loader::{IngestResult, 
LoadMode}; +use crate::queries::StoredQuery; use omnigraph_compiler::SchemaMigrationStep; +use omnigraph_compiler::query::ast::Param; use omnigraph_compiler::result::QueryResult; +use omnigraph_compiler::types::{PropType, ScalarType}; use serde::{Deserialize, Serialize}; use serde_json::Value; use utoipa::{IntoParams, ToSchema}; @@ -300,6 +303,162 @@ pub struct ChangeRequest { pub branch: Option, } +/// Body for `POST /queries/{name}` β€” invokes the server-side stored query +/// named in the path. The query source and name come from the registry, +/// never the body; only the runtime inputs are supplied here. +#[derive(Debug, Clone, Default, Serialize, Deserialize, ToSchema)] +pub struct InvokeStoredQueryRequest { + /// JSON object whose keys match the stored query's declared parameters. + #[serde(default)] + pub params: Option, + /// Branch to run against. Defaults to `main`; for a stored mutation the + /// write targets this branch. + #[serde(default)] + pub branch: Option, + /// Snapshot id to read from (read queries only β€” rejected for a stored + /// mutation). Mutually exclusive with `branch`. + #[serde(default)] + pub snapshot: Option, +} + +/// Response for `POST /queries/{name}`: the read envelope for a stored +/// read, or the mutation envelope for a stored mutation. Serialized +/// **untagged**, so the wire shape is exactly [`ReadOutput`] or +/// [`ChangeOutput`] β€” classification follows the stored query, not a +/// wrapper field. +#[derive(Debug, Serialize, ToSchema)] +#[serde(untagged)] +pub enum InvokeStoredQueryResponse { + Read(ReadOutput), + Change(ChangeOutput), +} + +/// The kind of a stored-query parameter, decomposed so a client (e.g. an +/// MCP server) can build a typed input schema with a closed `match` and +/// never re-parse omnigraph's type spelling. 
`bigint`/`date`/`datetime`/ +/// `blob` are carried as JSON strings on the wire: a 64-bit integer past +/// 2^53 loses precision as a JSON number, and Date/DateTime are ISO +/// strings, Blob a blob-URI string. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, ToSchema)] +#[serde(rename_all = "snake_case")] +pub enum ParamKind { + String, + Bool, + Int, + #[serde(rename = "bigint")] + BigInt, + Float, + Date, + #[serde(rename = "datetime")] + DateTime, + Blob, + Vector, + List, +} + +/// One declared parameter of a stored query, projected for the catalog. +#[derive(Debug, Clone, Serialize, Deserialize, ToSchema)] +pub struct ParamDescriptor { + pub name: String, + pub kind: ParamKind, + /// Element kind when `kind == list` (always a scalar β€” the grammar + /// forbids lists of vectors or nested lists). + #[serde(skip_serializing_if = "Option::is_none")] + pub item_kind: Option, + /// Dimension when `kind == vector`. + #[serde(skip_serializing_if = "Option::is_none")] + pub vector_dim: Option, + /// `false` β†’ the caller must supply it; `true` β†’ optional. + pub nullable: bool, +} + +/// One entry in the stored-query catalog (`GET /queries`). +#[derive(Debug, Clone, Serialize, Deserialize, ToSchema)] +pub struct QueryCatalogEntry { + /// Registry key / invoke path segment (`POST /queries/{name}`). + pub name: String, + /// MCP tool id (the `tool_name` override, else `name`). + pub tool_name: String, + #[serde(skip_serializing_if = "Option::is_none")] + pub description: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub instruction: Option, + /// `true` for a stored mutation β†’ an MCP read-only hint of `false`. + pub mutation: bool, + pub params: Vec, +} + +/// Response for `GET /queries`: the `mcp.expose` subset of a graph's +/// stored-query registry, each with typed parameters. 
+#[derive(Debug, Clone, Serialize, Deserialize, ToSchema)] +pub struct QueriesCatalogOutput { + pub queries: Vec, +} + +/// Total map from a resolved scalar to its catalog kind. Exhaustive on +/// purpose: a new `ScalarType` is a compile error here until catalogued. +fn scalar_kind(scalar: ScalarType) -> ParamKind { + match scalar { + ScalarType::String => ParamKind::String, + ScalarType::Bool => ParamKind::Bool, + ScalarType::I32 | ScalarType::U32 => ParamKind::Int, + ScalarType::I64 | ScalarType::U64 => ParamKind::BigInt, + ScalarType::F32 | ScalarType::F64 => ParamKind::Float, + ScalarType::Date => ParamKind::Date, + ScalarType::DateTime => ParamKind::DateTime, + ScalarType::Blob => ParamKind::Blob, + ScalarType::Vector(_) => ParamKind::Vector, + } +} + +fn param_descriptor(param: &Param) -> ParamDescriptor { + match PropType::from_param_type_name(¶m.type_name, param.nullable) { + Some(pt) if pt.list => ParamDescriptor { + name: param.name.clone(), + kind: ParamKind::List, + item_kind: Some(scalar_kind(pt.scalar)), + vector_dim: None, + nullable: param.nullable, + }, + Some(pt) => { + let (kind, vector_dim) = match pt.scalar { + ScalarType::Vector(dim) => (ParamKind::Vector, Some(dim)), + other => (scalar_kind(other), None), + }; + ParamDescriptor { + name: param.name.clone(), + kind, + item_kind: None, + vector_dim, + nullable: param.nullable, + } + } + // Unreachable for a parsed query (every declared param type is + // grammatical); fall back to an opaque string so the field is still + // usable rather than dropped. + None => ParamDescriptor { + name: param.name.clone(), + kind: ParamKind::String, + item_kind: None, + vector_dim: None, + nullable: param.nullable, + }, + } +} + +/// Project a loaded stored query into its catalog entry (typed params, +/// MCP tool name, read/mutate flag, description/instruction). 
+pub fn query_catalog_entry(query: &StoredQuery) -> QueryCatalogEntry { + QueryCatalogEntry { + name: query.name.clone(), + tool_name: query.effective_tool_name().to_string(), + description: query.decl.description.clone(), + instruction: query.decl.instruction.clone(), + mutation: query.is_mutation(), + params: query.decl.params.iter().map(param_descriptor).collect(), + } +} + #[derive(Debug, Clone, Default, Serialize, Deserialize, ToSchema)] pub struct SchemaApplyRequest { /// Project schema in `.pg` source form. The diff against the current diff --git a/crates/omnigraph-server/src/config.rs b/crates/omnigraph-server/src/config.rs index 87737d0..b308b72 100644 --- a/crates/omnigraph-server/src/config.rs +++ b/crates/omnigraph-server/src/config.rs @@ -9,6 +9,13 @@ use serde::{Deserialize, Serialize}; pub const DEFAULT_CONFIG_FILE: &str = "omnigraph.yaml"; +pub fn graph_resource_id_for_selection( + selected_graph: Option<&str>, + normalized_uri: &str, +) -> String { + selected_graph.unwrap_or(normalized_uri).to_string() +} + #[derive(Debug, Clone, Default, Serialize, Deserialize)] pub struct ProjectConfig { pub name: Option, @@ -24,6 +31,14 @@ pub struct TargetConfig { /// graph's HTTP-layer Cedar enforcement. #[serde(default)] pub policy: PolicySettings, + /// Per-graph stored-query registry: an inline `name -> entry` + /// map. Mirrors the per-graph `policy` shape β€” each + /// `graphs..queries` declares that graph's stored queries. Absent + /// (or empty) = no stored queries for the graph. v1 is inline-only; + /// an external `queries.yaml` manifest indirection is a deferred + /// convenience. + #[serde(default)] + pub queries: BTreeMap, } #[derive(Debug, Clone, Copy, Default, Eq, PartialEq, Serialize, Deserialize, ValueEnum)] @@ -90,6 +105,50 @@ pub struct PolicySettings { pub file: Option, } +/// One stored-query registry entry. 
The map **key** is the query's +/// identity β€” it must equal the `query ` symbol declared inside +/// the referenced `.gq` file (asserted when the registry loads). +/// Renaming the key (or the symbol) is a breaking change to callers, by +/// design. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct QueryEntry { + /// Path to the `.gq` file (relative to the config's `base_dir`). The + /// file may declare several queries; the registry selects the one + /// whose symbol matches the map key. + pub file: String, + #[serde(default)] + pub mcp: McpSettings, +} + +/// MCP exposure for a stored query. A *deployment* concern (the same +/// `.gq` may be exposed in one graph and hidden in another), so it lives +/// in YAML rather than in the `.gq` source. **Default `expose: true`** β€” +/// declaring a query in the manifest *is* the opt-in, so it appears in the +/// MCP tool catalog (`GET /queries`) by default; set `expose: false` to +/// keep a query HTTP/service-callable but hidden from the agent tool list. +/// `expose` governs catalog membership only β€” it is **not** an +/// authorization gate (invocation is gated by `invoke_query`), so a hidden +/// query is still invocable by name with the right permission. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct McpSettings { + #[serde(default = "mcp_expose_default")] + pub expose: bool, + pub tool_name: Option, +} + +fn mcp_expose_default() -> bool { + true +} + +impl Default for McpSettings { + fn default() -> Self { + Self { + expose: mcp_expose_default(), + tool_name: None, + } + } +} + #[derive(Debug, Clone, Copy, Eq, PartialEq, Serialize, Deserialize)] #[serde(rename_all = "snake_case")] pub enum AliasCommand { @@ -137,6 +196,12 @@ pub struct OmnigraphConfig { pub aliases: BTreeMap, #[serde(default)] pub policy: PolicySettings, + /// Top-level stored-query registry, used in single-graph + /// mode β€” mirrors how the top-level `policy` applies to the single + /// graph. 
In multi-graph mode this is unused; each graph's + /// `graphs..queries` applies instead. + #[serde(default)] + pub queries: BTreeMap, #[serde(skip)] base_dir: PathBuf, } @@ -152,6 +217,7 @@ impl Default for OmnigraphConfig { query: QueryDefaults::default(), aliases: BTreeMap::new(), policy: PolicySettings::default(), + queries: BTreeMap::new(), base_dir: PathBuf::new(), } } @@ -244,6 +310,124 @@ impl OmnigraphConfig { .map(|path| self.resolve_config_path(path)) } + /// The top-level stored-query registry entries (single-graph mode). + pub fn query_entries(&self) -> &BTreeMap { + &self.queries + } + + /// The per-graph stored-query registry entries for a named target + /// (multi-graph mode). Returns `None` if the target is unknown. + pub fn target_query_entries( + &self, + target_name: &str, + ) -> Option<&BTreeMap> { + self.graphs.get(target_name).map(|target| &target.queries) + } + + /// The stored-query registry entries that apply for a graph + /// selection β€” the single definition of "which `queries:` block + /// governs graph X", shared by server boot and the CLI so the two + /// can't drift. A named graph present in `graphs:` uses its + /// per-graph block; everything else (no selection, or a name that is + /// not a known graph, e.g. a bare URI) falls back to the top-level + /// block (single-graph mode). + pub fn query_entries_for(&self, graph: Option<&str>) -> &BTreeMap { + match graph { + Some(name) if self.graphs.contains_key(name) => &self.graphs[name].queries, + _ => &self.queries, + } + } + + /// The single CLI gate that turns a raw graph selection into a *validated* + /// one β€” the fallible counterpart to the infallible + /// [`OmnigraphConfig::query_entries_for`]. 
Both `queries` subcommands route + /// their selection through here so neither can skip a check the other (or + /// server boot) applies: + /// * a known name passes through, but only after the same coherence check + /// server boot enforces + /// ([`OmnigraphConfig::ensure_top_level_blocks_honored`]) β€” a named graph + /// with a populated top-level block is rejected; + /// * an unknown name errors with the **same** message + /// [`OmnigraphConfig::resolve_target_uri`] produces, so a command that + /// opens no URI rejects an unknown `--target` exactly like the + /// URI-resolving commands do; + /// * an anonymous selection (`None`, e.g. a bare URI) stays anonymous, + /// resolving to the top-level registry downstream (top-level honored). + pub fn resolve_graph_selection<'a>(&self, graph: Option<&'a str>) -> Result> { + match graph { + Some(name) if self.graphs.contains_key(name) => { + self.ensure_top_level_blocks_honored(Some(name))?; + Ok(Some(name)) + } + Some(name) => bail!("graph '{}' not found in {}", name, DEFAULT_CONFIG_FILE), + None => Ok(None), + } + } + + pub fn resolve_policy_tooling_graph_selection(&self) -> Result> { + self.resolve_graph_selection(self.cli_graph_name().or_else(|| self.server_graph_name())) + } + + /// The policy file that applies for a graph selection β€” the policy + /// sibling of [`OmnigraphConfig::query_entries_for`], so policy and + /// queries resolve by the same identity rule. A named graph in + /// `graphs:` uses its per-graph `policy.file` with **no** top-level + /// fallback (a named graph with no per-graph policy has no policy β€” + /// that keeps the boot-time coherence check meaningful); anything else + /// (no selection, or a bare URI) uses the top-level `policy.file`. 
+ pub fn resolve_policy_file_for(&self, graph: Option<&str>) -> Option { + match graph { + Some(name) if self.graphs.contains_key(name) => self.resolve_target_policy_file(name), + _ => self.resolve_policy_file(), + } + } + + /// Names of any top-level config blocks (`policy.file`, `queries:`) + /// that are populated. Used by the boot-time coherence check: when a + /// **named** graph is served (single-mode by name, or multi-mode), + /// the top-level blocks are not honored, so a populated one is a + /// configuration error rather than a silent no-op. + pub fn populated_top_level_blocks(&self) -> Vec<&'static str> { + let mut blocks = Vec::new(); + if self.policy.file.is_some() { + blocks.push("policy.file"); + } + if !self.queries.is_empty() { + blocks.push("queries"); + } + blocks + } + + /// A named graph uses its own `graphs.` block, so a populated + /// top-level block would be silently ignored β€” a config error. The single + /// definition of that rule, shared by server boot and the CLI selection + /// gate ([`OmnigraphConfig::resolve_graph_selection`]) so the two can't + /// drift. An anonymous selection (`None`, e.g. a bare URI) legitimately + /// honors the top-level blocks, so it is never rejected here. + pub fn ensure_top_level_blocks_honored(&self, selected: Option<&str>) -> Result<()> { + if let Some(name) = selected { + let unhonored = self.populated_top_level_blocks(); + if !unhonored.is_empty() { + bail!( + "named graph '{name}' uses its own `graphs.{name}.…` block, but top-level {} \ + {} set and would be ignored. Move it to `graphs.{name}` (e.g. \ + `graphs.{name}.policy.file`, `graphs.{name}.queries`).", + unhonored.join(" and "), + if unhonored.len() == 1 { "is" } else { "are" }, + ); + } + } + Ok(()) + } + + /// Resolve a stored-query `.gq` file path (from a registry entry), + /// relative to the config's `base_dir`. 
Mirrors policy-file + /// resolution; the registry loader calls this to turn each entry's + /// `file:` value into an absolute path. + pub fn resolve_query_file(&self, value: &str) -> PathBuf { + self.resolve_config_path(value) + } + /// Resolve the server-level policy file path (used by management /// endpoints). Returns `None` if `server.policy.file` is not set. pub fn resolve_server_policy_file(&self) -> Option { @@ -387,7 +571,9 @@ mod tests { use tempfile::tempdir; - use super::{ReadOutputFormat, TableCellLayout, load_config_in}; + use super::{ + ReadOutputFormat, TableCellLayout, graph_resource_id_for_selection, load_config_in, + }; #[test] fn load_config_reads_yaml_defaults_from_current_dir() { @@ -451,6 +637,114 @@ policy: {} assert!(config.graphs.is_empty()); } + #[test] + fn graph_resource_id_for_selection_uses_name_or_anonymous_uri() { + assert_eq!( + graph_resource_id_for_selection(Some("local"), "/tmp/graph.omni"), + "local" + ); + assert_eq!( + graph_resource_id_for_selection(None, "/tmp/graph.omni"), + "/tmp/graph.omni" + ); + } + + #[test] + fn resolve_graph_selection_validates_membership_and_coherence() { + let temp = tempdir().unwrap(); + fs::write( + temp.path().join("omnigraph.yaml"), + "graphs:\n local:\n uri: ./demo.omni\n", + ) + .unwrap(); + let config = load_config_in(temp.path(), None).unwrap(); + + // A known graph passes through unchanged. + assert_eq!(config.resolve_graph_selection(Some("local")).unwrap(), Some("local")); + // An anonymous selection stays anonymous (β†’ top-level registry downstream). + assert_eq!(config.resolve_graph_selection(None).unwrap(), None); + // An unknown name errors, naming the graph (matching resolve_target_uri). 
+ let err = config.resolve_graph_selection(Some("ghost")).unwrap_err().to_string(); + assert!( + err.contains("ghost") && err.contains("not found"), + "unknown graph must error naming it: {err}" + ); + + // Coherence: a named graph plus a populated top-level block is the + // config server boot refuses, so the gate rejects it too (shared rule + // via ensure_top_level_blocks_honored). An anonymous selection still + // passes β€” top-level is honored when no graph is named. + let temp2 = tempdir().unwrap(); + fs::write( + temp2.path().join("omnigraph.yaml"), + "graphs:\n local:\n uri: ./demo.omni\npolicy:\n file: ./top.yaml\n", + ) + .unwrap(); + let incoherent = load_config_in(temp2.path(), None).unwrap(); + let err = incoherent + .resolve_graph_selection(Some("local")) + .unwrap_err() + .to_string(); + assert!( + err.contains("local") && err.contains("policy.file"), + "named graph + populated top-level block must be rejected, naming both: {err}" + ); + assert_eq!( + incoherent.resolve_graph_selection(None).unwrap(), + None, + "anonymous selection still honors top-level" + ); + } + + #[test] + fn policy_tooling_graph_selection_prefers_cli_then_server_and_validates() { + let temp = tempdir().unwrap(); + fs::write( + temp.path().join("omnigraph.yaml"), + "graphs:\n local:\n uri: ./local.omni\n prod:\n uri: ./prod.omni\n\ + server:\n graph: local\ncli:\n graph: prod\n", + ) + .unwrap(); + let config = load_config_in(temp.path(), None).unwrap(); + assert_eq!( + config.resolve_policy_tooling_graph_selection().unwrap(), + Some("prod") + ); + + let temp = tempdir().unwrap(); + fs::write( + temp.path().join("omnigraph.yaml"), + "graphs:\n local:\n uri: ./local.omni\nserver:\n graph: local\n", + ) + .unwrap(); + let config = load_config_in(temp.path(), None).unwrap(); + assert_eq!( + config.resolve_policy_tooling_graph_selection().unwrap(), + Some("local") + ); + + let temp = tempdir().unwrap(); + fs::write(temp.path().join("omnigraph.yaml"), "policy: {}\n").unwrap(); + let 
config = load_config_in(temp.path(), None).unwrap(); + assert_eq!(config.resolve_policy_tooling_graph_selection().unwrap(), None); + + let temp = tempdir().unwrap(); + fs::write( + temp.path().join("omnigraph.yaml"), + "graphs:\n local:\n uri: ./local.omni\nserver:\n graph: ghost\n", + ) + .unwrap(); + let config = load_config_in(temp.path(), None).unwrap(); + let err = config + .resolve_policy_tooling_graph_selection() + .unwrap_err() + .to_string(); + assert!( + err.contains("ghost") && err.contains("not found"), + "unknown server.graph must use graph-selection validation: {err}" + ); + } + #[test] fn resolve_query_path_searches_config_roots() { let temp = tempdir().unwrap(); @@ -489,6 +783,118 @@ policy: {} assert_eq!(resolved, config_dir.join("local.gq")); } + #[test] + fn queries_block_round_trips_inline_and_per_graph() { + let temp = tempdir().unwrap(); + fs::write( + temp.path().join("omnigraph.yaml"), + r#" +graphs: + prod: + uri: s3://bucket/prod + queries: + find_user: + file: ./queries/find_user.gq + mcp: + expose: true + tool_name: lookup_user + internal_audit: + file: ./queries/audit.gq +queries: + single_mode_q: + file: ./q.gq +"#, + ) + .unwrap(); + + let config = load_config_in(temp.path(), None).unwrap(); + + // Per-graph registry (multi-graph mode). + let prod = config.target_query_entries("prod").unwrap(); + assert_eq!(prod.len(), 2); + let find_user = &prod["find_user"]; + assert_eq!(find_user.file, "./queries/find_user.gq"); + assert!(find_user.mcp.expose); + assert_eq!(find_user.mcp.tool_name.as_deref(), Some("lookup_user")); + // Default exposure is true (the manifest entry is the opt-in); tool_name absent. + let audit = &prod["internal_audit"]; + assert!(audit.mcp.expose); + assert!(audit.mcp.tool_name.is_none()); + + // Top-level registry (single-graph mode). 
+ assert_eq!(config.query_entries().len(), 1); + + // The shared selector resolves the same blocks the server boot + // and the CLI use: a known graph β†’ its per-graph block; no + // selection or an unknown name β†’ the top-level block (the latter + // pins the behavior of the CLI's now-deleted fallback arm). + assert_eq!(config.query_entries_for(Some("prod")).len(), 2); + assert_eq!(config.query_entries_for(None).len(), 1); + assert_eq!(config.query_entries_for(Some("nonexistent")).len(), 1); + + // Path resolution joins against base_dir, like policy files. + assert_eq!( + config.resolve_query_file(&find_user.file), + temp.path().join("./queries/find_user.gq") + ); + } + + #[test] + fn resolve_policy_file_for_follows_identity() { + let temp = tempdir().unwrap(); + fs::write( + temp.path().join("omnigraph.yaml"), + "policy:\n file: ./top.yaml\ngraphs:\n prod:\n uri: s3://b/prod\n \ + policy:\n file: ./prod.yaml\n bare:\n uri: s3://b/bare\n", + ) + .unwrap(); + let config = load_config_in(temp.path(), None).unwrap(); + + // Named graph with its own policy β†’ per-graph (not top-level). + assert!( + config + .resolve_policy_file_for(Some("prod")) + .unwrap() + .ends_with("prod.yaml") + ); + // Named graph with NO per-graph policy β†’ None (no top-level fallback; + // load-bearing for the boot coherence check). + assert!(config.resolve_policy_file_for(Some("bare")).is_none()); + // Anonymous (bare URI) or an unknown name β†’ top-level. + assert!( + config + .resolve_policy_file_for(None) + .unwrap() + .ends_with("top.yaml") + ); + assert!( + config + .resolve_policy_file_for(Some("nope")) + .unwrap() + .ends_with("top.yaml") + ); + } + + #[test] + fn queries_block_absent_yields_empty_registry() { + let temp = tempdir().unwrap(); + fs::write( + temp.path().join("omnigraph.yaml"), + "graphs:\n local:\n uri: ./demo.omni\n", + ) + .unwrap(); + + let config = load_config_in(temp.path(), None).unwrap(); + // Additive: no `queries:` anywhere β†’ empty registries everywhere. 
+ assert!(config.query_entries().is_empty()); + assert!( + config + .target_query_entries("local") + .unwrap() + .is_empty() + ); + } + #[test] fn policy_block_accepts_non_empty_mapping() { let temp = tempdir().unwrap(); diff --git a/crates/omnigraph-server/src/lib.rs b/crates/omnigraph-server/src/lib.rs index ad41f9d..60ebef3 100644 --- a/crates/omnigraph-server/src/lib.rs +++ b/crates/omnigraph-server/src/lib.rs @@ -4,6 +4,7 @@ pub mod config; pub mod graph_id; pub mod identity; pub mod policy; +pub mod queries; pub mod registry; pub mod workload; @@ -11,6 +12,8 @@ pub use graph_id::GraphId; pub use identity::{AuthSource, GraphKey, ResolvedActor, Scope, TenantId}; pub use registry::{GraphHandle, GraphRegistry, InsertError, RegistryLookup, RegistrySnapshot}; +use crate::queries::{QueryRegistry, check, format_check_breakages}; + use std::collections::{HashMap, HashSet}; use std::fs; use std::io; @@ -22,7 +25,8 @@ use api::{ BranchCreateOutput, BranchCreateRequest, BranchDeleteOutput, BranchListOutput, BranchMergeOutput, BranchMergeRequest, ChangeOutput, ChangeRequest, CommitListOutput, CommitListQuery, ErrorCode, ErrorOutput, ExportRequest, GraphInfo, GraphListResponse, - HealthOutput, IngestOutput, IngestRequest, QueryRequest, ReadOutput, ReadRequest, + HealthOutput, IngestOutput, IngestRequest, InvokeStoredQueryRequest, + InvokeStoredQueryResponse, QueriesCatalogOutput, QueryRequest, ReadOutput, ReadRequest, SchemaApplyOutput, SchemaApplyRequest, SchemaOutput, SnapshotQuery, ingest_output, schema_apply_output, snapshot_payload, }; @@ -40,12 +44,13 @@ use color_eyre::eyre::{Result, WrapErr, bail}; pub use config::{ AliasCommand, AliasConfig, CliDefaults, DEFAULT_CONFIG_FILE, OmnigraphConfig, PolicySettings, ProjectConfig, QueryDefaults, ReadOutputFormat, ServerDefaults, TableCellLayout, TargetConfig, - load_config, + graph_resource_id_for_selection, load_config, }; use futures::stream; use omnigraph::db::{Omnigraph, ReadTarget}; use 
omnigraph::error::{ManifestConflictDetails, ManifestErrorKind, OmniError}; use omnigraph::storage::normalize_root_uri; +use omnigraph_compiler::catalog::Catalog; use omnigraph_compiler::json_params_to_param_map; use omnigraph_compiler::query::parser::parse_query; use omnigraph_compiler::{JsonParamMode, ParamMap}; @@ -93,6 +98,8 @@ fn hash_bearer_token(token: &str) -> BearerTokenHash { server_export, #[allow(deprecated)] server_change, server_mutate, + server_list_queries, + server_invoke_query, server_schema_apply, server_schema_get, server_ingest, @@ -157,8 +164,16 @@ pub enum ServerConfigMode { /// set to a named target. Single { uri: String, + /// Cedar graph resource id for the single graph. A named selection + /// uses the graph name; an anonymous URI uses the normalized URI to + /// preserve legacy single-graph policy identity. + graph_id: String, /// Top-level `policy.file` (single-graph Cedar policy). policy_file: Option, + /// Top-level stored-query registry, loaded and identity-checked + /// at settings-build time; type-checked against the schema when + /// the engine opens. + queries: QueryRegistry, }, /// Multi-graph invocation β€” `--config omnigraph.yaml` with a /// non-empty `graphs:` map and no single-mode selector. @@ -185,6 +200,10 @@ pub struct GraphStartupConfig { pub graph_id: String, pub uri: String, pub policy_file: Option, + /// Per-graph stored-query registry, loaded and identity-checked at + /// settings-build time; type-checked against the schema when this + /// graph's engine opens. + pub queries: QueryRegistry, } /// Runtime routing for the server. 
Single mode = legacy @@ -285,7 +304,31 @@ impl AppState { ) -> Self { let bearer_tokens = hash_bearer_tokens(bearer_tokens); let per_graph_policy = policy_engine.map(Arc::new); - Self::build_single_mode(uri, db, bearer_tokens, per_graph_policy, Arc::new(workload)) + Self::build_single_mode(uri, db, bearer_tokens, per_graph_policy, Arc::new(workload), None) + } + + /// Like `new_single`, but attaches a pre-validated stored-query + /// registry. Private — the production single-mode boot path + /// (`open_single_with_queries_for_graph_id`) is the only caller; every + /// public `new_*` constructor builds with no stored queries. + fn new_single_with_queries( + uri: String, + db: Omnigraph, + bearer_tokens: Vec<(String, String)>, + policy_engine: Option, + workload: workload::WorkloadController, + queries: Option>, + ) -> Self { + let bearer_tokens = hash_bearer_tokens(bearer_tokens); + let per_graph_policy = policy_engine.map(Arc::new); + Self::build_single_mode( + uri, + db, + bearer_tokens, + per_graph_policy, + Arc::new(workload), + queries, + ) } pub fn new(uri: String, db: Omnigraph) -> Self { @@ -377,6 +420,39 @@ impl AppState { uri: impl Into, bearer_tokens: Vec<(String, String)>, policy_file: Option<&PathBuf>, + ) -> Result { + Self::open_single_with_queries( + uri, + bearer_tokens, + policy_file, + QueryRegistry::default(), + ) + .await + } + + /// Single-mode boot with a stored-query registry: open the engine, + /// **type-check the registry against the live schema and refuse to + /// start on a breakage** (same posture as bad policy YAML), log + /// non-blocking warnings, then attach the registry to the handle. + /// With an empty registry the check is a no-op and no registry is + /// attached — that is the path `open_with_bearer_tokens_and_policy` + /// (no stored queries) takes. 
+ pub async fn open_single_with_queries( + uri: impl Into, + bearer_tokens: Vec<(String, String)>, + policy_file: Option<&PathBuf>, + queries: QueryRegistry, + ) -> Result { + Self::open_single_with_queries_for_graph_id(uri, bearer_tokens, policy_file, queries, None) + .await + } + + async fn open_single_with_queries_for_graph_id( + uri: impl Into, + bearer_tokens: Vec<(String, String)>, + policy_file: Option<&PathBuf>, + queries: QueryRegistry, + graph_id: Option, ) -> Result { // The "policy requires tokens" invariant is enforced once by // `classify_server_runtime_state` in `serve()`, before either @@ -384,16 +460,24 @@ impl AppState { // time we get here, the (policy, no-tokens) combination has // already been rejected β€” no second bail needed. let uri = normalize_root_uri(&uri.into()).wrap_err("normalize graph URI")?; + let graph_id = graph_id.unwrap_or_else(|| uri.clone()); let db = Omnigraph::open(&uri).await?; + + // Validate the registry against the live schema and resolve it to + // an attachable handle (refuse boot on breakage). + let registry = validate_and_attach(queries, &db.catalog(), &graph_id)?; + let policy_engine = match policy_file { - Some(path) => Some(PolicyEngine::load_graph(path, &uri)?), + Some(path) => Some(PolicyEngine::load_graph(path, &graph_id)?), None => None, }; - Ok(Self::new_with_bearer_tokens_and_policy( + Ok(Self::new_single_with_queries( uri, db, bearer_tokens, policy_engine, + workload::WorkloadController::from_env(), + registry, )) } @@ -408,6 +492,7 @@ impl AppState { bearer_tokens: Arc<[(BearerTokenHash, Arc)]>, policy_engine: Option>, workload: Arc, + queries: Option>, ) -> Self { // Engine-layer policy gate (MR-722). 
With a per-graph policy // installed, every `_as` writer on `Omnigraph` calls into the @@ -436,6 +521,7 @@ impl AppState { uri, engine: Arc::new(db), policy: policy_engine, + queries, }); Self { routing: GraphRouting::Single { handle }, @@ -750,6 +836,58 @@ pub fn init_tracing() { let _ = tracing_subscriber::fmt().with_env_filter(filter).try_init(); } +/// Log each non-blocking advisory from a registry check report. +fn log_registry_warnings(label: &str, report: &queries::CheckReport) { + for warning in &report.warnings { + warn!(graph = label, query = %warning.query, "stored query: {}", warning.message); + } +} + +fn validate_registry_against_catalog( + registry: &QueryRegistry, + catalog: &Catalog, + label: &str, +) -> omnigraph::error::Result<()> { + let report = check(registry, catalog); + if report.has_breakages() { + return Err(OmniError::manifest(format_check_breakages(label, &report))); + } + log_registry_warnings(label, &report); + Ok(()) +} + +/// Validate a loaded stored-query registry against the live schema and +/// resolve it to an attachable handle. Refuses boot on any breakage +/// (same posture as bad policy YAML), logs the non-blocking warnings, +/// and collapses an empty registry to `None` (nothing attached). This is +/// the single gate every open path funnels through, so no opener can +/// attach a registry that has not been schema-checked. `label` names the +/// graph in messages. +fn validate_and_attach( + queries: QueryRegistry, + catalog: &Catalog, + label: &str, +) -> Result>> { + validate_registry_against_catalog(&queries, catalog, label) + .map_err(|err| color_eyre::eyre::eyre!(err.to_string()))?; + Ok(if queries.is_empty() { + None + } else { + Some(Arc::new(queries)) + }) +} + +/// Format every load error (parse / identity failure) into a multi-line +/// boot-abort message. 
+fn format_registry_load_errors(label: &str, errors: &[queries::LoadError]) -> String { + let joined = errors + .iter() + .map(|e| e.to_string()) + .collect::>() + .join("\n "); + format!("graph '{label}': stored-query registry failed to load:\n {joined}") +} + pub fn load_server_settings( config_path: Option<&PathBuf>, cli_uri: Option, @@ -799,15 +937,43 @@ pub fn load_server_settings( let uri = normalize_root_uri(&raw_uri).wrap_err_with(|| { format!("normalize single-graph URI '{raw_uri}' from server settings") })?; - let policy_file = config.resolve_policy_file(); - ServerConfigMode::Single { uri, policy_file } + // Config follows graph IDENTITY, not mode: a bare URI is anonymous + // (top-level config); a graph chosen by name uses its per-graph + // `graphs.<name>.{policy,queries}`. `resolve_target_uri` already + // errored on an unknown name, so a `Some(name)` here is a known graph. + let selected: Option<&str> = if has_cli_uri { + None + } else { + cli_target.as_deref().or_else(|| config.server_graph_name()) + }; + // A named selection must not leave a populated top-level block + // silently unused — refuse boot and point at the per-graph block. The + // same rule the CLI selection gate enforces, shared via one helper so + // the boot check and `omnigraph queries validate`/`list` can't drift. + config.ensure_top_level_blocks_honored(selected)?; + // Load + identity-check now (no engine needed); the schema + // type-check happens when the engine opens. 
+ let policy_file = config.resolve_policy_file_for(selected); + let queries = QueryRegistry::load(&config, config.query_entries_for(selected)) + .map_err(|errs| color_eyre::eyre::eyre!(format_registry_load_errors(&uri, &errs)))?; + let graph_id = graph_resource_id_for_selection(selected, &uri); + ServerConfigMode::Single { + uri, + graph_id, + policy_file, + queries, + } } else if has_explicit_config && has_graphs_map { - if config.resolve_policy_file().is_some() { + // Multi mode: every graph uses its per-graph block; top-level + // policy/queries are never honored, so a populated one is an error. + let unhonored = config.populated_top_level_blocks(); + if !unhonored.is_empty() { bail!( - "top-level `policy.file` is single-graph/CLI-local policy only; \ - in multi-graph mode move per-graph rules to \ - `graphs..policy.file` and move `graph_list` rules to \ - `server.policy.file`." + "multi-graph mode: top-level {} {} not honored β€” each graph uses its own \ + `graphs..…` block. Move per-graph rules there (and any \ + `graph_list` policy to `server.policy.file`).", + unhonored.join(" and "), + if unhonored.len() == 1 { "is" } else { "are" }, ); } // Rule 4 β†’ Multi mode. Build a startup config per graph. @@ -823,10 +989,17 @@ pub fn load_server_settings( let uri = normalize_root_uri(&raw_uri).wrap_err_with(|| { format!("normalize URI '{raw_uri}' for graph '{name}' in omnigraph.yaml") })?; + // Per-graph `queries:`, selected through the shared + // `query_entries_for` so server and CLI resolve identically. + // Load + identity-check now; the schema type-check happens + // when this graph's engine opens. 
+ let queries = QueryRegistry::load(&config, config.query_entries_for(Some(name.as_str()))) + .map_err(|errs| color_eyre::eyre::eyre!(format_registry_load_errors(name, &errs)))?; graphs.push(GraphStartupConfig { graph_id: name.clone(), uri, policy_file: config.resolve_target_policy_file(name), + queries, }); } let config_path = config_path @@ -949,6 +1122,8 @@ pub fn build_app(state: AppState) -> Router { server_change })) .route("/mutate", post(server_mutate)) + .route("/queries", get(server_list_queries)) + .route("/queries/{name}", post(server_invoke_query)) .route("/schema", get(server_schema_get)) .route("/schema/apply", post(server_schema_apply)) .route( @@ -1046,10 +1221,28 @@ pub async fn serve(config: ServerConfig) -> Result<()> { let bind = config.bind.clone(); let state = match config.mode { - ServerConfigMode::Single { uri, policy_file } => { + ServerConfigMode::Single { + uri, + graph_id, + policy_file, + queries, + } => { let uri_for_log = uri.clone(); - info!(uri = %uri_for_log, bind = %bind, mode = "single", "serving omnigraph"); - AppState::open_with_bearer_tokens_and_policy(uri, tokens, policy_file.as_ref()).await? + info!( + uri = %uri_for_log, + graph_id = %graph_id, + bind = %bind, + mode = "single", + "serving omnigraph" + ); + AppState::open_single_with_queries_for_graph_id( + uri, + tokens, + policy_file.as_ref(), + queries, + Some(graph_id), + ) + .await? } ServerConfigMode::Multi { graphs, @@ -1131,6 +1324,12 @@ async fn open_single_graph(cfg: GraphStartupConfig) -> Result> .await .map_err(|err| color_eyre::eyre::eyre!("open graph '{}' at {}: {err}", graph_id, uri))?; + // Validate this graph's stored queries against the live schema and + // resolve them to an attachable handle (refuse boot on breakage). + // Done before the policy match rebinds `db`; the catalog handle is an + // owned `Arc`, so no borrow of `db` survives into the match. 
+ let queries = validate_and_attach(cfg.queries, &db.catalog(), graph_id.as_str())?; + let (policy_arc, db) = match &cfg.policy_file { Some(path) => { let policy = PolicyEngine::load_graph(path, graph_id.as_str())?; @@ -1146,6 +1345,7 @@ async fn open_single_graph(cfg: GraphStartupConfig) -> Result> uri, engine: Arc::new(db), policy: policy_arc, + queries, })) } @@ -1479,7 +1679,21 @@ fn log_policy_decision(actor_id: &str, request: &PolicyRequest, decision: &Polic ); } -/// HTTP-layer Cedar policy gate. Two sources of the policy engine: +/// The allow/deny **decision** an authorization check produces, kept +/// separate from the operational failures (`Err`) that can occur while +/// computing it. [`authorize_request`] collapses `Denied` to a 403; a caller +/// that needs to remap a denial without also remapping operational failures +/// (the stored-query invoke handler hides a denial as a 404) matches on this +/// directly, so a real 401 (missing bearer) or 500 (policy-evaluation error) +/// keeps its true status instead of being masked as the denial's response. +enum Authz { + Allowed, + Denied(String), +} + +/// HTTP-layer Cedar policy gate, returning the allow/deny [`Authz`] decision +/// and reserving `Err` for operational failures (401 missing bearer, 500 +/// policy-evaluation error). Two sources of the policy engine: /// * Per-graph handler β€” passes `handle.policy.as_deref()` so the /// graph's Cedar rules govern read/change/branch_*/schema_apply. /// * Management handler β€” passes `state.server_policy.as_deref()` so @@ -1493,11 +1707,11 @@ fn log_policy_decision(actor_id: &str, request: &PolicyRequest, decision: &Polic /// dropped from the type), so handlers cannot smuggle it through the /// request. See `actor_id_resolves_from_bearer_token_ignoring_client_supplied_headers` /// at `tests/server.rs`. 
-fn authorize_request( +fn authorize( actor: Option<&ResolvedActor>, policy: Option<&PolicyEngine>, request: PolicyRequest, -) -> std::result::Result<(), ApiError> { +) -> std::result::Result { let Some(engine) = policy else { // No PolicyEngine installed. Three runtime states can reach this: // @@ -1524,21 +1738,23 @@ fn authorize_request( // operator's only path to enabling it is configuring an // explicit `server.policy.file` in omnigraph.yaml. if request.action.resource_kind() == PolicyResourceKind::Server { - return Err(ApiError::forbidden( + return Ok(Authz::Denied( "server-scoped actions require an explicit `server.policy.file` \ configured in omnigraph.yaml β€” the management surface is closed \ by default in every runtime state, including --unauthenticated, \ - so that server topology is never exposed without operator opt-in.", + so that server topology is never exposed without operator opt-in." + .to_string(), )); } if actor.is_some() && request.action != PolicyAction::Read { - return Err(ApiError::forbidden( + return Ok(Authz::Denied( "server runs in default-deny mode (bearer tokens configured but no \ policy file). Only `read` actions are permitted; configure \ - `policy.file` in omnigraph.yaml to enable other actions.", + `policy.file` in omnigraph.yaml to enable other actions." 
+ .to_string(), )); } - return Ok(()); + return Ok(Authz::Allowed); }; let Some(actor) = actor else { return Err(ApiError::unauthorized("missing bearer token")); @@ -1560,9 +1776,26 @@ fn authorize_request( .map_err(|err| ApiError::internal(format!("policy: {err}")))?; log_policy_decision(actor_id, &request, &decision); if decision.allowed { - Ok(()) + Ok(Authz::Allowed) } else { - Err(ApiError::forbidden(decision.message)) + Ok(Authz::Denied(decision.message)) + } +} + +/// Thin wrapper over [`authorize`] for the handlers that treat any denial as a +/// 403: a denial becomes `ApiError::forbidden`, and operational failures +/// (401 missing bearer, 500 policy-evaluation error) propagate unchanged. The +/// stored-query invoke handler does **not** use this — it consumes the +/// [`Authz`] decision directly to hide a denial as a 404 while letting an +/// operational failure keep its true status. +fn authorize_request( + actor: Option<&ResolvedActor>, + policy: Option<&PolicyEngine>, + request: PolicyRequest, +) -> std::result::Result<(), ApiError> { + match authorize(actor, policy, request)? { + Authz::Allowed => Ok(()), + Authz::Denied(message) => Err(ApiError::forbidden(message)), } } @@ -2001,6 +2234,194 @@ async fn server_mutate( )) } +/// Path parameter for `POST /queries/{name}`. 
+#[derive(Deserialize)] +struct QueryNamePath { + name: String, +} + +fn parse_optional_invoke_body( + body: Bytes, +) -> std::result::Result { + if body.is_empty() { + return Ok(InvokeStoredQueryRequest::default()); + } + serde_json::from_slice::>(&body) + .map(|request| request.unwrap_or_default()) + .map_err(|err| { + ApiError::bad_request(format!("invalid stored-query invocation body: {err}")) + }) +} + +#[utoipa::path( + post, + path = "/queries/{name}", + tag = "queries", + operation_id = "invoke_query", + params(("name" = String, Path, description = "Stored query name (the registry key)")), + request_body = Option, + responses( + (status = 200, description = "Read envelope (ReadOutput) or mutation envelope (ChangeOutput), serialized untagged", body = InvokeStoredQueryResponse), + (status = 400, description = "Bad request (param type error; snapshot on a stored mutation)", body = ErrorOutput), + (status = 401, description = "Unauthorized", body = ErrorOutput), + (status = 403, description = "Forbidden (the inner `change` gate for a stored mutation)", body = ErrorOutput), + (status = 404, description = "Unknown stored query, or `invoke_query` denied β€” indistinguishable to a caller without the grant", body = ErrorOutput), + (status = 409, description = "Merge conflict", body = ErrorOutput), + (status = 429, description = "Per-actor admission cap exceeded; honor `Retry-After` header", body = ErrorOutput), + (status = 500, description = "Policy evaluation error (a denial is reported as 404, not 500)", body = ErrorOutput), + ), + security(("bearer_token" = [])), +)] +/// Invoke a curated, server-side stored query by name. +/// +/// The query source comes from the graph's `queries:` registry, not the +/// request body β€” callers send only runtime inputs (`params`, `branch`, +/// `snapshot`). Gated by the `invoke_query` Cedar action at the boundary; +/// a stored *mutation* additionally passes the engine's `change` gate +/// (double-gated). 
An actor **without** `invoke_query` cannot tell a denied +/// query from a missing one — both return the same 404, so the catalog +/// can't be probed without the grant. Once `invoke_query` is held, the +/// inner `read`/`change` gate may surface a 403 for an existing query the +/// actor can't run (the intended double-gate signal). +async fn server_invoke_query( + State(state): State, + Extension(handle): Extension>, + actor: Option>, + Path(QueryNamePath { name }): Path, + body: Bytes, +) -> std::result::Result, ApiError> { + let req = parse_optional_invoke_body(body)?; + // A caller without `invoke_query` can't tell a denial from a missing + // query: both 404 with this exact message, so the catalog can't be + // probed without the grant. (A caller that holds invoke_query may still + // see the inner gate's 403 for an existing query it can't run — intended.) + const NOT_FOUND: &str = "stored query not found"; + let actor_ref = actor.as_ref().map(|Extension(actor)| actor); + + // Boundary gate (authentication already ran in `require_bearer_auth`). + // A denial is hidden as 404 (deny == missing, so the catalog can't be + // probed without the grant), but operational failures (401 missing bearer, + // 500 policy-evaluation error) propagate with their true status via `?` + // rather than being masked as a missing query. + match authorize( + actor_ref, + handle.policy.as_deref(), + PolicyRequest { + action: PolicyAction::InvokeQuery, + // Graph-scoped: no branch dimension. The per-branch/snapshot + // access is enforced by the inner read/change gate in the + // runner, so the outer gate must not resolve a branch (doing so + // was wrong for snapshot reads). + branch: None, + target_branch: None, + }, + )? { + Authz::Allowed => {} + Authz::Denied(_) => return Err(ApiError::not_found(NOT_FOUND)), + } + + // Resolve against the per-graph registry (same 404 on a miss). 
+ let stored = handle + .queries + .as_ref() + .and_then(|registry| registry.lookup(&name)) + .ok_or_else(|| ApiError::not_found(NOT_FOUND))?; + + // Detach what we need before `handle` moves into the runner β€” the + // registry borrow lives inside `handle`. + let source = Arc::clone(&stored.source); + let query_name = stored.name.clone(); + let is_mutation = stored.is_mutation(); + + info!( + graph = %handle.uri, + actor = ?actor_ref.map(|a| a.actor_id.as_ref()), + query = %query_name, + kind = if is_mutation { "mutate" } else { "read" }, + "stored query invoked" + ); + + if is_mutation { + if req.snapshot.is_some() { + return Err(ApiError::bad_request( + "stored mutation cannot target a snapshot", + )); + } + let branch = req.branch.unwrap_or_else(|| "main".to_string()); + let output = run_mutate( + state, + handle, + actor_ref, + &source, + Some(&query_name), + req.params.as_ref(), + branch, + ) + .await?; + Ok(Json(InvokeStoredQueryResponse::Change(output))) + } else { + let (selected, target, result) = run_query( + handle, + actor_ref, + &source, + Some(&query_name), + req.params.as_ref(), + req.branch, + req.snapshot, + true, + ) + .await?; + Ok(Json(InvokeStoredQueryResponse::Read(api::read_output( + selected, &target, result, + )))) + } +} + +#[utoipa::path( + get, + path = "/queries", + tag = "queries", + operation_id = "list_queries", + responses( + (status = 200, description = "Stored-query catalog (the mcp.expose subset, with typed params)", body = QueriesCatalogOutput), + (status = 401, description = "Unauthorized", body = ErrorOutput), + (status = 403, description = "Forbidden", body = ErrorOutput), + ), + security(("bearer_token" = [])), +)] +/// List the graph's exposed stored queries as a typed tool catalog. 
+/// +/// Returns the `mcp.expose == true` subset of the `queries:` registry, each +/// with its MCP tool name, read/mutate flag, description/instruction, and +/// typed parameters — enough for a client to register them as tools without +/// fetching `.gq` source. Read-gated; the catalog is graph-wide (branch +/// independent — `read` is authorized against `main`). **Not** Cedar-filtered +/// per query yet, so it can list a query whose `invoke_query` the caller +/// lacks (a known gap until per-query authorization lands). +async fn server_list_queries( + Extension(handle): Extension>, + actor: Option>, +) -> std::result::Result, ApiError> { + authorize_request( + actor.as_ref().map(|Extension(actor)| actor), + handle.policy.as_deref(), + PolicyRequest { + action: PolicyAction::Read, + branch: Some("main".to_string()), + target_branch: None, + }, + )?; + let queries = match handle.queries.as_ref() { + Some(registry) => registry + .iter() + .filter(|q| q.expose) + .map(api::query_catalog_entry) + .collect(), + None => Vec::new(), + }; + Ok(Json(QueriesCatalogOutput { queries })) +} + +#[utoipa::path( get, path = "/schema", @@ -2088,18 +2509,26 @@ async fn server_schema_apply( .map_err(ApiError::from_workload_reject)?; let result = { let db = &handle.engine; + let registry = handle.queries.as_deref(); + let label = handle.key.graph_id.as_str().to_string(); // Engine-layer policy enforcement (MR-722): pass the resolved // actor through so apply_schema_as can call enforce() with the // authoritative identity. With a policy installed in AppState, // engine-side enforcement re-checks the same decision the // HTTP-layer authorize_request just made above. PR #3 collapses // the redundancy. 
- db.apply_schema_as( + db.apply_schema_as_with_catalog_check( &request.schema_source, omnigraph::db::SchemaApplyOptions { allow_data_loss: request.allow_data_loss, }, actor_id, + |catalog| { + if let Some(registry) = registry { + validate_registry_against_catalog(registry, catalog, &label)?; + } + Ok(()) + }, ) .await .map_err(ApiError::from_omni)? @@ -2658,12 +3087,133 @@ mod tests { use std::fs; use tempfile::tempdir; + /// `authorize` returns the allow/deny **decision** (`Authz`) and reserves + /// `Err` for operational failures, so the invoke handler can hide a denial + /// as 404 without also masking a 401/500. Pins each outcome. + #[test] + fn authorize_splits_decision_from_operational_error() { + use super::{Authz, PolicyAction, PolicyCompiler, PolicyConfig, PolicyRequest, ResolvedActor, authorize}; + use std::sync::Arc; + + fn req(action: PolicyAction) -> PolicyRequest { + PolicyRequest { action, branch: None, target_branch: None } + } + let actor = ResolvedActor::cluster_static(Arc::from("act-alice")); + + // --- No policy engine installed (open / default-deny modes) --- + // A server-scoped action is denied in every no-policy state. + assert!(matches!( + authorize(Some(&actor), None, req(PolicyAction::GraphList)).unwrap(), + Authz::Denied(_) + )); + // Authenticated actor + a non-read per-graph action β†’ default-deny. + assert!(matches!( + authorize(Some(&actor), None, req(PolicyAction::Change)).unwrap(), + Authz::Denied(_) + )); + // `read` is the one per-graph action permitted without a policy. + assert!(matches!( + authorize(Some(&actor), None, req(PolicyAction::Read)).unwrap(), + Authz::Allowed + )); + // Open mode (no actor, no policy) β†’ allowed. 
+ assert!(matches!( + authorize(None, None, req(PolicyAction::Read)).unwrap(), + Authz::Allowed + )); + + // --- Policy engine installed --- + let policy: PolicyConfig = serde_yaml::from_str( + "version: 1\n\ + groups:\n team: [act-alice]\n\ + rules:\n - id: team-read\n allow:\n actors: { group: team }\n actions: [read]\n branch_scope: any\n", + ) + .unwrap(); + let engine = PolicyCompiler::compile(&policy, "graph").unwrap(); + + // A matched allow rule β†’ Allowed. + assert!(matches!( + authorize( + Some(&actor), + Some(&engine), + PolicyRequest { action: PolicyAction::Read, branch: Some("main".to_string()), target_branch: None }, + ) + .unwrap(), + Authz::Allowed + )); + // Known actor, no matching allow rule β†’ Denied, carrying the decision message. + match authorize( + Some(&actor), + Some(&engine), + PolicyRequest { action: PolicyAction::Change, branch: Some("main".to_string()), target_branch: None }, + ) + .unwrap() + { + Authz::Denied(message) => assert!(!message.is_empty(), "a deny carries its decision message"), + Authz::Allowed => panic!("change must be denied: only read is allowed"), + } + // Policy installed but no actor β†’ operational failure (`Err`), NOT a + // decision. This is the split that keeps a 401/500 from being masked + // as the denial's response in the invoke handler. + assert!( + authorize(None, Some(&engine), req(PolicyAction::Read)).is_err(), + "a missing actor with a policy installed is an operational error, not a deny" + ); + } + #[test] fn hash_bearer_token_produces_32_byte_output() { let hash = hash_bearer_token("any-token"); assert_eq!(hash.len(), 32); } + /// The single gate both open paths funnel through: it refuses a + /// schema breakage (naming the graph label + query), attaches a clean + /// registry, and collapses an empty one to `None`. Pure over its args + /// (no engine), so it covers the multi-graph path's logic too β€” the + /// only per-path difference is the `label`, asserted here. 
+ #[test] + fn validate_and_attach_gates_on_schema_and_collapses_empty() { + use crate::queries::{QueryRegistry, RegistrySpec}; + use omnigraph_compiler::catalog::build_catalog; + use omnigraph_compiler::schema::parser::parse_schema; + + let schema = parse_schema("node User {\nname: String\n}\n").unwrap(); + let catalog = build_catalog(&schema).unwrap(); + let spec = |name: &str, source: &str| RegistrySpec { + name: name.to_string(), + source: source.to_string(), + expose: false, + tool_name: None, + }; + + // Empty registry β†’ nothing attached, no error. + let empty = + super::validate_and_attach(QueryRegistry::default(), &catalog, "g").unwrap(); + assert!(empty.is_none()); + + // A query that type-checks β†’ attached. + let ok = QueryRegistry::from_specs(vec![spec( + "find_user", + "query find_user() { match { $u: User } return { $u.name } }", + )]) + .unwrap(); + assert!(super::validate_and_attach(ok, &catalog, "g").unwrap().is_some()); + + // A query referencing a type the schema lacks β†’ boot refusal that + // names both the graph label and the offending query. + let broken = QueryRegistry::from_specs(vec![spec( + "ghost", + "query ghost() { match { $w: Widget } return { $w.name } }", + )]) + .unwrap(); + let err = super::validate_and_attach(broken, &catalog, "graph-x").unwrap_err(); + let msg = err.to_string(); + assert!(msg.contains("graph-x"), "labels the graph: {msg}"); + assert!(msg.contains("ghost"), "names the query: {msg}"); + assert!(msg.contains("schema check"), "mentions the schema check: {msg}"); + } + #[test] fn hash_bearer_token_is_deterministic() { assert_eq!( @@ -2707,7 +3257,10 @@ server: let settings = load_server_settings(Some(&config), None, None, None, false).unwrap(); match &settings.mode { - ServerConfigMode::Single { uri, .. } => assert_eq!(uri, "/tmp/demo.omni"), + ServerConfigMode::Single { uri, graph_id, .. } => { + assert_eq!(uri, "/tmp/demo.omni"); + assert_eq!(graph_id, "local"); + } ServerConfigMode::Multi { .. 
} => panic!("expected Single mode, got Multi"), } assert_eq!(settings.bind, "0.0.0.0:9090"); @@ -2739,7 +3292,10 @@ server: ) .unwrap(); match &settings.mode { - ServerConfigMode::Single { uri, .. } => assert_eq!(uri, "/tmp/override.omni"), + ServerConfigMode::Single { uri, graph_id, .. } => { + assert_eq!(uri, "/tmp/override.omni"); + assert_eq!(graph_id, "/tmp/override.omni"); + } ServerConfigMode::Multi { .. } => panic!("expected Single mode, got Multi"), } assert_eq!(settings.bind, "0.0.0.0:9999"); @@ -2768,7 +3324,10 @@ server: load_server_settings(Some(&config), None, Some("dev".to_string()), None, false) .unwrap(); match &settings.mode { - ServerConfigMode::Single { uri, .. } => assert_eq!(uri, "http://127.0.0.1:8080"), + ServerConfigMode::Single { uri, graph_id, .. } => { + assert_eq!(uri, "http://127.0.0.1:8080"); + assert_eq!(graph_id, "dev"); + } ServerConfigMode::Multi { .. } => panic!("expected Single mode, got Multi"), } } @@ -2848,6 +3407,7 @@ server: .to_string_lossy() .into_owned(), policy_file: None, + queries: crate::queries::QueryRegistry::default(), }], config_path: temp.path().join("omnigraph.yaml"), server_policy_file: Some(policy_path), @@ -2895,7 +3455,9 @@ server: .join("graph.omni") .to_string_lossy() .into_owned(), + graph_id: "default".to_string(), policy_file: None, + queries: crate::queries::QueryRegistry::default(), }, bind: "127.0.0.1:0".to_string(), allow_unauthenticated: false, diff --git a/crates/omnigraph-server/src/queries.rs b/crates/omnigraph-server/src/queries.rs new file mode 100644 index 0000000..bf131c8 --- /dev/null +++ b/crates/omnigraph-server/src/queries.rs @@ -0,0 +1,688 @@ +//! Stored-query registry. +//! +//! A server-side registry of named, parameter-typed `.gq` queries that +//! operators declare in `omnigraph.yaml` (per-graph, or top-level in +//! single mode) and the server loads at startup. Each entry is parsed +//! and its identity asserted here (`load`); type-checking against the +//! 
live schema happens separately (a `check` pass) so the loader stays +//! callable without an open engine (the CLI's offline `queries check`). +//! +//! Identity is the query **name**: the manifest key must equal the +//! `query ` symbol declared in the referenced `.gq` file. The two +//! are asserted equal at load β€” one name, two places that must agree. +//! Renaming either is a breaking change to callers, by design. + +use std::collections::BTreeMap; +use std::fs; +use std::sync::Arc; + +use omnigraph_compiler::catalog::Catalog; +use omnigraph_compiler::query::ast::QueryDecl; +use omnigraph_compiler::query::parser::parse_query; +use omnigraph_compiler::query::typecheck::typecheck_query_decl; +use omnigraph_compiler::types::{PropType, ScalarType}; + +use crate::config::{OmnigraphConfig, QueryEntry}; + +/// One loaded stored query. `source` is the full `.gq` file text β€” the +/// invocation handler hands it to `run_query` / `run_mutate` verbatim, +/// which reuse the same parse/IR/exec path as the inline routes (no +/// parallel implementation). +#[derive(Debug, Clone)] +pub struct StoredQuery { + /// Identity: manifest key == `query ` symbol. + pub name: String, + /// Full `.gq` source text the query was selected from. + pub source: Arc, + /// Parsed declaration (params, mutations, description, …). + pub decl: QueryDecl, + /// Whether this query is listed in the MCP tool catalog (`GET /queries`). + /// Default `true` (the manifest entry is the opt-in); `expose: false` + /// keeps it HTTP/service-callable but hidden from the agent tool list. + /// Catalog membership only β€” not an authorization gate. + pub expose: bool, + /// Optional MCP tool-name override; defaults to `name`. + pub tool_name: Option, +} + +impl StoredQuery { + /// `true` if the selected declaration contains insert/update/delete + /// statements β€” drives read-vs-mutate routing at invocation time. 
+ pub fn is_mutation(&self) -> bool { + !self.decl.mutations.is_empty() + } + + /// The MCP tool name this query is catalogued under: the explicit + /// `tool_name` override, else the query `name`. The catalog key β€” + /// enforced unique across exposed queries at load. Server-side + /// consumers (the uniqueness check, the future catalog projection) read + /// this; the CLI `queries list` resolves the same rule on its own DTO. + pub fn effective_tool_name(&self) -> &str { + self.tool_name.as_deref().unwrap_or(&self.name) + } +} + +/// A loaded, identity-checked stored-query registry for one graph. +#[derive(Debug, Clone, Default)] +pub struct QueryRegistry { + by_name: BTreeMap, +} + +/// In-memory registry entry before file I/O. Used by [`QueryRegistry::load`] +/// (after reading each `.gq` from disk) and directly by tests. +#[derive(Debug, Clone)] +pub struct RegistrySpec { + pub name: String, + pub source: String, + pub expose: bool, + pub tool_name: Option, +} + +/// A single registry load failure. Collected (not fail-fast) so a bad +/// `omnigraph.yaml` surfaces every broken entry at once, matching the +/// bad-policy-YAML posture. +#[derive(Debug, Clone)] +pub struct LoadError { + /// The offending query name, when the failure is entry-scoped. + pub query: Option, + pub message: String, +} + +impl std::fmt::Display for LoadError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match &self.query { + Some(name) => write!(f, "stored query '{name}': {}", self.message), + None => write!(f, "stored query registry: {}", self.message), + } + } +} + +impl QueryRegistry { + /// Build a registry from in-memory specs: parse each source, select + /// the declaration whose symbol equals the manifest key, and assert + /// they agree. Collects every failure. No schema type-checking here + /// β€” that is [`check`]. 
+ pub fn from_specs(specs: Vec) -> Result> { + let mut by_name = BTreeMap::new(); + let mut errors = Vec::new(); + + for spec in specs { + match parse_query(&spec.source) { + Ok(file) => { + match file.queries.into_iter().find(|q| q.name == spec.name) { + Some(decl) => { + by_name.insert( + spec.name.clone(), + StoredQuery { + name: spec.name, + source: Arc::from(spec.source), + decl, + expose: spec.expose, + tool_name: spec.tool_name, + }, + ); + } + None => errors.push(LoadError { + query: Some(spec.name.clone()), + message: format!( + "no `query {}` declaration found in its `.gq` file \ + (the registry key must match the query symbol)", + spec.name + ), + }), + } + } + Err(err) => errors.push(LoadError { + query: Some(spec.name), + message: format!("parse error: {err}"), + }), + } + } + + // Exposed queries are catalogued under their effective tool name; + // two claiming one name is an MCP-namespace collision. Refuse it at + // load (collected, not fail-fast), naming the loser and the winner. + // Iterating the `BTreeMap` makes the winner deterministic (the + // lexicographically-first query name; config is a map, so YAML + // declaration order isn't preserved anyway) and the error order + // stable. Scoped to a block so these borrows of `by_name` end + // before it is moved into `Self`. + { + let mut claimed: BTreeMap<&str, &str> = BTreeMap::new(); + for query in by_name.values().filter(|q| q.expose) { + let tool = query.effective_tool_name(); + if let Some(winner) = claimed.insert(tool, &query.name) { + errors.push(LoadError { + query: Some(query.name.clone()), + message: format!( + "MCP tool name '{tool}' already claimed by exposed query '{winner}'" + ), + }); + } + } + } + + if errors.is_empty() { + Ok(Self { by_name }) + } else { + Err(errors) + } + } + + /// Read each registry entry's `.gq` file from disk and build the + /// registry. 
`entries` is either the top-level `queries` map (single + /// mode) or a graph's `queries` map (multi mode); `config` resolves + /// each entry's relative `file:` path against `base_dir`. + pub fn load( + config: &OmnigraphConfig, + entries: &BTreeMap, + ) -> Result> { + let mut specs = Vec::with_capacity(entries.len()); + let mut errors = Vec::new(); + for (name, entry) in entries { + let path = config.resolve_query_file(&entry.file); + match fs::read_to_string(&path) { + Ok(source) => specs.push(RegistrySpec { + name: name.clone(), + source, + expose: entry.mcp.expose, + tool_name: entry.mcp.tool_name.clone(), + }), + Err(err) => errors.push(LoadError { + query: Some(name.clone()), + message: format!("cannot read '{}': {err}", path.display()), + }), + } + } + + // Parse/identity/uniqueness-check the readable specs even when some + // files failed to read, so every broken entry (I/O, parse, identity, + // tool-name collision) surfaces in one pass rather than one per + // restart. I/O errors come first (in `entries` key order), then the + // spec errors. A non-empty `errors` always fails the load. + match Self::from_specs(specs) { + Ok(registry) if errors.is_empty() => Ok(registry), + Ok(_) => Err(errors), + Err(spec_errors) => { + errors.extend(spec_errors); + Err(errors) + } + } + } + + pub fn lookup(&self, name: &str) -> Option<&StoredQuery> { + self.by_name.get(name) + } + + pub fn iter(&self) -> impl Iterator { + self.by_name.values() + } + + pub fn is_empty(&self) -> bool { + self.by_name.is_empty() + } + + pub fn len(&self) -> usize { + self.by_name.len() + } +} + +/// A stored query that fails to type-check against the live schema β€” +/// e.g. it references a node/edge type or property that was renamed or +/// removed by a migration. Breakages **block server boot** (same posture +/// as bad policy YAML), surfacing schema drift at the deploy boundary +/// rather than silently at invocation time. 
+#[derive(Debug, Clone)] +pub struct Breakage { + pub query: String, + pub message: String, +} + +/// A non-blocking advisory found during validation. Logged at boot; +/// never blocks startup. Currently: an MCP-exposed query that declares a +/// parameter an agent cannot realistically supply. +#[derive(Debug, Clone)] +pub struct Warning { + pub query: String, + pub message: String, +} + +/// Outcome of validating a registry against a schema. Breakages are +/// fatal (boot refuses); warnings are advisory. +#[derive(Debug, Clone, Default)] +pub struct CheckReport { + pub breakages: Vec, + pub warnings: Vec, +} + +impl CheckReport { + pub fn has_breakages(&self) -> bool { + !self.breakages.is_empty() + } + + pub fn is_clean(&self) -> bool { + self.breakages.is_empty() && self.warnings.is_empty() + } +} + +/// Validate a loaded registry against the live schema. +/// +/// Pure over `(registry, catalog)` β€” takes an already-parsed registry and +/// a catalog, so it is callable both at server boot (with the engine's +/// `catalog()`) and offline from the CLI (`omnigraph queries check`), +/// without coupling to server config or an open engine connection. +/// +/// Every query is type-checked via the same `typecheck_query_decl` the +/// engine runs for inline queries β€” no parallel implementation. Failures +/// are **collected, not fail-fast**, so an operator sees every broken +/// query in one pass. +/// +/// Advisory lint (warn, never block): an `mcp.expose: true` query that +/// declares a `Vector(N)` parameter. An LLM cannot supply a raw embedding +/// vector; such a query should take a `String` parameter and let the +/// engine embed it server-side at query time. Service-to-service callers +/// may legitimately pass vectors, so this warns rather than rejects. 
+pub fn check(registry: &QueryRegistry, catalog: &Catalog) -> CheckReport { + let mut report = CheckReport::default(); + for query in registry.iter() { + if let Err(err) = typecheck_query_decl(catalog, &query.decl) { + report.breakages.push(Breakage { + query: query.name.clone(), + message: err.to_string(), + }); + } + if query.expose { + for param in &query.decl.params { + // Resolve to the structured type via the compiler's own + // resolver rather than string-matching `Vector(` β€” one + // canonical definition of "is a vector", so this lint can't + // drift from how the parser/type system spells the type. + let is_vector = PropType::from_param_type_name(¶m.type_name, param.nullable) + .is_some_and(|pt| matches!(pt.scalar, ScalarType::Vector(_))); + if is_vector { + report.warnings.push(Warning { + query: query.name.clone(), + message: format!( + "MCP-exposed query declares a `{}` parameter `${}` that agents \ + cannot supply; use a `String` parameter for server-side embedding", + param.type_name, param.name + ), + }); + } + } + } + } + report +} + +/// Format every breakage in a registry check report into a multi-line +/// operator-facing message, naming each offending query. 
+pub fn format_check_breakages(label: &str, report: &CheckReport) -> String { + let joined = report + .breakages + .iter() + .map(|b| format!("query '{}': {}", b.query, b.message)) + .collect::>() + .join("\n "); + format!( + "graph '{label}': {} stored quer{} failed the schema check:\n {joined}", + report.breakages.len(), + if report.breakages.len() == 1 { + "y" + } else { + "ies" + } + ) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn spec(name: &str, source: &str, expose: bool) -> RegistrySpec { + RegistrySpec { + name: name.to_string(), + source: source.to_string(), + expose, + tool_name: None, + } + } + + fn spec_tool(name: &str, source: &str, expose: bool, tool_name: &str) -> RegistrySpec { + RegistrySpec { + name: name.to_string(), + source: source.to_string(), + expose, + tool_name: Some(tool_name.to_string()), + } + } + + #[test] + fn key_equal_symbol_loads() { + let reg = QueryRegistry::from_specs(vec![spec( + "find_user", + "query find_user($id: String) { match { $u: User } return { $u.name } }", + true, + )]) + .unwrap(); + let q = reg.lookup("find_user").unwrap(); + assert_eq!(q.name, "find_user"); + assert!(q.expose); + assert_eq!(q.decl.params.len(), 1); + assert!(!q.is_mutation()); + // No override β†’ the effective tool name is the query name. + assert_eq!(q.effective_tool_name(), "find_user"); + + // An explicit override is what the catalog keys on. + let with_tool = QueryRegistry::from_specs(vec![spec_tool( + "find_user", + "query find_user($id: String) { match { $u: User } return { $u.name } }", + true, + "lookup_user", + )]) + .unwrap(); + assert_eq!( + with_tool.lookup("find_user").unwrap().effective_tool_name(), + "lookup_user" + ); + } + + #[test] + fn key_mismatch_is_an_identity_error() { + let errors = QueryRegistry::from_specs(vec![spec( + "find_user", + // symbol is `lookup`, key is `find_user` β€” must be rejected. 
+ "query lookup($id: String) { match { $u: User } return { $u.name } }", + false, + )]) + .unwrap_err(); + assert_eq!(errors.len(), 1); + assert_eq!(errors[0].query.as_deref(), Some("find_user")); + assert!(errors[0].message.contains("must match the query symbol")); + } + + #[test] + fn multi_query_file_selects_the_matching_symbol() { + let source = "query a($x: I64) { match { $u: User } return { $u.name } }\n\ + query b($y: String) { match { $u: User } return { $u.name } }"; + let reg = QueryRegistry::from_specs(vec![spec("b", source, false)]).unwrap(); + let q = reg.lookup("b").unwrap(); + assert_eq!(q.name, "b"); + assert_eq!(q.decl.params[0].name, "y"); + assert!(reg.lookup("a").is_none(), "only the selected symbol is registered"); + } + + #[test] + fn duplicate_exposed_tool_name_is_a_load_error() { + // Two MCP-exposed queries claiming one tool name is an ambiguity in + // the catalog key space β€” refused at load, naming both queries and + // the contested tool. + let errors = QueryRegistry::from_specs(vec![ + spec_tool("a", "query a() { match { $u: User } return { $u.name } }", true, "dup"), + spec_tool("b", "query b() { match { $u: User } return { $u.name } }", true, "dup"), + ]) + .unwrap_err(); + assert_eq!(errors.len(), 1); + let msg = errors[0].to_string(); + assert!(msg.contains("'dup'"), "names the contested tool: {msg}"); + assert!(msg.contains("'a'"), "names the winning query: {msg}"); + assert!(msg.contains("'b'"), "names the losing query: {msg}"); + } + + #[test] + fn duplicate_tool_name_among_unexposed_is_allowed() { + // Unexposed queries have no MCP tool, so a shared effective tool + // name is inert β€” must not error (pins the exposed-only scope). 
+ let reg = QueryRegistry::from_specs(vec![ + spec_tool("a", "query a() { match { $u: User } return { $u.name } }", false, "dup"), + spec_tool("b", "query b() { match { $u: User } return { $u.name } }", false, "dup"), + ]) + .unwrap(); + assert_eq!(reg.len(), 2); + } + + #[test] + fn parse_error_surfaces_per_entry() { + let errors = + QueryRegistry::from_specs(vec![spec("broken", "query broken( {{ not valid", false)]) + .unwrap_err(); + assert_eq!(errors[0].query.as_deref(), Some("broken")); + assert!(errors[0].message.contains("parse error")); + } + + #[test] + fn errors_collect_rather_than_fail_fast() { + let errors = QueryRegistry::from_specs(vec![ + spec("good", "query good() { match { $u: User } return { $u.name } }", false), + spec("mismatch", "query other() { match { $u: User } return { $u.name } }", false), + spec("broken", "query broken(", false), + ]) + .unwrap_err(); + // `good` loads cleanly; only the mismatch and the parse error are + // reported, and both surface in one pass (not fail-fast). + assert_eq!(errors.len(), 2); + } + + #[test] + fn mutation_body_classifies_as_mutation() { + let reg = QueryRegistry::from_specs(vec![spec( + "add_user", + "query add_user($name: String) { insert User { name: $name } }", + false, + )]) + .unwrap(); + assert!(reg.lookup("add_user").unwrap().is_mutation()); + } + + // --- check(registry, catalog) --- + + use omnigraph_compiler::catalog::build_catalog; + use omnigraph_compiler::schema::parser::parse_schema; + + fn test_catalog() -> Catalog { + let schema = parse_schema( + r#" +node User { +name: String +age: I32? 
+embedding: Vector(4) +} +"#, + ) + .unwrap(); + build_catalog(&schema).unwrap() + } + + #[test] + fn check_passes_for_valid_query() { + let reg = QueryRegistry::from_specs(vec![spec( + "find_user", + "query find_user($name: String) { match { $u: User { name: $name } } return { $u.age } }", + false, + )]) + .unwrap(); + let report = check(®, &test_catalog()); + assert!(report.is_clean(), "unexpected: {:?}", report); + } + + #[test] + fn check_reports_unknown_type_as_breakage() { + let reg = QueryRegistry::from_specs(vec![spec( + "ghost", + // `Widget` is not in the schema. + "query ghost() { match { $w: Widget } return { $w.name } }", + false, + )]) + .unwrap(); + let report = check(®, &test_catalog()); + assert!(report.has_breakages()); + assert_eq!(report.breakages[0].query, "ghost"); + } + + #[test] + fn check_reports_unknown_property_as_breakage() { + let reg = QueryRegistry::from_specs(vec![spec( + "bad_prop", + // `User` exists but has no `nickname`. + "query bad_prop() { match { $u: User } return { $u.nickname } }", + false, + )]) + .unwrap(); + let report = check(®, &test_catalog()); + assert!(report.has_breakages()); + assert_eq!(report.breakages[0].query, "bad_prop"); + } + + #[test] + fn check_collects_every_breakage_not_fail_fast() { + let reg = QueryRegistry::from_specs(vec![ + spec("a", "query a() { match { $w: Widget } return { $w.x } }", false), + spec("b", "query b() { match { $g: Gadget } return { $g.y } }", false), + spec( + "ok", + "query ok() { match { $u: User } return { $u.name } }", + false, + ), + ]) + .unwrap(); + let report = check(®, &test_catalog()); + assert_eq!(report.breakages.len(), 2, "both bad queries reported: {:?}", report); + } + + #[test] + fn vector_param_on_exposed_query_warns() { + let reg = QueryRegistry::from_specs(vec![spec( + "vec_search", + "query vec_search($q: Vector(4)) { match { $u: User } return { $u.name } \ + order { nearest($u.embedding, $q) } limit 3 }", + true, // mcp.expose + )]) + .unwrap(); + let report = 
check(®, &test_catalog()); + assert!(!report.has_breakages(), "valid query: {:?}", report); + assert_eq!(report.warnings.len(), 1); + assert_eq!(report.warnings[0].query, "vec_search"); + } + + #[test] + fn vector_param_on_unexposed_query_is_silent() { + let reg = QueryRegistry::from_specs(vec![spec( + "vec_search", + "query vec_search($q: Vector(4)) { match { $u: User } return { $u.name } \ + order { nearest($u.embedding, $q) } limit 3 }", + false, // not exposed β€” vector param is fine for service-to-service callers + )]) + .unwrap(); + let report = check(®, &test_catalog()); + assert!(report.is_clean(), "unexpected: {:?}", report); + } + + #[test] + fn non_vector_param_on_exposed_query_does_not_warn() { + // The recommended `String` alternative on an exposed query does not + // resolve to a Vector, so the embedding advisory stays silent. Guards + // the structured type check against a false positive (and pins that + // only `Vector(_)` triggers the warning). + let reg = QueryRegistry::from_specs(vec![spec( + "search", + "query search($name: String) { match { $u: User { name: $name } } return { $u.name } }", + true, + )]) + .unwrap(); + let report = check(®, &test_catalog()); + assert!(report.is_clean(), "no breakage or warning expected: {:?}", report); + } + + // --- catalog projection (api::query_catalog_entry) --- + + #[test] + fn catalog_entry_projects_every_param_kind() { + use crate::api::{self, ParamKind}; + let reg = QueryRegistry::from_specs(vec![spec_tool( + "all_types", + "query all_types($s: String, $i: I32, $big: I64, $u: U64, $f: F64, $b: Bool, \ + $d: Date, $dt: DateTime, $blob: Blob, $opt: String?, $list: [I32], $vec: Vector(4)) \ + { match { $x: User } return { $x.name } }", + true, + "all", + )]) + .unwrap(); + let entry = api::query_catalog_entry(reg.lookup("all_types").unwrap()); + assert_eq!(entry.name, "all_types"); + assert_eq!(entry.tool_name, "all"); + assert!(!entry.mutation); + + let by: std::collections::HashMap<_, _> = + 
entry.params.iter().map(|p| (p.name.as_str(), p)).collect(); + assert_eq!(by["s"].kind, ParamKind::String); + assert_eq!(by["i"].kind, ParamKind::Int); + assert_eq!(by["big"].kind, ParamKind::BigInt, "I64 β†’ bigint (string on the wire)"); + assert_eq!(by["u"].kind, ParamKind::BigInt, "U64 β†’ bigint"); + assert_eq!(by["f"].kind, ParamKind::Float); + assert_eq!(by["b"].kind, ParamKind::Bool); + assert_eq!(by["d"].kind, ParamKind::Date); + assert_eq!(by["dt"].kind, ParamKind::DateTime); + assert_eq!(by["blob"].kind, ParamKind::Blob); + assert!(!by["s"].nullable); + assert!(by["opt"].nullable, "String? β†’ nullable"); + assert_eq!(by["list"].kind, ParamKind::List); + assert_eq!(by["list"].item_kind, Some(ParamKind::Int), "[I32] β†’ list of int"); + assert_eq!(by["vec"].kind, ParamKind::Vector); + assert_eq!(by["vec"].vector_dim, Some(4)); + } + + #[test] + fn catalog_entry_flags_mutation_and_empty_params() { + use crate::api; + let reg = QueryRegistry::from_specs(vec![spec( + "add_user", + "query add_user($name: String) { insert User { name: $name } }", + true, + )]) + .unwrap(); + let entry = api::query_catalog_entry(reg.lookup("add_user").unwrap()); + assert!(entry.mutation, "insert body β†’ mutation flag"); + + let reg2 = QueryRegistry::from_specs(vec![spec( + "no_params", + "query no_params() { match { $u: User } return { $u.name } }", + true, + )]) + .unwrap(); + let entry2 = api::query_catalog_entry(reg2.lookup("no_params").unwrap()); + assert!(entry2.params.is_empty(), "no declared params β†’ empty list"); + } + + // --- load() error collection (file I/O + parse in one pass) --- + + #[test] + fn load_collects_io_and_parse_errors_in_one_pass() { + use crate::config::load_config; + let temp = tempfile::tempdir().unwrap(); + std::fs::write( + temp.path().join("good.gq"), + "query good() { match { $u: User } return { $u.name } }", + ) + .unwrap(); + std::fs::write(temp.path().join("broken.gq"), "query broken( {{ not valid").unwrap(); + // `missing.gq` is 
deliberately not written (an I/O failure). + std::fs::write( + temp.path().join("omnigraph.yaml"), + "queries:\n good:\n file: ./good.gq\n \ + missing:\n file: ./missing.gq\n broken:\n file: ./broken.gq\n", + ) + .unwrap(); + let config = load_config(Some(&temp.path().join("omnigraph.yaml"))).unwrap(); + + let errors = QueryRegistry::load(&config, config.query_entries()).unwrap_err(); + let joined = errors.iter().map(|e| e.to_string()).collect::>().join("\n"); + // Both the missing file AND the parse error surface in one pass β€” + // the I/O failure must not mask the parse failure. + assert!(joined.contains("missing"), "I/O error must surface: {joined}"); + assert!( + joined.contains("broken") && joined.contains("parse error"), + "the parse error in a readable file must surface in the same pass: {joined}" + ); + assert!(!joined.contains("'good'"), "the valid entry is not an error: {joined}"); + } +} diff --git a/crates/omnigraph-server/src/registry.rs b/crates/omnigraph-server/src/registry.rs index 5897ad1..54115e4 100644 --- a/crates/omnigraph-server/src/registry.rs +++ b/crates/omnigraph-server/src/registry.rs @@ -29,6 +29,7 @@ use tokio::sync::Mutex; use crate::identity::GraphKey; use crate::policy::PolicyEngine; +use crate::queries::QueryRegistry; /// Open handle for a single graph in the registry. Cheap to clone (`Arc`-wrapped /// engine + policy). Cluster-mode handlers extract this via @@ -47,6 +48,11 @@ pub struct GraphHandle { /// `_as` writers"; the HTTP-layer `require_bearer_auth` middleware still /// runs regardless. pub policy: Option>, + /// Per-graph stored-query registry, loaded and validated at + /// startup. `None` means the operator declared no stored queries for + /// this graph β€” `POST /queries/{name}` then 404s. Mirrors the + /// optional `policy` shape. + pub queries: Option>, } /// Immutable snapshot of the registry's current state. 
Replaced atomically @@ -245,6 +251,7 @@ fn canonicalize_handle_uri( uri: canonical_uri.clone(), engine: Arc::clone(&handle.engine), policy: handle.policy.clone(), + queries: handle.queries.clone(), }); Ok((canonical_uri, canonical_handle)) } @@ -276,6 +283,7 @@ mod tests { uri: graph_uri, engine: Arc::new(engine), policy: None, + queries: None, }) } @@ -340,12 +348,14 @@ mod tests { uri: shared_uri.clone(), engine: Arc::clone(&engine), policy: None, + queries: None, }); let h2 = Arc::new(GraphHandle { key: GraphKey::cluster(GraphId::try_from("beta").unwrap()), uri: shared_uri, engine, policy: None, + queries: None, }); let registry = GraphRegistry::new(); @@ -411,12 +421,14 @@ mod tests { uri: shared_uri.clone(), engine: Arc::clone(&engine), policy: None, + queries: None, }); let h2 = Arc::new(GraphHandle { key: GraphKey::cluster(GraphId::try_from("beta").unwrap()), uri: shared_uri, engine, policy: None, + queries: None, }); let err = match GraphRegistry::from_handles(vec![h1, h2]) { Ok(_) => panic!("expected DuplicateUri, got Ok"), diff --git a/crates/omnigraph-server/tests/openapi.rs b/crates/omnigraph-server/tests/openapi.rs index a2542db..3d13e74 100644 --- a/crates/omnigraph-server/tests/openapi.rs +++ b/crates/omnigraph-server/tests/openapi.rs @@ -168,6 +168,8 @@ const EXPECTED_PATHS: &[&str] = &[ "/export", "/change", "/mutate", + "/queries", + "/queries/{name}", "/schema", "/schema/apply", "/ingest", @@ -701,6 +703,8 @@ fn protected_endpoints_reference_bearer_token_security() { ("/read", "post"), ("/change", "post"), ("/schema/apply", "post"), + ("/queries", "get"), + ("/queries/{name}", "post"), ("/ingest", "post"), ("/export", "post"), ("/snapshot", "get"), @@ -913,6 +917,34 @@ fn post_endpoints_have_request_body() { } } +#[test] +fn invoke_stored_query_request_body_is_optional() { + let doc = openapi_json(); + let request_body = &doc["paths"]["/queries/{name}"]["post"]["requestBody"]; + assert!( + request_body.is_object(), + "POST /queries/{{name}} 
should document its optional request body" + ); + assert_eq!( + request_body["required"].as_bool().unwrap_or(false), + false, + "stored-query invocation body should be optional" + ); + let schema = &request_body["content"]["application/json"]["schema"]; + let ref_path = schema["$ref"] + .as_str() + .or_else(|| { + schema["oneOf"] + .as_array() + .and_then(|schemas| schemas.iter().find_map(|schema| schema["$ref"].as_str())) + }) + .unwrap(); + assert!( + ref_path.contains("InvokeStoredQueryRequest"), + "POST /queries/{{name}} requestBody should reference InvokeStoredQueryRequest, got {ref_path}" + ); +} + // --------------------------------------------------------------------------- // Serialization round-trip test // --------------------------------------------------------------------------- @@ -1117,6 +1149,7 @@ async fn app_for_multi_mode(graph_ids: &[&str]) -> (Vec, Rout uri: graph_uri, engine: Arc::new(engine), policy: None, + queries: None, })); dirs.push(dir); } diff --git a/crates/omnigraph-server/tests/server.rs b/crates/omnigraph-server/tests/server.rs index 3ace80e..4a49a14 100644 --- a/crates/omnigraph-server/tests/server.rs +++ b/crates/omnigraph-server/tests/server.rs @@ -8,7 +8,7 @@ use axum::body::{Body, to_bytes}; use axum::http::header::AUTHORIZATION; use axum::http::{Method, Request, StatusCode}; use lance::index::DatasetIndexExt; -use omnigraph::db::{Omnigraph, ReadTarget, SchemaApplyOptions}; +use omnigraph::db::{Omnigraph, ReadTarget}; use omnigraph::error::OmniError; use omnigraph::loader::{LoadMode, load_jsonl}; use omnigraph_policy::{PolicyChecker, PolicyEngine}; @@ -16,6 +16,7 @@ use omnigraph_server::api::{ BranchCreateRequest, BranchMergeRequest, ChangeRequest, ErrorOutput, ExportRequest, IngestRequest, QueryRequest, ReadRequest, SchemaApplyRequest, SchemaOutput, }; +use omnigraph_server::queries::{QueryRegistry, RegistrySpec}; use omnigraph_server::{AppState, build_app}; use serde_json::{Value, json}; use serial_test::serial; @@ -141,6 
+142,469 @@ fn graph_path(root: &Path) -> PathBuf { root.join("server.omni") } +fn stored_query_registry(specs: &[(&str, &str, bool)]) -> QueryRegistry { + QueryRegistry::from_specs( + specs + .iter() + .map(|(name, source, expose)| RegistrySpec { + name: name.to_string(), + source: source.to_string(), + expose: *expose, + tool_name: None, + }) + .collect(), + ) + .expect("specs parse and key==symbol") +} + +#[tokio::test] +async fn server_boots_with_a_valid_stored_query_registry() { + // A stored query that type-checks against the fixture schema + // (`Person { name, age }`) must let the server boot. + let temp = init_loaded_graph().await; + let graph = graph_path(temp.path()); + let registry = stored_query_registry(&[( + "find_person", + "query find_person($name: String) { match { $p: Person { name: $name } } return { $p.age } }", + false, + )]); + let state = AppState::open_single_with_queries( + graph.to_string_lossy().to_string(), + vec![], + None, + registry, + ) + .await; + assert!(state.is_ok(), "valid registry should boot: {:?}", state.err()); +} + +#[tokio::test] +async fn server_refuses_boot_on_type_broken_stored_query() { + // A stored query referencing a type not in the schema (`Widget`) + // must abort boot, naming the offending query. + let temp = init_loaded_graph().await; + let graph = graph_path(temp.path()); + let registry = stored_query_registry(&[( + "ghost", + "query ghost() { match { $w: Widget } return { $w.name } }", + false, + )]); + let result = AppState::open_single_with_queries( + graph.to_string_lossy().to_string(), + vec![], + None, + registry, + ) + .await; + // `AppState` is not `Debug`, so match rather than `expect_err`. 
+ let err = match result { + Ok(_) => panic!("type-broken stored query must refuse boot"), + Err(err) => err, + }; + let msg = err.to_string(); + assert!(msg.contains("ghost"), "error should name the broken query: {msg}"); + assert!( + msg.contains("schema check"), + "error should mention the schema check: {msg}" + ); +} + +/// Build a single-mode app with a stored-query registry plus a bearerβ†’actor +/// pairing and a policy, so invoke tests exercise the `invoke_query` +/// boundary gate and the inner read/change gates together. +async fn app_with_stored_queries( + specs: &[(&str, &str, bool)], + tokens: &[(&str, &str)], + policy: &str, +) -> (tempfile::TempDir, Router) { + let temp = init_loaded_graph().await; + let graph = graph_path(temp.path()); + let policy_path = temp.path().join("policy.yaml"); + fs::write(&policy_path, policy).unwrap(); + let registry = stored_query_registry(specs); + let state = AppState::open_single_with_queries( + graph.to_string_lossy().to_string(), + tokens + .iter() + .map(|(actor, token)| ((*actor).to_string(), (*token).to_string())) + .collect(), + Some(&policy_path), + registry, + ) + .await + .unwrap(); + (temp, build_app(state)) +} + +/// - `act-invoke`: invoke_query + read (stored reads, not mutations) +/// - `act-full`: invoke_query + read + change (stored mutations) +/// - `act-noinvoke`: read only, no invoke_query (boundary-denied) +/// - `act-invokeonly`: invoke_query only, no read (clears the boundary, inner read denies) +const INVOKE_POLICY_YAML: &str = r#" +version: 1 +groups: + invokers: ["act-invoke"] + full: ["act-full"] + readers: ["act-noinvoke"] + invoke_only: ["act-invokeonly"] +protected_branches: [main] +rules: + # invoke_query is graph-scoped β€” its own rules, no branch_scope. 
+ - id: invokers-can-invoke + allow: + actors: { group: invokers } + actions: [invoke_query] + - id: full-can-invoke + allow: + actors: { group: full } + actions: [invoke_query] + - id: invoke-only-can-invoke + allow: + actors: { group: invoke_only } + actions: [invoke_query] + # read / change are branch-scoped. + - id: invokers-can-read + allow: + actors: { group: invokers } + actions: [read] + branch_scope: any + - id: full-can-read-change + allow: + actors: { group: full } + actions: [read, change] + branch_scope: any + - id: readers-can-read + allow: + actors: { group: readers } + actions: [read] + branch_scope: any +"#; + +const STORED_QUERY_SCHEMA_APPLY_POLICY_YAML: &str = r#" +version: 1 +groups: + admins: [act-ragnor] +protected_branches: [main] +rules: + - id: admins-can-invoke + allow: + actors: { group: admins } + actions: [invoke_query] + - id: admins-can-read + allow: + actors: { group: admins } + actions: [read] + branch_scope: any + - id: admins-can-schema-apply + allow: + actors: { group: admins } + actions: [schema_apply] + target_branch_scope: protected +"#; + +const FIND_PERSON_GQ: &str = + "query find_person($name: String) { match { $p: Person { name: $name } } return { $p.age } }"; + +fn invoke_request(name: &str, token: &str, body: Value) -> Request { + Request::builder() + .uri(format!("/queries/{name}")) + .method(Method::POST) + .header("content-type", "application/json") + .header("authorization", format!("Bearer {token}")) + .body(Body::from(serde_json::to_vec(&body).unwrap())) + .unwrap() +} + +fn invoke_request_bytes( + name: &str, + token: &str, + body: impl Into, + content_type: Option<&str>, +) -> Request { + let mut builder = Request::builder() + .uri(format!("/queries/{name}")) + .method(Method::POST) + .header("authorization", format!("Bearer {token}")); + if let Some(content_type) = content_type { + builder = builder.header("content-type", content_type); + } + builder.body(body.into()).unwrap() +} + +#[tokio::test(flavor = 
"multi_thread")] +async fn invoke_stored_read_returns_rows() { + let (_temp, app) = app_with_stored_queries( + &[("find_person", FIND_PERSON_GQ, false)], + &[("act-invoke", "t-invoke")], + INVOKE_POLICY_YAML, + ) + .await; + let (status, body) = json_response( + &app, + invoke_request("find_person", "t-invoke", json!({ "params": { "name": "Alice" } })), + ) + .await; + assert_eq!(status, StatusCode::OK, "body: {body}"); + assert_eq!(body["query_name"], "find_person"); + assert_eq!(body["row_count"], 1, "Alice is in the fixture; body: {body}"); + assert!(body["rows"].is_array(), "read envelope shape; body: {body}"); +} + +#[tokio::test(flavor = "multi_thread")] +async fn invoke_stored_read_accepts_absent_or_empty_body() { + let no_param_query = "query list_people() { match { $p: Person } return { $p.name } }"; + let (_temp, app) = app_with_stored_queries( + &[("list_people", no_param_query, false)], + &[("act-invoke", "t-invoke")], + INVOKE_POLICY_YAML, + ) + .await; + + let (status, body) = json_response( + &app, + invoke_request_bytes("list_people", "t-invoke", Body::empty(), None), + ) + .await; + assert_eq!(status, StatusCode::OK, "body: {body}"); + assert_eq!(body["query_name"], "list_people"); + + let (status, body) = json_response( + &app, + invoke_request_bytes( + "list_people", + "t-invoke", + Body::empty(), + Some("application/json"), + ), + ) + .await; + assert_eq!(status, StatusCode::OK, "body: {body}"); + + let (status, body) = json_response( + &app, + invoke_request_bytes( + "list_people", + "t-invoke", + Body::from("{}"), + Some("application/json"), + ), + ) + .await; + assert_eq!(status, StatusCode::OK, "body: {body}"); + + let (status, body) = json_response( + &app, + invoke_request_bytes( + "list_people", + "t-invoke", + Body::from("{"), + Some("application/json"), + ), + ) + .await; + assert_eq!(status, StatusCode::BAD_REQUEST, "body: {body}"); + assert!( + body["error"] + .as_str() + .unwrap_or_default() + .contains("invalid stored-query 
invocation body"), + "malformed JSON should be rejected as bad request; body: {body}" + ); +} + +#[tokio::test(flavor = "multi_thread")] +async fn invoke_stored_mutation_double_gates_on_change() { + let specs: &[(&str, &str, bool)] = &[( + "add_person", + "query add_person($name: String) { insert Person { name: $name } }", + false, + )]; + let (_temp, app) = app_with_stored_queries( + specs, + &[("act-invoke", "t-invoke"), ("act-full", "t-full")], + INVOKE_POLICY_YAML, + ) + .await; + + // Has invoke_query but NOT change β†’ the inner change gate denies (403). + let (status, body) = json_response( + &app, + invoke_request("add_person", "t-invoke", json!({ "params": { "name": "Eve" } })), + ) + .await; + assert_eq!( + status, + StatusCode::FORBIDDEN, + "invoke_query without change must 403; body: {body}" + ); + + // Has invoke_query + change β†’ applied. + let (status, body) = json_response( + &app, + invoke_request("add_person", "t-full", json!({ "params": { "name": "Eve" } })), + ) + .await; + assert_eq!(status, StatusCode::OK, "body: {body}"); + assert_eq!(body["affected_nodes"], 1, "body: {body}"); +} + +#[tokio::test(flavor = "multi_thread")] +async fn invoke_stored_query_bad_param_is_400() { + let (_temp, app) = app_with_stored_queries( + &[("find_person", FIND_PERSON_GQ, false)], + &[("act-invoke", "t-invoke")], + INVOKE_POLICY_YAML, + ) + .await; + // `name` is declared String; pass a number. 
+    let (status, body) = json_response(
+        &app,
+        invoke_request("find_person", "t-invoke", json!({ "params": { "name": 123 } })),
+    )
+    .await;
+    assert_eq!(status, StatusCode::BAD_REQUEST, "body: {body}");
+    assert!(
+        body["error"].as_str().unwrap_or_default().contains("name"),
+        "400 should name the offending param; body: {body}"
+    );
+}
+
+#[tokio::test(flavor = "multi_thread")]
+async fn invoke_unknown_query_and_denied_actor_return_identical_404() {
+    let (_temp, app) = app_with_stored_queries(
+        &[("find_person", FIND_PERSON_GQ, false)],
+        &[("act-invoke", "t-invoke"), ("act-noinvoke", "t-noinvoke")],
+        INVOKE_POLICY_YAML,
+    )
+    .await;
+
+    // Authorized actor, unknown query name → 404.
+    let (unknown_status, unknown_body) =
+        json_response(&app, invoke_request("does_not_exist", "t-invoke", json!({}))).await;
+    // Denied actor (no invoke_query), real query name → 404.
+    let (denied_status, denied_body) = json_response(
+        &app,
+        invoke_request("find_person", "t-noinvoke", json!({ "params": { "name": "Alice" } })),
+    )
+    .await;
+
+    assert_eq!(unknown_status, StatusCode::NOT_FOUND);
+    assert_eq!(denied_status, StatusCode::NOT_FOUND);
+    assert_eq!(
+        unknown_body, denied_body,
+        "deny must be byte-identical to a missing query (no catalog probing)"
+    );
+}
+
+#[tokio::test(flavor = "multi_thread")]
+async fn invoke_query_holder_without_read_sees_403_not_404() {
+    // The 404-hiding is for callers WITHOUT invoke_query. An actor that
+    // HOLDS invoke_query but lacks `read` clears the boundary gate, then the
+    // inner read gate denies → 403 for an EXISTING read query, vs 404 for an
+    // unknown one. Existence is visible to grant-holders by design (the
+    // documented double-gate); this pins that actual contract.
+ let (_temp, app) = app_with_stored_queries( + &[("find_person", FIND_PERSON_GQ, false)], + &[("act-invokeonly", "t-invokeonly")], + INVOKE_POLICY_YAML, + ) + .await; + let (exists_status, _) = json_response( + &app, + invoke_request("find_person", "t-invokeonly", json!({ "params": { "name": "Alice" } })), + ) + .await; + let (absent_status, _) = + json_response(&app, invoke_request("does_not_exist", "t-invokeonly", json!({}))).await; + assert_eq!( + exists_status, + StatusCode::FORBIDDEN, + "an existing read query the holder can't read β†’ inner-gate 403" + ); + assert_eq!(absent_status, StatusCode::NOT_FOUND, "unknown query still 404s"); +} + +fn get_request(uri: &str, token: &str) -> Request { + Request::builder() + .uri(uri) + .method(Method::GET) + .header("authorization", format!("Bearer {token}")) + .body(Body::empty()) + .unwrap() +} + +#[tokio::test(flavor = "multi_thread")] +async fn list_queries_returns_only_exposed_with_typed_params() { + let (_temp, app) = app_with_stored_queries( + &[ + ("find_person", FIND_PERSON_GQ, true), + ( + "add_person", + "query add_person($name: String) { insert Person { name: $name } }", + true, + ), + ("hidden", "query hidden() { match { $p: Person } return { $p.name } }", false), + ], + &[("act-invoke", "t-invoke")], + INVOKE_POLICY_YAML, + ) + .await; + let (status, body) = json_response(&app, get_request("/queries", "t-invoke")).await; + assert_eq!(status, StatusCode::OK, "body: {body}"); + + let entries = body["queries"].as_array().unwrap(); + let names: Vec<&str> = entries.iter().map(|q| q["name"].as_str().unwrap()).collect(); + assert!( + names.contains(&"find_person") && names.contains(&"add_person"), + "exposed queries listed: {names:?}" + ); + assert!(!names.contains(&"hidden"), "non-exposed query hidden from the catalog: {names:?}"); + + let fp = entries.iter().find(|q| q["name"] == "find_person").unwrap(); + assert_eq!(fp["mutation"], false); + assert_eq!(fp["tool_name"], "find_person"); + 
assert_eq!(fp["params"][0]["name"], "name"); + assert_eq!(fp["params"][0]["kind"], "string"); + let ap = entries.iter().find(|q| q["name"] == "add_person").unwrap(); + assert_eq!(ap["mutation"], true, "stored insert β†’ mutation"); +} + +#[tokio::test(flavor = "multi_thread")] +async fn list_queries_is_read_gated_so_a_non_invoker_can_list() { + // The catalog is read-gated (not invoke_query-gated), so a reader who + // lacks invoke_query still enumerates the exposed queries β€” the + // documented probe-oracle gap until per-query Cedar filtering lands. + let (_temp, app) = app_with_stored_queries( + &[("find_person", FIND_PERSON_GQ, true)], + &[("act-noinvoke", "t-noinvoke")], + INVOKE_POLICY_YAML, + ) + .await; + let (status, body) = json_response(&app, get_request("/queries", "t-noinvoke")).await; + assert_eq!(status, StatusCode::OK, "read-gated catalog; body: {body}"); + let names: Vec<&str> = body["queries"] + .as_array() + .unwrap() + .iter() + .map(|q| q["name"].as_str().unwrap()) + .collect(); + assert!( + names.contains(&"find_person"), + "a reader lists the catalog despite lacking invoke_query: {names:?}" + ); +} + +#[tokio::test(flavor = "multi_thread")] +async fn list_queries_is_empty_when_no_registry() { + let (_temp, app) = app_for_loaded_graph_with_auth("demo-token").await; + let (status, body) = json_response(&app, get_request("/queries", "demo-token")).await; + assert_eq!(status, StatusCode::OK, "body: {body}"); + assert!( + body["queries"].as_array().unwrap().is_empty(), + "no stored-query registry β†’ empty catalog" + ); +} + fn drifted_test_schema() -> String { fs::read_to_string(fixture("test.pg")) .unwrap() @@ -423,6 +887,83 @@ async fn schema_apply_route_updates_graph_for_authorized_admin() { ); } +#[tokio::test(flavor = "multi_thread")] +async fn schema_apply_route_rejects_stored_query_breakage_before_publish() { + let (temp, app) = app_with_stored_queries( + &[("find_person", FIND_PERSON_GQ, true)], + &[("act-ragnor", "admin-token")], + 
STORED_QUERY_SCHEMA_APPLY_POLICY_YAML, + ) + .await; + + let request = Request::builder() + .method(Method::POST) + .uri("/schema/apply") + .header("content-type", "application/json") + .header("authorization", "Bearer admin-token") + .body(Body::from( + serde_json::to_vec(&SchemaApplyRequest { + schema_source: renamed_age_schema(), + ..Default::default() + }) + .unwrap(), + )) + .unwrap(); + let (status, payload) = json_response(&app, request).await; + assert_eq!(status, StatusCode::BAD_REQUEST, "body: {payload}"); + let message = payload["error"].as_str().unwrap_or_default(); + assert!( + message.contains("find_person") && message.contains("schema check"), + "registry breakage should name the stored query; body: {payload}" + ); + + let reopened = Omnigraph::open(graph_path(temp.path()).to_str().unwrap()) + .await + .unwrap(); + let person = &reopened.catalog().node_types["Person"]; + assert!(person.properties.contains_key("age")); + assert!(!person.properties.contains_key("years")); + + let (invoke_status, invoke_body) = json_response( + &app, + invoke_request( + "find_person", + "admin-token", + json!({ "params": { "name": "Alice" } }), + ), + ) + .await; + assert_eq!(invoke_status, StatusCode::OK, "body: {invoke_body}"); + assert_eq!(invoke_body["row_count"], 1); +} + +#[tokio::test(flavor = "multi_thread")] +async fn schema_apply_route_noop_keeps_valid_stored_query_registry() { + let (_temp, app) = app_with_stored_queries( + &[("find_person", FIND_PERSON_GQ, true)], + &[("act-ragnor", "admin-token")], + STORED_QUERY_SCHEMA_APPLY_POLICY_YAML, + ) + .await; + + let request = Request::builder() + .method(Method::POST) + .uri("/schema/apply") + .header("content-type", "application/json") + .header("authorization", "Bearer admin-token") + .body(Body::from( + serde_json::to_vec(&SchemaApplyRequest { + schema_source: fs::read_to_string(fixture("test.pg")).unwrap(), + ..Default::default() + }) + .unwrap(), + )) + .unwrap(); + let (status, payload) = 
json_response(&app, request).await; + assert_eq!(status, StatusCode::OK, "body: {payload}"); + assert_eq!(payload["applied"], false); +} + #[tokio::test] async fn schema_apply_route_requires_schema_apply_policy_permission() { let (_temp, app) = app_for_graph_with_auth_tokens_and_policy( @@ -4690,6 +5231,7 @@ mod multi_graph_startup { uri: graph_uri, engine: Arc::new(engine), policy: None, + queries: None, })); dirs.push(dir); } @@ -4985,12 +5527,14 @@ graphs: uri: graph_uri.clone(), engine: Arc::clone(&engine), policy: None, + queries: None, }); let beta = Arc::new(GraphHandle { key: GraphKey::cluster(GraphId::try_from("beta").unwrap()), uri: format!("file://{graph_uri}/"), engine, policy: None, + queries: None, }); match GraphRegistry::from_handles(vec![alpha, beta]) { @@ -5016,6 +5560,7 @@ graphs: uri: format!("file://{graph_uri}/"), engine: Arc::new(engine), policy: None, + queries: None, }); let registry = GraphRegistry::from_handles(vec![handle]).unwrap(); @@ -5138,11 +5683,11 @@ graphs: let err = load_server_settings(Some(&config_path), None, None, None, true).unwrap_err(); let msg = err.to_string(); assert!( - msg.contains("top-level `policy.file` is single-graph/CLI-local policy only"), - "expected single-graph policy guidance, got: {msg}" + msg.contains("top-level") && msg.contains("policy.file") && msg.contains("not honored"), + "expected top-level-not-honored guidance, got: {msg}" ); assert!( - msg.contains("graphs..policy.file"), + msg.contains("graphs."), "expected per-graph migration guidance, got: {msg}" ); assert!( @@ -5151,6 +5696,88 @@ graphs: ); } + #[test] + fn mode_inference_multi_rejects_top_level_queries() { + // Symmetric to the policy guard: a top-level `queries:` block in + // multi-graph mode is not honored (each graph uses its own), so it + // is a loud error rather than a silent no-op. 
+ let temp = tempfile::tempdir().unwrap(); + let config_path = temp.path().join("omnigraph.yaml"); + fs::write( + &config_path, + "queries:\n q:\n file: ./q.gq\ngraphs:\n alpha:\n uri: /tmp/alpha.omni\n", + ) + .unwrap(); + let err = load_server_settings(Some(&config_path), None, None, None, true).unwrap_err(); + let msg = err.to_string(); + assert!( + msg.contains("queries") && msg.contains("not honored"), + "top-level queries must be rejected in multi-graph mode: {msg}" + ); + } + + #[test] + fn single_mode_named_graph_rejects_top_level_blocks() { + // Serving a graph by name (`--target`/`server.graph`) uses its + // per-graph block; a populated top-level block would be silently + // shadowed, so boot refuses and names the per-graph location. + let temp = tempfile::tempdir().unwrap(); + let config_path = temp.path().join("omnigraph.yaml"); + fs::write( + &config_path, + "policy:\n file: ./top.yaml\ngraphs:\n prod:\n uri: /tmp/prod.omni\n", + ) + .unwrap(); + let err = + load_server_settings(Some(&config_path), None, Some("prod".to_string()), None, true) + .unwrap_err(); + let msg = err.to_string(); + assert!( + msg.contains("prod") && msg.contains("policy.file") && msg.contains("graphs.prod"), + "named single-mode + top-level policy must refuse, naming the graph: {msg}" + ); + } + + #[test] + fn single_mode_named_graph_uses_per_graph_policy_and_queries() { + // The identity rule: `--target prod` attaches `graphs.prod`'s own + // policy + queries, not the top-level ones (which are absent here). 
+ let temp = tempfile::tempdir().unwrap(); + fs::write( + temp.path().join("prod.gq"), + "query pq() { match { $u: User } return { $u.name } }", + ) + .unwrap(); + let config_path = temp.path().join("omnigraph.yaml"); + fs::write( + &config_path, + "graphs:\n prod:\n uri: /tmp/prod.omni\n policy:\n file: ./prod-policy.yaml\n \ + queries:\n pq:\n file: ./prod.gq\n", + ) + .unwrap(); + let settings = + load_server_settings(Some(&config_path), None, Some("prod".to_string()), None, true) + .unwrap(); + match settings.mode { + ServerConfigMode::Single { + graph_id, + policy_file, + queries, + .. + } => { + assert_eq!(graph_id, "prod", "named single-mode keeps graph identity"); + assert!( + policy_file + .as_ref() + .is_some_and(|p| p.ends_with("prod-policy.yaml")), + "per-graph policy attached: {policy_file:?}" + ); + assert!(queries.lookup("pq").is_some(), "per-graph query attached"); + } + other => panic!("expected Single mode, got {other:?}"), + } + } + #[test] fn mode_inference_normalizes_multi_graph_uris() { let temp = tempfile::tempdir().unwrap(); @@ -5383,6 +6010,7 @@ graphs: uri: graph_uri, engine: Arc::new(engine), policy: None, + queries: None, }); let tokens = vec![("act-andrew".to_string(), "secret-token".to_string())]; let workload = omnigraph_server::workload::WorkloadController::from_env(); @@ -5450,6 +6078,7 @@ graphs: uri: graph_uri, engine: Arc::new(engine), policy: None, + queries: None, })); } diff --git a/crates/omnigraph/Cargo.toml b/crates/omnigraph/Cargo.toml index 1fa3436..70f51d8 100644 --- a/crates/omnigraph/Cargo.toml +++ b/crates/omnigraph/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "omnigraph-engine" -version = "0.6.0" +version = "0.6.1" edition = "2024" description = "Runtime engine for the Omnigraph graph database." 
license = "MIT"
@@ -16,8 +16,8 @@ default = []
 failpoints = ["dep:fail", "fail/failpoints"]
 
 [dependencies]
-omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.6.0" }
-omnigraph-policy = { path = "../omnigraph-policy", version = "0.6.0" }
+omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.6.1" }
+omnigraph-policy = { path = "../omnigraph-policy", version = "0.6.1" }
 lance = { workspace = true }
 lance-datafusion = { workspace = true }
 datafusion = { workspace = true }
@@ -51,7 +51,7 @@ chrono = { workspace = true }
 arc-swap = { workspace = true }
 
 [dev-dependencies]
-omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.6.0" }
+omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.6.1" }
 tokio = { workspace = true }
 lance-namespace-impls = { workspace = true }
 serial_test = "3"
diff --git a/crates/omnigraph/src/db/commit_graph.rs b/crates/omnigraph/src/db/commit_graph.rs
index 565bd69..9531a64 100644
--- a/crates/omnigraph/src/db/commit_graph.rs
+++ b/crates/omnigraph/src/db/commit_graph.rs
@@ -169,6 +169,37 @@ impl CommitGraph {
         self.refresh().await
     }
 
+    /// Idempotently drop the commit-graph branch `name`, tolerating an
+    /// already-absent branch (see [`TableStore::force_delete_branch`] for the
+    /// same semantics). Used by the best-effort reclaim in `branch_delete` and
+    /// the `cleanup` orphan reconciler. `RefConflict` (referencing descendants)
+    /// is still surfaced.
+    pub async fn force_delete_branch(&mut self, name: &str) -> Result<()> {
+        let mut ds = Dataset::open(&graph_commits_uri(&self.root_uri))
+            .await
+            .map_err(|e| OmniError::Lance(e.to_string()))?;
+        match ds.force_delete_branch(name).await {
+            Ok(()) => {}
+            Err(lance::Error::RefNotFound { .. }) | Err(lance::Error::NotFound { .. }) => {}
+            Err(e) => return Err(OmniError::Lance(e.to_string())),
+        }
+        self.refresh().await
+    }
+
+    /// List the named branches present on the commit-graph dataset. The
+    /// `cleanup` reconciler diffs this against the manifest branch set to find
+    /// orphaned commit-graph branches to reclaim.
+    pub async fn list_branches(&self) -> Result<Vec<String>> {
+        let ds = Dataset::open(&graph_commits_uri(&self.root_uri))
+            .await
+            .map_err(|e| OmniError::Lance(e.to_string()))?;
+        let branches = ds
+            .list_branches()
+            .await
+            .map_err(|e| OmniError::Lance(e.to_string()))?;
+        Ok(branches.into_keys().collect())
+    }
+
     pub async fn append_commit(
         &mut self,
         manifest_branch: Option<&str>,
@@ -345,7 +376,7 @@ impl CommitGraph {
     }
 }
 
-fn graph_commits_uri(root_uri: &str) -> String {
+pub(crate) fn graph_commits_uri(root_uri: &str) -> String {
     format!("{}/{}", root_uri.trim_end_matches('/'), GRAPH_COMMITS_DIR)
 }
 
diff --git a/crates/omnigraph/src/db/graph_coordinator.rs b/crates/omnigraph/src/db/graph_coordinator.rs
index a721036..dfe2767 100644
--- a/crates/omnigraph/src/db/graph_coordinator.rs
+++ b/crates/omnigraph/src/db/graph_coordinator.rs
@@ -211,14 +211,47 @@ impl GraphCoordinator {
         let branch = normalize_branch_name(name)?
             .ok_or_else(|| OmniError::manifest("cannot create branch 'main'".to_string()))?;
         self.ensure_commit_graph_initialized().await?;
+
+        // Manifest authority flip first.
         self.manifest.create_branch(&branch).await?;
-        failpoints::maybe_fail("branch_create.after_manifest_branch_create")?;
-        if let Some(commit_graph) = &mut self.commit_graph {
-            commit_graph.create_branch(&branch).await?;
+
+        // Derived commit-graph branch. If anything after the authority flip
+        // fails, roll back the manifest branch so the branch never half-exists
+        // (a manifest branch with no commit-graph branch breaks the next write).
+        if let Err(err) = self.create_commit_graph_branch(&branch).await {
+            if let Err(rollback_err) = self.manifest.delete_branch(&branch).await {
+                tracing::warn!(
+                    target: "omnigraph::branch_create",
+                    branch = %branch,
+                    error = %rollback_err,
+                    "rollback of manifest branch failed after commit-graph create failure",
+                );
+            }
+            return Err(err);
         }
         Ok(())
     }
 
+    /// Create the derived commit-graph branch for `branch`, healing a zombie ref
+    /// left by an incomplete prior delete. The manifest branch was just created
+    /// fresh, so any existing commit-graph branch with this name is provably
+    /// orphaned and is force-dropped before recreating.
+    async fn create_commit_graph_branch(&mut self, branch: &str) -> Result<()> {
+        failpoints::maybe_fail("branch_create.after_manifest_branch_create")?;
+        let Some(commit_graph) = &mut self.commit_graph else {
+            return Ok(());
+        };
+        if commit_graph
+            .list_branches()
+            .await?
+            .iter()
+            .any(|existing| existing == branch)
+        {
+            commit_graph.force_delete_branch(branch).await?;
+        }
+        commit_graph.create_branch(branch).await
+    }
+
     pub async fn branch_delete(&mut self, name: &str) -> Result<()> {
         let branch = normalize_branch_name(name)?
             .ok_or_else(|| OmniError::manifest("cannot delete branch 'main'".to_string()))?;
@@ -229,20 +262,43 @@ impl GraphCoordinator {
             )));
         }
 
+        // Manifest authority flip — the single atomic op that makes the branch
+        // cease to exist. Must succeed; everything after is derived state
+        // reclaimed best-effort.
         self.manifest.delete_branch(&branch).await?;
 
+        // Commit-graph branch is derived state. Reclaim best-effort with the
+        // idempotent force variant: a failure here (or a missing dataset) is
+        // reconciled by `cleanup` and must not fail the delete after the
+        // authority already flipped.
+        if let Err(err) = self.reclaim_commit_graph_branch(&branch).await {
+            tracing::warn!(
+                target: "omnigraph::branch_delete::cleanup",
+                branch = %branch,
+                error = %err,
+                "best-effort commit-graph branch reclaim failed; cleanup will reconcile",
+            );
+        }
+
+        Ok(())
+    }
+
+    /// Best-effort, idempotent reclaim of the commit-graph branch `branch`.
+    /// Tolerates an absent commit-graph dataset (a graph that never committed).
+    async fn reclaim_commit_graph_branch(&mut self, branch: &str) -> Result<()> {
+        failpoints::maybe_fail("branch_delete.before_commit_graph_reclaim")?;
         if let Some(commit_graph) = &mut self.commit_graph {
-            commit_graph.delete_branch(&branch).await?;
+            commit_graph.force_delete_branch(branch).await
         } else if self
             .storage
             .exists(&graph_commits_uri(self.root_uri()))
             .await?
         {
             let mut commit_graph = CommitGraph::open(self.root_uri()).await?;
-            commit_graph.delete_branch(&branch).await?;
+            commit_graph.force_delete_branch(branch).await
+        } else {
+            Ok(())
         }
-
-        Ok(())
     }
 
     pub async fn snapshot_at_version(&self, version: u64) -> Result {
diff --git a/crates/omnigraph/src/db/manifest.rs b/crates/omnigraph/src/db/manifest.rs
index 7fcf7de..3b2886f 100644
--- a/crates/omnigraph/src/db/manifest.rs
+++ b/crates/omnigraph/src/db/manifest.rs
@@ -48,6 +48,22 @@ const OBJECT_TYPE_TABLE_VERSION: &str = "table_version";
 const OBJECT_TYPE_TABLE_TOMBSTONE: &str = "table_tombstone";
 const TABLE_VERSION_MANAGEMENT_KEY: &str = "table_version_management";
 
+/// Apply pending internal-schema migrations against `__manifest` on the
+/// open-for-write path, independent of a publish.
+///
+/// `Omnigraph::open(ReadWrite)` calls this before the coordinator reads branch
+/// state, so branch-observing code (`branch_list`, the schema-apply
+/// blocking-branch checks) sees the post-migration graph. In particular the
+/// v2→v3 step sweeps legacy `__run__*` staging branches off `__manifest`
+/// (MR-770); running it here closes the window where those branches would
+/// otherwise block schema apply before the first publish runs the migration.
+///
+/// Idempotent: a no-op stamp read when the on-disk version already matches.
+pub(crate) async fn migrate_on_open(root_uri: &str) -> Result<()> {
+    let mut dataset = open_manifest_dataset(root_uri, None).await?;
+    migrations::migrate_internal_schema(&mut dataset).await
+}
+
 /// Immutable point-in-time view of the database.
 ///
 /// Cheap to create (no storage I/O). All reads within a query go through one
diff --git a/crates/omnigraph/src/db/manifest/migrations.rs b/crates/omnigraph/src/db/manifest/migrations.rs
index bbb7995..e2801fe 100644
--- a/crates/omnigraph/src/db/manifest/migrations.rs
+++ b/crates/omnigraph/src/db/manifest/migrations.rs
@@ -46,7 +46,11 @@ use crate::error::{OmniError, Result};
 /// - v2 — `__manifest.object_id` carries the unenforced-PK annotation,
 ///   engaging Lance's bloom-filter conflict resolver at commit time. Added
 ///   alongside `expected_table_versions` OCC on `ManifestBatchPublisher::publish`.
-pub(super) const INTERNAL_MANIFEST_SCHEMA_VERSION: u32 = 2;
+/// - v3 — one-time sweep of legacy `__run__` staging branches left on the
+///   `__manifest` dataset by the pre-v0.4.0 Run state machine (removed in
+///   MR-771). Once swept, the `is_internal_run_branch` defense-in-depth guard
+///   is no longer needed (MR-770).
+pub(super) const INTERNAL_MANIFEST_SCHEMA_VERSION: u32 = 3; const INTERNAL_SCHEMA_VERSION_KEY: &str = "omnigraph:internal_schema_version"; const OBJECT_ID_PK_KEY: &str = "lance-schema:unenforced-primary-key"; @@ -89,6 +93,10 @@ pub(super) async fn migrate_internal_schema(dataset: &mut Dataset) -> Result<()> migrate_v1_to_v2(dataset).await?; current = 2; } + 2 => { + migrate_v2_to_v3(dataset).await?; + current = 3; + } other => { return Err(OmniError::manifest_internal(format!( "no internal-schema migration registered for v{} → v{}", @@ -122,6 +130,51 @@ async fn migrate_v1_to_v2(dataset: &mut Dataset) -> Result<()> { set_stamp(dataset, 2).await } +/// v2 → v3: sweep legacy `__run__` staging branches off the `__manifest` +/// dataset, then bump the stamp. +/// +/// The pre-v0.4.0 Run state machine (removed in MR-771) created graph-level +/// staging branches named `__run__` on `__manifest`. MR-771 stopped +/// creating them but left any pre-existing ones in place; Lance's +/// `list_branches` still enumerates them, so they leak into `branch_list()` +/// and count as blocking branches at schema-apply time. This one-time sweep +/// removes them so the `is_internal_run_branch` guard can retire (MR-770). +/// +/// The `"__run__"` prefix is inlined here on purpose: this migration must keep +/// working after the `run_registry` module (the guard) is deleted, so it does +/// not depend on it. +/// +/// Idempotent under both sequential retry and concurrent runners: each run +/// re-enumerates `list_branches` fresh, and `force_delete_branch` tolerates a +/// branch that is already gone — so a crash before the stamp bump, or a second +/// process opening the same legacy graph at the same time, never errors out. 
+async fn migrate_v2_to_v3(dataset: &mut Dataset) -> Result<()> { + const LEGACY_RUN_BRANCH_PREFIX: &str = "__run__"; + let branches = dataset + .list_branches() + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + let run_branches: Vec = branches + .into_keys() + .filter(|name| { + name.trim_start_matches('/') + .starts_with(LEGACY_RUN_BRANCH_PREFIX) + }) + .collect(); + for name in run_branches { + // `force_delete_branch` deletes even when the `BranchContents` is + // already gone. Plain `delete_branch` errors "BranchContents not + // found", which would fail a second concurrent open (or a retry that + // raced another runner) after the first one swept the branch. Force is + // exactly Lance's documented path for cleaning up zombie branches. + dataset + .force_delete_branch(&name) + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + } + set_stamp(dataset, 3).await +} + async fn set_stamp(dataset: &mut Dataset, version: u32) -> Result<()> { dataset .update_schema_metadata([(INTERNAL_SCHEMA_VERSION_KEY.to_string(), version.to_string())]) diff --git a/crates/omnigraph/src/db/manifest/tests.rs b/crates/omnigraph/src/db/manifest/tests.rs index effa0b5..885a2a8 100644 --- a/crates/omnigraph/src/db/manifest/tests.rs +++ b/crates/omnigraph/src/db/manifest/tests.rs @@ -1461,6 +1461,80 @@ async fn test_publish_migrates_pre_stamp_manifest_to_current_version() { assert!(reopened.snapshot().entry("node:Person").is_some()); } +#[tokio::test] +async fn test_v2_to_v3_sweeps_legacy_run_branches_on_write_open() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let catalog = build_test_catalog(); + let mut mc = ManifestCoordinator::init(uri, &catalog).await.unwrap(); + + // Synthesize a pre-MR-770 graph: several stale `__run__` staging branches + // left on `__manifest` (a real legacy graph accumulates one per run), plus + // a real user branch that must survive the sweep. 
Multiple run branches + // exercise the migration's delete loop on a single reused dataset handle. + mc.create_branch("__run__01J9LEGACY").await.unwrap(); + mc.create_branch("__run__01J9SECOND").await.unwrap(); + mc.create_branch("__run__01J9THIRD").await.unwrap(); + mc.create_branch("feature").await.unwrap(); + let before = mc.list_branches().await.unwrap(); + assert_eq!( + before.iter().filter(|b| b.starts_with("__run__")).count(), + 3, + "precondition: three legacy run branches exist on __manifest; got {before:?}", + ); + + // Rewind the internal-schema stamp to v2 so the next write-open runs the + // v2 → v3 sweep arm (init stamps at the current version, which is past it). + { + let mut ds = open_manifest_dataset(uri, None).await.unwrap(); + ds.update_schema_metadata([( + "omnigraph:internal_schema_version".to_string(), + Some("2".to_string()), + )]) + .await + .unwrap(); + let post = open_manifest_dataset(uri, None).await.unwrap(); + assert_eq!(super::migrations::read_stamp(&post), 2, "stamp rewound to v2"); + } + + // A no-op publish forces the open-for-write path, which runs the migration. + let mut expected = HashMap::new(); + expected.insert("node:Person".to_string(), 1); + GraphNamespacePublisher::new(uri, None) + .publish(&[], &expected) + .await + .unwrap(); + + // Stamp advanced to current; the legacy run branch is physically gone from + // `__manifest` (checked via the raw, unfiltered manifest list — not the + // guard-filtered `branch_list`), and the real branch + `main` survive. 
+ let post = open_manifest_dataset(uri, None).await.unwrap(); + assert_eq!( + super::migrations::read_stamp(&post), + super::migrations::INTERNAL_MANIFEST_SCHEMA_VERSION, + ); + let reopened = ManifestCoordinator::open(uri).await.unwrap(); + let after = reopened.list_branches().await.unwrap(); + assert!( + !after.iter().any(|b| b.starts_with("__run__")), + "legacy run branch must be swept; got {after:?}", + ); + assert!(after.iter().any(|b| b == "feature"), "user branch must survive"); + assert!(after.iter().any(|b| b == "main"), "main must survive"); + + // Idempotent: a second write-open finds the stamp at current and does not + // re-run the sweep or error. + GraphNamespacePublisher::new(uri, None) + .publish(&[], &expected) + .await + .unwrap(); + let final_ds = open_manifest_dataset(uri, None).await.unwrap(); + assert_eq!( + super::migrations::read_stamp(&final_ds), + super::migrations::INTERNAL_MANIFEST_SCHEMA_VERSION, + ); +} + #[tokio::test] async fn test_publish_rejects_manifest_stamped_at_future_version() { let dir = tempfile::tempdir().unwrap(); diff --git a/crates/omnigraph/src/db/mod.rs b/crates/omnigraph/src/db/mod.rs index d0b292f..13e1c74 100644 --- a/crates/omnigraph/src/db/mod.rs +++ b/crates/omnigraph/src/db/mod.rs @@ -3,7 +3,6 @@ pub mod graph_coordinator; pub mod manifest; mod omnigraph; mod recovery_audit; -mod run_registry; mod schema_state; pub(crate) mod write_queue; @@ -13,9 +12,8 @@ pub use manifest::{Snapshot, SubTableEntry, SubTableUpdate}; pub(crate) use omnigraph::ensure_public_branch_ref; pub use omnigraph::{ CleanupPolicyOptions, InitOptions, MergeOutcome, Omnigraph, OpenMode, SchemaApplyOptions, - SchemaApplyResult, TableCleanupStats, TableOptimizeStats, + SchemaApplyResult, SkipReason, TableCleanupStats, TableOptimizeStats, }; -pub(crate) use run_registry::is_internal_run_branch; pub(crate) const SCHEMA_APPLY_LOCK_BRANCH: &str = "__schema_apply_lock__"; @@ -69,5 +67,8 @@ pub(crate) fn is_schema_apply_lock_branch(name: &str) -> 
bool { } pub(crate) fn is_internal_system_branch(name: &str) -> bool { - is_internal_run_branch(name) || is_schema_apply_lock_branch(name) + // Legacy `__run__*` staging branches (Run state machine, removed MR-771) + // are swept off `__manifest` by the v2→v3 internal-schema migration, so the + // only internal branch the engine still creates is the schema-apply lock. + is_schema_apply_lock_branch(name) } diff --git a/crates/omnigraph/src/db/omnigraph.rs b/crates/omnigraph/src/db/omnigraph.rs index 5c92ac3..ba2b70e 100644 --- a/crates/omnigraph/src/db/omnigraph.rs +++ b/crates/omnigraph/src/db/omnigraph.rs @@ -33,7 +33,7 @@ mod optimize; mod schema_apply; mod table_ops; -pub use optimize::{CleanupPolicyOptions, TableCleanupStats, TableOptimizeStats}; +pub use optimize::{CleanupPolicyOptions, SkipReason, TableCleanupStats, TableOptimizeStats}; pub use schema_apply::SchemaApplyOptions; use super::commit_graph::GraphCommit; @@ -67,6 +67,12 @@ pub struct SchemaApplyResult { pub steps: Vec, } +#[derive(Debug, Clone)] +pub struct SchemaApplyPreview { + pub plan: SchemaMigrationPlan, + pub catalog: Catalog, +} + /// Top-level handle to an Omnigraph database. /// /// An Omnigraph is a Lance-native graph database with git-style branching. @@ -340,6 +346,16 @@ impl Omnigraph { mode: OpenMode, ) -> Result { let root = normalize_root_uri(uri)?; + // Apply pending internal-schema migrations before the coordinator reads + // branch state, so `branch_list` and the schema-apply blocking-branch + // checks observe the post-migration graph — notably the v2→v3 sweep of + // legacy `__run__*` staging branches (MR-770). ReadWrite only: a + // read-only open must not trigger object-store writes, so a read-only + // open of an unmigrated legacy graph still lists `__run__*` until its + // first read-write open (an accepted, documented limitation). 
+ if matches!(mode, OpenMode::ReadWrite) { + crate::db::manifest::migrate_on_open(&root).await?; + } // Open the coordinator first so the schema-staging recovery sweep can // compare its snapshot against any leftover staging files. let mut coordinator = GraphCoordinator::open(&root, Arc::clone(&storage)).await?; @@ -493,6 +509,14 @@ impl Omnigraph { schema_apply::plan_schema(self, desired_schema_source, options).await } + pub async fn preview_schema_apply_with_options( + &self, + desired_schema_source: &str, + options: SchemaApplyOptions, + ) -> Result { + schema_apply::preview_schema_apply(self, desired_schema_source, options).await + } + pub async fn apply_schema(&self, desired_schema_source: &str) -> Result { self.apply_schema_as(desired_schema_source, SchemaApplyOptions::default(), None) .await @@ -523,7 +547,28 @@ impl Omnigraph { options: SchemaApplyOptions, actor: Option<&str>, ) -> Result { - schema_apply::apply_schema(self, desired_schema_source, options, actor).await + self.apply_schema_as_with_catalog_check(desired_schema_source, options, actor, |_| Ok(())) + .await + } + + pub async fn apply_schema_as_with_catalog_check( + &self, + desired_schema_source: &str, + options: SchemaApplyOptions, + actor: Option<&str>, + validate_catalog: F, + ) -> Result + where + F: FnOnce(&Catalog) -> Result<()>, + { + schema_apply::apply_schema( + self, + desired_schema_source, + options, + actor, + validate_catalog, + ) + .await } pub(crate) async fn ensure_schema_apply_idle(&self, operation: &str) -> Result<()> { @@ -1058,11 +1103,14 @@ impl Omnigraph { Ok(()) } - async fn cleanup_deleted_branch_tables( - &self, - branch: &str, - owned_tables: &[(String, String)], - ) -> Result<()> { + /// Best-effort reclaim of the per-table Lance forks a just-deleted branch + /// owned. Runs AFTER the manifest authority flip, so the branch is already + /// gone and these forks are unreachable orphans. 
A failure here (transient + /// object-store error, the `branch_delete.before_table_cleanup` failpoint) + /// is logged and swallowed: the `cleanup` reconciler is the guaranteed + /// backstop that converges any leftover orphan. Uses `force_delete_branch` + /// so a partially-reclaimed retry is idempotent. + async fn cleanup_deleted_branch_tables(&self, branch: &str, owned_tables: &[(String, String)]) { let mut seen_paths = HashSet::new(); let mut cleanup_targets = owned_tables .iter() @@ -1073,15 +1121,21 @@ impl Omnigraph { for (table_key, table_path) in cleanup_targets { let dataset_uri = self.table_store.dataset_uri(&table_path); - if let Err(err) = self.table_store.delete_branch(&dataset_uri, branch).await { - return Err(OmniError::manifest_internal(format!( - "branch '{}' was deleted but cleanup failed for {}: {}", - branch, table_key, err - ))); + let outcome = match crate::failpoints::maybe_fail("branch_delete.before_table_cleanup") + { + Ok(()) => self.table_store.force_delete_branch(&dataset_uri, branch).await, + Err(injected) => Err(injected), + }; + if let Err(err) = outcome { + tracing::warn!( + target: "omnigraph::branch_delete::cleanup", + branch = %branch, + table = %table_key, + error = %err, + "best-effort fork reclaim failed; cleanup will reconcile the orphan", + ); } } - - Ok(()) } async fn delete_branch_storage_only(&self, branch: &str) -> Result<()> { @@ -1105,9 +1159,12 @@ impl Omnigraph { .map(|entry| (entry.table_key.clone(), entry.table_path.clone())) .collect::>(); + // Authority flip (+ best-effort commit-graph reclaim) — must succeed. self.coordinator.write().await.branch_delete(branch).await?; + // Best-effort per-table fork reclaim; cleanup reconciles any leftover. 
 self.cleanup_deleted_branch_tables(branch, &owned_tables) - .await + .await; + Ok(()) } pub(crate) fn normalize_branch_name(branch: &str) -> Result> { @@ -1444,12 +1501,6 @@ pub(crate) fn normalize_branch_name(branch: &str) -> Result> { } pub(crate) fn ensure_public_branch_ref(branch: &str, operation: &str) -> Result<()> { - if super::is_internal_run_branch(branch) { - return Err(OmniError::manifest(format!( - "{} does not allow internal run ref '{}'", - operation, branch - ))); - } if is_internal_system_branch(branch) { return Err(OmniError::manifest(format!( "{} does not allow internal system ref '{}'", @@ -1853,7 +1904,6 @@ fn json_value_from_array(array: &dyn Array, row: usize) -> Result Company #[tokio::test] async fn test_apply_schema_succeeds_after_load() { // Historical: schema apply used to be blocked by leftover - // `__run__` branches. A defense-in-depth filter now skips - // internal system branches, and run branches were made - // ephemeral on every terminal state — so in practice no - // `__run__` branch survives publish. The filter still guards - // the invariant. + // `__run__` branches. The Run state machine was removed in + // MR-771, so a fresh graph never creates a `__run__` branch; + // legacy ones are swept by the v2→v3 manifest migration. This + // asserts the invariant a current graph upholds: publish leaves + // no `__run__` branch behind, so schema apply proceeds. 
 let dir = tempfile::tempdir().unwrap(); let uri = dir.path().to_str().unwrap(); let mut db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap(); @@ -2210,8 +2260,8 @@ edge WorksAt: Person -> Company let all_branches = db.coordinator.read().await.all_branches().await.unwrap(); assert!( - !all_branches.iter().any(|b| is_internal_run_branch(b)), - "run branch should be deleted after publish, got: {:?}", + !all_branches.iter().any(|b| b.starts_with("__run__")), + "no __run__ branch should exist after publish, got: {:?}", all_branches ); @@ -2223,6 +2273,56 @@ edge WorksAt: Person -> Company assert!(result.applied, "schema apply should have applied"); } + /// Regression (MR-770): a pre-v0.4.0 graph that still carries a stale + /// `__run__*` branch on `__manifest` must not block schema apply. The + /// v2→v3 sweep runs in `Omnigraph::open(ReadWrite)` — before the + /// schema-apply blocking-branch check — so apply succeeds with no + /// intervening publish. + /// + /// Confirmed to fail before the open-time migration landed: the reopened + /// graph still listed `__run__legacy`, and `apply_schema` returned + /// "found non-main branches: __run__legacy". + #[tokio::test] + async fn legacy_run_branch_is_swept_on_open_and_does_not_block_schema_apply() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap(); + + // Synthesize a legacy graph: a stale `__run__` branch on `__manifest` + // plus the manifest stamp rewound to v2 (pre-sweep). + db.branch_create("__run__legacy").await.unwrap(); + drop(db); + { + let mut ds = lance::Dataset::open(&format!("{}/__manifest", uri)) + .await + .unwrap(); + ds.update_schema_metadata([( + "omnigraph:internal_schema_version".to_string(), + Some("2".to_string()), + )]) + .await + .unwrap(); + } + + // Reopen (ReadWrite): the open-time migration must sweep `__run__legacy` + // before any branch-observing code runs. 
+ let db = Omnigraph::open(uri).await.unwrap(); + let branches = db.branch_list().await.unwrap(); + assert!( + !branches.iter().any(|b| b.starts_with("__run__")), + "open-time migration must sweep legacy __run__ branches; got {branches:?}", + ); + + // Schema apply must proceed with no intervening publish — the + // blocking-branch check no longer sees `__run__legacy`. + let desired = TEST_SCHEMA.replace( + " age: I32?\n}", + " age: I32?\n nickname: String?\n}", + ); + let result = db.apply_schema(&desired).await.unwrap(); + assert!(result.applied, "schema apply should have applied"); + } + #[tokio::test] async fn test_apply_schema_adds_index_for_existing_property() { let dir = tempfile::tempdir().unwrap(); diff --git a/crates/omnigraph/src/db/omnigraph/optimize.rs b/crates/omnigraph/src/db/omnigraph/optimize.rs index e158dc7..fff3f54 100644 --- a/crates/omnigraph/src/db/omnigraph/optimize.rs +++ b/crates/omnigraph/src/db/omnigraph/optimize.rs @@ -40,6 +40,20 @@ fn maint_concurrency() -> usize { .unwrap_or(DEFAULT_MAINT_CONCURRENCY) } +/// Whether the installed Lance can compact a dataset that contains blob +/// columns. `false` today: Lance `compact_files` forces +/// `BlobHandling::AllBinary` on the read side, and the blob-v2 struct decoder +/// mis-counts columns ("there were more fields in the schema than provided +/// column indices"), failing even a pristine uniform-V2_2 multi-fragment blob +/// table. Reads are unaffected (queries use descriptor handling). +/// +/// While `false`, [`optimize_all_tables`] skips blob-bearing tables and reports +/// [`SkipReason::BlobColumnsUnsupportedByLance`] instead of aborting the whole +/// sweep. Flip to `true` once the upstream Lance fix ships — the +/// `lance_surface_guards.rs::compact_files_still_fails_on_blob_columns` guard +/// turns red on that bump and forces this flip. Tracked in `docs/dev/lance.md`. +const LANCE_SUPPORTS_BLOB_COMPACTION: bool = false; + /// Retention knobs for [`cleanup_all_tables`]. 
At least one must be set or /// nothing is cleaned. If both are set, Lance applies them as AND (a manifest /// is kept if it satisfies either — i.e. only manifests older than BOTH the @@ -52,8 +66,45 @@ pub struct CleanupPolicyOptions { pub older_than: Option, } -/// Per-table outcome of `optimize_all_tables`. +/// Why `optimize` did not compact a table. Typed so callers branch on the +/// reason rather than sniffing a string. One variant today, gated by +/// [`LANCE_SUPPORTS_BLOB_COMPACTION`]. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[non_exhaustive] +pub enum SkipReason { + /// The table has one or more `Blob` columns. Lance `compact_files` forces + /// `BlobHandling::AllBinary`, which mis-decodes blob-v2 columns; see + /// [`LANCE_SUPPORTS_BLOB_COMPACTION`] and `docs/dev/lance.md`. + BlobColumnsUnsupportedByLance, +} + +impl SkipReason { + /// Stable machine-readable token for serialized output (e.g. CLI `--json`). + /// Once emitted this is part of the output contract — keep it stable. + pub fn as_str(&self) -> &'static str { + match self { + SkipReason::BlobColumnsUnsupportedByLance => "blob_columns_unsupported_by_lance", + } + } +} + +impl std::fmt::Display for SkipReason { + /// Human-readable reason for CLI and log output. + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let msg = match self { + SkipReason::BlobColumnsUnsupportedByLance => { + "blob columns — Lance compaction unsupported" + } + }; + f.write_str(msg) + } +} + +/// Per-table outcome of `optimize_all_tables`. This is a returned result type, +/// not built by callers, so it is `#[non_exhaustive]`: future fields stay +/// non-breaking and downstream code reads fields rather than constructing it. #[derive(Debug, Clone)] +#[non_exhaustive] pub struct TableOptimizeStats { pub table_key: String, /// Number of source fragments that were rewritten by Lance. 
@@ -62,14 +113,44 @@ pub struct TableOptimizeStats { pub fragments_added: usize, /// Did this table get a new Lance manifest version from the compaction? pub committed: bool, + /// `Some(reason)` if this table was deliberately not compacted. When set, + /// `fragments_removed == 0`, `fragments_added == 0`, and `!committed`. + pub skipped: Option, } -/// Per-table outcome of `cleanup_all_tables`. +impl TableOptimizeStats { + /// Stat for a table that Lance actually compacted. + fn compacted(table_key: String, metrics: &CompactionMetrics, committed: bool) -> Self { + Self { + table_key, + fragments_removed: metrics.fragments_removed, + fragments_added: metrics.fragments_added, + committed, + skipped: None, + } + } + + /// Stat for a table that was deliberately skipped (compaction not attempted). + fn skipped(table_key: String, reason: SkipReason) -> Self { + Self { + table_key, + fragments_removed: 0, + fragments_added: 0, + committed: false, + skipped: Some(reason), + } + } +} + +/// Per-table outcome of `cleanup_all_tables`. `error` is `Some` when this +/// table's version GC failed; cleanup is fault-isolated per table, so a single +/// table's failure is recorded here rather than aborting the whole sweep. #[derive(Debug, Clone)] pub struct TableCleanupStats { pub table_key: String, pub bytes_removed: u64, pub old_versions_removed: u64, + pub error: Option, } /// Run Lance `compact_files` on every node + edge table on `main`. @@ -81,14 +162,21 @@ pub async fn optimize_all_tables(db: &Omnigraph) -> Result = all_table_keys(&db.catalog()) - .into_iter() - .filter_map(|table_key| { - let entry = snapshot.entry(&table_key)?; + // Compute per-table state (path + whether it has blob columns) up front, in + // a scope that drops the catalog handle before the async stream starts. 
+ let table_tasks: Vec<(String, String, bool)> = { + let catalog = db.catalog(); + let mut tasks = Vec::new(); + for table_key in all_table_keys(&catalog) { + let Some(entry) = snapshot.entry(&table_key) else { + continue; + }; let full_path = format!("{}/{}", db.root_uri, entry.table_path); - Some((table_key, full_path)) - }) - .collect(); + let has_blob = !blob_properties_for_table_key(&catalog, &table_key)?.is_empty(); + tasks.push((table_key, full_path, has_blob)); + } + tasks + }; if table_tasks.is_empty() { return Ok(Vec::new()); @@ -98,7 +186,24 @@ pub async fn optimize_all_tables(db: &Omnigraph) -> Result> = futures::stream::iter(table_tasks.into_iter()) - .map(|(table_key, full_path)| async move { + .map(|(table_key, full_path, has_blob)| async move { + // Lance `compact_files` mis-decodes blob-v2 columns under the forced + // `BlobHandling::AllBinary` read (see LANCE_SUPPORTS_BLOB_COMPACTION). + // Skip blob-bearing tables and report it rather than aborting the + // whole sweep — the other tables still compact. + if has_blob && !LANCE_SUPPORTS_BLOB_COMPACTION { + tracing::warn!( + target: "omnigraph::optimize", + table = %table_key, + "skipping compaction: table has blob columns the current Lance \ + cannot rewrite (blob-v2 AllBinary decode bug); other tables \ + unaffected — rerun after the Lance fix", + ); + return Ok(TableOptimizeStats::skipped( + table_key, + SkipReason::BlobColumnsUnsupportedByLance, + )); + } let mut ds = table_store .open_dataset_head_for_write(&table_key, &full_path, None) .await?; @@ -108,12 +213,11 @@ pub async fn optimize_all_tables(db: &Omnigraph) -> Result> = futures::stream::iter(table_tasks.into_iter()) + // Fault-isolated per table: a single table's GC failure is recorded on its + // stats row (`error: Some`) and logged, never aborting the healthy tables. + // cleanup is the convergence backstop, so it must do as much as it can and + // converge on re-run rather than fail wholesale (invariant 13). 
+ let results: Vec = futures::stream::iter(table_tasks.into_iter()) .map(|(table_key, full_path)| async move { - let ds = table_store - .open_dataset_head_for_write(&table_key, &full_path, None) - .await?; - let before_version = keep_versions - .map(|n| ds.version().version.saturating_sub(n as u64)) - .filter(|v| *v > 0); - let policy = CleanupPolicy { - before_timestamp, - before_version, - delete_unverified: false, - error_if_tagged_old_versions: false, - clean_referenced_branches: false, - delete_rate_limit: None, - }; - let removed: RemovalStats = lance::dataset::cleanup::cleanup_old_versions(&ds, policy) - .await - .map_err(|e| OmniError::Lance(e.to_string()))?; - Ok(TableCleanupStats { - table_key, - bytes_removed: removed.bytes_removed, - old_versions_removed: removed.old_versions, - }) + let outcome: Result = async { + crate::failpoints::maybe_fail("cleanup.table_gc")?; + let ds = table_store + .open_dataset_head_for_write(&table_key, &full_path, None) + .await?; + let before_version = keep_versions + .map(|n| ds.version().version.saturating_sub(n as u64)) + .filter(|v| *v > 0); + let policy = CleanupPolicy { + before_timestamp, + before_version, + delete_unverified: false, + error_if_tagged_old_versions: false, + clean_referenced_branches: false, + delete_rate_limit: None, + }; + lance::dataset::cleanup::cleanup_old_versions(&ds, policy) + .await + .map_err(|e| OmniError::Lance(e.to_string())) + } + .await; + match outcome { + Ok(removed) => TableCleanupStats { + table_key, + bytes_removed: removed.bytes_removed, + old_versions_removed: removed.old_versions, + error: None, + }, + Err(err) => { + tracing::warn!( + target: "omnigraph::cleanup", + table = %table_key, + error = %err, + "version GC failed for table; other tables unaffected", + ); + TableCleanupStats { + table_key, + bytes_removed: 0, + old_versions_removed: 0, + error: Some(err.to_string()), + } + } + } }) .buffer_unordered(concurrency) .collect() .await; - results.into_iter().collect() + 
 Ok(results) +} + +/// Outcome of [`reconcile_orphaned_branches`]: the `(owner, branch)` pairs +/// reclaimed and the `(owner, error)` pairs that failed, where `owner` is a +/// table key (e.g. `node:Person`) or `"_graph_commits"`. Per-owner failures are +/// isolated and recorded here, not propagated — the next reconcile converges. +#[derive(Debug, Clone, Default)] +pub struct BranchReconcileStats { + pub reclaimed: Vec<(String, String)>, + pub failures: Vec<(String, String)>, +} + +/// Drop every per-table and commit-graph Lance branch that the manifest no +/// longer references. +/// +/// Orphaned forks arise when a `branch_delete` flips the manifest authority +/// (atomic) but a downstream best-effort reclaim does not complete. They are +/// unreachable through any snapshot — no manifest entry can name them — yet +/// they pin their `tree/{branch}/` storage and can block reusing the branch +/// name. This is the guaranteed convergence backstop: it is idempotent and +/// derived purely from the manifest authority, so it no-ops once everything is +/// reconciled, and it would harmlessly find nothing if a future Lance atomic +/// multi-dataset branch op prevented orphans from forming. +/// +/// The keep-set is the full (unfiltered) manifest branch list, so system +/// branches' forks are never reclaimed; `main`/default is not a named Lance +/// branch and so is never a candidate. Referencing children are dropped before +/// parents (Lance refuses to delete a referenced parent) by ordering longest +/// branch names first. +pub async fn reconcile_orphaned_branches(db: &Omnigraph) -> Result { + use std::collections::HashSet; + + let keep: HashSet = db + .coordinator + .read() + .await + .all_branches() + .await? 
+ .into_iter() + .collect(); + + let resolved = db.resolved_branch_target(None).await?; + let snapshot = resolved.snapshot; + let table_targets: Vec<(String, String)> = all_table_keys(&db.catalog()) + .into_iter() + .filter_map(|table_key| { + let entry = snapshot.entry(&table_key)?; + let full_path = format!("{}/{}", db.root_uri, entry.table_path); + Some((table_key, full_path)) + }) + .collect(); + + let mut stats = BranchReconcileStats::default(); + + // Per-table fault isolation: one table's transient failure is recorded and + // logged, never aborting the rest of the sweep. + for (table_key, full_path) in table_targets { + let listed = match db.table_store.list_branches(&full_path).await { + Ok(listed) => listed, + Err(err) => { + tracing::warn!( + target: "omnigraph::cleanup", + table = %table_key, + error = %err, + "listing branches failed during reconcile; skipping table", + ); + stats.failures.push((table_key.clone(), err.to_string())); + continue; + } + }; + for branch in orphan_branches(listed, &keep) { + let outcome = match crate::failpoints::maybe_fail("cleanup.reconcile_fork") { + Ok(()) => db.table_store.force_delete_branch(&full_path, &branch).await, + Err(injected) => Err(injected), + }; + match outcome { + Ok(()) => stats.reclaimed.push((table_key.clone(), branch)), + Err(err) => { + tracing::warn!( + target: "omnigraph::cleanup", + table = %table_key, + branch = %branch, + error = %err, + "reclaiming orphaned fork failed; will retry next cleanup", + ); + stats.failures.push((table_key.clone(), err.to_string())); + } + } + } + } + + // Commit-graph orphans (best-effort: the dataset may not exist on a graph + // that has never committed; any failure is isolated and retried next time). 
+ if let Err(err) = reconcile_commit_graph_orphans(db, &keep, &mut stats).await { + tracing::warn!( + target: "omnigraph::cleanup", + error = %err, + "commit-graph orphan reconcile failed; will retry next cleanup", + ); + stats.failures.push(("_graph_commits".to_string(), err.to_string())); + } + + Ok(stats) +} + +/// Commit-graph half of [`reconcile_orphaned_branches`], split out so its +/// errors can be isolated. Returns `Ok` when the commit-graph dataset is absent. +async fn reconcile_commit_graph_orphans( + db: &Omnigraph, + keep: &std::collections::HashSet, + stats: &mut BranchReconcileStats, +) -> Result<()> { + let commits_uri = crate::db::commit_graph::graph_commits_uri(db.root_uri()); + if !db.storage_adapter().exists(&commits_uri).await? { + return Ok(()); + } + let mut commit_graph = crate::db::commit_graph::CommitGraph::open(db.root_uri()).await?; + for branch in orphan_branches(commit_graph.list_branches().await?, keep) { + match commit_graph.force_delete_branch(&branch).await { + Ok(()) => stats.reclaimed.push(("_graph_commits".to_string(), branch)), + Err(err) => { + tracing::warn!( + target: "omnigraph::cleanup", + branch = %branch, + error = %err, + "reclaiming orphaned commit-graph branch failed; will retry next cleanup", + ); + stats.failures.push(("_graph_commits".to_string(), err.to_string())); + } + } + } + Ok(()) +} + +/// Filter `present` Lance branches down to those absent from the manifest +/// `keep` set, ordered children-before-parents (longest name first) so Lance's +/// referenced-parent `RefConflict` cannot block reclamation. 
+fn orphan_branches(present: Vec, keep: &std::collections::HashSet) -> Vec { + let mut orphans: Vec = present + .into_iter() + .filter(|branch| !keep.contains(branch)) + .collect(); + orphans.sort_by(|a, b| b.len().cmp(&a.len()).then_with(|| a.cmp(b))); + orphans } fn all_table_keys(catalog: &omnigraph_compiler::catalog::Catalog) -> Vec { diff --git a/crates/omnigraph/src/db/omnigraph/schema_apply.rs b/crates/omnigraph/src/db/omnigraph/schema_apply.rs index 0dcf0f9..7cb3193 100644 --- a/crates/omnigraph/src/db/omnigraph/schema_apply.rs +++ b/crates/omnigraph/src/db/omnigraph/schema_apply.rs @@ -48,57 +48,24 @@ pub(super) async fn plan_schema( Ok(plan) } -pub(super) async fn apply_schema( - db: &Omnigraph, - desired_schema_source: &str, - options: SchemaApplyOptions, - actor: Option<&str>, -) -> Result { - // Engine-layer policy gate (MR-722 chassis core). - // - // Fires BEFORE acquiring the schema-apply lock or doing any other - // work. When no PolicyChecker is installed this is a no-op and - // the apply path behaves exactly as it did before MR-722. When - // a PolicyChecker IS installed and the actor is None, this is a - // hard error — see Omnigraph::enforce's docstring for the - // forget-the-actor-footgun reasoning. - // - // Scope is TargetBranch("main") to match the HTTP-layer convention - // for SchemaApply: branch=None, target_branch=Some("main"). Cedar - // policies in the wild use `target_branch_scope: protected` to - // gate schema applies, so the engine-layer call has to set the - // target_branch shape that activates that predicate. Wrong scope - // here = silent policy mismatch with HTTP. See - // `omnigraph_policy::ResourceScope::to_branch_pair` for the mapping. 
- db.enforce( - omnigraph_policy::PolicyAction::SchemaApply, - &omnigraph_policy::ResourceScope::TargetBranch("main".to_string()), - actor, - )?; - - acquire_schema_apply_lock(db).await?; - let result = apply_schema_with_lock(db, desired_schema_source, options).await; - let release_result = release_schema_apply_lock(db).await; - match (result, release_result) { - (Ok(result), Ok(())) => Ok(result), - (Ok(_), Err(err)) => Err(err), - (Err(err), Ok(())) => Err(err), - (Err(err), Err(_)) => Err(err), - } +struct PlannedSchemaApply { + plan: SchemaMigrationPlan, + desired_ir: SchemaIR, + desired_catalog: Catalog, } -pub(super) async fn apply_schema_with_lock( +async fn plan_schema_for_apply( db: &Omnigraph, desired_schema_source: &str, options: SchemaApplyOptions, -) -> Result { +) -> Result { db.ensure_schema_state_valid().await?; let branches = db.coordinator.read().await.all_branches().await?; - // Skip `main` and internal system branches. The schema-apply lock branch - // is excluded because it is the cluster-wide schema-apply serializer. - // `__run__*` branches are no longer created; the filter remains as - // defense-in-depth for legacy graphs with leftover staging branches. - // A future production sweep will let this guard go. + // Skip `main` and internal system branches (the schema-apply lock branch, + // the cluster-wide schema-apply serializer). Legacy `__run__*` staging + // branches were swept off `__manifest` by the v2→v3 migration that runs in + // `Omnigraph::open(ReadWrite)` before this check (MR-770), so they no + // longer appear here. 
let blocking_branches = branches .into_iter() .filter(|branch| branch != "main" && !is_internal_system_branch(branch)) @@ -123,6 +90,87 @@ pub(super) async fn apply_schema_with_lock( .unwrap_or_else(|| "unsupported schema migration plan".to_string()); return Err(OmniError::manifest(message)); } + + let mut desired_catalog = build_catalog_from_ir(&desired_ir)?; + fixup_blob_schemas(&mut desired_catalog); + Ok(PlannedSchemaApply { + plan, + desired_ir, + desired_catalog, + }) +} + +pub(super) async fn preview_schema_apply( + db: &Omnigraph, + desired_schema_source: &str, + options: SchemaApplyOptions, +) -> Result { + let planned = plan_schema_for_apply(db, desired_schema_source, options).await?; + Ok(SchemaApplyPreview { + plan: planned.plan, + catalog: planned.desired_catalog, + }) +} + +pub(super) async fn apply_schema( + db: &Omnigraph, + desired_schema_source: &str, + options: SchemaApplyOptions, + actor: Option<&str>, + validate_catalog: F, +) -> Result +where + F: FnOnce(&Catalog) -> Result<()>, +{ + // Engine-layer policy gate (MR-722 chassis core). + // + // Fires BEFORE acquiring the schema-apply lock or doing any other + // work. When no PolicyChecker is installed this is a no-op and + // the apply path behaves exactly as it did before MR-722. When + // a PolicyChecker IS installed and the actor is None, this is a + // hard error β€” see Omnigraph::enforce's docstring for the + // forget-the-actor-footgun reasoning. + // + // Scope is TargetBranch("main") to match the HTTP-layer convention + // for SchemaApply: branch=None, target_branch=Some("main"). Cedar + // policies in the wild use `target_branch_scope: protected` to + // gate schema applies, so the engine-layer call has to set the + // target_branch shape that activates that predicate. Wrong scope + // here = silent policy mismatch with HTTP. See + // `omnigraph_policy::ResourceScope::to_branch_pair` for the mapping. 
+ db.enforce( + omnigraph_policy::PolicyAction::SchemaApply, + &omnigraph_policy::ResourceScope::TargetBranch("main".to_string()), + actor, + )?; + + acquire_schema_apply_lock(db).await?; + let result = apply_schema_with_lock(db, desired_schema_source, options, validate_catalog).await; + let release_result = release_schema_apply_lock(db).await; + match (result, release_result) { + (Ok(result), Ok(())) => Ok(result), + (Ok(_), Err(err)) => Err(err), + (Err(err), Ok(())) => Err(err), + (Err(err), Err(_)) => Err(err), + } +} + +pub(super) async fn apply_schema_with_lock( + db: &Omnigraph, + desired_schema_source: &str, + options: SchemaApplyOptions, + validate_catalog: F, +) -> Result +where + F: FnOnce(&Catalog) -> Result<()>, +{ + let planned = plan_schema_for_apply(db, desired_schema_source, options).await?; + validate_catalog(&planned.desired_catalog)?; + let PlannedSchemaApply { + plan, + desired_ir, + desired_catalog, + } = planned; if plan.steps.is_empty() { return Ok(SchemaApplyResult { supported: true, @@ -132,9 +180,6 @@ pub(super) async fn apply_schema_with_lock( }); } - let mut desired_catalog = build_catalog_from_ir(&desired_ir)?; - fixup_blob_schemas(&mut desired_catalog); - let snapshot = db.snapshot().await; let base_manifest_version = snapshot.version(); let mut added_tables = BTreeSet::new(); diff --git a/crates/omnigraph/src/db/omnigraph/table_ops.rs b/crates/omnigraph/src/db/omnigraph/table_ops.rs index 0e89c45..3ed9c43 100644 --- a/crates/omnigraph/src/db/omnigraph/table_ops.rs +++ b/crates/omnigraph/src/db/omnigraph/table_ops.rs @@ -483,6 +483,22 @@ pub(super) async fn open_owned_dataset_for_branch_write( Ok((ds, Some(active_branch.to_string()))) } source_branch => { + crate::failpoints::maybe_fail("fork.before_classify")?; + // Authority check before forking: re-read the live manifest. 
If this + // table is already forked on active_branch, a concurrent first-write + // won the race and our snapshot is stale — that is a retryable + // conflict, not an orphan. (A zombie fork is never in the manifest, + // so this only fires for a live concurrent fork.) + let live = db.snapshot_for_branch(Some(active_branch)).await?; + if let Some(entry) = live.entry(table_key) { + if entry.table_branch.as_deref() == Some(active_branch) { + return Err(OmniError::manifest_expected_version_mismatch( + table_key, + entry_version, + entry.table_version, + )); + } + } fork_dataset_from_entry_state( db, table_key, diff --git a/crates/omnigraph/src/db/run_registry.rs b/crates/omnigraph/src/db/run_registry.rs deleted file mode 100644 index ee3d336..0000000 --- a/crates/omnigraph/src/db/run_registry.rs +++ /dev/null @@ -1,16 +0,0 @@ -// The Run state machine has been removed. Mutations now write directly -// to target tables and use the publisher's `expected_table_versions` -// CAS for cross-table OCC; `__run__` staging branches and the -// `_graph_runs.lance` state machine no longer exist. -// -// What remains is the branch-name predicate, kept as a defense-in-depth -// guard against users naming a public branch `__run__*`. A future -// production sweep of legacy `_graph_runs.lance` rows and stale -// `__run__*` branches will let this predicate (and this file) go too. 
- -pub(crate) const INTERNAL_RUN_BRANCH_PREFIX: &str = "__run__"; - -pub(crate) fn is_internal_run_branch(name: &str) -> bool { - name.trim_start_matches('/') - .starts_with(INTERNAL_RUN_BRANCH_PREFIX) -} diff --git a/crates/omnigraph/src/exec/merge.rs b/crates/omnigraph/src/exec/merge.rs index 2e5f32e..eb6c4a3 100644 --- a/crates/omnigraph/src/exec/merge.rs +++ b/crates/omnigraph/src/exec/merge.rs @@ -1087,9 +1087,9 @@ impl Omnigraph { target: &str, actor_id: Option<&str>, ) -> Result { - if is_internal_run_branch(source) || is_internal_run_branch(target) { + if is_internal_system_branch(source) || is_internal_system_branch(target) { return Err(OmniError::manifest(format!( - "branch_merge does not allow internal run refs ('{}' -> '{}')", + "branch_merge does not allow internal system refs ('{}' -> '{}')", source, target ))); } diff --git a/crates/omnigraph/src/exec/mod.rs b/crates/omnigraph/src/exec/mod.rs index 33a7e41..ce72d42 100644 --- a/crates/omnigraph/src/exec/mod.rs +++ b/crates/omnigraph/src/exec/mod.rs @@ -35,7 +35,7 @@ use time::format_description::well_known::Rfc3339; use crate::db::commit_graph::CommitGraph; use crate::db::manifest::ManifestCoordinator; -use crate::db::{MergeOutcome, Omnigraph, is_internal_run_branch}; +use crate::db::{MergeOutcome, Omnigraph, is_internal_system_branch}; use crate::db::{ReadTarget, Snapshot}; use crate::embedding::EmbeddingClient; use crate::error::{MergeConflict, MergeConflictKind, OmniError, Result}; diff --git a/crates/omnigraph/src/loader/mod.rs b/crates/omnigraph/src/loader/mod.rs index a7aae28..23fe4ef 100644 --- a/crates/omnigraph/src/loader/mod.rs +++ b/crates/omnigraph/src/loader/mod.rs @@ -288,21 +288,24 @@ async fn load_jsonl_reader( let mut node_rows: HashMap> = HashMap::new(); let mut edge_rows: HashMap> = HashMap::new(); - for (line_num, line) in reader.lines().enumerate() { - let line = line?; - let line = line.trim(); - if line.is_empty() { - continue; - } - let value: JsonValue = 
serde_json::from_str(line).map_err(|e| { - OmniError::manifest(format!("invalid JSON on line {}: {}", line_num + 1, e)) + // Parse a stream of JSON values. Accepts both compact JSONL (one object + // per line) and pretty-printed JSON where a single object spans multiple + // lines — serde's streaming deserializer treats any whitespace (including + // newlines) between top-level values as a separator. + for (idx, parsed) in serde_json::Deserializer::from_reader(reader) + .into_iter::() + .enumerate() + { + let record_num = idx + 1; + let value: JsonValue = parsed.map_err(|e| { + OmniError::manifest(format!("invalid JSON at record {}: {}", record_num, e)) })?; if let Some(type_name) = value.get("type").and_then(|v| v.as_str()) { if !catalog.node_types.contains_key(type_name) { return Err(OmniError::manifest(format!( - "line {}: unknown node type '{}'", - line_num + 1, + "record {}: unknown node type '{}'", + record_num, type_name ))); } @@ -317,8 +320,8 @@ async fn load_jsonl_reader( } else if let Some(edge_name) = value.get("edge").and_then(|v| v.as_str()) { if catalog.lookup_edge_by_name(edge_name).is_none() { return Err(OmniError::manifest(format!( - "line {}: unknown edge type '{}'", - line_num + 1, + "record {}: unknown edge type '{}'", + record_num, edge_name ))); } @@ -326,14 +329,14 @@ async fn load_jsonl_reader( .get("from") .and_then(|v| v.as_str()) .ok_or_else(|| { - OmniError::manifest(format!("line {}: edge missing 'from'", line_num + 1)) + OmniError::manifest(format!("record {}: edge missing 'from'", record_num)) })? .to_string(); let to = value .get("to") .and_then(|v| v.as_str()) .ok_or_else(|| { - OmniError::manifest(format!("line {}: edge missing 'to'", line_num + 1)) + OmniError::manifest(format!("record {}: edge missing 'to'", record_num)) })? 
.to_string(); let data = value @@ -347,8 +350,8 @@ async fn load_jsonl_reader( .push((from, to, data)); } else { return Err(OmniError::manifest(format!( - "line {}: expected 'type' or 'edge' field", - line_num + 1 + "record {}: expected 'type' or 'edge' field", + record_num ))); } } diff --git a/crates/omnigraph/src/table_store.rs b/crates/omnigraph/src/table_store.rs index 7643cbc..4b52db6 100644 --- a/crates/omnigraph/src/table_store.rs +++ b/crates/omnigraph/src/table_store.rs @@ -177,6 +177,45 @@ impl TableStore { .map_err(|e| OmniError::Lance(e.to_string())) } + /// List the named Lance branches present on the dataset at `dataset_uri`. + /// The `cleanup` orphan reconciler diffs this against the manifest branch + /// set to find orphaned per-table forks. `main`/default is not a named + /// branch and never appears here. + pub async fn list_branches(&self, dataset_uri: &str) -> Result> { + let ds = Dataset::open(dataset_uri) + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + let branches = ds + .list_branches() + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + Ok(branches.into_keys().collect()) + } + + /// Idempotently drop `branch` from the dataset at `dataset_uri`. + /// + /// Unlike [`delete_branch`](Self::delete_branch), this tolerates an + /// already-absent branch β€” both a missing contents ref (Lance's + /// `force_delete_branch` handles that) and a missing `tree/{branch}/` + /// directory (the local-store `NotFound` quirk pinned by + /// `lance_surface_guards::force_delete_branch_semantics`). Safe to call on a + /// possibly-orphaned or already-reclaimed fork. + /// + /// A branch that still has referencing descendants (`RefConflict`) is NOT + /// tolerated: that is a real ordering error and surfaces as `OmniError::Lance`. + /// Used by the eager best-effort reclaim in `cleanup_deleted_branch_tables` + /// and the `cleanup` orphan reconciler. 
+ pub async fn force_delete_branch(&self, dataset_uri: &str, branch: &str) -> Result<()> { + let mut ds = Dataset::open(dataset_uri) + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + match ds.force_delete_branch(branch).await { + Ok(()) => Ok(()), + Err(lance::Error::RefNotFound { .. }) | Err(lance::Error::NotFound { .. }) => Ok(()), + Err(e) => Err(OmniError::Lance(e.to_string())), + } + } + pub async fn open_dataset_at_state( &self, table_path: &str, @@ -243,21 +282,24 @@ impl TableStore { .map_err(|e| OmniError::Lance(e.to_string()))?; self.ensure_expected_version(&source_ds, table_key, source_version)?; - match source_ds + if source_ds .create_branch(target_branch, source_version, None) .await + .is_err() { - Ok(_) => {} - Err(create_err) => match self - .open_dataset_head(dataset_uri, Some(target_branch)) - .await - { - Ok(ds) => { - self.ensure_expected_version(&ds, table_key, source_version)?; - return Ok(ds); - } - Err(_) => return Err(OmniError::Lance(create_err.to_string())), - }, + // The target branch ref already exists. The caller + // (`open_owned_dataset_for_branch_write`) re-reads the live manifest + // before forking and returns a retryable error when a concurrent + // writer legitimately holds the fork, so reaching here means the + // manifest does NOT reference this fork: it is an orphan from an + // incomplete prior `branch_delete`. Surface the actionable cleanup + // error rather than guessing from Lance branch versions. 
+ return Err(OmniError::manifest_conflict(format!( + "branch '{}' has orphaned table state for '{}' from an incomplete \ + prior delete; run `omnigraph cleanup` to reclaim it before reusing \ + this branch name", + target_branch, table_key + ))); } let ds = self diff --git a/crates/omnigraph/tests/failpoints.rs b/crates/omnigraph/tests/failpoints.rs index 5ea71c5..149c63a 100644 --- a/crates/omnigraph/tests/failpoints.rs +++ b/crates/omnigraph/tests/failpoints.rs @@ -41,6 +41,452 @@ async fn branch_create_failpoint_triggers() { ); } +// Branch delete flips the manifest authority first, then reclaims the per-table +// forks best-effort. A failure during that reclaim (here, the +// `branch_delete.before_table_cleanup` failpoint, standing in for a transient +// object-store error) must NOT fail the call: the branch is already gone, and +// `cleanup` reconciles the stranded fork. The branch name is reusable after. +#[tokio::test] +async fn branch_delete_partial_failure_converges_via_cleanup() { + let _scenario = FailScenario::setup(); + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap().to_string(); + let mut main = helpers::init_and_load(&dir).await; + + main.branch_create("feature").await.unwrap(); + let mut feature = Omnigraph::open(&uri).await.unwrap(); + helpers::mutate_branch( + &mut feature, + "feature", + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "Eve")], &[("$age", 22)]), + ) + .await + .unwrap(); + drop(feature); + + let person_uri = node_table_uri(&uri, "Person"); + { + let ds = lance::Dataset::open(&person_uri).await.unwrap(); + assert!( + ds.list_branches().await.unwrap().contains_key("feature"), + "precondition: the owned table fork exists before delete" + ); + } + + // Inject a failure during per-table cleanup, AFTER the manifest authority + // flip. branch_delete must still succeed (best-effort reclaim). 
+ { + let _fp = ScopedFailPoint::new("branch_delete.before_table_cleanup", "return"); + main.branch_delete("feature").await.expect( + "branch_delete is best-effort after the manifest flip: a cleanup-step \ + failure must not fail the call", + ); + } + + // Authority flipped: the branch is gone. + assert_eq!(main.branch_list().await.unwrap(), vec!["main".to_string()]); + + // The eager reclaim failed, so the orphan is stranded until cleanup. + { + let ds = lance::Dataset::open(&person_uri).await.unwrap(); + assert!( + ds.list_branches().await.unwrap().contains_key("feature"), + "failed eager reclaim should leave the orphan for cleanup to reconcile" + ); + } + + // cleanup converges: the orphan is reclaimed. + main.cleanup(omnigraph::db::CleanupPolicyOptions { + keep_versions: Some(1), + older_than: None, + }) + .await + .unwrap(); + { + let ds = lance::Dataset::open(&person_uri).await.unwrap(); + assert!( + !ds.list_branches().await.unwrap().contains_key("feature"), + "cleanup should reconcile the orphaned fork away" + ); + } + + // The name is reusable after cleanup reclaims the orphan. + main.branch_create("feature").await.unwrap(); + let mut feature2 = Omnigraph::open(&uri).await.unwrap(); + helpers::mutate_branch( + &mut feature2, + "feature", + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "Frank")], &[("$age", 41)]), + ) + .await + .unwrap(); +} + +// Reusing a branch name whose delete left an orphaned fork (before `cleanup` +// reconciles it) must fail with a clear, actionable error pointing at +// `cleanup`, not the opaque `ExpectedVersionMismatch` that leaks from the fork +// path. The recreate itself succeeds; the first write to the previously-forked +// table is where the stale orphan collides. 
+#[tokio::test] +async fn recreate_over_orphaned_fork_before_cleanup_is_actionable() { + let _scenario = FailScenario::setup(); + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap().to_string(); + let mut main = helpers::init_and_load(&dir).await; + + main.branch_create("feature").await.unwrap(); + let mut feature = Omnigraph::open(&uri).await.unwrap(); + helpers::mutate_branch( + &mut feature, + "feature", + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "Eve")], &[("$age", 22)]), + ) + .await + .unwrap(); + drop(feature); + + // Partial delete: leaves the Person fork orphaned (cleanup not yet run). + { + let _fp = ScopedFailPoint::new("branch_delete.before_table_cleanup", "return"); + main.branch_delete("feature").await.unwrap(); + } + + // Recreate the name and write to the previously-forked table WITHOUT a + // cleanup in between. + main.branch_create("feature").await.unwrap(); + let mut feature2 = Omnigraph::open(&uri).await.unwrap(); + let err = helpers::mutate_branch( + &mut feature2, + "feature", + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "Frank")], &[("$age", 41)]), + ) + .await + .expect_err("write should collide with the stale orphaned fork"); + + let msg = err.to_string(); + assert!( + msg.contains("cleanup") + && (msg.contains("orphan") || msg.contains("incomplete prior delete")), + "expected an actionable orphaned-fork error pointing at cleanup, got: {msg}" + ); + assert!( + !msg.contains("expected manifest table version"), + "should not surface the opaque ExpectedVersionMismatch, got: {msg}" + ); +} + +// cleanup is the guaranteed convergence backstop, so one table's transient +// failure must not abort the whole sweep. Inject a one-shot version-GC failure +// for a single table and assert: cleanup still succeeds, the failure is +// surfaced per-table in the returned stats, and the independent reconcile pass +// still reclaimed an orphan. 
+#[tokio::test] +async fn cleanup_isolates_single_table_failure() { + let _scenario = FailScenario::setup(); + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap().to_string(); + let mut db = helpers::init_and_load(&dir).await; + + // Forge an orphaned fork on the Person table (a reconcile target). + let person_uri = node_table_uri(&uri, "Person"); + { + let mut ds = lance::Dataset::open(&person_uri).await.unwrap(); + let base = ds.version().version; + ds.create_branch("ghost", base, None).await.unwrap(); + } + + // One table's version GC fails once; the sweep must isolate it. + let _fp = ScopedFailPoint::new("cleanup.table_gc", "1*return"); + let stats = db + .cleanup(omnigraph::db::CleanupPolicyOptions { + keep_versions: Some(1), + older_than: None, + }) + .await + .expect("a single table's GC failure must not abort cleanup"); + + let errored = stats.iter().filter(|s| s.error.is_some()).count(); + assert_eq!( + errored, 1, + "exactly one table's GC failure should be surfaced in stats, got {errored}" + ); + assert!( + stats.len() >= 4, + "every node+edge table should still appear in the stats" + ); + + // The reconcile pass is independent of the GC failure, so the orphan is gone. + { + let ds = lance::Dataset::open(&person_uri).await.unwrap(); + assert!( + !ds.list_branches().await.unwrap().contains_key("ghost"), + "reconcile should reclaim the orphan despite the GC failure" + ); + } +} + +// Companion to the version-GC isolation test, exercising the OTHER cleanup +// loop: a force-delete failure while reconciling one orphaned fork must be +// isolated (logged, not propagated) so the sweep continues, and a later +// cleanup converges. This is the loop the Devin finding was about. 
+#[tokio::test] +async fn cleanup_isolates_reconcile_failure() { + let _scenario = FailScenario::setup(); + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap().to_string(); + let mut db = helpers::init_and_load(&dir).await; + + // Forge an orphaned fork the reconcile pass will try to reclaim. + let person_uri = node_table_uri(&uri, "Person"); + { + let mut ds = lance::Dataset::open(&person_uri).await.unwrap(); + let base = ds.version().version; + ds.create_branch("ghost", base, None).await.unwrap(); + } + + // Inject a one-shot failure into the reconcile force-delete. The sweep must + // not abort. + { + let _fp = ScopedFailPoint::new("cleanup.reconcile_fork", "1*return"); + db.cleanup(omnigraph::db::CleanupPolicyOptions { + keep_versions: Some(1), + older_than: None, + }) + .await + .expect("a reconcile force-delete failure must not abort cleanup"); + } + // The blocked orphan is still present (the failure was isolated, not retried). + { + let ds = lance::Dataset::open(&person_uri).await.unwrap(); + assert!( + ds.list_branches().await.unwrap().contains_key("ghost"), + "the orphan whose reclaim was injected-to-fail should remain" + ); + } + // A second cleanup with no injected failure converges. + db.cleanup(omnigraph::db::CleanupPolicyOptions { + keep_versions: Some(1), + older_than: None, + }) + .await + .unwrap(); + { + let ds = lance::Dataset::open(&person_uri).await.unwrap(); + assert!( + !ds.list_branches().await.unwrap().contains_key("ghost"), + "the second cleanup should reconcile the orphan" + ); + } +} + +// The cleanup reconciler must reclaim orphaned commit-graph branches, not just +// per-table forks. A delete whose best-effort commit-graph reclaim fails leaves +// a commit-graph orphan; the next cleanup must drop it. 
+#[tokio::test] +async fn cleanup_reclaims_orphaned_commit_graph_branch() { + let _scenario = FailScenario::setup(); + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap().to_string(); + let mut db = helpers::init_and_load(&dir).await; + + db.branch_create("feature").await.unwrap(); + // Delete, failing the commit-graph reclaim → commit-graph "feature" orphan + // (manifest branch gone, commit-graph branch left behind). + { + let _fp = ScopedFailPoint::new("branch_delete.before_commit_graph_reclaim", "return"); + db.branch_delete("feature").await.unwrap(); + } + + let commits_uri = format!("{}/_graph_commits.lance", uri.trim_end_matches('/')); + { + let ds = lance::Dataset::open(&commits_uri).await.unwrap(); + assert!( + ds.list_branches().await.unwrap().contains_key("feature"), + "precondition: the commit-graph branch should be orphaned after the failed reclaim" + ); + } + + db.cleanup(omnigraph::db::CleanupPolicyOptions { + keep_versions: Some(1), + older_than: None, + }) + .await + .unwrap(); + + { + let ds = lance::Dataset::open(&commits_uri).await.unwrap(); + assert!( + !ds.list_branches().await.unwrap().contains_key("feature"), + "cleanup should reclaim the orphaned commit-graph branch" + ); + } +} + +// A branch_delete whose best-effort commit-graph reclaim fails leaves a +// commit-graph "zombie" branch. Recreating that name must heal the zombie and +// succeed (branch_create force-deletes a stale commit-graph ref since the +// manifest branch is created fresh), instead of dying on the leftover ref. 
+#[tokio::test] +async fn branch_create_recreates_over_commit_graph_zombie() { + let _scenario = FailScenario::setup(); + let dir = tempfile::tempdir().unwrap(); + let db = Omnigraph::init(dir.path().to_str().unwrap(), helpers::TEST_SCHEMA) + .await + .unwrap(); + + db.branch_create("feature").await.unwrap(); + { + // Fail the best-effort commit-graph reclaim → commit-graph "feature" + // zombie survives the delete (manifest authority still flips). + let _fp = ScopedFailPoint::new("branch_delete.before_commit_graph_reclaim", "return"); + db.branch_delete("feature").await.unwrap(); + } + assert_eq!(db.branch_list().await.unwrap(), vec!["main".to_string()]); + + db.branch_create("feature") + .await + .expect("branch_create should heal the zombie commit-graph branch and succeed"); + assert!( + db.branch_list() + .await + .unwrap() + .contains(&"feature".to_string()) + ); +} + +// branch_create is authority-then-derived: if the derived commit-graph branch +// cannot be created, the manifest branch (the authority) must be rolled back so +// the branch does not half-exist. The existing failpoint fires right after the +// manifest create, standing in for any post-authority failure. +#[tokio::test] +async fn branch_create_rolls_back_manifest_on_commit_graph_failure() { + let _scenario = FailScenario::setup(); + let dir = tempfile::tempdir().unwrap(); + let db = Omnigraph::init(dir.path().to_str().unwrap(), helpers::TEST_SCHEMA) + .await + .unwrap(); + + let err = { + let _fp = ScopedFailPoint::new("branch_create.after_manifest_branch_create", "return"); + db.branch_create("feature").await.unwrap_err() + }; + assert!( + !db.branch_list() + .await + .unwrap() + .contains(&"feature".to_string()), + "branch_create must roll back the manifest branch when the derived \ + commit-graph branch fails, got error: {err}" + ); +} + +// A fork collision must be classified by the manifest authority, not by Lance +// branch versions. 
When a concurrent first-write legitimately wins the fork +// race, the loser sees a version mismatch — but that is a stale snapshot, not +// an orphan, so it must be a retryable "refresh and retry", never a misleading +// "run cleanup". +// +// Ordering is made deterministic (no sleeps) via a callback at the fork point: +// `compare_exchange` lets only the FIRST arrival (writer A) record readiness and +// block until released; later arrivals (writer B) fall through. The test waits +// on the readiness flag, lets B win and commit the fork, then releases A. +static FORK_A_AT_POINT: std::sync::atomic::AtomicBool = std::sync::atomic::AtomicBool::new(false); +static FORK_RELEASE_A: std::sync::atomic::AtomicBool = std::sync::atomic::AtomicBool::new(false); + +#[tokio::test(flavor = "multi_thread")] +async fn fork_collision_with_live_concurrent_fork_is_retryable() { + use std::sync::atomic::Ordering::SeqCst; + + let _scenario = FailScenario::setup(); + FORK_A_AT_POINT.store(false, SeqCst); + FORK_RELEASE_A.store(false, SeqCst); + + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap().to_string(); + let main = helpers::init_and_load(&dir).await; + main.branch_create("feature").await.unwrap(); + + // First arrival (A) records readiness and blocks until released; the rest + // (B) fall through immediately. Bounded spin so a mistake can't hang forever. 
+ fail::cfg_callback("fork.before_classify", || { + if FORK_A_AT_POINT + .compare_exchange(false, true, SeqCst, SeqCst) + .is_ok() + { + for _ in 0..2000 { + if FORK_RELEASE_A.load(SeqCst) { + break; + } + std::thread::sleep(std::time::Duration::from_millis(5)); + } + } + }) + .unwrap(); + + let uri_a = uri.clone(); + let writer_a = tokio::spawn(async move { + let mut a = Omnigraph::open(&uri_a).await.unwrap(); + helpers::mutate_branch( + &mut a, + "feature", + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "Eve")], &[("$age", 22)]), + ) + .await + }); + + // Wait (bounded) until A is parked at the fork point. + for _ in 0..600 { + if FORK_A_AT_POINT.load(SeqCst) { + break; + } + tokio::time::sleep(std::time::Duration::from_millis(5)).await; + } + assert!( + FORK_A_AT_POINT.load(SeqCst), + "writer A never reached the fork point" + ); + + // B wins the fork and commits it. + let mut b = Omnigraph::open(&uri).await.unwrap(); + helpers::mutate_branch( + &mut b, + "feature", + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "Frank")], &[("$age", 41)]), + ) + .await + .unwrap(); + + // Release A; it resumes, re-reads the manifest, and sees the fork is live. 
+ FORK_RELEASE_A.store(true, SeqCst); + let err = writer_a + .await + .unwrap() + .expect_err("A's stale-snapshot fork should be a retryable conflict"); + fail::remove("fork.before_classify"); + + let msg = err.to_string(); + assert!( + !msg.contains("cleanup"), + "a live concurrent fork must not be misclassified as an orphan, got: {msg}" + ); + assert!( + msg.contains("refresh and retry") || msg.contains("expected manifest table version"), + "expected a retryable stale-view error, got: {msg}" + ); +} + #[tokio::test(flavor = "multi_thread")] async fn graph_publish_failpoint_triggers_before_commit_append() { let _scenario = FailScenario::setup(); diff --git a/crates/omnigraph/tests/lance_surface_guards.rs b/crates/omnigraph/tests/lance_surface_guards.rs index b65a808..1d60c08 100644 --- a/crates/omnigraph/tests/lance_surface_guards.rs +++ b/crates/omnigraph/tests/lance_surface_guards.rs @@ -242,3 +242,136 @@ async fn _compile_delete_result_field_shape() -> lance::Result<()> { let _num_deleted: u64 = result.num_deleted_rows; Ok(()) } + +// --- Guard 9: force_delete_branch semantics -------------------------------- +// +// The branch-delete reconciler (`db/omnigraph/optimize.rs::reconcile_orphaned_branches`) +// and the eager best-effort reclaim in `cleanup_deleted_branch_tables` call +// `force_delete_branch` to drop orphaned branch refs. The single-authority +// design relies on three facts pinned here: +// 1. plain `delete_branch` errors on a missing ref (so the design uses the +// force variant instead); +// 2. `force_delete_branch` removes an existing (forked) branch β€” the orphan +// case, where a `tree/{branch}/` exists; +// 3. `force_delete_branch` on a *fully-absent* branch (no tree dir) still +// errors on the local store, because `remove_dir_all`'s NotFound is not +// caught for Lance's native error variant. `TableStore::force_delete_branch` +// wraps this to be fully idempotent. 
Pin the raw quirk so a future Lance +// fix (which would let us simplify the wrapper) is noticed. + +#[tokio::test] +async fn force_delete_branch_semantics() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().join("guard9.lance"); + let uri = uri.to_str().unwrap(); + let mut ds = fresh_dataset(uri).await; + + // (1) Plain delete of a never-created branch errors (RefNotFound). + assert!( + ds.delete_branch("nope").await.is_err(), + "Dataset::delete_branch on a missing ref should error; if this is now \ + Ok, the reconciler could drop the force variant." + ); + + // (2) force_delete_branch removes an existing (forked) branch. + let base = ds.version().version; + ds.create_branch("feature", base, None).await.unwrap(); + ds.force_delete_branch("feature").await.unwrap(); + assert!( + !ds.list_branches().await.unwrap().contains_key("feature"), + "force_delete_branch should remove an existing branch ref" + ); + + // (3) Quirk: force_delete on a fully-absent branch errors on the local + // store (worked around by TableStore::force_delete_branch). + assert!( + ds.force_delete_branch("never").await.is_err(), + "force_delete_branch on a fully-absent branch no longer errors β€” \ + TableStore::force_delete_branch's NotFound tolerance can be simplified." + ); +} + +// --- Guard 10: blob-column compaction is still broken in this Lance -------- +// +// `db/omnigraph/optimize.rs` skips tables with blob columns while +// `LANCE_SUPPORTS_BLOB_COMPACTION = false`: Lance `compact_files` forces +// `BlobHandling::AllBinary`, and the blob-v2 struct decoder mis-counts columns +// ("more fields in the schema than provided column indices"), failing even a +// pristine uniform-V2_2 multi-fragment blob table. Reads are unaffected (they +// use descriptor handling). 
+// +// WHEN THIS TEST TURNS RED (compact_files no longer errors), the Lance bug is +// fixed: flip `LANCE_SUPPORTS_BLOB_COMPACTION` to true in optimize.rs, drop the +// blob-skip branch + the `optimize_skips_blob_table_and_reports_skip` +// skip assertions in maintenance.rs, and re-pin docs/dev/lance.md. + +#[tokio::test] +async fn compact_files_still_fails_on_blob_columns() { + use arrow_array::{LargeBinaryArray, StructArray}; + + fn blob_batch(start: i32, n: i32) -> RecordBatch { + let ids: Vec = (start..start + n).map(|i| format!("n{i}")).collect(); + let data = + LargeBinaryArray::from_iter_values((start..start + n).map(|i| format!("blob{i}"))); + let blob_uri = StringArray::from(vec![None::<&str>; n as usize]); + let DataType::Struct(fields) = lance::blob::blob_field("content", true).data_type().clone() + else { + unreachable!("blob_field is always a Struct"); + }; + let content = StructArray::new( + fields, + vec![Arc::new(data) as _, Arc::new(blob_uri) as _], + None, + ); + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Utf8, false), + lance::blob::blob_field("content", true), + ])); + RecordBatch::try_new( + schema, + vec![Arc::new(StringArray::from(ids)) as _, Arc::new(content) as _], + ) + .unwrap() + } + + async fn write(uri: &str, batch: RecordBatch, mode: WriteMode) { + let schema = batch.schema(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema); + // Blob v2 requires file version >= 2.2; without the pin the *write* + // would fail with a different error, masking the guard's intent. + let params = WriteParams { + mode, + enable_stable_row_ids: true, + data_storage_version: Some(LanceFileVersion::V2_2), + ..Default::default() + }; + Dataset::write(reader, uri, Some(params)).await.unwrap(); + } + + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().join("guard10-blob.lance"); + let uri = uri.to_str().unwrap(); + + // Uniform V2_2, two fragments β†’ forces compaction to actually rewrite. 
+ write(uri, blob_batch(0, 2), WriteMode::Create).await; + write(uri, blob_batch(100, 2), WriteMode::Append).await; + + let mut ds = Dataset::open(uri).await.unwrap(); + assert!( + ds.get_fragments().len() >= 2, + "guard needs a multi-fragment table to trigger a real compaction rewrite" + ); + + let result = compact_files(&mut ds, CompactionOptions::default(), None).await; + let err = result.expect_err( + "compact_files unexpectedly SUCCEEDED on a blob table β€” the Lance blob-v2 \ + compaction bug is fixed. Flip LANCE_SUPPORTS_BLOB_COMPACTION to true in \ + db/omnigraph/optimize.rs, remove the blob-skip branch, and re-pin docs/dev/lance.md.", + ); + assert!( + err.to_string() + .contains("more fields in the schema than provided column indices"), + "blob compaction failed with an unexpected error (Lance internals may have \ + shifted): {err}" + ); +} diff --git a/crates/omnigraph/tests/maintenance.rs b/crates/omnigraph/tests/maintenance.rs index 3c6ab30..3e61677 100644 --- a/crates/omnigraph/tests/maintenance.rs +++ b/crates/omnigraph/tests/maintenance.rs @@ -7,11 +7,24 @@ mod helpers; use std::time::Duration; -use omnigraph::db::{CleanupPolicyOptions, Omnigraph}; +use lance::Dataset; +use omnigraph::db::{CleanupPolicyOptions, Omnigraph, SkipReason}; use omnigraph::loader::{LoadMode, load_jsonl}; use helpers::{TEST_DATA, TEST_SCHEMA, count_rows, init_and_load}; +/// Filesystem URI of a node sub-table, mirroring the engine's layout +/// (FNV-1a of the type name under `nodes/`). Matches the helper in +/// `failpoints.rs`; used to inspect/forge Lance branches directly in tests. 
+fn node_table_uri(root: &str, type_name: &str) -> String { + let mut hash: u64 = 0xcbf2_9ce4_8422_2325; + for &b in type_name.as_bytes() { + hash ^= b as u64; + hash = hash.wrapping_mul(0x100_0000_01b3); + } + format!("{}/nodes/{hash:016x}", root.trim_end_matches('/')) +} + #[tokio::test] async fn optimize_on_empty_graph_returns_stats_per_table_with_no_changes() { let dir = tempfile::tempdir().unwrap(); @@ -59,6 +72,97 @@ async fn optimize_after_load_then_again_is_idempotent() { } } +// Regression: `optimize` must not crash on a graph that has a `Blob` table. +// +// Lance `compact_files` forces `BlobHandling::AllBinary`, which mis-decodes +// blob-v2 columns ("more fields in the schema than provided column indices"), +// failing even a pristine uniform-V2_2 multi-fragment blob table. `optimize` +// must skip blob-bearing tables (and report the skip) rather than aborting the +// whole sweep. +// +// Before the skip fix, `optimize()` returned that Lance error here and aborted +// the whole sweep; it now skips the blob table (`doc.skipped == Some(..)`) +// while the sibling non-blob `Tag` table still compacts. The skip is gated by +// `LANCE_SUPPORTS_BLOB_COMPACTION`; the surface guard +// `compact_files_still_fails_on_blob_columns` flags when the upstream Lance fix +// makes the skip (and this test's blob arm) removable. +#[tokio::test] +async fn optimize_skips_blob_table_and_reports_skip() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + // One Blob node type (`Doc`) + one plain node type (`Tag`): proves the blob + // table is skipped while a non-blob table in the same sweep still compacts. + let schema = "\ +node Doc {\n slug: String @key\n content: Blob\n}\n\ +node Tag {\n slug: String @key\n}\n"; + let mut db = Omnigraph::init(uri, schema).await.unwrap(); + + // Multi-fragment blob table: Overwrite creates fragment 1; each Merge of + // new keys appends another. 
A >=2-fragment blob table is exactly what + // crashes `compact_files` today (single fragment would no-op and not crash). + load_jsonl( + &mut db, + "{\"type\":\"Doc\",\"data\":{\"slug\":\"d1\",\"content\":\"base64:aGVsbG8x\"}}\n{\"type\":\"Doc\",\"data\":{\"slug\":\"d2\",\"content\":\"base64:aGVsbG8y\"}}", + LoadMode::Overwrite, + ) + .await + .unwrap(); + load_jsonl( + &mut db, + "{\"type\":\"Doc\",\"data\":{\"slug\":\"d3\",\"content\":\"base64:aGVsbG8z\"}}", + LoadMode::Merge, + ) + .await + .unwrap(); + load_jsonl( + &mut db, + "{\"type\":\"Doc\",\"data\":{\"slug\":\"d4\",\"content\":\"base64:aGVsbG80\"}}", + LoadMode::Merge, + ) + .await + .unwrap(); + // Plain table, also multi-fragment so it has something to compact. + load_jsonl( + &mut db, + "{\"type\":\"Tag\",\"data\":{\"slug\":\"t1\"}}\n{\"type\":\"Tag\",\"data\":{\"slug\":\"t2\"}}", + LoadMode::Merge, + ) + .await + .unwrap(); + load_jsonl( + &mut db, + "{\"type\":\"Tag\",\"data\":{\"slug\":\"t3\"}}", + LoadMode::Merge, + ) + .await + .unwrap(); + + let stats = db + .optimize() + .await + .expect("optimize must not crash on a graph with a Blob table"); + + let doc = stats + .iter() + .find(|s| s.table_key == "node:Doc") + .expect("Doc stat present"); + let tag = stats + .iter() + .find(|s| s.table_key == "node:Tag") + .expect("Tag stat present"); + // The blob table is skipped (and reported), not compacted. + assert_eq!( + doc.skipped, + Some(SkipReason::BlobColumnsUnsupportedByLance), + "blob table must be reported as skipped", + ); + assert!(!doc.committed, "skipped blob table is not compacted"); + assert_eq!(doc.fragments_removed, 0); + assert_eq!(doc.fragments_added, 0); + // The plain (non-blob) table is unaffected by the skip. 
+ assert_eq!(tag.skipped, None, "non-blob table must not be skipped"); +} + #[tokio::test] async fn cleanup_without_any_policy_option_errors() { let dir = tempfile::tempdir().unwrap(); @@ -158,3 +262,59 @@ async fn cleanup_then_optimize_preserves_rows_and_table_remains_writable() { .unwrap(); assert_eq!(count_rows(&db, "node:Person").await, people_before); } + +#[tokio::test] +async fn cleanup_reconciles_orphaned_branch_forks() { + // An incomplete prior `branch_delete` can leave a per-table Lance branch + // that the manifest no longer references (a "zombie" fork). It is + // unreachable through any snapshot but pins its `tree/{branch}/` storage. + // `cleanup` must reconcile it away: drop every Lance branch absent from the + // manifest authority, without touching `main`. + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap().to_string(); + let mut db = init_and_load(&dir).await; + + let people_before = count_rows(&db, "node:Person").await; + assert!(people_before > 0, "fixture should seed Person rows"); + + // Forge an orphaned fork the manifest never knew about. + let person_uri = node_table_uri(&uri, "Person"); + { + let mut ds = Dataset::open(&person_uri).await.unwrap(); + let base = ds.version().version; + ds.create_branch("ghost", base, None).await.unwrap(); + assert!( + ds.list_branches().await.unwrap().contains_key("ghost"), + "precondition: orphaned fork staged" + ); + } + + db.cleanup(CleanupPolicyOptions { + keep_versions: Some(1), + older_than: None, + }) + .await + .unwrap(); + + // Orphan reclaimed; main untouched. + { + let ds = Dataset::open(&person_uri).await.unwrap(); + assert!( + !ds.list_branches().await.unwrap().contains_key("ghost"), + "cleanup should reconcile the orphaned 'ghost' fork away" + ); + } + assert_eq!( + count_rows(&db, "node:Person").await, + people_before, + "cleanup must not disturb main while reconciling orphans" + ); + + // Idempotent: a second cleanup with the orphan already gone is a no-op. 
+ db.cleanup(CleanupPolicyOptions { + keep_versions: Some(1), + older_than: None, + }) + .await + .unwrap(); +} diff --git a/crates/omnigraph/tests/writes.rs b/crates/omnigraph/tests/writes.rs index 13cb10f..0a309c9 100644 --- a/crates/omnigraph/tests/writes.rs +++ b/crates/omnigraph/tests/writes.rs @@ -371,11 +371,10 @@ async fn cancelled_mutation_future_leaves_no_state() { // Cancel-safety property: no graph-level run/staging state remains. // - // Note: `branch_list()` already filters `__run__*` via - // `is_internal_system_branch`, so a runtime "no `__run__` branches" check - // would be vacuous. The structural property that no `__run__` branches - // can ever be created is enforced by deletion of `begin_run` etc. in - // (verified by the build itself — those symbols no longer exist). + // No `__run__` branches can ever be created: the Run state machine + // (`begin_run` etc.) was deleted in MR-771 — verified by the build itself, + // those symbols no longer exist. Any legacy `__run__*` branch on an + // upgraded graph is swept by the v2→v3 manifest migration. // // (1) The branch list is unchanged: cancellation/completion cannot // synthesize new public branches. @@ -442,34 +441,40 @@ async fn repeated_loads_do_not_accumulate_branches() { assert_eq!(db.branch_list().await.unwrap(), vec!["main".to_string()]); } -/// User code must not be able to write to internal `__run__*` names. -/// The branch-name guard predicate is kept as defense-in-depth; it -/// will be removed once a future production sweep retires the legacy -/// branches. +/// After MR-770, `__run__*` is an ordinary branch name — the Run state machine +/// and its `is_internal_run_branch` guard are gone. The surviving internal-ref +/// guard still rejects the active `__schema_apply_lock__` branch on the public +/// create/merge APIs. 
#[tokio::test] -async fn public_branch_apis_reject_internal_run_refs() { +async fn public_branch_apis_reject_internal_system_refs() { let dir = tempfile::tempdir().unwrap(); let mut db = init_and_load(&dir).await; - let create_err = db.branch_create("__run__synthetic").await.unwrap_err(); + // `__run__*` is no longer reserved β€” creating it now succeeds. + db.branch_create("__run__formerly_reserved") + .await + .expect("__run__ prefix is a normal branch name post-MR-770"); + + // The schema-apply lock branch is still rejected on public branch APIs. + let create_err = db.branch_create("__schema_apply_lock__").await.unwrap_err(); let OmniError::Manifest(err) = create_err else { panic!("expected Manifest error"); }; assert!( - err.message.contains("internal run ref"), + err.message.contains("internal system ref"), "unexpected error: {}", err.message ); let merge_err = db - .branch_merge("__run__synthetic", "main") + .branch_merge("__schema_apply_lock__", "main") .await .unwrap_err(); let OmniError::Manifest(err) = merge_err else { panic!("expected Manifest error"); }; assert!( - err.message.contains("internal run refs"), + err.message.contains("internal system refs"), "unexpected error: {}", err.message ); diff --git a/docs/dev/branch-protection.md b/docs/dev/branch-protection.md index 9b2fa78..2b6cc37 100644 --- a/docs/dev/branch-protection.md +++ b/docs/dev/branch-protection.md @@ -8,7 +8,7 @@ This page explains what the policy says and how to change it. | Setting | Value | Why | |---|---|---| -| **Required status checks (strict)** | `Classify Changes`, `Check AGENTS.md Links`, `Test Workspace`, `Test omnigraph-server --features aws`, `CODEOWNERS / drift`, `CODEOWNERS / noedit` | Every PR must pass workspace tests, AGENTS.md link integrity, and the CODEOWNERS hygiene checks. `strict: true` requires the branch to be up-to-date with `main` before merge. 
| +| **Required status checks (strict)** | `Classify Changes`, `Check AGENTS.md Links`, `Test Workspace`, `Test omnigraph-server --features aws`, `CODEOWNERS matches source`, `CODEOWNERS not hand-edited` | Every PR must pass workspace tests, AGENTS.md link integrity, and the CODEOWNERS hygiene checks. The two CODEOWNERS contexts must equal the job `name:` values in `.github/workflows/codeowners.yml` **verbatim** — a context naming a job that never reports (the old `CODEOWNERS / drift` used the job *id*, and the job was path-filtered) leaves every PR permanently pending and forces admin overrides. `strict: true` requires the branch to be up-to-date with `main` before merge. | | **Required approving reviews** | `1` | At least one reviewer. With a 2-person team, going higher would block all merges when one person is unavailable. | | **Require code-owner reviews** | `true` | The reviewer must be a code owner per `.github/CODEOWNERS`. This is what makes the codeowners chassis enforced. | | **Dismiss stale reviews on new commits** | `true` | A push after approval invalidates the prior review. Prevents the "approve, then sneak in unreviewed changes" pattern. | @@ -16,7 +16,7 @@ This page explains what the policy says and how to change it. | **Disallow force pushes** | `true` | No history rewrites on `main`. | | **Disallow branch deletions** | `true` | `main` cannot be deleted. | | **Required conversation resolution** | `true` | All review comment threads must be resolved before merge. | -| **Enforce on admins** | `true` | Even repository admins go through the gates. The point is no bypasses. | +| **Enforce on admins** | `false` | Admins can override the gates (`enforce_admins: false` in the JSON). This is the intended escape hatch for the 2-person team; tightening to `true` is tracked under hardening below. | | **Required signed commits** | not yet | Not enabled. Would lock out maintainers until everyone enrolls GPG/SSH commit signing. Tracked as a follow-up. 
| ## How to apply diff --git a/docs/dev/codeowners.md b/docs/dev/codeowners.md index 9a7fb50..50c4dc7 100644 --- a/docs/dev/codeowners.md +++ b/docs/dev/codeowners.md @@ -4,24 +4,45 @@ This setup gives every role change a reviewable PR and a permanent in-repository audit trail (`git log .github/codeowners-roles.yml`). -## Current roles +## Who owns what -| Role | Members | Scope | +The tables below are **generated** from `.github/codeowners-roles.yml` by `.github/scripts/render-codeowners.py` (the same render that produces `.github/CODEOWNERS`). They are the always-current "who owns what at this commit" view — don't edit them by hand; edit the yml and re-render. + + + +**Path → owners** (GitHub applies *last match wins*; the `*` catch-all is listed first and is overridden by the specific patterns below it): + +| Path | Owners | Role(s) | |---|---|---| -| `engineering` | `@ragnorc` | All code under `crates/**`, repository infrastructure, default for unmapped paths | -| `docs` | `@ragnorc` | `docs/**`, README.md, AGENTS.md, CLAUDE.md, SECURITY.md | +| `*` | @ragnorc @aaltshuler | engineering | +| `crates/**` | @ragnorc @aaltshuler | engineering | +| `docs/**` | @ragnorc | docs | +| `README.md` | @ragnorc | docs | +| `AGENTS.md` | @ragnorc | docs | +| `CLAUDE.md` | @ragnorc | docs | +| `SECURITY.md` | @ragnorc | docs | -GitHub treats multiple owners in a CODEOWNERS line as **"any one of them satisfies the review requirement"**. To require N distinct approvers on a specific path, layer a CI check on top (not currently configured). +**Roles**: + +| Role | Members | Description | +|---|---|---| +| `engineering` | @ragnorc @aaltshuler | All production code under crates/**. Engine, CLI, server, compiler. | +| `docs` | @ragnorc | Documentation under docs/**, plus repo-level docs (README.md, AGENTS.md, CLAUDE.md symlink, SECURITY.md). | + + + +GitHub treats multiple owners on a CODEOWNERS line as **"any one of them satisfies the review requirement"**. 
To require N distinct approvers on a specific path, layer a CI check on top (not currently configured). ## How to change role membership or path mappings 1. Edit `.github/codeowners-roles.yml`. -2. Run `python3 .github/scripts/render-codeowners.py` (requires PyYAML; `pip install pyyaml`). -3. Commit both files in the same PR. +2. Open a PR. **CI re-renders for you**: the `CODEOWNERS` workflow regenerates `.github/CODEOWNERS` and the ownership tables above and auto-commits them back to your PR branch on same-repository PRs — you don't have to run the script locally (though you can: `python3 .github/scripts/render-codeowners.py`, requires PyYAML). + +On a fork (where CI can't push back), the workflow instead fails with the diff so you can run the script and commit it yourself. CI fails the PR if: -- `CODEOWNERS` was edited without a corresponding yml change, or -- The yml was changed but the rendered `CODEOWNERS` doesn't match. +- a fork PR left a generated artifact out of sync, or +- `CODEOWNERS` was edited without a corresponding yml change (the `CODEOWNERS not hand-edited` check). ## How to add a new role diff --git a/docs/dev/index.md b/docs/dev/index.md index d9ba5e5..1e41342 100644 --- a/docs/dev/index.md +++ b/docs/dev/index.md @@ -51,6 +51,18 @@ constraints. User-facing behavior should still be documented through | Install and deployment packaging | [install.md](../user/install.md), [deployment.md](../user/deployment.md) | | Release history | [releases/](../releases/) | +## Contribution & Governance + +| Area | Read | +|---|---| +| How to contribute (external) | [CONTRIBUTING.md](../../CONTRIBUTING.md) | +| Governance model, roles, decision authority | [GOVERNANCE.md](../../GOVERNANCE.md) | +| Public contribution RFC track | [rfcs/](../rfcs/) | + +The `docs/rfcs/` track is the **public, externally-authorable** RFC process. The +maintainer/internal RFCs below (`rfc-00N-*.md`) are a separate, team-owned +track; don't conflate the two. 
+ ## Active Implementation Plans Working documents for in-flight feature work. Removed when the work lands. @@ -59,6 +71,8 @@ Working documents for in-flight feature work. Removed when the work lands. |---|---| | Schema-lint chassis v1 (MR-694) — `--allow-data-loss`, soft/hard drops | [schema-lint-v1-plan.md](schema-lint-v1-plan.md) | | Inline + stored queries, request/response envelope, MCP (MR-656 / MR-976 / MR-969) | [rfc-001-queries-envelope-mcp.md](rfc-001-queries-envelope-mcp.md) | +| Config & CLI architecture — layered config, client targeting, file naming (MR-973 / MR-974 / MR-981) | [rfc-002-config-cli-architecture.md](rfc-002-config-cli-architecture.md) | +| MCP server surface — full tool parity, stored queries, modular auth (MR-969 / MR-956 / MR-974) | [rfc-003-mcp-server-surface.md](rfc-003-mcp-server-surface.md) | ## Boundary diff --git a/docs/dev/invariants.md b/docs/dev/invariants.md index 70477d4..5ee4f17 100644 --- a/docs/dev/invariants.md +++ b/docs/dev/invariants.md @@ -99,6 +99,7 @@ Use it this way: | Multi-table commit | Manifest CAS plus recovery sidecars; not a single Lance primitive | [writes.md](writes.md), [architecture.md](architecture.md) | | Constructive mutations | In-memory `MutationStaging`, one end-of-query table commit per touched table, then one manifest publish | [writes.md](writes.md), [execution.md](execution.md) | | Deletes | Inline-commit residual; delete-only queries allowed, mixed insert/update/delete rejected by D2 | [query-language.md](../user/query-language.md), [writes.md](writes.md) | +| Branch delete | Manifest is the single authority, flipped atomically first; per-table forks + commit-graph branch are derived state, reclaimed best-effort (`force_delete_branch`) with the `cleanup` reconciler as the guaranteed backstop. 
Reusing a name whose reclaim failed before `cleanup` surfaces an actionable error | [branches-commits.md](../user/branches-commits.md), [maintenance.md](../user/maintenance.md) | | Schema validation | Type checks, required fields, defaults, edge endpoint checks, and edge cardinality are enforced on write paths | [schema-language.md](../user/schema-language.md), [execution.md](execution.md) | | Unique constraints | Intra-batch and write-path checks exist; full cross-version uniqueness is still a gap | [schema-language.md](../user/schema-language.md) | | Storage trait | `TableStorage` exists as the sealed staged-write surface; full call-site migration and capability/stat surfaces are incomplete | [writes.md](writes.md), [architecture.md](architecture.md) | @@ -107,6 +108,13 @@ Use it this way: | Auth | Bearer token hashing and server-side actor resolution are implemented at the HTTP boundary | [server.md](../user/server.md), [policy.md](../user/policy.md) | | Tests | Tempdir-backed Lance tests are the current substrate; there is no `MemStorage` test backend | [testing.md](testing.md) | +The branch-delete reconciler is authority-derived: it reclaims orphaned forks +today and degrades to a no-op if Lance ships an atomic multi-dataset branch +operation, so the design composes with that future rather than blocking it. This +is the same shape as invariant 7 (indexes are derived state); prefer it over a +recovery-sidecar-style approach for any new multi-dataset metadata operation, +since the sidecar would be scaffolding to remove once the substrate closes the gap. + ## Known Gaps Do not hide these behind invariant wording. Either move them forward or keep @@ -122,6 +130,15 @@ them explicit. - **Deletes and vector indexes:** `delete_where` and vector index creation still advance Lance HEAD inline because the required public Lance APIs are missing. Keep D2 and recovery coverage in place until those residuals are removed. 
+- **Blob-column compaction:** Lance `compact_files` mis-decodes blob-v2 columns + under its forced `BlobHandling::AllBinary` read ("more fields in the schema + than provided column indices"), so `optimize` skips any table with a `Blob` + property — reporting `SkipReason::BlobColumnsUnsupportedByLance` (loud, not a + silent drop) behind the `LANCE_SUPPORTS_BLOB_COMPACTION` gate. Reads and writes + are unaffected; only space/fragment reclamation on blob tables is deferred. + Remove the skip when the upstream Lance fix lands — the + `lance_surface_guards.rs::compact_files_still_fails_on_blob_columns` guard + turns red on that bump to force it. - **Planner capability/stat surfaces:** cost-aware planning, complete capability advertisement, and explain-with-cost are roadmap. Do not describe them as implemented. diff --git a/docs/dev/lance.md b/docs/dev/lance.md index ef83f2c..9d2b990 100644 --- a/docs/dev/lance.md +++ b/docs/dev/lance.md @@ -175,7 +175,9 @@ Migration from Lance 4.0.0 → 6.0.1 landed in this cycle (DataFusion 52 → 53, - **Lance #6658 closed** (2026-05-14) but `DeleteBuilder::execute_uncommitted` did **not** ship in v6.0.1 — binary search across the release stream shows it first appears in `v7.0.0-beta.10` (the closing commits landed on main but didn't backport to the 6.x line). Tracked as MR-A: migrate `delete_where` to staged, retire the parse-time D2 mutation rule, extend recovery sidecar coverage. **Gated on the Lance v7.x bump**, not this PR. v7.0.0-rc.1 dropped 2026-05-21. - **Lance #6666 still open** (`build_index_metadata_from_segments` public): vector-index two-phase blocked; inline `create_vector_index` residual retained. - **Lance #6877 still open** (`MergeInsertBuilder` dup-rowid): PR #109's `SourceDedupeBehavior::FirstSeen` + `check_batch_unique_by_keys` precondition stay load-bearing. 
+- **`Dataset::force_delete_branch`** (`branches().delete(name, force=true)`, dataset.rs:524) tolerates a missing branch-*contents* ref (vs plain `delete_branch`'s `RefNotFound`), but on the local store still errors `NotFound` if the branch `tree/` directory is fully absent (`remove_dir_all`'s NotFound is not caught for Lance's native error variant, refs.rs:526-549). Both variants still refuse a branch with referencing descendants (`RefConflict`). `TableStore::force_delete_branch` wraps this to be fully idempotent (tolerates already-absent). The single-authority branch-delete redesign uses it for orphan reclamation (eager best-effort reclaim + cleanup reconciler). Pinned by `lance_surface_guards.rs::force_delete_branch_semantics`. Branch delete is "flip the ref atomically, then `remove_dir_all(tree/{branch})`"; branch-exclusive data lives under `tree/{branch}/` so a drop reclaims it immediately without touching `main`. +- **Lance blob-v2 `compact_files` bug** (no public issue found as of 2026-06): `compact_files` disables binary-copy for blob datasets and forces `BlobHandling::AllBinary` on the read side; the v2.1+ structural decoder then mis-counts column infos for the blob-v2 struct and fails with `Invalid user input: there were more fields in the schema than provided column indices / infos` (`lance-encoding/src/decoder.rs::ColumnInfoIter::expect_next`). This fails even a pristine uniform-V2_2 multi-fragment blob table; vector/list/scalar/ragged columns and mixed file versions all compact fine. Reads/queries use descriptor handling (`BlobHandling::default()`) and are unaffected. `optimize` skips blob-bearing tables behind `LANCE_SUPPORTS_BLOB_COMPACTION = false` (`db/omnigraph/optimize.rs`), reporting `SkipReason::BlobColumnsUnsupportedByLance`. 
Pinned by `lance_surface_guards.rs::compact_files_still_fails_on_blob_columns`, which turns red when the bug is fixed → flip the gate, remove the skip branch + the `maintenance.rs::optimize_skips_blob_table_and_reports_skip` skip assertions. -Surface guards added: `crates/omnigraph/tests/lance_surface_guards.rs` (8 named guards; 3 runtime + 5 compile-only). Future Lance bumps re-run this file first as the smoke check. Two additional guards from the original plan deferred to follow-up (`manifest_cas_returns_row_level_contention_variant` needs full publisher-race harness; `table_version_metadata_byte_compatible_with_v4` needs `pub(crate)` reach extension). +Surface guards added: `crates/omnigraph/tests/lance_surface_guards.rs` (10 named guards; 5 runtime + 5 compile-only). Future Lance bumps re-run this file first as the smoke check. Two additional guards from the original plan deferred to follow-up (`manifest_cas_returns_row_level_contention_variant` needs full publisher-race harness; `table_version_metadata_byte_compatible_with_v4` needs `pub(crate)` reach extension). Bump this date stanza on the next alignment pass. diff --git a/docs/dev/rfc-002-config-cli-architecture.md b/docs/dev/rfc-002-config-cli-architecture.md new file mode 100644 index 0000000..0a8e573 --- /dev/null +++ b/docs/dev/rfc-002-config-cli-architecture.md @@ -0,0 +1,590 @@ +# RFC: Config & CLI Architecture — Layered Config, Client Targeting, File Naming + +**Status:** Proposed +**Date:** 2026-05-30 +**Tickets:** MR-668 (multi-graph server, shipped — the dependency this builds on), MR-969 (stored queries + MCP — supplies the in-repo agent tool surface), MR-973 (quickstart / onboarding), MR-974 (agent setup surface), MR-981 (agent-friendly CLI hardening) +**Target release:** v0.8.x (tentative; phased — see Rollout) + +## Summary + +OmniGraph today has a single config file, `omnigraph.yaml`, read both by the CLI (operating the embedded engine) and by `omnigraph-server` (hosting graphs). 
There is **no client-side configuration that targets a *running server*** — to talk to a deployed `omnigraph-server` you drop to `curl` or the `omnigraph-ts` client. This is the one real gap in an otherwise coherent design (storage-URI addressing, multi-graph routing, per-graph policy). + +This RFC defines the config and CLI architecture that closes that gap, derived from first principles — *working backwards from what OmniGraph uniquely enables* rather than copying kubeconfig / `helix.toml`. The result: + +1. A **global-first layered config** — user-global (`~/.omnigraph/`) is the **primary, self-sufficient default**; per-project (`./omnigraph.yaml`) is an *optional* override + deployment manifest. One uniform schema, both layers optional; the CLI works from any directory with **no project file** (the `kubectl`/`aws`/`gh` posture), unlike today's project-anchored behavior. +2. A single unifying noun — the **target** — that resolves a name to a concrete `(locus, graph, sub-state, credential)` tuple, where the locus is **embedded (storage URI) XOR remote (server endpoint)**. +3. A **multi-server × multi-graph** client model (OmniGraph hosts N graphs per server and there are M servers — unlike Helix's one-cluster-one-graph). +4. **Credentials by reference, keyed by server name** (the AWS/gh/kube model) — OS keychain `omnigraph:` (preferred) → a `[]` profile in `~/.omnigraph/credentials` → `OMNIGRAPH_TOKEN[_]` env (CI). `servers.` is endpoint-only by default but may carry an explicit, secret-free `auth: { token: { env|file|command|keychain } }` source; no `credentials.yaml`; the shipped `bearer_token_env` + dotenv stay as a legacy compat path. Every committed/GitOps'd surface stays secret-free. +5. A **file-naming** decision: project and server config are **the same artifact, same name** (`omnigraph.yaml`); the only differently-named file is the user-global `config.yaml`, justified by **scope, not role**. 
+ +The design optimizes jointly for **DX** (one command surface across embedded and remote; clone-and-go) and **AX** (agent experience: one flat resolved context, secrets structurally unreachable, branch-pinned reproducible reads, and a GitOps'd capability surface). + +## Reconciliation with shipped / planned CLI work + +Verified **against the code**, not ticket statuses (which are unreliable — e.g. MR-581 is marked done but is stale and unbuilt). Findings and the corrections they force: + +- **Noun is `graph`/`graphs`, NOT `target`/`targets`.** The config key is `graphs:` in `config.rs` and the flag is `--graph`. **This RFC uses `graphs:`/`--graph` throughout**; the unifying noun is a **`graphs:` entry** that is *embedded* (`storage:`, formerly `uri:`) XOR *remote* (`server:` + `graph_id:` defaulting to the entry key) — a typed locator (§1.1). Read any lingering `targets:`/`--target` below as `graphs:`/`--graph`. +- **`~/.omnigraph/` stands on its own merits** (Helix/aws/kube peer convention), **not** on precedent — there is **no `~/.omnigraph/` usage in the code** today. (MR-581 / MR-531 templates-into-`~/.omnigraph/` are *stale tickets, unbuilt*.) +- **Templates do not exist** in the code (no `template` command). The template mechanism is a *design question for this RFC / the init family*, not an existing foothold. +- **What actually exists in the CLI** (verified): `init, query(read), mutate(change), load, ingest, branch, schema, lint, snapshot, export, commit, policy, optimize, cleanup, graphs`. **Not built:** `serve, quickstart, template, prune, login`. `omnigraph init` exists (with `scaffold_config_if_missing`, `main.rs:1415`); the rest of the "init family" (`quickstart` MR-973, `serve` MR-970, `prune`/`init --force` MR-972/975, `mcp install`/skills MR-974, agent-mode MR-981) are **unbuilt tickets**, some stale. +- **Config still uses `aliases:`** (no `operations:` in code; MR-839 unbuilt). 
§6's reconciliation talks about `aliases:` as-is, noting `operations:` is a *proposed* rename. +- **`bearer_token_env` exists** (per-graph, `config.rs`); MR-971 flags a CLI-parity / server-side gap. The per-`servers.` extension lands on top of that. +- **A top-level `omnigraph lint` command exists** (verified). A stored-query *registry* validator must pick a verb that doesn't read as a competing lint/check. + +## Motivation + +Three problems, in priority order: + +- **No client→server targeting config.** The moment an operator stands up `omnigraph-server` — for bearer auth + Cedar at a network boundary + admission control + multi-graph routing — the CLI can't address it. `curl` is the fallback. There is no named, switchable, credential-carrying way to say "run this against `prod` on the team server." +- **Multi-server × multi-graph has no first-class expression.** OmniGraph genuinely runs N graphs per server across M servers. The same graph is **multi-homed** — `s3://b/prod` may be `prod` on server A, `production` on server B, and opened directly by the CLI. Today's flat `graphs:` map (name→storage-URI) can't express "graph `production` on server `prod-eu`." +- **Solo-first and embedded-first are unserved by the remote story.** A solo developer with no projects should define everything in `~`. A developer iterating locally (embedded, no server) and then pointing at staging (remote) should change *one word*, not learn a second command surface. + +MR-668 shipped the server side (multiple graphs per server). MR-969 ships the in-repo agent tool surface (stored queries / MCP). This RFC supplies the **client and config layer** that lets humans and agents target that surface coherently — the foundation under MR-973 / MR-974 / MR-981. + +## Non-Goals + +- **A control plane / dashboard for config.** Operators edit files and (for servers) restart. No runtime config-mutation API. Matches the MR-668 / MR-969 operational model. 
+- **Hot reload.** Restart-only for server-side config, matching MR-668 and MR-969. +- **Embedding secrets in any config file.** Credentials are by-reference; the git-ignored `auth.env_file` dotenv (or, later, the OS keychain) holds tokens. Never a committable `*.yaml`. +- **Renaming the project manifest by role.** No `omnigraph.server.yaml` / `omnigraph.client.yaml`. Role lives in sections, not filenames (see Design Β§3). +- **Dropping embedded mode.** Embedded-first is load-bearing for the file-naming decision; this RFC assumes it stays. +- **Cross-graph / cross-server tool listing in MCP.** Clients loop over per-graph catalogs (a MR-969 non-goal, restated). + +## Background + +OmniGraph runs on Lance 6.x: typed nodes/edges in per-type Lance datasets, atomic multi-table commits via a `__manifest` table, branchable and time-travelable. The CLI (`omnigraph`) operates the **embedded engine** directly against a storage URI β€” no HTTP client in its runtime dependencies. `omnigraph-server` (Axum) is a *separate* HTTP front-end over the same engine, with bearer auth + per-graph Cedar (MR-668). The two read the same `omnigraph.yaml` but never connect to each other. + +OmniGraph **already has a credentials-by-reference mechanism**, which this RFC builds on rather than replacing: `TargetConfig.bearer_token_env` names the env var holding a graph's bearer token, and `auth.env_file` points at a git-ignored dotenv (`.env.omni`) that the CLI auto-loads into the process (`load_env_file_into_process`) with real-env-vars-win precedence; `resolve_remote_bearer_token` resolves a token via env var then dotenv named lookup. `.env.omni` is already in `.gitignore`. + +The six **irreducible enablers** that drive the design (referenced as E1–E6 below): + +| # | Enabler | Consequence | +|---|---|---| +| E1 | A graph is a **self-contained storage URI**; the substrate (object store + manifest CAS) is the source of truth β€” no server required to read/write. 
| A graph is addressable **directly (embedded)**, not only via a server. | +| E2 | A server hosts **many graphs**; **many servers** exist. | The remote address space is **`{server} Γ— {graph_id}`**. | +| E3 | The same graph is **multi-homed** under different per-locus names. | **Name β‰  identity.** Resolution is mandatory. | +| E4 | **Branch / commit / snapshot** are first-class addressable sub-state. | An address is *graph @ branch/snapshot*, not just graph. | +| E5 | Enforcement is **two-layered**: engine-layer Cedar (`_as` writers, works embedded) + HTTP-boundary bearer+Cedar (server only). | *How* you reach a graph determines *which* enforcement applies. | +| E6 | **Stored queries / MCP tools are a per-graph registry defined in the project config** (MR-969). | The **agent tool surface is version-controlled in the repo**. | + +Competitors collapse dimensions OmniGraph keeps live: **Helix** fuses E2+E3 (one cluster = one graph); **namidb** fuses E1+E3 into the URI (`s3://b?ns=prod`) and serves one namespace per process. OmniGraph has all of E1–E6 at once, so its config resolves a richer space β€” but the richness is *earned* by capability. + +## Design + +### 1. The address space and the `target` abstraction + +Every OmniGraph address is a tuple: + +``` +(locus, graph, sub-state, credential) + locus = embedded(URI) XOR remote(server-endpoint) # E1, E2 + graph = a URI (embedded) | a graph_id on a server (remote) # E3 + sub-state = branch | snapshot # E4 + credential = cloud-storage creds (embedded) | bearer token (remote) # E5 +``` + +The config's only job is **name β†’ this tuple**. 
Define one noun β€” a **target** β€” that resolves to either shape: + +```yaml +targets: + dev: # embedded β€” substrate-direct (E1) + storage: s3://team-bucket/dev.omni + branch: main # sub-state (E4) + staging: # remote β€” resolves a server by reference (E2/E3) + server: staging # β†’ looked up in `servers` + graph_id: prod # the graph's id on that server (defaults to the entry key) + branch: review +``` + +`--target staging` resolves: project `targets.staging` β†’ `{server: staging, graph_id: prod, branch: review}` β†’ `servers.staging` β†’ `{endpoint, token-by-ref}` β†’ final `(remote(https://…), prod, review, $TOKEN)`. Embedded targets skip the server hop and use cloud-storage credentials. + +**Two concepts, not kubeconfig's three.** kube splits cluster / user / context; that 3-way split is its most-cursed UX. A target *bundles* server+graph+branch+defaults under one name; the **only** thing split out is `servers`, because endpoints+credentials are shared across many targets and are secret-bearing (different ownership and rate-of-change; see Β§2). Result: **2 nouns β€” `servers` and `targets`.** Embedded `targets` (`storage:`) subsume today's `graphs:` entries. + +### 1.1 The resolved address is a typed *locator*, not a `uri` string + +The shipped config models a graph as a single `uri: String`, and code branches on `is_remote_uri(uri)`. That conflates two structurally different addresses: an **embedded** graph is a *complete, self-contained* address β€” one storage URI = one graph, opened directly via the embedded engine; a **remote** graph is a *server endpoint + a `graph_id`* β€” one server hosts N graphs. A bare server URL **is not a graph**; it lacks the `graph_id`. The cost of the string model, in the code today: + +- the CLI re-decides "server or file?" 
via `is_remote_uri` at ~16 call sites; +- `TargetConfig` (one `uri` field) **cannot express** multi-server Γ— multi-graph or a multi-homed graph (E2/E3) β€” "graph `production` on server `prod-eu`" has no representation; +- the CLI **bails on remote URIs** for most operations, precisely because the string can't carry the `graph_id`; +- the `omnigraph-ts` SDK had to model `baseUrl` **+** `graphId` *separately* (rewriting `/graphs/{graphId}/…`) β€” it invented the structure the string lacks. + +So the *resolved* address is a **typed locator**, not a string: + +```rust +enum GraphLocator { + Embedded { storage: StorageUri }, // file:// , s3:// β€” a complete graph + Remote { server: ServerId, graph_id: GraphId }, // which server + which graph (+ bearer creds) +} +``` + +A `graphs:` entry resolves into this **once**; downstream code dispatches on the variant (the breadboard's `GraphConn = Embedded(engine) | Remote(http)`) instead of re-sniffing a scheme at each call site. The `uri` string becomes an *input format* for the embedded variant, never the address itself. + +**YAML naming follows the locator β€” the *key* names the locus**, so neither the value's scheme nor a comment is load-bearing: + +| Locus | Key | Value | +|---|---|---| +| Embedded | **`storage:`** (shipped `uri:` is a deprecated alias) | a storage URI (`s3://…`, `file://…`) | +| Remote | **`server:`** | a name in `servers:` (its `endpoint` + creds resolve by name, Β§5) | +| Remote graph id | **`graph_id:`** | the id on that server β€” **defaults to the entry key**; set only when the local alias differs | + +An entry has `storage:` **xor** `server:` β€” the deserializer rejects *both* and *neither* (no silent ambiguity). This removes two prior confusions: `graphs:` (the map) vs `graph:` (the remote id), and `uri:`-might-be-a-server. 
+ +```yaml +servers: + prod-eu: { endpoint: https://og-eu.internal:8080 } +graphs: + dev: { storage: s3://team-bucket/dev.omni } # embedded + production: { server: prod-eu } # remote β€” graph_id = "production" (the key) + staging: { server: prod-eu, graph_id: prod } # remote β€” alias β‰  server's id +``` + +### 1.2 Invalid configs are rejected by design + +The DX rule is: **a config field is either honored or rejected, never silently ignored**. The loader therefore has two phases: + +1. Parse YAML into a loose/raw shape that preserves origin (`base_dir`, layer, line/path when available). +2. Convert once into a typed, role-aware resolved config. Every command receives the resolved form, not the raw YAML structs. + +The typed graph shape is: + +```rust +enum GraphEntry { + Embedded(EmbeddedGraphEntry), + Remote(RemoteGraphEntry), +} + +struct EmbeddedGraphEntry { + storage: StorageUri, + branch: Option, + policy: Option, + queries: QueryRegistrySpec, +} + +struct RemoteGraphEntry { + server: ServerId, + graph_id: GraphId, + branch: Option, +} +``` + +That makes these rules structural rather than advisory: + +- A graph entry must specify **exactly one** locator: `storage:`/legacy `uri:` xor `server:`. +- `policy:` and `queries:` are valid only on `Embedded` graph entries, because they define the capability surface of a graph this process opens directly. A `Remote` graph entry points at a server; that server owns policy and stored-query definitions. +- `omnigraph-server` may serve only `Embedded` graph entries. A server manifest entry with `server:` is rejected: a server should not "host" a graph by proxying another server. +- A named graph uses its own graph entry. Top-level `policy:` / `queries:` are a legacy anonymous-bare-URI compatibility path only; if a named graph is selected while top-level blocks would be ignored, config validation errors with a migration hint. 
+- A client-defined remote graph discovers stored queries from the server (`GET /queries`) and invokes them (`POST /queries/{name}`); it does not define `queries:` locally for that remote graph. + +Examples that must fail fast: + +```yaml +graphs: + prod: + storage: s3://team-bucket/prod.omni + server: prod-us # invalid: storage xor server +``` + +```yaml +graphs: + prod: + server: prod-us + graph_id: production + policy: { file: ./policies/prod.yaml } # invalid: remote graph policy lives on the server + queries: + find_user: { file: ./queries/find_user.gq } # invalid: remote graph queries are discovered +``` + +`omnigraph config view --resolved --show-origin` is the user-facing debugger for this boundary: it shows the final `Embedded` or `Remote` graph and where every honored field came from. Fields that cannot be honored never make it into the resolved view; they fail validation first. + +### 2. Layered config β€” global-first, uniform schema, project-optional + +**Posture: global-first, project-optional.** OmniGraph's CLI is primarily a *client* (it operates against graphs and servers, embedded or remote), so it sits on the **global-first** side of the CLI-config axis β€” like `kubectl` / `aws` / `gh` / `docker`, and unlike *project-first* tools (`git` / `cargo` / `terraform`) whose primary config is per-repo. The **global user config is the primary, self-sufficient default**; the project file is an *optional* repo-scoped override (and, when present, the deployment manifest). `omnigraph query --target prod` must work from **any directory with no project file**, exactly as `kubectl get pods --context prod` works from anywhere. *(This is a deliberate flip from today, where the CLI reads `./omnigraph.yaml` and does not even walk parent dirs β€” i.e. 
today it is project-anchored.)* + +**Rule: the two layers share ONE raw schema, and each is fully self-sufficient** (the git-layering mechanism β€” same schema at both levels; you never need a repo to have a working config). Do **not** specialize the file format by layer. Instead, run the same role-aware validation everywhere (Β§1.2): the global and project layers may both define graph locators, defaults, servers, and aliases, but fields that are meaningless for a resolved graph variant are rejected rather than ignored. For example, `queries:` is valid for an embedded graph this config opens directly; it is invalid on a remote graph entry because remote stored queries are server-owned and discovered. + +This makes the **zero-project case the default, not an edge case**: a solo user (or an agent) defines everything needed for client work in `~/.omnigraph/config.yaml` β€” servers, embedded + remote graph locators, defaults, aliases, and optionally personal embedded-graph query registries β€” and **never creates a project file**. A team adds `./omnigraph.yaml` only when it wants repo-scoped overrides or a committed, GitOps'd deployment manifest. Global-first does **not** forbid project files; it stops *requiring* them (the kubectl model: `~/.kube/config` is sufficient and default; per-project kubeconfigs are opt-in via `KUBECONFIG`). + +| Layer | Required? | Typical use | Path | +|---|---|---|---| +| Global | no | **the default** β€” solo/agent's entire config; shared servers+creds for teams; even a personal server's graphs/queries | `~/.omnigraph/config.yaml` | +| Project | no | **opt-in** β€” repo-scoped overrides + the committed deployment manifest (graphs, queries, policy) | `./omnigraph.yaml` | + +**Precedence (low β†’ high):** built-in defaults < global < project < env vars < CLI flags. With no project file it collapses to **built-in < global < env < flags** β€” the common global-only path. 
+ +**Merge semantics β€” "closest layer wins, at the smallest meaningful unit"** (the field consensus: git / kubeconfig / cargo / Helm / VS Code): +- **Settings objects** (`defaults`, `auth`, `server`) β†’ **deep-merge per field**: a project sets `defaults.graph` and *inherits* the global `defaults.output_format`. (VS Code / cargo behavior.) +- **Named-resource maps** (`servers`, `graphs` / compat `targets`, `queries`, `aliases`) β†’ **union by key; on a collision the higher layer's entry REPLACES the lower wholesale** β€” *no field-level deep-merge within an entry*. (kubeconfig: union contexts by name.) The footgun this avoids: global `servers.prod = {endpoint, policy}`, project `servers.prod = {endpoint: other}` β€” deep-merge would silently retain the old fields; replace makes the project's `prod` self-contained and predictable. +- **Lists/arrays** β†’ **replace, never append** (Helm convention; appending is order-sensitive and surprising). +- **Scalars** β†’ higher layer wins. +- **Relative paths carry their origin's base_dir.** A `queries:` entry's `.gq` path, or a `policy.file`, resolves against the directory of the layer it was *defined in* β€” global entries under `~/.omnigraph/`, project entries under the project dir. +- **Inspectable (non-negotiable):** `omnigraph config view --resolved --show-origin` prints each final value *and which layer set it* (the `git config --show-origin` / `kubectl config view` rule). A layered config without origin-tracing is a debugging trap. + +### 3. Roles, and the file-naming decision (same name for project = server) + +`omnigraph.yaml` carries two *roles* that diverge in prod and collapse on a laptop: + +- **Server role** (read by `omnigraph-server`): `graphs:` entries that are **embedded storage locators**, per-graph `policy.file`, **`queries:` β€” the stored-query/MCP registry lives here**, plus serving knobs. Remote graph locators are rejected in this role. 
+- **Client role** (read by the CLI/agent): `servers:`, embedded or remote `graphs:` locators, `defaults:`, `aliases:`. A remote graph locator points at server-owned capabilities; it cannot define local `policy:` or `queries:`. + +**Project config and server config are the same artifact, hence the same name.** The server *serves the project*: the file that says "these graphs exist, with these stored queries and this policy" is simultaneously the project manifest and the server's deploy config. Role is distinguished by which *sections* are populated, never by filename. Readers ignore sections that are not theirs (today's file already does this with `cli:` vs `server:`). + +**Why not kube's role-split.** Two coherent models exist: (A) one project file with role-sections (Helix `helix.toml` holds both `[local.dev]` and `[enterprise.production]`; compose; Cargo), and (B) deployment-manifest strictly separate from client config (kubectl β€” you never put a context in `deployment.yaml`). kube is the sharpest topological analog (multi-server Γ— multi-graph, one client targeting many), so B has a real claim. The tiebreaker is **E1: OmniGraph is embedded-first.** In embedded mode the manifest's `graphs:` *is* the local target list β€” manifest and local-client-view are the same object, so splitting them (B) fights the grain and forces two files for local work. kube splits because it has **no** embedded mode (client always remote+global). So: take the half kube is right about β€” *remote* client targeting (`servers:`, endpoints, creds) is a separate concern in a separate **user-global** file (`config.yaml`, like `~/.kube/config`); reject the half it is wrong about for us β€” do **not** split the *project* layer by role. **The second name (`config.yaml`) is justified by scope (user-global), not role.** *(If OmniGraph ever dropped embedded mode and went pure-remote, model B's strict split would become cleanest.)* + +### 4. 
File naming + +Principles from the field: **one global dir** `~/.omnigraph/` (like `~/.aws`/`~/.kube`/`~/.helix`), with config/cache/state as **subdirectories** (separation without XDG's three-root scatter); **secrets keyed by server name in the OS keychain or a separate git-ignored profile file** (AWS/gh model, not a new `credentials.yaml`); **project-root manifest keeps the app-named file** (`Cargo.toml`, `package.json`); **`.yaml`, not `.yml`**; keep OmniGraph's established names. The genuinely *new* decisions are the **global** dir's existence and keyed-by-name resolution with an explicit `auth.token` override (MR-971); the shipped `bearer_token_env` + `auth.env_file` mechanism remains as legacy compat. + +| Artifact | Path / name | Why | +|---|---|---| +| Project = server config (one artifact) | `./omnigraph.yaml` | **Keep.** Root manifest like `Cargo.toml` / `compose.yaml` / `helix.toml`. Same name for both roles because it is one file. In prod the server's deploy repo and an app repo each have their own `omnigraph.yaml` β€” same name, different repos. | +| Global user config | `~/.omnigraph/config.yaml` | **One dir** (`~/.omnigraph/`, like `~/.aws`/`~/.kube`/`~/.helix`). Named `config.yaml` *not* `omnigraph.yaml` β€” the name signals scope (and `~/.aws/config`, `~/.kube/config`, `~/.helix/config` all do this). Holds the full schema so a solo user needs nothing else. | +| Credentials | OS keychain (`omnigraph:`, preferred) β†’ `~/.omnigraph/credentials` profile file (`[]`, `0600`, git-ignored). **Keyed by server name**, inside the one dir. | **Key by name, AWS/gh model** β€” `~/.aws/credentials [profile]`, `~/.kube/config users:`, `~/.helix/credentials`. *Not* a `credentials.yaml`, and *not* a per-server hand-named env var; the secret lives under the server name (no indirection). Legacy `bearer_token_env` + `.env.omni` dotenv remain as a compat path. See Β§5. 
| +| Cache / state | `~/.omnigraph/cache/`, `~/.omnigraph/state/` | Subdirs of the one dir (like `~/.aws/sso/cache/`, `~/.kube/cache/`) β€” cache is `rm -rf`-safe and backup-excludable without scattering across XDG roots. | +| Cedar policy | `./policies/.yaml` + `.tests.yaml` | **Keep.** Referenced by `policy.file`. | +| Schema | `./*.pg` (e.g. `schema.pg`) | **Keep.** | +| Stored queries | `./queries/*.gq` | **Keep.** `.gq` sources referenced by the `queries:` registry. | + +**Global dir: `~/.omnigraph/` β€” one place, with subdirectories.** Everything OmniGraph keeps for a user lives under a single `~/.omnigraph/` directory, matching the peer group (`~/.aws`, `~/.kube`, `~/.docker`) and the direct competitor (`~/.helix`). This is what DB/cloud-CLI users expect and the lowest-cognitive-load shape. + +*Separation and "one place" are not in conflict* β€” the decisive realization. The peer tools get config/cache/state separation via **subdirectories inside the one dir**, not via XDG's three scattered roots: `~/.aws/sso/cache/`, `~/.kube/cache/`. So OmniGraph keeps `~/.omnigraph/config.yaml`, `~/.omnigraph/credentials`, `~/.omnigraph/cache/` (catalogs β€” `rm -rf`-safe, backup-excludable), `~/.omnigraph/state/` (session, logs) β€” getting cache hygiene **and** a single discoverable location, without the XDG scatter. An earlier draft argued XDG on a false dichotomy (it assumed single-dir β‡’ mixed); subdirs dissolve it. `~/.omnigraph/` is canonical and documented; `$XDG_CONFIG_HOME` may optionally be honored if a user has set it, but XDG is not part of the mental model. + +**Env / override precedence (the `KUBECONFIG` analog):** +- `OMNIGRAPH_CONFIG=/path` β€” explicit config file, highest precedence. +- `OMNIGRAPH_HOME=/path` β†’ the global dir (default `~/.omnigraph/`); `$XDG_CONFIG_HOME` optionally honored if a user has set it, but `~/.omnigraph/` is canonical. 
+- Cache and state are subdirs of the one dir: `~/.omnigraph/cache/` (cached remote catalogs), `~/.omnigraph/state/` (session, logs). +- Per-server token resolution: an explicit `auth: { token: {...} }` source (env/file/command/keychain) wins if set; otherwise **keyed by the server name** — `OMNIGRAPH_TOKEN_<NAME>` (or `OMNIGRAPH_TOKEN` for the active server) → OS keychain `omnigraph:<name>` → the `[<name>]` profile in `~/.omnigraph/credentials`; legacy `bearer_token_env` still honored. See §5. + +### 5. Credentials, connection tiers, and bind portability (12-factor) + +**Credentials are by-reference everywhere, never inlined — and keyed by the *server name*, not by a hand-invented env-var name.** This is the one place the design departs from simply reusing the shipped `bearer_token_env` mechanism, because that mechanism is sub-optimal for a multi-server client: it forces the operator to invent and coordinate an env-var name per server (three steps to add a server: pick a var, name it in config, set it in the store). The peer group (AWS profiles, `gh` hosts, kubeconfig users, docker auths) instead keys the secret **by the server's name** — no indirection. OmniGraph should match that. + +**Resolution for server `<name>` (no config field required):** +1. **`OMNIGRAPH_TOKEN_<NAME>`** env var (name-derived, upper-snake), else **`OMNIGRAPH_TOKEN`** for the active server — the CI/headless override (12-factor). +2. **OS keychain** entry `omnigraph:<name>` — the preferred interactive store (no plaintext on disk); written by `omnigraph login <name>`. +3. **`~/.omnigraph/credentials`** — an AWS-style profile file keyed by server name (mode `0600`, git-ignored), the fallback when no keychain: + ```ini + [prod-us] + token = … + [prod-eu] + token = … + ``` +So a `servers.<name>` with no token field resolves by name — adding a server is one step (`omnigraph login <name>`), and "multiple servers, multiple tokens" falls out for free. 
+ +**But implicit must not be the *only* path — explicit sourcing is a first-class option** (the DX/AX lesson). Pure-convention is invisible (you must *know* `OMNIGRAPH_TOKEN_<NAME>`), can't integrate with a secrets-manager's fixed var name, and can't do dynamic/short-lived tokens. So a server may declare an explicit `auth:` block — a **method-agnostic wrapper** (today only `token:` for bearer; `mtls:`/`oidc:` are the future siblings, so the credential model never has to be re-keyed) holding a tagged token *source*. Secrets are *still* never inlined (every source is a reference): + +```yaml +servers: + prod-us: + endpoint: https://og-us… + auth: { token: { env: OG_PROD_US_TOKEN } } # explicit env var — self-documenting (= legacy bearer_token_env) + prod-eu: + endpoint: https://og-eu… + auth: { token: { command: [vault, read, -field=token, secret/og] } } # dynamic / short-lived + edge: + endpoint: https://og-edge… + auth: { token: { file: /run/secrets/og-token } } # k8s/docker mounted secret + staging: + endpoint: https://og-staging… # no auth: → implicit chain (below) +``` + +| `auth.token:` source | when | DX/AX value | +|---|---|---| +| *(auth omitted)* | the common case | zero-config; `omnigraph login` populates keychain `omnigraph:<name>` | +| `{ env: VAR }` | secrets-manager / CI injects a fixed var | **self-documenting** — config states the source; = the legacy `bearer_token_env` | +| `{ file: PATH }` | k8s/docker secret mounted as a file | no env plumbing | +| `{ command: [...] }` | Vault, cloud IAM, `gh auth token` | **dynamic tokens** — first-class exec, the capability pure-env/keychain can't give (kube `exec` / AWS `credential_process`) | +| `{ keychain: ENTRY }` | pin a non-default keychain entry | explicit override of the name-derived default | + +**Resolution per server:** if `auth.token:` is set, use that source (no fallthrough). 
Else the **implicit chain**: `OMNIGRAPH_TOKEN_<NAME>` (or `OMNIGRAPH_TOKEN` for the active server) → keychain `omnigraph:<name>` → `[<name>]` in `~/.omnigraph/credentials` (`0600`, git-ignored). `omnigraph login <name>` writes/rotates only that server's secret; per-server precedence is independent; sharing is opt-in (same env var or source). The `command` source runs locally with the operator's own privileges and is defined only in operator-owned config (never server-supplied), so it adds no remote-execution surface. The `auth:` wrapper is method-agnostic so adding mTLS/OIDC later is a new sibling key, not a breaking re-key (Hyrum's Law: the field name is a contract once shipped). There is **no `credentials.yaml`** and **no inlined secret**. *Convention for the floor, explicit for control — and explicit is legible to agents and never inlines a secret.* + +**Back-compat.** The shipped per-graph `bearer_token_env` + `auth.env_file` dotenv (`resolve_remote_bearer_token`, real-env-wins) keeps working unchanged for existing single-server setups; `bearer_token_env` is just the legacy flat alias for `auth: { token: { env } }`. Resolution tries an explicit `auth.token:` (or legacy `bearer_token_env`) first, then the keyed-by-name chain — so nothing breaks, but the zero-config default is the no-boilerplate keyed-by-name path. (MR-971 — the `bearer_token_env` parity gap — is where this resolver work lands.) + +**Three connection tiers** (Supabase/Prisma teach the zero-config floor): +1. **Env vars** — `OMNIGRAPH_SERVER=https://…` + `OMNIGRAPH_TOKEN=…`: zero-config remote, no file (the `DATABASE_URL` floor). +2. **Global `config.yaml`** — named `servers:` + `graphs:` for multi-server setups (the AWS-profiles convenience). +3. **Project `omnigraph.yaml`** — project-pinned targets/graphs, committed. 
+ +**Keep `omnigraph.yaml` a *portable* manifest (12-factor).** Deploy-specific runtime that varies per environment β€” the **bind host/port**, worker counts β€” should be supplied by **`--bind` / `OMNIGRAPH_BIND` (flags/env)**, *not* a committed `server.bind:` baked into the manifest. A manifest that hardcodes `0.0.0.0:8080` is not portable across deploys and leaks an environment detail into a version-controlled file. The same-named `omnigraph.yaml` stays portable across deploys precisely because the volatile, per-environment knobs live in env/flags (12-factor config), while the stable, portable definition (graphs, queries, policy) lives in the file. This is the one concrete lesson taken from kube's model-B without adopting its file split: portability via env/flags, not via a second file. + +### 6. Where stored queries live: defined locally, invoked remotely + +A stored query splits across two axes; do not conflate them: +- **Definition** (`.gq` source + `queries:` entry) lives next to the **embedded graph entry that owns it**. For a hosted remote graph, that is the **deployment manifest** read by `omnigraph-server`; for a personal embedded graph, it may be the user's own config. It never lives on a client-side `Remote` graph entry. +- **Discovery** ("what tools exist for me?") is fetched from the **server** (Cedar-filtered `GET /queries` / MCP catalog) at connect time. +- **Invocation** is **remote** (client β†’ server, HTTP/MCP) β€” or **embedded** (the CLI opens the graph directly and reads the same manifest). + +For remote use, the client carries *pointers to servers*, not query definitions; it **discovers and invokes**, never defines. This is the **capability-as-code guarantee for agents**: an agent can only invoke tools the server's *committed, reviewed* config exposes β€” it **cannot define a new tool at runtime**. Definition is structurally outside the agent's reach. 
+ +`queries:` (graph-capability registry, Cedar-gated when served remotely, MCP-visible when exposed) and `aliases:` (client CLI shortcut) overlap β€” both can name `.gq`-backed operations. This RFC keeps them siblings (the MR-969 decision); the clean long-term is **one registry, two invocation surfaces** (embedded + remote), with `aliases:` subsumed. Out of scope here. + +#### Reconciling `aliases:` with the role model + +`aliases:` is the pre-MR-969, **client-role, embedded-only, ungated** ancestor of `queries:`. An alias bundles `command` (read/change), `query` (`.gq` path), `name` (symbol), `args` (positional param names), and `graph`/`branch`/`format` defaults; the CLI runs it embedded. The server never reads it. So: + +- **Role:** `aliases:` is **client-role** (CLI behavior) β†’ it may live in **both** the user-global `config.yaml` and the project manifest, layered. `queries:` is **graph-capability role** β†’ it lives only on an `Embedded` graph entry, and for remote server graphs that means the server deployment manifest. *Who opens the graph determines where query definitions can live.* +- **Difference:** `aliases:` = embedded invocation, no gating, explicit `command`, bundles client defaults + positional args. `queries:` = remote (+future embedded), Cedar + `mcp.expose`, **infers** read/mutate, bundles only MCP settings. +- **Convergence:** decompose an alias β€” *definition* (nameβ†’.gq+symbol) β†’ `queries:` (the superset: typed, validated, gated, multi-surface, no redundant `command`); *target/branch/format* β†’ client invocation context (`--target`/`--branch`/`--format` or `defaults:`), not baked per-query; *positional `args`* β†’ thin CLI sugar or dropped (agents/services use named JSON params). End-state: one `queries:` registry + the client config model subsumes `aliases:`. +- **Validation:** a file-backed alias (`query: ./foo.gq`) may target only an embedded graph. 
A remote graph shortcut must be explicit that it invokes a server-owned stored query, e.g. `invoke: find_user`, so the client cannot smuggle a new `.gq` definition into a remote capability surface. +- **v1:** keep `aliases:` unchanged. Footgun worth a load-time warning: an alias and a query with the same name in one manifest are different namespaces invoked differently (`--alias X` vs `POST /queries/X`). + +```yaml +aliases: + local_owner: + command: query + query: ./queries/owner.gq + name: owner + graph: dev # valid only if `dev` resolves Embedded + + remote_owner: + invoke: find_user + graph: prod # valid only if `prod` resolves Remote; source lives on the server + args: [name] +``` + +### 7. CLI surface + +- `omnigraph login <server>` — interactive auth; stores the token keyed by server name in the OS keychain (`omnigraph:<server>`) or the `[<server>]` profile of `~/.omnigraph/credentials` (0600). The `gh auth login` analog. +- `omnigraph use <graph>` — set the active graph (writes the appropriate layer). The `kubectl config use-context` analog. +- `omnigraph config view [--resolved] [--show-origin] [<graph>]` — print the merged config and, with `--resolved`, the final tuple **plus the origin layer of every field** (the `git config --show-origin` / `kubectl config view` analog). Resolution is never a mystery. +- All existing verbs (`query`, `mutate`, `load`, `schema`, `branch`, …) gain `--graph <name>`; resolution decides embedded vs remote transparently. + +### 7.5 Init, login, and bootstrap — three tiers (folds in the Q2 design) + +Scaffolding splits into three tiers by *scope* and *fatness*, mirroring the field (supabase `init` vs `login`; HelixDB thin `init` vs fat `chef`). Most of this lives in sibling tickets; this RFC owns only the **user route**. 
+ +| Tier | Command | Scope | What it does | Model | Status | +|---|---|---|---|---|---| +| **User route** | `omnigraph login []` | user (`~/.omnigraph/`) | auth + write `~/.omnigraph/config.yaml` / `credentials`; first-run global setup | gh / supabase `login` | **this RFC** (unbuilt) | +| **Thin project init** | `omnigraph init` | project, in-place | create graph + `scaffold_config_if_missing` (`omnigraph.yaml` + minimal `.pg`/`.gq`); refuse-if-exists or `--force` | `cargo init`, `prisma init` | exists; `--force` purge = MR-975 | +| **Fat bootstrap** | `omnigraph quickstart [--template ] [--auto]` | project, possibly new-dir | scaffold + seed data + `serve start` + agent prompt file | HelixDB `chef`, `create-next-app` | MR-973 (unbuilt) | + +**Design positions** (first-principles, since none of the fat tier is built): +- **Split `init` (project) from `login` (user)** β€” never one command writing to both `$HOME` and the project (the supabase line, not the dbt line). `init`=project scaffold; `login`=user credential + global config. +- **`init` is in-place + refuse-if-exists** (cargo/prisma/terraform default): don't clobber; adopt existing files; require `--force` to overwrite (and `--force` purges Lance state per MR-975). +- **Interactive for humans, `--auto`/agent-mode for automation** (npm `-y`, create-* `--CI`, MR-981 `--machine`). In `OMNIGRAPH_AGENT_MODE` any prompt β†’ fail with a repair hint. +- **Templates are a `--template ` flag on the fat tier** (create-vite model), with the *content* (schema + queries + seed) coming from a template source. Mechanism is a design question (bundled-in vs `og template pull` from a repo vs `npm create-*`-style delegation) β€” **not** an existing foothold (MR-581 stale). Lean: a small set of bundled templates first (generic `Personβ†’Knows`, plus promote `omnigraph-intel-bootstrap`), `--template ` later. 
+- **`init`/`quickstart` can scaffold the `graphs:` map with one or more entries**; "init with specific graphs" = the scaffolded `graphs:` block (embedded `storage:` locally; the agent/operator adds remote `server:` entries via `login` + editing). +- **Secrets-on-scaffold rule** (prisma/dbt/supabase all do this): anything that writes a token also keeps it out of VCS. `login` prefers the OS keychain (no file); the `~/.omnigraph/credentials` profile fallback is `0600` and git-ignored, and any project-local `.env`-shaped file gets a `.gitignore` entry. + +### 8. Concrete shape + +**Global** `~/.omnigraph/config.yaml` (per-user, secret-free): +```yaml +servers: # endpoint only β€” token is keyed by the server name + prod-us: { endpoint: https://og-us.internal:8080 } + prod-eu: { endpoint: https://og-eu.internal:8080 } + staging: { endpoint: https://og-staging.internal:8080 } +graphs: + personal: { storage: ~/graphs/personal.omni } +defaults: + graph: personal +aliases: + my_people: + command: query + query: ~/queries/people.gq + name: list_people + graph: personal +``` + +**Project client** `./omnigraph.yaml` (committed, secret-free, portable β€” no `server.bind`). Note the shipped noun is `graphs:` (MR-603); an entry is embedded (`storage:`) XOR remote (`server:` + `graph_id:`, Β§1.1): +```yaml +graphs: + dev: { storage: s3://team-bucket/dev.omni, branch: main } # embedded + staging: { server: staging, graph_id: prod, branch: review } # remote β†’ graph `prod` on server `staging` + prod-us: { server: prod-us, graph_id: production } + prod-eu: { server: prod-eu, graph_id: production } # multi-homed: same graph, another server +defaults: { graph: dev, output_format: table } +aliases: + owner: + command: query + query: ./queries/owner.gq + name: owner + args: [name] + graph: dev +``` +Select with `--graph ` (shipped flag, MR-603). + +**Server deployment** `./omnigraph.yaml` (committed in the deploy repo, read by `omnigraph-server`). 
Every served graph is an embedded storage locator; server-owned policy and stored-query definitions live here: +```yaml +graphs: + production: + storage: s3://team-bucket/prod.omni + policy: + file: ./policies/prod.yaml + queries: + find_user: + file: ./queries/find_user.gq + mcp: { expose: true, tool_name: lookup_user } + +server: + policy: + file: ./policies/server.yaml +``` + +**Credentials** are keyed by server name β€” `omnigraph login prod-us` writes the OS keychain entry `omnigraph:prod-us` (or a `[prod-us]` profile in `~/.omnigraph/credentials`, 0600, git-ignored); `OMNIGRAPH_TOKEN_PROD_US` overrides for CI. No token fields in any config file; no committable secrets. + +## DX + +1. **One command surface, two loci.** `query --graph dev` (embedded) and `--graph staging` (remote) are the same command; only resolution differs. Change one word, not a mental model. +2. **Clone-and-go.** Project config names servers+graphs; teammate runs `omnigraph login staging` once and every target resolves. The git + `gh auth login` model. +3. **Multi-server Γ— multi-graph is the default.** Remote graph entries reference `server` by name; `servers` is a global named map; graphs are per-server. `prod-us` and `prod-eu` both serving `production` is two graph entries β€” Helix cannot express this. +4. **Solo-first.** Everything in `~`, no project required. +5. **Laptop-to-fleet on one schema.** Local = one `omnigraph.yaml` (both roles); prod = role-split across repos. No second format to learn. + +## AX (agent experience) + +1. **One flat resolved context, never a config to navigate.** targetβ†’serverβ†’endpointβ†’token resolves *before* the agent sees anything. The agent reasons about tools, not topology (the LLM-safe-surface principle extended to config). +2. **Secrets are structurally outside the agent's reach.** The repo it operates in has no tokens; they are in the global layer / keychain, outside its view. 
An agent *cannot* exfiltrate a prod token from project config because it is not there. +3. **Branch/snapshot-pinned contexts** (E4) β€” hand an agent a `branch: review` / `--snapshot v42` target and its reads are reproducible and cannot see uncommitted main-line state. No kubeconfig analog. +4. **The agent's capabilities are a GitOps'd artifact** (E6) β€” which graphs exist, which stored-query tools it may call, and which Cedar rules gate them are all in the version-controlled server config. Powers change only via a reviewed PR, deployed by restart. Infrastructure-as-code for what the AI can do. +5. **Config + policy compose.** Config = "where am I pointed + which token"; Cedar = "what may I do there." Orthogonal; no enforcement logic leaks into config. + +## GitOps β€” three surfaces, secrets in none + +| Surface | Repo | Contents | Deploy | Secrets | +|---|---|---|---|---| +| Server deployment config | infra/deploy repo | `graphs:`, policy, **`queries:` + `.gq` files** | commit β†’ CI β†’ **server restart** (no hot reload) | none β€” by-reference | +| Project client config | app repo | `graphs:` β†’ embedded storage or remote server+graph | committed, read by CLI/agent | none | +| Global user config | **not GitOps'd** β€” machine-local `~` | `servers:` + creds-by-ref | `omnigraph login` writes it | refs only (like `~/.kube/config`) | + +## Comparison + +| Property | kubeconfig | Helix | git | compose | **OmniGraph (this RFC)** | +|---|---|---|---|---|---| +| Named remote endpoints + creds-by-ref | βœ… | βœ… | partial | partial | βœ… (global `servers`) | +| Global + project layering, uniform schema | βœ— | βœ— | βœ… | βœ— | βœ… | +| Embedded OR remote under one name | βœ— | βœ— | n/a | βœ— | βœ… (E1) | +| Multi-server Γ— multi-graph | βœ… | βœ— | n/a | n/a | βœ… (E2) | +| Branch/snapshot in the address | βœ— | βœ— | partial | βœ— | βœ… (E4) | +| Agent tool surface in the repo | βœ— | βœ— (separate bundle) | n/a | n/a | βœ… (E6) | +| Project manifest renamed by 
role | — | no | — | no | **no** | +| Concept count | 3 | 1 | 2 | 1 | **2 (servers/targets)** | + +## Migration / backwards compatibility + +- **Additive.** Today's `omnigraph.yaml` (`graphs:`, `cli:`, `server:`, `aliases:`, `policy:`) keeps working unchanged. `graphs:` entries are equivalent to embedded `targets:` with a `storage:` (shipped `uri:` is a deprecated alias); both resolve. +- **`targets:` is new** and optional. `servers:` is new and optional. Absent → today's behavior. +- **Global `~/.omnigraph/config.yaml` is new.** Absent → only project + env + flags, exactly as now. Its addition is the **global-first posture flip**: today the CLI is project-anchored (reads `./omnigraph.yaml`, no parent walk); the global config becomes the new primary discovery path so the CLI works with no project file. Existing project-only workflows are unchanged (project still overrides global); the flip is additive — it adds a fallback layer below the project file; it does not remove the project file. +- **`graphs:` → `targets:` is an evolution, not a break.** Both can coexist; `targets:` is the superset (adds remote + branch pinning). A future cleanup may alias `graphs:` to embedded `targets:`. +- **`server.bind` stays supported** but documentation steers operators to `--bind` / `OMNIGRAPH_BIND` for portability; no removal. +- **Credentials: keyed-by-name is new; `bearer_token_env` is the compat path.** The primary design (keychain / `[<server>]` profile / `OMNIGRAPH_TOKEN_<SERVER>`) is new resolver work (lands on MR-971). The shipped `bearer_token_env` + `auth.env_file` dotenv (`resolve_remote_bearer_token`) is **unchanged and still honored** — existing single-server dotenv setups keep working, and the resolver honors an explicit `auth: { token: {...} }` source (env/file/command/keychain) with `bearer_token_env` as its flat legacy alias. No `credentials.yaml`. 
+- **Validation tightens on invalid mixes, not valid legacy use.** Top-level `policy:` / `queries:` remain only for anonymous bare-URI compatibility. Named graphs use per-entry fields. Remote graph entries with local `policy:` / `queries:` and server manifests with `server:` graph locators are rejected because there is no correct way to honor those fields. + +## Open questions + +- **`graphs:` vs `targets:` naming churn.** Do we rename `graphs:` → `targets:` (with a deprecation alias) or keep `graphs:` for embedded and add `targets:` for remote? Leaning: keep both, document `targets:` as the superset. +- **Keychain integration scope.** Keychain is now the *primary* credential store (§5), so this is on the critical path, not optional: macOS Keychain first (matches operator practice) with the `0600` `[<server>]` profile file as fallback; Linux Secret Service / `pass` later. Open: which keyring crate, and the exact `OMNIGRAPH_TOKEN_<SERVER>` name-derivation (upper-snake, non-alnum → `_`). +- **Project-local `servers:`.** Allowed (e.g. a localhost dev server), merged with global. Confirm creds stay by-reference even for project-local servers (yes). +- **`aliases:` ⇄ `queries:` convergence.** Out of scope here; tracked separately. One registry with embedded + remote invocation surfaces is the target end state. +- **Single file vs. `KUBECONFIG`-style list.** Do we support `OMNIGRAPH_CONFIG` pointing at multiple files (colon-joined), or a single file only? Start single; revisit if demand appears. + +## Implementation — breadboard + slices (Shape A) + +Shaped via requirements + a fit check (Shape A — global-first layered config + unified `graphs:` entry + three-tier init — selected over a project-first minimal option and a Helix-clone). This section breadboards A and slices it. **Bold** = NEW. 
+ +### Places + +| # | Place | What | +|---|---|---| +| P1 | Disk | `~/.omnigraph/{config.yaml, credentials, cache/, state/}` + project `omnigraph.yaml` + `.env.omni` | +| P2 | Config resolution | runs on every command: load layers β†’ merge β†’ resolve `--graph` | +| P3 | Command execution | embedded engine OR remote HTTP client | +| P4 | Remote `omnigraph-server` | existing HTTP surface (`/query`, `/mutate`, `/queries/{name}`) | +| P5 | Scaffold | `login` / `init` / `quickstart` | + +### Affordances + +| # | Place | Affordance | NEW? | Wires | +|---|---|---|---|---| +| U1 | P1 | `~/.omnigraph/config.yaml` (operator edits) | **N** | β†’ N1 | +| U2 | P1 | project `./omnigraph.yaml` | β€” | β†’ N1 | +| U3 | P1 | `~/.omnigraph/credentials` / `.env.omni` dotenv (secrets, git-ignored) | β€” | β†’ N4 | +| U4 | P3 | `omnigraph --graph ` (any command) | β€” | β†’ N14 | +| U5 | P5 | `omnigraph login []` | **N** | β†’ N11 | +| U6 | P5 | `omnigraph init` / `quickstart [--template]` | partly | β†’ N12 / N13 | +| U7 | P2 | `omnigraph config view --resolved --show-origin` | **N** | β†’ N10 | +| N1 | P2 | `load_layered_config()` β€” global (N3) + project (cwd), serde each | **N** | β†’ N2 | +| N2 | P2 | **merge engine** β€” deep-merge settings; replace named-resource entries; replace lists; **retain provenance** and raw field origins | **N⚠️** | β†’ N5, β†’ S_merged | +| N3 | P2 | global-dir resolver β€” `OMNIGRAPH_HOME` else `~/.omnigraph/` | **N** | β†’ N1 | +| N4 | P2 | `load_env_file_into_process` β€” dotenv, real-env-wins (existing) | β€” | β†’ N9 | +| N5 | P2 | `resolve_graph(name, merged)` β†’ typed `Embedded`/`Remote` locator; rejects invalid role/field combinations before execution | **N⚠️** | β†’ N6 | +| N6 | P3 | `GraphConn` β€” `Embedded(engine)` \| `Remote(http)` dispatch | **N⚠️** | β†’ N7, β†’ N8 | +| N7 | P3 | embedded path β€” `Omnigraph::open(uri)` (existing) | β€” | β†’ engine | +| N8 | P3 | **HTTP-client path** β€” POST `/query`/`/mutate`/`/queries/{name}` | 
**N⚠️** | β†’ P4, β†’ N9 | +| N9 | P2 | `resolve_bearer_token(server)` β€” explicit `auth.token` source if set, else **keyed by name**: `OMNIGRAPH_TOKEN_`/`OMNIGRAPH_TOKEN` β†’ keychain `omnigraph:` β†’ `[]` profile; legacy `bearer_token_env`/dotenv (MR-971) | **N⚠️** | β†’ N8 | +| N10 | P2 | `config view` handler β€” merged + per-field origin (needs N2 provenance) | **N** | β†’ U7 | +| N11 | P5 | `login` handler β€” interactive auth β†’ write `config.yaml` + `credentials` (0600) + `.gitignore` | **N⚠️** | β†’ S_global | +| N12 | P5 | `init` handler β€” `scaffold_config_if_missing` + create graph; refuse-if-exists/`--force` purge (MR-975) | partly | β†’ S_project | +| N13 | P5 | `quickstart` handler β€” scaffold + `--template` + seed + `serve start` + agent prompt (MR-973; needs serve MR-970) | **N⚠️** | β†’ S_project | +| N14 | P3 | agent-mode wrapper β€” `--machine`/`OMNIGRAPH_AGENT_MODE`: JSON, structured errors, never-prompt, typed exit codes (MR-981) | **N⚠️** | β†’ N1 | +| S_global | P1 | `~/.omnigraph/config.yaml` + `credentials` | **N** | read by N1/N9 | +| S_project | P1 | `./omnigraph.yaml` + `.env.omni` | β€” | read by N1/N4 | +| S_merged | P2 | in-memory resolved config (per command, with provenance) | **N** | read by N5/N10 | +| S_cache | P1 | `~/.omnigraph/cache/` (remote catalogs) | **N** | read by N8 | + +```mermaid +flowchart TB + subgraph P1["P1: Disk"] + U1["U1: ~/.omnigraph/config.yaml"] + U2["U2: ./omnigraph.yaml"] + U3["U3: credentials dotenv"] + end + subgraph P2["P2: Config resolution"] + N3["N3: global-dir (OMNIGRAPH_HOME)"] + N1["N1: load_layered_config"] + N2["N2: merge engine (+provenance)"] + N4["N4: dotenv loader"] + N5["N5: resolve_graph(--graph)"] + N9["N9: resolve_bearer_token"] + N10["N10: config view"] + end + subgraph P3["P3: Command execution"] + U4["U4: omnigraph --graph"] + N14["N14: agent-mode wrapper"] + N6["N6: GraphConn embedded|remote"] + N7["N7: embedded Omnigraph::open"] + N8["N8: HTTP-client POST"] + end + subgraph 
P5["P5: Scaffold"] + U5["U5: login"]; U6["U6: init/quickstart"] + N11["N11: login handler"]; N12["N12: init"]; N13["N13: quickstart"] + end + P4["P4: remote omnigraph-server"] + U1-->N1; U2-->N1; N3-->N1; N1-->N2-->N5-->N6 + U3-->N4-->N9-->N8 + U4-->N14-->N1 + N6-->N7; N6-->N8-->P4 + N2-->N10-->U7["U7: config view --resolved"] + U5-->N11; U6-->N12; U6-->N13 + classDef ui fill:#ffb6c1,stroke:#d87093,color:#000 + classDef n fill:#d3d3d3,stroke:#808080,color:#000 + class U1,U2,U3,U4,U5,U6,U7 ui + class N1,N2,N3,N4,N5,N6,N7,N8,N9,N10,N11,N12,N13,N14 n +``` + +### Slices (vertical, each demo-able) + +| # | Slice | Parts/affordances | Demo | +|---|---|---|---| +| **V1** | **Global layer + merge + `config view`** | A1–A4 Β· N1,N2,N3,N10 Β· U1,U7,S_global,S_merged | Put config in `~/.omnigraph/`, run `omnigraph config view --resolved --show-origin` from any dir β†’ merged result with per-field origin; existing embedded commands work global-first with no project file | +| **V2** | **Remote graphs + HTTP client + creds** | A5–A7 Β· N5,N6,N8,N9 Β· S_cache | Define a `server:` graph entry; `omnigraph query --graph prod` hits the remote server (`curl`-free); embedded `--graph dev` still local | +| **V3** | **`omnigraph login`** | A8 Β· N11,U5 | `omnigraph login prod` writes `~/.omnigraph/credentials` (0600) + `.gitignore`; V2 remote query now works with no manual env | +| **V4** | **Thin-init hardening + quickstart + templates** | A9 Β· N12,N13,U6 (needs serve MR-970) | `omnigraph quickstart --template person-knows` scaffolds + seeds + serves; `init --force` purges (MR-975) | +| **V5** | **Agent-mode** | A10 Β· N14,U4 (MR-981) | `OMNIGRAPH_AGENT_MODE=1 omnigraph query …` β†’ JSON + structured errors + typed exit codes; never-prompt | + +V1 is the foundation (global-first + merge + view). V2 closes the substantive clientβ†’server gap. V3 is credential ergonomics. V4/V5 ride sibling tickets (MR-970/973/981). 
MR-969 (stored queries) ships independently and is reached by N8's `/queries/{name}` once V2 lands. + +## Rollout + +The slices above are the rollout order: **V1 (global layer + merge) β†’ V2 (remote graphs + HTTP client) β†’ V3 (login) β†’ V4 (quickstart/templates, on MR-970) β†’ V5 (agent-mode, MR-981).** V1–V2 close the substantive gap (global-first config + `curl`-free server access); V3–V5 are ergonomics that ride sibling tickets. Evaluate after V2 against early-adopter and agent-onboarding (MR-973 / MR-974) signal. The spikes (X1 HTTP-client, X2 merge engine, X3 resolver+provenance, X4 login) resolve before their owning slice. + +## Prior art + +- kubeconfig (clusters / users / contexts; `KUBECONFIG`; `kubectl config view`) +- Helix CLI v2 (`helix.toml` local+enterprise instance blocks; `~/.helix/config`; `~/.helix/credentials`) +- AWS CLI (`~/.aws/config` + `~/.aws/credentials` split; named profiles; `credential_process`) +- git (`~/.gitconfig` + `.git/config`; `--show-origin`) +- Cargo (`Cargo.toml` manifest + `~/.cargo/config.toml`) +- Supabase / Prisma (one project manifest; connection via `DATABASE_URL` env) +- 12-factor app (config that varies by deploy lives in the environment) diff --git a/docs/dev/rfc-003-mcp-server-surface.md b/docs/dev/rfc-003-mcp-server-surface.md new file mode 100644 index 0000000..32fbce5 --- /dev/null +++ b/docs/dev/rfc-003-mcp-server-surface.md @@ -0,0 +1,270 @@ +# RFC: MCP Server Surface for `omnigraph-server` β€” Full Tool Parity, Stored Queries, Modular Auth + +**Status:** Proposed +**Date:** 2026-06-01 +**Tickets:** MR-969 (stored queries + MCP exposure β€” the surface this completes), MR-956 (federated auth / WorkOS OAuth β€” the auth substrate this consumes), MR-971 (per-server credential resolver), MR-974 (agent setup surface β€” the installer that wires this), MR-668 (multi-graph server β€” shipped, the routing this builds on) +**Builds on:** [omnigraph#128](https://github.com/ModernRelay/omnigraph/pull/128) 
(`ragnorc/stored-queries-mcp`) β€” the shipped stored-query registry, `GET /queries`, `POST /queries/{name}`, and the coarse `invoke_query` gate. +**Supersedes:** the MCP-transport portion of [rfc-001-queries-envelope-mcp.md](rfc-001-queries-envelope-mcp.md) (`/mcp/tools` + `/mcp/invoke`). See [Relationship to RFC-001](#relationship-to-rfc-001). +**Target release:** v0.8.x (phased β€” see Rollout) + +## Summary + +Add a first-class **MCP (Model Context Protocol) server surface to `omnigraph-server`**, exposed over **Streamable HTTP**, that projects the server's operations as MCP tools and resources for LLM clients (Claude Code/Desktop/web, Cursor, etc.). Two populations of tools share one projection path: + +1. **Built-in operational tools** β€” parity with the existing `@modernrelay/omnigraph-mcp` stdio package's **13 tools** (`health`, `snapshot`, `read`, `schema_get`, `branches_list`, `commits_list`, `commits_get`, `change`, `ingest`, `branches_create`, `branches_delete`, `branches_merge`, `schema_apply`) and its **2 resources** (`omnigraph://schema`, `omnigraph://branches`), plus a new server-scoped `graphs_list` tool and an `omnigraph://graphs` resource (multi-graph mode). +2. **Dynamic stored-query tools** β€” one MCP tool per `mcp.expose: true` entry in the `queries:` registry (MR-969 / #128), with parameters typed from the `.gq` declaration via the shipped `query_catalog_entry` / `param_descriptor` projection. + +Every tool is **authorized by the server's existing Cedar policy engine**. The MCP layer never implements its own authentication: it consumes an **already-resolved `ResolvedActor`** from the server's bearer middleware (`require_bearer_auth` today; the `TokenVerifier` seam when MR-956 lands), so the **same MCP endpoint serves on-prem (static or customer-OIDC tokens) and our cloud (WorkOS OAuth) by configuration only**. Cloud OAuth is an additive layer (RFC 9728 protected-resource metadata) that slots in with zero MCP changes. 
+ +The end-state collapses two diverging tool implementations into one: the in-server MCP is the canonical, Cedar-gated, remotely-reachable surface; the stdio package becomes a thin stdio↔HTTP proxy (local on-ramp) over it. + +> **Key caveat, stated up front (see Β§5.9 below):** the headline "a token scoped via Cedar to a *specific set* of stored queries" requires **per-query `invoke_query` scope**, which is *designed* (rfc-001) but **not yet implemented** β€” the shipped action is coarse (any stored query on the graph, or none). Per-actor Cedar curation works today for *built-in vs ad-hoc vs admin* tools and for *stored-vs-ad-hoc*; sub-selecting individual stored queries per actor is gated on a prerequisite (PR 0b). Until then, stored-query curation is graph-level (registry membership + `mcp.expose`). + +## Relationship to RFC-001 + +[rfc-001-queries-envelope-mcp.md](rfc-001-queries-envelope-mcp.md) (MR-656 / MR-976 / MR-969) is the parent design for stored queries + the response envelope + MCP. This RFC is the **detailed MCP-transport design** that #128 left for a follow-up, and it **revises rfc-001 in three places where the shipped code or the MCP wire protocol diverged from rfc-001's sketch**: + +1. **Transport shape.** rfc-001 sketched `GET /mcp/tools` + `POST /mcp/invoke` (a bespoke REST pair). **That is not the MCP wire protocol β€” real MCP clients cannot connect to it.** This RFC implements actual MCP JSON-RPC over Streamable HTTP and reuses `query_catalog_entry` as a *projection source*, not a parallel surface. (rfc-001's own Open Question already leaned toward Streamable HTTP.) +2. **Exposure config.** rfc-001 specified inline `.gq` pragmas (`@mcp(expose=…)`, default `expose=false`). **#128 shipped a different mechanism:** YAML `queries..mcp.expose` in `omnigraph.yaml`, **default `true`** (declaring a query in the manifest *is* the opt-in). This RFC builds on the shipped YAML form; the `.gq`-pragma design in rfc-001 is superseded for exposure. +3. 
**Schema introspection.** rfc-001 lists "Schema introspection through MCP" as a **non-goal** ("agents see types through declared return shapes"). This RFC **revises that**: the operational-parity tools include `schema_get` and `omnigraph://schema` β€” *because the shipped stdio package already exposes both*. The non-goal is achieved by *policy*, not omission: `schema_get`/`omnigraph://schema` are Cedar-gated by `Read`, and the recommended locked-down agent policy denies `Read`, so a curated agent still never sees the schema. (rfc-001's intent is preserved; the mechanism moves from "don't build it" to "build it, gate it.") + +Everything else in rfc-001 (two-paths-one-engine, per-query `invoke_query` *as the intended scope*, the response envelope, multi-graph per-graph endpoints) this RFC consumes unchanged. + +> **Numbering note:** the `TokenVerifier`/WorkOS auth design is referred to in code (`crates/omnigraph-server/src/identity.rs`) as "RFC 0001," which is a *different* document from this repo's `docs/dev/rfc-001-queries-envelope-mcp.md`. To avoid the collision this RFC cites the auth substrate as **MR-956** throughout, never "RFC 0001." + +## Reconciliation with shipped code (verified against `ragnorc/stored-queries-mcp` HEAD) + +Verified against `crates/omnigraph-server/src/{lib.rs,api.rs}` and `crates/omnigraph-policy/src/lib.rs` at the current branch head (not the #128 PR body, and not `api.rs` alone): + +- βœ… `GET /queries` returns the `mcp.expose == true` subset as `QueriesCatalogOutput { queries: [QueryCatalogEntry] }`, each with typed `ParamDescriptor`s, `tool_name`, `description`, `instruction`, and a `mutation` flag. **MCP-ready projection, but exposed as bespoke REST/JSON β€” not the MCP wire protocol.** +- βœ… `POST /queries/{name}` route exists (`server_invoke_query`, `lib.rs`). +- βœ… `query_catalog_entry()` / `param_descriptor()` with an exhaustive `ScalarType β†’ ParamKind` map (a new scalar is a compile error). 
+- βœ… `InvokeQuery` Cedar action defined in `omnigraph-policy`. +- βœ… **`InvokeQuery` IS enforced** at `POST /queries/{name}`: `server_invoke_query` calls `authorize(PolicyAction::InvokeQuery)` and **masks a denial to a 404 identical to "unknown query"** so the catalog isn't probeable (the denial-masking the previous draft of this RFC reported as missing is shipped β€” it lives in `lib.rs`, not `api.rs`). The stored-mutation path is already double-gated: `InvokeQuery` outer, then `Change` inside `run_mutate`. +- βœ… **Reuse path exists:** `run_query` / `run_mutate` are already decoupled from their HTTP request bodies and take registry-supplied `(source, name, params, branch/snapshot)`. MCP `tools/call` for both stored and ad-hoc tools delegates to these β€” no new business logic. +- ❌ **Per-query (`invoke_query[name]`) scope is NOT implemented.** `PolicyRequest` carries only `{action, branch, target_branch}` β€” **no query-name dimension** β€” and the action is documented coarse ("permits *any* stored query on the graph"). rfc-001 *designed* per-name scope; it is unbuilt. This RFC's per-query Cedar filtering (Β§5.4) and recommended agent policy (Β§5.9) depend on it β†’ tracked as **PR 0b**. +- ❌ No MCP protocol surface (`initialize`/`tools/list`/`tools/call`, JSON-RPC, transport). +- ❌ No `TokenVerifier` trait yet β€” `require_bearer_auth` resolves a `ResolvedActor` inline (static-hash). The trait/`OidcJwtVerifier` are MR-956 (draft). The MCP layer's only requirement β€” *consume `ResolvedActor`* β€” is satisfiable today. + +Stack (verified `Cargo.toml`): Axum + utoipa (OpenAPI) + `omnigraph-policy` (Cedar) + `futures` + `tokio`. **No MCP crate present.** `edition = "2024"`. 
+ +## Motivation + +- **One curated, safe, remotely-reachable tool surface.** MR-969's thesis: hand an LLM a token Cedar-scoped to a set of tools and it sees exactly those typed tools β€” cannot construct ad-hoc queries it isn't permitted, cannot read the schema it isn't permitted, cannot reach other graphs. Today the only MCP is the stdio package: local-only, full surface, ungated. +- **Parity, so the in-server MCP can be the single implementation.** Operators/agents already depend on the operational tools. Supporting them server-side behind one Cedar gate lets the stdio package degrade to a proxy and removes two diverging tool sets. +- **On-prem and cloud from one endpoint.** A managed cloud (WorkOS OAuth) and an on-prem/air-gapped deploy (static or customer-OIDC tokens) must serve the same MCP without forks or MCP-specific auth. +- **Foundation for the agent on-ramp (MR-974).** `omnigraph mcp install --agent ` needs a decided transport + a stable endpoint. + +## Goals + +- Project built-in tools + stored queries as MCP tools through **one** registry abstraction. +- `tools/list` and the callable set are **identical for argument-independent authorization**, both driven by Cedar (see Β§5.4 for the branch-scoped caveat). +- The MCP layer is **auth-method-agnostic**: it consumes `ResolvedActor`, never a raw token, never branches on how auth happened. +- The same endpoint works on-prem (static/OIDC) and cloud (WorkOS OAuth), switched by config; cloud OAuth is additive (RFC 9728). +- No new business logic: MCP tools delegate to the same `run_query`/`run_mutate`/branch/schema functions the HTTP routes call. +- Behaviour-neutral when unused: no MCP traffic = no change. + +## Non-Goals + +- **Building/hosting an OAuth authorization server.** The server is a Resource Server; WorkOS AuthKit+Connect is the AS (MR-956). The MCP endpoint validates tokens, never issues them, never holds client secrets. +- **OAuth/WorkOS implementation itself** β€” MR-956's work. 
This RFC leaves a clean RFC-9728 hook and consumes `ResolvedActor`. +- **MCP prompts, elicitation, `tools/list_changed`, resource subscriptions, server-initiated messages.** None needed β†’ enables a stateless POST-only transport (Β§5.6). +- **stdio transport inside the server.** stdio stays in the TS package (now a proxy). +- **Cross-graph tool listing.** Per-graph catalogs only (MR-969 + RFC-002 non-goal). +- **Hot reload of the query registry.** Restart-only (MR-969). + +## Background + +`omnigraph-server` (Axum) already implements every operation this RFC exposes as an authenticated HTTP route; each authorizes via a `PolicyAction` against the Cedar policy for a server-resolved actor and calls into the engine. The existing stdio MCP package is a *client* of these routes (it owns no business logic). MR-956 will introduce a `TokenVerifier` trait (`StaticHashTokenVerifier` today inline, `OidcJwtVerifier` for OIDC/WorkOS) producing the `ResolvedActor { actor_id, tenant_id: Option, scopes: Vec, source }` that already exists in `identity.rs` and is consumed by Cedar β€” token *validation* is offline (cached JWKS), so on-prem/air-gapped has no request-path dependency on the cloud. + +## Design + +### 5.1 One tool model: a `McpTool` trait, two populators + +Both built-in and stored-query tools implement one trait so `tools/list` / `tools/call` never special-case: + +```rust +trait McpTool: Send + Sync { + fn name(&self) -> &str; // MCP tool id (stable) + fn title(&self) -> Option<&str>; + fn description(&self) -> &str; + fn input_schema(&self) -> serde_json::Value; // JSON Schema (draft 2020-12) + fn annotations(&self) -> ToolAnnotations; // readOnlyHint / destructiveHint / idempotentHint + /// The Cedar request(s) this call requires, given parsed args. Used BOTH at + /// list-time (dry-run filter, default args) and call-time (enforce, real args). 
+ fn authorization(&self, args: &ToolArgs) -> Vec<PolicyRequest>; + async fn call(&self, ctx: &GraphCtx, args: ToolArgs) -> Result<ToolResult, ToolError>; // result/error type names illustrative +} +``` + +- **Built-ins**: ~14 static impls, each delegating to the *same* function its HTTP route calls (`run_query`, `run_mutate`, branch ops, `apply_schema_as`, …). `input_schema` authored once (or derived from each route's existing `utoipa`/`ToSchema` DTO). +- **Stored queries**: generated `McpTool` instances, one per `mcp.expose` entry; `input_schema` from `param_descriptor` (§5.3); `authorization` → `InvokeQuery` (coarse today; `InvokeQuery{name}` after PR 0b) then the inner `Read`/`Change`. + +`ToolRegistry` for a graph = the static built-ins + the dynamic stored-query tools resolved from that graph's `GraphHandle` registry. + +### 5.2 Tool catalog (parity) and Cedar mapping + +Each built-in **reuses the exact `PolicyAction` its HTTP route already enforces** — verified against the handlers in `lib.rs`, not invented: + +| MCP tool | Scope | Read/Mutate | Cedar action (verified from route) | +|---|---|---|---| +| `health` | server | read | none (liveness/version) | +| `graphs_list` *(new)* | server | read | `GraphList` | +| `snapshot` | graph | read | `Read` | +| `schema_get` | graph | read | `Read` | +| `branches_list` | graph | read | `Read` | +| `commits_list`, `commits_get` | graph | read | `Read` | +| `query` (ad-hoc `.gq`) / `read` *(deprecated alias)* | graph | read | `Read` | +| `mutate` (ad-hoc `.gq`) / `change` *(deprecated alias)* | graph | mutate | `Change` | +| `ingest` (NDJSON) | graph | mutate | `Change` (+ `BranchCreate` when forking a new branch) | +| `branches_create` | graph | mutate | `BranchCreate` | +| `branches_delete` | graph | mutate | `BranchDelete` | +| `branches_merge` | graph | mutate | `BranchMerge` | +| `schema_apply` (`allow_data_loss`) | graph | mutate | `SchemaApply` | +| **stored query** (`find_user`, …) | graph | inferred | `InvokeQuery` (coarse; `InvokeQuery{name}` after PR 0b) + inner `Read`/`Change` | + +There is **no 
`Ingest` and no separate `snapshot`/`Export` action** — `ingest` enforces `Change`, `snapshot` enforces `Read`. (`Export` exists but maps to the `/export` route, which this RFC does not expose as a tool.) + +**Tool id parity vs. canonicalization.** The shipped stdio package uses tool ids **`read`/`change`** (and calls the deprecated `/read`,`/change` routes). The server HTTP surface canonicalized to `/query`,`/mutate` with `/read`,`/change` deprecated (MR-656). To keep existing package clients working *and* align with the server, the MCP exposes **`query`/`mutate` as canonical with `read`/`change` retained as deprecated-but-live aliases** (both dispatch to the same handler). Open Q7 asks whether to drop the aliases later. + +Resources (§5.5): `omnigraph://schema`, `omnigraph://branches` (parity), plus `omnigraph://graphs` *(new)* — each gated by the same action as its list/get route (`Read`, `Read`, `GraphList`). + +### 5.3 `ParamDescriptor → JSON Schema` (stored-query tools) + +| `ParamKind` | JSON Schema | Notes | +|---|---|---| +| String | `{"type":"string"}` | | +| Bool | `{"type":"boolean"}` | | +| Int (i32/u32) | `{"type":"integer"}` | | +| BigInt (i64/u64) | `{"type":"string","pattern":"^-?\\d+$"}` | JSON numbers lose precision >2⁵³ → string (matches the shipped `api.rs` rationale). (Open Q1) | +| Float (f32/f64) | `{"type":"number"}` | | +| Date | `{"type":"string","format":"date"}` | | +| DateTime | `{"type":"string","format":"date-time"}` | | +| Blob | `{"type":"string","contentEncoding":"base64"}` | | +| Vector | `{"type":"array","items":{"type":"number"},"minItems":dim,"maxItems":dim}` | uses `vector_dim` | +| List | `{"type":"array","items":<item schema>}` | scalar items only (grammar guarantees) | + +`nullable == false` → param is in `required`. Annotations: `mutation` → `{readOnlyHint:false, destructiveHint:true}`; else `{readOnlyHint:true}`. `description` → tool description; `instruction` → appended to description (or `_meta`). 
(The shipped `check()` already warns when an `mcp.expose` query declares a `Vector` param an LLM can't supply.) + +For built-in tools the schema is hand-authored from the route DTO; e.g. `query` β†’ `{source: string, branch?: string, params?: object}`; `schema_apply` β†’ `{schema: string, allow_data_loss?: boolean}`; `ingest` β†’ `{ndjson: string, mode?: "merge"|"append"|"overwrite", branch?: string}`. + +### 5.4 `tools/list` (Cedar-filtered) and `tools/call` (dispatch + masking) + +- **`tools/list`**: build the `ToolRegistry`; for each tool evaluate `authorization(default_args)` against the actor's Cedar policy; **emit only tools that authorize**. Authz decisions memoized per request. Stored-query tools additionally require `mcp.expose: true`. + - **Exactness caveat (R7 is conditional):** the listed set equals the callable set **only for tools whose authorization is argument-independent** (`health`, `graphs_list`, `snapshot`, `schema_get`, `branches_list`, `commits_*`, ad-hoc `query`/`mutate`, and stored queries under the *coarse* action). For **branch-scoped tools** (`branches_create`/`merge` with `target_branch_scope`, and any branch-scoped `Read`/`Change` rule), list-time uses `default_args` (e.g. branch `main`) and cannot know the real target, so the listed set is a *best-effort approximation* of callability β€” a call may still be denied (or, rarely, a hidden tool would have been allowed). `tools/call` is always the authoritative gate. The contract is: **list never shows a tool the actor can't ever call; for branch-scoped tools it may show one the actor can call only on some branches.** +- **`tools/call`**: resolve `name` β†’ `McpTool` (masked-404 if unknown *or* `mcp.expose:false`); parse+validate args against `input_schema`; enforce `authorization(args)` (mutations stay double-gated: `InvokeQuery` then `Change`); on success `call`. 
**Denial masking** lives in one place (the dispatcher): an authz denial is returned identically to "unknown tool" (Β§5.10), reusing the same deny≑missing principle already shipped at `POST /queries/{name}`. + +### 5.5 Resources + +Advertise `resources` capability (`subscribe:false, listChanged:false`). `resources/list` β†’ the URIs the actor may read; `resources/read` β†’ schema `.pg` text / branches JSON / (multi-graph) graphs JSON, each gated by the corresponding action (`Read`, `Read`, `GraphList`). A locked-down agent denied `Read` simply never sees `omnigraph://schema` or `omnigraph://branches` β€” this is how rfc-001's "agents don't introspect schema" intent is met *by policy* (Β§Relationship-to-RFC-001). + +### 5.6 Transport: Streamable HTTP, stateless, POST-only + +- **Streamable HTTP** (MCP's current standard; we're already an HTTP server). One endpoint per scope (Β§5.7). +- Because the server emits **no** server-initiated messages, implement the **minimal conformant** shape: client `POST`s JSON-RPC, server replies `application/json`. **No SSE channel, no `Mcp-Session-Id`, stateless** β€” each request authenticated independently via the bearer middleware. Honour the `MCP-Protocol-Version` header. SSE/sessions can be added later if subscriptions land. +- **JSON-RPC methods:** `initialize` (advertise `{tools:{listChanged:false}, resources:{listChanged:false, subscribe:false}}` + serverInfo/version), `notifications/initialized` (no-op ack), `ping`, `tools/list`, `tools/call`, `resources/list`, `resources/read`. `prompts/list` returns empty if probed. +- **Library decision (Open Q2):** spike `rmcp` (official Rust MCP SDK) for conformance + Streamable-HTTP/Axum on edition 2024; **fall back to a hand-rolled ~150 LOC JSON-RPC-over-POST** (only the methods above) on friction. Given the tiny surface, hand-roll is an acceptable default. 
+ +### 5.7 Endpoint routing (server- vs graph-scoped) + +- **Single-graph mode:** `POST /mcp` β€” graph tools + server tools (`health`, `graphs_list`). +- **Multi-graph mode (MR-668):** `POST /graphs/{graph_id}/mcp` β€” graph-scoped tools for that graph; plus a server-level `POST /mcp` exposing only server-scoped tools (`health`, `graphs_list`). A per-graph endpoint never lists another graph's tools (isolation, tested). Mirrors the shipped `/graphs/{graph_id}/…` cluster routing. (Open Q5: confirm naming + whether server tools also appear on the per-graph endpoint.) + +### 5.8 Modular / decoupled auth (the cross-cutting requirement) + +**Invariant (load-bearing, satisfiable today):** the MCP handler receives an **already-resolved `ResolvedActor`** and **branches on nothing** about how the token was verified. No token parsing, no method check, no OAuth inside the MCP module. Today that actor comes from `require_bearer_auth`; when MR-956 lands it comes from a `TokenVerifier` β€” the MCP code is identical either way. + +``` +request β†’ [auth middleware: ResolvedActor] β†’ [MCP route] β†’ Cedar β†’ McpTool +``` + +**Server side β€” auth is config, not code:** + +| Deployment | Verifier | MCP change | +|---|---|---| +| On-prem, static bearer | `require_bearer_auth` / `StaticHashTokenVerifier` | none | +| On-prem, customer IdP | `OidcJwtVerifier` β†’ customer issuer (MR-956) | none | +| Our cloud | `OidcJwtVerifier` β†’ WorkOS, `tenant_id = Some(org_id)` (MR-956) | none | + +Token validation is offline (cached JWKS) β€” on-prem/air-gapped keeps working with no request-path cloud dependency. The MCP endpoint never terminates OAuth and never holds a client secret (Resource Server only). + +**Cloud client negotiation β€” additive, no MCP changes:** when MR-956 lands, the server publishes RFC 9728 `/.well-known/oauth-protected-resource` and returns `WWW-Authenticate: Bearer ..., resource_metadata="..."` on 401. 
A compliant MCP client (Claude) then auto-negotiates: static bearer to an on-prem endpoint; on a cloud 401 it discovers the WorkOS AS and runs OAuth/PKCE itself — **same endpoint URL, zero client-side branching.** This RFC only requires that MCP routes flow through the standard 401 path so that hook can be added later without touching MCP. + +**Multi-user identity pass-through (cloud):** the *caller's* token (a WorkOS JWT, audience-bound per-tenant) must reach the server so Cedar enforces per-user/per-tenant policy — never a shared service token. The MCP endpoint validates it offline and maps `org_id → tenant_id`. This is why the **remote path is the in-server HTTP MCP that Claude connects to directly** (its token flows through), not a stdio bridge impersonating a user. + +**Client-side credential acquisition (CLI/SDK/proxy) — pluggable `CredentialSource`** (RFC-002 §5, MR-971), keyed by server name, so OAuth is a future *sibling key*, not a re-key: + +```yaml +servers: + onprem: { endpoint: https://og.internal:8080, auth: { token: { env: OG_TOKEN } } } + edge: { endpoint: https://og-edge, auth: { token: { command: [vault, read, -field=token, secret/og] } } } + cloud: { endpoint: https://api.omnigraph.cloud, auth: { oauth: { issuer: workos } } } # future sibling +``` + +Implicit chain when `auth:` omitted: `OMNIGRAPH_TOKEN_<NAME>` → keychain `omnigraph:<name>` → `[<name>]` in `~/.omnigraph/credentials`; legacy `bearer_token_env` honoured. Secrets never inlined. + +### 5.9 Safety model — Cedar is the gate, default-deny is the floor + +With ad-hoc `query`/`mutate`/`schema_apply` present as tools, the **only** thing protecting an untrusted agent is the Cedar policy. Therefore: + +- **Default-deny when tokens are configured** (MR-723, shipped) is the floor — an actor with no grants sees an empty tool list. 
+- **What works today (coarse action):** a policy can hide all ad-hoc tools and admin tools per-actor (`deny Read, Change, SchemaApply, Branch*`) while allowing stored queries (`allow InvokeQuery`). That already reproduces "can't run ad-hoc, can't read schema, can only call stored queries" — the agent sees *every* exposed stored query plus nothing else. +- **What needs PR 0b (per-query scope):** selecting *which* stored queries an actor may call (`allow InvokeQuery [find_user, list_orders]`, deny the rest). The shipped `invoke_query` is coarse (all stored queries or none). Until PR 0b adds a query-name dimension to `PolicyRequest` + the Cedar schema (rfc-001's intended design), per-actor sub-selection of stored queries is **not expressible**; curation is graph-level (which `.gq` files are registered + `mcp.expose`). +- `schema_apply`, `branches_delete`, ad-hoc `mutate` require an explicit admin-tier grant; never in a default agent policy. +- (Open Q3) Optional `mcp.allow_adhoc` server switch defaulting **off** for the ad-hoc `query`/`mutate` tools — defence-in-depth independent of Cedar, and independent of PR 0b. + +### 5.10 Result shaping and error mapping + +- **Success:** `tools/call` returns `content: [{type:"text", text:<json>}]` where `<json>` is the route's existing output envelope (read rows / mutation summary, i.e. `ReadOutput` / `ChangeOutput`). (Open Q4: also emit `structuredContent` + `outputSchema` — defer; text-JSON for v1.) +- **Tool execution error** (bad params after schema validation, engine error): result with `isError:true` + a text content block. +- **Authorization denial / unknown tool / `mcp.expose:false`:** a single JSON-RPC error (`-32602`, message `"unknown tool"`) — identical for all three so policy isn't probeable (same principle as the shipped `POST /queries/{name}` 404 masking). 
+- **Auth failure** (bad/absent bearer): HTTP 401 from the middleware *before* MCP β€” carries `WWW-Authenticate` (the RFC 9728 hook), never masked as a tool error. (This is exactly the path the shipped `authorize`/`authorize_request` split preserves: operational failures keep their status; only *denials* are masked.) + +## Relationship to the `@modernrelay/omnigraph-mcp` stdio package + +Verified surface of the package (`omnigraph-ts`, pkg version `0.3.0`, `@modelcontextprotocol/sdk@^1.29.0`, **stdio only**): **13 tools** (`health`, `snapshot`, `read`, `schema_get`, `branches_list`, `commits_list`, `commits_get`, `change`, `ingest`, `branches_create`, `branches_delete`, `branches_merge`, `schema_apply`) and **2 resources** (`omnigraph://schema`, `omnigraph://branches`). It is a thin client over the SDK β†’ HTTP routes and **forwards the caller's bearer verbatim** (no inspection). + +Once parity lands, **collapse to one implementation**: the in-server MCP is canonical (Cedar-gated, remote-capable, the path that becomes a Claude-web connector via MR-956). The stdio package degrades to a **thin stdio↔HTTP proxy** forwarding JSON-RPC (and the incoming `Authorization`) to `/mcp` β€” staying the local on-ramp for Claude Code/Desktop while sharing one tool set, one Cedar gate. Transition: keep the current independent stdio package on its `0.3.x`/`0.6.x` line; ship proxy mode in a later TS minor once the server endpoint is GA. (Note: the package is currently several minors behind the server β€” its vendored `spec/openapi.json` predates the stored-query routes β€” so it needs the standard re-sync regardless of MCP work.) + +## Testing + +- **Protocol conformance:** `initialize` handshake + advertised capabilities; `tools/list` shape; `tools/call` happy path; JSON-RPC error envelopes (`-32601` unknown method, `-32602` invalid params / unknown tool); `resources/list` + `resources/read`. 
+- **Cedar filtering (coarse, today):** an actor with `allow InvokeQuery` + `deny Read/Change` sees *all* exposed stored queries but **not** `query`/`mutate`/`schema_get`; `tools/call query` returns masked "unknown tool"; an admin sees the full catalog. +- **Cedar filtering (per-query, gated on PR 0b):** actor scoped to `InvokeQuery [find_user]` sees *only* `find_user`; `tools/call list_orders` masks. **This test ships with PR 0b**, not PR 1 β€” it cannot pass against the coarse action. +- **Parity per built-in:** each tool round-trips against the same expectations as its HTTP route (reuse route tests); `read`/`change` aliases dispatch identically to `query`/`mutate`. +- **Double-gating:** a stored mutation requires both `InvokeQuery` and `Change`; `schema_apply` requires `SchemaApply`. +- **`mcp.expose:false`:** absent from `GET /queries` and MCP `tools/list`; still service-callable by name through `POST /queries/{name}` when the actor has `invoke_query`, but not MCP-callable. +- **Schema generation:** table-driven over every `ParamKind` incl. nullable / list / vector(dim). +- **Branch-scoped list approximation:** assert the documented R7 caveat β€” a branch-scoped policy lists `branches_create`, and `tools/call` is the authoritative gate (a denied target still 403s/masks). +- **Multi-graph isolation:** `/graphs/a/mcp` never lists graph `b`'s tools; server `/mcp` exposes only server tools. +- **Auth decoupling:** the MCP suite is green under the current `require_bearer_auth` and under a mock OIDC `ResolvedActor` source β€” proving verifier-agnosticism. A 401 carries `WWW-Authenticate`. +- **OpenAPI:** the JSON-RPC endpoint is not REST β€” document only the envelope in utoipa (or exclude); keep `openapi.json` drift test green (`OMNIGRAPH_UPDATE_OPENAPI=1` to regenerate on intentional change). +- **Cross-repo smoke (optional):** point `@modelcontextprotocol/sdk` (TS) at the HTTP endpoint in an `omnigraph-ts` integration test. 
+ +## Rollout β€” phased by risk + +- **PR 0a β€” extract the reusable invoke path (small).** The coarse `invoke_query` gate + 404 denial-masking are **already shipped** in `server_invoke_query`. Extract the read/mutate dispatch into `invoke_stored_query(handle, name, params, branch/snapshot, actor)` so MCP `tools/call` and the HTTP route share one path. No behaviour change. *(Replaces the previous draft's "PR 0 β€” wire the gate", which was already done.)* +- **PR 0b β€” per-query `invoke_query` scope (the safety prerequisite).** Add a query-name dimension to `PolicyRequest` + the Cedar schema (rfc-001's intended design), wire it at `POST /queries/{name}` and in the stored-query `McpTool::authorization`. Independently useful (the `allow InvokeQuery [find_user]` policy). **Gates the per-query Cedar-filtering test and Β§5.9's recommended agent policy.** +- **PR 1 β€” MCP transport + read-only parity + stored-query reads.** Endpoint(s), `initialize`/`tools/list`/`tools/call`/`resources/*`, the `McpTool` registry, Cedar-filtered listing, the read-only built-ins (`health`, `graphs_list`, `snapshot`, `read`/`query`, `schema_get`, `branches_list`, `commits_*`) + resources + stored-query *reads*. All auth-agnostic. +- **PR 2 β€” mutating parity + stored-query mutations.** `change`/`mutate`, `ingest`, `branches_create/delete/merge`, `schema_apply`, stored-query mutations + the `mcp.allow_adhoc` switch. +- **PR 3 β€” docs + agent on-ramp hook.** `docs/user/server.md` MCP section (incl. the recommended agent policy + the coarse-vs-per-query caveat), `openapi.json` sync, the `omnigraph mcp install` config target (MR-974), and the downstream `omnigraph-ts` re-sync/proxy follow-up. +- **Later (separate, MR-956):** RFC 9728 protected-resource metadata + WorkOS β€” slots in with zero MCP changes. +- **Later (TS minor):** stdio package β†’ proxy mode. + +## Migration / backwards compatibility + +- **Additive.** No `queries:` and no MCP traffic β†’ today's behaviour unchanged. 
New endpoints are new routes. +- **Cedar default-deny** (when tokens configured) means MCP exposes nothing until an actor is granted β€” safe by default. +- The stdio package keeps working unchanged; proxy mode is opt-in later. +- `openapi.json` only gains the documented MCP envelope; existing REST routes untouched. + +## Open Questions + +1. **BigInt/u64 as JSON string** (recommended, precision-safe) vs number. +2. **`rmcp` vs hand-rolled** JSON-RPC (spike `rmcp` on edition 2024; default to hand-roll on friction). +3. **Default-off `mcp.allow_adhoc`** for ad-hoc `query`/`mutate` (recommended) vs always-on + Cedar-only. +4. **`structuredContent` + `outputSchema`** now vs text-JSON v1 (recommend v1 text-JSON). +5. **Endpoint paths:** `/mcp` + `/graphs/{id}/mcp` β€” confirm naming and whether server-scoped tools also appear on the per-graph endpoint. +6. **Stateless POST-only** confirmed (no near-term server-initiated messages) β€” revisit only if subscriptions land. +7. **Legacy alias tools** (`read`/`change`): keep for client compat (the shipped package uses them), or drop and rely on `query`/`mutate`? +8. **PR 0b shape:** per-query scope as a Cedar *resource* (`StoredQuery::"find_user"`) vs a `query_name` *context attribute* + policy condition β€” affects how `allow InvokeQuery [list]` is authored. diff --git a/docs/dev/writes.md b/docs/dev/writes.md index 974f7a6..8b692b4 100644 --- a/docs/dev/writes.md +++ b/docs/dev/writes.md @@ -14,8 +14,11 @@ publisher's row-level CAS on `__manifest` is the single fence. - No `RunRecord`, no `_graph_runs.lance`, no `_graph_run_actors.lance`. - No `omnigraph run *` CLI subcommands and no `/runs/*` HTTP endpoints. -- No `__run__` staging branches. (Legacy on-disk artifacts from - pre-MR-771 repos are inert; MR-770 sweeps them in production.) +- No `__run__` staging branches; `__run__*` is no longer a reserved + name. 
The branch-name guard was removed in MR-770, and any stale + `__run__*` branch on an upgraded graph is swept off `__manifest` by the + v2β†’v3 internal-schema migration on first read-write open. (The inert + `_graph_runs.lance` bytes remain until a `delete_prefix` primitive lands.) - Cancelled mutation futures leave **no graph-level state** β€” only orphaned Lance fragments, which the existing `omnigraph cleanup` pipe reclaims. @@ -245,9 +248,14 @@ list`. ## Migration code -`db/manifest/migrations.rs` does not change. Active deletion of -`_graph_runs.lance` belongs in MR-770 (the production sweep) β€” this PR -stops *creating* run state but does not destroy legacy bytes on disk. +`db/manifest/migrations.rs` carries the v2β†’v3 internal-schema step (MR-770): +a one-time sweep that deletes legacy `__run__*` staging branches off +`__manifest`. It runs in `Omnigraph::open(ReadWrite)` (via +`manifest::migrate_on_open`, before the coordinator reads branch state) and +again on the publisher's write path; both are idempotent once the stamp is at +v3. Deleting the inert `_graph_runs.lance` / `_graph_run_actors.lance` dataset +*bytes* is still deferred β€” it needs a `StorageAdapter::delete_prefix` +primitive β€” but those bytes are invisible to graph-level state. ## Mid-query partial failure: closed by MR-794 diff --git a/docs/releases/v0.6.1.md b/docs/releases/v0.6.1.md new file mode 100644 index 0000000..0acc34b --- /dev/null +++ b/docs/releases/v0.6.1.md @@ -0,0 +1,28 @@ +# Omnigraph v0.6.1 + +v0.6.1 focuses on operational polish after v0.6.0: stored-query registries, safer branch cleanup, more complete release artifacts, and a Lance blob-compaction workaround. + +## Highlights + +- **Stored-query registries.** `omnigraph.yaml` can declare curated `queries:` blocks per graph. 
Servers load and type-check them at startup, `omnigraph queries validate` checks them offline, `omnigraph queries list` shows exposed queries and typed params, `GET /queries` exposes a typed catalog, and `POST /queries/{name}` invokes a stored query without accepting ad hoc `.gq` source from the client. +- **Stored-query policy gate.** New Cedar action `invoke_query` gates the stored-query invocation surface. Stored mutations are double-gated: `invoke_query` to reach the stored query and `change` for the actual write. +- **Safer branch deletion.** `branch_delete` now treats the manifest as the authority, flips branch visibility atomically, and reclaims per-table/commit-graph forks as derived state. If best-effort reclaim is interrupted, `cleanup` reconciles orphaned forks; reusing a branch name before cleanup reports an actionable error. +- **Legacy `__run__` cleanup (MR-770).** Removed the last functional remnant of the Run state machine (retired in v0.4.0): the `__run__` branch-name guard. A new v2β†’v3 `__manifest` internal-schema migration sweeps any stale `__run__*` staging branches on the first read-write open, so `__run__*` is no longer a reserved branch name. This closes the "unpromoted `__run__` branches block reads" condition behind the zombie-run cascade incident; the inert `_graph_runs.lance` row cleanup is tracked separately (it needs a `delete_prefix` primitive). +- **Blob-safe optimize.** `omnigraph optimize` skips tables with `Blob` properties instead of failing the whole sweep on Lance's blob-v2 compaction decode bug. Skips are visible in human output, `--json` as `skipped`, `TableOptimizeStats.skipped`, and logs; non-blob tables still compact normally. +- **Deployment improvements.** The container entrypoint now composes `OMNIGRAPH_TARGET_URI` with `OMNIGRAPH_CONFIG`, so operators can keep the graph URI in env while loading policy/query config from a mounted file. 
The local RustFS bootstrap pins RustFS beta.3 and allows the current insecure local-dev default credentials. +- **Windows release support.** Tagged and edge releases now publish Windows x86_64 archives containing `omnigraph.exe` and `omnigraph-server.exe`, with a PowerShell installer and Windows install docs. +- **Release tooling.** Homebrew formula generation was tightened to produce audit-clean formulas. + +## Compatibility Notes + +- A graph selected by name (`--target` or `server.graph`) now uses `graphs.<name>.policy` and `graphs.<name>.queries`. Top-level `policy` / `queries` blocks are only for anonymous bare-URI single-graph mode; using them with a named graph now fails loudly with migration guidance. +- `mcp.expose` defaults to `true` for stored-query registry entries. Set `mcp: { expose: false }` for service-only queries that should not appear in the catalog. +- `invoke_query` is graph-scoped, not branch-scoped. Branch/snapshot access remains enforced by the inner `read` / `change` gate. +- **Legacy `__run__` migration.** Graphs created before v0.4.0 are migrated automatically on the first **read-write** open by a v0.6.1 binary (one-time `__manifest` stamp v2→v3 sweep of stale `__run__*` branches). No action required. Two caveats: (1) a graph opened **read-only** still lists any stale `__run__*` branch until its first read-write open, since the migration is write-path-only like all manifest migrations — long-lived read-only deployments should be opened read-write once after upgrading; (2) the inert `_graph_runs.lance` / `_graph_run_actors.lance` dataset bytes are left in place until a future `delete_prefix` primitive (they are invisible to graph-level state). +- Blob tables are not compacted until the upstream Lance fix lands, so fragment count and deleted-row space on blob tables are not reclaimed by `optimize`. Reads, writes, and query results are unaffected; no on-disk migration is required. 
+- `TableOptimizeStats` is now `#[non_exhaustive]` and gains a `skipped: Option<SkipReason>` field (so does the new `SkipReason` enum). This is a source-level change only for downstream code that built this returned result struct by literal — rare, since it is produced by `optimize` and consumed by reading its fields; field access is unaffected, and `#[non_exhaustive]` keeps future additions non-breaking. + +## Docs And Cleanup + +- Public docs were updated for stored queries, policy, server routes, deployment, Windows installation, branch deletion, maintenance, and the `runs` docs rename to `writes`. +- README copy and release documentation were refreshed; older release notes had small typo/wording fixes. diff --git a/docs/rfcs/0000-template.md b/docs/rfcs/0000-template.md new file mode 100644 index 0000000..48f4bda --- /dev/null +++ b/docs/rfcs/0000-template.md @@ -0,0 +1,54 @@ +# RFC NNNN: <title> + +| | | +|---|---| +| **Status** | Proposed | +| **Author(s)** | <your name / handle> | +| **Discussion** | <link to the originating Discussion, if any> | +| **Implementation** | <issue/PR links, filled in as work lands> | + +> Status is maintained by maintainers: `Proposed` while the PR is open, +> `Accepted` on merge, `Declined` on close, `Superseded by NNNN` later. + +## Summary + +One paragraph: what this changes, in plain terms. + +## Motivation + +What problem does this solve, and why is it worth the ongoing cost? Tie it to a +concrete need (a Discussion, a recurring issue, a user request). Per the +project's first principle, argue the *long-run liability*, not just the +short-term convenience. + +## Guide-level explanation + +Explain the change as you'd teach it to a user or contributor: new commands, +syntax, API shapes, behavior. Examples first. + +## Reference-level design + +The precise design: data structures, IR/AST/planner changes, storage/format +impact, migration path, error behavior. Enough that a reviewer can find the +holes. 
+ +## Invariants & deny-list check + +Which Hard Invariants in [../dev/invariants.md](../dev/invariants.md) does this +touch? Does it brush against any deny-list item — and if so, why is this the +justified exception? State explicitly that no invariant is weakened, or which +Known Gap moves. + +## Drawbacks & alternatives + +What does this cost, what did you reject, and why? "Do nothing" is a valid +alternative to weigh. + +## Reversibility + +Is this reversible? On-disk/wire/format and substrate choices are near-permanent +and demand more evidence; a CLI flag or doc is cheap to undo. Say which this is. + +## Unresolved questions + +What's deliberately left open for review to settle. diff --git a/docs/rfcs/README.md b/docs/rfcs/README.md new file mode 100644 index 0000000..99cdd76 --- /dev/null +++ b/docs/rfcs/README.md @@ -0,0 +1,66 @@ +# RFCs + +Substantial changes to OmniGraph — new user-facing surface, format or protocol +changes, anything irreversible or cross-cutting — go through a lightweight RFC +so the design is agreed *as reviewable code* before implementation starts. This +is the public RFC track, open to **anyone, including external contributors**. + +This complements the always-on review bar in +[../dev/invariants.md](../dev/invariants.md): the invariants say *what every +change must respect*; an RFC says *why this particular change is worth making and +how*. + +> **Two tracks, don't conflate them.** This `docs/rfcs/` directory is the +> **public contribution** track (anyone authors; maintainers accept). The +> maintainer-internal RFCs under `docs/dev/rfc-00N-*.md` are a separate, +> team-owned track for in-flight internal work. If you're an outside +> contributor, you're in the right place here. 
+ +## When you need one + +- **RFC required:** new query/schema/CLI/HTTP surface; on-disk or wire-format + changes; a new substrate dependency; anything the deny-list in + [../dev/invariants.md](../dev/invariants.md) flags; anything irreversible + ("reversibility shapes evidence demand"). +- **RFC not required:** bug fixes for an `accepted` issue, and the trivial + fast-lane (typos, docs, deps) — see [../../CONTRIBUTING.md](../../CONTRIBUTING.md). + +If you're unsure, start a [Discussion](../../../discussions); a maintainer will +tell you whether it needs an RFC. + +## Lifecycle + +``` +Discussion (incubate, get rough consensus) + │ graduate + ▼ +RFC pull request → adds docs/rfcs/NNNN-title.md (Status: Proposed) + │ +maintainer review ──▶ changes requested / declined (PR closed, with rationale) + │ + ▼ +merged == Accepted (the merged file is the durable decision record) + │ + ▼ +Implementation PR(s) reference the accepted RFC +``` + +- **Author:** anyone. **Acceptance:** a maintainer decision, performed by + merging the RFC PR. Declining is closing it with rationale. +- The merged RFC *is* the accepted record — there is no separate sign-off step. +- Later reversals don't edit history: supersede with a new RFC that links back + and flip the old one's `Status` to `Superseded by NNNN`. + +## Numbering & naming + +- File: `docs/rfcs/NNNN-kebab-title.md`, where `NNNN` is the next free + zero-padded integer (`0001`, `0002`, …). `0000-template.md` is reserved. +- Pick the number when you open the PR; if it collides with another in-flight + RFC, the second to merge bumps theirs. + +## Status values + +`Proposed` (open PR) · `Accepted` (merged) · `Declined` (closed) · +`Superseded by NNNN` · `Implemented` (set once the work lands, optional). + +Copy [0000-template.md](0000-template.md) to start. 
diff --git a/docs/user/audit.md b/docs/user/audit.md index e8abe5b..ab028ac 100644 --- a/docs/user/audit.md +++ b/docs/user/audit.md @@ -4,4 +4,4 @@ - `_as` variants of every write API let callers override the actor: `mutate_as`, `ingest_as`, `branch_merge_as`, `apply_schema_as`, etc. - Actor IDs are persisted on `GraphCommit.actor_id` with split storage in `_graph_commit_actors.lance` (the commit graph is split into `_graph_commits.lance` for the linkage and `_graph_commit_actors.lance` for the actor map). - HTTP server uses the bearer-token actor automatically; CLI uses the local user / explicit env (no implicit actor). -- Pre-v0.4.0 graphs also stored actor IDs on `RunRecord.actor_id` in `_graph_runs.lance` / `_graph_run_actors.lance`. The Run state machine was removed in MR-771; those files are inert post-v0.4.0 and reclaimed by MR-770's production sweep. +- Pre-v0.4.0 graphs also stored actor IDs on `RunRecord.actor_id` in `_graph_runs.lance` / `_graph_run_actors.lance`. The Run state machine was removed in MR-771; those files are inert post-v0.4.0. The v2β†’v3 manifest migration sweeps any stale `__run__*` branches on first write-open (MR-770); the inert dataset bytes remain until a `delete_prefix` primitive lands. diff --git a/docs/user/branches-commits.md b/docs/user/branches-commits.md index de6c653..0565186 100644 --- a/docs/user/branches-commits.md +++ b/docs/user/branches-commits.md @@ -8,10 +8,10 @@ Lance supports branching at the dataset level: a branch is a named lineage of ve OmniGraph builds *graph branches* on top by branching every sub-table coherently: -- `branch_create(name)` / `branch_create_from(target, name)` β€” disallowed name `main`; fails if branch exists; ensures the schema-apply lock is idle. -- `branch_list()` β€” returns public branches, **filters internal** `__run__…` and `__schema_apply_lock__` prefixes. -- `branch_delete(name)` β€” refuses if there are descendants or active runs on the branch; cleans up owned per-branch fragments. 
-- **Lazy forking**: a branch only forks a sub-table when that sub-table is first mutated on it. Pure-read branches share fragments with their source. +- `branch_create(name)` / `branch_create_from(target, name)` β€” disallowed name `main`; fails if branch exists; ensures the schema-apply lock is idle. Atomic and authority-first like `branch_delete`: it flips the `__manifest` branch (authority), then creates the derived commit-graph branch, force-dropping any orphaned commit-graph ref left by an incomplete prior delete (the manifest branch is fresh, so a same-named commit-graph branch is provably a zombie). If commit-graph creation fails, the manifest branch is rolled back so the name never half-exists. +- `branch_list()` β€” returns public branches, **filters the internal** `__schema_apply_lock__` branch. +- `branch_delete(name)` β€” refuses if there are descendants on the branch, or if it is the current branch. The manifest is the single authority for branch existence: deletion flips the `__manifest` branch ref first (one atomic op), after which the branch is gone from every snapshot. The owned per-table forks and the commit-graph branch are derived state, reclaimed best-effort with `force_delete_branch` after the flip. A failure during that reclaim (transient object-store error) does not fail the call or block the authority flip; the leftover forks are unreachable orphans that the [`cleanup`](maintenance.md) reconciler converges. One consequence: if a delete's best-effort reclaim fails, reusing that branch name before the next `cleanup` surfaces a clear error pointing at `cleanup` (the stale fork would otherwise collide on first write). +- **Lazy forking**: a branch only forks a sub-table when that sub-table is first mutated on it. Pure-read branches share fragments with their source. 
A fork collision is classified by the manifest authority, not by Lance branch versions: if the live manifest already records the fork on the active branch, a concurrent first-write won and the caller gets a retryable "refresh and retry"; if the manifest does not, a physical branch there is an orphan and the caller is pointed at `cleanup`. - `sync_branch(branch)` — re-binds the in-memory handle to the latest head of the branch. ## L2 — Commit graph (`db/commit_graph.rs`) @@ -51,10 +51,10 @@ Notes: ## L2 — Internal system branches -Filtered from `branch_list()` but visible to internals: +Internal or legacy branch refs: -- `__schema_apply_lock__` — serializes schema migrations. -- `__run__<run-id>` — legacy from the pre-v0.4.0 Run state machine (removed in MR-771). The branch-name guard predicate `is_internal_run_branch` is kept as defense-in-depth so users cannot create a branch matching the legacy prefix; the filter will be removed once production legacy branches are swept (MR-770). +- `__schema_apply_lock__` — serializes schema migrations; filtered from `branch_list()` but visible to internals. +- `__run__<run-id>` — legacy from the pre-v0.4.0 Run state machine (removed in MR-771). These are swept off `__manifest` on the first read-write open by the v2→v3 internal-schema migration (MR-770), and `__run__*` is no longer a reserved name. Known limitation: a pre-v0.4.0 graph opened **read-only** still surfaces any stale `__run__*` branch in `branch_list()` until its first read-write open (the migration is write-path-only, like all manifest migrations). 
## L2 β€” Recovery audit trail diff --git a/docs/user/cli-reference.md b/docs/user/cli-reference.md index 0326e64..8263919 100644 --- a/docs/user/cli-reference.md +++ b/docs/user/cli-reference.md @@ -20,10 +20,11 @@ A reference for the `omnigraph` binary's command surface and `omnigraph.yaml` sc | `run list \| show \| publish \| abort` | transactional run ops | | `schema plan \| apply \| show (alias: get)` | migrations | | `lint` (alias: `check`) | offline / graph-backed query validation. Replaces `query lint` / `query check`, which are kept as deprecated argv-level shims that print a one-line warning and rewrite to `omnigraph lint` | -| `optimize` | non-destructive Lance compaction | +| `queries validate \| list` | operate on the server-side stored-query registry (the `queries:` block). `validate` type-checks every stored query against the live schema offline (opens the selected graph; exits non-zero on any breakage), catching schema drift without restarting the server; `list` prints the selected registry's query names, MCP exposure, and typed params. For per-graph registries, pass `--target <graph>` or set `cli.graph`; with no graph selection, `list` shows only top-level `queries:`. Distinct from `lint`, which validates a single `.gq` file | +| `optimize` | non-destructive Lance compaction (skips tables with `Blob` columns; `--json` reports a `skipped` field) | | `cleanup --keep N --older-than 7d --confirm` | destructive version GC | | `embed` | offline JSONL embedding pipeline | -| `policy validate \| test \| explain` | Cedar tooling | +| `policy validate \| test \| explain` | Cedar tooling. 
Selects `cli.graph`, else `server.graph`, else top-level `policy.file` | | `version` / `-v` | print `omnigraph 0.3.x` | ## `omnigraph.yaml` schema @@ -34,6 +35,13 @@ graphs: <name>: uri: <local|s3://|http(s)://> bearer_token_env: <ENV_NAME> + queries: # per-graph stored-query registry (server-role; multi-graph mode) + <query-name>: # key MUST equal the `query <name>` symbol inside the .gq + file: <path-to-.gq> # relative to this config's directory + mcp: + expose: true # default true: listed in the MCP catalog (GET /queries); set false to hide (still HTTP-callable) + tool_name: <name> # optional MCP tool-name override (defaults to <query-name>; + # must be unique across exposed queries) server: graph: <name> bind: <ip:port> @@ -59,6 +67,8 @@ aliases: graph: <name> branch: <name> format: <output-format> +queries: # top-level registry β€” applies only to a bare-URI (anonymous) graph; a graph served by name uses its `graphs.<id>.queries`. Mirrors top-level `policy`. + <query-name>: { file: <path-to-.gq> } # mcp.expose defaults to true policy: file: ./policy.yaml ``` diff --git a/docs/user/constants.md b/docs/user/constants.md index 527aaea..210155e 100644 --- a/docs/user/constants.md +++ b/docs/user/constants.md @@ -4,13 +4,14 @@ |---|---|---| | `MANIFEST_DIR` | `__manifest` | `db/manifest/layout.rs` | | Commit graph dir | `_graph_commits.lance` | `db/commit_graph.rs` | -| Run registry dir (legacy, removed MR-771) | `_graph_runs.lance` | inert post-v0.4.0; reclaimed by MR-770 | -| Run branch prefix (legacy, removed MR-771) | `__run__` | filtered by `is_internal_run_branch` defense-in-depth | +| Run registry dir (legacy, removed MR-771) | `_graph_runs.lance` | inert post-v0.4.0; bytes remain until a `delete_prefix` primitive lands | +| Run branch prefix (legacy, removed MR-771/MR-770) | `__run__` | swept off `__manifest` by the v2β†’v3 migration; no longer a reserved name | | Schema apply lock | `__schema_apply_lock__` | `db/mod.rs` | | Manifest publisher retry budget 
| `PUBLISHER_RETRY_BUDGET = 5` | `db/manifest/publisher.rs` | -| Internal manifest schema version | `INTERNAL_MANIFEST_SCHEMA_VERSION = 2` | `db/manifest/migrations.rs` | +| Internal manifest schema version | `INTERNAL_MANIFEST_SCHEMA_VERSION = 3` | `db/manifest/migrations.rs` | | Merge stage batch | `MERGE_STAGE_BATCH_ROWS = 8192` | `exec/merge.rs` | | Maintenance concurrency | `OMNIGRAPH_MAINTENANCE_CONCURRENCY=8` | `db/omnigraph/optimize.rs` | +| Lance blob compaction support | `LANCE_SUPPORTS_BLOB_COMPACTION = false` | `db/omnigraph/optimize.rs` | | Graph index cache size | `8` (LRU) | `runtime_cache.rs` | | Default body limit | `1 MB` | `omnigraph-server/lib.rs` | | Ingest body limit | `32 MB` | `omnigraph-server/lib.rs` | diff --git a/docs/user/maintenance.md b/docs/user/maintenance.md index 08ae8da..3628fa0 100644 --- a/docs/user/maintenance.md +++ b/docs/user/maintenance.md @@ -7,16 +7,23 @@ - Lance `compact_files()` on every node + edge table on `main`. - Rewrites small fragments into fewer large ones; old fragments remain reachable via older manifests. - Bounded by `OMNIGRAPH_MAINTENANCE_CONCURRENCY` (default 8). -- Returns `[TableOptimizeStats { table_key, fragments_removed, fragments_added, committed }]`. +- Returns `[TableOptimizeStats { table_key, fragments_removed, fragments_added, committed, skipped }]`. +- **Blob tables are skipped.** A table that declares any `Blob` property is not compacted: it is reported with `skipped: Some(BlobColumnsUnsupportedByLance)` (and logged via `tracing::warn`) instead of compacted, and the rest of the sweep proceeds normally. The current Lance `compact_files` mis-decodes blob-v2 columns under its forced `BlobHandling::AllBinary` read; **reads and writes are unaffected** β€” only compaction is. This is gated by `LANCE_SUPPORTS_BLOB_COMPACTION` (`db/omnigraph/optimize.rs`) and removed when the upstream Lance fix lands (see [docs/dev/lance.md](../dev/lance.md)). 
Consequence: fragment count and deleted-row space on blob tables are not reclaimed until then; query results are never affected. ## `cleanup_all_tables(db, options)` — destructive - Lance `cleanup_old_versions()` per table. - Removes manifests (and their unique fragments) older than the retention policy. - `CleanupPolicyOptions { keep_versions: Option<u32>, older_than: Option<Duration> }` — at least one is required. -- Returns `[TableCleanupStats { table_key, bytes_removed, old_versions_removed }]`. +- Returns `[TableCleanupStats { table_key, bytes_removed, old_versions_removed, error }]`. +- **Fault-isolated per table.** A single table's transient failure (version GC or + orphan reclaim) is recorded on that table's stats row (`error: Some(..)`, logged + via `tracing`) and never aborts the healthy tables — cleanup is the convergence + backstop, so it does as much as it can and converges on re-run. The CLI reports + any failed tables; rerun `cleanup` to retry them. - CLI guards with `--confirm`; without it, prints a preview line. - **Recovery floor:** `--keep < 3` may garbage-collect Lance versions that the open-time recovery sweep needs as a rollback target (the sweep restores to the branch's manifest-pinned table version, which is HEAD-1 in the typical Phase B → Phase C drift case). Default `--keep 10` is safe. +- **Orphaned-branch reconciliation:** before the version GC, cleanup runs `reconcile_orphaned_branches`, which `force_delete_branch`es any per-table or commit-graph Lance branch absent from the manifest branch list. These orphans arise when a `branch_delete` flips the manifest authority but a downstream best-effort reclaim does not complete (see [branches-commits.md](branches-commits.md)). The reconciler is authority-derived and idempotent (it no-ops once nothing is orphaned), runs regardless of the `keep_versions` / `older_than` values (those gate version GC only), and never reclaims `main` or system-branch forks. 
Reclaimed forks are logged via `tracing::info`. ## Tombstones diff --git a/docs/user/policy.md b/docs/user/policy.md index 749d3be..ec0d214 100644 --- a/docs/user/policy.md +++ b/docs/user/policy.md @@ -14,10 +14,11 @@ Per-graph actions (bind to `Omnigraph::Graph::"<graph_id>"`): 6. `branch_delete` 7. `branch_merge` 8. `admin` — reserved for policy-management surfaces (hot reload, audit log, approvals). No call site today; see MR-724 for the reservation rationale. +9. `invoke_query` — gates invoking a server-side stored query (the `queries:` registry). Graph-scoped (like `admin`) — per-branch access is enforced by the inner `read` / `change` gate, so a rule that sets `branch_scope` on `invoke_query` is rejected. Coarse in this release: an `invoke_query` allow rule permits any stored query on the graph; a future, additive refinement adds an optional per-query-name scope without changing rules written against the coarse action. Enforced at `POST /queries/{name}` (see [server](server.md)). A stored *mutation* is double-gated: `invoke_query` to reach the tool, plus `change` for the write itself (the engine `_as` writers still enforce per the query body). Server-scoped action (v0.6.0+; binds to `Omnigraph::Server::"root"`): -9. `graph_list` — `GET /graphs` registry enumeration (multi-graph mode) +10. `graph_list` — `GET /graphs` registry enumeration (multi-graph mode) Server-scoped actions cannot use `branch_scope` or `target_branch_scope` — they operate on the registry, not on a graph's branches. A rule cannot mix server-scoped and per-graph actions; split into separate rules. (Runtime `graph_create` / `graph_delete` are reserved but not shipped in v0.6.0; operators add/remove graphs by editing `omnigraph.yaml` and restarting.) @@ -46,10 +47,15 @@ graphs: # no per-graph policy → no engine-layer Cedar enforcement on beta ``` -Top-level `policy.file` is single-graph / CLI-local policy only. 
Multi-graph -server startup rejects it because applying one graph policy to every configured -graph is ambiguous. Move per-graph rules to `graphs.<graph_id>.policy.file` and -move `graph_list` rules to `server.policy.file`. +**Config follows graph identity, not server mode.** A graph served by **name** +(`--target <name>` or `server.graph`) uses its own `graphs.<name>.policy.file`, +exactly as in multi-graph mode. Top-level `policy.file` applies only to an +**anonymous** graph β€” one served by a bare `<URI>` with no `graphs:` entry. +Serving a **named** graph (single- or multi-graph mode) while top-level +`policy.file` (or `queries:`) is populated **refuses boot**, naming the block, +since the top-level value would otherwise be silently shadowed by the per-graph +block. Move per-graph rules to `graphs.<graph_id>.policy.file` and `graph_list` +rules to `server.policy.file`. Each graph's HTTP request flows through its own per-graph policy. The management endpoint (`GET /graphs`) flows through the server-level policy. When `server.policy.file` is unset, `GET /graphs` is denied in every runtime state, including `--unauthenticated`; with bearer tokens configured, it returns 403 after admission control because `graph_list` is not a `read`-equivalent action. The operator must explicitly authorize via `server-policy.yaml` to expose `/graphs`. @@ -92,6 +98,10 @@ bearer token. ## CLI +Policy tooling resolves its graph like server single-mode policy: `cli.graph` +wins, otherwise `server.graph` is used, otherwise the top-level `policy.file` +is validated/tested/explained as the anonymous policy. + - `omnigraph policy validate` β€” parse + count actors, exit 1 on parse error. - `omnigraph policy test` β€” run cases in `policy.tests.yaml`, exit 1 on any expectation mismatch. - `omnigraph policy explain --actor … --action … [--branch …] [--target-branch …]` β€” show decision and matched rule. 
diff --git a/docs/user/server.md b/docs/user/server.md index 6f55e16..67b5afe 100644 --- a/docs/user/server.md +++ b/docs/user/server.md @@ -6,7 +6,9 @@ Axum 0.8 + tokio + utoipa-generated OpenAPI. **Two modes** (v0.6.0+): single-gra ### Single-graph mode (legacy) -`omnigraph-server <URI>` or `omnigraph-server --target <name> --config omnigraph.yaml`. Routes are flat β€” `/snapshot`, `/read`, `/branches`, etc. Behavior unchanged from v0.6.0. +`omnigraph-server <URI>` or `omnigraph-server --target <name> --config omnigraph.yaml`. Routes are flat β€” `/snapshot`, `/read`, `/branches`, etc. + +**Config follows graph identity.** A bare `<URI>` is an *anonymous* graph and uses the **top-level** `policy.file` / `queries:`. A graph chosen by **name** (`--target` / `server.graph`) uses its own `graphs.<name>.{policy.file, queries}` β€” the same block multi-graph mode uses. ⚠️ *Changed from v0.6.0, which always used top-level config in single mode: a named-graph config that puts `policy`/`queries` at top-level now **refuses boot** and points you at `graphs.<name>.…` (move the block there). Bare-`<URI>` single mode is unchanged.* ### Multi-graph mode (v0.6.0+) @@ -20,6 +22,10 @@ Mode inference (four-rule matrix): 4. `--config` + non-empty `graphs:` + no single-mode selector β†’ **multi** 5. otherwise β†’ error with migration hint +### Stored-query validation at startup + +If a graph declares a `queries:` registry (see [cli-reference](cli-reference.md)), the server **loads and type-checks every stored query against that graph's live schema at startup** and **refuses to boot** if any query references a type or property the schema lacks β€” the same fail-loud posture as a malformed policy file, so schema drift surfaces at the deploy boundary rather than at invocation. Two MCP-exposed queries claiming the same tool name is likewise a boot error. Non-blocking advisories (e.g. an MCP-exposed query with a vector parameter an agent cannot supply) are logged. 
Validate offline before deploying with `omnigraph queries validate`. Discover the exposed queries as a typed tool catalog with `GET /queries`, and invoke one over HTTP with `POST /queries/{name}` (both below). + ## Endpoint inventory Per-graph endpoints β€” same body shape across modes; URLs differ: @@ -34,6 +40,8 @@ Per-graph endpoints β€” same body shape across modes; URLs differ: | POST | `/export` | `/graphs/{id}/export` | bearer + `export` | NDJSON stream | `server_export` | | POST | `/mutate` | `/graphs/{id}/mutate` | bearer + `change` | mutation (canonical; `query`/`name`; accepts legacy `query_source`/`query_name` as serde aliases) | `server_mutate` | | POST | `/change` | `/graphs/{id}/change` | bearer + `change` | **deprecated** alias of `/mutate` (carries `Deprecation: true` + `Link: </mutate>; rel="successor-version"`) | `server_change` | +| GET | `/queries` | `/graphs/{id}/queries` | bearer + `read` | list the `mcp.expose` stored queries as a typed tool catalog | `server_list_queries` | +| POST | `/queries/{name}` | `/graphs/{id}/queries/{name}` | bearer + `invoke_query` (+ `change` for a stored mutation) | invoke a named query from the `queries:` registry; deny == 404 | `server_invoke_query` | | GET | `/schema` | `/graphs/{id}/schema` | bearer + `read` | get current `.pg` source | `server_schema_get` | | POST | `/schema/apply` | `/graphs/{id}/schema/apply` | bearer + `schema_apply` (target=`main`) | migrate | `server_schema_apply` | | POST | `/ingest` | `/graphs/{id}/ingest` | bearer + `branch_create` (if new) + `change` | bulk load | `server_ingest` (32 MB body limit) | @@ -50,6 +58,23 @@ Server-level management endpoints (v0.6.0+): |---|---|---|---|---| | GET | `/graphs` | bearer + `graph_list` on `Server::"root"` | list registered graphs | `server_graphs_list` (405 in single mode) | +### Stored-query catalog (`GET /queries`) + +List the graph's **`mcp.expose`** stored queries as a typed tool catalog β€” enough for a client (e.g. 
an MCP server) to register each as a tool without fetching `.gq` source. Each entry: `{ name, tool_name, description, instruction, mutation, params }`, where each param is `{ name, kind, item_kind?, vector_dim?, nullable }`. `kind` is one of `string | bool | int | bigint | float | date | datetime | blob | vector | list` (decomposed so a consumer maps it with a closed `switch`, never re-parsing GQ type spelling). `bigint` (I64/U64), `date`, `datetime`, and `blob` are carried as JSON **strings** β€” a 64-bit integer loses precision as a JSON number, dates are ISO strings, and a blob is a URI string. + +- **Read-gated** (works in default-deny mode). The catalog is **graph-wide** (branch-independent; `read` is authorized against `main`). +- **`mcp.expose` defaults to `true`** β€” declaring a query in `queries:` lists it; set `mcp: { expose: false }` to keep it HTTP/service-callable but hidden from the catalog. +- **Not Cedar-filtered per query (yet).** A caller with `read` but not `invoke_query` can *list* a query they can't *invoke* (which would 404). Closing that gap is future per-query authorization; for now the catalog is a discovery surface and `invoke_query` remains the invocation gate. + +### Stored-query invocation (`POST /queries/{name}`) + +Invoke a curated, server-side stored query by **name** β€” the source comes from the graph's `queries:` registry, so the client never sends `.gq`. The request body itself is optional; omit it for no-param queries, or send `{ "params": { … }, "branch": "main", "snapshot": null }`, where every field is optional and `params` keys match the query's declared parameters. The response is the **read envelope** (`ReadOutput`) for a stored read or the **mutation envelope** (`ChangeOutput`) for a stored mutation β€” serialized untagged, so the wire shape is identical to `/query` / `/mutate`. + +- **Gate:** `invoke_query` (per-graph, graph-scoped) at the boundary. 
A stored *mutation* is **double-gated** — it also passes the engine's `change` gate, so an actor with `invoke_query` but not `change` gets `403`. +- **Deny == unknown, for callers without `invoke_query`:** for a caller lacking the grant, an `invoke_query` denial and an unknown query name return the **same `404`** (identical body), so the catalog can't be probed. A caller that *holds* `invoke_query` may still get the inner gate's `403` for an existing query it can't `read`/`change` (the double-gate, above) — so existence is visible to grant-holders by design. +- **Requires an explicit policy grant when auth is on.** In default-deny mode (bearer tokens but no `policy.file`), only `read` is permitted, so *every* `/queries/{name}` call returns `404` until an `invoke_query` rule is configured. +- A stored mutation cannot target a `snapshot` (`400`); a parameter type error is a structured `400` naming the parameter. + ## Adding and removing graphs (multi mode) Runtime add/remove via API is **not** exposed in v0.6.0 — neither diff --git a/docs/user/storage.md b/docs/user/storage.md index c22d4d6..d1c52b5 100644 --- a/docs/user/storage.md +++ b/docs/user/storage.md @@ -22,7 +22,7 @@ OmniGraph is **not** a single Lance dataset; it is a *graph* of datasets coordin - `edges/{fnv1a64-hex(edge_type_name)}` — one Lance dataset per edge type - `__manifest/` — the catalog of all sub-tables and their published versions - `_graph_commits.lance` / `_graph_commit_actors.lance` — the commit graph and its actor map - - (legacy `_graph_runs.lance` / `_graph_run_actors.lance` from pre-v0.4.0 graphs are inert; the run state machine was removed in MR-771 and these files are cleaned up via MR-770's production sweep) + - (legacy `_graph_runs.lance` / `_graph_run_actors.lance` from pre-v0.4.0 graphs are inert; the run state machine was removed in MR-771. 
The v2β†’v3 manifest migration sweeps stale `__run__*` branches on first write-open; the inert dataset bytes themselves remain until a `delete_prefix` storage primitive lands) - **Manifest row schema** (`object_id, object_type, location, metadata, base_objects, table_key, table_version, table_branch, row_count`): - `object_type` ∈ `table | table_version | table_tombstone` - `table_key` ∈ `node:<TypeName> | edge:<EdgeName>` @@ -47,6 +47,7 @@ Adding a new on-disk shape change is one constant bump (`INTERNAL_MANIFEST_SCHEM |---|---| | v1 (implicit, pre-stamp) | `__manifest.object_id` had no PK annotation; publisher had no row-level CAS protection. | | v2 | `__manifest.object_id` carries `lance-schema:unenforced-primary-key=true`; row-level CAS engaged. Stamped as `omnigraph:internal_schema_version=2`. | +| v3 | One-time sweep of legacy `__run__*` staging branches (pre-v0.4.0 Run state machine, removed MR-771) off `__manifest`. Runs at `Omnigraph::open(ReadWrite)` and on publish. Stamped as `omnigraph:internal_schema_version=3`. | ## On-disk layout @@ -91,7 +92,7 @@ flowchart TB - **Graph root** is one directory (or S3 prefix). Everything below is part of one OmniGraph graph. - **`__manifest/`** is a Lance dataset whose rows describe which sub-table version is published at which graph-branch. Reading a snapshot starts here. - **`nodes/`** and **`edges/`** are sibling directories holding one Lance dataset per declared type. Names are `fnv1a64-hex` of the type name to keep paths fixed-length and case-safe. -- **`_graph_commits.lance`** is an L2 dataset that records the graph-level commit DAG, with a paired `_graph_commit_actors.lance` for the actor map. (Pre-v0.4.0 graphs also have inert `_graph_runs.lance` / `_graph_run_actors.lance` from the removed Run state machine; MR-770 sweeps these in production.) +- **`_graph_commits.lance`** is an L2 dataset that records the graph-level commit DAG, with a paired `_graph_commit_actors.lance` for the actor map. 
(Pre-v0.4.0 graphs also have inert `_graph_runs.lance` / `_graph_run_actors.lance` from the removed Run state machine; the v2β†’v3 migration sweeps their stale `__run__*` branches, and the dataset bytes are reclaimed once `delete_prefix` lands.) - **`_graph_commit_recoveries.lance`** β€” one row per recovery sweep action. Joined to `_graph_commits.lance` by `graph_commit_id`; the linked commit row carries `actor_id=omnigraph:recovery`. Operators correlate recoveries with the original mutations they rolled forward / back via this join. See `crates/omnigraph/src/db/recovery_audit.rs`. - **`__recovery/{ulid}.json`** β€” transient sidecar files written by the four migrated writers (`MutationStaging::finalize`, `schema_apply`, `branch_merge`, `ensure_indices`) before Phase B begins, deleted after Phase C succeeds. A sidecar persisting after process exit means the writer crashed in the Phase B β†’ Phase C window; the next `Omnigraph::open` recovery sweep processes it. Steady-state directory is empty. See `crates/omnigraph/src/db/manifest/recovery.rs`. - **`_refs/branches/{name}.json`** is graph-level branch metadata β€” pointers from a branch name to the manifest version it heads. diff --git a/openapi.json b/openapi.json index d1fa337..aced64d 100644 --- a/openapi.json +++ b/openapi.json @@ -7,7 +7,7 @@ "name": "MIT", "identifier": "MIT" }, - "version": "0.6.0" + "version": "0.6.1" }, "paths": { "/branches": { @@ -829,6 +829,177 @@ ] } }, + "/queries": { + "get": { + "tags": [ + "queries" + ], + "summary": "List the graph's exposed stored queries as a typed tool catalog.", + "description": "Returns the `mcp.expose == true` subset of the `queries:` registry, each\nwith its MCP tool name, read/mutate flag, description/instruction, and\ntyped parameters β€” enough for a client to register them as tools without\nfetching `.gq` source. Read-gated; the catalog is graph-wide (branch\nindependent β€” `read` is authorized against `main`). 
**Not** Cedar-filtered\nper query yet, so it can list a query whose `invoke_query` the caller\nlacks (a known gap until per-query authorization lands).", + "operationId": "list_queries", + "responses": { + "200": { + "description": "Stored-query catalog (the mcp.expose subset, with typed params)", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/QueriesCatalogOutput" + } + } + } + }, + "401": { + "description": "Unauthorized", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorOutput" + } + } + } + }, + "403": { + "description": "Forbidden", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorOutput" + } + } + } + } + }, + "security": [ + { + "bearer_token": [] + } + ] + } + }, + "/queries/{name}": { + "post": { + "tags": [ + "queries" + ], + "summary": "Invoke a curated, server-side stored query by name.", + "description": "The query source comes from the graph's `queries:` registry, not the\nrequest body β€” callers send only runtime inputs (`params`, `branch`,\n`snapshot`). Gated by the `invoke_query` Cedar action at the boundary;\na stored *mutation* additionally passes the engine's `change` gate\n(double-gated). An actor **without** `invoke_query` cannot tell a denied\nquery from a missing one β€” both return the same 404, so the catalog\ncan't be probed without the grant. 
Once `invoke_query` is held, the\ninner `read`/`change` gate may surface a 403 for an existing query the\nactor can't run (the intended double-gate signal).", + "operationId": "invoke_query", + "parameters": [ + { + "name": "name", + "in": "path", + "description": "Stored query name (the registry key)", + "required": true, + "schema": { + "type": "string" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "oneOf": [ + { + "type": "null" + }, + { + "$ref": "#/components/schemas/InvokeStoredQueryRequest" + } + ] + } + } + } + }, + "responses": { + "200": { + "description": "Read envelope (ReadOutput) or mutation envelope (ChangeOutput), serialized untagged", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/InvokeStoredQueryResponse" + } + } + } + }, + "400": { + "description": "Bad request (param type error; snapshot on a stored mutation)", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorOutput" + } + } + } + }, + "401": { + "description": "Unauthorized", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorOutput" + } + } + } + }, + "403": { + "description": "Forbidden (the inner `change` gate for a stored mutation)", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorOutput" + } + } + } + }, + "404": { + "description": "Unknown stored query, or `invoke_query` denied β€” indistinguishable to a caller without the grant", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorOutput" + } + } + } + }, + "409": { + "description": "Merge conflict", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorOutput" + } + } + } + }, + "429": { + "description": "Per-actor admission cap exceeded; honor `Retry-After` header", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorOutput" + 
} + } + } + }, + "500": { + "description": "Policy evaluation error (a denial is reported as 404, not 500)", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorOutput" + } + } + } + } + }, + "security": [ + { + "bearer_token": [] + } + ] + } + }, "/query": { "post": { "tags": [ @@ -1628,6 +1799,40 @@ } } }, + "InvokeStoredQueryRequest": { + "type": "object", + "description": "Body for `POST /queries/{name}` β€” invokes the server-side stored query\nnamed in the path. The query source and name come from the registry,\nnever the body; only the runtime inputs are supplied here.", + "properties": { + "branch": { + "type": [ + "string", + "null" + ], + "description": "Branch to run against. Defaults to `main`; for a stored mutation the\nwrite targets this branch." + }, + "params": { + "description": "JSON object whose keys match the stored query's declared parameters." + }, + "snapshot": { + "type": [ + "string", + "null" + ], + "description": "Snapshot id to read from (read queries only β€” rejected for a stored\nmutation). Mutually exclusive with `branch`." + } + } + }, + "InvokeStoredQueryResponse": { + "oneOf": [ + { + "$ref": "#/components/schemas/ReadOutput" + }, + { + "$ref": "#/components/schemas/ChangeOutput" + } + ], + "description": "Response for `POST /queries/{name}`: the read envelope for a stored\nread, or the mutation envelope for a stored mutation. Serialized\n**untagged**, so the wire shape is exactly [`ReadOutput`] or\n[`ChangeOutput`] β€” classification follows the stored query, not a\nwrapper field." 
+ }, "LoadMode": { "type": "string", "description": "Shadow enum for documenting [`LoadMode`] in the OpenAPI schema.", @@ -1698,6 +1903,120 @@ } } }, + "ParamDescriptor": { + "type": "object", + "description": "One declared parameter of a stored query, projected for the catalog.", + "required": [ + "name", + "kind", + "nullable" + ], + "properties": { + "item_kind": { + "oneOf": [ + { + "type": "null" + }, + { + "$ref": "#/components/schemas/ParamKind", + "description": "Element kind when `kind == list` (always a scalar β€” the grammar\nforbids lists of vectors or nested lists)." + } + ] + }, + "kind": { + "$ref": "#/components/schemas/ParamKind" + }, + "name": { + "type": "string" + }, + "nullable": { + "type": "boolean", + "description": "`false` β†’ the caller must supply it; `true` β†’ optional." + }, + "vector_dim": { + "type": [ + "integer", + "null" + ], + "format": "int32", + "description": "Dimension when `kind == vector`.", + "minimum": 0 + } + } + }, + "ParamKind": { + "type": "string", + "description": "The kind of a stored-query parameter, decomposed so a client (e.g. an\nMCP server) can build a typed input schema with a closed `match` and\nnever re-parse omnigraph's type spelling. 
`bigint`/`date`/`datetime`/\n`blob` are carried as JSON strings on the wire: a 64-bit integer past\n2^53 loses precision as a JSON number, and Date/DateTime are ISO\nstrings, Blob a blob-URI string.", + "enum": [ + "string", + "bool", + "int", + "bigint", + "float", + "date", + "datetime", + "blob", + "vector", + "list" + ] + }, + "QueriesCatalogOutput": { + "type": "object", + "description": "Response for `GET /queries`: the `mcp.expose` subset of a graph's\nstored-query registry, each with typed parameters.", + "required": [ + "queries" + ], + "properties": { + "queries": { + "type": "array", + "items": { + "$ref": "#/components/schemas/QueryCatalogEntry" + } + } + } + }, + "QueryCatalogEntry": { + "type": "object", + "description": "One entry in the stored-query catalog (`GET /queries`).", + "required": [ + "name", + "tool_name", + "mutation", + "params" + ], + "properties": { + "description": { + "type": [ + "string", + "null" + ] + }, + "instruction": { + "type": [ + "string", + "null" + ] + }, + "mutation": { + "type": "boolean", + "description": "`true` for a stored mutation β†’ an MCP read-only hint of `false`." + }, + "name": { + "type": "string", + "description": "Registry key / invoke path segment (`POST /queries/{name}`)." + }, + "params": { + "type": "array", + "items": { + "$ref": "#/components/schemas/ParamDescriptor" + } + }, + "tool_name": { + "type": "string", + "description": "MCP tool id (the `tool_name` override, else `name`)." + } + } + }, "QueryRequest": { "type": "object", "description": "Inline read-query request for `POST /query`.\n\nFriendlier-named alternative to [`ReadRequest`] for ad-hoc reads and\nAI-agent integration. 
Mutations are rejected with 400 — use `POST\n/mutate` (or its deprecated alias `POST /change`) for write queries.\nField names are deliberately short (`query`, `name`) to match the GQ\nkeyword and the CLI `-e` flag.", diff --git a/scripts/check-agents-md.sh b/scripts/check-agents-md.sh index abc6469..02a177a 100755 --- a/scripts/check-agents-md.sh +++ b/scripts/check-agents-md.sh @@ -34,10 +34,15 @@ PY canonical=() while IFS= read -r line; do canonical+=("$line") -done < <(find docs -type f -name '*.md' ! -path 'docs/releases/*' ! -path 'docs/internal/*' | sort) +done < <(find docs -type f -name '*.md' ! -path 'docs/releases/*' ! -path 'docs/internal/*' ! -path 'docs/rfcs/*' | sort) if [[ -d docs/releases ]]; then canonical+=("docs/releases/") fi +# RFCs are a growing collection (like releases): represent the directory, not +# every per-RFC file. The dir must be linked from an audience index. +if [[ -d docs/rfcs ]]; then + canonical+=("docs/rfcs/") +fi linked=() for index_file in "${index_files[@]}"; do diff --git a/scripts/local-rustfs-bootstrap.sh b/scripts/local-rustfs-bootstrap.sh index 29427de..c4fdcbe 100755 --- a/scripts/local-rustfs-bootstrap.sh +++ b/scripts/local-rustfs-bootstrap.sh @@ -6,7 +6,14 @@ SOURCE_REF="${SOURCE_REF:-main}" RELEASE_CHANNEL="${RELEASE_CHANNEL:-edge}" WORKDIR="${WORKDIR:-$PWD/.omnigraph-rustfs-demo}" RUSTFS_CONTAINER_NAME="${RUSTFS_CONTAINER_NAME:-omnigraph-rustfs-demo}" -RUSTFS_IMAGE="${RUSTFS_IMAGE:-rustfs/rustfs:latest}" +# Pinned to 1.0.0-beta.3 (2026-05-14) — the last known-good tag, matching CI +# (.github/workflows/ci.yml). `rustfs/rustfs:latest` (1.0.0-beta.4, 2026-05-21) +# added a credentials-policy check that refuses to start when the access/secret +# keys are values it considers "default" (rustfsadmin/rustfsadmin here). This +# script still works on beta.4+ because it passes +# RUSTFS_ALLOW_INSECURE_DEFAULT_CREDENTIALS=true below — so overriding +# RUSTFS_IMAGE to a newer tag is safe.
+RUSTFS_IMAGE="${RUSTFS_IMAGE:-rustfs/rustfs:1.0.0-beta.3}" RUSTFS_DATA_DIR="${RUSTFS_DATA_DIR:-$WORKDIR/rustfs-data}" BUCKET="${BUCKET:-omnigraph-local}" PREFIX="${PREFIX:-repos/context}" @@ -265,6 +272,7 @@ start_rustfs() { -v "$RUSTFS_DATA_DIR:/data" \ -e RUSTFS_ACCESS_KEY="$AWS_ACCESS_KEY_ID" \ -e RUSTFS_SECRET_KEY="$AWS_SECRET_ACCESS_KEY" \ + -e RUSTFS_ALLOW_INSECURE_DEFAULT_CREDENTIALS=true \ "$RUSTFS_IMAGE" \ /data >/dev/null } diff --git a/scripts/update-homebrew-formula.sh b/scripts/update-homebrew-formula.sh index 90a5dea..f2f0df9 100755 --- a/scripts/update-homebrew-formula.sh +++ b/scripts/update-homebrew-formula.sh @@ -64,20 +64,8 @@ cat >"$FORMULA_PATH" <<EOF class Omnigraph < Formula desc "Typed property graph database with Git-style workflows" homepage "https://github.com/${REPO_SLUG}" - license "MIT" version "${VERSION}" - - on_macos do - depends_on arch: :arm64 - url "${MACOS_ARM_URL}" - sha256 "${MACOS_ARM_SHA}" - end - - on_linux do - url "${LINUX_X86_URL}" - sha256 "${LINUX_X86_SHA}" - end - + license "MIT" head "https://github.com/${REPO_SLUG}.git", branch: "main" livecheck do @@ -85,6 +73,21 @@ class Omnigraph < Formula regex(/^v?(\\d+(?:\\.\\d+)+)$/i) end + on_macos do + depends_on arch: :arm64 + on_arm do + url "${MACOS_ARM_URL}" + sha256 "${MACOS_ARM_SHA}" + end + end + + on_linux do + on_intel do + url "${LINUX_X86_URL}" + sha256 "${LINUX_X86_SHA}" + end + end + def install bin.install "omnigraph", "omnigraph-server" end