diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index d4ecfa5..e937724 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -8,9 +8,9 @@ # CI fails if this file drifts from its source, and rejects PRs that # edit this file directly without also editing the yml. -* @ragnorc +* @ragnorc @aaltshuler -crates/** @ragnorc +crates/** @ragnorc @aaltshuler docs/** @ragnorc README.md @ragnorc AGENTS.md @ragnorc diff --git a/.github/DISCUSSION_TEMPLATE/rfc.yml b/.github/DISCUSSION_TEMPLATE/rfc.yml new file mode 100644 index 0000000..2a63525 --- /dev/null +++ b/.github/DISCUSSION_TEMPLATE/rfc.yml @@ -0,0 +1,34 @@ +labels: ["rfc"] +body: + - type: markdown + attributes: + value: | + Use this to **incubate an RFC** β€” socialize a design and reach rough + consensus before writing the formal document. When it's ready, graduate + it into a pull request that adds `docs/rfcs/NNNN-title.md` + (see [docs/rfcs/README.md](../blob/main/docs/rfcs/README.md)); a + maintainer merging that PR is acceptance. + + For a plain feature request or open-ended idea, use the **Ideas** + category instead. For bugs, open an [Issue](../../issues/new/choose). + - type: textarea + id: problem + attributes: + label: Problem / motivation + description: What needs solving, and why is it worth the long-run cost? + validations: + required: true + - type: textarea + id: sketch + attributes: + label: Proposed direction (sketch) + description: A rough shape of the design. Detail comes later in the RFC document. + validations: + required: true + - type: textarea + id: invariants + attributes: + label: Invariants touched + description: Which items in docs/dev/invariants.md does this affect or risk? Any deny-list brush? 
+ validations: + required: false diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml new file mode 100644 index 0000000..8e19465 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -0,0 +1,55 @@ +name: Bug report +description: Report a reproducible problem or wrong behavior in OmniGraph. +title: "bug: " +labels: ["bug", "needs-triage"] +body: + - type: markdown + attributes: + value: | + Issues are for **reporting problems** — concrete, reproducible bugs. + For ideas, feature requests, or questions, please use + [Discussions](../../discussions) instead. + For a security vulnerability, follow [SECURITY.md](../../blob/main/SECURITY.md) — do **not** file it here. + + A maintainer will triage this; once labelled **`accepted`** it's open for a pull request + (see [GOVERNANCE.md](../../blob/main/GOVERNANCE.md)). + - type: textarea + id: what-happened + attributes: + label: What happened + description: What went wrong, and what you expected instead. + validations: + required: true + - type: textarea + id: repro + attributes: + label: Steps to reproduce + description: Minimal steps, commands, schema/query, or a failing snippet. + placeholder: | + 1. omnigraph init ... + 2. omnigraph ... + 3. observed: ... / expected: ... + validations: + required: true + - type: input + id: version + attributes: + label: Version + description: Output of `omnigraph --version` (or the engine/crate version) and how you installed it. + validations: + required: true + - type: input + id: environment + attributes: + label: Environment + description: OS, architecture, and storage backend (local FS / S3 / RustFS / MinIO). + validations: + required: false + - type: textarea + id: logs + attributes: + label: Logs / output + description: Relevant error text or logs. Will be rendered as code. 
+ render: shell + validations: + required: false diff --git a/.github/ISSUE_TEMPLATE/config.yml b/.github/ISSUE_TEMPLATE/config.yml new file mode 100644 index 0000000..50720b8 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/config.yml @@ -0,0 +1,13 @@ +# Issues are for problem reports only. Disable blank issues so everything is +# routed: bugs through the form, everything else to Discussions / SECURITY.md. +blank_issues_enabled: false +contact_links: + - name: 💡 Idea, feature request, or RFC + url: https://github.com/ModernRelay/omnigraph/discussions + about: Propose features and designs in Discussions. RFCs graduate from there into a docs/rfcs/ pull request. + - name: ❓ Question or help + url: https://github.com/ModernRelay/omnigraph/discussions + about: Ask in Discussions — questions are not tracked as Issues. + - name: 🔒 Security vulnerability + url: https://github.com/ModernRelay/omnigraph/blob/main/SECURITY.md + about: Report security issues privately per SECURITY.md — never as a public Issue. 
diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000..2a548c7 --- /dev/null +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,29 @@ + + +## What & why + + + +## Backing issue / RFC + + + +- [ ] Fixes an **accepted** issue: Closes # +- [ ] Implements / is an **accepted** RFC: +- [ ] **Trivial fast-lane** (typo / docs / dependency bump / comment / one-line CI) — no issue/RFC required + +## Checklist + +- [ ] Change is focused (one logical change) +- [ ] Tests added/updated for behavior changes (or N/A) +- [ ] Public docs updated if user-facing surface changed (or N/A) +- [ ] Reviewed against [docs/dev/invariants.md](../blob/main/docs/dev/invariants.md) — no Hard Invariant weakened, no deny-list item hit (or justified) + +## Notes for reviewers + + diff --git a/.github/branch-protection.json b/.github/branch-protection.json index 61b7d33..c039e32 100644 --- a/.github/branch-protection.json +++ b/.github/branch-protection.json @@ -1,5 +1,5 @@ { - "_comment": "Branch protection policy for main. Applied via scripts/apply-branch-protection.sh. See docs/branch-protection.md for rationale.", + "_comment": "Branch protection policy for main. Applied via scripts/apply-branch-protection.sh. See docs/branch-protection.md for rationale. NOTE: bypass_pull_request_allowances.users must mirror the engineering owners in .github/codeowners-roles.yml — code owners merge their own PRs without a second review; non-owners still need a code-owner approval. 
(render-codeowners.py does NOT generate this list; keep it in sync by hand.)", "required_status_checks": { "strict": true, "contexts": [ @@ -7,8 +7,8 @@ "Check AGENTS.md Links", "Test Workspace", "Test omnigraph-server --features aws", - "CODEOWNERS / drift", - "CODEOWNERS / noedit" + "CODEOWNERS matches source", + "CODEOWNERS not hand-edited" ] }, "enforce_admins": false, @@ -17,7 +17,12 @@ "dismiss_stale_reviews": true, "require_code_owner_reviews": true, "required_approving_review_count": 1, - "require_last_push_approval": false + "require_last_push_approval": false, + "bypass_pull_request_allowances": { + "users": ["ragnorc", "aaltshuler"], + "teams": [], + "apps": [] + } }, "restrictions": null, "required_linear_history": true, diff --git a/.github/codeowners-roles.yml b/.github/codeowners-roles.yml index c5e36a9..ce4014d 100644 --- a/.github/codeowners-roles.yml +++ b/.github/codeowners-roles.yml @@ -22,6 +22,7 @@ roles: compiler. members: - ragnorc + - aaltshuler docs: description: > diff --git a/.github/scripts/render-codeowners.py b/.github/scripts/render-codeowners.py index f243d0c..5e96545 100755 --- a/.github/scripts/render-codeowners.py +++ b/.github/scripts/render-codeowners.py @@ -1,10 +1,14 @@ #!/usr/bin/env python3 -"""Render .github/CODEOWNERS from .github/codeowners-roles.yml. +"""Render .github/CODEOWNERS and the ownership tables in +docs/dev/codeowners.md from .github/codeowners-roles.yml. -The yml is the source of truth β€” editing CODEOWNERS directly is -rejected by CI (see .github/workflows/codeowners.yml). This script -expands the role-based yml into the flat pathβ†’owners format GitHub -expects. +The yml is the source of truth. This script expands the role-based yml +into (1) the flat pathβ†’owners format GitHub expects in +`.github/CODEOWNERS`, and (2) the "who owns what" markdown tables spliced +between the generated-region markers in `docs/dev/codeowners.md`. 
Both are +derived artifacts; CI re-renders them on every PR (see +.github/workflows/codeowners.yml) and auto-commits the result on same-repo +PRs, so the source of truth and the human-readable view never drift. Usage: python3 .github/scripts/render-codeowners.py @@ -16,6 +20,7 @@ Exits non-zero on: one owner; otherwise CODEOWNERS would assign nobody and GitHub would silently fall back to "no required reviewer", which defeats the purpose). + - Missing generated-region markers in docs/dev/codeowners.md. """ from __future__ import annotations @@ -34,6 +39,13 @@ except ImportError: REPO_ROOT = Path(__file__).resolve().parents[2] SOURCE = REPO_ROOT / ".github" / "codeowners-roles.yml" OUTPUT = REPO_ROOT / ".github" / "CODEOWNERS" +DOCS = REPO_ROOT / "docs" / "dev" / "codeowners.md" + +# The "who owns what" tables in docs/dev/codeowners.md are spliced between +# these markers so the human-readable view never drifts from the source of +# truth. Edit codeowners-roles.yml and re-render β€” never the table by hand. +DOCS_BEGIN = "" +DOCS_END = "" BANNER = """\ # AUTOGENERATED from .github/codeowners-roles.yml. Do not edit by hand. @@ -75,6 +87,62 @@ def owners_for(role_names: list[str], roles: dict) -> list[str]: return seen +def _oneline(text: str) -> str: + """Collapse a folded/multi-line YAML description into one cell of text.""" + return " ".join((text or "").split()) + + +def ownership_tables(spec: dict, roles: dict) -> str: + """Render the human-readable "who owns what" markdown β€” a pathβ†’owners + table (the operative view at PR time, in last-match-wins order with the + catch-all first) plus a roleβ†’members table. 
Spliced into the docs between + the markers so it is always current with the source of truth.""" + out: list[str] = [] + + out.append("**Path β†’ owners** (GitHub applies *last match wins*; the `*` " + "catch-all is listed first and is overridden by the specific " + "patterns below it):") + out.append("") + out.append("| Path | Owners | Role(s) |") + out.append("|---|---|---|") + if "default" in spec: + owners = " ".join(owners_for(spec["default"], roles)) + out.append(f"| `*` | {owners} | {', '.join(spec['default'])} |") + for pattern, role_names in (spec.get("paths") or {}).items(): + owners = " ".join(owners_for(role_names, roles)) + out.append(f"| `{pattern}` | {owners} | {', '.join(role_names)} |") + out.append("") + + out.append("**Roles**:") + out.append("") + out.append("| Role | Members | Description |") + out.append("|---|---|---|") + for name, role in roles.items(): + members = " ".join(f"@{m}" for m in (role.get("members") or [])) + out.append(f"| `{name}` | {members} | {_oneline(role.get('description', ''))} |") + out.append("") + + return "\n".join(out) + + +def splice_docs(table_md: str) -> None: + """Replace the region between DOCS_BEGIN/DOCS_END in the docs file with the + freshly generated tables, leaving surrounding prose untouched.""" + if not DOCS.exists(): + sys.exit(f"error: docs file not found: {DOCS}") + text = DOCS.read_text() + if DOCS_BEGIN not in text or DOCS_END not in text: + sys.exit( + f"error: ownership markers not found in {DOCS.relative_to(REPO_ROOT)}. " + f"Add the lines:\n {DOCS_BEGIN}\n {DOCS_END}\n" + f"around the generated table region." 
+ ) + head, rest = text.split(DOCS_BEGIN, 1) + _, tail = rest.split(DOCS_END, 1) + new = f"{head}{DOCS_BEGIN}\n\n{table_md}\n{DOCS_END}{tail}" + DOCS.write_text(new) + + def main() -> int: if not SOURCE.exists(): sys.exit(f"error: source file not found: {SOURCE}") @@ -127,6 +195,9 @@ def main() -> int: OUTPUT.write_text(rendered) print(f"wrote {OUTPUT.relative_to(REPO_ROOT)}") + + splice_docs(ownership_tables(spec, roles)) + print(f"updated {DOCS.relative_to(REPO_ROOT)}") return 0 diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3dc2e80..bbe5893 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -111,6 +111,18 @@ jobs: - name: Verify AGENTS.md ↔ docs/ cross-links run: bash scripts/check-agents-md.sh + entrypoint_test: + name: Container Entrypoint + runs-on: ubuntu-latest + permissions: + contents: read + steps: + - name: Checkout source + uses: actions/checkout@v5.0.1 + + - name: Verify omnigraph-server entrypoint arg composition + run: sh docker/entrypoint_test.sh + test: name: Test Workspace needs: classify_changes diff --git a/.github/workflows/codeowners.yml b/.github/workflows/codeowners.yml index 19d5835..75b3515 100644 --- a/.github/workflows/codeowners.yml +++ b/.github/workflows/codeowners.yml @@ -1,19 +1,24 @@ name: CODEOWNERS +# Runs on EVERY pull request (no paths filter). The two jobs below are +# required status checks on `main`; a path-filtered required check never +# reports for PRs outside the filter and leaves them permanently "pending" +# (the trap that forced admin-override merges). Always-run + cheap +# short-circuit is what keeps them honest. on: pull_request: - paths: - - '.github/codeowners-roles.yml' - - '.github/CODEOWNERS' - - '.github/scripts/render-codeowners.py' - - '.github/workflows/codeowners.yml' workflow_dispatch: -# Read-only; we never push from this workflow. +# `drift` auto-commits the regenerated artifacts back to same-repo PR +# branches, so it needs write access. 
permissions: - contents: read + contents: write jobs: + # NOTE: the job `name:` values below ("CODEOWNERS matches source" / + # "CODEOWNERS not hand-edited") ARE the status-check contexts that + # .github/branch-protection.json must list verbatim. Renaming a job here + # is a branch-protection change β€” update the JSON and re-apply. drift: name: CODEOWNERS matches source runs-on: ubuntu-latest @@ -28,19 +33,56 @@ jobs: - name: Install PyYAML run: pip install pyyaml - - name: Re-render CODEOWNERS + - name: Re-render CODEOWNERS + ownership docs run: python3 .github/scripts/render-codeowners.py - - name: Reject drift + # Same-repo PR: push the regenerated artifacts back so contributors + # never have to run the script locally. Mirrors the openapi.json + # auto-commit in ci.yml (separate shallow clone of the head branch so + # the pushed commit carries only the regenerated files). + - name: Commit regenerated artifacts to PR branch + if: | + github.event_name == 'pull_request' && + github.event.pull_request.head.repo.full_name == github.repository + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | - if ! git diff --quiet .github/CODEOWNERS; then - echo "::error::.github/CODEOWNERS is out of sync with .github/codeowners-roles.yml." - echo "::error::Run \`python3 .github/scripts/render-codeowners.py\` locally and commit the result." + if git diff --quiet -- .github/CODEOWNERS docs/dev/codeowners.md; then + echo "CODEOWNERS and ownership docs already in sync." + exit 0 + fi + tmp=$(mktemp -d) + git clone --depth 1 --branch "${{ github.head_ref }}" \ + "https://x-access-token:${GITHUB_TOKEN}@github.com/${{ github.repository }}.git" \ + "$tmp" + cp .github/CODEOWNERS "$tmp/.github/CODEOWNERS" + cp docs/dev/codeowners.md "$tmp/docs/dev/codeowners.md" + cd "$tmp" + if git diff --quiet -- .github/CODEOWNERS docs/dev/codeowners.md; then + echo "Head branch already matches; nothing to push." 
+ exit 0 + fi + git config user.name "github-actions[bot]" + git config user.email "41898282+github-actions[bot]@users.noreply.github.com" + git add .github/CODEOWNERS docs/dev/codeowners.md + git commit -m "chore: regenerate CODEOWNERS + ownership docs" + git push + + # Fork PR / workflow_dispatch: cannot push back, so enforce drift + # strictly. The contributor runs the script and commits the result. + - name: Verify in sync (forks / manual runs) + if: | + !(github.event_name == 'pull_request' && + github.event.pull_request.head.repo.full_name == github.repository) + run: | + if ! git diff --quiet -- .github/CODEOWNERS docs/dev/codeowners.md; then + echo "::error::Generated CODEOWNERS / ownership docs are out of sync with .github/codeowners-roles.yml." + echo "::error::Run \`python3 .github/scripts/render-codeowners.py\` and commit the result." echo "--- diff ---" - git --no-pager diff .github/CODEOWNERS + git --no-pager diff -- .github/CODEOWNERS docs/dev/codeowners.md exit 1 fi - echo "CODEOWNERS is in sync with its source." + echo "Generated artifacts are in sync with their source." noedit: name: CODEOWNERS not hand-edited @@ -52,6 +94,8 @@ jobs: fetch-depth: 0 - name: Reject hand-edits to generated file + # Only meaningful for PRs (needs a base to diff against). 
+ if: github.event_name == 'pull_request' run: | base="origin/${{ github.base_ref }}" git fetch origin "${{ github.base_ref }}" --quiet diff --git a/.github/workflows/release-edge.yml b/.github/workflows/release-edge.yml index 6147646..3996e65 100644 --- a/.github/workflows/release-edge.yml +++ b/.github/workflows/release-edge.yml @@ -43,6 +43,8 @@ jobs: asset_name: omnigraph-linux-x86_64 - runner: macos-14 asset_name: omnigraph-macos-arm64 + - runner: windows-latest + asset_name: omnigraph-windows-x86_64 env: CARGO_TERM_COLOR: always steps: @@ -59,6 +61,10 @@ jobs: if: runner.os == 'macOS' run: brew install protobuf + - name: Install Windows dependencies + if: runner.os == 'Windows' + run: choco install protoc -y + - name: Install Rust stable uses: dtolnay/rust-toolchain@stable with: @@ -73,7 +79,8 @@ jobs: - name: Build release binaries run: cargo build --release --locked -p omnigraph-cli -p omnigraph-server - - name: Package release archive + - name: Package Unix release archive + if: runner.os != 'Windows' run: | mkdir -p release install -m 0755 target/release/omnigraph release/omnigraph @@ -81,6 +88,22 @@ jobs: tar -C release -czf "${{ matrix.asset_name }}.tar.gz" omnigraph omnigraph-server shasum -a 256 "${{ matrix.asset_name }}.tar.gz" > "${{ matrix.asset_name }}.sha256" + - name: Package Windows release archive + if: runner.os == 'Windows' + run: | + New-Item -ItemType Directory -Force -Path release | Out-Null + Copy-Item target/release/omnigraph.exe release/omnigraph.exe + Copy-Item target/release/omnigraph-server.exe release/omnigraph-server.exe + Compress-Archive -Path release/omnigraph.exe, release/omnigraph-server.exe -DestinationPath "${{ matrix.asset_name }}.zip" -Force + $hash = (Get-FileHash "${{ matrix.asset_name }}.zip" -Algorithm SHA256).Hash.ToLowerInvariant() + "$hash ${{ matrix.asset_name }}.zip" | Out-File -FilePath "${{ matrix.asset_name }}.sha256" -Encoding ascii + New-Item -ItemType Directory -Force -Path verify | Out-Null + 
Expand-Archive -Path "${{ matrix.asset_name }}.zip" -DestinationPath verify -Force + $items = Get-ChildItem -Path verify -File + if ($items.Count -ne 2 -or !(Test-Path verify/omnigraph.exe) -or !(Test-Path verify/omnigraph-server.exe)) { + throw "Windows release archive is missing expected binaries" + } + - name: Publish edge release assets uses: softprops/action-gh-release@v2.5.0 with: @@ -91,5 +114,22 @@ jobs: body: | Rolling prerelease from `${{ github.sha }}`. files: | - ${{ matrix.asset_name }}.tar.gz - ${{ matrix.asset_name }}.sha256 + ${{ matrix.asset_name }}.* + + smoke_windows_installer: + name: Smoke Windows installer + needs: build_release + runs-on: windows-latest + permissions: + contents: read + steps: + - name: Checkout source + uses: actions/checkout@v5.0.1 + + - name: Install from edge release + run: ./scripts/install.ps1 -ReleaseChannel edge -InstallDir "$env:RUNNER_TEMP/omnigraph-bin" + + - name: Smoke installed binaries + run: | + & "$env:RUNNER_TEMP/omnigraph-bin/omnigraph.exe" version + & "$env:RUNNER_TEMP/omnigraph-bin/omnigraph-server.exe" --help diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index e7fc75f..a265c40 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -20,6 +20,8 @@ jobs: asset_name: omnigraph-linux-x86_64 - runner: macos-14 asset_name: omnigraph-macos-arm64 + - runner: windows-latest + asset_name: omnigraph-windows-x86_64 env: CARGO_TERM_COLOR: always steps: @@ -36,6 +38,10 @@ jobs: if: runner.os == 'macOS' run: brew install protobuf + - name: Install Windows dependencies + if: runner.os == 'Windows' + run: choco install protoc -y + - name: Install Rust stable uses: dtolnay/rust-toolchain@stable with: @@ -50,7 +56,8 @@ jobs: - name: Build release binaries run: cargo build --release --locked -p omnigraph-cli -p omnigraph-server - - name: Package release archive + - name: Package Unix release archive + if: runner.os != 'Windows' run: | mkdir -p release install -m 0755 
target/release/omnigraph release/omnigraph @@ -58,12 +65,27 @@ jobs: tar -C release -czf "${{ matrix.asset_name }}.tar.gz" omnigraph omnigraph-server shasum -a 256 "${{ matrix.asset_name }}.tar.gz" > "${{ matrix.asset_name }}.sha256" + - name: Package Windows release archive + if: runner.os == 'Windows' + run: | + New-Item -ItemType Directory -Force -Path release | Out-Null + Copy-Item target/release/omnigraph.exe release/omnigraph.exe + Copy-Item target/release/omnigraph-server.exe release/omnigraph-server.exe + Compress-Archive -Path release/omnigraph.exe, release/omnigraph-server.exe -DestinationPath "${{ matrix.asset_name }}.zip" -Force + $hash = (Get-FileHash "${{ matrix.asset_name }}.zip" -Algorithm SHA256).Hash.ToLowerInvariant() + "$hash ${{ matrix.asset_name }}.zip" | Out-File -FilePath "${{ matrix.asset_name }}.sha256" -Encoding ascii + New-Item -ItemType Directory -Force -Path verify | Out-Null + Expand-Archive -Path "${{ matrix.asset_name }}.zip" -DestinationPath verify -Force + $items = Get-ChildItem -Path verify -File + if ($items.Count -ne 2 -or !(Test-Path verify/omnigraph.exe) -or !(Test-Path verify/omnigraph-server.exe)) { + throw "Windows release archive is missing expected binaries" + } + - name: Publish GitHub release assets uses: softprops/action-gh-release@v2.5.0 with: files: | - ${{ matrix.asset_name }}.tar.gz - ${{ matrix.asset_name }}.sha256 + ${{ matrix.asset_name }}.* update_homebrew_tap: name: Update Homebrew tap @@ -99,6 +121,31 @@ jobs: run: | ./scripts/update-homebrew-formula.sh "${GITHUB_REF_NAME}" homebrew-tap/Formula/omnigraph.rb + # Diagnostic only: brew is not on PATH on the ubuntu runner by default, so + # set it up explicitly. Both this setup and the audit below are best-effort + # canaries, not gates β€” continue-on-error on each keeps a failed/flaky brew + # (the action is pinned to a moving @master ref) from skipping the actual + # tap publish below. 
The formula is correct by construction + # (update-homebrew-formula.sh), so brew tooling must never block the push. + - name: Set up Homebrew + if: env.HOMEBREW_TAP_SKIP != '1' + continue-on-error: true + uses: Homebrew/actions/setup-homebrew@master + + - name: Audit generated formula + if: env.HOMEBREW_TAP_SKIP != '1' + continue-on-error: true + run: | + # Audit the checked-out tap by name (brew audit rejects bare paths + # and needs tap context). Symlink the checkout into Homebrew's Taps + # tree so `modernrelay/tap/omnigraph` resolves to it. Offline audit + # (no --online) keeps it deterministic; it still catches the + # ComponentsOrder/structure class of problems. + tap_dir="$(brew --repository)/Library/Taps/modernrelay/homebrew-tap" + mkdir -p "$(dirname "$tap_dir")" + ln -sfn "$PWD/homebrew-tap" "$tap_dir" + brew audit --strict modernrelay/tap/omnigraph + - name: Commit and push formula update if: env.HOMEBREW_TAP_SKIP != '1' working-directory: homebrew-tap @@ -113,3 +160,22 @@ jobs: git add Formula/omnigraph.rb git commit -m "Update Omnigraph formula to ${GITHUB_REF_NAME}" git push origin HEAD:main + + smoke_windows_installer: + name: Smoke Windows installer + needs: build_release + if: startsWith(github.ref, 'refs/tags/v') + runs-on: windows-latest + permissions: + contents: read + steps: + - name: Checkout source + uses: actions/checkout@v5.0.1 + + - name: Install from tagged release + run: ./scripts/install.ps1 -Version "$env:GITHUB_REF_NAME" -InstallDir "$env:RUNNER_TEMP/omnigraph-bin" + + - name: Smoke installed binaries + run: | + & "$env:RUNNER_TEMP/omnigraph-bin/omnigraph.exe" version + & "$env:RUNNER_TEMP/omnigraph-bin/omnigraph-server.exe" --help diff --git a/AGENTS.md b/AGENTS.md index 27d1b7b..3f5b711 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -16,7 +16,7 @@ Tools that support `@`-imports (Claude Code) auto-include all three files via th `CLAUDE.md` is a symlink to this file β€” there is exactly one source of truth. Edit `AGENTS.md`. 
-**Version surveyed:** 0.6.0 +**Version surveyed:** 0.6.1 **Workspace crates:** `omnigraph-compiler`, `omnigraph` (engine), `omnigraph-policy`, `omnigraph-cli`, `omnigraph-server` **Storage substrate:** Lance 6.x (columnar, versioned, branchable) **License:** MIT @@ -81,7 +81,7 @@ Full diagram and concurrency model: [docs/dev/architecture.md](docs/dev/architec | Embeddings (compiler + engine clients, env vars, `@embed`) | [docs/user/embeddings.md](docs/user/embeddings.md) | | Branches, commit graph, snapshots, system branches | [docs/user/branches-commits.md](docs/user/branches-commits.md) | | Transactions and atomicity (per-query atomic; branches as multi-query transactions) | [docs/user/transactions.md](docs/user/transactions.md) | -| Direct-publish writes (the former Run state machine, now demoted to publisher CAS) | [docs/dev/runs.md](docs/dev/runs.md) | +| Direct-publish write path (staging, D2, recovery sidecars; the former Run state machine) | [docs/dev/writes.md](docs/dev/writes.md) | | Three-way merge and conflict kinds | [docs/dev/merge.md](docs/dev/merge.md) | | Diff / change feed (`diff_between`, `diff_commits`) | [docs/user/changes.md](docs/user/changes.md) | | Query execution, mutation execution, bulk loader, `load` vs `ingest` | [docs/dev/execution.md](docs/dev/execution.md) | @@ -164,6 +164,32 @@ If a proposal fits one of these, the burden is on the proposer to justify why th --- +## Build, test, lint + +Rust stable workspace (edition 2024). `protoc` is a build dependency (`brew install protobuf` / `apt-get install protobuf-compiler libprotobuf-dev`). **Crate dir β‰  package name** for the engine: the directory is `crates/omnigraph` but its Cargo package is `omnigraph-engine` (use that in `-p`). The CLI binary built from `omnigraph-cli` is named `omnigraph`. 
+ +```bash +cargo build --workspace --locked # build everything +cargo test --workspace --locked # the canonical CI gate (matches CI exactly) +cargo run -p omnigraph-cli -- # run the `omnigraph` CLI from source +cargo run -p omnigraph-server -- --bind 0.0.0.0:8080 # run the server from source + +# Run one crate / one test file / one test fn +cargo test -p omnigraph-engine --test traversal # one integration-test file (see docs/dev/testing.md) +cargo test -p omnigraph-engine --test writes concurrent # one test fn by name substring +cargo test -p omnigraph-engine some_inline_test -- --nocapture # show stdout + +# Feature-gated suites (each is its own job in CI, not part of the default run) +cargo test -p omnigraph-engine --features failpoints --test failpoints # fault injection +cargo build -p omnigraph-server --features aws # AWS Secrets Manager bearer-token source +``` + +S3-backed tests (`s3_storage`, and the S3 paths in server/CLI system tests) **skip** unless `OMNIGRAPH_S3_TEST_BUCKET` + `AWS_*` (incl. `AWS_ENDPOINT_URL_S3` for non-AWS) are set; CI runs them against containerized RustFS. `scripts/local-rustfs-bootstrap.sh` stands up a local S3 environment. + +CI does **not** run `clippy` or `rustfmt` as gates — but `cargo test --workspace --locked` is the exact gate, so run it before pushing. Two non-test CI checks: `scripts/check-agents-md.sh` (doc cross-link integrity — run it after moving/renaming docs) and OpenAPI drift (`crates/omnigraph-server/tests/openapi.rs` regenerates `openapi.json`; set `OMNIGRAPH_UPDATE_OPENAPI=1` to update the checked-in copy when a server/API change is intentional). 
+ +--- + ## Quick-reference flows ```bash @@ -210,8 +236,8 @@ omnigraph policy explain --actor act-alice --action change --branch main | Columnar storage on object store | βœ… Arrow/Lance | URI normalization, S3 env-var plumbing | | Per-dataset versioning + time travel | βœ… | `snapshot_at_version`, `entity_at`, snapshot-pinned reads across many tables | | Per-dataset branches | βœ… | **Graph-level** branches (atomic across all sub-tables), lazy fork, system branch filtering | -| Atomic single-dataset commits | βœ… | **Multi-table publish via three layers**, NOT a single Lance primitive: (1) per-table Lance `commit_staged` for the data write, (2) `__manifest` row-level CAS via `ManifestBatchPublisher` for cross-table ordering, (3) the open-time recovery sweep for the residual gap between (1) and (2). All three layers ship; the four migrated writers (`MutationStaging::finalize`, `schema_apply`, `branch_merge`, `ensure_indices`) write a `__recovery/{ulid}.json` sidecar before Phase B and delete it after Phase C. The next `Omnigraph::open` (gated on `OpenMode::ReadWrite`) runs the sweep in `db/manifest/recovery.rs`: classify, decide all-or-nothing per sidecar, roll forward via single `ManifestBatchPublisher::publish` or roll back via `Dataset::restore`, and record an audit row in `_graph_commit_recoveries.lance` (queryable via `omnigraph commit list --filter actor=omnigraph:recovery`). Continuous in-process recovery (no restart needed between Phase B failure and recovery) is the goal of a future background reconciler. 
Engine writes route through a sealed `TableStorage` trait exposing `stage_*` + `commit_staged` as the canonical staged-write surface; documented inline-commit residuals (`delete_where`, `create_vector_index`, plus legacy `append_batch` / `merge_insert_batches` / `overwrite_batch` / `create_*_index`) remain on the trait until upstream Lance ships a public two-phase API ([#6658](https://github.com/lance-format/lance/issues/6658), [#6666](https://github.com/lance-format/lance/issues/6666)) and the migration of every call site completes. | -| Compaction (`compact_files`) | βœ… | `omnigraph optimize` orchestrates over all node/edge tables, bounded concurrency | +| Atomic single-dataset commits | βœ… | **Multi-table publish via three layers**, NOT a single Lance primitive: (1) per-table Lance `commit_staged` for the data write, (2) `__manifest` row-level CAS via `ManifestBatchPublisher` for cross-table ordering, (3) the open-time recovery sweep for the residual gap between (1) and (2). All three layers ship; the five migrated writers (`MutationStaging::finalize`, `schema_apply`, `branch_merge`, `ensure_indices`, `optimize_all_tables`) write a `__recovery/{ulid}.json` sidecar before Phase B and delete it after Phase C. The next `Omnigraph::open` (gated on `OpenMode::ReadWrite`) runs the sweep in `db/manifest/recovery.rs`: classify, decide all-or-nothing per sidecar, roll forward via single `ManifestBatchPublisher::publish` or roll back via `Dataset::restore` followed by a manifest publish of the restored version (so both directions converge to `manifest == HEAD` β€” no residual drift), and record an audit row in `_graph_commit_recoveries.lance` (queryable via `omnigraph commit list --filter actor=omnigraph:recovery`). Continuous in-process recovery (no restart needed between Phase B failure and recovery) is the goal of a future background reconciler. 
Engine writes route through a sealed `TableStorage` trait exposing `stage_*` + `commit_staged` as the canonical staged-write surface; documented inline-commit residuals (`delete_where`, `create_vector_index`, plus legacy `append_batch` / `merge_insert_batches` / `overwrite_batch` / `create_*_index`) remain on the trait until upstream Lance ships a public two-phase API ([#6658](https://github.com/lance-format/lance/issues/6658), [#6666](https://github.com/lance-format/lance/issues/6666)) and the migration of every call site completes. | +| Compaction (`compact_files`) | βœ… | `omnigraph optimize` orchestrates over all node/edge tables, bounded concurrency; **publishes each compacted table's new version to `__manifest`** (so the manifest tracks the Lance HEAD β€” required for reads to observe compaction and for schema apply / strict writes to pass their HEAD-vs-manifest precondition), under the per-`(table, main)` write queue with `SidecarKind::Optimize` recovery coverage; **refuses on an unrecovered graph** (errors if a `__recovery` sidecar is pending β€” recovery may roll back a partial write, so optimize requires `manifest == HEAD` going in); **skips blob-bearing tables** (reported via `TableOptimizeStats.skipped`, not silent), gated on `LANCE_SUPPORTS_BLOB_COMPACTION` until the upstream blob-v2 compaction-decode bug is fixed (see [docs/dev/invariants.md](docs/dev/invariants.md) Known Gaps) | | Cleanup (`cleanup_old_versions`) | βœ… | `omnigraph cleanup` with `--keep` / `--older-than` policy | | BTREE / inverted (FTS) / vector indexes | βœ… | `ensure_indices` builds them on every relevant column; idempotent; lazy across branches | | `merge_insert` upsert | βœ… | `LoadMode::Merge`, mutation `update`/`insert`/`delete` lowering | diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 8d9c687..2d77ef0 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,10 +1,29 @@ # Contributing -Small bug fixes and documentation improvements are welcome directly through pull 
-requests. +Thanks for your interest in OmniGraph. This page is the practical how-to; the +rules and decision authority behind it live in [GOVERNANCE.md](GOVERNANCE.md). -For larger changes, please open an issue or design discussion first so the -proposed direction is clear before implementation starts. +## Start in the right place + +| I want to… | Go to | Notes | +|---|---|---| +| **Report a bug** or wrong behavior | **[Open an Issue](../../issues/new/choose)** | Concrete and reproducible. A maintainer triages it; once labelled **`accepted`** it's open for a PR. | +| **Suggest a feature / share an idea / ask** | **[Start a Discussion](../../discussions)** | Ideas and questions live here, not in Issues. | +| **Propose a design / RFC** | **An RFC pull request** | Anyone can author one — see [docs/rfcs/README.md](docs/rfcs/README.md). A maintainer merging it is acceptance. | +| **Fix something / implement a change** | **A pull request** | Must link an `accepted` issue or an accepted RFC — unless it's trivial (below). | +| **Report a security vulnerability** | **[SECURITY.md](SECURITY.md)** | Do **not** open a public Issue. | + +### When can I just open a PR? +The **trivial fast-lane** — open directly, no prior issue/RFC needed: typo and +wording fixes, doc corrections, dependency bumps, comment fixes, obvious +one-line CI tweaks. Anything more substantial needs a backing `accepted` issue +or accepted RFC first, so the *why* is agreed before the *how* is reviewed. A PR +that turns out to be non-trivial will be redirected — that's about process, not +the merit of the change. + +> **Maintainers (ModernRelay team)** follow a separate internal process and are +> not bound by the intake rules above. Everyone is bound by review, CODEOWNERS, +> branch protection, and CI. ## Development @@ -49,6 +68,11 @@ CI runs both. 
## Pull Requests -- keep changes focused -- include tests for behavior changes when practical -- update public docs when the user-facing surface changes +- **Link the backing issue or RFC** (`Closes #123`, or reference the RFC) — or + mark the PR as trivial per the fast-lane. +- Keep changes focused; one logical change per PR. +- Include tests for behavior changes when practical. +- Update public docs when the user-facing surface changes. + +New to the codebase? Read [AGENTS.md](AGENTS.md) — the architecture map and the +always-on invariants every change is reviewed against. diff --git a/Cargo.lock b/Cargo.lock index a3d6d62..3223b9c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4543,7 +4543,7 @@ dependencies = [ [[package]] name = "omnigraph-cli" -version = "0.6.0" +version = "0.6.1" dependencies = [ "assert_cmd", "clap", @@ -4565,7 +4565,7 @@ dependencies = [ [[package]] name = "omnigraph-compiler" -version = "0.6.0" +version = "0.6.1" dependencies = [ "ahash", "arrow-array", @@ -4586,7 +4586,7 @@ dependencies = [ [[package]] name = "omnigraph-engine" -version = "0.6.0" +version = "0.6.1" dependencies = [ "arc-swap", "arrow-array", @@ -4627,7 +4627,7 @@ dependencies = [ [[package]] name = "omnigraph-policy" -version = "0.6.0" +version = "0.6.1" dependencies = [ "cedar-policy", "clap", @@ -4640,7 +4640,7 @@ dependencies = [ [[package]] name = "omnigraph-server" -version = "0.6.0" +version = "0.6.1" dependencies = [ "arc-swap", "async-trait", diff --git a/GOVERNANCE.md b/GOVERNANCE.md new file mode 100644 index 0000000..5878f1f --- /dev/null +++ b/GOVERNANCE.md @@ -0,0 +1,106 @@ +# Governance + +This document describes how **external contributions** to OmniGraph are +proposed, accepted, and merged. 
It exists so an outside contributor can answer, +without asking: *where does my report/idea/change go, who decides, and what has +to happen before code lands?* + +> **Scope.** This governs the public contribution surface — Issues, +> Discussions, RFCs, and pull requests from people outside the ModernRelay +> team. **Maintainers operate under a separate internal process** and are not +> bound by the intake gates below. Everyone, maintainer or not, is still bound +> by the universal gates: branch protection on `main` and CODEOWNERS review +> (see [docs/dev/branch-protection.md](docs/dev/branch-protection.md) and +> [docs/dev/codeowners.md](docs/dev/codeowners.md)). + +## Roles + +| Role | Who | Authority | +|---|---|---| +| **Maintainer** | The code owners in [`.github/CODEOWNERS`](.github/CODEOWNERS) (generated from [`.github/codeowners-roles.yml`](.github/codeowners-roles.yml)) | Validate issues, accept/reject RFCs, review and merge PRs, set direction. Final decision authority. | +| **Contributor** | Anyone else | Report problems (Issues), propose ideas (Discussions), author RFCs, and open pull requests. | + +Decision authority rests with the maintainers. CODEOWNERS is the single source +of truth for who that is; this document does not duplicate the list. + +## The three channels + +Each channel has one job. Using the right one is the first thing we ask of a +contribution. + +| Channel | Purpose | Not for | +|---|---|---| +| **[Issues](../../issues)** | **Report a problem** — a bug, a regression, a documented behavior that's wrong. Something concrete and reproducible. | Feature requests, ideas, questions, or design proposals (→ Discussions). | +| **[Discussions](../../discussions)** | **Propose and explore** — new ideas, feature requests, questions, and the incubation of RFCs. | Bug reports (→ Issues). | +| **Pull requests** | **Land a sanctioned change** — a fix for a *validated* issue, an *accepted* RFC, or a trivial change (see fast-lane). 
| Substantive change with no backing issue/RFC β€” it will be redirected. | + +## How a change becomes mergeable + +``` + β”Œβ”€β”€β”€β”€β”€β”€β”€β”€β”€β”€β”€ bug ───────────┐ β”Œβ”€β”€β”€β”€β”€β”€β”€β”€ idea / feature ────────┐ + β–Ό β”‚ β–Ό β”‚ + Issue (problem report) β”‚ Discussion (idea / RFC incubation) β”‚ + β”‚ β”‚ β”‚ β”‚ + maintainer triage β”‚ rough consensus β”‚ + β”‚ β”‚ β”‚ graduate β”‚ + β–Ό β”‚ β–Ό β”‚ + label: accepted ──────────┐ β”‚ RFC PR (docs/rfcs/NNNN-*.md) β”‚ + β”‚ β”‚ β”‚ β”‚ β”‚ + β”‚ β”‚ β”‚ maintainer review β”‚ + β–Ό β–Ό β”‚ β–Ό β”‚ + Pull request ◀──────────┴──────────│── merged == accepted β”‚ + (links the issue or the accepted RFC) β—€β”€β”€β”€β”€β”€β”€β”€β”˜ (implementation PRs reference it) β”‚ + β”‚ + review + CODEOWNERS + branch protection + β–Ό + merged +``` + +### Issues β†’ validated +A new issue starts unlabeled. A maintainer triages it and, if it's a real, +in-scope problem, applies the **`accepted`** label. **Only `accepted` issues are +open for a contributor PR.** This prevents the "I fixed an issue you hadn't +agreed was a problem" rejection. Want to fix something? Get the issue accepted +first, or pick one already labelled `accepted` / `help wanted`. + +### Discussions β†’ RFCs β†’ accepted +Ideas and feature requests start in **Discussions**. Anyone β€” including external +contributors β€” may then **author an RFC** by opening a pull request that adds +`docs/rfcs/NNNN-title.md` (see [docs/rfcs/README.md](docs/rfcs/README.md)). The +RFC is reviewed as code; **a maintainer merging it is the act of acceptance** +(it becomes the durable decision record). Implementation PRs then reference the +accepted RFC. + +Authoring an RFC is open to everyone; **accepting one is a maintainer +decision.** Maintainers may also decline an RFC, with rationale, by closing it. + +### Pull requests β†’ sanctioned +A contributor PR must do one of: +1. link a maintainer-**`accepted`** issue it fixes, or +2. 
be (or reference) an **accepted RFC**, or +3. qualify for the **trivial fast-lane**. + +**Trivial fast-lane** — these may be opened directly, no prior issue/RFC: +typo and wording fixes, documentation corrections, dependency bumps, comment +fixes, and obviously-correct one-line CI tweaks. When in doubt, open an Issue or +Discussion first; a PR that turns out to be non-trivial will be asked to do so. + +A substantive PR with no backing issue/RFC will be closed with a pointer to the +right channel — not as a judgment of the idea, but to keep design discussion +where it's reviewable. + +## What maintainers do *not* gate +Maintainers' own changes do not pass through the intake gates above — the team +runs a separate internal process. The universal gates (review, CODEOWNERS, +branch protection, CI) apply to everyone. Enforcement of the intake rules is, to +start, **by convention and review** (PR template + labels); an automated check +keyed to author association may be added later if volume warrants. + +## Code of conduct & security +- Conduct: [CODE_OF_CONDUCT.md](CODE_OF_CONDUCT.md). +- Security issues are **not** public Issues — see [SECURITY.md](SECURITY.md). + +## Changing this document +Governance changes the same way code does: a pull request, reviewed by +maintainers. This file describes the external surface; the internal maintainer +process is intentionally out of scope here. diff --git a/README.md b/README.md index ae3234b..0f6ebea 100644 --- a/README.md +++ b/README.md @@ -5,33 +5,35 @@ [![Crates.io](https://img.shields.io/crates/v/omnigraph-cli.svg)](https://crates.io/crates/omnigraph-cli) [![CI](https://github.com/ModernRelay/omnigraph/actions/workflows/ci.yml/badge.svg)](https://github.com/ModernRelay/omnigraph/actions/workflows/ci.yml) -**Object-storage native knowledge graph with git-style workflows. 
Designed for agents and humans to collaborate on shared structured knowledge.** +**Lakehouse native graph engine built for context assembly** -Turns fragmented context into a live graph, lets humans and agents coordinate through that graph, and uses branches so agent-generated changes can be reviewed and merged safely. +Omnigraph acts as an operational state & coordination layer for agents -Built on Rust, Arrow, DataFusion and Lance. - -Join the [Omnigraph Slack community](https://join.slack.com/t/omnigraphworkspace/shared_invite/zt-3wfpglyxj-lHvJGhuySPfqLtN35uJZNw) - -## Use Cases - -- Company brain / [Second brain](https://github.com/ModernRelay/omnigraph-cookbooks/tree/main/second-brain) -- Context graph -- Knowledge base for multi-agent research -- Incident response graph -- Compliance & audit graph - - -## Capabilities - -- Typed schema, typed queries, and typed mutations +- Git-style versioning & branching +- Multimodal retrieval (graph+vector/fts+filters) optimized for context assembly +- Object storage native (S3, RustFS) - Native blob-as-data support (docs, images, videos, etc) -- Schema-as-code, query validation and linting -- Git-style graph workflows: branches, commits, merges, and transactional runs -- Local, on-prem & cloud S3-native storage with snapshot-pinned reads -- Graph traversal + text, fuzzy, BM25, vector, and RRF search in one runtime -- Policy-as-code for server-side access control -- Single CLI for multiple deployments +- VPC, On-prem, hybrid deployment +- [`Lance`](https://github.com/lance-format/lance) format as open storage layer + +| AS CODE | What it means | +|---|---| +| **Schema AS CODE** | Typed `.pg` schemas, planned, applied, enforced | +| **Context AS CODE** | Linted queries & agentic nudges, versioned and reusable | +| **Security AS CODE** | Cedar policies enforced server-side on every mutation | +| **Dashboards AS CODE** | Declarative views & controls over the graph *(coming)* | + +## Core Use Cases + +| Use case | What it's for |
+|---|---| +| **Company brain** | Org knowledge unified into one queryable graph | +| **Context graph** | Decision traces and codified tribal knowledge | +| **Agentic memory** | Durable, versioned memory for long-running agents | +| **Dev graph** | Issues & dependency model for coding agents | +| **R&D data layer** | Experiments & trials data written into branches | +| **ML workflows** | Versioned, branchable graphs for training & eval | +| **Karpathy's LLM wiki** | A living, agent-updatable knowledge base | ## Quick Install @@ -86,12 +88,29 @@ omnigraph branch create --from main feature-x ./graph.omni omnigraph branch merge feature-x --into main ./graph.omni ``` -See [docs/user/cli.md](docs/user/cli.md) for schema apply, snapshots, ingest, runs, and policy commands. +See [docs/user/cli.md](docs/user/cli.md) for schema apply, snapshots, ingest, commits, and policy commands. + +## Clients + +For programmatic access to a running `omnigraph-server`: + +- **TypeScript SDK** — [`@modernrelay/omnigraph`](https://www.npmjs.com/package/@modernrelay/omnigraph) ([source](https://github.com/ModernRelay/omnigraph-ts/tree/main/packages/sdk)). Instance-per-client, typed errors, camelCase types, async-iterator streaming export. + + ```bash + npm install @modernrelay/omnigraph + ``` + +- **Model Context Protocol server** — [`@modernrelay/omnigraph-mcp`](https://www.npmjs.com/package/@modernrelay/omnigraph-mcp) ([source](https://github.com/ModernRelay/omnigraph-ts/tree/main/packages/mcp)). Bridges Omnigraph to LLM hosts (Claude Desktop, Claude Code, …) over stdio. Exposes tools and resources for schema, branches, queries, mutations, and ingest, and bundles curated best-practices guidance from the cookbook. + + ```bash + npm install -g @modernrelay/omnigraph-mcp + ``` + +Both packages are versioned in lockstep with `omnigraph-server` on major.minor: `@modernrelay/omnigraph@X.Y.*` targets `omnigraph-server@X.Y.*`. 
See [`ModernRelay/omnigraph-ts`](https://github.com/ModernRelay/omnigraph-ts) for the monorepo. ## Docs - [Install guide](docs/user/install.md) -- [CLI guide](docs/user/cli.md) - [Deployment guide](docs/user/deployment.md) ## Build And Test @@ -113,8 +132,8 @@ Notes: - `crates/omnigraph-compiler`: shared schema/query parser, typechecker, catalog, and IR lowering - `crates/omnigraph`: storage/runtime, branching, merge, change detection, and query execution -- `crates/omnigraph-cli`: CLI for init/load/ingest/read/change/branch/snapshot/export/policy operations -- `crates/omnigraph-server`: Axum HTTP server for remote reads, changes, ingest, export, branches, commits, and runs +- `crates/omnigraph-cli`: CLI for graph lifecycle (init/load/ingest), query/mutate, branch/commit/merge, schema/lint, snapshot/export, policy, and maintenance (optimize/cleanup) +- `crates/omnigraph-server`: Axum HTTP server for remote reads, changes, ingest, export, branches, and commits ## Contributing diff --git a/crates/omnigraph-cli/Cargo.toml b/crates/omnigraph-cli/Cargo.toml index 0d35ed8..641068e 100644 --- a/crates/omnigraph-cli/Cargo.toml +++ b/crates/omnigraph-cli/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "omnigraph-cli" -version = "0.6.0" +version = "0.6.1" edition = "2024" description = "CLI for the Omnigraph graph database." 
license = "MIT" @@ -13,10 +13,10 @@ name = "omnigraph" path = "src/main.rs" [dependencies] -omnigraph = { package = "omnigraph-engine", path = "../omnigraph", version = "0.6.0" } -omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.6.0" } -omnigraph-policy = { path = "../omnigraph-policy", version = "0.6.0" } -omnigraph-server = { path = "../omnigraph-server", version = "0.6.0" } +omnigraph = { package = "omnigraph-engine", path = "../omnigraph", version = "0.6.1" } +omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.6.1" } +omnigraph-policy = { path = "../omnigraph-policy", version = "0.6.1" } +omnigraph-server = { path = "../omnigraph-server", version = "0.6.1" } clap = { workspace = true } color-eyre = { workspace = true } serde = { workspace = true } diff --git a/crates/omnigraph-cli/src/main.rs b/crates/omnigraph-cli/src/main.rs index b7e3041..29b55c4 100644 --- a/crates/omnigraph-cli/src/main.rs +++ b/crates/omnigraph-cli/src/main.rs @@ -9,6 +9,7 @@ use clap::{Arg, ArgAction, Args, CommandFactory, FromArgMatches, Parser, Subcomm use color_eyre::eyre::{Result, bail}; use omnigraph::db::{Omnigraph, ReadTarget, SnapshotId}; use omnigraph::loader::LoadMode; +use omnigraph::storage::normalize_root_uri; use omnigraph_compiler::query::parser::parse_query; use omnigraph_compiler::schema::parser::parse_schema; use omnigraph_compiler::{ @@ -24,9 +25,10 @@ use omnigraph_server::api::{ SnapshotTableOutput, commit_output, ingest_output, read_output, schema_apply_output, snapshot_payload, }; +use omnigraph_server::queries::{QueryRegistry, check, format_check_breakages}; use omnigraph_server::{ AliasCommand, OmnigraphConfig, PolicyAction, PolicyDecision, PolicyEngine, PolicyRequest, - PolicyTestConfig, ReadOutputFormat, load_config, + PolicyTestConfig, ReadOutputFormat, graph_resource_id_for_selection, load_config, }; use reqwest::Method; use reqwest::header::AUTHORIZATION; @@ -153,6 +155,11 @@ enum Command { #[arg(long)] json: bool, }, + 
/// Operate on the server-side stored-query registry (`queries:`). + Queries { + #[command(subcommand)] + command: QueriesCommand, + }, /// Show graph snapshot Snapshot { /// Graph URI @@ -502,6 +509,35 @@ enum PolicyCommand { }, } +#[derive(Debug, Subcommand)] +enum QueriesCommand { + /// Type-check the stored-query registry against the live schema. + /// + /// Distinct from `omnigraph lint` (which lints one `.gq` file): + /// this validates the whole `queries:` registry β€” opening the graph + /// to read its schema and confirming every stored query still + /// type-checks. Exits non-zero on any breakage. + Validate { + /// Graph URI + uri: Option, + #[arg(long)] + target: Option, + #[arg(long)] + config: Option, + #[arg(long)] + json: bool, + }, + /// List the registered stored queries (name, MCP exposure, params). + List { + #[arg(long)] + target: Option, + #[arg(long)] + config: Option, + #[arg(long)] + json: bool, + }, +} + #[derive(Debug, Args, Clone)] struct ParamsArgs { #[arg(long, conflicts_with = "params_file")] @@ -743,25 +779,66 @@ fn load_cli_config(config_path: Option<&PathBuf>) -> Result { Ok(config) } -fn resolve_policy_engine(config: &OmnigraphConfig) -> Result { - let policy_file = config - .resolve_policy_file() - .ok_or_else(|| color_eyre::eyre::eyre!("policy.file must be set in omnigraph.yaml"))?; - PolicyEngine::load_graph(&policy_file, &policy_graph_id(config)) +#[derive(Debug, Clone)] +struct ResolvedCliGraph { + uri: String, + selected: Option, + graph_id: String, + policy_file: Option, + is_remote: bool, } -/// Open a local-URI graph and, when `policy.file` is configured in -/// `omnigraph.yaml`, install the resolved `PolicyEngine` on the engine -/// handle so every direct-engine write goes through -/// `Omnigraph::enforce(...)` (MR-722). Without a configured policy this -/// is identical to a bare `Omnigraph::open`. 
-/// -/// Returns owned `Omnigraph`; chained on top of `Omnigraph::open(...)`'s -/// existing future to keep call sites narrow. -async fn open_local_db_with_policy(uri: &str, config: &OmnigraphConfig) -> Result { - let db = Omnigraph::open(uri).await?; - if config.resolve_policy_file().is_some() { - let engine = Arc::new(resolve_policy_engine(config)?); +impl ResolvedCliGraph { + fn selected(&self) -> Option<&str> { + self.selected.as_deref() + } +} + +struct ResolvedPolicyContext { + policy_file: PathBuf, + graph_id: String, +} + +fn resolve_policy_context(config: &OmnigraphConfig) -> Result { + let selected = config.resolve_policy_tooling_graph_selection()?; + let policy_file = config + .resolve_policy_file_for(selected) + .ok_or_else(|| { + color_eyre::eyre::eyre!( + "policy.file or graphs..policy.file must be set in omnigraph.yaml" + ) + })?; + let graph_id = match selected { + Some(name) => graph_resource_id_for_selection(Some(name), ""), + None => graph_resource_id_for_selection(None, "default"), + }; + Ok(ResolvedPolicyContext { + policy_file, + graph_id, + }) +} + +fn resolve_policy_engine(context: &ResolvedPolicyContext) -> Result { + PolicyEngine::load_graph(&context.policy_file, &context.graph_id) +} + +fn resolve_policy_engine_for_graph(graph: &ResolvedCliGraph) -> Result { + let policy_file = graph.policy_file.as_ref().ok_or_else(|| { + color_eyre::eyre::eyre!( + "policy.file or graphs..policy.file must be set in omnigraph.yaml" + ) + })?; + PolicyEngine::load_graph(policy_file, &graph.graph_id) +} + +/// Open a local graph and install the policy resolved for the same graph +/// identity that produced the URI. A named graph uses +/// `graphs..policy.file`; an explicit positional URI is anonymous and +/// uses the legacy top-level `policy.file`. 
+async fn open_local_db_with_policy(graph: &ResolvedCliGraph) -> Result { + let db = Omnigraph::open(&graph.uri).await?; + if graph.policy_file.is_some() { + let engine = Arc::new(resolve_policy_engine_for_graph(graph)?); Ok(db.with_policy(engine as Arc)) } else { Ok(db) @@ -778,22 +855,16 @@ fn resolve_cli_actor<'a>(cli_as: Option<&'a str>, config: &'a OmnigraphConfig) - cli_as.or(config.cli.actor.as_deref()) } -fn resolve_policy_tests_path(config: &OmnigraphConfig) -> Result { - config.resolve_policy_tests_file().ok_or_else(|| { - color_eyre::eyre::eyre!( - "policy.tests.yaml requires policy.file to be set in omnigraph.yaml" - ) - }) +fn resolve_policy_tests_path(context: &ResolvedPolicyContext) -> PathBuf { + context.policy_file.with_file_name("policy.tests.yaml") } -fn policy_graph_id(config: &OmnigraphConfig) -> String { - if let Some(name) = &config.project.name { - return name.clone(); +fn normalize_policy_graph_uri(uri: &str) -> Result { + if is_remote_uri(uri) { + Ok(uri.trim_end_matches('/').to_string()) + } else { + Ok(normalize_root_uri(uri)?) 
} - config - .resolve_target_uri(None, None, config.server_graph_name()) - .or_else(|_| config.resolve_target_uri(None, None, config.cli_graph_name())) - .unwrap_or_else(|_| "default".to_string()) } fn resolve_remote_bearer_token( @@ -877,6 +948,47 @@ fn resolve_uri( config.resolve_target_uri(cli_uri, cli_target, config.cli_graph_name()) } +fn resolve_cli_graph( + config: &OmnigraphConfig, + cli_uri: Option, + cli_target: Option<&str>, +) -> Result { + let selected = if cli_uri.is_some() { + None + } else { + cli_target + .map(str::to_string) + .or_else(|| config.cli_graph_name().map(str::to_string)) + }; + config.resolve_graph_selection(selected.as_deref())?; + let uri = resolve_uri(config, cli_uri, cli_target)?; + let normalized_uri = normalize_policy_graph_uri(&uri)?; + let graph_id = graph_resource_id_for_selection(selected.as_deref(), &normalized_uri); + Ok(ResolvedCliGraph { + graph_id, + is_remote: is_remote_uri(&uri), + policy_file: config.resolve_policy_file_for(selected.as_deref()), + selected, + uri, + }) +} + +fn resolve_local_graph( + config: &OmnigraphConfig, + cli_uri: Option, + cli_target: Option<&str>, + operation: &str, +) -> Result { + let graph = resolve_cli_graph(config, cli_uri, cli_target)?; + if graph.is_remote { + bail!( + "{} is only supported against local graph URIs in this milestone", + operation + ); + } + Ok(graph) +} + /// Parse a Go-style compact duration: `7d`, `24h`, `30m`, `90s`, or a plain /// integer as seconds. Used by the `cleanup --older-than` flag. 
fn parse_duration_arg(s: &str) -> Result { @@ -915,14 +1027,7 @@ fn resolve_local_uri( cli_target: Option<&str>, operation: &str, ) -> Result { - let uri = resolve_uri(config, cli_uri, cli_target)?; - if is_remote_uri(&uri) { - bail!( - "{} is only supported against local graph URIs in this milestone", - operation - ); - } - Ok(uri) + Ok(resolve_local_graph(config, cli_uri, cli_target, operation)?.uri) } fn resolve_branch( @@ -1609,6 +1714,248 @@ async fn execute_query_lint( )) } +#[derive(serde::Serialize)] +struct QueriesIssue { + query: String, + message: String, +} + +#[derive(serde::Serialize)] +struct QueriesValidateOutput { + ok: bool, + breakages: Vec, + warnings: Vec, +} + +#[derive(serde::Serialize)] +struct QueriesParam { + name: String, + #[serde(rename = "type")] + type_name: String, + nullable: bool, +} + +#[derive(serde::Serialize)] +struct QueriesListItem { + name: String, + mcp_expose: bool, + tool_name: Option, + mutation: bool, + params: Vec, +} + +#[derive(serde::Serialize)] +struct QueriesListOutput { + queries: Vec, +} + +/// Resolve the selected graph to `(local URI, registry selection)` from one +/// precedence, so a command's schema and its stored-query registry can never +/// come from different graphs. A **positional URI is anonymous** (top-level +/// registry, ignoring the configured default graph); otherwise `--target` +/// or the configured `cli.graph` names the graph (its per-graph block). +/// Mirrors the server's single-mode identity rule. +fn resolve_selected_graph( + config: &OmnigraphConfig, + cli_uri: Option, + cli_target: Option<&str>, + operation: &str, +) -> Result<(String, Option)> { + let graph = resolve_local_graph(config, cli_uri, cli_target, operation)?; + Ok((graph.uri, graph.selected)) +} + +/// Load the stored-query registry for an already-resolved graph selection +/// (`None` = anonymous β†’ top-level; `Some(name)` = that graph's block). 
+fn load_registry_or_report( + config: &OmnigraphConfig, + selected: Option<&str>, +) -> Result { + QueryRegistry::load(config, config.query_entries_for(selected)).map_err(|errors| { + color_eyre::eyre::eyre!( + "stored-query registry failed to load:\n {}", + errors + .iter() + .map(|e| e.to_string()) + .collect::>() + .join("\n ") + ) + }) +} + +fn graph_query_registry_names(config: &OmnigraphConfig) -> Vec<&str> { + config + .graphs + .iter() + .filter_map(|(name, graph)| (!graph.queries.is_empty()).then_some(name.as_str())) + .collect() +} + +fn resolve_registry_selection_for_list( + config: &OmnigraphConfig, + target: Option<&str>, +) -> Result> { + let selected = target + .map(str::to_string) + .or_else(|| config.cli_graph_name().map(str::to_string)); + if let Some(name) = selected.as_deref() { + config.resolve_graph_selection(Some(name))?; + return Ok(selected); + } + + if !config.query_entries().is_empty() { + return Ok(None); + } + + let graph_names = graph_query_registry_names(config); + if graph_names.is_empty() { + return Ok(None); + } + + bail!( + "stored-query registries are configured for graph{} {} but no graph was selected. Pass `--target {}` or set `cli.graph`.", + if graph_names.len() == 1 { "" } else { "s" }, + graph_names.join(", "), + graph_names[0], + ) +} + +fn validate_registry_for_catalog( + registry: &QueryRegistry, + catalog: &omnigraph_compiler::catalog::Catalog, + label: &str, +) -> omnigraph::error::Result<()> { + let report = check(registry, catalog); + if report.has_breakages() { + return Err(omnigraph::error::OmniError::manifest( + format_check_breakages(label, &report), + )); + } + Ok(()) +} + +async fn execute_queries_validate( + uri: Option, + target: Option, + config_path: Option<&PathBuf>, + json: bool, +) -> Result<()> { + let config = load_cli_config(config_path)?; + // One selection drives both the schema URI and the registry, so a + // positional URI and a `--target` can't validate different graphs. 
+ let (uri, selected) = + resolve_selected_graph(&config, uri, target.as_deref(), "queries validate")?; + let registry = load_registry_or_report(&config, selected.as_deref())?; + let db = Omnigraph::open(&uri).await?; + let report = check(®istry, &db.catalog()); + + let output = QueriesValidateOutput { + ok: !report.has_breakages(), + breakages: report + .breakages + .iter() + .map(|b| QueriesIssue { + query: b.query.clone(), + message: b.message.clone(), + }) + .collect(), + warnings: report + .warnings + .iter() + .map(|w| QueriesIssue { + query: w.query.clone(), + message: w.message.clone(), + }) + .collect(), + }; + + if json { + print_json(&output)?; + } else { + if output.breakages.is_empty() { + println!( + "OK {} stored quer{} type-check against the schema", + registry.len(), + if registry.len() == 1 { "y" } else { "ies" } + ); + } + for issue in &output.breakages { + println!("ERROR query '{}': {}", issue.query, issue.message); + } + for issue in &output.warnings { + println!("WARN query '{}': {}", issue.query, issue.message); + } + } + + if report.has_breakages() { + io::stdout().flush()?; + std::process::exit(1); + } + Ok(()) +} + +fn execute_queries_list( + target: Option, + config_path: Option<&PathBuf>, + json: bool, +) -> Result<()> { + let config = load_cli_config(config_path)?; + let selected = resolve_registry_selection_for_list(&config, target.as_deref())?; + let registry = load_registry_or_report(&config, selected.as_deref())?; + + let output = QueriesListOutput { + queries: registry + .iter() + .map(|q| QueriesListItem { + name: q.name.clone(), + mcp_expose: q.expose, + tool_name: q.tool_name.clone(), + mutation: q.is_mutation(), + params: q + .decl + .params + .iter() + .map(|p| QueriesParam { + name: p.name.clone(), + type_name: p.type_name.clone(), + nullable: p.nullable, + }) + .collect(), + }) + .collect(), + }; + + if json { + print_json(&output)?; + } else if output.queries.is_empty() { + println!("(no stored queries registered)"); + } 
else { + for q in &output.queries { + let kind = if q.mutation { "mutation" } else { "read" }; + let params = q + .params + .iter() + .map(|p| { + format!( + "${}: {}{}", + p.name, + p.type_name, + if p.nullable { "?" } else { "" } + ) + }) + .collect::>() + .join(", "); + let mcp = if q.mcp_expose { + format!(" [mcp: {}]", q.tool_name.as_deref().unwrap_or(&q.name)) + } else { + String::new() + }; + println!("{kind} {}({params}){mcp}", q.name); + } + } + Ok(()) +} + async fn execute_read( uri: &str, query_source: &str, @@ -1655,7 +2002,7 @@ async fn execute_read_remote( } async fn execute_change( - uri: &str, + graph: &ResolvedCliGraph, query_source: &str, query_name: Option<&str>, branch: &str, @@ -1665,7 +2012,7 @@ async fn execute_change( ) -> Result { let (selected_name, query_params) = select_named_query(query_source, query_name)?; let params = query_params_from_json(&query_params, params_json)?; - let db = open_local_db_with_policy(uri, config).await?; + let db = open_local_db_with_policy(graph).await?; let actor = resolve_cli_actor(cli_as_actor, config); let result = db .mutate_as(branch, query_source, &selected_name, ¶ms, actor) @@ -1893,9 +2240,10 @@ async fn main() -> Result<()> { json, } => { let config = load_cli_config(config.as_ref())?; - let uri = resolve_local_uri(&config, uri, target.as_deref(), "load")?; + let graph = resolve_local_graph(&config, uri, target.as_deref(), "load")?; + let uri = graph.uri.clone(); let branch = resolve_branch(&config, branch, None, "main"); - let db = open_local_db_with_policy(&uri, &config).await?; + let db = open_local_db_with_policy(&graph).await?; let actor = resolve_cli_actor(cli.as_actor.as_deref(), &config); let result = db .load_file_as(&branch, &data.to_string_lossy(), mode.into(), actor) @@ -1936,10 +2284,11 @@ async fn main() -> Result<()> { let config = load_cli_config(config.as_ref())?; let bearer_token = resolve_remote_bearer_token(&config, uri.as_deref(), target.as_deref())?; - let uri = 
resolve_uri(&config, uri, target.as_deref())?; + let graph = resolve_cli_graph(&config, uri, target.as_deref())?; + let uri = graph.uri.clone(); let branch = resolve_branch(&config, branch, None, "main"); let from = resolve_branch(&config, from, None, "main"); - let payload = if is_remote_uri(&uri) { + let payload = if graph.is_remote { let data = fs::read_to_string(&data)?; remote_json::( &http_client, @@ -1955,7 +2304,7 @@ async fn main() -> Result<()> { ) .await? } else { - let db = open_local_db_with_policy(&uri, &config).await?; + let db = open_local_db_with_policy(&graph).await?; let actor = resolve_cli_actor(cli.as_actor.as_deref(), &config); let result = db .ingest_file_as( @@ -1986,9 +2335,10 @@ async fn main() -> Result<()> { let config = load_cli_config(config.as_ref())?; let bearer_token = resolve_remote_bearer_token(&config, uri.as_deref(), target.as_deref())?; - let uri = resolve_uri(&config, uri, target.as_deref())?; + let graph = resolve_cli_graph(&config, uri, target.as_deref())?; + let uri = graph.uri.clone(); let from = resolve_branch(&config, from, None, "main"); - let payload = if is_remote_uri(&uri) { + let payload = if graph.is_remote { remote_json::( &http_client, Method::POST, @@ -2001,7 +2351,7 @@ async fn main() -> Result<()> { ) .await? 
} else { - let db = open_local_db_with_policy(&uri, &config).await?; + let db = open_local_db_with_policy(&graph).await?; let actor = resolve_cli_actor(cli.as_actor.as_deref(), &config); db.branch_create_from_as(ReadTarget::branch(&from), &name, actor) .await?; @@ -2027,8 +2377,9 @@ async fn main() -> Result<()> { let config = load_cli_config(config.as_ref())?; let bearer_token = resolve_remote_bearer_token(&config, uri.as_deref(), target.as_deref())?; - let uri = resolve_uri(&config, uri, target.as_deref())?; - let payload = if is_remote_uri(&uri) { + let graph = resolve_cli_graph(&config, uri, target.as_deref())?; + let uri = graph.uri.clone(); + let payload = if graph.is_remote { remote_json::( &http_client, Method::GET, @@ -2061,8 +2412,9 @@ async fn main() -> Result<()> { let config = load_cli_config(config.as_ref())?; let bearer_token = resolve_remote_bearer_token(&config, uri.as_deref(), target.as_deref())?; - let uri = resolve_uri(&config, uri, target.as_deref())?; - let payload = if is_remote_uri(&uri) { + let graph = resolve_cli_graph(&config, uri, target.as_deref())?; + let uri = graph.uri.clone(); + let payload = if graph.is_remote { remote_json::( &http_client, Method::DELETE, @@ -2072,7 +2424,7 @@ async fn main() -> Result<()> { ) .await? 
} else { - let db = open_local_db_with_policy(&uri, &config).await?; + let db = open_local_db_with_policy(&graph).await?; let actor = resolve_cli_actor(cli.as_actor.as_deref(), &config); db.branch_delete_as(&name, actor).await?; BranchDeleteOutput { @@ -2098,9 +2450,10 @@ async fn main() -> Result<()> { let config = load_cli_config(config.as_ref())?; let bearer_token = resolve_remote_bearer_token(&config, uri.as_deref(), target.as_deref())?; - let uri = resolve_uri(&config, uri, target.as_deref())?; + let graph = resolve_cli_graph(&config, uri, target.as_deref())?; + let uri = graph.uri.clone(); let into = resolve_branch(&config, into, None, "main"); - let payload = if is_remote_uri(&uri) { + let payload = if graph.is_remote { remote_json::( &http_client, Method::POST, @@ -2113,7 +2466,7 @@ async fn main() -> Result<()> { ) .await? } else { - let db = open_local_db_with_policy(&uri, &config).await?; + let db = open_local_db_with_policy(&graph).await?; let actor = resolve_cli_actor(cli.as_actor.as_deref(), &config); let outcome = db.branch_merge_as(&source, &into, actor).await?; BranchMergeOutput { @@ -2248,9 +2601,10 @@ async fn main() -> Result<()> { let config = load_cli_config(config.as_ref())?; let bearer_token = resolve_remote_bearer_token(&config, uri.as_deref(), target.as_deref())?; - let uri = resolve_uri(&config, uri, target.as_deref())?; + let graph = resolve_cli_graph(&config, uri, target.as_deref())?; + let uri = graph.uri.clone(); let schema_source = fs::read_to_string(&schema)?; - let output = if is_remote_uri(&uri) { + let output = if graph.is_remote { // MR-694 PR B: SchemaApplyRequest gained an // allow_data_loss field so Hard-mode drops are no // longer CLI-only. The previous bail is gone; the @@ -2268,13 +2622,22 @@ async fn main() -> Result<()> { ) .await? 
} else { - let db = open_local_db_with_policy(&uri, &config).await?; + let db = open_local_db_with_policy(&graph).await?; let actor = resolve_cli_actor(cli.as_actor.as_deref(), &config); + let registry = load_registry_or_report(&config, graph.selected())?; + let registry = (!registry.is_empty()).then_some(registry); + let label = graph.selected().unwrap_or(&uri).to_string(); let result = db - .apply_schema_as( + .apply_schema_as_with_catalog_check( &schema_source, omnigraph::db::SchemaApplyOptions { allow_data_loss }, actor, + |catalog| { + if let Some(registry) = registry.as_ref() { + validate_registry_for_catalog(registry, catalog, &label)?; + } + Ok(()) + }, ) .await?; schema_apply_output(&uri, result) @@ -2331,6 +2694,23 @@ async fn main() -> Result<()> { .await?; finish_query_lint(&output, json)?; } + Command::Queries { command } => match command { + QueriesCommand::Validate { + uri, + target, + config, + json, + } => { + execute_queries_validate(uri, target, config.as_ref(), json).await?; + } + QueriesCommand::List { + target, + config, + json, + } => { + execute_queries_list(target, config.as_ref(), json)?; + } + }, Command::Snapshot { uri, target, @@ -2436,7 +2816,8 @@ async fn main() -> Result<()> { .as_deref() .or_else(|| alias_config.and_then(|alias| alias.graph.as_deref())); let bearer_token = resolve_remote_bearer_token(&config, uri.as_deref(), target_name)?; - let uri = resolve_uri(&config, uri, target_name)?; + let graph = resolve_cli_graph(&config, uri, target_name)?; + let uri = graph.uri.clone(); let query_source = resolve_query_source( &config, query.as_ref(), @@ -2458,7 +2839,7 @@ async fn main() -> Result<()> { alias_config.and_then(|alias| alias.branch.clone()), )?; let query_name = name.or_else(|| alias_config.and_then(|alias| alias.name.clone())); - let output = if is_remote_uri(&uri) { + let output = if graph.is_remote { execute_read_remote( &http_client, &uri, @@ -2521,7 +2902,8 @@ async fn main() -> Result<()> { .as_deref() .or_else(|| 
alias_config.and_then(|alias| alias.graph.as_deref())); let bearer_token = resolve_remote_bearer_token(&config, uri.as_deref(), target_name)?; - let uri = resolve_uri(&config, uri, target_name)?; + let graph = resolve_cli_graph(&config, uri, target_name)?; + let uri = graph.uri.clone(); let query_source = resolve_query_source( &config, query.as_ref(), @@ -2543,7 +2925,7 @@ async fn main() -> Result<()> { "main", ); let query_name = name.or_else(|| alias_config.and_then(|alias| alias.name.clone())); - let output = if is_remote_uri(&uri) { + let output = if graph.is_remote { execute_change_remote( &http_client, &uri, @@ -2556,7 +2938,7 @@ async fn main() -> Result<()> { .await? } else { execute_change( - &uri, + &graph, &query_source, query_name.as_deref(), &branch, @@ -2575,20 +2957,19 @@ async fn main() -> Result<()> { Command::Policy { command } => match command { PolicyCommand::Validate { config } => { let config = load_cli_config(config.as_ref())?; - let engine = resolve_policy_engine(&config)?; - let policy_file = config - .resolve_policy_file() - .expect("policy file should exist after resolve_policy_engine"); + let context = resolve_policy_context(&config)?; + let engine = resolve_policy_engine(&context)?; println!( "policy valid: {} [{} actors]", - policy_file.display(), + context.policy_file.display(), engine.known_actor_count() ); } PolicyCommand::Test { config } => { let config = load_cli_config(config.as_ref())?; - let engine = resolve_policy_engine(&config)?; - let tests_path = resolve_policy_tests_path(&config)?; + let context = resolve_policy_context(&config)?; + let engine = resolve_policy_engine(&context)?; + let tests_path = resolve_policy_tests_path(&context); let tests = PolicyTestConfig::load(&tests_path)?; engine.run_tests(&tests)?; println!("policy tests passed: {} cases", tests.cases.len()); @@ -2601,7 +2982,8 @@ async fn main() -> Result<()> { target_branch, } => { let config = load_cli_config(config.as_ref())?; - let engine = 
resolve_policy_engine(&config)?; + let context = resolve_policy_context(&config)?; + let engine = resolve_policy_engine(&context)?; let request = PolicyRequest { action, branch, @@ -2629,18 +3011,19 @@ async fn main() -> Result<()> { "fragments_removed": s.fragments_removed, "fragments_added": s.fragments_added, "committed": s.committed, + "skipped": s.skipped.map(|r| r.as_str()), })).collect::>(), }); print_json(&value)?; } else { println!("optimize {} — {} tables", uri, stats.len()); for s in &stats { - if s.committed { + if let Some(reason) = s.skipped { + println!(" {:<40} skipped ({reason})", s.table_key); + } else if s.committed { println!( " {:<40} frags {} → {} ✓", - s.table_key, - s.fragments_removed + s.fragments_added - s.fragments_added, - s.fragments_added + s.table_key, s.fragments_removed, s.fragments_added ); } else { println!(" {:<40} no-op", s.table_key); @@ -2699,20 +3082,33 @@ async fn main() -> Result<()> { "table_key": s.table_key, "bytes_removed": s.bytes_removed, "old_versions_removed": s.old_versions_removed, + "error": s.error, })).collect::>(), }); print_json(&value)?; } else { let total_bytes: u64 = stats.iter().map(|s| s.bytes_removed).sum(); let total_versions: u64 = stats.iter().map(|s| s.old_versions_removed).sum(); + let failed: Vec<&str> = stats + .iter() + .filter(|s| s.error.is_some()) + .map(|s| s.table_key.as_str()) + .collect(); println!( "cleanup {} ({}) — removed {} versions ({} bytes) across {} tables", uri, policy_desc, total_versions, total_bytes, - stats.len() + stats.len() - failed.len() ); + if !failed.is_empty() { + println!( + " {} table(s) failed and will be retried on the next cleanup: {}", + failed.len(), + failed.join(", ") + ); + } } } Command::Graphs { command } => match command { @@ -2761,7 +3157,8 @@ mod tests { use super::{ DEFAULT_BEARER_TOKEN_ENV, apply_bearer_token, bearer_token_from_env_file, legacy_change_request_body, load_cli_config, load_env_file_into_process, - normalize_bearer_token,
parse_env_assignment, resolve_remote_bearer_token, + normalize_bearer_token, parse_env_assignment, resolve_policy_context, + resolve_cli_graph, resolve_remote_bearer_token, }; use omnigraph_server::load_config; use reqwest::header::AUTHORIZATION; @@ -3021,4 +3418,150 @@ graphs: } } } + + #[test] + fn graph_identity_resolve_policy_context_named_cli_graph_uses_graph_key_not_project_name_or_uri() { + let temp = tempdir().unwrap(); + let config_path = temp.path().join("omnigraph.yaml"); + fs::write( + &config_path, + r#" +project: + name: misleading-project +graphs: + local: + uri: /tmp/local-policy-graph.omni + policy: + file: ./policy.yaml +cli: + graph: local +"#, + ) + .unwrap(); + + let config = load_config(Some(&config_path)).unwrap(); + let context = resolve_policy_context(&config).unwrap(); + assert_eq!(context.graph_id, "local"); + } + + #[test] + fn graph_identity_resolve_policy_context_server_graph_uses_graph_key_when_cli_graph_absent() { + let temp = tempdir().unwrap(); + let config_path = temp.path().join("omnigraph.yaml"); + fs::write( + &config_path, + r#" +project: + name: misleading-project +graphs: + local: + uri: /tmp/local-policy-graph.omni + policy: + file: ./server-policy.yaml +server: + graph: local +"#, + ) + .unwrap(); + + let config = load_config(Some(&config_path)).unwrap(); + let context = resolve_policy_context(&config).unwrap(); + assert_eq!(context.graph_id, "local"); + assert!(context.policy_file.ends_with("server-policy.yaml")); + } + + #[test] + fn graph_identity_resolve_policy_context_anonymous_uses_top_level_default_identity() { + let temp = tempdir().unwrap(); + let config_path = temp.path().join("omnigraph.yaml"); + fs::write( + &config_path, + r#" +project: + name: misleading-project +graphs: + local: + uri: /tmp/local-policy-graph.omni +policy: + file: ./top-policy.yaml +"#, + ) + .unwrap(); + + let config = load_config(Some(&config_path)).unwrap(); + let context = resolve_policy_context(&config).unwrap(); + 
assert_eq!(context.graph_id, "default"); + assert!(context.policy_file.ends_with("top-policy.yaml")); + } + + #[test] + fn graph_identity_resolve_cli_graph_named_target_uses_graph_key_not_project_name_or_uri() { + let temp = tempdir().unwrap(); + let config_path = temp.path().join("omnigraph.yaml"); + fs::write( + &config_path, + r#" +project: + name: misleading-project +graphs: + prod: + uri: s3://bucket/prod-graph/ + policy: + file: ./prod-policy.yaml +"#, + ) + .unwrap(); + + let config = load_config(Some(&config_path)).unwrap(); + let graph = resolve_cli_graph(&config, None, Some("prod")).unwrap(); + assert_eq!(graph.selected(), Some("prod")); + assert_eq!(graph.graph_id, "prod"); + assert_eq!(graph.uri, "s3://bucket/prod-graph/"); + } + + #[test] + fn graph_identity_resolve_cli_graph_positional_uri_uses_anonymous_normalized_uri() { + let temp = tempdir().unwrap(); + let config_path = temp.path().join("omnigraph.yaml"); + fs::write( + &config_path, + r#" +project: + name: misleading-project +graphs: + local: + uri: /tmp/configured-graph.omni + policy: + file: ./policy.yaml +cli: + graph: local +"#, + ) + .unwrap(); + + let config = load_config(Some(&config_path)).unwrap(); + let local_graph_path = temp.path().join("explicit-graph.omni"); + let local_graph = resolve_cli_graph( + &config, + Some(format!("file://{}", local_graph_path.display())), + None, + ) + .unwrap(); + assert_eq!(local_graph.selected(), None); + assert_eq!( + local_graph.graph_id, + local_graph_path.to_string_lossy().as_ref() + ); + assert_eq!(local_graph.policy_file, None); + + let s3_graph = resolve_cli_graph( + &config, + Some("s3://bucket/anonymous-graph/".to_string()), + None, + ) + .unwrap(); + assert_eq!(s3_graph.selected(), None); + assert_eq!(s3_graph.graph_id, "s3://bucket/anonymous-graph"); + assert_eq!(s3_graph.policy_file, None); + } } diff --git a/crates/omnigraph-cli/tests/cli.rs b/crates/omnigraph-cli/tests/cli.rs index 6e5de37..9682d9a 100644 --- 
a/crates/omnigraph-cli/tests/cli.rs +++ b/crates/omnigraph-cli/tests/cli.rs @@ -2376,3 +2376,295 @@ fn graphs_list_against_local_uri_errors_with_remote_only_message() { "expected 'remote multi-graph server URL' rejection in stderr; got:\n{stderr}" ); } + +fn queries_test_config(graph_uri: &str, entry: &str, gq_file: &str) -> String { + format!( + "graphs:\n local:\n uri: '{}'\n queries:\n {entry}:\n file: ./{gq_file}\n\ + cli:\n graph: local\npolicy: {{}}\n", + graph_uri.replace('\'', "''") + ) +} + +#[test] +fn queries_validate_exits_zero_on_clean_registry() { + let graph = SystemGraph::loaded(); + graph.write_query( + "find_person.gq", + "query find_person($name: String) { match { $p: Person { name: $name } } return { $p.age } }", + ); + let config = graph.write_config( + "omnigraph.yaml", + &queries_test_config(&graph.path().to_string_lossy(), "find_person", "find_person.gq"), + ); + let output = output_success(cli().arg("queries").arg("validate").arg("--config").arg(&config)); + let stdout = stdout_string(&output); + assert!(stdout.contains("OK"), "stdout:\n{stdout}"); +} + +#[test] +fn queries_validate_exits_nonzero_on_type_broken_query() { + let graph = SystemGraph::loaded(); + // `Widget` is not in the fixture schema. 
+ graph.write_query("ghost.gq", "query ghost() { match { $w: Widget } return { $w.name } }"); + let config = graph.write_config( + "omnigraph.yaml", + &queries_test_config(&graph.path().to_string_lossy(), "ghost", "ghost.gq"), + ); + let output = output_failure(cli().arg("queries").arg("validate").arg("--config").arg(&config)); + let stdout = stdout_string(&output); + assert!( + stdout.contains("ghost"), + "validation should name the broken query; stdout:\n{stdout}" + ); +} + +#[test] +fn queries_list_prints_registered_query() { + let graph = SystemGraph::loaded(); + graph.write_query( + "find_person.gq", + "query find_person($name: String) { match { $p: Person { name: $name } } return { $p.age } }", + ); + // Exposed with an explicit tool name so the list shows the MCP suffix. + let config = graph.write_config( + "omnigraph.yaml", + &format!( + concat!( + "graphs:\n", + " local:\n", + " uri: '{}'\n", + " queries:\n", + " find_person:\n", + " file: ./find_person.gq\n", + " mcp: {{ expose: true, tool_name: lookup_person }}\n", + "cli:\n", + " graph: local\n", + "policy: {{}}\n", + ), + graph.path().to_string_lossy().replace('\'', "''") + ), + ); + let output = output_success(cli().arg("queries").arg("list").arg("--config").arg(&config)); + let stdout = stdout_string(&output); + assert!(stdout.contains("find_person"), "stdout:\n{stdout}"); + assert!( + stdout.contains("$name: String"), + "list should show typed params; stdout:\n{stdout}" + ); + assert!( + stdout.contains("[mcp: lookup_person]"), + "list should show the MCP tool name for exposed queries; stdout:\n{stdout}" + ); +} + +#[test] +fn queries_list_requires_graph_selection_for_per_graph_only_registries() { + let graph = SystemGraph::loaded(); + graph.write_query( + "find_person.gq", + "query find_person($name: String) { match { $p: Person { name: $name } } return { $p.age } }", + ); + let config = graph.write_config( + "omnigraph.yaml", + &format!( + concat!( + "graphs:\n", + " local:\n", + " uri: '{}'\n", + 
" queries:\n", + " find_person:\n", + " file: ./find_person.gq\n", + "policy: {{}}\n", + ), + graph.path().to_string_lossy().replace('\'', "''") + ), + ); + + let output = output_failure(cli().arg("queries").arg("list").arg("--config").arg(&config)); + let stderr = String::from_utf8_lossy(&output.stderr); + assert!( + stderr.contains("local") && stderr.contains("--target local"), + "error must name the graph and give a concrete selection hint; stderr:\n{stderr}" + ); +} + +#[test] +fn queries_list_without_graph_selection_lists_top_level_registry() { + let graph = SystemGraph::loaded(); + graph.write_query( + "top_find.gq", + "query top_find($name: String) { match { $p: Person { name: $name } } return { $p.age } }", + ); + let config = graph.write_config( + "omnigraph.yaml", + concat!( + "queries:\n", + " top_find:\n", + " file: ./top_find.gq\n", + "policy: {}\n", + ), + ); + + let output = output_success(cli().arg("queries").arg("list").arg("--config").arg(&config)); + let stdout = stdout_string(&output); + assert!(stdout.contains("top_find"), "stdout:\n{stdout}"); +} + +#[test] +fn queries_list_unknown_target_errors() { + // `queries list` opens no graph URI, so unknown-graph validation can't ride + // along on URI resolution the way it does for every other command. An + // unknown `--target` must still error (naming the graph) instead of + // silently falling back to the top-level registry and showing the wrong + // (or empty) catalog. 
+ let graph = SystemGraph::loaded(); + graph.write_query( + "find_person.gq", + "query find_person($name: String) { match { $p: Person { name: $name } } return { $p.age } }", + ); + let config = graph.write_config( + "omnigraph.yaml", + &queries_test_config(&graph.path().to_string_lossy(), "find_person", "find_person.gq"), + ); + let output = output_failure( + cli() + .arg("queries") + .arg("list") + .arg("--target") + .arg("nonexistent") + .arg("--config") + .arg(&config), + ); + let stderr = String::from_utf8_lossy(&output.stderr); + assert!( + stderr.contains("nonexistent"), + "error must name the unknown graph; stderr:\n{stderr}" + ); +} + +#[test] +fn queries_commands_reject_named_graph_with_populated_top_level_block() { + // A named graph (here via `cli.graph`) uses its own `graphs.` block, + // so a populated top-level `queries:` block would be silently ignored — a + // config the server REFUSES to boot. `queries validate`/`list` must reject + // it too (matching boot) instead of validating/listing the per-graph block + // and giving a false green. + let graph = SystemGraph::loaded(); + graph.write_query( + "find_person.gq", + "query find_person($name: String) { match { $p: Person { name: $name } } return { $p.age } }", + ); + let config = graph.write_config( + "omnigraph.yaml", + &format!( + concat!( + "graphs:\n", + " local:\n", + " uri: '{}'\n", + " queries:\n", + " find_person:\n", + " file: ./find_person.gq\n", + "cli:\n", + " graph: local\n", + "queries:\n", // populated top-level block: the coherence violation + " legacy:\n", + " file: ./legacy.gq\n", + "policy: {{}}\n", + ), + graph.path().to_string_lossy().replace('\'', "''") + ), + ); + // Both resolve `local` from cli.graph (no positional URI), so both must + // error and name the graph + the ignored block — like server boot does.
+ for sub in ["validate", "list"] { + let output = output_failure(cli().arg("queries").arg(sub).arg("--config").arg(&config)); + let stderr = String::from_utf8_lossy(&output.stderr); + assert!( + stderr.contains("local") && stderr.contains("queries"), + "`queries {sub}` must reject a named graph with a populated top-level block; stderr:\n{stderr}" + ); + } +} + +#[test] +fn queries_validate_exits_nonzero_on_duplicate_tool_name() { + // Two exposed queries claiming one MCP tool name is a load-time + // collision — `queries validate` must fail (offline, before the engine + // opens) and name both queries plus the contested tool. + let graph = SystemGraph::loaded(); + graph.write_query("a.gq", "query a() { match { $p: Person } return { $p.name } }"); + graph.write_query("b.gq", "query b() { match { $p: Person } return { $p.name } }"); + let config = graph.write_config( + "omnigraph.yaml", + &format!( + concat!( + "graphs:\n", + " local:\n", + " uri: '{}'\n", + " queries:\n", + " a:\n", + " file: ./a.gq\n", + " mcp: {{ expose: true, tool_name: dup }}\n", + " b:\n", + " file: ./b.gq\n", + " mcp: {{ expose: true, tool_name: dup }}\n", + "cli:\n", + " graph: local\n", + "policy: {{}}\n", + ), + graph.path().to_string_lossy().replace('\'', "''") + ), + ); + let output = output_failure(cli().arg("queries").arg("validate").arg("--config").arg(&config)); + let stderr = String::from_utf8_lossy(&output.stderr); + assert!( + stderr.contains("dup") && stderr.contains("'a'") && stderr.contains("'b'"), + "duplicate tool name should be reported naming both queries; stderr:\n{stderr}" + ); +} + +#[test] +fn queries_validate_positional_uri_ignores_default_graph() { + // A positional URI is anonymous → the schema AND the registry both come + // from top-level, even when `cli.graph` names a graph whose per-graph + // queries would fail. Pins that the URI and registry can't diverge.
+ let graph = SystemGraph::loaded(); + graph.write_query( + "clean.gq", + "query clean($name: String) { match { $p: Person { name: $name } } return { $p.age } }", + ); + // `Widget` is not in the fixture schema — the default graph's per-graph + // query would break validate if it were (wrongly) selected. + graph.write_query("broken.gq", "query broken() { match { $w: Widget } return { $w.name } }"); + let config = graph.write_config( + "omnigraph.yaml", + concat!( + "cli:\n graph: prod\n", + "graphs:\n", + " prod:\n", + " uri: /nonexistent-prod.omni\n", + " queries:\n", + " broken:\n", + " file: ./broken.gq\n", + "queries:\n", + " clean:\n", + " file: ./clean.gq\n", + "policy: {}\n", + ), + ); + // Positional URI = the real loaded graph; selection is anonymous, so the + // CLEAN top-level registry validates (not prod's broken one). + let output = output_success( + cli() + .arg("queries") + .arg("validate") + .arg(graph.path()) + .arg("--config") + .arg(&config), + ); + let stdout = stdout_string(&output); + assert!( + stdout.contains("OK"), + "positional URI must validate the top-level registry, not the cli.graph default; stdout:\n{stdout}" + ); +} diff --git a/crates/omnigraph-cli/tests/system_local.rs b/crates/omnigraph-cli/tests/system_local.rs index 074b203..4fc3e9a 100644 --- a/crates/omnigraph-cli/tests/system_local.rs +++ b/crates/omnigraph-cli/tests/system_local.rs @@ -74,14 +74,36 @@ project: graphs: local: uri: {} + policy: + file: ./policy.yaml cli: graph: local branch: main query: roots: - . -policy: - file: ./policy.yaml +", + yaml_string(&graph.path().to_string_lossy()) + ) +} + +fn local_policy_server_graph_config(graph: &SystemGraph) -> String { + format!( + "\ +project: + name: policy-e2e-local +graphs: + local: + uri: {} + policy: + file: ./policy.yaml +server: + graph: local +cli: + branch: main +query: + roots: + - .
", yaml_string(&graph.path().to_string_lossy()) ) @@ -991,7 +1013,7 @@ query vector_search($q: String) { // The publisher CAS conflict shape is verified end-to-end at the engine // level in -// `crates/omnigraph/tests/runs.rs::concurrent_writers_one_succeeds_one_gets_expected_version_mismatch` +// `crates/omnigraph/tests/writes.rs::concurrent_writers_one_succeeds_one_gets_expected_version_mismatch` // and at the HTTP boundary in // `crates/omnigraph-server/tests/server.rs::change_conflict_returns_manifest_conflict_409`. // A CLI-level race would be timing-dependent; with direct-publish the @@ -1000,49 +1022,55 @@ query vector_search($q: String) { #[test] fn local_cli_policy_tooling_is_end_to_end() { // Sanity check for the read-only policy CLI surfaces. These don't - // mutate the graph β€” they just parse and evaluate the policy file β€” - // so they don't depend on PR #4's engine-side enforcement. + // mutate the graph; they parse and evaluate the effective policy for + // named graph selections, including per-graph policy files. 
let graph = SystemGraph::loaded(); let config = graph.write_config("omnigraph-policy.yaml", &local_policy_config(&graph)); + let server_graph_config = graph.write_config( + "omnigraph-policy-server.yaml", + &local_policy_server_graph_config(&graph), + ); graph.write_config("policy.yaml", POLICY_E2E_YAML); graph.write_config("policy.tests.yaml", POLICY_E2E_TESTS_YAML); - let validate = output_success( - cli() - .arg("policy") - .arg("validate") - .arg("--config") - .arg(&config), - ); - assert!(stdout_string(&validate).contains("policy valid:")); + for config in [&config, &server_graph_config] { + let validate = output_success( + cli() + .arg("policy") + .arg("validate") + .arg("--config") + .arg(config), + ); + assert!(stdout_string(&validate).contains("policy valid:")); - let tests = output_success(cli().arg("policy").arg("test").arg("--config").arg(&config)); - assert!(stdout_string(&tests).contains("policy tests passed: 2 cases")); + let tests = output_success(cli().arg("policy").arg("test").arg("--config").arg(config)); + assert!(stdout_string(&tests).contains("policy tests passed: 2 cases")); - let explain = output_success( - cli() - .arg("policy") - .arg("explain") - .arg("--config") - .arg(&config) - .arg("--actor") - .arg("act-bruno") - .arg("--action") - .arg("change") - .arg("--branch") - .arg("main"), - ); - let explain_stdout = stdout_string(&explain); - assert!(explain_stdout.contains("decision: deny")); - assert!(explain_stdout.contains("branch: main")); + let explain = output_success( + cli() + .arg("policy") + .arg("explain") + .arg("--config") + .arg(config) + .arg("--actor") + .arg("act-bruno") + .arg("--action") + .arg("change") + .arg("--branch") + .arg("main"), + ); + let explain_stdout = stdout_string(&explain); + assert!(explain_stdout.contains("decision: deny")); + assert!(explain_stdout.contains("branch: main")); + } } #[test] fn local_cli_change_enforces_engine_layer_policy() { - // Asserts MR-722 PR #4: when `policy.file` is configured in 
- // `omnigraph.yaml`, the CLI loads PolicyEngine into Omnigraph and - // every direct-engine write hits `enforce(action, scope, actor)` — - // identical to what the HTTP server gets, regardless of transport. + // Asserts MR-722 PR #4: when the selected graph has a configured + // policy file, the CLI loads PolicyEngine into Omnigraph and every + // direct-engine write hits `enforce(action, scope, actor)` — identical + // to what the HTTP server gets, regardless of transport. // // Three cases, each discriminating: // @@ -1135,6 +1163,32 @@ fn local_cli_change_enforces_engine_layer_policy() { assert_eq!(verify["rows"][0]["p.name"], "RagnorOnMain"); } +#[test] +fn local_cli_positional_uri_does_not_inherit_default_graph_policy() { + let graph = SystemGraph::loaded(); + let config = graph.write_config("omnigraph-policy.yaml", &local_policy_config(&graph)); + graph.write_config("policy.yaml", POLICY_E2E_YAML); + let mutation_file = insert_person_query(&graph, "system-local-policy-positional.gq"); + + let allowed = parse_stdout_json(&output_success( + cli() + .arg("--as") + .arg("act-bruno") + .arg("change") + .arg("--config") + .arg(&config) + .arg("--uri") + .arg(graph.path()) + .arg("--query") + .arg(&mutation_file) + .arg("--params") + .arg(r#"{"name":"PositionalUriBruno","age":4}"#) + .arg("--json"), + )); + assert_eq!(allowed["affected_nodes"], 1); + assert_eq!(allowed["actor_id"], "act-bruno"); +} + // ─── MR-722 PR A: CLI×writer matrix ─────────────────────────────────────── // // The change writer is covered above by `local_cli_change_enforces_engine_layer_policy`.
@@ -1293,6 +1347,62 @@ fn local_cli_schema_apply_enforces_engine_layer_policy() { assert_eq!(allowed["applied"], true); } +#[test] +fn local_cli_schema_apply_rejects_stored_query_breakage_before_publish() { + let graph = SystemGraph::loaded(); + graph.write_query( + "stored-find-person.gq", + "query find_person($name: String) { match { $p: Person { name: $name } } return { $p.age } }", + ); + let config = graph.write_config( + "omnigraph-stored-query-schema.yaml", + &format!( + "\ +graphs: + local: + uri: {} + queries: + find_person: + file: ./stored-find-person.gq +cli: + graph: local + branch: main +query: + roots: + - . +policy: {{}} +", + yaml_string(&graph.path().to_string_lossy()) + ), + ); + let renamed_schema = std::fs::read_to_string(fixture("test.pg")) + .unwrap() + .replace("age: I32?", "years: I32? @rename_from(\"age\")"); + let schema_path = graph.write_file("stored-query-breaks.pg", &renamed_schema); + + let rejected = output_failure( + cli() + .arg("schema") + .arg("apply") + .arg("--config") + .arg(&config) + .arg("--schema") + .arg(&schema_path) + .arg("--json"), + ); + let stderr = String::from_utf8_lossy(&rejected.stderr); + assert!( + stderr.contains("find_person") && stderr.contains("schema check"), + "schema apply should reject the stored-query breakage before publish; stderr: {stderr}" + ); + + let schema = stdout_string(&output_success( + cli().arg("schema").arg("show").arg("--config").arg(&config), + )); + assert!(schema.contains("age: I32?")); + assert!(!schema.contains("years: I32?")); +} + #[test] fn local_cli_branch_create_enforces_engine_layer_policy() { let graph = SystemGraph::loaded(); @@ -1448,6 +1558,8 @@ project: graphs: local: uri: {} + policy: + file: ./policy.yaml cli: graph: local branch: main @@ -1455,8 +1567,6 @@ cli: query: roots: - . 
-policy: - file: ./policy.yaml ", yaml_string(&graph.path().to_string_lossy()), actor, diff --git a/crates/omnigraph-cli/tests/system_remote.rs b/crates/omnigraph-cli/tests/system_remote.rs index c86e32e..45bf502 100644 --- a/crates/omnigraph-cli/tests/system_remote.rs +++ b/crates/omnigraph-cli/tests/system_remote.rs @@ -60,10 +60,10 @@ project: graphs: local: uri: {} + policy: + file: ./policy.yaml server: graph: local -policy: - file: ./policy.yaml ", yaml_string(&graph.path().to_string_lossy()) ) diff --git a/crates/omnigraph-compiler/Cargo.toml b/crates/omnigraph-compiler/Cargo.toml index 229b862..545db83 100644 --- a/crates/omnigraph-compiler/Cargo.toml +++ b/crates/omnigraph-compiler/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "omnigraph-compiler" -version = "0.6.0" +version = "0.6.1" edition = "2024" description = "Schema/query compiler for Omnigraph. Zero Lance dependency." license = "MIT" diff --git a/crates/omnigraph-policy/Cargo.toml b/crates/omnigraph-policy/Cargo.toml index dacda35..3d14fc5 100644 --- a/crates/omnigraph-policy/Cargo.toml +++ b/crates/omnigraph-policy/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "omnigraph-policy" -version = "0.6.0" +version = "0.6.1" edition = "2024" description = "Policy / authorization layer for Omnigraph — Cedar-backed PolicyEngine, PolicyChecker trait, ResourceScope enum." license = "MIT" diff --git a/crates/omnigraph-policy/src/lib.rs b/crates/omnigraph-policy/src/lib.rs index 6459fcd..cb59796 100644 --- a/crates/omnigraph-policy/src/lib.rs +++ b/crates/omnigraph-policy/src/lib.rs @@ -56,6 +56,21 @@ pub enum PolicyAction { /// from v0.6.0; operators add and remove graphs by editing /// `omnigraph.yaml` and restarting. GraphList, + /// Gates invoking a server-side stored query by name. 
Per-graph and + /// **graph-scoped** (no branch dimension, like `Admin`): the per-branch + /// access of the query body is enforced by the inner `Read`/`Change` + /// gate, so branch-scoping this outer gate would be redundant (and was + /// wrong for snapshot reads). A rule that sets `branch_scope` on + /// `invoke_query` is rejected by `validate()`. In this release it is + /// **coarse**: an `invoke_query` allow rule permits *any* stored query + /// on the graph (no per-query dimension yet); a future, additive + /// refinement adds an optional query-name scope. + /// + /// This gate sits at the HTTP boundary. The engine `_as` writers still + /// enforce `Read`/`Change` per the query body, so a stored *mutation* + /// is double-gated: `invoke_query` to reach the tool, plus `change` for + /// the write itself. + InvokeQuery, } impl PolicyAction { @@ -70,6 +85,7 @@ impl PolicyAction { Self::BranchMerge => "branch_merge", Self::Admin => "admin", Self::GraphList => "graph_list", + Self::InvokeQuery => "invoke_query", } } @@ -99,7 +115,8 @@ impl PolicyAction { | Self::BranchCreate | Self::BranchDelete | Self::BranchMerge - | Self::Admin => PolicyResourceKind::Graph, + | Self::Admin + | Self::InvokeQuery => PolicyResourceKind::Graph, } } } @@ -155,6 +172,7 @@ impl FromStr for PolicyAction { "branch_merge" => Ok(Self::BranchMerge), "admin" => Ok(Self::Admin), "graph_list" => Ok(Self::GraphList), + "invoke_query" => Ok(Self::InvokeQuery), other => bail!("unknown policy action '{other}'"), } } @@ -806,6 +824,7 @@ namespace Omnigraph { action "branch_delete" appliesTo { principal: Actor, resource: Graph, context: RequestContext }; action "branch_merge" appliesTo { principal: Actor, resource: Graph, context: RequestContext }; action "admin" appliesTo { principal: Actor, resource: Graph, context: RequestContext }; + action "invoke_query" appliesTo { principal: Actor, resource: Graph, context: RequestContext }; action "graph_list" appliesTo { principal: Actor, resource: Server, 
context: RequestContext }; } @@ -1264,6 +1283,80 @@ rules: assert!(!deny.allowed); } + #[test] + fn invoke_query_authorizes_per_graph() { + let policy: PolicyConfig = serde_yaml::from_str( + r#" +version: 1 +groups: + team: [act-alice] + others: [act-bruno] +rules: + - id: team-invoke-queries + allow: + actors: { group: team } + actions: [invoke_query] +"#, + ) + .unwrap(); + let engine = PolicyCompiler::compile(&policy, "graph").unwrap(); + + let allow = engine + .authorize( + "act-alice", + &PolicyRequest { + action: PolicyAction::InvokeQuery, + branch: None, + target_branch: None, + }, + ) + .unwrap(); + assert!(allow.allowed); + assert_eq!( + allow.matched_rule_id.as_deref(), + Some("team-invoke-queries") + ); + + // Actor outside the group → deny. + let deny = engine + .authorize( + "act-bruno", + &PolicyRequest { + action: PolicyAction::InvokeQuery, + branch: None, + target_branch: None, + }, + ) + .unwrap(); + assert!(!deny.allowed); + } + + #[test] + fn invoke_query_rejects_branch_scope() { + // invoke_query is graph-scoped (like admin) — per-branch access is + // enforced by the inner read/change gate — so a rule that puts a + // `branch_scope` qualifier on it is rejected at validate(). 
+ let policy: PolicyConfig = serde_yaml::from_str( + r#" +version: 1 +groups: + team: [act-alice] +rules: + - id: team-invoke-any-branch + allow: + actors: { group: team } + actions: [invoke_query] + branch_scope: any +"#, + ) + .unwrap(); + let err = policy.validate().unwrap_err().to_string(); + assert!( + err.contains("branch_scope") && err.contains("invoke_query"), + "branch_scope on invoke_query must be rejected: {err}" + ); + } + #[test] fn server_scoped_rule_cannot_use_branch_scope() { let policy: PolicyConfig = serde_yaml::from_str( diff --git a/crates/omnigraph-server/Cargo.toml b/crates/omnigraph-server/Cargo.toml index e9a0e46..5994aa1 100644 --- a/crates/omnigraph-server/Cargo.toml +++ b/crates/omnigraph-server/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "omnigraph-server" -version = "0.6.0" +version = "0.6.1" edition = "2024" description = "HTTP server for the Omnigraph graph database." license = "MIT" @@ -19,9 +19,9 @@ default = [] aws = ["dep:aws-config", "dep:aws-sdk-secretsmanager"] [dependencies] -omnigraph = { package = "omnigraph-engine", path = "../omnigraph", version = "0.6.0" } -omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.6.0" } -omnigraph-policy = { path = "../omnigraph-policy", version = "0.6.0" } +omnigraph = { package = "omnigraph-engine", path = "../omnigraph", version = "0.6.1" } +omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.6.1" } +omnigraph-policy = { path = "../omnigraph-policy", version = "0.6.1" } axum = { workspace = true } clap = { workspace = true } color-eyre = { workspace = true } diff --git a/crates/omnigraph-server/src/api.rs b/crates/omnigraph-server/src/api.rs index 2c818ae..4a6024f 100644 --- a/crates/omnigraph-server/src/api.rs +++ b/crates/omnigraph-server/src/api.rs @@ -1,8 +1,11 @@ use omnigraph::db::{GraphCommit, MergeOutcome, ReadTarget, SchemaApplyResult, Snapshot}; use omnigraph::error::{MergeConflict, MergeConflictKind}; use omnigraph::loader::{IngestResult, 
LoadMode}; +use crate::queries::StoredQuery; use omnigraph_compiler::SchemaMigrationStep; +use omnigraph_compiler::query::ast::Param; use omnigraph_compiler::result::QueryResult; +use omnigraph_compiler::types::{PropType, ScalarType}; use serde::{Deserialize, Serialize}; use serde_json::Value; use utoipa::{IntoParams, ToSchema}; @@ -300,6 +303,162 @@ pub struct ChangeRequest { pub branch: Option, } +/// Body for `POST /queries/{name}` — invokes the server-side stored query +/// named in the path. The query source and name come from the registry, +/// never the body; only the runtime inputs are supplied here. +#[derive(Debug, Clone, Default, Serialize, Deserialize, ToSchema)] +pub struct InvokeStoredQueryRequest { + /// JSON object whose keys match the stored query's declared parameters. + #[serde(default)] + pub params: Option, + /// Branch to run against. Defaults to `main`; for a stored mutation the + /// write targets this branch. + #[serde(default)] + pub branch: Option, + /// Snapshot id to read from (read queries only — rejected for a stored + /// mutation). Mutually exclusive with `branch`. + #[serde(default)] + pub snapshot: Option, +} + +/// Response for `POST /queries/{name}`: the read envelope for a stored +/// read, or the mutation envelope for a stored mutation. Serialized +/// **untagged**, so the wire shape is exactly [`ReadOutput`] or +/// [`ChangeOutput`] — classification follows the stored query, not a +/// wrapper field. +#[derive(Debug, Serialize, ToSchema)] +#[serde(untagged)] +pub enum InvokeStoredQueryResponse { + Read(ReadOutput), + Change(ChangeOutput), +} + +/// The kind of a stored-query parameter, decomposed so a client (e.g. an +/// MCP server) can build a typed input schema with a closed `match` and +/// never re-parse omnigraph's type spelling. 
`bigint`/`date`/`datetime`/ +/// `blob` are carried as JSON strings on the wire: a 64-bit integer past +/// 2^53 loses precision as a JSON number, and Date/DateTime are ISO +/// strings, Blob a blob-URI string. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, ToSchema)] +#[serde(rename_all = "snake_case")] +pub enum ParamKind { + String, + Bool, + Int, + #[serde(rename = "bigint")] + BigInt, + Float, + Date, + #[serde(rename = "datetime")] + DateTime, + Blob, + Vector, + List, +} + +/// One declared parameter of a stored query, projected for the catalog. +#[derive(Debug, Clone, Serialize, Deserialize, ToSchema)] +pub struct ParamDescriptor { + pub name: String, + pub kind: ParamKind, + /// Element kind when `kind == list` (always a scalar — the grammar + /// forbids lists of vectors or nested lists). + #[serde(skip_serializing_if = "Option::is_none")] + pub item_kind: Option, + /// Dimension when `kind == vector`. + #[serde(skip_serializing_if = "Option::is_none")] + pub vector_dim: Option, + /// `false` → the caller must supply it; `true` → optional. + pub nullable: bool, +} + +/// One entry in the stored-query catalog (`GET /queries`). +#[derive(Debug, Clone, Serialize, Deserialize, ToSchema)] +pub struct QueryCatalogEntry { + /// Registry key / invoke path segment (`POST /queries/{name}`). + pub name: String, + /// MCP tool id (the `tool_name` override, else `name`). + pub tool_name: String, + #[serde(skip_serializing_if = "Option::is_none")] + pub description: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub instruction: Option, + /// `true` for a stored mutation → an MCP read-only hint of `false`. + pub mutation: bool, + pub params: Vec, +} + +/// Response for `GET /queries`: the `mcp.expose` subset of a graph's +/// stored-query registry, each with typed parameters. 
+#[derive(Debug, Clone, Serialize, Deserialize, ToSchema)] +pub struct QueriesCatalogOutput { + pub queries: Vec, +} + +/// Total map from a resolved scalar to its catalog kind. Exhaustive on +/// purpose: a new `ScalarType` is a compile error here until catalogued. +fn scalar_kind(scalar: ScalarType) -> ParamKind { + match scalar { + ScalarType::String => ParamKind::String, + ScalarType::Bool => ParamKind::Bool, + ScalarType::I32 | ScalarType::U32 => ParamKind::Int, + ScalarType::I64 | ScalarType::U64 => ParamKind::BigInt, + ScalarType::F32 | ScalarType::F64 => ParamKind::Float, + ScalarType::Date => ParamKind::Date, + ScalarType::DateTime => ParamKind::DateTime, + ScalarType::Blob => ParamKind::Blob, + ScalarType::Vector(_) => ParamKind::Vector, + } +} + +fn param_descriptor(param: &Param) -> ParamDescriptor { + match PropType::from_param_type_name(&param.type_name, param.nullable) { + Some(pt) if pt.list => ParamDescriptor { + name: param.name.clone(), + kind: ParamKind::List, + item_kind: Some(scalar_kind(pt.scalar)), + vector_dim: None, + nullable: param.nullable, + }, + Some(pt) => { + let (kind, vector_dim) = match pt.scalar { + ScalarType::Vector(dim) => (ParamKind::Vector, Some(dim)), + other => (scalar_kind(other), None), + }; + ParamDescriptor { + name: param.name.clone(), + kind, + item_kind: None, + vector_dim, + nullable: param.nullable, + } + } + // Unreachable for a parsed query (every declared param type is + // grammatical); fall back to an opaque string so the field is still + // usable rather than dropped. + None => ParamDescriptor { + name: param.name.clone(), + kind: ParamKind::String, + item_kind: None, + vector_dim: None, + nullable: param.nullable, + }, + } +} + +/// Project a loaded stored query into its catalog entry (typed params, +/// MCP tool name, read/mutate flag, description/instruction). 
+pub fn query_catalog_entry(query: &StoredQuery) -> QueryCatalogEntry { + QueryCatalogEntry { + name: query.name.clone(), + tool_name: query.effective_tool_name().to_string(), + description: query.decl.description.clone(), + instruction: query.decl.instruction.clone(), + mutation: query.is_mutation(), + params: query.decl.params.iter().map(param_descriptor).collect(), + } +} + #[derive(Debug, Clone, Default, Serialize, Deserialize, ToSchema)] pub struct SchemaApplyRequest { /// Project schema in `.pg` source form. The diff against the current diff --git a/crates/omnigraph-server/src/config.rs b/crates/omnigraph-server/src/config.rs index 87737d0..b308b72 100644 --- a/crates/omnigraph-server/src/config.rs +++ b/crates/omnigraph-server/src/config.rs @@ -9,6 +9,13 @@ use serde::{Deserialize, Serialize}; pub const DEFAULT_CONFIG_FILE: &str = "omnigraph.yaml"; +pub fn graph_resource_id_for_selection( + selected_graph: Option<&str>, + normalized_uri: &str, +) -> String { + selected_graph.unwrap_or(normalized_uri).to_string() +} + #[derive(Debug, Clone, Default, Serialize, Deserialize)] pub struct ProjectConfig { pub name: Option, @@ -24,6 +31,14 @@ pub struct TargetConfig { /// graph's HTTP-layer Cedar enforcement. #[serde(default)] pub policy: PolicySettings, + /// Per-graph stored-query registry: an inline `name -> entry` + /// map. Mirrors the per-graph `policy` shape — each + /// `graphs.<name>.queries` declares that graph's stored queries. Absent + /// (or empty) = no stored queries for the graph. v1 is inline-only; + /// an external `queries.yaml` manifest indirection is a deferred + /// convenience. + #[serde(default)] + pub queries: BTreeMap, } #[derive(Debug, Clone, Copy, Default, Eq, PartialEq, Serialize, Deserialize, ValueEnum)] @@ -90,6 +105,50 @@ pub struct PolicySettings { pub file: Option, } +/// One stored-query registry entry. 
The map **key** is the query's +/// identity — it must equal the `query <name>` symbol declared inside +/// the referenced `.gq` file (asserted when the registry loads). +/// Renaming the key (or the symbol) is a breaking change to callers, by +/// design. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct QueryEntry { + /// Path to the `.gq` file (relative to the config's `base_dir`). The + /// file may declare several queries; the registry selects the one + /// whose symbol matches the map key. + pub file: String, + #[serde(default)] + pub mcp: McpSettings, +} + +/// MCP exposure for a stored query. A *deployment* concern (the same +/// `.gq` may be exposed in one graph and hidden in another), so it lives +/// in YAML rather than in the `.gq` source. **Default `expose: true`** — +/// declaring a query in the manifest *is* the opt-in, so it appears in the +/// MCP tool catalog (`GET /queries`) by default; set `expose: false` to +/// keep a query HTTP/service-callable but hidden from the agent tool list. +/// `expose` governs catalog membership only — it is **not** an +/// authorization gate (invocation is gated by `invoke_query`), so a hidden +/// query is still invocable by name with the right permission. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct McpSettings { + #[serde(default = "mcp_expose_default")] + pub expose: bool, + pub tool_name: Option, +} + +fn mcp_expose_default() -> bool { + true +} + +impl Default for McpSettings { + fn default() -> Self { + Self { + expose: mcp_expose_default(), + tool_name: None, + } + } +} + #[derive(Debug, Clone, Copy, Eq, PartialEq, Serialize, Deserialize)] #[serde(rename_all = "snake_case")] pub enum AliasCommand { @@ -137,6 +196,12 @@ pub struct OmnigraphConfig { pub aliases: BTreeMap, #[serde(default)] pub policy: PolicySettings, + /// Top-level stored-query registry, used in single-graph + /// mode — mirrors how the top-level `policy` applies to the single + /// graph. 
In multi-graph mode this is unused; each graph's + /// `graphs.<name>.queries` applies instead. + #[serde(default)] + pub queries: BTreeMap, #[serde(skip)] base_dir: PathBuf, } @@ -152,6 +217,7 @@ impl Default for OmnigraphConfig { query: QueryDefaults::default(), aliases: BTreeMap::new(), policy: PolicySettings::default(), + queries: BTreeMap::new(), base_dir: PathBuf::new(), } } @@ -244,6 +310,124 @@ impl OmnigraphConfig { .map(|path| self.resolve_config_path(path)) } + /// The top-level stored-query registry entries (single-graph mode). + pub fn query_entries(&self) -> &BTreeMap { + &self.queries + } + + /// The per-graph stored-query registry entries for a named target + /// (multi-graph mode). Returns `None` if the target is unknown. + pub fn target_query_entries( + &self, + target_name: &str, + ) -> Option<&BTreeMap> { + self.graphs.get(target_name).map(|target| &target.queries) + } + + /// The stored-query registry entries that apply for a graph + /// selection — the single definition of "which `queries:` block + /// governs graph X", shared by server boot and the CLI so the two + /// can't drift. A named graph present in `graphs:` uses its + /// per-graph block; everything else (no selection, or a name that is + /// not a known graph, e.g. a bare URI) falls back to the top-level + /// block (single-graph mode). + pub fn query_entries_for(&self, graph: Option<&str>) -> &BTreeMap { + match graph { + Some(name) if self.graphs.contains_key(name) => &self.graphs[name].queries, + _ => &self.queries, + } + } + + /// The single CLI gate that turns a raw graph selection into a *validated* + /// one — the fallible counterpart to the infallible + /// [`OmnigraphConfig::query_entries_for`]. 
Both `queries` subcommands route + /// their selection through here so neither can skip a check the other (or + /// server boot) applies: + /// * a known name passes through, but only after the same coherence check + /// server boot enforces + /// ([`OmnigraphConfig::ensure_top_level_blocks_honored`]) — a named graph + /// with a populated top-level block is rejected; + /// * an unknown name errors with the **same** message + /// [`OmnigraphConfig::resolve_target_uri`] produces, so a command that + /// opens no URI rejects an unknown `--target` exactly like the + /// URI-resolving commands do; + /// * an anonymous selection (`None`, e.g. a bare URI) stays anonymous, + /// resolving to the top-level registry downstream (top-level honored). + pub fn resolve_graph_selection<'a>(&self, graph: Option<&'a str>) -> Result> { + match graph { + Some(name) if self.graphs.contains_key(name) => { + self.ensure_top_level_blocks_honored(Some(name))?; + Ok(Some(name)) + } + Some(name) => bail!("graph '{}' not found in {}", name, DEFAULT_CONFIG_FILE), + None => Ok(None), + } + } + + pub fn resolve_policy_tooling_graph_selection(&self) -> Result> { + self.resolve_graph_selection(self.cli_graph_name().or_else(|| self.server_graph_name())) + } + + /// The policy file that applies for a graph selection — the policy + /// sibling of [`OmnigraphConfig::query_entries_for`], so policy and + /// queries resolve by the same identity rule. A named graph in + /// `graphs:` uses its per-graph `policy.file` with **no** top-level + /// fallback (a named graph with no per-graph policy has no policy — + /// that keeps the boot-time coherence check meaningful); anything else + /// (no selection, or a bare URI) uses the top-level `policy.file`. 
+ pub fn resolve_policy_file_for(&self, graph: Option<&str>) -> Option { + match graph { + Some(name) if self.graphs.contains_key(name) => self.resolve_target_policy_file(name), + _ => self.resolve_policy_file(), + } + } + + /// Names of any top-level config blocks (`policy.file`, `queries:`) + /// that are populated. Used by the boot-time coherence check: when a + /// **named** graph is served (single-mode by name, or multi-mode), + /// the top-level blocks are not honored, so a populated one is a + /// configuration error rather than a silent no-op. + pub fn populated_top_level_blocks(&self) -> Vec<&'static str> { + let mut blocks = Vec::new(); + if self.policy.file.is_some() { + blocks.push("policy.file"); + } + if !self.queries.is_empty() { + blocks.push("queries"); + } + blocks + } + + /// A named graph uses its own `graphs.<name>` block, so a populated + /// top-level block would be silently ignored — a config error. The single + /// definition of that rule, shared by server boot and the CLI selection + /// gate ([`OmnigraphConfig::resolve_graph_selection`]) so the two can't + /// drift. An anonymous selection (`None`, e.g. a bare URI) legitimately + /// honors the top-level blocks, so it is never rejected here. + pub fn ensure_top_level_blocks_honored(&self, selected: Option<&str>) -> Result<()> { + if let Some(name) = selected { + let unhonored = self.populated_top_level_blocks(); + if !unhonored.is_empty() { + bail!( + "named graph '{name}' uses its own `graphs.{name}.…` block, but top-level {} \ + {} set and would be ignored. Move it to `graphs.{name}` (e.g. \ + `graphs.{name}.policy.file`, `graphs.{name}.queries`).", + unhonored.join(" and "), + if unhonored.len() == 1 { "is" } else { "are" }, + ); + } + } + Ok(()) + } + + /// Resolve a stored-query `.gq` file path (from a registry entry), + /// relative to the config's `base_dir`. 
Mirrors policy-file + /// resolution; the registry loader calls this to turn each entry's + /// `file:` value into an absolute path. + pub fn resolve_query_file(&self, value: &str) -> PathBuf { + self.resolve_config_path(value) + } + /// Resolve the server-level policy file path (used by management /// endpoints). Returns `None` if `server.policy.file` is not set. pub fn resolve_server_policy_file(&self) -> Option { @@ -387,7 +571,9 @@ mod tests { use tempfile::tempdir; - use super::{ReadOutputFormat, TableCellLayout, load_config_in}; + use super::{ + ReadOutputFormat, TableCellLayout, graph_resource_id_for_selection, load_config_in, + }; #[test] fn load_config_reads_yaml_defaults_from_current_dir() { @@ -451,6 +637,114 @@ policy: {} assert!(config.graphs.is_empty()); } + #[test] + fn graph_resource_id_for_selection_uses_name_or_anonymous_uri() { + assert_eq!( + graph_resource_id_for_selection(Some("local"), "/tmp/graph.omni"), + "local" + ); + assert_eq!( + graph_resource_id_for_selection(None, "/tmp/graph.omni"), + "/tmp/graph.omni" + ); + } + + #[test] + fn resolve_graph_selection_validates_membership_and_coherence() { + let temp = tempdir().unwrap(); + fs::write( + temp.path().join("omnigraph.yaml"), + "graphs:\n local:\n uri: ./demo.omni\n", + ) + .unwrap(); + let config = load_config_in(temp.path(), None).unwrap(); + + // A known graph passes through unchanged. + assert_eq!(config.resolve_graph_selection(Some("local")).unwrap(), Some("local")); + // An anonymous selection stays anonymous (→ top-level registry downstream). + assert_eq!(config.resolve_graph_selection(None).unwrap(), None); + // An unknown name errors, naming the graph (matching resolve_target_uri). 
+ let err = config.resolve_graph_selection(Some("ghost")).unwrap_err().to_string(); + assert!( + err.contains("ghost") && err.contains("not found"), + "unknown graph must error naming it: {err}" + ); + + // Coherence: a named graph plus a populated top-level block is the + // config server boot refuses, so the gate rejects it too (shared rule + // via ensure_top_level_blocks_honored). An anonymous selection still + // passes — top-level is honored when no graph is named. + let temp2 = tempdir().unwrap(); + fs::write( + temp2.path().join("omnigraph.yaml"), + "graphs:\n local:\n uri: ./demo.omni\npolicy:\n file: ./top.yaml\n", + ) + .unwrap(); + let incoherent = load_config_in(temp2.path(), None).unwrap(); + let err = incoherent + .resolve_graph_selection(Some("local")) + .unwrap_err() + .to_string(); + assert!( + err.contains("local") && err.contains("policy.file"), + "named graph + populated top-level block must be rejected, naming both: {err}" + ); + assert_eq!( + incoherent.resolve_graph_selection(None).unwrap(), + None, + "anonymous selection still honors top-level" + ); + } + + #[test] + fn policy_tooling_graph_selection_prefers_cli_then_server_and_validates() { + let temp = tempdir().unwrap(); + fs::write( + temp.path().join("omnigraph.yaml"), + "graphs:\n local:\n uri: ./local.omni\n prod:\n uri: ./prod.omni\n\ + server:\n graph: local\ncli:\n graph: prod\n", + ) + .unwrap(); + let config = load_config_in(temp.path(), None).unwrap(); + assert_eq!( + config.resolve_policy_tooling_graph_selection().unwrap(), + Some("prod") + ); + + let temp = tempdir().unwrap(); + fs::write( + temp.path().join("omnigraph.yaml"), + "graphs:\n local:\n uri: ./local.omni\nserver:\n graph: local\n", + ) + .unwrap(); + let config = load_config_in(temp.path(), None).unwrap(); + assert_eq!( + config.resolve_policy_tooling_graph_selection().unwrap(), + Some("local") + ); + + let temp = tempdir().unwrap(); + fs::write(temp.path().join("omnigraph.yaml"), "policy: {}\n").unwrap(); + let 
config = load_config_in(temp.path(), None).unwrap(); + assert_eq!(config.resolve_policy_tooling_graph_selection().unwrap(), None); + + let temp = tempdir().unwrap(); + fs::write( + temp.path().join("omnigraph.yaml"), + "graphs:\n local:\n uri: ./local.omni\nserver:\n graph: ghost\n", + ) + .unwrap(); + let config = load_config_in(temp.path(), None).unwrap(); + let err = config + .resolve_policy_tooling_graph_selection() + .unwrap_err() + .to_string(); + assert!( + err.contains("ghost") && err.contains("not found"), + "unknown server.graph must use graph-selection validation: {err}" + ); + } + #[test] fn resolve_query_path_searches_config_roots() { let temp = tempdir().unwrap(); @@ -489,6 +783,118 @@ policy: {} assert_eq!(resolved, config_dir.join("local.gq")); } + #[test] + fn queries_block_round_trips_inline_and_per_graph() { + let temp = tempdir().unwrap(); + fs::write( + temp.path().join("omnigraph.yaml"), + r#" +graphs: + prod: + uri: s3://bucket/prod + queries: + find_user: + file: ./queries/find_user.gq + mcp: + expose: true + tool_name: lookup_user + internal_audit: + file: ./queries/audit.gq +queries: + single_mode_q: + file: ./q.gq +"#, + ) + .unwrap(); + + let config = load_config_in(temp.path(), None).unwrap(); + + // Per-graph registry (multi-graph mode). + let prod = config.target_query_entries("prod").unwrap(); + assert_eq!(prod.len(), 2); + let find_user = &prod["find_user"]; + assert_eq!(find_user.file, "./queries/find_user.gq"); + assert!(find_user.mcp.expose); + assert_eq!(find_user.mcp.tool_name.as_deref(), Some("lookup_user")); + // Default exposure is true (the manifest entry is the opt-in); tool_name absent. + let audit = &prod["internal_audit"]; + assert!(audit.mcp.expose); + assert!(audit.mcp.tool_name.is_none()); + + // Top-level registry (single-graph mode). 
+ assert_eq!(config.query_entries().len(), 1); + + // The shared selector resolves the same blocks the server boot + // and the CLI use: a known graph → its per-graph block; no + // selection or an unknown name → the top-level block (the latter + // pins the behavior of the CLI's now-deleted fallback arm). + assert_eq!(config.query_entries_for(Some("prod")).len(), 2); + assert_eq!(config.query_entries_for(None).len(), 1); + assert_eq!(config.query_entries_for(Some("nonexistent")).len(), 1); + + // Path resolution joins against base_dir, like policy files. + assert_eq!( + config.resolve_query_file(&find_user.file), + temp.path().join("./queries/find_user.gq") + ); + } + + #[test] + fn resolve_policy_file_for_follows_identity() { + let temp = tempdir().unwrap(); + fs::write( + temp.path().join("omnigraph.yaml"), + "policy:\n file: ./top.yaml\ngraphs:\n prod:\n uri: s3://b/prod\n \ + policy:\n file: ./prod.yaml\n bare:\n uri: s3://b/bare\n", + ) + .unwrap(); + let config = load_config_in(temp.path(), None).unwrap(); + + // Named graph with its own policy → per-graph (not top-level). + assert!( + config + .resolve_policy_file_for(Some("prod")) + .unwrap() + .ends_with("prod.yaml") + ); + // Named graph with NO per-graph policy → None (no top-level fallback; + // load-bearing for the boot coherence check). + assert!(config.resolve_policy_file_for(Some("bare")).is_none()); + // Anonymous (bare URI) or an unknown name → top-level. + assert!( + config + .resolve_policy_file_for(None) + .unwrap() + .ends_with("top.yaml") + ); + assert!( + config + .resolve_policy_file_for(Some("nope")) + .unwrap() + .ends_with("top.yaml") + ); + } + + #[test] + fn queries_block_absent_yields_empty_registry() { + let temp = tempdir().unwrap(); + fs::write( + temp.path().join("omnigraph.yaml"), + "graphs:\n local:\n uri: ./demo.omni\n", + ) + .unwrap(); + + let config = load_config_in(temp.path(), None).unwrap(); + // Additive: no `queries:` anywhere → empty registries everywhere. 
+ assert!(config.query_entries().is_empty()); + assert!( + config + .target_query_entries("local") + .unwrap() + .is_empty() + ); + } + #[test] fn policy_block_accepts_non_empty_mapping() { let temp = tempdir().unwrap(); diff --git a/crates/omnigraph-server/src/lib.rs b/crates/omnigraph-server/src/lib.rs index ad41f9d..60ebef3 100644 --- a/crates/omnigraph-server/src/lib.rs +++ b/crates/omnigraph-server/src/lib.rs @@ -4,6 +4,7 @@ pub mod config; pub mod graph_id; pub mod identity; pub mod policy; +pub mod queries; pub mod registry; pub mod workload; @@ -11,6 +12,8 @@ pub use graph_id::GraphId; pub use identity::{AuthSource, GraphKey, ResolvedActor, Scope, TenantId}; pub use registry::{GraphHandle, GraphRegistry, InsertError, RegistryLookup, RegistrySnapshot}; +use crate::queries::{QueryRegistry, check, format_check_breakages}; + use std::collections::{HashMap, HashSet}; use std::fs; use std::io; @@ -22,7 +25,8 @@ use api::{ BranchCreateOutput, BranchCreateRequest, BranchDeleteOutput, BranchListOutput, BranchMergeOutput, BranchMergeRequest, ChangeOutput, ChangeRequest, CommitListOutput, CommitListQuery, ErrorCode, ErrorOutput, ExportRequest, GraphInfo, GraphListResponse, - HealthOutput, IngestOutput, IngestRequest, QueryRequest, ReadOutput, ReadRequest, + HealthOutput, IngestOutput, IngestRequest, InvokeStoredQueryRequest, + InvokeStoredQueryResponse, QueriesCatalogOutput, QueryRequest, ReadOutput, ReadRequest, SchemaApplyOutput, SchemaApplyRequest, SchemaOutput, SnapshotQuery, ingest_output, schema_apply_output, snapshot_payload, }; @@ -40,12 +44,13 @@ use color_eyre::eyre::{Result, WrapErr, bail}; pub use config::{ AliasCommand, AliasConfig, CliDefaults, DEFAULT_CONFIG_FILE, OmnigraphConfig, PolicySettings, ProjectConfig, QueryDefaults, ReadOutputFormat, ServerDefaults, TableCellLayout, TargetConfig, - load_config, + graph_resource_id_for_selection, load_config, }; use futures::stream; use omnigraph::db::{Omnigraph, ReadTarget}; use 
omnigraph::error::{ManifestConflictDetails, ManifestErrorKind, OmniError}; use omnigraph::storage::normalize_root_uri; +use omnigraph_compiler::catalog::Catalog; use omnigraph_compiler::json_params_to_param_map; use omnigraph_compiler::query::parser::parse_query; use omnigraph_compiler::{JsonParamMode, ParamMap}; @@ -93,6 +98,8 @@ fn hash_bearer_token(token: &str) -> BearerTokenHash { server_export, #[allow(deprecated)] server_change, server_mutate, + server_list_queries, + server_invoke_query, server_schema_apply, server_schema_get, server_ingest, @@ -157,8 +164,16 @@ pub enum ServerConfigMode { /// set to a named target. Single { uri: String, + /// Cedar graph resource id for the single graph. A named selection + /// uses the graph name; an anonymous URI uses the normalized URI to + /// preserve legacy single-graph policy identity. + graph_id: String, /// Top-level `policy.file` (single-graph Cedar policy). policy_file: Option, + /// Top-level stored-query registry, loaded and identity-checked + /// at settings-build time; type-checked against the schema when + /// the engine opens. + queries: QueryRegistry, }, /// Multi-graph invocation β€” `--config omnigraph.yaml` with a /// non-empty `graphs:` map and no single-mode selector. @@ -185,6 +200,10 @@ pub struct GraphStartupConfig { pub graph_id: String, pub uri: String, pub policy_file: Option, + /// Per-graph stored-query registry, loaded and identity-checked at + /// settings-build time; type-checked against the schema when this + /// graph's engine opens. + pub queries: QueryRegistry, } /// Runtime routing for the server. 
Single mode = legacy @@ -285,7 +304,31 @@ impl AppState { ) -> Self { let bearer_tokens = hash_bearer_tokens(bearer_tokens); let per_graph_policy = policy_engine.map(Arc::new); - Self::build_single_mode(uri, db, bearer_tokens, per_graph_policy, Arc::new(workload)) + Self::build_single_mode(uri, db, bearer_tokens, per_graph_policy, Arc::new(workload), None) + } + + /// Like `new_single`, but attaches a pre-validated stored-query + /// registry. Private β€” the production single-mode boot path + /// (`open_single_with_queries`) is the only caller; every public + /// `new_*` constructor builds with no stored queries. + fn new_single_with_queries( + uri: String, + db: Omnigraph, + bearer_tokens: Vec<(String, String)>, + policy_engine: Option, + workload: workload::WorkloadController, + queries: Option>, + ) -> Self { + let bearer_tokens = hash_bearer_tokens(bearer_tokens); + let per_graph_policy = policy_engine.map(Arc::new); + Self::build_single_mode( + uri, + db, + bearer_tokens, + per_graph_policy, + Arc::new(workload), + queries, + ) } pub fn new(uri: String, db: Omnigraph) -> Self { @@ -377,6 +420,39 @@ impl AppState { uri: impl Into, bearer_tokens: Vec<(String, String)>, policy_file: Option<&PathBuf>, + ) -> Result { + Self::open_single_with_queries( + uri, + bearer_tokens, + policy_file, + QueryRegistry::default(), + ) + .await + } + + /// Single-mode boot with a stored-query registry: open the engine, + /// **type-check the registry against the live schema and refuse to + /// start on a breakage** (same posture as bad policy YAML), log + /// non-blocking warnings, then attach the registry to the handle. + /// With an empty registry the check is a no-op and no registry is + /// attached β€” that is the path `open_with_bearer_tokens_and_policy` + /// (no stored queries) takes. 
+ pub async fn open_single_with_queries( + uri: impl Into, + bearer_tokens: Vec<(String, String)>, + policy_file: Option<&PathBuf>, + queries: QueryRegistry, + ) -> Result { + Self::open_single_with_queries_for_graph_id(uri, bearer_tokens, policy_file, queries, None) + .await + } + + async fn open_single_with_queries_for_graph_id( + uri: impl Into, + bearer_tokens: Vec<(String, String)>, + policy_file: Option<&PathBuf>, + queries: QueryRegistry, + graph_id: Option, ) -> Result { // The "policy requires tokens" invariant is enforced once by // `classify_server_runtime_state` in `serve()`, before either @@ -384,16 +460,24 @@ impl AppState { // time we get here, the (policy, no-tokens) combination has // already been rejected β€” no second bail needed. let uri = normalize_root_uri(&uri.into()).wrap_err("normalize graph URI")?; + let graph_id = graph_id.unwrap_or_else(|| uri.clone()); let db = Omnigraph::open(&uri).await?; + + // Validate the registry against the live schema and resolve it to + // an attachable handle (refuse boot on breakage). + let registry = validate_and_attach(queries, &db.catalog(), &graph_id)?; + let policy_engine = match policy_file { - Some(path) => Some(PolicyEngine::load_graph(path, &uri)?), + Some(path) => Some(PolicyEngine::load_graph(path, &graph_id)?), None => None, }; - Ok(Self::new_with_bearer_tokens_and_policy( + Ok(Self::new_single_with_queries( uri, db, bearer_tokens, policy_engine, + workload::WorkloadController::from_env(), + registry, )) } @@ -408,6 +492,7 @@ impl AppState { bearer_tokens: Arc<[(BearerTokenHash, Arc)]>, policy_engine: Option>, workload: Arc, + queries: Option>, ) -> Self { // Engine-layer policy gate (MR-722). 
With a per-graph policy // installed, every `_as` writer on `Omnigraph` calls into the @@ -436,6 +521,7 @@ impl AppState { uri, engine: Arc::new(db), policy: policy_engine, + queries, }); Self { routing: GraphRouting::Single { handle }, @@ -750,6 +836,58 @@ pub fn init_tracing() { let _ = tracing_subscriber::fmt().with_env_filter(filter).try_init(); } +/// Log each non-blocking advisory from a registry check report. +fn log_registry_warnings(label: &str, report: &queries::CheckReport) { + for warning in &report.warnings { + warn!(graph = label, query = %warning.query, "stored query: {}", warning.message); + } +} + +fn validate_registry_against_catalog( + registry: &QueryRegistry, + catalog: &Catalog, + label: &str, +) -> omnigraph::error::Result<()> { + let report = check(registry, catalog); + if report.has_breakages() { + return Err(OmniError::manifest(format_check_breakages(label, &report))); + } + log_registry_warnings(label, &report); + Ok(()) +} + +/// Validate a loaded stored-query registry against the live schema and +/// resolve it to an attachable handle. Refuses boot on any breakage +/// (same posture as bad policy YAML), logs the non-blocking warnings, +/// and collapses an empty registry to `None` (nothing attached). This is +/// the single gate every open path funnels through, so no opener can +/// attach a registry that has not been schema-checked. `label` names the +/// graph in messages. +fn validate_and_attach( + queries: QueryRegistry, + catalog: &Catalog, + label: &str, +) -> Result>> { + validate_registry_against_catalog(&queries, catalog, label) + .map_err(|err| color_eyre::eyre::eyre!(err.to_string()))?; + Ok(if queries.is_empty() { + None + } else { + Some(Arc::new(queries)) + }) +} + +/// Format every load error (parse / identity failure) into a multi-line +/// boot-abort message. 
+fn format_registry_load_errors(label: &str, errors: &[queries::LoadError]) -> String { + let joined = errors + .iter() + .map(|e| e.to_string()) + .collect::>() + .join("\n "); + format!("graph '{label}': stored-query registry failed to load:\n {joined}") +} + pub fn load_server_settings( config_path: Option<&PathBuf>, cli_uri: Option, @@ -799,15 +937,43 @@ pub fn load_server_settings( let uri = normalize_root_uri(&raw_uri).wrap_err_with(|| { format!("normalize single-graph URI '{raw_uri}' from server settings") })?; - let policy_file = config.resolve_policy_file(); - ServerConfigMode::Single { uri, policy_file } + // Config follows graph IDENTITY, not mode: a bare URI is anonymous + // (top-level config); a graph chosen by name uses its per-graph + // `graphs..{policy,queries}`. `resolve_target_uri` already + // errored on an unknown name, so a `Some(name)` here is a known graph. + let selected: Option<&str> = if has_cli_uri { + None + } else { + cli_target.as_deref().or_else(|| config.server_graph_name()) + }; + // A named selection must not leave a populated top-level block + // silently unused β€” refuse boot and point at the per-graph block. The + // same rule the CLI selection gate enforces, shared via one helper so + // the boot check and `omnigraph queries validate`/`list` can't drift. + config.ensure_top_level_blocks_honored(selected)?; + // Load + identity-check now (no engine needed); the schema + // type-check happens when the engine opens. 
+ let policy_file = config.resolve_policy_file_for(selected); + let queries = QueryRegistry::load(&config, config.query_entries_for(selected)) + .map_err(|errs| color_eyre::eyre::eyre!(format_registry_load_errors(&uri, &errs)))?; + let graph_id = graph_resource_id_for_selection(selected, &uri); + ServerConfigMode::Single { + uri, + graph_id, + policy_file, + queries, + } } else if has_explicit_config && has_graphs_map { - if config.resolve_policy_file().is_some() { + // Multi mode: every graph uses its per-graph block; top-level + // policy/queries are never honored, so a populated one is an error. + let unhonored = config.populated_top_level_blocks(); + if !unhonored.is_empty() { bail!( - "top-level `policy.file` is single-graph/CLI-local policy only; \ - in multi-graph mode move per-graph rules to \ - `graphs..policy.file` and move `graph_list` rules to \ - `server.policy.file`." + "multi-graph mode: top-level {} {} not honored β€” each graph uses its own \ + `graphs..…` block. Move per-graph rules there (and any \ + `graph_list` policy to `server.policy.file`).", + unhonored.join(" and "), + if unhonored.len() == 1 { "is" } else { "are" }, ); } // Rule 4 β†’ Multi mode. Build a startup config per graph. @@ -823,10 +989,17 @@ pub fn load_server_settings( let uri = normalize_root_uri(&raw_uri).wrap_err_with(|| { format!("normalize URI '{raw_uri}' for graph '{name}' in omnigraph.yaml") })?; + // Per-graph `queries:`, selected through the shared + // `query_entries_for` so server and CLI resolve identically. + // Load + identity-check now; the schema type-check happens + // when this graph's engine opens. 
+ let queries = QueryRegistry::load(&config, config.query_entries_for(Some(name.as_str()))) + .map_err(|errs| color_eyre::eyre::eyre!(format_registry_load_errors(name, &errs)))?; graphs.push(GraphStartupConfig { graph_id: name.clone(), uri, policy_file: config.resolve_target_policy_file(name), + queries, }); } let config_path = config_path @@ -949,6 +1122,8 @@ pub fn build_app(state: AppState) -> Router { server_change })) .route("/mutate", post(server_mutate)) + .route("/queries", get(server_list_queries)) + .route("/queries/{name}", post(server_invoke_query)) .route("/schema", get(server_schema_get)) .route("/schema/apply", post(server_schema_apply)) .route( @@ -1046,10 +1221,28 @@ pub async fn serve(config: ServerConfig) -> Result<()> { let bind = config.bind.clone(); let state = match config.mode { - ServerConfigMode::Single { uri, policy_file } => { + ServerConfigMode::Single { + uri, + graph_id, + policy_file, + queries, + } => { let uri_for_log = uri.clone(); - info!(uri = %uri_for_log, bind = %bind, mode = "single", "serving omnigraph"); - AppState::open_with_bearer_tokens_and_policy(uri, tokens, policy_file.as_ref()).await? + info!( + uri = %uri_for_log, + graph_id = %graph_id, + bind = %bind, + mode = "single", + "serving omnigraph" + ); + AppState::open_single_with_queries_for_graph_id( + uri, + tokens, + policy_file.as_ref(), + queries, + Some(graph_id), + ) + .await? } ServerConfigMode::Multi { graphs, @@ -1131,6 +1324,12 @@ async fn open_single_graph(cfg: GraphStartupConfig) -> Result> .await .map_err(|err| color_eyre::eyre::eyre!("open graph '{}' at {}: {err}", graph_id, uri))?; + // Validate this graph's stored queries against the live schema and + // resolve them to an attachable handle (refuse boot on breakage). + // Done before the policy match rebinds `db`; the catalog handle is an + // owned `Arc`, so no borrow of `db` survives into the match. 
+ let queries = validate_and_attach(cfg.queries, &db.catalog(), graph_id.as_str())?; + let (policy_arc, db) = match &cfg.policy_file { Some(path) => { let policy = PolicyEngine::load_graph(path, graph_id.as_str())?; @@ -1146,6 +1345,7 @@ async fn open_single_graph(cfg: GraphStartupConfig) -> Result> uri, engine: Arc::new(db), policy: policy_arc, + queries, })) } @@ -1479,7 +1679,21 @@ fn log_policy_decision(actor_id: &str, request: &PolicyRequest, decision: &Polic ); } -/// HTTP-layer Cedar policy gate. Two sources of the policy engine: +/// The allow/deny **decision** an authorization check produces, kept +/// separate from the operational failures (`Err`) that can occur while +/// computing it. [`authorize_request`] collapses `Denied` to a 403; a caller +/// that needs to remap a denial without also remapping operational failures +/// (the stored-query invoke handler hides a denial as a 404) matches on this +/// directly, so a real 401 (missing bearer) or 500 (policy-evaluation error) +/// keeps its true status instead of being masked as the denial's response. +enum Authz { + Allowed, + Denied(String), +} + +/// HTTP-layer Cedar policy gate, returning the allow/deny [`Authz`] decision +/// and reserving `Err` for operational failures (401 missing bearer, 500 +/// policy-evaluation error). Two sources of the policy engine: /// * Per-graph handler β€” passes `handle.policy.as_deref()` so the /// graph's Cedar rules govern read/change/branch_*/schema_apply. /// * Management handler β€” passes `state.server_policy.as_deref()` so @@ -1493,11 +1707,11 @@ fn log_policy_decision(actor_id: &str, request: &PolicyRequest, decision: &Polic /// dropped from the type), so handlers cannot smuggle it through the /// request. See `actor_id_resolves_from_bearer_token_ignoring_client_supplied_headers` /// at `tests/server.rs`. 
-fn authorize_request( +fn authorize( actor: Option<&ResolvedActor>, policy: Option<&PolicyEngine>, request: PolicyRequest, -) -> std::result::Result<(), ApiError> { +) -> std::result::Result { let Some(engine) = policy else { // No PolicyEngine installed. Three runtime states can reach this: // @@ -1524,21 +1738,23 @@ fn authorize_request( // operator's only path to enabling it is configuring an // explicit `server.policy.file` in omnigraph.yaml. if request.action.resource_kind() == PolicyResourceKind::Server { - return Err(ApiError::forbidden( + return Ok(Authz::Denied( "server-scoped actions require an explicit `server.policy.file` \ configured in omnigraph.yaml β€” the management surface is closed \ by default in every runtime state, including --unauthenticated, \ - so that server topology is never exposed without operator opt-in.", + so that server topology is never exposed without operator opt-in." + .to_string(), )); } if actor.is_some() && request.action != PolicyAction::Read { - return Err(ApiError::forbidden( + return Ok(Authz::Denied( "server runs in default-deny mode (bearer tokens configured but no \ policy file). Only `read` actions are permitted; configure \ - `policy.file` in omnigraph.yaml to enable other actions.", + `policy.file` in omnigraph.yaml to enable other actions." 
+ .to_string(), )); } - return Ok(()); + return Ok(Authz::Allowed); }; let Some(actor) = actor else { return Err(ApiError::unauthorized("missing bearer token")); @@ -1560,9 +1776,26 @@ fn authorize_request( .map_err(|err| ApiError::internal(format!("policy: {err}")))?; log_policy_decision(actor_id, &request, &decision); if decision.allowed { - Ok(()) + Ok(Authz::Allowed) } else { - Err(ApiError::forbidden(decision.message)) + Ok(Authz::Denied(decision.message)) + } +} + +/// Thin wrapper over [`authorize`] for the handlers that treat any denial as a +/// 403: a denial becomes `ApiError::forbidden`, and operational failures +/// (401 missing bearer, 500 policy-evaluation error) propagate unchanged. The +/// stored-query invoke handler does **not** use this β€” it consumes the +/// [`Authz`] decision directly to hide a denial as a 404 while letting an +/// operational failure keep its true status. +fn authorize_request( + actor: Option<&ResolvedActor>, + policy: Option<&PolicyEngine>, + request: PolicyRequest, +) -> std::result::Result<(), ApiError> { + match authorize(actor, policy, request)? { + Authz::Allowed => Ok(()), + Authz::Denied(message) => Err(ApiError::forbidden(message)), } } @@ -2001,6 +2234,194 @@ async fn server_mutate( )) } +/// Path parameter for `POST /queries/{name}`. 
+#[derive(Deserialize)] +struct QueryNamePath { + name: String, +} + +fn parse_optional_invoke_body( + body: Bytes, +) -> std::result::Result { + if body.is_empty() { + return Ok(InvokeStoredQueryRequest::default()); + } + serde_json::from_slice::>(&body) + .map(|request| request.unwrap_or_default()) + .map_err(|err| { + ApiError::bad_request(format!("invalid stored-query invocation body: {err}")) + }) +} + +#[utoipa::path( + post, + path = "/queries/{name}", + tag = "queries", + operation_id = "invoke_query", + params(("name" = String, Path, description = "Stored query name (the registry key)")), + request_body = Option, + responses( + (status = 200, description = "Read envelope (ReadOutput) or mutation envelope (ChangeOutput), serialized untagged", body = InvokeStoredQueryResponse), + (status = 400, description = "Bad request (param type error; snapshot on a stored mutation)", body = ErrorOutput), + (status = 401, description = "Unauthorized", body = ErrorOutput), + (status = 403, description = "Forbidden (the inner `change` gate for a stored mutation)", body = ErrorOutput), + (status = 404, description = "Unknown stored query, or `invoke_query` denied β€” indistinguishable to a caller without the grant", body = ErrorOutput), + (status = 409, description = "Merge conflict", body = ErrorOutput), + (status = 429, description = "Per-actor admission cap exceeded; honor `Retry-After` header", body = ErrorOutput), + (status = 500, description = "Policy evaluation error (a denial is reported as 404, not 500)", body = ErrorOutput), + ), + security(("bearer_token" = [])), +)] +/// Invoke a curated, server-side stored query by name. +/// +/// The query source comes from the graph's `queries:` registry, not the +/// request body β€” callers send only runtime inputs (`params`, `branch`, +/// `snapshot`). Gated by the `invoke_query` Cedar action at the boundary; +/// a stored *mutation* additionally passes the engine's `change` gate +/// (double-gated). 
An actor **without** `invoke_query` cannot tell a denied +/// query from a missing one β€” both return the same 404, so the catalog +/// can't be probed without the grant. Once `invoke_query` is held, the +/// inner `read`/`change` gate may surface a 403 for an existing query the +/// actor can't run (the intended double-gate signal). +async fn server_invoke_query( + State(state): State, + Extension(handle): Extension>, + actor: Option>, + Path(QueryNamePath { name }): Path, + body: Bytes, +) -> std::result::Result, ApiError> { + let req = parse_optional_invoke_body(body)?; + // A caller without `invoke_query` can't tell a denial from a missing + // query: both 404 with this exact message, so the catalog can't be + // probed without the grant. (A caller that holds invoke_query may still + // see the inner gate's 403 for an existing query it can't run β€” intended.) + const NOT_FOUND: &str = "stored query not found"; + let actor_ref = actor.as_ref().map(|Extension(actor)| actor); + + // Boundary gate (authentication already ran in `require_bearer_auth`). + // A denial is hidden as 404 (deny == missing, so the catalog can't be + // probed without the grant), but operational failures (401 missing bearer, + // 500 policy-evaluation error) propagate with their true status via `?` + // rather than being masked as a missing query. + match authorize( + actor_ref, + handle.policy.as_deref(), + PolicyRequest { + action: PolicyAction::InvokeQuery, + // Graph-scoped: no branch dimension. The per-branch/snapshot + // access is enforced by the inner read/change gate in the + // runner, so the outer gate must not resolve a branch (doing so + // was wrong for snapshot reads). + branch: None, + target_branch: None, + }, + )? { + Authz::Allowed => {} + Authz::Denied(_) => return Err(ApiError::not_found(NOT_FOUND)), + } + + // Resolve against the per-graph registry (same 404 on a miss). 
+ let stored = handle + .queries + .as_ref() + .and_then(|registry| registry.lookup(&name)) + .ok_or_else(|| ApiError::not_found(NOT_FOUND))?; + + // Detach what we need before `handle` moves into the runner β€” the + // registry borrow lives inside `handle`. + let source = Arc::clone(&stored.source); + let query_name = stored.name.clone(); + let is_mutation = stored.is_mutation(); + + info!( + graph = %handle.uri, + actor = ?actor_ref.map(|a| a.actor_id.as_ref()), + query = %query_name, + kind = if is_mutation { "mutate" } else { "read" }, + "stored query invoked" + ); + + if is_mutation { + if req.snapshot.is_some() { + return Err(ApiError::bad_request( + "stored mutation cannot target a snapshot", + )); + } + let branch = req.branch.unwrap_or_else(|| "main".to_string()); + let output = run_mutate( + state, + handle, + actor_ref, + &source, + Some(&query_name), + req.params.as_ref(), + branch, + ) + .await?; + Ok(Json(InvokeStoredQueryResponse::Change(output))) + } else { + let (selected, target, result) = run_query( + handle, + actor_ref, + &source, + Some(&query_name), + req.params.as_ref(), + req.branch, + req.snapshot, + true, + ) + .await?; + Ok(Json(InvokeStoredQueryResponse::Read(api::read_output( + selected, &target, result, + )))) + } +} + +#[utoipa::path( + get, + path = "/queries", + tag = "queries", + operation_id = "list_queries", + responses( + (status = 200, description = "Stored-query catalog (the mcp.expose subset, with typed params)", body = QueriesCatalogOutput), + (status = 401, description = "Unauthorized", body = ErrorOutput), + (status = 403, description = "Forbidden", body = ErrorOutput), + ), + security(("bearer_token" = [])), +)] +/// List the graph's exposed stored queries as a typed tool catalog. 
+/// +/// Returns the `mcp.expose == true` subset of the `queries:` registry, each +/// with its MCP tool name, read/mutate flag, description/instruction, and +/// typed parameters β€” enough for a client to register them as tools without +/// fetching `.gq` source. Read-gated; the catalog is graph-wide (branch +/// independent β€” `read` is authorized against `main`). **Not** Cedar-filtered +/// per query yet, so it can list a query whose `invoke_query` the caller +/// lacks (a known gap until per-query authorization lands). +async fn server_list_queries( + Extension(handle): Extension>, + actor: Option>, +) -> std::result::Result, ApiError> { + authorize_request( + actor.as_ref().map(|Extension(actor)| actor), + handle.policy.as_deref(), + PolicyRequest { + action: PolicyAction::Read, + branch: Some("main".to_string()), + target_branch: None, + }, + )?; + let queries = match handle.queries.as_ref() { + Some(registry) => registry + .iter() + .filter(|q| q.expose) + .map(api::query_catalog_entry) + .collect(), + None => Vec::new(), + }; + Ok(Json(QueriesCatalogOutput { queries })) +} + #[utoipa::path( get, path = "/schema", @@ -2088,18 +2509,26 @@ async fn server_schema_apply( .map_err(ApiError::from_workload_reject)?; let result = { let db = &handle.engine; + let registry = handle.queries.as_deref(); + let label = handle.key.graph_id.as_str().to_string(); // Engine-layer policy enforcement (MR-722): pass the resolved // actor through so apply_schema_as can call enforce() with the // authoritative identity. With a policy installed in AppState, // engine-side enforcement re-checks the same decision the // HTTP-layer authorize_request just made above. PR #3 collapses // the redundancy. 
- db.apply_schema_as( + db.apply_schema_as_with_catalog_check( &request.schema_source, omnigraph::db::SchemaApplyOptions { allow_data_loss: request.allow_data_loss, }, actor_id, + |catalog| { + if let Some(registry) = registry { + validate_registry_against_catalog(registry, catalog, &label)?; + } + Ok(()) + }, ) .await .map_err(ApiError::from_omni)? @@ -2658,12 +3087,133 @@ mod tests { use std::fs; use tempfile::tempdir; + /// `authorize` returns the allow/deny **decision** (`Authz`) and reserves + /// `Err` for operational failures, so the invoke handler can hide a denial + /// as 404 without also masking a 401/500. Pins each outcome. + #[test] + fn authorize_splits_decision_from_operational_error() { + use super::{Authz, PolicyAction, PolicyCompiler, PolicyConfig, PolicyRequest, ResolvedActor, authorize}; + use std::sync::Arc; + + fn req(action: PolicyAction) -> PolicyRequest { + PolicyRequest { action, branch: None, target_branch: None } + } + let actor = ResolvedActor::cluster_static(Arc::from("act-alice")); + + // --- No policy engine installed (open / default-deny modes) --- + // A server-scoped action is denied in every no-policy state. + assert!(matches!( + authorize(Some(&actor), None, req(PolicyAction::GraphList)).unwrap(), + Authz::Denied(_) + )); + // Authenticated actor + a non-read per-graph action β†’ default-deny. + assert!(matches!( + authorize(Some(&actor), None, req(PolicyAction::Change)).unwrap(), + Authz::Denied(_) + )); + // `read` is the one per-graph action permitted without a policy. + assert!(matches!( + authorize(Some(&actor), None, req(PolicyAction::Read)).unwrap(), + Authz::Allowed + )); + // Open mode (no actor, no policy) β†’ allowed. 
+ assert!(matches!( + authorize(None, None, req(PolicyAction::Read)).unwrap(), + Authz::Allowed + )); + + // --- Policy engine installed --- + let policy: PolicyConfig = serde_yaml::from_str( + "version: 1\n\ + groups:\n team: [act-alice]\n\ + rules:\n - id: team-read\n allow:\n actors: { group: team }\n actions: [read]\n branch_scope: any\n", + ) + .unwrap(); + let engine = PolicyCompiler::compile(&policy, "graph").unwrap(); + + // A matched allow rule β†’ Allowed. + assert!(matches!( + authorize( + Some(&actor), + Some(&engine), + PolicyRequest { action: PolicyAction::Read, branch: Some("main".to_string()), target_branch: None }, + ) + .unwrap(), + Authz::Allowed + )); + // Known actor, no matching allow rule β†’ Denied, carrying the decision message. + match authorize( + Some(&actor), + Some(&engine), + PolicyRequest { action: PolicyAction::Change, branch: Some("main".to_string()), target_branch: None }, + ) + .unwrap() + { + Authz::Denied(message) => assert!(!message.is_empty(), "a deny carries its decision message"), + Authz::Allowed => panic!("change must be denied: only read is allowed"), + } + // Policy installed but no actor β†’ operational failure (`Err`), NOT a + // decision. This is the split that keeps a 401/500 from being masked + // as the denial's response in the invoke handler. + assert!( + authorize(None, Some(&engine), req(PolicyAction::Read)).is_err(), + "a missing actor with a policy installed is an operational error, not a deny" + ); + } + #[test] fn hash_bearer_token_produces_32_byte_output() { let hash = hash_bearer_token("any-token"); assert_eq!(hash.len(), 32); } + /// The single gate both open paths funnel through: it refuses a + /// schema breakage (naming the graph label + query), attaches a clean + /// registry, and collapses an empty one to `None`. Pure over its args + /// (no engine), so it covers the multi-graph path's logic too β€” the + /// only per-path difference is the `label`, asserted here. 
+ #[test] + fn validate_and_attach_gates_on_schema_and_collapses_empty() { + use crate::queries::{QueryRegistry, RegistrySpec}; + use omnigraph_compiler::catalog::build_catalog; + use omnigraph_compiler::schema::parser::parse_schema; + + let schema = parse_schema("node User {\nname: String\n}\n").unwrap(); + let catalog = build_catalog(&schema).unwrap(); + let spec = |name: &str, source: &str| RegistrySpec { + name: name.to_string(), + source: source.to_string(), + expose: false, + tool_name: None, + }; + + // Empty registry β†’ nothing attached, no error. + let empty = + super::validate_and_attach(QueryRegistry::default(), &catalog, "g").unwrap(); + assert!(empty.is_none()); + + // A query that type-checks β†’ attached. + let ok = QueryRegistry::from_specs(vec![spec( + "find_user", + "query find_user() { match { $u: User } return { $u.name } }", + )]) + .unwrap(); + assert!(super::validate_and_attach(ok, &catalog, "g").unwrap().is_some()); + + // A query referencing a type the schema lacks β†’ boot refusal that + // names both the graph label and the offending query. + let broken = QueryRegistry::from_specs(vec![spec( + "ghost", + "query ghost() { match { $w: Widget } return { $w.name } }", + )]) + .unwrap(); + let err = super::validate_and_attach(broken, &catalog, "graph-x").unwrap_err(); + let msg = err.to_string(); + assert!(msg.contains("graph-x"), "labels the graph: {msg}"); + assert!(msg.contains("ghost"), "names the query: {msg}"); + assert!(msg.contains("schema check"), "mentions the schema check: {msg}"); + } + #[test] fn hash_bearer_token_is_deterministic() { assert_eq!( @@ -2707,7 +3257,10 @@ server: let settings = load_server_settings(Some(&config), None, None, None, false).unwrap(); match &settings.mode { - ServerConfigMode::Single { uri, .. } => assert_eq!(uri, "/tmp/demo.omni"), + ServerConfigMode::Single { uri, graph_id, .. } => { + assert_eq!(uri, "/tmp/demo.omni"); + assert_eq!(graph_id, "local"); + } ServerConfigMode::Multi { .. 
} => panic!("expected Single mode, got Multi"), } assert_eq!(settings.bind, "0.0.0.0:9090"); @@ -2739,7 +3292,10 @@ server: ) .unwrap(); match &settings.mode { - ServerConfigMode::Single { uri, .. } => assert_eq!(uri, "/tmp/override.omni"), + ServerConfigMode::Single { uri, graph_id, .. } => { + assert_eq!(uri, "/tmp/override.omni"); + assert_eq!(graph_id, "/tmp/override.omni"); + } ServerConfigMode::Multi { .. } => panic!("expected Single mode, got Multi"), } assert_eq!(settings.bind, "0.0.0.0:9999"); @@ -2768,7 +3324,10 @@ server: load_server_settings(Some(&config), None, Some("dev".to_string()), None, false) .unwrap(); match &settings.mode { - ServerConfigMode::Single { uri, .. } => assert_eq!(uri, "http://127.0.0.1:8080"), + ServerConfigMode::Single { uri, graph_id, .. } => { + assert_eq!(uri, "http://127.0.0.1:8080"); + assert_eq!(graph_id, "dev"); + } ServerConfigMode::Multi { .. } => panic!("expected Single mode, got Multi"), } } @@ -2848,6 +3407,7 @@ server: .to_string_lossy() .into_owned(), policy_file: None, + queries: crate::queries::QueryRegistry::default(), }], config_path: temp.path().join("omnigraph.yaml"), server_policy_file: Some(policy_path), @@ -2895,7 +3455,9 @@ server: .join("graph.omni") .to_string_lossy() .into_owned(), + graph_id: "default".to_string(), policy_file: None, + queries: crate::queries::QueryRegistry::default(), }, bind: "127.0.0.1:0".to_string(), allow_unauthenticated: false, diff --git a/crates/omnigraph-server/src/queries.rs b/crates/omnigraph-server/src/queries.rs new file mode 100644 index 0000000..bf131c8 --- /dev/null +++ b/crates/omnigraph-server/src/queries.rs @@ -0,0 +1,688 @@ +//! Stored-query registry. +//! +//! A server-side registry of named, parameter-typed `.gq` queries that +//! operators declare in `omnigraph.yaml` (per-graph, or top-level in +//! single mode) and the server loads at startup. Each entry is parsed +//! and its identity asserted here (`load`); type-checking against the +//! 
live schema happens separately (a `check` pass) so the loader stays +//! callable without an open engine (the CLI's offline `queries check`). +//! +//! Identity is the query **name**: the manifest key must equal the +//! `query ` symbol declared in the referenced `.gq` file. The two +//! are asserted equal at load β€” one name, two places that must agree. +//! Renaming either is a breaking change to callers, by design. + +use std::collections::BTreeMap; +use std::fs; +use std::sync::Arc; + +use omnigraph_compiler::catalog::Catalog; +use omnigraph_compiler::query::ast::QueryDecl; +use omnigraph_compiler::query::parser::parse_query; +use omnigraph_compiler::query::typecheck::typecheck_query_decl; +use omnigraph_compiler::types::{PropType, ScalarType}; + +use crate::config::{OmnigraphConfig, QueryEntry}; + +/// One loaded stored query. `source` is the full `.gq` file text β€” the +/// invocation handler hands it to `run_query` / `run_mutate` verbatim, +/// which reuse the same parse/IR/exec path as the inline routes (no +/// parallel implementation). +#[derive(Debug, Clone)] +pub struct StoredQuery { + /// Identity: manifest key == `query ` symbol. + pub name: String, + /// Full `.gq` source text the query was selected from. + pub source: Arc, + /// Parsed declaration (params, mutations, description, …). + pub decl: QueryDecl, + /// Whether this query is listed in the MCP tool catalog (`GET /queries`). + /// Default `true` (the manifest entry is the opt-in); `expose: false` + /// keeps it HTTP/service-callable but hidden from the agent tool list. + /// Catalog membership only β€” not an authorization gate. + pub expose: bool, + /// Optional MCP tool-name override; defaults to `name`. + pub tool_name: Option, +} + +impl StoredQuery { + /// `true` if the selected declaration contains insert/update/delete + /// statements β€” drives read-vs-mutate routing at invocation time. 
+ pub fn is_mutation(&self) -> bool { + !self.decl.mutations.is_empty() + } + + /// The MCP tool name this query is catalogued under: the explicit + /// `tool_name` override, else the query `name`. The catalog key — + /// enforced unique across exposed queries at load. Server-side + /// consumers (the uniqueness check, the future catalog projection) read + /// this; the CLI `queries list` resolves the same rule on its own DTO. + pub fn effective_tool_name(&self) -> &str { + self.tool_name.as_deref().unwrap_or(&self.name) + } +} + +/// A loaded, identity-checked stored-query registry for one graph. +#[derive(Debug, Clone, Default)] +pub struct QueryRegistry { + by_name: BTreeMap<String, StoredQuery>, +} + +/// In-memory registry entry before file I/O. Used by [`QueryRegistry::load`] +/// (after reading each `.gq` from disk) and directly by tests. +#[derive(Debug, Clone)] +pub struct RegistrySpec { + pub name: String, + pub source: String, + pub expose: bool, + pub tool_name: Option<String>, +} + +/// A single registry load failure. Collected (not fail-fast) so a bad +/// `omnigraph.yaml` surfaces every broken entry at once, matching the +/// bad-policy-YAML posture. +#[derive(Debug, Clone)] +pub struct LoadError { + /// The offending query name, when the failure is entry-scoped. + pub query: Option<String>, + pub message: String, +} + +impl std::fmt::Display for LoadError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match &self.query { + Some(name) => write!(f, "stored query '{name}': {}", self.message), + None => write!(f, "stored query registry: {}", self.message), + } + } +} + +impl QueryRegistry { + /// Build a registry from in-memory specs: parse each source, select + /// the declaration whose symbol equals the manifest key, and assert + /// they agree. Collects every failure. No schema type-checking here + /// — that is [`check`].
+ pub fn from_specs(specs: Vec) -> Result> { + let mut by_name = BTreeMap::new(); + let mut errors = Vec::new(); + + for spec in specs { + match parse_query(&spec.source) { + Ok(file) => { + match file.queries.into_iter().find(|q| q.name == spec.name) { + Some(decl) => { + by_name.insert( + spec.name.clone(), + StoredQuery { + name: spec.name, + source: Arc::from(spec.source), + decl, + expose: spec.expose, + tool_name: spec.tool_name, + }, + ); + } + None => errors.push(LoadError { + query: Some(spec.name.clone()), + message: format!( + "no `query {}` declaration found in its `.gq` file \ + (the registry key must match the query symbol)", + spec.name + ), + }), + } + } + Err(err) => errors.push(LoadError { + query: Some(spec.name), + message: format!("parse error: {err}"), + }), + } + } + + // Exposed queries are catalogued under their effective tool name; + // two claiming one name is an MCP-namespace collision. Refuse it at + // load (collected, not fail-fast), naming the loser and the winner. + // Iterating the `BTreeMap` makes the winner deterministic (the + // lexicographically-first query name; config is a map, so YAML + // declaration order isn't preserved anyway) and the error order + // stable. Scoped to a block so these borrows of `by_name` end + // before it is moved into `Self`. + { + let mut claimed: BTreeMap<&str, &str> = BTreeMap::new(); + for query in by_name.values().filter(|q| q.expose) { + let tool = query.effective_tool_name(); + if let Some(winner) = claimed.insert(tool, &query.name) { + errors.push(LoadError { + query: Some(query.name.clone()), + message: format!( + "MCP tool name '{tool}' already claimed by exposed query '{winner}'" + ), + }); + } + } + } + + if errors.is_empty() { + Ok(Self { by_name }) + } else { + Err(errors) + } + } + + /// Read each registry entry's `.gq` file from disk and build the + /// registry. 
`entries` is either the top-level `queries` map (single + /// mode) or a graph's `queries` map (multi mode); `config` resolves + /// each entry's relative `file:` path against `base_dir`. + pub fn load( + config: &OmnigraphConfig, + entries: &BTreeMap<String, QueryEntry>, + ) -> Result<Self, Vec<LoadError>> { + let mut specs = Vec::with_capacity(entries.len()); + let mut errors = Vec::new(); + for (name, entry) in entries { + let path = config.resolve_query_file(&entry.file); + match fs::read_to_string(&path) { + Ok(source) => specs.push(RegistrySpec { + name: name.clone(), + source, + expose: entry.mcp.expose, + tool_name: entry.mcp.tool_name.clone(), + }), + Err(err) => errors.push(LoadError { + query: Some(name.clone()), + message: format!("cannot read '{}': {err}", path.display()), + }), + } + } + + // Parse/identity/uniqueness-check the readable specs even when some + // files failed to read, so every broken entry (I/O, parse, identity, + // tool-name collision) surfaces in one pass rather than one per + // restart. I/O errors come first (in `entries` key order), then the + // spec errors. A non-empty `errors` always fails the load. + match Self::from_specs(specs) { + Ok(registry) if errors.is_empty() => Ok(registry), + Ok(_) => Err(errors), + Err(spec_errors) => { + errors.extend(spec_errors); + Err(errors) + } + } + } + + pub fn lookup(&self, name: &str) -> Option<&StoredQuery> { + self.by_name.get(name) + } + + pub fn iter(&self) -> impl Iterator<Item = &StoredQuery> { + self.by_name.values() + } + + pub fn is_empty(&self) -> bool { + self.by_name.is_empty() + } + + pub fn len(&self) -> usize { + self.by_name.len() + } +} + +/// A stored query that fails to type-check against the live schema — +/// e.g. it references a node/edge type or property that was renamed or +/// removed by a migration. Breakages **block server boot** (same posture +/// as bad policy YAML), surfacing schema drift at the deploy boundary +/// rather than silently at invocation time.
+#[derive(Debug, Clone)] +pub struct Breakage { + pub query: String, + pub message: String, +} + +/// A non-blocking advisory found during validation. Logged at boot; +/// never blocks startup. Currently: an MCP-exposed query that declares a +/// parameter an agent cannot realistically supply. +#[derive(Debug, Clone)] +pub struct Warning { + pub query: String, + pub message: String, +} + +/// Outcome of validating a registry against a schema. Breakages are +/// fatal (boot refuses); warnings are advisory. +#[derive(Debug, Clone, Default)] +pub struct CheckReport { + pub breakages: Vec<Breakage>, + pub warnings: Vec<Warning>, +} + +impl CheckReport { + pub fn has_breakages(&self) -> bool { + !self.breakages.is_empty() + } + + pub fn is_clean(&self) -> bool { + self.breakages.is_empty() && self.warnings.is_empty() + } +} + +/// Validate a loaded registry against the live schema. +/// +/// Pure over `(registry, catalog)` — takes an already-parsed registry and +/// a catalog, so it is callable both at server boot (with the engine's +/// `catalog()`) and offline from the CLI (`omnigraph queries check`), +/// without coupling to server config or an open engine connection. +/// +/// Every query is type-checked via the same `typecheck_query_decl` the +/// engine runs for inline queries — no parallel implementation. Failures +/// are **collected, not fail-fast**, so an operator sees every broken +/// query in one pass. +/// +/// Advisory lint (warn, never block): an `mcp.expose: true` query that +/// declares a `Vector(N)` parameter. An LLM cannot supply a raw embedding +/// vector; such a query should take a `String` parameter and let the +/// engine embed it server-side at query time. Service-to-service callers +/// may legitimately pass vectors, so this warns rather than rejects.
+pub fn check(registry: &QueryRegistry, catalog: &Catalog) -> CheckReport { + let mut report = CheckReport::default(); + for query in registry.iter() { + if let Err(err) = typecheck_query_decl(catalog, &query.decl) { + report.breakages.push(Breakage { + query: query.name.clone(), + message: err.to_string(), + }); + } + if query.expose { + for param in &query.decl.params { + // Resolve to the structured type via the compiler's own + // resolver rather than string-matching `Vector(` — one + // canonical definition of "is a vector", so this lint can't + // drift from how the parser/type system spells the type. + let is_vector = PropType::from_param_type_name(&param.type_name, param.nullable) + .is_some_and(|pt| matches!(pt.scalar, ScalarType::Vector(_))); + if is_vector { + report.warnings.push(Warning { + query: query.name.clone(), + message: format!( + "MCP-exposed query declares a `{}` parameter `${}` that agents \ + cannot supply; use a `String` parameter for server-side embedding", + param.type_name, param.name + ), + }); + } + } + } + } + report +} + +/// Format every breakage in a registry check report into a multi-line +/// operator-facing message, naming each offending query.
+pub fn format_check_breakages(label: &str, report: &CheckReport) -> String { + let joined = report + .breakages + .iter() + .map(|b| format!("query '{}': {}", b.query, b.message)) + .collect::<Vec<_>>() + .join("\n "); + format!( + "graph '{label}': {} stored quer{} failed the schema check:\n {joined}", + report.breakages.len(), + if report.breakages.len() == 1 { + "y" + } else { + "ies" + } + ) +} + +#[cfg(test)] +mod tests { + use super::*; + + fn spec(name: &str, source: &str, expose: bool) -> RegistrySpec { + RegistrySpec { + name: name.to_string(), + source: source.to_string(), + expose, + tool_name: None, + } + } + + fn spec_tool(name: &str, source: &str, expose: bool, tool_name: &str) -> RegistrySpec { + RegistrySpec { + name: name.to_string(), + source: source.to_string(), + expose, + tool_name: Some(tool_name.to_string()), + } + } + + #[test] + fn key_equal_symbol_loads() { + let reg = QueryRegistry::from_specs(vec![spec( + "find_user", + "query find_user($id: String) { match { $u: User } return { $u.name } }", + true, + )]) + .unwrap(); + let q = reg.lookup("find_user").unwrap(); + assert_eq!(q.name, "find_user"); + assert!(q.expose); + assert_eq!(q.decl.params.len(), 1); + assert!(!q.is_mutation()); + // No override → the effective tool name is the query name. + assert_eq!(q.effective_tool_name(), "find_user"); + + // An explicit override is what the catalog keys on. + let with_tool = QueryRegistry::from_specs(vec![spec_tool( + "find_user", + "query find_user($id: String) { match { $u: User } return { $u.name } }", + true, + "lookup_user", + )]) + .unwrap(); + assert_eq!( + with_tool.lookup("find_user").unwrap().effective_tool_name(), + "lookup_user" + ); + } + + #[test] + fn key_mismatch_is_an_identity_error() { + let errors = QueryRegistry::from_specs(vec![spec( + "find_user", + // symbol is `lookup`, key is `find_user` — must be rejected.
+ "query lookup($id: String) { match { $u: User } return { $u.name } }", + false, + )]) + .unwrap_err(); + assert_eq!(errors.len(), 1); + assert_eq!(errors[0].query.as_deref(), Some("find_user")); + assert!(errors[0].message.contains("must match the query symbol")); + } + + #[test] + fn multi_query_file_selects_the_matching_symbol() { + let source = "query a($x: I64) { match { $u: User } return { $u.name } }\n\ + query b($y: String) { match { $u: User } return { $u.name } }"; + let reg = QueryRegistry::from_specs(vec![spec("b", source, false)]).unwrap(); + let q = reg.lookup("b").unwrap(); + assert_eq!(q.name, "b"); + assert_eq!(q.decl.params[0].name, "y"); + assert!(reg.lookup("a").is_none(), "only the selected symbol is registered"); + } + + #[test] + fn duplicate_exposed_tool_name_is_a_load_error() { + // Two MCP-exposed queries claiming one tool name is an ambiguity in + // the catalog key space β€” refused at load, naming both queries and + // the contested tool. + let errors = QueryRegistry::from_specs(vec![ + spec_tool("a", "query a() { match { $u: User } return { $u.name } }", true, "dup"), + spec_tool("b", "query b() { match { $u: User } return { $u.name } }", true, "dup"), + ]) + .unwrap_err(); + assert_eq!(errors.len(), 1); + let msg = errors[0].to_string(); + assert!(msg.contains("'dup'"), "names the contested tool: {msg}"); + assert!(msg.contains("'a'"), "names the winning query: {msg}"); + assert!(msg.contains("'b'"), "names the losing query: {msg}"); + } + + #[test] + fn duplicate_tool_name_among_unexposed_is_allowed() { + // Unexposed queries have no MCP tool, so a shared effective tool + // name is inert β€” must not error (pins the exposed-only scope). 
+ let reg = QueryRegistry::from_specs(vec![ + spec_tool("a", "query a() { match { $u: User } return { $u.name } }", false, "dup"), + spec_tool("b", "query b() { match { $u: User } return { $u.name } }", false, "dup"), + ]) + .unwrap(); + assert_eq!(reg.len(), 2); + } + + #[test] + fn parse_error_surfaces_per_entry() { + let errors = + QueryRegistry::from_specs(vec![spec("broken", "query broken( {{ not valid", false)]) + .unwrap_err(); + assert_eq!(errors[0].query.as_deref(), Some("broken")); + assert!(errors[0].message.contains("parse error")); + } + + #[test] + fn errors_collect_rather_than_fail_fast() { + let errors = QueryRegistry::from_specs(vec![ + spec("good", "query good() { match { $u: User } return { $u.name } }", false), + spec("mismatch", "query other() { match { $u: User } return { $u.name } }", false), + spec("broken", "query broken(", false), + ]) + .unwrap_err(); + // `good` loads cleanly; only the mismatch and the parse error are + // reported, and both surface in one pass (not fail-fast). + assert_eq!(errors.len(), 2); + } + + #[test] + fn mutation_body_classifies_as_mutation() { + let reg = QueryRegistry::from_specs(vec![spec( + "add_user", + "query add_user($name: String) { insert User { name: $name } }", + false, + )]) + .unwrap(); + assert!(reg.lookup("add_user").unwrap().is_mutation()); + } + + // --- check(registry, catalog) --- + + use omnigraph_compiler::catalog::build_catalog; + use omnigraph_compiler::schema::parser::parse_schema; + + fn test_catalog() -> Catalog { + let schema = parse_schema( + r#" +node User { +name: String +age: I32? 
+embedding: Vector(4) +} +"#, + ) + .unwrap(); + build_catalog(&schema).unwrap() + } + + #[test] + fn check_passes_for_valid_query() { + let reg = QueryRegistry::from_specs(vec![spec( + "find_user", + "query find_user($name: String) { match { $u: User { name: $name } } return { $u.age } }", + false, + )]) + .unwrap(); + let report = check(&reg, &test_catalog()); + assert!(report.is_clean(), "unexpected: {:?}", report); + } + + #[test] + fn check_reports_unknown_type_as_breakage() { + let reg = QueryRegistry::from_specs(vec![spec( + "ghost", + // `Widget` is not in the schema. + "query ghost() { match { $w: Widget } return { $w.name } }", + false, + )]) + .unwrap(); + let report = check(&reg, &test_catalog()); + assert!(report.has_breakages()); + assert_eq!(report.breakages[0].query, "ghost"); + } + + #[test] + fn check_reports_unknown_property_as_breakage() { + let reg = QueryRegistry::from_specs(vec![spec( + "bad_prop", + // `User` exists but has no `nickname`. + "query bad_prop() { match { $u: User } return { $u.nickname } }", + false, + )]) + .unwrap(); + let report = check(&reg, &test_catalog()); + assert!(report.has_breakages()); + assert_eq!(report.breakages[0].query, "bad_prop"); + } + + #[test] + fn check_collects_every_breakage_not_fail_fast() { + let reg = QueryRegistry::from_specs(vec![ + spec("a", "query a() { match { $w: Widget } return { $w.x } }", false), + spec("b", "query b() { match { $g: Gadget } return { $g.y } }", false), + spec( + "ok", + "query ok() { match { $u: User } return { $u.name } }", + false, + ), + ]) + .unwrap(); + let report = check(&reg, &test_catalog()); + assert_eq!(report.breakages.len(), 2, "both bad queries reported: {:?}", report); + } + + #[test] + fn vector_param_on_exposed_query_warns() { + let reg = QueryRegistry::from_specs(vec![spec( + "vec_search", + "query vec_search($q: Vector(4)) { match { $u: User } return { $u.name } \ + order { nearest($u.embedding, $q) } limit 3 }", + true, // mcp.expose + )]) + .unwrap(); + let report = 
check(&reg, &test_catalog()); + assert!(!report.has_breakages(), "valid query: {:?}", report); + assert_eq!(report.warnings.len(), 1); + assert_eq!(report.warnings[0].query, "vec_search"); + } + + #[test] + fn vector_param_on_unexposed_query_is_silent() { + let reg = QueryRegistry::from_specs(vec![spec( + "vec_search", + "query vec_search($q: Vector(4)) { match { $u: User } return { $u.name } \ + order { nearest($u.embedding, $q) } limit 3 }", + false, // not exposed — vector param is fine for service-to-service callers + )]) + .unwrap(); + let report = check(&reg, &test_catalog()); + assert!(report.is_clean(), "unexpected: {:?}", report); + } + + #[test] + fn non_vector_param_on_exposed_query_does_not_warn() { + // The recommended `String` alternative on an exposed query does not + // resolve to a Vector, so the embedding advisory stays silent. Guards + // the structured type check against a false positive (and pins that + // only `Vector(_)` triggers the warning). + let reg = QueryRegistry::from_specs(vec![spec( + "search", + "query search($name: String) { match { $u: User { name: $name } } return { $u.name } }", + true, + )]) + .unwrap(); + let report = check(&reg, &test_catalog()); + assert!(report.is_clean(), "no breakage or warning expected: {:?}", report); + } + + // --- catalog projection (api::query_catalog_entry) --- + + #[test] + fn catalog_entry_projects_every_param_kind() { + use crate::api::{self, ParamKind}; + let reg = QueryRegistry::from_specs(vec![spec_tool( + "all_types", + "query all_types($s: String, $i: I32, $big: I64, $u: U64, $f: F64, $b: Bool, \ + $d: Date, $dt: DateTime, $blob: Blob, $opt: String?, $list: [I32], $vec: Vector(4)) \ + { match { $x: User } return { $x.name } }", + true, + "all", + )]) + .unwrap(); + let entry = api::query_catalog_entry(reg.lookup("all_types").unwrap()); + assert_eq!(entry.name, "all_types"); + assert_eq!(entry.tool_name, "all"); + assert!(!entry.mutation); + + let by: std::collections::HashMap<_, _> = + 
entry.params.iter().map(|p| (p.name.as_str(), p)).collect(); + assert_eq!(by["s"].kind, ParamKind::String); + assert_eq!(by["i"].kind, ParamKind::Int); + assert_eq!(by["big"].kind, ParamKind::BigInt, "I64 β†’ bigint (string on the wire)"); + assert_eq!(by["u"].kind, ParamKind::BigInt, "U64 β†’ bigint"); + assert_eq!(by["f"].kind, ParamKind::Float); + assert_eq!(by["b"].kind, ParamKind::Bool); + assert_eq!(by["d"].kind, ParamKind::Date); + assert_eq!(by["dt"].kind, ParamKind::DateTime); + assert_eq!(by["blob"].kind, ParamKind::Blob); + assert!(!by["s"].nullable); + assert!(by["opt"].nullable, "String? β†’ nullable"); + assert_eq!(by["list"].kind, ParamKind::List); + assert_eq!(by["list"].item_kind, Some(ParamKind::Int), "[I32] β†’ list of int"); + assert_eq!(by["vec"].kind, ParamKind::Vector); + assert_eq!(by["vec"].vector_dim, Some(4)); + } + + #[test] + fn catalog_entry_flags_mutation_and_empty_params() { + use crate::api; + let reg = QueryRegistry::from_specs(vec![spec( + "add_user", + "query add_user($name: String) { insert User { name: $name } }", + true, + )]) + .unwrap(); + let entry = api::query_catalog_entry(reg.lookup("add_user").unwrap()); + assert!(entry.mutation, "insert body β†’ mutation flag"); + + let reg2 = QueryRegistry::from_specs(vec![spec( + "no_params", + "query no_params() { match { $u: User } return { $u.name } }", + true, + )]) + .unwrap(); + let entry2 = api::query_catalog_entry(reg2.lookup("no_params").unwrap()); + assert!(entry2.params.is_empty(), "no declared params β†’ empty list"); + } + + // --- load() error collection (file I/O + parse in one pass) --- + + #[test] + fn load_collects_io_and_parse_errors_in_one_pass() { + use crate::config::load_config; + let temp = tempfile::tempdir().unwrap(); + std::fs::write( + temp.path().join("good.gq"), + "query good() { match { $u: User } return { $u.name } }", + ) + .unwrap(); + std::fs::write(temp.path().join("broken.gq"), "query broken( {{ not valid").unwrap(); + // `missing.gq` is 
deliberately not written (an I/O failure). + std::fs::write( + temp.path().join("omnigraph.yaml"), + "queries:\n good:\n file: ./good.gq\n \ + missing:\n file: ./missing.gq\n broken:\n file: ./broken.gq\n", + ) + .unwrap(); + let config = load_config(Some(&temp.path().join("omnigraph.yaml"))).unwrap(); + + let errors = QueryRegistry::load(&config, config.query_entries()).unwrap_err(); + let joined = errors.iter().map(|e| e.to_string()).collect::>().join("\n"); + // Both the missing file AND the parse error surface in one pass β€” + // the I/O failure must not mask the parse failure. + assert!(joined.contains("missing"), "I/O error must surface: {joined}"); + assert!( + joined.contains("broken") && joined.contains("parse error"), + "the parse error in a readable file must surface in the same pass: {joined}" + ); + assert!(!joined.contains("'good'"), "the valid entry is not an error: {joined}"); + } +} diff --git a/crates/omnigraph-server/src/registry.rs b/crates/omnigraph-server/src/registry.rs index 5897ad1..54115e4 100644 --- a/crates/omnigraph-server/src/registry.rs +++ b/crates/omnigraph-server/src/registry.rs @@ -29,6 +29,7 @@ use tokio::sync::Mutex; use crate::identity::GraphKey; use crate::policy::PolicyEngine; +use crate::queries::QueryRegistry; /// Open handle for a single graph in the registry. Cheap to clone (`Arc`-wrapped /// engine + policy). Cluster-mode handlers extract this via @@ -47,6 +48,11 @@ pub struct GraphHandle { /// `_as` writers"; the HTTP-layer `require_bearer_auth` middleware still /// runs regardless. pub policy: Option>, + /// Per-graph stored-query registry, loaded and validated at + /// startup. `None` means the operator declared no stored queries for + /// this graph β€” `POST /queries/{name}` then 404s. Mirrors the + /// optional `policy` shape. + pub queries: Option>, } /// Immutable snapshot of the registry's current state. 
Replaced atomically @@ -245,6 +251,7 @@ fn canonicalize_handle_uri( uri: canonical_uri.clone(), engine: Arc::clone(&handle.engine), policy: handle.policy.clone(), + queries: handle.queries.clone(), }); Ok((canonical_uri, canonical_handle)) } @@ -276,6 +283,7 @@ mod tests { uri: graph_uri, engine: Arc::new(engine), policy: None, + queries: None, }) } @@ -340,12 +348,14 @@ mod tests { uri: shared_uri.clone(), engine: Arc::clone(&engine), policy: None, + queries: None, }); let h2 = Arc::new(GraphHandle { key: GraphKey::cluster(GraphId::try_from("beta").unwrap()), uri: shared_uri, engine, policy: None, + queries: None, }); let registry = GraphRegistry::new(); @@ -411,12 +421,14 @@ mod tests { uri: shared_uri.clone(), engine: Arc::clone(&engine), policy: None, + queries: None, }); let h2 = Arc::new(GraphHandle { key: GraphKey::cluster(GraphId::try_from("beta").unwrap()), uri: shared_uri, engine, policy: None, + queries: None, }); let err = match GraphRegistry::from_handles(vec![h1, h2]) { Ok(_) => panic!("expected DuplicateUri, got Ok"), diff --git a/crates/omnigraph-server/tests/openapi.rs b/crates/omnigraph-server/tests/openapi.rs index a2542db..3d13e74 100644 --- a/crates/omnigraph-server/tests/openapi.rs +++ b/crates/omnigraph-server/tests/openapi.rs @@ -168,6 +168,8 @@ const EXPECTED_PATHS: &[&str] = &[ "/export", "/change", "/mutate", + "/queries", + "/queries/{name}", "/schema", "/schema/apply", "/ingest", @@ -701,6 +703,8 @@ fn protected_endpoints_reference_bearer_token_security() { ("/read", "post"), ("/change", "post"), ("/schema/apply", "post"), + ("/queries", "get"), + ("/queries/{name}", "post"), ("/ingest", "post"), ("/export", "post"), ("/snapshot", "get"), @@ -913,6 +917,34 @@ fn post_endpoints_have_request_body() { } } +#[test] +fn invoke_stored_query_request_body_is_optional() { + let doc = openapi_json(); + let request_body = &doc["paths"]["/queries/{name}"]["post"]["requestBody"]; + assert!( + request_body.is_object(), + "POST /queries/{{name}} 
should document its optional request body" + ); + assert_eq!( + request_body["required"].as_bool().unwrap_or(false), + false, + "stored-query invocation body should be optional" + ); + let schema = &request_body["content"]["application/json"]["schema"]; + let ref_path = schema["$ref"] + .as_str() + .or_else(|| { + schema["oneOf"] + .as_array() + .and_then(|schemas| schemas.iter().find_map(|schema| schema["$ref"].as_str())) + }) + .unwrap(); + assert!( + ref_path.contains("InvokeStoredQueryRequest"), + "POST /queries/{{name}} requestBody should reference InvokeStoredQueryRequest, got {ref_path}" + ); +} + // --------------------------------------------------------------------------- // Serialization round-trip test // --------------------------------------------------------------------------- @@ -1117,6 +1149,7 @@ async fn app_for_multi_mode(graph_ids: &[&str]) -> (Vec, Rout uri: graph_uri, engine: Arc::new(engine), policy: None, + queries: None, })); dirs.push(dir); } diff --git a/crates/omnigraph-server/tests/server.rs b/crates/omnigraph-server/tests/server.rs index 3ace80e..4a49a14 100644 --- a/crates/omnigraph-server/tests/server.rs +++ b/crates/omnigraph-server/tests/server.rs @@ -8,7 +8,7 @@ use axum::body::{Body, to_bytes}; use axum::http::header::AUTHORIZATION; use axum::http::{Method, Request, StatusCode}; use lance::index::DatasetIndexExt; -use omnigraph::db::{Omnigraph, ReadTarget, SchemaApplyOptions}; +use omnigraph::db::{Omnigraph, ReadTarget}; use omnigraph::error::OmniError; use omnigraph::loader::{LoadMode, load_jsonl}; use omnigraph_policy::{PolicyChecker, PolicyEngine}; @@ -16,6 +16,7 @@ use omnigraph_server::api::{ BranchCreateRequest, BranchMergeRequest, ChangeRequest, ErrorOutput, ExportRequest, IngestRequest, QueryRequest, ReadRequest, SchemaApplyRequest, SchemaOutput, }; +use omnigraph_server::queries::{QueryRegistry, RegistrySpec}; use omnigraph_server::{AppState, build_app}; use serde_json::{Value, json}; use serial_test::serial; @@ -141,6 
+142,469 @@ fn graph_path(root: &Path) -> PathBuf { root.join("server.omni") } +fn stored_query_registry(specs: &[(&str, &str, bool)]) -> QueryRegistry { + QueryRegistry::from_specs( + specs + .iter() + .map(|(name, source, expose)| RegistrySpec { + name: name.to_string(), + source: source.to_string(), + expose: *expose, + tool_name: None, + }) + .collect(), + ) + .expect("specs parse and key==symbol") +} + +#[tokio::test] +async fn server_boots_with_a_valid_stored_query_registry() { + // A stored query that type-checks against the fixture schema + // (`Person { name, age }`) must let the server boot. + let temp = init_loaded_graph().await; + let graph = graph_path(temp.path()); + let registry = stored_query_registry(&[( + "find_person", + "query find_person($name: String) { match { $p: Person { name: $name } } return { $p.age } }", + false, + )]); + let state = AppState::open_single_with_queries( + graph.to_string_lossy().to_string(), + vec![], + None, + registry, + ) + .await; + assert!(state.is_ok(), "valid registry should boot: {:?}", state.err()); +} + +#[tokio::test] +async fn server_refuses_boot_on_type_broken_stored_query() { + // A stored query referencing a type not in the schema (`Widget`) + // must abort boot, naming the offending query. + let temp = init_loaded_graph().await; + let graph = graph_path(temp.path()); + let registry = stored_query_registry(&[( + "ghost", + "query ghost() { match { $w: Widget } return { $w.name } }", + false, + )]); + let result = AppState::open_single_with_queries( + graph.to_string_lossy().to_string(), + vec![], + None, + registry, + ) + .await; + // `AppState` is not `Debug`, so match rather than `expect_err`. 
+ let err = match result { + Ok(_) => panic!("type-broken stored query must refuse boot"), + Err(err) => err, + }; + let msg = err.to_string(); + assert!(msg.contains("ghost"), "error should name the broken query: {msg}"); + assert!( + msg.contains("schema check"), + "error should mention the schema check: {msg}" + ); +} + +/// Build a single-mode app with a stored-query registry plus a bearerβ†’actor +/// pairing and a policy, so invoke tests exercise the `invoke_query` +/// boundary gate and the inner read/change gates together. +async fn app_with_stored_queries( + specs: &[(&str, &str, bool)], + tokens: &[(&str, &str)], + policy: &str, +) -> (tempfile::TempDir, Router) { + let temp = init_loaded_graph().await; + let graph = graph_path(temp.path()); + let policy_path = temp.path().join("policy.yaml"); + fs::write(&policy_path, policy).unwrap(); + let registry = stored_query_registry(specs); + let state = AppState::open_single_with_queries( + graph.to_string_lossy().to_string(), + tokens + .iter() + .map(|(actor, token)| ((*actor).to_string(), (*token).to_string())) + .collect(), + Some(&policy_path), + registry, + ) + .await + .unwrap(); + (temp, build_app(state)) +} + +/// - `act-invoke`: invoke_query + read (stored reads, not mutations) +/// - `act-full`: invoke_query + read + change (stored mutations) +/// - `act-noinvoke`: read only, no invoke_query (boundary-denied) +/// - `act-invokeonly`: invoke_query only, no read (clears the boundary, inner read denies) +const INVOKE_POLICY_YAML: &str = r#" +version: 1 +groups: + invokers: ["act-invoke"] + full: ["act-full"] + readers: ["act-noinvoke"] + invoke_only: ["act-invokeonly"] +protected_branches: [main] +rules: + # invoke_query is graph-scoped β€” its own rules, no branch_scope. 
+ - id: invokers-can-invoke + allow: + actors: { group: invokers } + actions: [invoke_query] + - id: full-can-invoke + allow: + actors: { group: full } + actions: [invoke_query] + - id: invoke-only-can-invoke + allow: + actors: { group: invoke_only } + actions: [invoke_query] + # read / change are branch-scoped. + - id: invokers-can-read + allow: + actors: { group: invokers } + actions: [read] + branch_scope: any + - id: full-can-read-change + allow: + actors: { group: full } + actions: [read, change] + branch_scope: any + - id: readers-can-read + allow: + actors: { group: readers } + actions: [read] + branch_scope: any +"#; + +const STORED_QUERY_SCHEMA_APPLY_POLICY_YAML: &str = r#" +version: 1 +groups: + admins: [act-ragnor] +protected_branches: [main] +rules: + - id: admins-can-invoke + allow: + actors: { group: admins } + actions: [invoke_query] + - id: admins-can-read + allow: + actors: { group: admins } + actions: [read] + branch_scope: any + - id: admins-can-schema-apply + allow: + actors: { group: admins } + actions: [schema_apply] + target_branch_scope: protected +"#; + +const FIND_PERSON_GQ: &str = + "query find_person($name: String) { match { $p: Person { name: $name } } return { $p.age } }"; + +fn invoke_request(name: &str, token: &str, body: Value) -> Request { + Request::builder() + .uri(format!("/queries/{name}")) + .method(Method::POST) + .header("content-type", "application/json") + .header("authorization", format!("Bearer {token}")) + .body(Body::from(serde_json::to_vec(&body).unwrap())) + .unwrap() +} + +fn invoke_request_bytes( + name: &str, + token: &str, + body: impl Into, + content_type: Option<&str>, +) -> Request { + let mut builder = Request::builder() + .uri(format!("/queries/{name}")) + .method(Method::POST) + .header("authorization", format!("Bearer {token}")); + if let Some(content_type) = content_type { + builder = builder.header("content-type", content_type); + } + builder.body(body.into()).unwrap() +} + +#[tokio::test(flavor = 
"multi_thread")] +async fn invoke_stored_read_returns_rows() { + let (_temp, app) = app_with_stored_queries( + &[("find_person", FIND_PERSON_GQ, false)], + &[("act-invoke", "t-invoke")], + INVOKE_POLICY_YAML, + ) + .await; + let (status, body) = json_response( + &app, + invoke_request("find_person", "t-invoke", json!({ "params": { "name": "Alice" } })), + ) + .await; + assert_eq!(status, StatusCode::OK, "body: {body}"); + assert_eq!(body["query_name"], "find_person"); + assert_eq!(body["row_count"], 1, "Alice is in the fixture; body: {body}"); + assert!(body["rows"].is_array(), "read envelope shape; body: {body}"); +} + +#[tokio::test(flavor = "multi_thread")] +async fn invoke_stored_read_accepts_absent_or_empty_body() { + let no_param_query = "query list_people() { match { $p: Person } return { $p.name } }"; + let (_temp, app) = app_with_stored_queries( + &[("list_people", no_param_query, false)], + &[("act-invoke", "t-invoke")], + INVOKE_POLICY_YAML, + ) + .await; + + let (status, body) = json_response( + &app, + invoke_request_bytes("list_people", "t-invoke", Body::empty(), None), + ) + .await; + assert_eq!(status, StatusCode::OK, "body: {body}"); + assert_eq!(body["query_name"], "list_people"); + + let (status, body) = json_response( + &app, + invoke_request_bytes( + "list_people", + "t-invoke", + Body::empty(), + Some("application/json"), + ), + ) + .await; + assert_eq!(status, StatusCode::OK, "body: {body}"); + + let (status, body) = json_response( + &app, + invoke_request_bytes( + "list_people", + "t-invoke", + Body::from("{}"), + Some("application/json"), + ), + ) + .await; + assert_eq!(status, StatusCode::OK, "body: {body}"); + + let (status, body) = json_response( + &app, + invoke_request_bytes( + "list_people", + "t-invoke", + Body::from("{"), + Some("application/json"), + ), + ) + .await; + assert_eq!(status, StatusCode::BAD_REQUEST, "body: {body}"); + assert!( + body["error"] + .as_str() + .unwrap_or_default() + .contains("invalid stored-query 
invocation body"), + "malformed JSON should be rejected as bad request; body: {body}" + ); +} + +#[tokio::test(flavor = "multi_thread")] +async fn invoke_stored_mutation_double_gates_on_change() { + let specs: &[(&str, &str, bool)] = &[( + "add_person", + "query add_person($name: String) { insert Person { name: $name } }", + false, + )]; + let (_temp, app) = app_with_stored_queries( + specs, + &[("act-invoke", "t-invoke"), ("act-full", "t-full")], + INVOKE_POLICY_YAML, + ) + .await; + + // Has invoke_query but NOT change → the inner change gate denies (403). + let (status, body) = json_response( + &app, + invoke_request("add_person", "t-invoke", json!({ "params": { "name": "Eve" } })), + ) + .await; + assert_eq!( + status, + StatusCode::FORBIDDEN, + "invoke_query without change must 403; body: {body}" + ); + + // Has invoke_query + change → applied. + let (status, body) = json_response( + &app, + invoke_request("add_person", "t-full", json!({ "params": { "name": "Eve" } })), + ) + .await; + assert_eq!(status, StatusCode::OK, "body: {body}"); + assert_eq!(body["affected_nodes"], 1, "body: {body}"); +} + +#[tokio::test(flavor = "multi_thread")] +async fn invoke_stored_query_bad_param_is_400() { + let (_temp, app) = app_with_stored_queries( + &[("find_person", FIND_PERSON_GQ, false)], + &[("act-invoke", "t-invoke")], + INVOKE_POLICY_YAML, + ) + .await; + // `name` is declared String; pass a number. 
+ let (status, body) = json_response( + &app, + invoke_request("find_person", "t-invoke", json!({ "params": { "name": 123 } })), + ) + .await; + assert_eq!(status, StatusCode::BAD_REQUEST, "body: {body}"); + assert!( + body["error"].as_str().unwrap_or_default().contains("name"), + "400 should name the offending param; body: {body}" + ); +} + +#[tokio::test(flavor = "multi_thread")] +async fn invoke_unknown_query_and_denied_actor_return_identical_404() { + let (_temp, app) = app_with_stored_queries( + &[("find_person", FIND_PERSON_GQ, false)], + &[("act-invoke", "t-invoke"), ("act-noinvoke", "t-noinvoke")], + INVOKE_POLICY_YAML, + ) + .await; + + // Authorized actor, unknown query name → 404. + let (unknown_status, unknown_body) = + json_response(&app, invoke_request("does_not_exist", "t-invoke", json!({}))).await; + // Denied actor (no invoke_query), real query name → 404. + let (denied_status, denied_body) = json_response( + &app, + invoke_request("find_person", "t-noinvoke", json!({ "params": { "name": "Alice" } })), + ) + .await; + + assert_eq!(unknown_status, StatusCode::NOT_FOUND); + assert_eq!(denied_status, StatusCode::NOT_FOUND); + assert_eq!( + unknown_body, denied_body, + "deny must be byte-identical to a missing query (no catalog probing)" + ); +} + +#[tokio::test(flavor = "multi_thread")] +async fn invoke_query_holder_without_read_sees_403_not_404() { + // The 404-hiding is for callers WITHOUT invoke_query. An actor that + // HOLDS invoke_query but lacks `read` clears the boundary gate, then the + // inner read gate denies → 403 for an EXISTING read query, vs 404 for an + // unknown one. Existence is visible to grant-holders by design (the + // documented double-gate); this pins that actual contract. 
+ let (_temp, app) = app_with_stored_queries( + &[("find_person", FIND_PERSON_GQ, false)], + &[("act-invokeonly", "t-invokeonly")], + INVOKE_POLICY_YAML, + ) + .await; + let (exists_status, _) = json_response( + &app, + invoke_request("find_person", "t-invokeonly", json!({ "params": { "name": "Alice" } })), + ) + .await; + let (absent_status, _) = + json_response(&app, invoke_request("does_not_exist", "t-invokeonly", json!({}))).await; + assert_eq!( + exists_status, + StatusCode::FORBIDDEN, + "an existing read query the holder can't read → inner-gate 403" + ); + assert_eq!(absent_status, StatusCode::NOT_FOUND, "unknown query still 404s"); +} + +fn get_request(uri: &str, token: &str) -> Request { + Request::builder() + .uri(uri) + .method(Method::GET) + .header("authorization", format!("Bearer {token}")) + .body(Body::empty()) + .unwrap() +} + +#[tokio::test(flavor = "multi_thread")] +async fn list_queries_returns_only_exposed_with_typed_params() { + let (_temp, app) = app_with_stored_queries( + &[ + ("find_person", FIND_PERSON_GQ, true), + ( + "add_person", + "query add_person($name: String) { insert Person { name: $name } }", + true, + ), + ("hidden", "query hidden() { match { $p: Person } return { $p.name } }", false), + ], + &[("act-invoke", "t-invoke")], + INVOKE_POLICY_YAML, + ) + .await; + let (status, body) = json_response(&app, get_request("/queries", "t-invoke")).await; + assert_eq!(status, StatusCode::OK, "body: {body}"); + + let entries = body["queries"].as_array().unwrap(); + let names: Vec<&str> = entries.iter().map(|q| q["name"].as_str().unwrap()).collect(); + assert!( + names.contains(&"find_person") && names.contains(&"add_person"), + "exposed queries listed: {names:?}" + ); + assert!(!names.contains(&"hidden"), "non-exposed query hidden from the catalog: {names:?}"); + + let fp = entries.iter().find(|q| q["name"] == "find_person").unwrap(); + assert_eq!(fp["mutation"], false); + assert_eq!(fp["tool_name"], "find_person"); + 
assert_eq!(fp["params"][0]["name"], "name"); + assert_eq!(fp["params"][0]["kind"], "string"); + let ap = entries.iter().find(|q| q["name"] == "add_person").unwrap(); + assert_eq!(ap["mutation"], true, "stored insert → mutation"); +} + +#[tokio::test(flavor = "multi_thread")] +async fn list_queries_is_read_gated_so_a_non_invoker_can_list() { + // The catalog is read-gated (not invoke_query-gated), so a reader who + // lacks invoke_query still enumerates the exposed queries — the + // documented probe-oracle gap until per-query Cedar filtering lands. + let (_temp, app) = app_with_stored_queries( + &[("find_person", FIND_PERSON_GQ, true)], + &[("act-noinvoke", "t-noinvoke")], + INVOKE_POLICY_YAML, + ) + .await; + let (status, body) = json_response(&app, get_request("/queries", "t-noinvoke")).await; + assert_eq!(status, StatusCode::OK, "read-gated catalog; body: {body}"); + let names: Vec<&str> = body["queries"] + .as_array() + .unwrap() + .iter() + .map(|q| q["name"].as_str().unwrap()) + .collect(); + assert!( + names.contains(&"find_person"), + "a reader lists the catalog despite lacking invoke_query: {names:?}" + ); +} + +#[tokio::test(flavor = "multi_thread")] +async fn list_queries_is_empty_when_no_registry() { + let (_temp, app) = app_for_loaded_graph_with_auth("demo-token").await; + let (status, body) = json_response(&app, get_request("/queries", "demo-token")).await; + assert_eq!(status, StatusCode::OK, "body: {body}"); + assert!( + body["queries"].as_array().unwrap().is_empty(), + "no stored-query registry → empty catalog" + ); +} + fn drifted_test_schema() -> String { fs::read_to_string(fixture("test.pg")) .unwrap() @@ -423,6 +887,83 @@ async fn schema_apply_route_updates_graph_for_authorized_admin() { ); } +#[tokio::test(flavor = "multi_thread")] +async fn schema_apply_route_rejects_stored_query_breakage_before_publish() { + let (temp, app) = app_with_stored_queries( + &[("find_person", FIND_PERSON_GQ, true)], + &[("act-ragnor", "admin-token")], + 
STORED_QUERY_SCHEMA_APPLY_POLICY_YAML, + ) + .await; + + let request = Request::builder() + .method(Method::POST) + .uri("/schema/apply") + .header("content-type", "application/json") + .header("authorization", "Bearer admin-token") + .body(Body::from( + serde_json::to_vec(&SchemaApplyRequest { + schema_source: renamed_age_schema(), + ..Default::default() + }) + .unwrap(), + )) + .unwrap(); + let (status, payload) = json_response(&app, request).await; + assert_eq!(status, StatusCode::BAD_REQUEST, "body: {payload}"); + let message = payload["error"].as_str().unwrap_or_default(); + assert!( + message.contains("find_person") && message.contains("schema check"), + "registry breakage should name the stored query; body: {payload}" + ); + + let reopened = Omnigraph::open(graph_path(temp.path()).to_str().unwrap()) + .await + .unwrap(); + let person = &reopened.catalog().node_types["Person"]; + assert!(person.properties.contains_key("age")); + assert!(!person.properties.contains_key("years")); + + let (invoke_status, invoke_body) = json_response( + &app, + invoke_request( + "find_person", + "admin-token", + json!({ "params": { "name": "Alice" } }), + ), + ) + .await; + assert_eq!(invoke_status, StatusCode::OK, "body: {invoke_body}"); + assert_eq!(invoke_body["row_count"], 1); +} + +#[tokio::test(flavor = "multi_thread")] +async fn schema_apply_route_noop_keeps_valid_stored_query_registry() { + let (_temp, app) = app_with_stored_queries( + &[("find_person", FIND_PERSON_GQ, true)], + &[("act-ragnor", "admin-token")], + STORED_QUERY_SCHEMA_APPLY_POLICY_YAML, + ) + .await; + + let request = Request::builder() + .method(Method::POST) + .uri("/schema/apply") + .header("content-type", "application/json") + .header("authorization", "Bearer admin-token") + .body(Body::from( + serde_json::to_vec(&SchemaApplyRequest { + schema_source: fs::read_to_string(fixture("test.pg")).unwrap(), + ..Default::default() + }) + .unwrap(), + )) + .unwrap(); + let (status, payload) = 
json_response(&app, request).await; + assert_eq!(status, StatusCode::OK, "body: {payload}"); + assert_eq!(payload["applied"], false); +} + #[tokio::test] async fn schema_apply_route_requires_schema_apply_policy_permission() { let (_temp, app) = app_for_graph_with_auth_tokens_and_policy( @@ -4690,6 +5231,7 @@ mod multi_graph_startup { uri: graph_uri, engine: Arc::new(engine), policy: None, + queries: None, })); dirs.push(dir); } @@ -4985,12 +5527,14 @@ graphs: uri: graph_uri.clone(), engine: Arc::clone(&engine), policy: None, + queries: None, }); let beta = Arc::new(GraphHandle { key: GraphKey::cluster(GraphId::try_from("beta").unwrap()), uri: format!("file://{graph_uri}/"), engine, policy: None, + queries: None, }); match GraphRegistry::from_handles(vec![alpha, beta]) { @@ -5016,6 +5560,7 @@ graphs: uri: format!("file://{graph_uri}/"), engine: Arc::new(engine), policy: None, + queries: None, }); let registry = GraphRegistry::from_handles(vec![handle]).unwrap(); @@ -5138,11 +5683,11 @@ graphs: let err = load_server_settings(Some(&config_path), None, None, None, true).unwrap_err(); let msg = err.to_string(); assert!( - msg.contains("top-level `policy.file` is single-graph/CLI-local policy only"), - "expected single-graph policy guidance, got: {msg}" + msg.contains("top-level") && msg.contains("policy.file") && msg.contains("not honored"), + "expected top-level-not-honored guidance, got: {msg}" ); assert!( - msg.contains("graphs..policy.file"), + msg.contains("graphs."), "expected per-graph migration guidance, got: {msg}" ); assert!( @@ -5151,6 +5696,88 @@ graphs: ); } + #[test] + fn mode_inference_multi_rejects_top_level_queries() { + // Symmetric to the policy guard: a top-level `queries:` block in + // multi-graph mode is not honored (each graph uses its own), so it + // is a loud error rather than a silent no-op. 
+ let temp = tempfile::tempdir().unwrap(); + let config_path = temp.path().join("omnigraph.yaml"); + fs::write( + &config_path, + "queries:\n q:\n file: ./q.gq\ngraphs:\n alpha:\n uri: /tmp/alpha.omni\n", + ) + .unwrap(); + let err = load_server_settings(Some(&config_path), None, None, None, true).unwrap_err(); + let msg = err.to_string(); + assert!( + msg.contains("queries") && msg.contains("not honored"), + "top-level queries must be rejected in multi-graph mode: {msg}" + ); + } + + #[test] + fn single_mode_named_graph_rejects_top_level_blocks() { + // Serving a graph by name (`--target`/`server.graph`) uses its + // per-graph block; a populated top-level block would be silently + // shadowed, so boot refuses and names the per-graph location. + let temp = tempfile::tempdir().unwrap(); + let config_path = temp.path().join("omnigraph.yaml"); + fs::write( + &config_path, + "policy:\n file: ./top.yaml\ngraphs:\n prod:\n uri: /tmp/prod.omni\n", + ) + .unwrap(); + let err = + load_server_settings(Some(&config_path), None, Some("prod".to_string()), None, true) + .unwrap_err(); + let msg = err.to_string(); + assert!( + msg.contains("prod") && msg.contains("policy.file") && msg.contains("graphs.prod"), + "named single-mode + top-level policy must refuse, naming the graph: {msg}" + ); + } + + #[test] + fn single_mode_named_graph_uses_per_graph_policy_and_queries() { + // The identity rule: `--target prod` attaches `graphs.prod`'s own + // policy + queries, not the top-level ones (which are absent here). 
+ let temp = tempfile::tempdir().unwrap(); + fs::write( + temp.path().join("prod.gq"), + "query pq() { match { $u: User } return { $u.name } }", + ) + .unwrap(); + let config_path = temp.path().join("omnigraph.yaml"); + fs::write( + &config_path, + "graphs:\n prod:\n uri: /tmp/prod.omni\n policy:\n file: ./prod-policy.yaml\n \ + queries:\n pq:\n file: ./prod.gq\n", + ) + .unwrap(); + let settings = + load_server_settings(Some(&config_path), None, Some("prod".to_string()), None, true) + .unwrap(); + match settings.mode { + ServerConfigMode::Single { + graph_id, + policy_file, + queries, + .. + } => { + assert_eq!(graph_id, "prod", "named single-mode keeps graph identity"); + assert!( + policy_file + .as_ref() + .is_some_and(|p| p.ends_with("prod-policy.yaml")), + "per-graph policy attached: {policy_file:?}" + ); + assert!(queries.lookup("pq").is_some(), "per-graph query attached"); + } + other => panic!("expected Single mode, got {other:?}"), + } + } + #[test] fn mode_inference_normalizes_multi_graph_uris() { let temp = tempfile::tempdir().unwrap(); @@ -5383,6 +6010,7 @@ graphs: uri: graph_uri, engine: Arc::new(engine), policy: None, + queries: None, }); let tokens = vec![("act-andrew".to_string(), "secret-token".to_string())]; let workload = omnigraph_server::workload::WorkloadController::from_env(); @@ -5450,6 +6078,7 @@ graphs: uri: graph_uri, engine: Arc::new(engine), policy: None, + queries: None, })); } diff --git a/crates/omnigraph/Cargo.toml b/crates/omnigraph/Cargo.toml index 1fa3436..70f51d8 100644 --- a/crates/omnigraph/Cargo.toml +++ b/crates/omnigraph/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "omnigraph-engine" -version = "0.6.0" +version = "0.6.1" edition = "2024" description = "Runtime engine for the Omnigraph graph database." 
license = "MIT" @@ -16,8 +16,8 @@ default = [] failpoints = ["dep:fail", "fail/failpoints"] [dependencies] -omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.6.0" } -omnigraph-policy = { path = "../omnigraph-policy", version = "0.6.0" } +omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.6.1" } +omnigraph-policy = { path = "../omnigraph-policy", version = "0.6.1" } lance = { workspace = true } lance-datafusion = { workspace = true } datafusion = { workspace = true } @@ -51,7 +51,7 @@ chrono = { workspace = true } arc-swap = { workspace = true } [dev-dependencies] -omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.6.0" } +omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.6.1" } tokio = { workspace = true } lance-namespace-impls = { workspace = true } serial_test = "3" diff --git a/crates/omnigraph/src/db/commit_graph.rs b/crates/omnigraph/src/db/commit_graph.rs index 565bd69..9531a64 100644 --- a/crates/omnigraph/src/db/commit_graph.rs +++ b/crates/omnigraph/src/db/commit_graph.rs @@ -169,6 +169,37 @@ impl CommitGraph { self.refresh().await } + /// Idempotently drop the commit-graph branch `name`, tolerating an + /// already-absent branch (see [`TableStore::force_delete_branch`] for the + /// same semantics). Used by the best-effort reclaim in `branch_delete` and + /// the `cleanup` orphan reconciler. `RefConflict` (referencing descendants) + /// is still surfaced. + pub async fn force_delete_branch(&mut self, name: &str) -> Result<()> { + let mut ds = Dataset::open(&graph_commits_uri(&self.root_uri)) + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + match ds.force_delete_branch(name).await { + Ok(()) => {} + Err(lance::Error::RefNotFound { .. }) | Err(lance::Error::NotFound { .. }) => {} + Err(e) => return Err(OmniError::Lance(e.to_string())), + } + self.refresh().await + } + + /// List the named branches present on the commit-graph dataset. 
The + /// `cleanup` reconciler diffs this against the manifest branch set to find + /// orphaned commit-graph branches to reclaim. + pub async fn list_branches(&self) -> Result> { + let ds = Dataset::open(&graph_commits_uri(&self.root_uri)) + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + let branches = ds + .list_branches() + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + Ok(branches.into_keys().collect()) + } + pub async fn append_commit( &mut self, manifest_branch: Option<&str>, @@ -345,7 +376,7 @@ impl CommitGraph { } } -fn graph_commits_uri(root_uri: &str) -> String { +pub(crate) fn graph_commits_uri(root_uri: &str) -> String { format!("{}/{}", root_uri.trim_end_matches('/'), GRAPH_COMMITS_DIR) } diff --git a/crates/omnigraph/src/db/graph_coordinator.rs b/crates/omnigraph/src/db/graph_coordinator.rs index a721036..dfe2767 100644 --- a/crates/omnigraph/src/db/graph_coordinator.rs +++ b/crates/omnigraph/src/db/graph_coordinator.rs @@ -211,14 +211,47 @@ impl GraphCoordinator { let branch = normalize_branch_name(name)? .ok_or_else(|| OmniError::manifest("cannot create branch 'main'".to_string()))?; self.ensure_commit_graph_initialized().await?; + + // Manifest authority flip first. self.manifest.create_branch(&branch).await?; - failpoints::maybe_fail("branch_create.after_manifest_branch_create")?; - if let Some(commit_graph) = &mut self.commit_graph { - commit_graph.create_branch(&branch).await?; + + // Derived commit-graph branch. If anything after the authority flip + // fails, roll back the manifest branch so the branch never half-exists + // (a manifest branch with no commit-graph branch breaks the next write). 
+ if let Err(err) = self.create_commit_graph_branch(&branch).await { + if let Err(rollback_err) = self.manifest.delete_branch(&branch).await { + tracing::warn!( + target: "omnigraph::branch_create", + branch = %branch, + error = %rollback_err, + "rollback of manifest branch failed after commit-graph create failure", + ); + } + return Err(err); } Ok(()) } + /// Create the derived commit-graph branch for `branch`, healing a zombie ref + /// left by an incomplete prior delete. The manifest branch was just created + /// fresh, so any existing commit-graph branch with this name is provably + /// orphaned and is force-dropped before recreating. + async fn create_commit_graph_branch(&mut self, branch: &str) -> Result<()> { + failpoints::maybe_fail("branch_create.after_manifest_branch_create")?; + let Some(commit_graph) = &mut self.commit_graph else { + return Ok(()); + }; + if commit_graph + .list_branches() + .await? + .iter() + .any(|existing| existing == branch) + { + commit_graph.force_delete_branch(branch).await?; + } + commit_graph.create_branch(branch).await + } + pub async fn branch_delete(&mut self, name: &str) -> Result<()> { let branch = normalize_branch_name(name)? .ok_or_else(|| OmniError::manifest("cannot delete branch 'main'".to_string()))?; @@ -229,20 +262,43 @@ impl GraphCoordinator { ))); } + // Manifest authority flip — the single atomic op that makes the branch + // cease to exist. Must succeed; everything after is derived state + // reclaimed best-effort. self.manifest.delete_branch(&branch).await?; + // Commit-graph branch is derived state. Reclaim best-effort with the + // idempotent force variant: a failure here (or a missing dataset) is + // reconciled by `cleanup` and must not fail the delete after the + // authority already flipped. 
+ if let Err(err) = self.reclaim_commit_graph_branch(&branch).await { + tracing::warn!( + target: "omnigraph::branch_delete::cleanup", + branch = %branch, + error = %err, + "best-effort commit-graph branch reclaim failed; cleanup will reconcile", + ); + } + + Ok(()) + } + + /// Best-effort, idempotent reclaim of the commit-graph branch `branch`. + /// Tolerates an absent commit-graph dataset (a graph that never committed). + async fn reclaim_commit_graph_branch(&mut self, branch: &str) -> Result<()> { + failpoints::maybe_fail("branch_delete.before_commit_graph_reclaim")?; if let Some(commit_graph) = &mut self.commit_graph { - commit_graph.delete_branch(&branch).await?; + commit_graph.force_delete_branch(branch).await } else if self .storage .exists(&graph_commits_uri(self.root_uri())) .await? { let mut commit_graph = CommitGraph::open(self.root_uri()).await?; - commit_graph.delete_branch(&branch).await?; + commit_graph.force_delete_branch(branch).await + } else { + Ok(()) } - - Ok(()) } pub async fn snapshot_at_version(&self, version: u64) -> Result { diff --git a/crates/omnigraph/src/db/manifest.rs b/crates/omnigraph/src/db/manifest.rs index 7fcf7de..5bf1f87 100644 --- a/crates/omnigraph/src/db/manifest.rs +++ b/crates/omnigraph/src/db/manifest.rs @@ -36,7 +36,7 @@ use publisher::{GraphNamespacePublisher, ManifestBatchPublisher}; pub(crate) use recovery::{ RecoveryMode, RecoverySidecar, RecoverySidecarHandle, SidecarKind, SidecarTablePin, SidecarTableRegistration, SidecarTombstone, delete_sidecar, has_schema_apply_sidecar, - new_sidecar, recover_manifest_drift, write_sidecar, + list_sidecars, new_sidecar, recover_manifest_drift, write_sidecar, }; pub use state::SubTableEntry; #[cfg(test)] @@ -48,6 +48,22 @@ const OBJECT_TYPE_TABLE_VERSION: &str = "table_version"; const OBJECT_TYPE_TABLE_TOMBSTONE: &str = "table_tombstone"; const TABLE_VERSION_MANAGEMENT_KEY: &str = "table_version_management"; +/// Apply pending internal-schema migrations against `__manifest` on 
the +/// open-for-write path, independent of a publish. +/// +/// `Omnigraph::open(ReadWrite)` calls this before the coordinator reads branch +/// state, so branch-observing code (`branch_list`, the schema-apply +/// blocking-branch checks) sees the post-migration graph. In particular the +/// v2→v3 step sweeps legacy `__run__*` staging branches off `__manifest` +/// (MR-770); running it here closes the window where those branches would +/// otherwise block schema apply before the first publish runs the migration. +/// +/// Idempotent: a no-op stamp read when the on-disk version already matches. +pub(crate) async fn migrate_on_open(root_uri: &str) -> Result<()> { + let mut dataset = open_manifest_dataset(root_uri, None).await?; + migrations::migrate_internal_schema(&mut dataset).await +} + /// Immutable point-in-time view of the database. /// /// Cheap to create (no storage I/O). All reads within a query go through one diff --git a/crates/omnigraph/src/db/manifest/migrations.rs b/crates/omnigraph/src/db/manifest/migrations.rs index bbb7995..e2801fe 100644 --- a/crates/omnigraph/src/db/manifest/migrations.rs +++ b/crates/omnigraph/src/db/manifest/migrations.rs @@ -46,7 +46,11 @@ use crate::error::{OmniError, Result}; /// - v2 — `__manifest.object_id` carries the unenforced-PK annotation, /// engaging Lance's bloom-filter conflict resolver at commit time. Added /// alongside `expected_table_versions` OCC on `ManifestBatchPublisher::publish`. -pub(super) const INTERNAL_MANIFEST_SCHEMA_VERSION: u32 = 2; +/// - v3 — one-time sweep of legacy `__run__` staging branches left on the +/// `__manifest` dataset by the pre-v0.4.0 Run state machine (removed in +/// MR-771). Once swept, the `is_internal_run_branch` defense-in-depth guard +/// is no longer needed (MR-770). 
+pub(super) const INTERNAL_MANIFEST_SCHEMA_VERSION: u32 = 3; const INTERNAL_SCHEMA_VERSION_KEY: &str = "omnigraph:internal_schema_version"; const OBJECT_ID_PK_KEY: &str = "lance-schema:unenforced-primary-key"; @@ -89,6 +93,10 @@ pub(super) async fn migrate_internal_schema(dataset: &mut Dataset) -> Result<()> migrate_v1_to_v2(dataset).await?; current = 2; } + 2 => { + migrate_v2_to_v3(dataset).await?; + current = 3; + } other => { return Err(OmniError::manifest_internal(format!( "no internal-schema migration registered for v{} → v{}", @@ -122,6 +130,51 @@ async fn migrate_v1_to_v2(dataset: &mut Dataset) -> Result<()> { set_stamp(dataset, 2).await } +/// v2 → v3: sweep legacy `__run__` staging branches off the `__manifest` +/// dataset, then bump the stamp. +/// +/// The pre-v0.4.0 Run state machine (removed in MR-771) created graph-level +/// staging branches named `__run__` on `__manifest`. MR-771 stopped +/// creating them but left any pre-existing ones in place; Lance's +/// `list_branches` still enumerates them, so they leak into `branch_list()` +/// and count as blocking branches at schema-apply time. This one-time sweep +/// removes them so the `is_internal_run_branch` guard can retire (MR-770). +/// +/// The `"__run__"` prefix is inlined here on purpose: this migration must keep +/// working after the `run_registry` module (the guard) is deleted, so it does +/// not depend on it. +/// +/// Idempotent under both sequential retry and concurrent runners: each run +/// re-enumerates `list_branches` fresh, and `force_delete_branch` tolerates a +/// branch that is already gone — so a crash before the stamp bump, or a second +/// process opening the same legacy graph at the same time, never errors out. 
+async fn migrate_v2_to_v3(dataset: &mut Dataset) -> Result<()> { + const LEGACY_RUN_BRANCH_PREFIX: &str = "__run__"; + let branches = dataset + .list_branches() + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + let run_branches: Vec = branches + .into_keys() + .filter(|name| { + name.trim_start_matches('/') + .starts_with(LEGACY_RUN_BRANCH_PREFIX) + }) + .collect(); + for name in run_branches { + // `force_delete_branch` deletes even when the `BranchContents` is + // already gone. Plain `delete_branch` errors "BranchContents not + // found", which would fail a second concurrent open (or a retry that + // raced another runner) after the first one swept the branch. Force is + // exactly Lance's documented path for cleaning up zombie branches. + dataset + .force_delete_branch(&name) + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + } + set_stamp(dataset, 3).await +} + async fn set_stamp(dataset: &mut Dataset, version: u32) -> Result<()> { dataset .update_schema_metadata([(INTERNAL_SCHEMA_VERSION_KEY.to_string(), version.to_string())]) diff --git a/crates/omnigraph/src/db/manifest/recovery.rs b/crates/omnigraph/src/db/manifest/recovery.rs index 425499a..3119531 100644 --- a/crates/omnigraph/src/db/manifest/recovery.rs +++ b/crates/omnigraph/src/db/manifest/recovery.rs @@ -2,7 +2,7 @@ //! //! This module implements the building blocks of the per-sidecar recovery //! sweep that closes the documented Phase B → Phase C residual (see -//! `docs/dev/runs.md` "Open-time recovery sweep"). The high-level shape: +//! `docs/dev/writes.md` "Open-time recovery sweep"). The high-level shape: //! //! 1. Each writer that performs a multi-table commit writes a small JSON //! sidecar at `__recovery/{ulid}.json` BEFORE its per-table @@ -106,6 +106,12 @@ pub(crate) enum SidecarKind { BranchMerge, /// `ensure_indices_for_branch` — index lifecycle commits. 
EnsureIndices, + /// `optimize_all_tables` — Lance `compact_files` (reserve-fragments + + /// rewrite commits) followed by a manifest publish of the compacted + /// version. Loose-match like the other multi-commit writers; roll-forward + /// is always safe because compaction is content-preserving (Lance + /// `Operation::Rewrite` "reorganizes data without semantic modification"). + Optimize, } /// One table's contribution to a sidecar's intended commit. The classifier @@ -412,11 +418,13 @@ pub(crate) fn parse_sidecar(sidecar_uri: &str, body: &str) -> Result manifest_pinned` as `RolledPastExpected` when /// `pin.expected_version == manifest_pinned` (the writer's CAS /// target matches what the manifest currently shows). The risk this @@ -494,9 +502,12 @@ pub(crate) fn decide(classifications: &[TableClassification]) -> SidecarDecision /// Skipping the restore in those cases would leave Lance HEAD ahead of /// the manifest with no recovery artifact left. /// -/// Cost: under repeated mid-rollback crashes (rare), Lance HEAD -/// accumulates extra restore commits that `omnigraph cleanup` reclaims. -/// Bounded by the number of recovery iterations — typically 1. +/// Cost: a successful roll-back appends one restore commit and then publishes +/// the manifest to match (`roll_back_sidecar`), so the table converges +/// (`manifest == HEAD`) in one pass. Only repeated crashes *between* the restore +/// and that publish (rare) accumulate extra restore commits; each re-classified +/// roll-back restores again and `omnigraph cleanup` reclaims the surplus. +/// Bounded by the number of interrupted recovery iterations — typically 0. 
pub(crate) async fn restore_table_to_version( table_path: &str, branch: Option<&str>, @@ -801,13 +812,24 @@ async fn roll_back_sidecar( sidecar: &RecoverySidecar, states: &[ClassifiedTable], ) -> Result<()> { - // Restore every table whose Lance HEAD has drifted from the - // manifest pin (RolledPastExpected, UnexpectedAtP1, - // UnexpectedMultistep). NoMovement tables are already at the - // manifest pin — no action. Restore is unconditional; repeated - // mid-rollback crashes accumulate a few extra Lance commits that - // `omnigraph cleanup` reclaims. + // Restore every drifted table (RolledPastExpected / UnexpectedAtP1 / + // UnexpectedMultistep) to its manifest-pinned content, then PUBLISH so + // `manifest == Lance HEAD` for each — symmetric with roll-forward. The + // restore commit's content equals the manifest-pinned version, so re-pinning + // the manifest to the new (restored) HEAD is content-correct and closes the + // orphaned-drift class (`HEAD > manifest` with no covering sidecar). This is + // what makes a failed-then-retried schema_apply converge: after one + // roll-back `manifest == HEAD`, so the retry's precondition passes instead of + // failing one version higher each iteration. + // + // NoMovement tables are already at the pin — excluded from both the restore + // and the publish. The audit `to_version` stays the *logical* rolled-back-to + // version (`manifest_pinned`), while the manifest is published at + // `manifest_pinned + 1` (the restore commit, same content) — keep that + // asymmetry so the audit records the drift (`from_version > to_version`). 
let mut outcomes = Vec::with_capacity(sidecar.tables.len()); + let mut updates: Vec = Vec::with_capacity(sidecar.tables.len()); + let mut expected: HashMap = HashMap::with_capacity(sidecar.tables.len()); for (pin, state) in sidecar.tables.iter().zip(states.iter()) { if matches!( state.classification, @@ -821,10 +843,20 @@ async fn roll_back_sidecar( state.manifest_pinned, ) .await?; - // `from_version` records the Lance HEAD observed BEFORE the - // restore (the actual drift), not the manifest pin. Operators - // reading `_graph_commit_recoveries.lance` see "rolled back - // from v7 to v5" rather than "v5 → v5". + // Publish the post-restore HEAD, CAS against the current (unmoved) + // manifest pin — the same helper roll-forward uses. + push_table_update_at_head( + root_uri, + &pin.table_key, + &pin.table_path, + pin.table_branch.as_deref(), + state.manifest_pinned, + &mut updates, + &mut expected, + ) + .await?; + // `from_version` records the Lance HEAD observed BEFORE the restore + // (the actual drift); `to_version` the logical pin we rolled back to. outcomes.push(TableOutcome { table_key: pin.table_key.clone(), from_version: state.lance_head, @@ -832,13 +864,23 @@ async fn roll_back_sidecar( }); } } - // Manifest pin doesn't move on rollback; record an audit-only - // commit at the existing version so operators can correlate via - // `omnigraph commit list --filter actor=omnigraph:recovery`. + // Publish the restored HEADs so manifest == HEAD. A degenerate all-NoMovement + // roll-back restores nothing — there's nothing to publish, and the audit + // records the unchanged snapshot version. + let manifest_version = if updates.is_empty() { + snapshot.version() + } else { + let publisher = GraphNamespacePublisher::new(root_uri, sidecar.branch.as_deref()); + publisher + .publish(&updates, &expected) + .await?
+ .version() + .version + }; record_audit( root_uri, sidecar, - snapshot.version(), + manifest_version, RecoveryKind::RolledBack, outcomes, ) @@ -919,44 +961,20 @@ async fn roll_forward_all( HashMap::with_capacity(sidecar.tables.len() + sidecar.additional_registrations.len()); for pin in &sidecar.tables { - // Open the dataset at its CURRENT Lance HEAD on the pin's branch - // (not at the sidecar's post_commit_pin). For strict-match writers - // (Mutation/Load) HEAD == post_commit_pin by construction. For - // loose-match writers (SchemaApply/EnsureIndices/BranchMerge) HEAD - // may be higher than post_commit_pin (multiple commit_staged - // calls per table); we want to publish to the actual current HEAD. - let head_ds = Dataset::open(&pin.table_path) - .await - .map_err(|e| OmniError::Lance(e.to_string()))?; - let head_ds = match pin.table_branch.as_deref() { - Some(b) if b != "main" => head_ds - .checkout_branch(b) - .await - .map_err(|e| OmniError::Lance(e.to_string()))?, - _ => head_ds, - }; - let head_version = head_ds.version().version; - - let row_count = head_ds - .count_rows(None) - .await - .map_err(|e| OmniError::Lance(e.to_string()))? as u64; - - let table_relative_path = super::table_path_for_table_key(&pin.table_key)?; - let version_metadata = super::metadata::TableVersionMetadata::from_dataset( + // Publish to the table's CURRENT Lance HEAD on the pin's branch (not the + // sidecar's `post_commit_pin`, a lower bound for loose-match writers that + // run multiple commit_staged calls per table). CAS against the pin's + // pre-write `expected_version`. 
+ let head_version = push_table_update_at_head( root_uri, - &table_relative_path, - &head_ds, - )?; - - updates.push(ManifestChange::Update(SubTableUpdate { - table_key: pin.table_key.clone(), - table_version: head_version, - table_branch: pin.table_branch.clone(), - row_count, - version_metadata, - })); - expected.insert(pin.table_key.clone(), pin.expected_version); + &pin.table_key, + &pin.table_path, + pin.table_branch.as_deref(), + pin.expected_version, + &mut updates, + &mut expected, + ) + .await?; published_versions.insert(pin.table_key.clone(), head_version); } @@ -1047,6 +1065,57 @@ async fn roll_forward_all( Ok((new_dataset.version().version, published_versions)) } +/// Open `table_path` at its branch HEAD, read the current Lance HEAD version, +/// row count, and version metadata, and push a `ManifestChange::Update` (plus +/// its CAS `expected` entry) that re-pins the manifest to that HEAD. Returns the +/// published HEAD version. +/// +/// Shared by `roll_forward_all` (where `expected_version` is the sidecar's +/// pre-write pin) and `roll_back_sidecar` (where it is the manifest-pinned +/// version the table was just restored to). The HEAD is read AFTER any restore +/// in the same single-threaded sweep, so no concurrent writer can have advanced +/// it. +async fn push_table_update_at_head( + root_uri: &str, + table_key: &str, + table_path: &str, + branch: Option<&str>, + expected_version: u64, + updates: &mut Vec, + expected: &mut HashMap, +) -> Result { + let head_ds = Dataset::open(table_path) + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + let head_ds = match branch { + Some(b) if b != "main" => head_ds + .checkout_branch(b) + .await + .map_err(|e| OmniError::Lance(e.to_string()))?, + _ => head_ds, + }; + let head_version = head_ds.version().version; + let row_count = head_ds + .count_rows(None) + .await + .map_err(|e| OmniError::Lance(e.to_string()))? 
as u64; + let table_relative_path = super::table_path_for_table_key(table_key)?; + let version_metadata = super::metadata::TableVersionMetadata::from_dataset( + root_uri, + &table_relative_path, + &head_ds, + )?; + updates.push(ManifestChange::Update(SubTableUpdate { + table_key: table_key.to_string(), + table_version: head_version, + table_branch: branch.map(str::to_string), + row_count, + version_metadata, + })); + expected.insert(table_key.to_string(), expected_version); + Ok(head_version) +} + /// Append the audit row describing this recovery action. /// /// Two-part write: (a) `_graph_commits.lance` row anchored on the recovery diff --git a/crates/omnigraph/src/db/manifest/tests.rs b/crates/omnigraph/src/db/manifest/tests.rs index effa0b5..885a2a8 100644 --- a/crates/omnigraph/src/db/manifest/tests.rs +++ b/crates/omnigraph/src/db/manifest/tests.rs @@ -1461,6 +1461,80 @@ async fn test_publish_migrates_pre_stamp_manifest_to_current_version() { assert!(reopened.snapshot().entry("node:Person").is_some()); } +#[tokio::test] +async fn test_v2_to_v3_sweeps_legacy_run_branches_on_write_open() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let catalog = build_test_catalog(); + let mut mc = ManifestCoordinator::init(uri, &catalog).await.unwrap(); + + // Synthesize a pre-MR-770 graph: several stale `__run__` staging branches + // left on `__manifest` (a real legacy graph accumulates one per run), plus + // a real user branch that must survive the sweep. Multiple run branches + // exercise the migration's delete loop on a single reused dataset handle. 
+ mc.create_branch("__run__01J9LEGACY").await.unwrap(); + mc.create_branch("__run__01J9SECOND").await.unwrap(); + mc.create_branch("__run__01J9THIRD").await.unwrap(); + mc.create_branch("feature").await.unwrap(); + let before = mc.list_branches().await.unwrap(); + assert_eq!( + before.iter().filter(|b| b.starts_with("__run__")).count(), + 3, + "precondition: three legacy run branches exist on __manifest; got {before:?}", + ); + + // Rewind the internal-schema stamp to v2 so the next write-open runs the + // v2 → v3 sweep arm (init stamps at the current version, which is past it). + { + let mut ds = open_manifest_dataset(uri, None).await.unwrap(); + ds.update_schema_metadata([( + "omnigraph:internal_schema_version".to_string(), + Some("2".to_string()), + )]) + .await + .unwrap(); + let post = open_manifest_dataset(uri, None).await.unwrap(); + assert_eq!(super::migrations::read_stamp(&post), 2, "stamp rewound to v2"); + } + + // A no-op publish forces the open-for-write path, which runs the migration. + let mut expected = HashMap::new(); + expected.insert("node:Person".to_string(), 1); + GraphNamespacePublisher::new(uri, None) + .publish(&[], &expected) + .await + .unwrap(); + + // Stamp advanced to current; the legacy run branch is physically gone from + // `__manifest` (checked via the raw, unfiltered manifest list — not the + // guard-filtered `branch_list`), and the real branch + `main` survive.
+ let post = open_manifest_dataset(uri, None).await.unwrap(); + assert_eq!( + super::migrations::read_stamp(&post), + super::migrations::INTERNAL_MANIFEST_SCHEMA_VERSION, + ); + let reopened = ManifestCoordinator::open(uri).await.unwrap(); + let after = reopened.list_branches().await.unwrap(); + assert!( + !after.iter().any(|b| b.starts_with("__run__")), + "legacy run branch must be swept; got {after:?}", + ); + assert!(after.iter().any(|b| b == "feature"), "user branch must survive"); + assert!(after.iter().any(|b| b == "main"), "main must survive"); + + // Idempotent: a second write-open finds the stamp at current and does not + // re-run the sweep or error. + GraphNamespacePublisher::new(uri, None) + .publish(&[], &expected) + .await + .unwrap(); + let final_ds = open_manifest_dataset(uri, None).await.unwrap(); + assert_eq!( + super::migrations::read_stamp(&final_ds), + super::migrations::INTERNAL_MANIFEST_SCHEMA_VERSION, + ); +} + #[tokio::test] async fn test_publish_rejects_manifest_stamped_at_future_version() { let dir = tempfile::tempdir().unwrap(); diff --git a/crates/omnigraph/src/db/mod.rs b/crates/omnigraph/src/db/mod.rs index d0b292f..13e1c74 100644 --- a/crates/omnigraph/src/db/mod.rs +++ b/crates/omnigraph/src/db/mod.rs @@ -3,7 +3,6 @@ pub mod graph_coordinator; pub mod manifest; mod omnigraph; mod recovery_audit; -mod run_registry; mod schema_state; pub(crate) mod write_queue; @@ -13,9 +12,8 @@ pub use manifest::{Snapshot, SubTableEntry, SubTableUpdate}; pub(crate) use omnigraph::ensure_public_branch_ref; pub use omnigraph::{ CleanupPolicyOptions, InitOptions, MergeOutcome, Omnigraph, OpenMode, SchemaApplyOptions, - SchemaApplyResult, TableCleanupStats, TableOptimizeStats, + SchemaApplyResult, SkipReason, TableCleanupStats, TableOptimizeStats, }; -pub(crate) use run_registry::is_internal_run_branch; pub(crate) const SCHEMA_APPLY_LOCK_BRANCH: &str = "__schema_apply_lock__"; @@ -69,5 +67,8 @@ pub(crate) fn is_schema_apply_lock_branch(name: &str) -> 
bool { } pub(crate) fn is_internal_system_branch(name: &str) -> bool { - is_internal_run_branch(name) || is_schema_apply_lock_branch(name) + // Legacy `__run__*` staging branches (Run state machine, removed MR-771) + // are swept off `__manifest` by the v2→v3 internal-schema migration, so the + // only internal branch the engine still creates is the schema-apply lock. + is_schema_apply_lock_branch(name) } diff --git a/crates/omnigraph/src/db/omnigraph.rs b/crates/omnigraph/src/db/omnigraph.rs index 5c92ac3..ba2b70e 100644 --- a/crates/omnigraph/src/db/omnigraph.rs +++ b/crates/omnigraph/src/db/omnigraph.rs @@ -33,7 +33,7 @@ mod optimize; mod schema_apply; mod table_ops; -pub use optimize::{CleanupPolicyOptions, TableCleanupStats, TableOptimizeStats}; +pub use optimize::{CleanupPolicyOptions, SkipReason, TableCleanupStats, TableOptimizeStats}; pub use schema_apply::SchemaApplyOptions; use super::commit_graph::GraphCommit; @@ -67,6 +67,12 @@ pub struct SchemaApplyResult { pub steps: Vec, } +#[derive(Debug, Clone)] +pub struct SchemaApplyPreview { + pub plan: SchemaMigrationPlan, + pub catalog: Catalog, +} + /// Top-level handle to an Omnigraph database. /// /// An Omnigraph is a Lance-native graph database with git-style branching. @@ -340,6 +346,16 @@ impl Omnigraph { mode: OpenMode, ) -> Result { let root = normalize_root_uri(uri)?; + // Apply pending internal-schema migrations before the coordinator reads + // branch state, so `branch_list` and the schema-apply blocking-branch + // checks observe the post-migration graph — notably the v2→v3 sweep of + // legacy `__run__*` staging branches (MR-770). ReadWrite only: a + // read-only open must not trigger object-store writes, so a read-only + // open of an unmigrated legacy graph still lists `__run__*` until its + // first read-write open (an accepted, documented limitation).
+ if matches!(mode, OpenMode::ReadWrite) { + crate::db::manifest::migrate_on_open(&root).await?; + } // Open the coordinator first so the schema-staging recovery sweep can // compare its snapshot against any leftover staging files. let mut coordinator = GraphCoordinator::open(&root, Arc::clone(&storage)).await?; @@ -493,6 +509,14 @@ impl Omnigraph { schema_apply::plan_schema(self, desired_schema_source, options).await } + pub async fn preview_schema_apply_with_options( + &self, + desired_schema_source: &str, + options: SchemaApplyOptions, + ) -> Result { + schema_apply::preview_schema_apply(self, desired_schema_source, options).await + } + pub async fn apply_schema(&self, desired_schema_source: &str) -> Result { self.apply_schema_as(desired_schema_source, SchemaApplyOptions::default(), None) .await @@ -523,7 +547,28 @@ impl Omnigraph { options: SchemaApplyOptions, actor: Option<&str>, ) -> Result { - schema_apply::apply_schema(self, desired_schema_source, options, actor).await + self.apply_schema_as_with_catalog_check(desired_schema_source, options, actor, |_| Ok(())) + .await + } + + pub async fn apply_schema_as_with_catalog_check( + &self, + desired_schema_source: &str, + options: SchemaApplyOptions, + actor: Option<&str>, + validate_catalog: F, + ) -> Result + where + F: FnOnce(&Catalog) -> Result<()>, + { + schema_apply::apply_schema( + self, + desired_schema_source, + options, + actor, + validate_catalog, + ) + .await } pub(crate) async fn ensure_schema_apply_idle(&self, operation: &str) -> Result<()> { @@ -1058,11 +1103,14 @@ impl Omnigraph { Ok(()) } - async fn cleanup_deleted_branch_tables( - &self, - branch: &str, - owned_tables: &[(String, String)], - ) -> Result<()> { + /// Best-effort reclaim of the per-table Lance forks a just-deleted branch + /// owned. Runs AFTER the manifest authority flip, so the branch is already + /// gone and these forks are unreachable orphans. 
A failure here (transient + /// object-store error, the `branch_delete.before_table_cleanup` failpoint) + /// is logged and swallowed: the `cleanup` reconciler is the guaranteed + /// backstop that converges any leftover orphan. Uses `force_delete_branch` + /// so a partially-reclaimed retry is idempotent. + async fn cleanup_deleted_branch_tables(&self, branch: &str, owned_tables: &[(String, String)]) { let mut seen_paths = HashSet::new(); let mut cleanup_targets = owned_tables .iter() @@ -1073,15 +1121,21 @@ impl Omnigraph { for (table_key, table_path) in cleanup_targets { let dataset_uri = self.table_store.dataset_uri(&table_path); - if let Err(err) = self.table_store.delete_branch(&dataset_uri, branch).await { - return Err(OmniError::manifest_internal(format!( - "branch '{}' was deleted but cleanup failed for {}: {}", - branch, table_key, err - ))); + let outcome = match crate::failpoints::maybe_fail("branch_delete.before_table_cleanup") + { + Ok(()) => self.table_store.force_delete_branch(&dataset_uri, branch).await, + Err(injected) => Err(injected), + }; + if let Err(err) = outcome { + tracing::warn!( + target: "omnigraph::branch_delete::cleanup", + branch = %branch, + table = %table_key, + error = %err, + "best-effort fork reclaim failed; cleanup will reconcile the orphan", + ); } } - - Ok(()) } async fn delete_branch_storage_only(&self, branch: &str) -> Result<()> { @@ -1105,9 +1159,12 @@ impl Omnigraph { .map(|entry| (entry.table_key.clone(), entry.table_path.clone())) .collect::>(); + // Authority flip (+ best-effort commit-graph reclaim) — must succeed. self.coordinator.write().await.branch_delete(branch).await?; + // Best-effort per-table fork reclaim; cleanup reconciles any leftover.
self.cleanup_deleted_branch_tables(branch, &owned_tables) - .await + .await; + Ok(()) } pub(crate) fn normalize_branch_name(branch: &str) -> Result> { @@ -1444,12 +1501,6 @@ pub(crate) fn normalize_branch_name(branch: &str) -> Result> { } pub(crate) fn ensure_public_branch_ref(branch: &str, operation: &str) -> Result<()> { - if super::is_internal_run_branch(branch) { - return Err(OmniError::manifest(format!( - "{} does not allow internal run ref '{}'", - operation, branch - ))); - } if is_internal_system_branch(branch) { return Err(OmniError::manifest(format!( "{} does not allow internal system ref '{}'", @@ -1853,7 +1904,6 @@ fn json_value_from_array(array: &dyn Array, row: usize) -> Result Company #[tokio::test] async fn test_apply_schema_succeeds_after_load() { // Historical: schema apply used to be blocked by leftover - // `__run__` branches. A defense-in-depth filter now skips - // internal system branches, and run branches were made - // ephemeral on every terminal state — so in practice no - // `__run__` branch survives publish. The filter still guards - // the invariant. + // `__run__` branches. The Run state machine was removed in + // MR-771, so a fresh graph never creates a `__run__` branch; + // legacy ones are swept by the v2→v3 manifest migration. This + // asserts the invariant a current graph upholds: publish leaves + // no `__run__` branch behind, so schema apply proceeds.
let dir = tempfile::tempdir().unwrap(); let uri = dir.path().to_str().unwrap(); let mut db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap(); @@ -2210,8 +2260,8 @@ edge WorksAt: Person -> Company let all_branches = db.coordinator.read().await.all_branches().await.unwrap(); assert!( - !all_branches.iter().any(|b| is_internal_run_branch(b)), - "run branch should be deleted after publish, got: {:?}", + !all_branches.iter().any(|b| b.starts_with("__run__")), + "no __run__ branch should exist after publish, got: {:?}", all_branches ); @@ -2223,6 +2273,56 @@ edge WorksAt: Person -> Company assert!(result.applied, "schema apply should have applied"); } + /// Regression (MR-770): a pre-v0.4.0 graph that still carries a stale + /// `__run__*` branch on `__manifest` must not block schema apply. The + /// v2→v3 sweep runs in `Omnigraph::open(ReadWrite)` — before the + /// schema-apply blocking-branch check — so apply succeeds with no + /// intervening publish. + /// + /// Confirmed to fail before the open-time migration landed: the reopened + /// graph still listed `__run__legacy`, and `apply_schema` returned + /// "found non-main branches: __run__legacy". + #[tokio::test] + async fn legacy_run_branch_is_swept_on_open_and_does_not_block_schema_apply() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap(); + + // Synthesize a legacy graph: a stale `__run__` branch on `__manifest` + // plus the manifest stamp rewound to v2 (pre-sweep). + db.branch_create("__run__legacy").await.unwrap(); + drop(db); + { + let mut ds = lance::Dataset::open(&format!("{}/__manifest", uri)) + .await + .unwrap(); + ds.update_schema_metadata([( + "omnigraph:internal_schema_version".to_string(), + Some("2".to_string()), + )]) + .await + .unwrap(); + } + + // Reopen (ReadWrite): the open-time migration must sweep `__run__legacy` + // before any branch-observing code runs.
+ let db = Omnigraph::open(uri).await.unwrap(); + let branches = db.branch_list().await.unwrap(); + assert!( + !branches.iter().any(|b| b.starts_with("__run__")), + "open-time migration must sweep legacy __run__ branches; got {branches:?}", + ); + + // Schema apply must proceed with no intervening publish — the + // blocking-branch check no longer sees `__run__legacy`. + let desired = TEST_SCHEMA.replace( + " age: I32?\n}", + " age: I32?\n nickname: String?\n}", + ); + let result = db.apply_schema(&desired).await.unwrap(); + assert!(result.applied, "schema apply should have applied"); + } + #[tokio::test] async fn test_apply_schema_adds_index_for_existing_property() { let dir = tempfile::tempdir().unwrap(); diff --git a/crates/omnigraph/src/db/omnigraph/optimize.rs b/crates/omnigraph/src/db/omnigraph/optimize.rs index e158dc7..ee39323 100644 --- a/crates/omnigraph/src/db/omnigraph/optimize.rs +++ b/crates/omnigraph/src/db/omnigraph/optimize.rs @@ -8,8 +8,14 @@ //! Two dials: //! //! * `optimize_all_tables` — Lance `compact_files` on every table. Rewrites -//! small fragments into fewer large ones. Non-destructive (creates a new -//! version; old fragments remain reachable via older manifest versions). +//! small fragments into fewer large ones, then **publishes the compacted +//! version to the `__manifest`** so the manifest's `table_version` tracks the +//! compacted Lance HEAD (reads pin the manifest version, so without the +//! publish compaction would be invisible to readers and would break the +//! HEAD-vs-manifest precondition of schema apply / strict writes). Compaction +//! is content-preserving (Lance `Operation::Rewrite` "reorganizes data +//! without semantic modification"), so old fragments remain reachable via +//! older manifest versions until `cleanup` runs. //! * `cleanup_all_tables` — Lance `cleanup_old_versions` on every table. //! Removes manifests (and their unique fragments) older than the configured //! retention.
Destructive to version history — callers should gate this @@ -23,7 +29,9 @@ use std::time::Duration; use chrono::Utc; use futures::stream::StreamExt; use lance::dataset::cleanup::{CleanupPolicy, RemovalStats}; -use lance::dataset::optimize::{CompactionMetrics, CompactionOptions, compact_files}; +use lance::dataset::optimize::{ + CompactionMetrics, CompactionOptions, compact_files, plan_compaction, +}; use super::*; @@ -40,6 +48,20 @@ fn maint_concurrency() -> usize { .unwrap_or(DEFAULT_MAINT_CONCURRENCY) } +/// Whether the installed Lance can compact a dataset that contains blob +/// columns. `false` today: Lance `compact_files` forces +/// `BlobHandling::AllBinary` on the read side, and the blob-v2 struct decoder +/// mis-counts columns ("there were more fields in the schema than provided +/// column indices"), failing even a pristine uniform-V2_2 multi-fragment blob +/// table. Reads are unaffected (queries use descriptor handling). +/// +/// While `false`, [`optimize_all_tables`] skips blob-bearing tables and reports +/// [`SkipReason::BlobColumnsUnsupportedByLance`] instead of aborting the whole +/// sweep. Flip to `true` once the upstream Lance fix ships — the +/// `lance_surface_guards.rs::compact_files_still_fails_on_blob_columns` guard +/// turns red on that bump and forces this flip. Tracked in `docs/dev/lance.md`. +const LANCE_SUPPORTS_BLOB_COMPACTION: bool = false; + /// Retention knobs for [`cleanup_all_tables`]. At least one must be set or /// nothing is cleaned. If both are set, Lance applies them as AND (a manifest /// is kept if it satisfies either — i.e. only manifests older than BOTH the @@ -52,76 +74,314 @@ pub struct CleanupPolicyOptions { pub older_than: Option, } -/// Per-table outcome of `optimize_all_tables`. +/// Why `optimize` did not compact a table. Typed so callers branch on the +/// reason rather than sniffing a string. One variant today, gated by +/// [`LANCE_SUPPORTS_BLOB_COMPACTION`].
+#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[non_exhaustive] +pub enum SkipReason { + /// The table has one or more `Blob` columns. Lance `compact_files` forces + /// `BlobHandling::AllBinary`, which mis-decodes blob-v2 columns; see + /// [`LANCE_SUPPORTS_BLOB_COMPACTION`] and `docs/dev/lance.md`. + BlobColumnsUnsupportedByLance, +} + +impl SkipReason { + /// Stable machine-readable token for serialized output (e.g. CLI `--json`). + /// Once emitted this is part of the output contract — keep it stable. + pub fn as_str(&self) -> &'static str { + match self { + SkipReason::BlobColumnsUnsupportedByLance => "blob_columns_unsupported_by_lance", + } + } +} + +impl std::fmt::Display for SkipReason { + /// Human-readable reason for CLI and log output. + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let msg = match self { + SkipReason::BlobColumnsUnsupportedByLance => { + "blob columns — Lance compaction unsupported" + } + }; + f.write_str(msg) + } +} + +/// Per-table outcome of `optimize_all_tables`. This is a returned result type, +/// not built by callers, so it is `#[non_exhaustive]`: future fields stay +/// non-breaking and downstream code reads fields rather than constructing it. #[derive(Debug, Clone)] +#[non_exhaustive] pub struct TableOptimizeStats { pub table_key: String, /// Number of source fragments that were rewritten by Lance. pub fragments_removed: usize, /// Number of new, larger fragments Lance produced. pub fragments_added: usize, - /// Did this table get a new Lance manifest version from the compaction? + /// Did this table get a new manifest version from the compaction? True when + /// compaction ran and its compacted version was published to `__manifest`. pub committed: bool, + /// `Some(reason)` if this table was deliberately not compacted. When set, + /// `fragments_removed == 0`, `fragments_added == 0`, and `!committed`. + pub skipped: Option, } -/// Per-table outcome of `cleanup_all_tables`.
+impl TableOptimizeStats { + /// Stat for a table that Lance actually compacted. + fn compacted(table_key: String, metrics: &CompactionMetrics, committed: bool) -> Self { + Self { + table_key, + fragments_removed: metrics.fragments_removed, + fragments_added: metrics.fragments_added, + committed, + skipped: None, + } + } + + /// Stat for a table that was deliberately skipped (compaction not attempted). + fn skipped(table_key: String, reason: SkipReason) -> Self { + Self { + table_key, + fragments_removed: 0, + fragments_added: 0, + committed: false, + skipped: Some(reason), + } + } +} + +/// Per-table outcome of `cleanup_all_tables`. `error` is `Some` when this +/// table's version GC failed; cleanup is fault-isolated per table, so a single +/// table's failure is recorded here rather than aborting the whole sweep. #[derive(Debug, Clone)] pub struct TableCleanupStats { pub table_key: String, pub bytes_removed: u64, pub old_versions_removed: u64, + pub error: Option, } -/// Run Lance `compact_files` on every node + edge table on `main`. -/// Tables run in parallel (bounded concurrency). +/// Run Lance `compact_files` on every node + edge table on `main`, publishing +/// each compacted table's new version to the `__manifest`. Tables run in +/// parallel (bounded concurrency); each is fault-isolated only at the Lance +/// level — a publish error is propagated (the recovery sidecar covers it). pub async fn optimize_all_tables(db: &Omnigraph) -> Result> { db.ensure_schema_state_valid().await?; db.ensure_schema_apply_idle("optimize").await?; + // Refuse on an unrecovered graph. A pending recovery sidecar means a failed + // write left partial state that the open-time sweep must resolve (roll + // forward/back) first; compacting + publishing a table covered by such a + // sidecar could commit a partial write the sweep would roll back. Reopen the + // graph to run recovery, then re-run optimize.
+ if !crate::db::manifest::list_sidecars(db.root_uri(), db.storage_adapter()) + .await? + .is_empty() + { + return Err(OmniError::manifest_conflict( + "optimize requires a clean recovery state; reopen the graph to run the \ + recovery sweep before optimizing", + )); + } + let resolved = db.resolved_branch_target(None).await?; let snapshot = resolved.snapshot; - let table_tasks: Vec<_> = all_table_keys(&db.catalog()) - .into_iter() - .filter_map(|table_key| { - let entry = snapshot.entry(&table_key)?; + // Compute per-table state (path + whether it has blob columns) up front, in + // a scope that drops the catalog handle before the async stream starts. + let table_tasks: Vec<(String, String, bool)> = { + let catalog = db.catalog(); + let mut tasks = Vec::new(); + for table_key in all_table_keys(&catalog) { + let Some(entry) = snapshot.entry(&table_key) else { + continue; + }; let full_path = format!("{}/{}", db.root_uri, entry.table_path); - Some((table_key, full_path)) - }) - .collect(); + let has_blob = !blob_properties_for_table_key(&catalog, &table_key)?.is_empty(); + tasks.push((table_key, full_path, has_blob)); + } + tasks + }; if table_tasks.is_empty() { return Ok(Vec::new()); } let concurrency = maint_concurrency().min(table_tasks.len()).max(1); - let table_store = &db.table_store; let stats: Vec> = futures::stream::iter(table_tasks.into_iter()) - .map(|(table_key, full_path)| async move { - let mut ds = table_store - .open_dataset_head_for_write(&table_key, &full_path, None) - .await?; - let version_before = ds.version().version; - let metrics: CompactionMetrics = - compact_files(&mut ds, CompactionOptions::default(), None) - .await - .map_err(|e| OmniError::Lance(e.to_string()))?; - let version_after = ds.version().version; - Ok(TableOptimizeStats { - table_key, - fragments_removed: metrics.fragments_removed, - fragments_added: metrics.fragments_added, - committed: version_after != version_before, - }) + .map(move |(table_key, full_path, has_blob)| async 
move { + optimize_one_table(db, table_key, full_path, has_blob).await }) .buffer_unordered(concurrency) .collect() .await; + // Invalidate caches for any table that published a compaction — done BEFORE + // propagating a sibling table's error, since the published versions are + // durable and reads must observe the new fragment layout (Lance invalidates + // the original row addresses on rewrite). The CSR/CSC graph topology index + // is rebuilt only when an edge table moved. Mirrors schema_apply's + // post-publish invalidation. + let any_committed = stats + .iter() + .any(|s| matches!(s, Ok(st) if st.committed)); + let edge_committed = stats + .iter() + .any(|s| matches!(s, Ok(st) if st.committed && st.table_key.starts_with("edge:"))); + if any_committed { + db.runtime_cache.invalidate_all().await; + if edge_committed { + db.invalidate_graph_index().await; + } + } + stats.into_iter().collect() } +/// Compact one table and publish the compacted version to the `__manifest`. +/// +/// Compaction (`compact_files`) advances the *dataset's* Lance HEAD via a +/// reserve-fragments + rewrite commit, but Lance knows nothing about the +/// `__manifest`. To keep the manifest the single authority for each table's +/// visible version (invariant 2), optimize must publish the compacted version. +/// The Lance-HEAD-before-manifest-publish gap is unavoidable (Lance has no +/// staged/uncommitted compaction), so it is covered by a recovery sidecar like +/// the other multi-commit writers; roll-forward is always safe because +/// compaction is content-preserving. +async fn optimize_one_table( + db: &Omnigraph, + table_key: String, + full_path: String, + has_blob: bool, +) -> Result { + // Lance `compact_files` mis-decodes blob-v2 columns under the forced + // `BlobHandling::AllBinary` read (see LANCE_SUPPORTS_BLOB_COMPACTION). Skip + // blob-bearing tables and report it rather than aborting the whole sweep.
+ if has_blob && !LANCE_SUPPORTS_BLOB_COMPACTION { + tracing::warn!( + target: "omnigraph::optimize", + table = %table_key, + "skipping compaction: table has blob columns the current Lance \ + cannot rewrite (blob-v2 AllBinary decode bug); other tables \ + unaffected — rerun after the Lance fix", + ); + return Ok(TableOptimizeStats::skipped( + table_key, + SkipReason::BlobColumnsUnsupportedByLance, + )); + } + + // Serialize the whole compact→publish against concurrent mutations on this + // (table, main): compaction is a Rewrite op that retryable-conflicts with a + // concurrent Merge/Update/Delete on overlapping fragments, and an + // interleaved write would also move the manifest version out from under the + // CAS below. Holding the queue makes the CAS baseline read under it exact. + let _guard = db + .write_queue() + .acquire_many(&[(table_key.clone(), None)]) + .await; + + let mut ds = db + .table_store + .open_dataset_head_for_write(&table_key, &full_path, None) + .await?; + + // CAS baseline: the table's current manifest version, read under the queue + // (in-memory coordinator snapshot, no storage I/O — stable for this section). + let expected_version = db + .snapshot() + .await + .entry(&table_key) + .map(|e| e.table_version) + .ok_or_else(|| OmniError::manifest(format!("no manifest entry for {}", table_key)))?; + + // Precise "will it compact?" check — `plan_compaction` also accounts for + // deletion materialization (which can rewrite even a single fragment). A + // steady-state already-compacted table yields an empty plan and is never + // pinned in a sidecar (a zero-commit pin would classify NoMovement on + // recovery and force an all-or-nothing rollback). There is no drift to + // reconcile here: optimize runs only on a recovered graph (the pending- + // sidecar guard above), and recovery roll-back now publishes, so + // `HEAD == manifest` holds going in.
+ let options = CompactionOptions::default(); + let plan = plan_compaction(&ds, &options) + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + if plan.num_tasks() == 0 { + return Ok(TableOptimizeStats::compacted( + table_key, + &CompactionMetrics::default(), + false, + )); + } + + // Phase A: recovery sidecar BEFORE compaction advances the Lance HEAD, so a + // crash before the manifest publish rolls forward on next open. + let sidecar = crate::db::manifest::new_sidecar( + crate::db::manifest::SidecarKind::Optimize, + None, + // optimize is system-attributed (no `optimize_as` actor API today). + None, + vec![crate::db::manifest::SidecarTablePin { + table_key: table_key.clone(), + table_path: full_path.clone(), + expected_version, + // Lower bound β€” compaction commits Nβ‰₯1 versions (reserve + rewrite); + // the classifier loose-matches SidecarKind::Optimize. + post_commit_pin: expected_version + 1, + table_branch: None, + }], + ); + let handle = + crate::db::manifest::write_sidecar(db.root_uri(), db.storage_adapter(), &sidecar).await?; + + // Phase B: compaction (reserve-fragments + rewrite commits advance HEAD). + let version_before = ds.version().version; + let metrics: CompactionMetrics = compact_files(&mut ds, options, None) + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + let version_after = ds.version().version; + let committed = version_after != version_before; + + // Pin the per-writer Phase B β†’ Phase C residual for optimize: Lance HEAD has + // advanced but the manifest publish below hasn't run. + crate::failpoints::maybe_fail("optimize.post_phase_b_pre_manifest_commit")?; + + // Phase C: publish the compacted version to the manifest (one CAS commit, + // expected = the version observed under the queue). On failure the sidecar + // is intentionally left for the open-time recovery sweep to roll forward. 
+ if committed { + let state = db.table_store.table_state(&full_path, &ds).await?; + let update = crate::db::SubTableUpdate { + table_key: table_key.clone(), + table_version: state.version, + table_branch: None, + row_count: state.row_count, + version_metadata: state.version_metadata, + }; + let mut expected = std::collections::HashMap::new(); + expected.insert(table_key.clone(), expected_version); + db.coordinator + .write() + .await + .commit_updates_with_actor_with_expected(&[update], &expected, None) + .await?; + } + + // Phase D: delete the sidecar (best-effort; recovery resolves a leftover). + if let Err(err) = crate::db::manifest::delete_sidecar(&handle, db.storage_adapter()).await { + tracing::warn!( + error = %err, + operation_id = handle.operation_id.as_str(), + "optimize recovery sidecar cleanup failed; next open's recovery sweep will resolve it" + ); + } + + Ok(TableOptimizeStats::compacted(table_key, &metrics, committed)) +} + /// Run Lance `cleanup_old_versions` on every node + edge table on `main`, /// using [`CleanupPolicyOptions`]. The latest manifest is always preserved /// regardless (Lance invariant). @@ -138,6 +398,26 @@ pub async fn cleanup_all_tables( db.ensure_schema_state_valid().await?; db.ensure_schema_apply_idle("cleanup").await?; + // Reclaim orphaned branch forks (from an incomplete prior `branch_delete`) + // before version GC. Authority-derived and idempotent; the eager + // best-effort reclaim in `branch_delete` covers the common case, this is + // the guaranteed backstop. Logged for observability. 
+ let reconciled = reconcile_orphaned_branches(db).await?; + if !reconciled.reclaimed.is_empty() { + tracing::info!( + count = reconciled.reclaimed.len(), + reclaimed = ?reconciled.reclaimed, + "cleanup reconciled orphaned branch forks" + ); + } + if !reconciled.failures.is_empty() { + tracing::warn!( + count = reconciled.failures.len(), + failures = ?reconciled.failures, + "cleanup could not reconcile some orphaned forks; will retry next cleanup" + ); + } + let before_timestamp = options.older_than.map(|d| Utc::now() - d); let keep_versions = options.keep_versions; @@ -160,36 +440,205 @@ pub async fn cleanup_all_tables( let concurrency = maint_concurrency().min(table_tasks.len()).max(1); let table_store = &db.table_store; - let results: Vec> = futures::stream::iter(table_tasks.into_iter()) + // Fault-isolated per table: a single table's GC failure is recorded on its + // stats row (`error: Some`) and logged, never aborting the healthy tables. + // cleanup is the convergence backstop, so it must do as much as it can and + // converge on re-run rather than fail wholesale (invariant 13). 
+ let results: Vec = futures::stream::iter(table_tasks.into_iter()) .map(|(table_key, full_path)| async move { - let ds = table_store - .open_dataset_head_for_write(&table_key, &full_path, None) - .await?; - let before_version = keep_versions - .map(|n| ds.version().version.saturating_sub(n as u64)) - .filter(|v| *v > 0); - let policy = CleanupPolicy { - before_timestamp, - before_version, - delete_unverified: false, - error_if_tagged_old_versions: false, - clean_referenced_branches: false, - delete_rate_limit: None, - }; - let removed: RemovalStats = lance::dataset::cleanup::cleanup_old_versions(&ds, policy) - .await - .map_err(|e| OmniError::Lance(e.to_string()))?; - Ok(TableCleanupStats { - table_key, - bytes_removed: removed.bytes_removed, - old_versions_removed: removed.old_versions, - }) + let outcome: Result = async { + crate::failpoints::maybe_fail("cleanup.table_gc")?; + let ds = table_store + .open_dataset_head_for_write(&table_key, &full_path, None) + .await?; + let before_version = keep_versions + .map(|n| ds.version().version.saturating_sub(n as u64)) + .filter(|v| *v > 0); + let policy = CleanupPolicy { + before_timestamp, + before_version, + delete_unverified: false, + error_if_tagged_old_versions: false, + clean_referenced_branches: false, + delete_rate_limit: None, + }; + lance::dataset::cleanup::cleanup_old_versions(&ds, policy) + .await + .map_err(|e| OmniError::Lance(e.to_string())) + } + .await; + match outcome { + Ok(removed) => TableCleanupStats { + table_key, + bytes_removed: removed.bytes_removed, + old_versions_removed: removed.old_versions, + error: None, + }, + Err(err) => { + tracing::warn!( + target: "omnigraph::cleanup", + table = %table_key, + error = %err, + "version GC failed for table; other tables unaffected", + ); + TableCleanupStats { + table_key, + bytes_removed: 0, + old_versions_removed: 0, + error: Some(err.to_string()), + } + } + } }) .buffer_unordered(concurrency) .collect() .await; - results.into_iter().collect() + 
Ok(results) +} + +/// Outcome of [`reconcile_orphaned_branches`]: the `(owner, branch)` pairs +/// reclaimed and the `(owner, error)` pairs that failed, where `owner` is a +/// table key (e.g. `node:Person`) or `"_graph_commits"`. Per-owner failures are +/// isolated and recorded here, not propagated β€” the next reconcile converges. +#[derive(Debug, Clone, Default)] +pub struct BranchReconcileStats { + pub reclaimed: Vec<(String, String)>, + pub failures: Vec<(String, String)>, +} + +/// Drop every per-table and commit-graph Lance branch that the manifest no +/// longer references. +/// +/// Orphaned forks arise when a `branch_delete` flips the manifest authority +/// (atomic) but a downstream best-effort reclaim does not complete. They are +/// unreachable through any snapshot β€” no manifest entry can name them β€” yet +/// they pin their `tree/{branch}/` storage and can block reusing the branch +/// name. This is the guaranteed convergence backstop: it is idempotent and +/// derived purely from the manifest authority, so it no-ops once everything is +/// reconciled, and it would harmlessly find nothing if a future Lance atomic +/// multi-dataset branch op prevented orphans from forming. +/// +/// The keep-set is the full (unfiltered) manifest branch list, so system +/// branches' forks are never reclaimed; `main`/default is not a named Lance +/// branch and so is never a candidate. Referencing children are dropped before +/// parents (Lance refuses to delete a referenced parent) by ordering longest +/// branch names first. +pub async fn reconcile_orphaned_branches(db: &Omnigraph) -> Result { + use std::collections::HashSet; + + let keep: HashSet = db + .coordinator + .read() + .await + .all_branches() + .await? 
+ .into_iter() + .collect(); + + let resolved = db.resolved_branch_target(None).await?; + let snapshot = resolved.snapshot; + let table_targets: Vec<(String, String)> = all_table_keys(&db.catalog()) + .into_iter() + .filter_map(|table_key| { + let entry = snapshot.entry(&table_key)?; + let full_path = format!("{}/{}", db.root_uri, entry.table_path); + Some((table_key, full_path)) + }) + .collect(); + + let mut stats = BranchReconcileStats::default(); + + // Per-table fault isolation: one table's transient failure is recorded and + // logged, never aborting the rest of the sweep. + for (table_key, full_path) in table_targets { + let listed = match db.table_store.list_branches(&full_path).await { + Ok(listed) => listed, + Err(err) => { + tracing::warn!( + target: "omnigraph::cleanup", + table = %table_key, + error = %err, + "listing branches failed during reconcile; skipping table", + ); + stats.failures.push((table_key.clone(), err.to_string())); + continue; + } + }; + for branch in orphan_branches(listed, &keep) { + let outcome = match crate::failpoints::maybe_fail("cleanup.reconcile_fork") { + Ok(()) => db.table_store.force_delete_branch(&full_path, &branch).await, + Err(injected) => Err(injected), + }; + match outcome { + Ok(()) => stats.reclaimed.push((table_key.clone(), branch)), + Err(err) => { + tracing::warn!( + target: "omnigraph::cleanup", + table = %table_key, + branch = %branch, + error = %err, + "reclaiming orphaned fork failed; will retry next cleanup", + ); + stats.failures.push((table_key.clone(), err.to_string())); + } + } + } + } + + // Commit-graph orphans (best-effort: the dataset may not exist on a graph + // that has never committed; any failure is isolated and retried next time). 
+ if let Err(err) = reconcile_commit_graph_orphans(db, &keep, &mut stats).await { + tracing::warn!( + target: "omnigraph::cleanup", + error = %err, + "commit-graph orphan reconcile failed; will retry next cleanup", + ); + stats.failures.push(("_graph_commits".to_string(), err.to_string())); + } + + Ok(stats) +} + +/// Commit-graph half of [`reconcile_orphaned_branches`], split out so its +/// errors can be isolated. Returns `Ok` when the commit-graph dataset is absent. +async fn reconcile_commit_graph_orphans( + db: &Omnigraph, + keep: &std::collections::HashSet, + stats: &mut BranchReconcileStats, +) -> Result<()> { + let commits_uri = crate::db::commit_graph::graph_commits_uri(db.root_uri()); + if !db.storage_adapter().exists(&commits_uri).await? { + return Ok(()); + } + let mut commit_graph = crate::db::commit_graph::CommitGraph::open(db.root_uri()).await?; + for branch in orphan_branches(commit_graph.list_branches().await?, keep) { + match commit_graph.force_delete_branch(&branch).await { + Ok(()) => stats.reclaimed.push(("_graph_commits".to_string(), branch)), + Err(err) => { + tracing::warn!( + target: "omnigraph::cleanup", + branch = %branch, + error = %err, + "reclaiming orphaned commit-graph branch failed; will retry next cleanup", + ); + stats.failures.push(("_graph_commits".to_string(), err.to_string())); + } + } + } + Ok(()) +} + +/// Filter `present` Lance branches down to those absent from the manifest +/// `keep` set, ordered children-before-parents (longest name first) so Lance's +/// referenced-parent `RefConflict` cannot block reclamation. 
+fn orphan_branches(present: Vec, keep: &std::collections::HashSet) -> Vec { + let mut orphans: Vec = present + .into_iter() + .filter(|branch| !keep.contains(branch)) + .collect(); + orphans.sort_by(|a, b| b.len().cmp(&a.len()).then_with(|| a.cmp(b))); + orphans } fn all_table_keys(catalog: &omnigraph_compiler::catalog::Catalog) -> Vec { diff --git a/crates/omnigraph/src/db/omnigraph/schema_apply.rs b/crates/omnigraph/src/db/omnigraph/schema_apply.rs index 0dcf0f9..7cb3193 100644 --- a/crates/omnigraph/src/db/omnigraph/schema_apply.rs +++ b/crates/omnigraph/src/db/omnigraph/schema_apply.rs @@ -48,57 +48,24 @@ pub(super) async fn plan_schema( Ok(plan) } -pub(super) async fn apply_schema( - db: &Omnigraph, - desired_schema_source: &str, - options: SchemaApplyOptions, - actor: Option<&str>, -) -> Result { - // Engine-layer policy gate (MR-722 chassis core). - // - // Fires BEFORE acquiring the schema-apply lock or doing any other - // work. When no PolicyChecker is installed this is a no-op and - // the apply path behaves exactly as it did before MR-722. When - // a PolicyChecker IS installed and the actor is None, this is a - // hard error β€” see Omnigraph::enforce's docstring for the - // forget-the-actor-footgun reasoning. - // - // Scope is TargetBranch("main") to match the HTTP-layer convention - // for SchemaApply: branch=None, target_branch=Some("main"). Cedar - // policies in the wild use `target_branch_scope: protected` to - // gate schema applies, so the engine-layer call has to set the - // target_branch shape that activates that predicate. Wrong scope - // here = silent policy mismatch with HTTP. See - // `omnigraph_policy::ResourceScope::to_branch_pair` for the mapping. 
- db.enforce( - omnigraph_policy::PolicyAction::SchemaApply, - &omnigraph_policy::ResourceScope::TargetBranch("main".to_string()), - actor, - )?; - - acquire_schema_apply_lock(db).await?; - let result = apply_schema_with_lock(db, desired_schema_source, options).await; - let release_result = release_schema_apply_lock(db).await; - match (result, release_result) { - (Ok(result), Ok(())) => Ok(result), - (Ok(_), Err(err)) => Err(err), - (Err(err), Ok(())) => Err(err), - (Err(err), Err(_)) => Err(err), - } +struct PlannedSchemaApply { + plan: SchemaMigrationPlan, + desired_ir: SchemaIR, + desired_catalog: Catalog, } -pub(super) async fn apply_schema_with_lock( +async fn plan_schema_for_apply( db: &Omnigraph, desired_schema_source: &str, options: SchemaApplyOptions, -) -> Result { +) -> Result { db.ensure_schema_state_valid().await?; let branches = db.coordinator.read().await.all_branches().await?; - // Skip `main` and internal system branches. The schema-apply lock branch - // is excluded because it is the cluster-wide schema-apply serializer. - // `__run__*` branches are no longer created; the filter remains as - // defense-in-depth for legacy graphs with leftover staging branches. - // A future production sweep will let this guard go. + // Skip `main` and internal system branches (the schema-apply lock branch, + // the cluster-wide schema-apply serializer). Legacy `__run__*` staging + // branches were swept off `__manifest` by the v2β†’v3 migration that runs in + // `Omnigraph::open(ReadWrite)` before this check (MR-770), so they no + // longer appear here. 
let blocking_branches = branches .into_iter() .filter(|branch| branch != "main" && !is_internal_system_branch(branch)) @@ -123,6 +90,87 @@ pub(super) async fn apply_schema_with_lock( .unwrap_or_else(|| "unsupported schema migration plan".to_string()); return Err(OmniError::manifest(message)); } + + let mut desired_catalog = build_catalog_from_ir(&desired_ir)?; + fixup_blob_schemas(&mut desired_catalog); + Ok(PlannedSchemaApply { + plan, + desired_ir, + desired_catalog, + }) +} + +pub(super) async fn preview_schema_apply( + db: &Omnigraph, + desired_schema_source: &str, + options: SchemaApplyOptions, +) -> Result { + let planned = plan_schema_for_apply(db, desired_schema_source, options).await?; + Ok(SchemaApplyPreview { + plan: planned.plan, + catalog: planned.desired_catalog, + }) +} + +pub(super) async fn apply_schema( + db: &Omnigraph, + desired_schema_source: &str, + options: SchemaApplyOptions, + actor: Option<&str>, + validate_catalog: F, +) -> Result +where + F: FnOnce(&Catalog) -> Result<()>, +{ + // Engine-layer policy gate (MR-722 chassis core). + // + // Fires BEFORE acquiring the schema-apply lock or doing any other + // work. When no PolicyChecker is installed this is a no-op and + // the apply path behaves exactly as it did before MR-722. When + // a PolicyChecker IS installed and the actor is None, this is a + // hard error β€” see Omnigraph::enforce's docstring for the + // forget-the-actor-footgun reasoning. + // + // Scope is TargetBranch("main") to match the HTTP-layer convention + // for SchemaApply: branch=None, target_branch=Some("main"). Cedar + // policies in the wild use `target_branch_scope: protected` to + // gate schema applies, so the engine-layer call has to set the + // target_branch shape that activates that predicate. Wrong scope + // here = silent policy mismatch with HTTP. See + // `omnigraph_policy::ResourceScope::to_branch_pair` for the mapping. 
+ db.enforce( + omnigraph_policy::PolicyAction::SchemaApply, + &omnigraph_policy::ResourceScope::TargetBranch("main".to_string()), + actor, + )?; + + acquire_schema_apply_lock(db).await?; + let result = apply_schema_with_lock(db, desired_schema_source, options, validate_catalog).await; + let release_result = release_schema_apply_lock(db).await; + match (result, release_result) { + (Ok(result), Ok(())) => Ok(result), + (Ok(_), Err(err)) => Err(err), + (Err(err), Ok(())) => Err(err), + (Err(err), Err(_)) => Err(err), + } +} + +pub(super) async fn apply_schema_with_lock( + db: &Omnigraph, + desired_schema_source: &str, + options: SchemaApplyOptions, + validate_catalog: F, +) -> Result +where + F: FnOnce(&Catalog) -> Result<()>, +{ + let planned = plan_schema_for_apply(db, desired_schema_source, options).await?; + validate_catalog(&planned.desired_catalog)?; + let PlannedSchemaApply { + plan, + desired_ir, + desired_catalog, + } = planned; if plan.steps.is_empty() { return Ok(SchemaApplyResult { supported: true, @@ -132,9 +180,6 @@ pub(super) async fn apply_schema_with_lock( }); } - let mut desired_catalog = build_catalog_from_ir(&desired_ir)?; - fixup_blob_schemas(&mut desired_catalog); - let snapshot = db.snapshot().await; let base_manifest_version = snapshot.version(); let mut added_tables = BTreeSet::new(); diff --git a/crates/omnigraph/src/db/omnigraph/table_ops.rs b/crates/omnigraph/src/db/omnigraph/table_ops.rs index 0e89c45..3ed9c43 100644 --- a/crates/omnigraph/src/db/omnigraph/table_ops.rs +++ b/crates/omnigraph/src/db/omnigraph/table_ops.rs @@ -483,6 +483,22 @@ pub(super) async fn open_owned_dataset_for_branch_write( Ok((ds, Some(active_branch.to_string()))) } source_branch => { + crate::failpoints::maybe_fail("fork.before_classify")?; + // Authority check before forking: re-read the live manifest. 
If this + // table is already forked on active_branch, a concurrent first-write + // won the race and our snapshot is stale β€” that is a retryable + // conflict, not an orphan. (A zombie fork is never in the manifest, + // so this only fires for a live concurrent fork.) + let live = db.snapshot_for_branch(Some(active_branch)).await?; + if let Some(entry) = live.entry(table_key) { + if entry.table_branch.as_deref() == Some(active_branch) { + return Err(OmniError::manifest_expected_version_mismatch( + table_key, + entry_version, + entry.table_version, + )); + } + } fork_dataset_from_entry_state( db, table_key, diff --git a/crates/omnigraph/src/db/run_registry.rs b/crates/omnigraph/src/db/run_registry.rs deleted file mode 100644 index ee3d336..0000000 --- a/crates/omnigraph/src/db/run_registry.rs +++ /dev/null @@ -1,16 +0,0 @@ -// The Run state machine has been removed. Mutations now write directly -// to target tables and use the publisher's `expected_table_versions` -// CAS for cross-table OCC; `__run__` staging branches and the -// `_graph_runs.lance` state machine no longer exist. -// -// What remains is the branch-name predicate, kept as a defense-in-depth -// guard against users naming a public branch `__run__*`. A future -// production sweep of legacy `_graph_runs.lance` rows and stale -// `__run__*` branches will let this predicate (and this file) go too. 
- -pub(crate) const INTERNAL_RUN_BRANCH_PREFIX: &str = "__run__"; - -pub(crate) fn is_internal_run_branch(name: &str) -> bool { - name.trim_start_matches('/') - .starts_with(INTERNAL_RUN_BRANCH_PREFIX) -} diff --git a/crates/omnigraph/src/exec/merge.rs b/crates/omnigraph/src/exec/merge.rs index 2e5f32e..eb6c4a3 100644 --- a/crates/omnigraph/src/exec/merge.rs +++ b/crates/omnigraph/src/exec/merge.rs @@ -1087,9 +1087,9 @@ impl Omnigraph { target: &str, actor_id: Option<&str>, ) -> Result { - if is_internal_run_branch(source) || is_internal_run_branch(target) { + if is_internal_system_branch(source) || is_internal_system_branch(target) { return Err(OmniError::manifest(format!( - "branch_merge does not allow internal run refs ('{}' -> '{}')", + "branch_merge does not allow internal system refs ('{}' -> '{}')", source, target ))); } diff --git a/crates/omnigraph/src/exec/mod.rs b/crates/omnigraph/src/exec/mod.rs index 33a7e41..ce72d42 100644 --- a/crates/omnigraph/src/exec/mod.rs +++ b/crates/omnigraph/src/exec/mod.rs @@ -35,7 +35,7 @@ use time::format_description::well_known::Rfc3339; use crate::db::commit_graph::CommitGraph; use crate::db::manifest::ManifestCoordinator; -use crate::db::{MergeOutcome, Omnigraph, is_internal_run_branch}; +use crate::db::{MergeOutcome, Omnigraph, is_internal_system_branch}; use crate::db::{ReadTarget, Snapshot}; use crate::embedding::EmbeddingClient; use crate::error::{MergeConflict, MergeConflictKind, OmniError, Result}; diff --git a/crates/omnigraph/src/loader/mod.rs b/crates/omnigraph/src/loader/mod.rs index cade1f4..d5d74c0 100644 --- a/crates/omnigraph/src/loader/mod.rs +++ b/crates/omnigraph/src/loader/mod.rs @@ -288,21 +288,24 @@ async fn load_jsonl_reader( let mut node_rows: HashMap> = HashMap::new(); let mut edge_rows: HashMap> = HashMap::new(); - for (line_num, line) in reader.lines().enumerate() { - let line = line?; - let line = line.trim(); - if line.is_empty() { - continue; - } - let value: JsonValue = 
serde_json::from_str(line).map_err(|e| { - OmniError::manifest(format!("invalid JSON on line {}: {}", line_num + 1, e)) + // Parse a stream of JSON values. Accepts both compact JSONL (one object + // per line) and pretty-printed JSON where a single object spans multiple + // lines β€” serde's streaming deserializer treats any whitespace (including + // newlines) between top-level values as a separator. + for (idx, parsed) in serde_json::Deserializer::from_reader(reader) + .into_iter::() + .enumerate() + { + let record_num = idx + 1; + let value: JsonValue = parsed.map_err(|e| { + OmniError::manifest(format!("invalid JSON at record {}: {}", record_num, e)) })?; if let Some(type_name) = value.get("type").and_then(|v| v.as_str()) { if !catalog.node_types.contains_key(type_name) { return Err(OmniError::manifest(format!( - "line {}: unknown node type '{}'", - line_num + 1, + "record {}: unknown node type '{}'", + record_num, type_name ))); } @@ -317,8 +320,8 @@ async fn load_jsonl_reader( } else if let Some(edge_name) = value.get("edge").and_then(|v| v.as_str()) { if catalog.lookup_edge_by_name(edge_name).is_none() { return Err(OmniError::manifest(format!( - "line {}: unknown edge type '{}'", - line_num + 1, + "record {}: unknown edge type '{}'", + record_num, edge_name ))); } @@ -326,14 +329,14 @@ async fn load_jsonl_reader( .get("from") .and_then(|v| v.as_str()) .ok_or_else(|| { - OmniError::manifest(format!("line {}: edge missing 'from'", line_num + 1)) + OmniError::manifest(format!("record {}: edge missing 'from'", record_num)) })? .to_string(); let to = value .get("to") .and_then(|v| v.as_str()) .ok_or_else(|| { - OmniError::manifest(format!("line {}: edge missing 'to'", line_num + 1)) + OmniError::manifest(format!("record {}: edge missing 'to'", record_num)) })? 
.to_string(); let data = value @@ -347,8 +350,8 @@ async fn load_jsonl_reader( .push((from, to, data)); } else { return Err(OmniError::manifest(format!( - "line {}: expected 'type' or 'edge' field", - line_num + 1 + "record {}: expected 'type' or 'edge' field", + record_num ))); } } @@ -613,7 +616,7 @@ async fn load_jsonl_reader( } else { // LoadMode::Overwrite keeps the legacy inline-commit path β€” // truncate-then-append doesn't fit the staged shape (see - // `docs/runs.md` "LoadMode::Overwrite residual"). The recovery + // `docs/dev/writes.md` "LoadMode::Overwrite residual"). The recovery // sidecar is not applicable here because the writer doesn't go // through MutationStaging; per-table inline commits + a final // manifest publish handle their own residual via the documented diff --git a/crates/omnigraph/src/table_store.rs b/crates/omnigraph/src/table_store.rs index ddab706..10123b0 100644 --- a/crates/omnigraph/src/table_store.rs +++ b/crates/omnigraph/src/table_store.rs @@ -49,7 +49,7 @@ pub struct DeleteState { /// `exec/mutation.rs`) and the bulk loader (`loader/mod.rs`). The /// intent: defer Lance commits to end-of-query so a mid-query failure /// leaves the touched table at the pre-mutation HEAD instead of -/// drifting ahead. See `docs/runs.md` for the publisher-CAS contract +/// drifting ahead. See `docs/dev/writes.md` for the publisher-CAS contract /// this builds on. /// /// `transaction` is opaque from our side β€” Lance owns its semantics. We @@ -177,6 +177,45 @@ impl TableStore { .map_err(|e| OmniError::Lance(e.to_string())) } + /// List the named Lance branches present on the dataset at `dataset_uri`. + /// The `cleanup` orphan reconciler diffs this against the manifest branch + /// set to find orphaned per-table forks. `main`/default is not a named + /// branch and never appears here. 
+ pub async fn list_branches(&self, dataset_uri: &str) -> Result> { + let ds = Dataset::open(dataset_uri) + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + let branches = ds + .list_branches() + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + Ok(branches.into_keys().collect()) + } + + /// Idempotently drop `branch` from the dataset at `dataset_uri`. + /// + /// Unlike [`delete_branch`](Self::delete_branch), this tolerates an + /// already-absent branch β€” both a missing contents ref (Lance's + /// `force_delete_branch` handles that) and a missing `tree/{branch}/` + /// directory (the local-store `NotFound` quirk pinned by + /// `lance_surface_guards::force_delete_branch_semantics`). Safe to call on a + /// possibly-orphaned or already-reclaimed fork. + /// + /// A branch that still has referencing descendants (`RefConflict`) is NOT + /// tolerated: that is a real ordering error and surfaces as `OmniError::Lance`. + /// Used by the eager best-effort reclaim in `cleanup_deleted_branch_tables` + /// and the `cleanup` orphan reconciler. + pub async fn force_delete_branch(&self, dataset_uri: &str, branch: &str) -> Result<()> { + let mut ds = Dataset::open(dataset_uri) + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + match ds.force_delete_branch(branch).await { + Ok(()) => Ok(()), + Err(lance::Error::RefNotFound { .. }) | Err(lance::Error::NotFound { .. 
}) => Ok(()), + Err(e) => Err(OmniError::Lance(e.to_string())), + } + } + pub async fn open_dataset_at_state( &self, table_path: &str, @@ -243,21 +282,24 @@ impl TableStore { .map_err(|e| OmniError::Lance(e.to_string()))?; self.ensure_expected_version(&source_ds, table_key, source_version)?; - match source_ds + if source_ds .create_branch(target_branch, source_version, None) .await + .is_err() { - Ok(_) => {} - Err(create_err) => match self - .open_dataset_head(dataset_uri, Some(target_branch)) - .await - { - Ok(ds) => { - self.ensure_expected_version(&ds, table_key, source_version)?; - return Ok(ds); - } - Err(_) => return Err(OmniError::Lance(create_err.to_string())), - }, + // The target branch ref already exists. The caller + // (`open_owned_dataset_for_branch_write`) re-reads the live manifest + // before forking and returns a retryable error when a concurrent + // writer legitimately holds the fork, so reaching here means the + // manifest does NOT reference this fork: it is an orphan from an + // incomplete prior `branch_delete`. Surface the actionable cleanup + // error rather than guessing from Lance branch versions. + return Err(OmniError::manifest_conflict(format!( + "branch '{}' has orphaned table state for '{}' from an incomplete \ + prior delete; run `omnigraph cleanup` to reclaim it before reusing \ + this branch name", + target_branch, table_key + ))); } let ds = self @@ -901,7 +943,7 @@ impl TableStore { /// Lift path: either a Lance API extension that lets /// `MergeInsertBuilder` accept additional staged fragments, or an /// in-memory pre-merge here that folds prior staged batches into the - /// input stream. See `docs/runs.md`. + /// input stream. See `docs/dev/writes.md`. 
pub async fn stage_merge_insert( &self, ds: Dataset, diff --git a/crates/omnigraph/tests/composite_flow.rs b/crates/omnigraph/tests/composite_flow.rs index 6c720da..dd41310 100644 --- a/crates/omnigraph/tests/composite_flow.rs +++ b/crates/omnigraph/tests/composite_flow.rs @@ -294,21 +294,19 @@ async fn composite_flow_canonical_lifecycle() { ); // ───────────────────────────────────────────────────────────────── - // Step 10: optimize the post-merge graph β€” verify indices stay - // valid and queryable. + // Step 10: optimize the post-merge graph β€” verify compaction is + // published to the manifest (so the manifest pin tracks the compacted + // Lance HEAD), indices stay valid and queryable, and a post-optimize + // strict write commits. // - // **Known limitation**: `optimize_all_tables` calls Lance - // `compact_files` directly β€” it advances per-table Lance HEAD - // without updating the omnigraph `__manifest` pin. After optimize, - // the next writer's expected_table_versions captures the - // pre-optimize manifest pin, but the publisher's pre-check reads - // a higher version from the manifest dataset (because some other - // path β€” possibly schema-state recovery on reopen β€” wrote a newer - // __manifest row). The `ExpectedVersionMismatch` is benign - // (re-issuing the mutation after a snapshot refresh succeeds), but - // a composite test cannot reliably exercise post-optimize mutations - // until that path is investigated. Coverage of post-optimize - // mutations is left to a focused optimize+cleanup integration test. + // This step used to carry a "Known limitation": `optimize_all_tables` + // ran Lance `compact_files` without publishing the new version to + // `__manifest`, so the manifest pin lagged the Lance HEAD and the next + // strict write / schema apply failed with `ExpectedVersionMismatch` + // ("stale view … refresh and retry") β€” so post-optimize mutations were + // deliberately omitted here. 
optimize now publishes the compacted + // version, and this flow exercises exactly that previously-failing + // write below. // ───────────────────────────────────────────────────────────────── let optimize_stats = db.optimize().await.unwrap(); assert!( @@ -331,6 +329,28 @@ async fn composite_flow_canonical_lifecycle() { "row counts unchanged by optimize" ); + // A strict update on a compacted table is exactly the write that + // failed with "stale view" before optimize published its compaction. + // It must now commit (Alice is one of the seed Persons; an update + // leaves the row count at 6). + let post_optimize_update = mutate_main( + &mut db, + MUTATION_QUERIES, + "set_age", + &mixed_params(&[("$name", "Alice")], &[("$age", 41)]), + ) + .await + .expect("post-optimize strict update must commit β€” optimize published the manifest"); + assert_eq!( + post_optimize_update.affected_nodes, 1, + "post-optimize update must affect exactly Alice" + ); + assert_eq!( + count_rows(&db, "node:Person").await, + 6, + "an update must not change the Person row count" + ); + // ───────────────────────────────────────────────────────────────── // Step 11: cleanup β€” keep last 10 versions, only purge versions // older than 1 hour. With this small test, we have well under 10 @@ -373,14 +393,27 @@ async fn composite_flow_canonical_lifecycle() { branches, ); - // Final query exercise β€” full read path works post-reopen, - // post-cleanup. Post-cleanup mutation is omitted here pending - // resolution of the optimize-vs-manifest-pin interaction documented - // in Step 10. + // Final exercise β€” full read AND write path works post-reopen, + // post-cleanup. (The post-cleanup mutation was previously omitted + // pending resolution of the optimize-vs-manifest-pin interaction in + // Step 10; that is now fixed, so a strict write here must commit.) 
let final_total = query_main(&mut db, TEST_QUERIES, "total_people", &ParamMap::default()) .await .unwrap(); assert!(!final_total.batches().is_empty()); + + let post_reopen_update = mutate_main( + &mut db, + MUTATION_QUERIES, + "set_age", + &mixed_params(&[("$name", "Alice")], &[("$age", 42)]), + ) + .await + .expect("post-reopen, post-cleanup strict update must commit"); + assert_eq!( + post_reopen_update.affected_nodes, 1, + "post-reopen update must affect exactly Alice" + ); } /// Cross-handle sequence that exercises operations after a schema_apply diff --git a/crates/omnigraph/tests/end_to_end.rs b/crates/omnigraph/tests/end_to_end.rs index a0fdb0e..ea11d0e 100644 --- a/crates/omnigraph/tests/end_to_end.rs +++ b/crates/omnigraph/tests/end_to_end.rs @@ -1933,3 +1933,87 @@ query docs_with_tag($tag: String) { "contains-pushdown should return exactly the rows whose tags list contains 'red'" ); } + +// ─── Maintenance in the full lifecycle: optimize (compaction) ──────────────── + +/// `optimize` (Lance compaction) is part of a realistic graph lifecycle: it +/// advances the Lance HEAD and publishes the compacted version to the manifest. +/// The rest of the flow must keep working across that boundary — reads observe +/// the compacted data, strict updates (which check Lance HEAD == manifest +/// version) still commit, inserts still commit, and the state survives a reopen +/// (the open-time recovery sweep finds no leftover drift). Before optimize +/// published its compaction, the manifest lagged the Lance HEAD here and the +/// post-optimize update below failed with "stale view ... refresh and retry". +#[tokio::test] +async fn full_flow_optimize_then_query_update_and_reopen() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap().to_string(); + let mut db = init_and_load(&dir).await; + + // Build several Person fragments so compaction has something to merge.
+ for (name, age) in [("Eve", 40), ("Frank", 41), ("Grace", 42)] { + mutate_main( + &mut db, + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", name)], &[("$age", age)]), + ) + .await + .unwrap(); + } + + let stats = db.optimize().await.unwrap(); + assert!( + stats.iter().any(|s| s.committed), + "a multi-fragment table should have compacted in this flow" + ); + + // Reads observe the compacted data. + let qr = query_main( + &mut db, + TEST_QUERIES, + "get_person", + &params(&[("$name", "Alice")]), + ) + .await + .unwrap(); + assert_eq!(qr.num_rows(), 1); + + // Strict update after optimize commits (previously failed with "stale view" + // because the manifest lagged the compacted Lance HEAD). + let upd = mutate_main( + &mut db, + MUTATION_QUERIES, + "set_age", + &mixed_params(&[("$name", "Alice")], &[("$age", 31)]), + ) + .await + .unwrap(); + assert_eq!(upd.affected_nodes, 1); + + // Insert after optimize also commits. + mutate_main( + &mut db, + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "Ivan")], &[("$age", 50)]), + ) + .await + .unwrap(); + assert_eq!(count_rows(&db, "node:Person").await, 8); // 4 seed + Eve/Frank/Grace + Ivan + + // State survives a reopen — the recovery sweep runs and finds no drift.
+ drop(db); + let reopened = Omnigraph::open(&uri).await.unwrap(); + assert_eq!(count_rows(&reopened, "node:Person").await, 8); + let alice = reopened + .entity_at_target(ReadTarget::branch("main"), "node:Person", "Alice") + .await + .unwrap() + .unwrap(); + assert_eq!( + alice["age"], + serde_json::json!(31), + "Alice's post-optimize age update must persist across reopen" + ); +} diff --git a/crates/omnigraph/tests/failpoints.rs b/crates/omnigraph/tests/failpoints.rs index 5ea71c5..d240108 100644 --- a/crates/omnigraph/tests/failpoints.rs +++ b/crates/omnigraph/tests/failpoints.rs @@ -41,6 +41,452 @@ async fn branch_create_failpoint_triggers() { ); } +// Branch delete flips the manifest authority first, then reclaims the per-table +// forks best-effort. A failure during that reclaim (here, the +// `branch_delete.before_table_cleanup` failpoint, standing in for a transient +// object-store error) must NOT fail the call: the branch is already gone, and +// `cleanup` reconciles the stranded fork. The branch name is reusable after. +#[tokio::test] +async fn branch_delete_partial_failure_converges_via_cleanup() { + let _scenario = FailScenario::setup(); + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap().to_string(); + let mut main = helpers::init_and_load(&dir).await; + + main.branch_create("feature").await.unwrap(); + let mut feature = Omnigraph::open(&uri).await.unwrap(); + helpers::mutate_branch( + &mut feature, + "feature", + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "Eve")], &[("$age", 22)]), + ) + .await + .unwrap(); + drop(feature); + + let person_uri = node_table_uri(&uri, "Person"); + { + let ds = lance::Dataset::open(&person_uri).await.unwrap(); + assert!( + ds.list_branches().await.unwrap().contains_key("feature"), + "precondition: the owned table fork exists before delete" + ); + } + + // Inject a failure during per-table cleanup, AFTER the manifest authority + // flip. 
branch_delete must still succeed (best-effort reclaim). + { + let _fp = ScopedFailPoint::new("branch_delete.before_table_cleanup", "return"); + main.branch_delete("feature").await.expect( + "branch_delete is best-effort after the manifest flip: a cleanup-step \ + failure must not fail the call", + ); + } + + // Authority flipped: the branch is gone. + assert_eq!(main.branch_list().await.unwrap(), vec!["main".to_string()]); + + // The eager reclaim failed, so the orphan is stranded until cleanup. + { + let ds = lance::Dataset::open(&person_uri).await.unwrap(); + assert!( + ds.list_branches().await.unwrap().contains_key("feature"), + "failed eager reclaim should leave the orphan for cleanup to reconcile" + ); + } + + // cleanup converges: the orphan is reclaimed. + main.cleanup(omnigraph::db::CleanupPolicyOptions { + keep_versions: Some(1), + older_than: None, + }) + .await + .unwrap(); + { + let ds = lance::Dataset::open(&person_uri).await.unwrap(); + assert!( + !ds.list_branches().await.unwrap().contains_key("feature"), + "cleanup should reconcile the orphaned fork away" + ); + } + + // The name is reusable after cleanup reclaims the orphan. + main.branch_create("feature").await.unwrap(); + let mut feature2 = Omnigraph::open(&uri).await.unwrap(); + helpers::mutate_branch( + &mut feature2, + "feature", + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "Frank")], &[("$age", 41)]), + ) + .await + .unwrap(); +} + +// Reusing a branch name whose delete left an orphaned fork (before `cleanup` +// reconciles it) must fail with a clear, actionable error pointing at +// `cleanup`, not the opaque `ExpectedVersionMismatch` that leaks from the fork +// path. The recreate itself succeeds; the first write to the previously-forked +// table is where the stale orphan collides. 
+#[tokio::test] +async fn recreate_over_orphaned_fork_before_cleanup_is_actionable() { + let _scenario = FailScenario::setup(); + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap().to_string(); + let mut main = helpers::init_and_load(&dir).await; + + main.branch_create("feature").await.unwrap(); + let mut feature = Omnigraph::open(&uri).await.unwrap(); + helpers::mutate_branch( + &mut feature, + "feature", + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "Eve")], &[("$age", 22)]), + ) + .await + .unwrap(); + drop(feature); + + // Partial delete: leaves the Person fork orphaned (cleanup not yet run). + { + let _fp = ScopedFailPoint::new("branch_delete.before_table_cleanup", "return"); + main.branch_delete("feature").await.unwrap(); + } + + // Recreate the name and write to the previously-forked table WITHOUT a + // cleanup in between. + main.branch_create("feature").await.unwrap(); + let mut feature2 = Omnigraph::open(&uri).await.unwrap(); + let err = helpers::mutate_branch( + &mut feature2, + "feature", + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "Frank")], &[("$age", 41)]), + ) + .await + .expect_err("write should collide with the stale orphaned fork"); + + let msg = err.to_string(); + assert!( + msg.contains("cleanup") + && (msg.contains("orphan") || msg.contains("incomplete prior delete")), + "expected an actionable orphaned-fork error pointing at cleanup, got: {msg}" + ); + assert!( + !msg.contains("expected manifest table version"), + "should not surface the opaque ExpectedVersionMismatch, got: {msg}" + ); +} + +// cleanup is the guaranteed convergence backstop, so one table's transient +// failure must not abort the whole sweep. Inject a one-shot version-GC failure +// for a single table and assert: cleanup still succeeds, the failure is +// surfaced per-table in the returned stats, and the independent reconcile pass +// still reclaimed an orphan. 
+#[tokio::test] +async fn cleanup_isolates_single_table_failure() { + let _scenario = FailScenario::setup(); + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap().to_string(); + let mut db = helpers::init_and_load(&dir).await; + + // Forge an orphaned fork on the Person table (a reconcile target). + let person_uri = node_table_uri(&uri, "Person"); + { + let mut ds = lance::Dataset::open(&person_uri).await.unwrap(); + let base = ds.version().version; + ds.create_branch("ghost", base, None).await.unwrap(); + } + + // One table's version GC fails once; the sweep must isolate it. + let _fp = ScopedFailPoint::new("cleanup.table_gc", "1*return"); + let stats = db + .cleanup(omnigraph::db::CleanupPolicyOptions { + keep_versions: Some(1), + older_than: None, + }) + .await + .expect("a single table's GC failure must not abort cleanup"); + + let errored = stats.iter().filter(|s| s.error.is_some()).count(); + assert_eq!( + errored, 1, + "exactly one table's GC failure should be surfaced in stats, got {errored}" + ); + assert!( + stats.len() >= 4, + "every node+edge table should still appear in the stats" + ); + + // The reconcile pass is independent of the GC failure, so the orphan is gone. + { + let ds = lance::Dataset::open(&person_uri).await.unwrap(); + assert!( + !ds.list_branches().await.unwrap().contains_key("ghost"), + "reconcile should reclaim the orphan despite the GC failure" + ); + } +} + +// Companion to the version-GC isolation test, exercising the OTHER cleanup +// loop: a force-delete failure while reconciling one orphaned fork must be +// isolated (logged, not propagated) so the sweep continues, and a later +// cleanup converges. This is the loop the Devin finding was about. 
+#[tokio::test] +async fn cleanup_isolates_reconcile_failure() { + let _scenario = FailScenario::setup(); + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap().to_string(); + let mut db = helpers::init_and_load(&dir).await; + + // Forge an orphaned fork the reconcile pass will try to reclaim. + let person_uri = node_table_uri(&uri, "Person"); + { + let mut ds = lance::Dataset::open(&person_uri).await.unwrap(); + let base = ds.version().version; + ds.create_branch("ghost", base, None).await.unwrap(); + } + + // Inject a one-shot failure into the reconcile force-delete. The sweep must + // not abort. + { + let _fp = ScopedFailPoint::new("cleanup.reconcile_fork", "1*return"); + db.cleanup(omnigraph::db::CleanupPolicyOptions { + keep_versions: Some(1), + older_than: None, + }) + .await + .expect("a reconcile force-delete failure must not abort cleanup"); + } + // The blocked orphan is still present (the failure was isolated, not retried). + { + let ds = lance::Dataset::open(&person_uri).await.unwrap(); + assert!( + ds.list_branches().await.unwrap().contains_key("ghost"), + "the orphan whose reclaim was injected-to-fail should remain" + ); + } + // A second cleanup with no injected failure converges. + db.cleanup(omnigraph::db::CleanupPolicyOptions { + keep_versions: Some(1), + older_than: None, + }) + .await + .unwrap(); + { + let ds = lance::Dataset::open(&person_uri).await.unwrap(); + assert!( + !ds.list_branches().await.unwrap().contains_key("ghost"), + "the second cleanup should reconcile the orphan" + ); + } +} + +// The cleanup reconciler must reclaim orphaned commit-graph branches, not just +// per-table forks. A delete whose best-effort commit-graph reclaim fails leaves +// a commit-graph orphan; the next cleanup must drop it. 
+#[tokio::test] +async fn cleanup_reclaims_orphaned_commit_graph_branch() { + let _scenario = FailScenario::setup(); + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap().to_string(); + let mut db = helpers::init_and_load(&dir).await; + + db.branch_create("feature").await.unwrap(); + // Delete, failing the commit-graph reclaim → commit-graph "feature" orphan + // (manifest branch gone, commit-graph branch left behind). + { + let _fp = ScopedFailPoint::new("branch_delete.before_commit_graph_reclaim", "return"); + db.branch_delete("feature").await.unwrap(); + } + + let commits_uri = format!("{}/_graph_commits.lance", uri.trim_end_matches('/')); + { + let ds = lance::Dataset::open(&commits_uri).await.unwrap(); + assert!( + ds.list_branches().await.unwrap().contains_key("feature"), + "precondition: the commit-graph branch should be orphaned after the failed reclaim" + ); + } + + db.cleanup(omnigraph::db::CleanupPolicyOptions { + keep_versions: Some(1), + older_than: None, + }) + .await + .unwrap(); + + { + let ds = lance::Dataset::open(&commits_uri).await.unwrap(); + assert!( + !ds.list_branches().await.unwrap().contains_key("feature"), + "cleanup should reclaim the orphaned commit-graph branch" + ); + } +} + +// A branch_delete whose best-effort commit-graph reclaim fails leaves a +// commit-graph "zombie" branch. Recreating that name must heal the zombie and +// succeed (branch_create force-deletes a stale commit-graph ref since the +// manifest branch is created fresh), instead of dying on the leftover ref.
+#[tokio::test] +async fn branch_create_recreates_over_commit_graph_zombie() { + let _scenario = FailScenario::setup(); + let dir = tempfile::tempdir().unwrap(); + let db = Omnigraph::init(dir.path().to_str().unwrap(), helpers::TEST_SCHEMA) + .await + .unwrap(); + + db.branch_create("feature").await.unwrap(); + { + // Fail the best-effort commit-graph reclaim → commit-graph "feature" + // zombie survives the delete (manifest authority still flips). + let _fp = ScopedFailPoint::new("branch_delete.before_commit_graph_reclaim", "return"); + db.branch_delete("feature").await.unwrap(); + } + assert_eq!(db.branch_list().await.unwrap(), vec!["main".to_string()]); + + db.branch_create("feature") + .await + .expect("branch_create should heal the zombie commit-graph branch and succeed"); + assert!( + db.branch_list() + .await + .unwrap() + .contains(&"feature".to_string()) + ); +} + +// branch_create is authority-then-derived: if the derived commit-graph branch +// cannot be created, the manifest branch (the authority) must be rolled back so +// the branch does not half-exist. The existing failpoint fires right after the +// manifest create, standing in for any post-authority failure. +#[tokio::test] +async fn branch_create_rolls_back_manifest_on_commit_graph_failure() { + let _scenario = FailScenario::setup(); + let dir = tempfile::tempdir().unwrap(); + let db = Omnigraph::init(dir.path().to_str().unwrap(), helpers::TEST_SCHEMA) + .await + .unwrap(); + + let err = { + let _fp = ScopedFailPoint::new("branch_create.after_manifest_branch_create", "return"); + db.branch_create("feature").await.unwrap_err() + }; + assert!( + !db.branch_list() + .await + .unwrap() + .contains(&"feature".to_string()), + "branch_create must roll back the manifest branch when the derived \ + commit-graph branch fails, got error: {err}" + ); +} + +// A fork collision must be classified by the manifest authority, not by Lance +// branch versions.
When a concurrent first-write legitimately wins the fork +// race, the loser sees a version mismatch — but that is a stale snapshot, not +// an orphan, so it must be a retryable "refresh and retry", never a misleading +// "run cleanup". +// +// Ordering is made deterministic (no sleeps) via a callback at the fork point: +// `compare_exchange` lets only the FIRST arrival (writer A) record readiness and +// block until released; later arrivals (writer B) fall through. The test waits +// on the readiness flag, lets B win and commit the fork, then releases A. +static FORK_A_AT_POINT: std::sync::atomic::AtomicBool = std::sync::atomic::AtomicBool::new(false); +static FORK_RELEASE_A: std::sync::atomic::AtomicBool = std::sync::atomic::AtomicBool::new(false); + +#[tokio::test(flavor = "multi_thread")] +async fn fork_collision_with_live_concurrent_fork_is_retryable() { + use std::sync::atomic::Ordering::SeqCst; + + let _scenario = FailScenario::setup(); + FORK_A_AT_POINT.store(false, SeqCst); + FORK_RELEASE_A.store(false, SeqCst); + + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap().to_string(); + let main = helpers::init_and_load(&dir).await; + main.branch_create("feature").await.unwrap(); + + // First arrival (A) records readiness and blocks until released; the rest + // (B) fall through immediately. Bounded spin so a mistake can't hang forever.
+ fail::cfg_callback("fork.before_classify", || { + if FORK_A_AT_POINT + .compare_exchange(false, true, SeqCst, SeqCst) + .is_ok() + { + for _ in 0..2000 { + if FORK_RELEASE_A.load(SeqCst) { + break; + } + std::thread::sleep(std::time::Duration::from_millis(5)); + } + } + }) + .unwrap(); + + let uri_a = uri.clone(); + let writer_a = tokio::spawn(async move { + let mut a = Omnigraph::open(&uri_a).await.unwrap(); + helpers::mutate_branch( + &mut a, + "feature", + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "Eve")], &[("$age", 22)]), + ) + .await + }); + + // Wait (bounded) until A is parked at the fork point. + for _ in 0..600 { + if FORK_A_AT_POINT.load(SeqCst) { + break; + } + tokio::time::sleep(std::time::Duration::from_millis(5)).await; + } + assert!( + FORK_A_AT_POINT.load(SeqCst), + "writer A never reached the fork point" + ); + + // B wins the fork and commits it. + let mut b = Omnigraph::open(&uri).await.unwrap(); + helpers::mutate_branch( + &mut b, + "feature", + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "Frank")], &[("$age", 41)]), + ) + .await + .unwrap(); + + // Release A; it resumes, re-reads the manifest, and sees the fork is live. 
+ FORK_RELEASE_A.store(true, SeqCst); + let err = writer_a + .await + .unwrap() + .expect_err("A's stale-snapshot fork should be a retryable conflict"); + fail::remove("fork.before_classify"); + + let msg = err.to_string(); + assert!( + !msg.contains("cleanup"), + "a live concurrent fork must not be misclassified as an orphan, got: {msg}" + ); + assert!( + msg.contains("refresh and retry") || msg.contains("expected manifest table version"), + "expected a retryable stale-view error, got: {msg}" + ); +} + #[tokio::test(flavor = "multi_thread")] async fn graph_publish_failpoint_triggers_before_commit_append() { let _scenario = FailScenario::setup(); @@ -799,7 +1245,7 @@ async fn refresh_defers_rollback_eligible_sidecar_to_next_open() { // the rollback (will use Dataset::restore safely; no concurrent // writers at open time). drop(db); - let _db = Omnigraph::open(&uri).await.unwrap(); + let db = Omnigraph::open(&uri).await.unwrap(); // After full-sweep recovery, the sidecar should be processed // (deleted). Sidecar's tables are eligible for rollback (UnexpectedAtP1): // restore happens on Person (HEAD advances by 1). @@ -822,6 +1268,19 @@ async fn refresh_defers_rollback_eligible_sidecar_to_next_open() { "full sweep must run Dataset::restore (head advances); \ post_head={post_head}, final_head={final_head}", ); + // Convergence: roll-back published the restored HEAD, so the manifest pin + // tracks Lance HEAD afterward (no residual drift). 
+ let entry_version = db + .snapshot_of(omnigraph::db::ReadTarget::branch("main")) + .await + .unwrap() + .entry("node:Person") + .unwrap() + .table_version; + assert_eq!( + entry_version, final_head, + "full-sweep roll-back must publish so manifest pin ({entry_version}) == Lance HEAD ({final_head})", + ); } /// Companion to the above — confirms that a finalize→publisher failure @@ -1015,10 +1474,15 @@ edge WorksAt: Person -> Company } let db = Omnigraph::open(&uri).await.unwrap(); - assert_eq!( - version_main(&db).await.unwrap(), - pre_failure_version, - "manifest must remain on the old schema when no schema staging files existed" + // Roll-back now publishes the restored version, so the manifest version + // advances — but to the OLD-schema content: the migration never applied + // (asserted by count_rows + the `_schema.pg` checks below), and the sweep + // converges (`manifest == Lance HEAD`, asserted by + // assert_post_recovery_invariants's RolledBack arm). + assert!( + version_main(&db).await.unwrap() > pre_failure_version, + "roll-back publishes the restored (old-schema) version, advancing the manifest; \ + pre={pre_failure_version}", ); assert_eq!( helpers::count_rows(&db, "node:Person").await, @@ -1191,6 +1655,100 @@ edge WorksAt: Person -> Company ); } +/// `optimize` Phase B → Phase C residual: `compact_files` advanced the Lance +/// HEAD but the manifest publish hasn't run. The `Optimize` recovery sidecar +/// (loose-match, like SchemaApply/EnsureIndices) must roll the compacted version +/// forward on next open so the manifest tracks the Lance HEAD — and the healed +/// table must then accept a schema apply (the original bug's victim).
+#[tokio::test] +async fn optimize_phase_b_failure_recovered_on_next_open() { + let _scenario = FailScenario::setup(); + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap().to_string(); + let operation_id; + + // Seed: several separate Person inserts → multiple fragments, so compaction + // has real work and advances the Lance HEAD. + { + let db = Omnigraph::init(&uri, helpers::TEST_SCHEMA).await.unwrap(); + for (name, age) in [("alice", 30), ("bob", 31), ("carol", 32), ("dave", 33)] { + db.mutate( + "main", + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", name)], &[("$age", age)]), + ) + .await + .unwrap(); + } + } + + let pre_failure_version = { + let db = Omnigraph::open(&uri).await.unwrap(); + version_main(&db).await.unwrap() + }; + + // Failpoint fires AFTER compact_files advanced the Lance HEAD but BEFORE the + // manifest publish. The Optimize sidecar persists (only node:Person has + // compactable fragments, so exactly one sidecar is written). + { + let db = Omnigraph::open(&uri).await.unwrap(); + let _failpoint = + ScopedFailPoint::new("optimize.post_phase_b_pre_manifest_commit", "return"); + let err = db.optimize().await.unwrap_err(); + assert!( + err.to_string() + .contains("injected failpoint triggered: optimize.post_phase_b_pre_manifest_commit"), + "unexpected error: {err}" + ); + + let recovery_dir = dir.path().join("__recovery"); + let sidecars: Vec<_> = std::fs::read_dir(&recovery_dir) + .unwrap() + .filter_map(|e| e.ok()) + .collect(); + assert_eq!( + sidecars.len(), + 1, + "exactly one Optimize sidecar must persist after optimize failure" + ); + operation_id = single_sidecar_operation_id(dir.path()); + } + + // Recovery: reopen runs the sweep. The Optimize sidecar classifies + // RolledPastExpected (loose-match) → RollForward → manifest extends to the + // compacted Lance HEAD.
+ let db = Omnigraph::open(&uri).await.unwrap(); + let post_recovery_version = version_main(&db).await.unwrap(); + assert!( + post_recovery_version > pre_failure_version, + "manifest version must advance post-recovery (compaction rolled forward); \ + pre={pre_failure_version}, post={post_recovery_version}", + ); + drop(db); + + assert_post_recovery_invariants( + dir.path(), + &operation_id, + RecoveryExpectation::RolledForward { + tables: vec![TableExpectation::main("node:Person")], + }, + ) + .await + .unwrap(); + + // The healed table accepts an additive schema apply — its HEAD-vs-manifest + // precondition is satisfied because recovery published the compacted version. + let db = Omnigraph::open(&uri).await.unwrap(); + let desired = helpers::TEST_SCHEMA.replace( + " age: I32?\n}", + " age: I32?\n nickname: String?\n}", + ); + db.apply_schema(&desired) + .await + .expect("schema apply after optimize recovery must succeed"); +} + #[tokio::test] async fn branch_merge_phase_b_failure_recovered_on_next_open() { use omnigraph::loader::{LoadMode, load_jsonl}; diff --git a/crates/omnigraph/tests/helpers/recovery.rs b/crates/omnigraph/tests/helpers/recovery.rs index c76009e..90d9a25 100644 --- a/crates/omnigraph/tests/helpers/recovery.rs +++ b/crates/omnigraph/tests/helpers/recovery.rs @@ -181,6 +181,9 @@ pub async fn assert_post_recovery_invariants( "audit row for {operation_id} recorded the wrong recovery_kind", ); assert_rollback_outcomes_record_drift(&audit); + // Roll-back now publishes the restored HEAD, so manifest == Lance + // HEAD afterward (symmetric with roll-forward) — no residual drift.
+ assert_manifest_pins_match_lance_heads(graph_root, &tables).await?; assert_recovery_commit_shape(graph_root, &audit, &tables).await?; assert_non_main_did_not_move_main(graph_root, &tables).await?; assert_idempotent_reopen(graph_root, operation_id).await?; diff --git a/crates/omnigraph/tests/lance_surface_guards.rs b/crates/omnigraph/tests/lance_surface_guards.rs index b65a808..1d60c08 100644 --- a/crates/omnigraph/tests/lance_surface_guards.rs +++ b/crates/omnigraph/tests/lance_surface_guards.rs @@ -242,3 +242,136 @@ async fn _compile_delete_result_field_shape() -> lance::Result<()> { let _num_deleted: u64 = result.num_deleted_rows; Ok(()) } + +// --- Guard 9: force_delete_branch semantics -------------------------------- +// +// The branch-delete reconciler (`db/omnigraph/optimize.rs::reconcile_orphaned_branches`) +// and the eager best-effort reclaim in `cleanup_deleted_branch_tables` call +// `force_delete_branch` to drop orphaned branch refs. The single-authority +// design relies on three facts pinned here: +// 1. plain `delete_branch` errors on a missing ref (so the design uses the +// force variant instead); +// 2. `force_delete_branch` removes an existing (forked) branch — the orphan +// case, where a `tree/{branch}/` exists; +// 3. `force_delete_branch` on a *fully-absent* branch (no tree dir) still +// errors on the local store, because `remove_dir_all`'s NotFound is not +// caught for Lance's native error variant. `TableStore::force_delete_branch` +// wraps this to be fully idempotent. Pin the raw quirk so a future Lance +// fix (which would let us simplify the wrapper) is noticed. + +#[tokio::test] +async fn force_delete_branch_semantics() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().join("guard9.lance"); + let uri = uri.to_str().unwrap(); + let mut ds = fresh_dataset(uri).await; + + // (1) Plain delete of a never-created branch errors (RefNotFound).
+ assert!( + ds.delete_branch("nope").await.is_err(), + "Dataset::delete_branch on a missing ref should error; if this is now \ + Ok, the reconciler could drop the force variant." + ); + + // (2) force_delete_branch removes an existing (forked) branch. + let base = ds.version().version; + ds.create_branch("feature", base, None).await.unwrap(); + ds.force_delete_branch("feature").await.unwrap(); + assert!( + !ds.list_branches().await.unwrap().contains_key("feature"), + "force_delete_branch should remove an existing branch ref" + ); + + // (3) Quirk: force_delete on a fully-absent branch errors on the local + // store (worked around by TableStore::force_delete_branch). + assert!( + ds.force_delete_branch("never").await.is_err(), + "force_delete_branch on a fully-absent branch no longer errors — \ + TableStore::force_delete_branch's NotFound tolerance can be simplified." + ); +} + +// --- Guard 10: blob-column compaction is still broken in this Lance -------- +// +// `db/omnigraph/optimize.rs` skips tables with blob columns while +// `LANCE_SUPPORTS_BLOB_COMPACTION = false`: Lance `compact_files` forces +// `BlobHandling::AllBinary`, and the blob-v2 struct decoder mis-counts columns +// ("more fields in the schema than provided column indices"), failing even a +// pristine uniform-V2_2 multi-fragment blob table. Reads are unaffected (they +// use descriptor handling). +// +// WHEN THIS TEST TURNS RED (compact_files no longer errors), the Lance bug is +// fixed: flip `LANCE_SUPPORTS_BLOB_COMPACTION` to true in optimize.rs, drop the +// blob-skip branch + the `optimize_skips_blob_table_and_reports_skip` +// skip assertions in maintenance.rs, and re-pin docs/dev/lance.md.
+ +#[tokio::test] +async fn compact_files_still_fails_on_blob_columns() { + use arrow_array::{LargeBinaryArray, StructArray}; + + fn blob_batch(start: i32, n: i32) -> RecordBatch { + let ids: Vec<String> = (start..start + n).map(|i| format!("n{i}")).collect(); + let data = + LargeBinaryArray::from_iter_values((start..start + n).map(|i| format!("blob{i}"))); + let blob_uri = StringArray::from(vec![None::<&str>; n as usize]); + let DataType::Struct(fields) = lance::blob::blob_field("content", true).data_type().clone() + else { + unreachable!("blob_field is always a Struct"); + }; + let content = StructArray::new( + fields, + vec![Arc::new(data) as _, Arc::new(blob_uri) as _], + None, + ); + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Utf8, false), + lance::blob::blob_field("content", true), + ])); + RecordBatch::try_new( + schema, + vec![Arc::new(StringArray::from(ids)) as _, Arc::new(content) as _], + ) + .unwrap() + } + + async fn write(uri: &str, batch: RecordBatch, mode: WriteMode) { + let schema = batch.schema(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema); + // Blob v2 requires file version >= 2.2; without the pin the *write* + // would fail with a different error, masking the guard's intent. + let params = WriteParams { + mode, + enable_stable_row_ids: true, + data_storage_version: Some(LanceFileVersion::V2_2), + ..Default::default() + }; + Dataset::write(reader, uri, Some(params)).await.unwrap(); + } + + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().join("guard10-blob.lance"); + let uri = uri.to_str().unwrap(); + + // Uniform V2_2, two fragments → forces compaction to actually rewrite.
+ write(uri, blob_batch(0, 2), WriteMode::Create).await; + write(uri, blob_batch(100, 2), WriteMode::Append).await; + + let mut ds = Dataset::open(uri).await.unwrap(); + assert!( + ds.get_fragments().len() >= 2, + "guard needs a multi-fragment table to trigger a real compaction rewrite" + ); + + let result = compact_files(&mut ds, CompactionOptions::default(), None).await; + let err = result.expect_err( + "compact_files unexpectedly SUCCEEDED on a blob table — the Lance blob-v2 \ + compaction bug is fixed. Flip LANCE_SUPPORTS_BLOB_COMPACTION to true in \ + db/omnigraph/optimize.rs, remove the blob-skip branch, and re-pin docs/dev/lance.md.", + ); + assert!( + err.to_string() + .contains("more fields in the schema than provided column indices"), + "blob compaction failed with an unexpected error (Lance internals may have \ + shifted): {err}" + ); +} diff --git a/crates/omnigraph/tests/maintenance.rs b/crates/omnigraph/tests/maintenance.rs index 3c6ab30..2a5a659 100644 --- a/crates/omnigraph/tests/maintenance.rs +++ b/crates/omnigraph/tests/maintenance.rs @@ -7,10 +7,25 @@ mod helpers; use std::time::Duration; -use omnigraph::db::{CleanupPolicyOptions, Omnigraph}; +use lance::Dataset; +use omnigraph::db::{CleanupPolicyOptions, Omnigraph, ReadTarget, SkipReason}; use omnigraph::loader::{LoadMode, load_jsonl}; -use helpers::{TEST_DATA, TEST_SCHEMA, count_rows, init_and_load}; +use helpers::{ + MUTATION_QUERIES, TEST_DATA, TEST_SCHEMA, count_rows, init_and_load, mixed_params, mutate_main, +}; + +/// Filesystem URI of a node sub-table, mirroring the engine's layout +/// (FNV-1a of the type name under `nodes/`). Matches the helper in +/// `failpoints.rs`; used to inspect/forge Lance branches directly in tests.
+fn node_table_uri(root: &str, type_name: &str) -> String { + let mut hash: u64 = 0xcbf2_9ce4_8422_2325; + for &b in type_name.as_bytes() { + hash ^= b as u64; + hash = hash.wrapping_mul(0x100_0000_01b3); + } + format!("{}/nodes/{hash:016x}", root.trim_end_matches('/')) +} #[tokio::test] async fn optimize_on_empty_graph_returns_stats_per_table_with_no_changes() { @@ -59,6 +74,215 @@ async fn optimize_after_load_then_again_is_idempotent() { } } +// Regression: `optimize` must not crash on a graph that has a `Blob` table. +// +// Lance `compact_files` forces `BlobHandling::AllBinary`, which mis-decodes +// blob-v2 columns ("more fields in the schema than provided column indices"), +// failing even a pristine uniform-V2_2 multi-fragment blob table. `optimize` +// must skip blob-bearing tables (and report the skip) rather than aborting the +// whole sweep. +// +// Before the skip fix, `optimize()` returned that Lance error here and aborted +// the whole sweep; it now skips the blob table (`doc.skipped == Some(..)`) +// while the sibling non-blob `Tag` table still compacts. The skip is gated by +// `LANCE_SUPPORTS_BLOB_COMPACTION`; the surface guard +// `compact_files_still_fails_on_blob_columns` flags when the upstream Lance fix +// makes the skip (and this test's blob arm) removable. +#[tokio::test] +async fn optimize_skips_blob_table_and_reports_skip() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + // One Blob node type (`Doc`) + one plain node type (`Tag`): proves the blob + // table is skipped while a non-blob table in the same sweep still compacts. + let schema = "\ +node Doc {\n slug: String @key\n content: Blob\n}\n\ +node Tag {\n slug: String @key\n}\n"; + let mut db = Omnigraph::init(uri, schema).await.unwrap(); + + // Multi-fragment blob table: Overwrite creates fragment 1; each Merge of + // new keys appends another. 
A >=2-fragment blob table is exactly what + // crashes `compact_files` today (single fragment would no-op and not crash). + load_jsonl( + &mut db, + "{\"type\":\"Doc\",\"data\":{\"slug\":\"d1\",\"content\":\"base64:aGVsbG8x\"}}\n{\"type\":\"Doc\",\"data\":{\"slug\":\"d2\",\"content\":\"base64:aGVsbG8y\"}}", + LoadMode::Overwrite, + ) + .await + .unwrap(); + load_jsonl( + &mut db, + "{\"type\":\"Doc\",\"data\":{\"slug\":\"d3\",\"content\":\"base64:aGVsbG8z\"}}", + LoadMode::Merge, + ) + .await + .unwrap(); + load_jsonl( + &mut db, + "{\"type\":\"Doc\",\"data\":{\"slug\":\"d4\",\"content\":\"base64:aGVsbG80\"}}", + LoadMode::Merge, + ) + .await + .unwrap(); + // Plain table, also multi-fragment so it has something to compact. + load_jsonl( + &mut db, + "{\"type\":\"Tag\",\"data\":{\"slug\":\"t1\"}}\n{\"type\":\"Tag\",\"data\":{\"slug\":\"t2\"}}", + LoadMode::Merge, + ) + .await + .unwrap(); + load_jsonl( + &mut db, + "{\"type\":\"Tag\",\"data\":{\"slug\":\"t3\"}}", + LoadMode::Merge, + ) + .await + .unwrap(); + + let stats = db + .optimize() + .await + .expect("optimize must not crash on a graph with a Blob table"); + + let doc = stats + .iter() + .find(|s| s.table_key == "node:Doc") + .expect("Doc stat present"); + let tag = stats + .iter() + .find(|s| s.table_key == "node:Tag") + .expect("Tag stat present"); + // The blob table is skipped (and reported), not compacted. + assert_eq!( + doc.skipped, + Some(SkipReason::BlobColumnsUnsupportedByLance), + "blob table must be reported as skipped", + ); + assert!(!doc.committed, "skipped blob table is not compacted"); + assert_eq!(doc.fragments_removed, 0); + assert_eq!(doc.fragments_added, 0); + // The plain (non-blob) table is unaffected by the skip. + assert_eq!(tag.skipped, None, "non-blob table must not be skipped"); +} + +// Regression: `optimize` must publish its compaction to the `__manifest` so the +// manifest's recorded `table_version` tracks the compacted Lance HEAD. 
+// +// Lance `compact_files` advances the *dataset's* version (reserve-fragments + +// rewrite commits) but knows nothing about OmniGraph's `__manifest`. If optimize +// does not publish a manifest update, the manifest's `table_version` lags the +// Lance HEAD: reads stay pinned to the pre-compaction version (compaction is +// invisible to them) and any subsequent schema apply / strict update/delete +// fails its HEAD-vs-manifest precondition with +// "stale view of '': expected manifest table version X but current is Y". +// This pins the fix β€” optimize publishes the compacted version, so manifest == +// HEAD and migrations after a compaction succeed. +#[tokio::test] +async fn optimize_publishes_compaction_to_manifest_so_schema_apply_succeeds() { + let dir = tempfile::tempdir().unwrap(); + let root = dir.path().to_str().unwrap().trim_end_matches('/').to_string(); + let mut db = init_and_load(&dir).await; + + // Several separate inserts β†’ multiple Person fragments, so `compact_files` + // actually merges and moves the Lance HEAD (a single fragment is a no-op). + for (name, age) in [("Eve", 40), ("Frank", 41), ("Grace", 42), ("Heidi", 43)] { + mutate_main( + &mut db, + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", name)], &[("$age", age as i64)]), + ) + .await + .expect("insert"); + } + + let stats = db.optimize().await.unwrap(); + let person = stats + .iter() + .find(|s| s.table_key == "node:Person") + .expect("Person stat present"); + assert!( + person.committed, + "Person is multi-fragment, so optimize must have compacted it" + ); + + // After optimize, the manifest's recorded table_version must equal the actual + // Lance HEAD β€” optimize published its compaction, so there is no drift. 
+ let snap = db.snapshot_of(ReadTarget::branch("main")).await.unwrap(); + let entry = snap.entry("node:Person").unwrap(); + let manifest_version = entry.table_version; + let full = format!("{}/{}", root, entry.table_path); + let lance_head = Dataset::open(&full).await.unwrap().version().version; + assert_eq!( + manifest_version, lance_head, + "after optimize, manifest table_version ({manifest_version}) must equal Lance HEAD ({lance_head})", + ); + + // Reads observe the compacted version with rows preserved (4 seed + 4 inserts). + assert_eq!(count_rows(&db, "node:Person").await, 8); + + // The headline: an additive (nullable property) migration touching the + // just-compacted table succeeds, where it previously failed with "stale view". + let desired = TEST_SCHEMA.replace( + " age: I32?\n}", + " age: I32?\n nickname: String?\n}", + ); + let result = db + .apply_schema(&desired) + .await + .expect("additive schema apply after optimize must succeed"); + assert!(result.applied, "schema apply should report applied=true"); +} + +// Regression: `optimize` must REFUSE when an unresolved recovery sidecar is +// pending. Operating on an unrecovered graph could publish a partial write that +// the all-or-nothing recovery sweep would roll back; the operator must reopen +// (run the recovery sweep) first. +#[tokio::test] +async fn optimize_defers_when_recovery_sidecar_is_pending() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let db = init_and_load(&dir).await; + + // Simulate an in-process failed write that left a recovery sidecar on disk. 
+ let recovery_dir = dir.path().join("__recovery"); + std::fs::create_dir_all(&recovery_dir).unwrap(); + let person_path = node_table_uri(uri, "Person"); + let sidecar_json = format!( + r#"{{ + "schema_version": 1, + "operation_id": "01H000000000000000000DEFR", + "started_at": "0", + "branch": null, + "actor_id": "act-test", + "writer_kind": "Mutation", + "tables": [ + {{ + "table_key": "node:Person", + "table_path": "{}", + "expected_version": 1, + "post_commit_pin": 2 + }} + ] + }}"#, + person_path + ); + std::fs::write( + recovery_dir.join("01H000000000000000000DEFR.json"), + sidecar_json, + ) + .unwrap(); + + let err = db + .optimize() + .await + .expect_err("optimize must defer (error) while a recovery sidecar is pending"); + assert!( + err.to_string().to_lowercase().contains("recovery"), + "optimize defer error should mention recovery; got: {err}", + ); +} + #[tokio::test] async fn cleanup_without_any_policy_option_errors() { let dir = tempfile::tempdir().unwrap(); @@ -158,3 +382,59 @@ async fn cleanup_then_optimize_preserves_rows_and_table_remains_writable() { .unwrap(); assert_eq!(count_rows(&db, "node:Person").await, people_before); } + +#[tokio::test] +async fn cleanup_reconciles_orphaned_branch_forks() { + // An incomplete prior `branch_delete` can leave a per-table Lance branch + // that the manifest no longer references (a "zombie" fork). It is + // unreachable through any snapshot but pins its `tree/{branch}/` storage. + // `cleanup` must reconcile it away: drop every Lance branch absent from the + // manifest authority, without touching `main`. + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap().to_string(); + let mut db = init_and_load(&dir).await; + + let people_before = count_rows(&db, "node:Person").await; + assert!(people_before > 0, "fixture should seed Person rows"); + + // Forge an orphaned fork the manifest never knew about. 
+ let person_uri = node_table_uri(&uri, "Person"); + { + let mut ds = Dataset::open(&person_uri).await.unwrap(); + let base = ds.version().version; + ds.create_branch("ghost", base, None).await.unwrap(); + assert!( + ds.list_branches().await.unwrap().contains_key("ghost"), + "precondition: orphaned fork staged" + ); + } + + db.cleanup(CleanupPolicyOptions { + keep_versions: Some(1), + older_than: None, + }) + .await + .unwrap(); + + // Orphan reclaimed; main untouched. + { + let ds = Dataset::open(&person_uri).await.unwrap(); + assert!( + !ds.list_branches().await.unwrap().contains_key("ghost"), + "cleanup should reconcile the orphaned 'ghost' fork away" + ); + } + assert_eq!( + count_rows(&db, "node:Person").await, + people_before, + "cleanup must not disturb main while reconciling orphans" + ); + + // Idempotent: a second cleanup with the orphan already gone is a no-op. + db.cleanup(CleanupPolicyOptions { + keep_versions: Some(1), + older_than: None, + }) + .await + .unwrap(); +} diff --git a/crates/omnigraph/tests/recovery.rs b/crates/omnigraph/tests/recovery.rs index a090178..f6b19e8 100644 --- a/crates/omnigraph/tests/recovery.rs +++ b/crates/omnigraph/tests/recovery.rs @@ -278,6 +278,97 @@ async fn recovery_rolls_back_synthetic_drift_on_open() { ); } +/// Regression: recovery roll-back must PUBLISH the restored version so +/// `manifest == Lance HEAD` afterward (no residual "orphaned drift"). Before the +/// fix, roll-back restored via `Dataset::restore` but left the manifest pin +/// behind HEAD, so a subsequent strict write / schema apply failed its +/// HEAD-vs-manifest precondition ("stale view … refresh and retry") β€” and a +/// failed schema apply's own roll-back leaked +1 each retry (the original bug's +/// loop). With convergence, one roll-back leaves `manifest == HEAD` and the +/// follow-up succeeds. 
+#[tokio::test] +async fn recovery_rollback_converges_manifest_so_schema_apply_succeeds() { + use omnigraph::db::ReadTarget; + use omnigraph::loader::{LoadMode, load_jsonl}; + use omnigraph::table_store::TableStore; + + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + + let mut db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap(); + load_jsonl( + &mut db, + r#"{"type":"Person","data":{"name":"alice","age":30}} +{"type":"Person","data":{"name":"bob","age":25}} +"#, + LoadMode::Append, + ) + .await + .unwrap(); + drop(db); + + // Forge a Phase-B residual: advance Person's Lance HEAD without publishing to + // the manifest (the manifest pin stays at the load's committed version). + let person_uri = node_table_uri(uri, "Person"); + let store = TableStore::new(uri); + let mut ds = Dataset::open(&person_uri).await.unwrap(); + let manifest_pin = ds.version().version; + let _ = store + .delete_where(&person_uri, &mut ds, "1 = 2") + .await + .unwrap(); + drop(ds); + + // Roll-back-classified sidecar (post_commit_pin != observed head β‡’ + // UnexpectedAtP1 β‡’ RollBack). + let sidecar_json = format!( + r#"{{ + "schema_version": 1, + "operation_id": "01H0000000000000000000CVG", + "started_at": "0", + "branch": null, + "actor_id": "act-test", + "writer_kind": "Mutation", + "tables": [ + {{ + "table_key": "node:Person", + "table_path": "{}", + "expected_version": {}, + "post_commit_pin": {} + }} + ] + }}"#, + person_uri, manifest_pin, manifest_pin + ); + write_sidecar_file(dir.path(), "01H0000000000000000000CVG", &sidecar_json); + + // Reopen runs the sweep: restore Person to manifest_pin, then PUBLISH so the + // manifest tracks the restored Lance HEAD. + let db = Omnigraph::open(uri).await.unwrap(); + + // Convergence: manifest pin == Lance HEAD. Fails before the fix β€” the + // manifest stays at manifest_pin while HEAD advanced past it. 
+ let snap = db.snapshot_of(ReadTarget::branch("main")).await.unwrap(); + let entry = snap.entry("node:Person").unwrap(); + let lance_head = Dataset::open(&person_uri).await.unwrap().version().version; + assert_eq!( + entry.table_version, lance_head, + "roll-back must publish so manifest pin ({}) == Lance HEAD ({})", + entry.table_version, lance_head, + ); + + // The +1-loop victim: an additive schema apply must now succeed (its + // HEAD-vs-manifest precondition is satisfied). Before the fix this failed + // with "stale view … refresh and retry". + let desired = TEST_SCHEMA.replace( + " age: I32?\n}", + " age: I32?\n nickname: String?\n}", + ); + db.apply_schema(&desired) + .await + .expect("schema apply after a converging roll-back must succeed"); +} + // ===================================================================== // Phase 4 β€” roll-forward path + audit row recording // ===================================================================== diff --git a/crates/omnigraph/tests/staged_writes.rs b/crates/omnigraph/tests/staged_writes.rs index 021b36e..5335057 100644 --- a/crates/omnigraph/tests/staged_writes.rs +++ b/crates/omnigraph/tests/staged_writes.rs @@ -2,7 +2,7 @@ //! exercise `stage_append`, `stage_merge_insert`, `scan_with_staged`, //! and `count_rows_with_staged` directly against a Lance dataset β€” no //! Omnigraph engine involved. The engine-level use of these primitives -//! is exercised by `tests/runs.rs`. +//! is exercised by `tests/writes.rs`. //! //! Test surface here: //! 1. `stage_append` + `scan_with_staged` shows committed + staged data @@ -709,7 +709,7 @@ async fn stage_create_inverted_index_does_not_advance_head_until_commit() { /// /// **When Lance #6658 lands**: this test will need to flip β€” replace /// the assertion with a `stage_delete` + `commit_staged` round-trip -/// and remove the residual line in `docs/runs.md`. +/// and remove the residual line in `docs/dev/writes.md`. 
#[tokio::test] async fn delete_where_advances_head_inline_documents_residual() { let dir = tempfile::tempdir().unwrap(); diff --git a/crates/omnigraph/tests/runs.rs b/crates/omnigraph/tests/writes.rs similarity index 97% rename from crates/omnigraph/tests/runs.rs rename to crates/omnigraph/tests/writes.rs index cfff3fc..0a309c9 100644 --- a/crates/omnigraph/tests/runs.rs +++ b/crates/omnigraph/tests/writes.rs @@ -1,7 +1,7 @@ -//! Tests for the direct-to-target write path (Run state machine -//! removed). The Run/`__run__` staging branch / RunRecord state machine no -//! longer exists; mutations and loads write directly to target tables and -//! commit once via the publisher's `expected_table_versions` CAS. +//! Tests for the direct-publish write path: mutations and loads write +//! directly to target tables and commit once via the publisher's +//! `expected_table_versions` CAS. (History: this replaced the removed Run +//! state machine / `__run__` staging branches / RunRecord β€” MR-771.) //! //! What this file covers: //! - No `__run__*` branches are created by load or mutate. @@ -371,11 +371,10 @@ async fn cancelled_mutation_future_leaves_no_state() { // Cancel-safety property: no graph-level run/staging state remains. // - // Note: `branch_list()` already filters `__run__*` via - // `is_internal_system_branch`, so a runtime "no `__run__` branches" check - // would be vacuous. The structural property that no `__run__` branches - // can ever be created is enforced by deletion of `begin_run` etc. in - // (verified by the build itself β€” those symbols no longer exist). + // No `__run__` branches can ever be created: the Run state machine + // (`begin_run` etc.) was deleted in MR-771 β€” verified by the build itself, + // those symbols no longer exist. Any legacy `__run__*` branch on an + // upgraded graph is swept by the v2β†’v3 manifest migration. // // (1) The branch list is unchanged: cancellation/completion cannot // synthesize new public branches. 
@@ -442,34 +441,40 @@ async fn repeated_loads_do_not_accumulate_branches() { assert_eq!(db.branch_list().await.unwrap(), vec!["main".to_string()]); } -/// User code must not be able to write to internal `__run__*` names. -/// The branch-name guard predicate is kept as defense-in-depth; it -/// will be removed once a future production sweep retires the legacy -/// branches. +/// After MR-770, `__run__*` is an ordinary branch name β€” the Run state machine +/// and its `is_internal_run_branch` guard are gone. The surviving internal-ref +/// guard still rejects the active `__schema_apply_lock__` branch on the public +/// create/merge APIs. #[tokio::test] -async fn public_branch_apis_reject_internal_run_refs() { +async fn public_branch_apis_reject_internal_system_refs() { let dir = tempfile::tempdir().unwrap(); let mut db = init_and_load(&dir).await; - let create_err = db.branch_create("__run__synthetic").await.unwrap_err(); + // `__run__*` is no longer reserved β€” creating it now succeeds. + db.branch_create("__run__formerly_reserved") + .await + .expect("__run__ prefix is a normal branch name post-MR-770"); + + // The schema-apply lock branch is still rejected on public branch APIs. 
+ let create_err = db.branch_create("__schema_apply_lock__").await.unwrap_err(); let OmniError::Manifest(err) = create_err else { panic!("expected Manifest error"); }; assert!( - err.message.contains("internal run ref"), + err.message.contains("internal system ref"), "unexpected error: {}", err.message ); let merge_err = db - .branch_merge("__run__synthetic", "main") + .branch_merge("__schema_apply_lock__", "main") .await .unwrap_err(); let OmniError::Manifest(err) = merge_err else { panic!("expected Manifest error"); }; assert!( - err.message.contains("internal run refs"), + err.message.contains("internal system refs"), "unexpected error: {}", err.message ); diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh index 83b7d34..a5fb275 100644 --- a/docker/entrypoint.sh +++ b/docker/entrypoint.sh @@ -9,8 +9,14 @@ fi bind="${OMNIGRAPH_BIND:-0.0.0.0:8080}" +# URI comes from the env var (the positional arg wins over any config +# `graphs` block in resolve_target_uri). OMNIGRAPH_CONFIG, when also set, +# is forwarded as --config purely to supply a policy file β€” the two +# compose. Without OMNIGRAPH_CONFIG the behavior is unchanged. if [ -n "${OMNIGRAPH_TARGET_URI:-}" ]; then - exec "$SERVER_BIN" "${OMNIGRAPH_TARGET_URI}" --bind "${bind}" + exec "$SERVER_BIN" "${OMNIGRAPH_TARGET_URI}" \ + ${OMNIGRAPH_CONFIG:+--config "$OMNIGRAPH_CONFIG"} \ + --bind "${bind}" fi if [ -n "${OMNIGRAPH_CONFIG:-}" ]; then @@ -28,5 +34,7 @@ omnigraph-server container startup requires one of: Optional: - OMNIGRAPH_BIND (default: 0.0.0.0:8080) - OMNIGRAPH_TARGET (used with OMNIGRAPH_CONFIG) + - OMNIGRAPH_CONFIG (may also accompany OMNIGRAPH_TARGET_URI to add a + policy file; the URI still comes from OMNIGRAPH_TARGET_URI) EOF exit 64 diff --git a/docker/entrypoint_test.sh b/docker/entrypoint_test.sh new file mode 100755 index 0000000..01fbee2 --- /dev/null +++ b/docker/entrypoint_test.sh @@ -0,0 +1,65 @@ +#!/bin/sh +# Self-contained test for docker/entrypoint.sh argument composition. 
+# Runs the entrypoint against a stub server that echoes its args, and +# asserts the forwarded argv for each startup mode. No Docker required. +# +# sh docker/entrypoint_test.sh +# +# Exits 0 on success, 1 on the first mismatch. +set -eu + +here=$(CDPATH= cd -- "$(dirname -- "$0")" && pwd) +entrypoint="$here/entrypoint.sh" + +work=$(mktemp -d) +trap 'rm -rf "$work"' EXIT +mkdir -p "$work/bin" +cat > "$work/bin/omnigraph-server" <<'EOF' +#!/bin/sh +echo "ARGS: $*" +EOF +chmod +x "$work/bin/omnigraph-server" + +# Run the real entrypoint with SERVER_BIN pointed at the stub. +ep="$work/entrypoint.sh" +sed "s#SERVER_BIN=\"/usr/local/bin/omnigraph-server\"#SERVER_BIN=\"$work/bin/omnigraph-server\"#" \ + "$entrypoint" > "$ep" + +fail=0 +check() { + desc=$1; want=$2; got=$3 + if [ "$got" != "$want" ]; then + echo "FAIL: $desc" + echo " want: $want" + echo " got: $got" + fail=1 + else + echo "ok: $desc" + fi +} + +got=$(OMNIGRAPH_TARGET_URI="s3://b/g" OMNIGRAPH_BIND="0.0.0.0:8080" sh "$ep") +check "TARGET_URI only (legacy)" \ + "ARGS: s3://b/g --bind 0.0.0.0:8080" "$got" + +got=$(OMNIGRAPH_TARGET_URI="s3://b/g" OMNIGRAPH_CONFIG="/etc/omnigraph/omnigraph.yaml" OMNIGRAPH_BIND="0.0.0.0:8080" sh "$ep") +check "TARGET_URI + CONFIG composes (policy)" \ + "ARGS: s3://b/g --config /etc/omnigraph/omnigraph.yaml --bind 0.0.0.0:8080" "$got" + +got=$(OMNIGRAPH_CONFIG="/etc/omnigraph/omnigraph.yaml" OMNIGRAPH_BIND="0.0.0.0:8080" sh "$ep") +check "CONFIG only" \ + "ARGS: --config /etc/omnigraph/omnigraph.yaml --bind 0.0.0.0:8080" "$got" + +got=$(OMNIGRAPH_CONFIG="/etc/omnigraph/omnigraph.yaml" OMNIGRAPH_TARGET="active" OMNIGRAPH_BIND="0.0.0.0:8080" sh "$ep") +check "CONFIG + TARGET" \ + "ARGS: --config /etc/omnigraph/omnigraph.yaml --target active --bind 0.0.0.0:8080" "$got" + +got=$(sh "$ep" some-uri --bind 1.2.3.4:9 --extra) +check "explicit args passthrough" \ + "ARGS: some-uri --bind 1.2.3.4:9 --extra" "$got" + +if [ "$fail" -ne 0 ]; then + echo "entrypoint_test: FAILED" + exit 1 
+fi +echo "entrypoint_test: all cases passed" diff --git a/docs/dev/architecture.md b/docs/dev/architecture.md index 8b7fca2..813f30c 100644 --- a/docs/dev/architecture.md +++ b/docs/dev/architecture.md @@ -207,7 +207,7 @@ contracts: This pattern realizes read-your-writes within a multi-statement mutation and keeps failure scope bounded for inserts/updates by construction at the writer layer. See [docs/dev/invariants.md](invariants.md) and -[docs/dev/runs.md](runs.md) for the publisher CAS contract this builds on. +[docs/dev/writes.md](writes.md) for the publisher CAS contract this builds on. ### Storage trait — today vs. roadmap @@ -278,7 +278,7 @@ flowchart LR eng --> wq ``` -The server applies Cedar policy at the HTTP boundary today. The roadmap, called out in [docs/dev/invariants.md](invariants.md) as a known gap, is to push policy into the planner as predicates. After Cedar, mutating handlers go through `WorkloadController` (per-actor admission cap + byte budget; PR 2 / MR-686) before reaching the engine. The engine itself holds an `Arc` so concurrent mutations on the same `(table, branch)` serialize at the queue, while disjoint keys run in parallel — see [docs/user/server.md](../user/server.md) "Per-actor admission control" and [docs/dev/runs.md](runs.md). The CLI bypasses the HTTP layer (and admission) and calls the engine API directly. +The server applies Cedar policy at the HTTP boundary today. The roadmap, called out in [docs/dev/invariants.md](invariants.md) as a known gap, is to push policy into the planner as predicates. After Cedar, mutating handlers go through `WorkloadController` (per-actor admission cap + byte budget; PR 2 / MR-686) before reaching the engine. The engine itself holds an `Arc` so concurrent mutations on the same `(table, branch)` serialize at the queue, while disjoint keys run in parallel — see [docs/user/server.md](../user/server.md) "Per-actor admission control" and [docs/dev/writes.md](writes.md).
The CLI bypasses the HTTP layer (and admission) and calls the engine API directly. Code paths: diff --git a/docs/dev/branch-protection.md b/docs/dev/branch-protection.md index 9b2fa78..2b6cc37 100644 --- a/docs/dev/branch-protection.md +++ b/docs/dev/branch-protection.md @@ -8,7 +8,7 @@ This page explains what the policy says and how to change it. | Setting | Value | Why | |---|---|---| -| **Required status checks (strict)** | `Classify Changes`, `Check AGENTS.md Links`, `Test Workspace`, `Test omnigraph-server --features aws`, `CODEOWNERS / drift`, `CODEOWNERS / noedit` | Every PR must pass workspace tests, AGENTS.md link integrity, and the CODEOWNERS hygiene checks. `strict: true` requires the branch to be up-to-date with `main` before merge. | +| **Required status checks (strict)** | `Classify Changes`, `Check AGENTS.md Links`, `Test Workspace`, `Test omnigraph-server --features aws`, `CODEOWNERS matches source`, `CODEOWNERS not hand-edited` | Every PR must pass workspace tests, AGENTS.md link integrity, and the CODEOWNERS hygiene checks. The two CODEOWNERS contexts must equal the job `name:` values in `.github/workflows/codeowners.yml` **verbatim** — a context naming a job that never reports (the old `CODEOWNERS / drift` used the job *id*, and the job was path-filtered) leaves every PR permanently pending and forces admin overrides. `strict: true` requires the branch to be up-to-date with `main` before merge. | | **Required approving reviews** | `1` | At least one reviewer. With a 2-person team, going higher would block all merges when one person is unavailable. | | **Require code-owner reviews** | `true` | The reviewer must be a code owner per `.github/CODEOWNERS`. This is what makes the codeowners chassis enforced. | | **Dismiss stale reviews on new commits** | `true` | A push after approval invalidates the prior review. Prevents the "approve, then sneak in unreviewed changes" pattern.
| @@ -16,7 +16,7 @@ This page explains what the policy says and how to change it. | **Disallow force pushes** | `true` | No history rewrites on `main`. | | **Disallow branch deletions** | `true` | `main` cannot be deleted. | | **Required conversation resolution** | `true` | All review comment threads must be resolved before merge. | -| **Enforce on admins** | `true` | Even repository admins go through the gates. The point is no bypasses. | +| **Enforce on admins** | `false` | Admins can override the gates (`enforce_admins: false` in the JSON). This is the intended escape hatch for the 2-person team; tightening to `true` is tracked under hardening below. | | **Required signed commits** | not yet | Not enabled. Would lock out maintainers until everyone enrolls GPG/SSH commit signing. Tracked as a follow-up. | ## How to apply diff --git a/docs/dev/ci.md b/docs/dev/ci.md index 8495d5e..1124cb4 100644 --- a/docs/dev/ci.md +++ b/docs/dev/ci.md @@ -4,7 +4,8 @@ - **ci.yml**: text-only changes skip; otherwise `cargo test --workspace --locked` on ubuntu-latest with protobuf compiler. OpenAPI-drift check that auto-commits the regenerated `openapi.json` for same-repository PRs. Also runs the AGENTS.md cross-link integrity check (`scripts/check-agents-md.sh`). - **AWS feature build job**: `cargo build/test -p omnigraph-server --features aws` on ubuntu-latest. +- **Windows binary build job**: `cargo build --release --locked -p omnigraph-cli -p omnigraph-server` on windows-latest with smoke checks for `omnigraph.exe version`, `omnigraph-server.exe --help`, and PowerShell installer syntax. - **RustFS S3 integration**: spins up RustFS in Docker, runs `s3_storage`, `server_opens_s3_graph_directly_and_serves_snapshot_and_read`, and `local_cli_s3_end_to_end_init_load_read_flow`. -- **release-edge.yml**: on every push to main, retags `edge`, builds Linux x86_64 / macOS arm64 archives + sha256, publishes a rolling prerelease. 
-- **release.yml**: on `v*` tags, builds the Linux x86_64 / macOS arm64 release matrix and updates the Homebrew tap (`scripts/update-homebrew-formula.sh`) by pushing the regenerated formula to `ModernRelay/homebrew-tap`. +- **release-edge.yml**: on every push to main, retags `edge`, builds Linux x86_64 / macOS arm64 archives and Windows x86_64 zip + sha256, publishes a rolling prerelease, then smoke-tests the Windows PowerShell installer against `edge`. +- **release.yml**: on `v*` tags, builds the Linux x86_64 / macOS arm64 archives and Windows x86_64 zip release matrix, updates the Homebrew tap (`scripts/update-homebrew-formula.sh`) by pushing the regenerated formula to `ModernRelay/homebrew-tap`, and smoke-tests the Windows PowerShell installer against the tag. - **package.yml**: manual ECR image build; emits two image tags per commit (``, `-aws`) via CodeBuild. diff --git a/docs/dev/codeowners.md b/docs/dev/codeowners.md index 9a7fb50..50c4dc7 100644 --- a/docs/dev/codeowners.md +++ b/docs/dev/codeowners.md @@ -4,24 +4,45 @@ This setup gives every role change a reviewable PR and a permanent in-repository audit trail (`git log .github/codeowners-roles.yml`). -## Current roles +## Who owns what -| Role | Members | Scope | +The tables below are **generated** from `.github/codeowners-roles.yml` by `.github/scripts/render-codeowners.py` (the same render that produces `.github/CODEOWNERS`). They are the always-current "who owns what at this commit" view — don't edit them by hand; edit the yml and re-render.
+ + +**Path → owners** (GitHub applies *last match wins*; the `*` catch-all is listed first and is overridden by the specific patterns below it): + +| Path | Owners | Role(s) | |---|---|---| -| `engineering` | `@ragnorc` | All code under `crates/**`, repository infrastructure, default for unmapped paths | -| `docs` | `@ragnorc` | `docs/**`, README.md, AGENTS.md, CLAUDE.md, SECURITY.md | +| `*` | @ragnorc @aaltshuler | engineering | +| `crates/**` | @ragnorc @aaltshuler | engineering | +| `docs/**` | @ragnorc | docs | +| `README.md` | @ragnorc | docs | +| `AGENTS.md` | @ragnorc | docs | +| `CLAUDE.md` | @ragnorc | docs | +| `SECURITY.md` | @ragnorc | docs | -GitHub treats multiple owners in a CODEOWNERS line as **"any one of them satisfies the review requirement"**. To require N distinct approvers on a specific path, layer a CI check on top (not currently configured). +**Roles**: + +| Role | Members | Description | +|---|---|---| +| `engineering` | @ragnorc @aaltshuler | All production code under crates/**. Engine, CLI, server, compiler. | +| `docs` | @ragnorc | Documentation under docs/**, plus repo-level docs (README.md, AGENTS.md, CLAUDE.md symlink, SECURITY.md). | + + + +GitHub treats multiple owners on a CODEOWNERS line as **"any one of them satisfies the review requirement"**. To require N distinct approvers on a specific path, layer a CI check on top (not currently configured). ## How to change role membership or path mappings 1. Edit `.github/codeowners-roles.yml`. -2. Run `python3 .github/scripts/render-codeowners.py` (requires PyYAML; `pip install pyyaml`). -3. Commit both files in the same PR. +2. Open a PR. **CI re-renders for you**: the `CODEOWNERS` workflow regenerates `.github/CODEOWNERS` and the ownership tables above and auto-commits them back to your PR branch on same-repository PRs — you don't have to run the script locally (though you can: `python3 .github/scripts/render-codeowners.py`, requires PyYAML).
+ +On a fork (where CI can't push back), the workflow instead fails with the diff so you can run the script and commit it yourself. CI fails the PR if: -- `CODEOWNERS` was edited without a corresponding yml change, or -- The yml was changed but the rendered `CODEOWNERS` doesn't match. +- a fork PR left a generated artifact out of sync, or +- `CODEOWNERS` was edited without a corresponding yml change (the `CODEOWNERS not hand-edited` check). ## How to add a new role diff --git a/docs/dev/execution.md b/docs/dev/execution.md index f5c2840..3a108d7 100644 --- a/docs/dev/execution.md +++ b/docs/dev/execution.md @@ -147,7 +147,7 @@ sequenceDiagram - End-of-query Lance commit: `TableStore::stage_append`, `stage_merge_insert`, `commit_staged` at `crates/omnigraph/src/table_store.rs` - Manifest commit primitive: `commit_updates_on_branch_with_expected` at `crates/omnigraph/src/db/omnigraph/table_ops.rs` -Atomicity guarantee for multi-statement mutations: a mid-query failure leaves Lance HEAD untouched on staged tables (no inline commit happened during op execution), so the next mutation proceeds normally with no `ExpectedVersionMismatch`. The publisher CAS at the very end either succeeds (manifest advances atomically across all touched sub-tables) or fails with a typed `ManifestConflictDetails::ExpectedVersionMismatch` (no partial publish). See [docs/dev/invariants.md](invariants.md) and [docs/dev/runs.md](runs.md). +Atomicity guarantee for multi-statement mutations: a mid-query failure leaves Lance HEAD untouched on staged tables (no inline commit happened during op execution), so the next mutation proceeds normally with no `ExpectedVersionMismatch`. The publisher CAS at the very end either succeeds (manifest advances atomically across all touched sub-tables) or fails with a typed `ManifestConflictDetails::ExpectedVersionMismatch` (no partial publish). See [docs/dev/invariants.md](invariants.md) and [docs/dev/writes.md](writes.md). 
## Bulk loader (`loader/mod.rs`) diff --git a/docs/dev/index.md b/docs/dev/index.md index 83df8c8..1e41342 100644 --- a/docs/dev/index.md +++ b/docs/dev/index.md @@ -21,7 +21,7 @@ constraints. User-facing behavior should still be documented through |---|---| | System structure, L1/L2 framing, component diagrams | [architecture.md](architecture.md) | | On-disk layout, manifest schema, URI behavior | [storage.md](../user/storage.md) | -| Direct-publish writes, D2, staged writes, recovery sidecars | [runs.md](runs.md) | +| Direct-publish writes, D2, staged writes, recovery sidecars | [writes.md](writes.md) | | Query execution, mutation execution, loader flow | [execution.md](execution.md) | | Index lifecycle and graph topology indexes | [indexes.md](../user/indexes.md) | | Branch and commit internals | [branches-commits.md](../user/branches-commits.md) | @@ -51,6 +51,18 @@ constraints. User-facing behavior should still be documented through | Install and deployment packaging | [install.md](../user/install.md), [deployment.md](../user/deployment.md) | | Release history | [releases/](../releases/) | +## Contribution & Governance + +| Area | Read | +|---|---| +| How to contribute (external) | [CONTRIBUTING.md](../../CONTRIBUTING.md) | +| Governance model, roles, decision authority | [GOVERNANCE.md](../../GOVERNANCE.md) | +| Public contribution RFC track | [rfcs/](../rfcs/) | + +The `docs/rfcs/` track is the **public, externally-authorable** RFC process. The +maintainer/internal RFCs below (`rfc-00N-*.md`) are a separate, team-owned +track; don't conflate the two. + ## Active Implementation Plans Working documents for in-flight feature work. Removed when the work lands. @@ -59,6 +71,8 @@ Working documents for in-flight feature work. Removed when the work lands. 
|---|---| | Schema-lint chassis v1 (MR-694) — `--allow-data-loss`, soft/hard drops | [schema-lint-v1-plan.md](schema-lint-v1-plan.md) | | Inline + stored queries, request/response envelope, MCP (MR-656 / MR-976 / MR-969) | [rfc-001-queries-envelope-mcp.md](rfc-001-queries-envelope-mcp.md) | +| Config & CLI architecture — layered config, client targeting, file naming (MR-973 / MR-974 / MR-981) | [rfc-002-config-cli-architecture.md](rfc-002-config-cli-architecture.md) | +| MCP server surface — full tool parity, stored queries, modular auth (MR-969 / MR-956 / MR-974) | [rfc-003-mcp-server-surface.md](rfc-003-mcp-server-surface.md) | ## Boundary diff --git a/docs/dev/invariants.md b/docs/dev/invariants.md index 958042f..5ee4f17 100644 --- a/docs/dev/invariants.md +++ b/docs/dev/invariants.md @@ -38,7 +38,7 @@ Use it this way: publishes one manifest update. Do not commit per statement. Delete-only queries are the documented inline residual; the parse-time D2 rule prevents mixing deletes with insert/update until Lance exposes two-phase delete. - Read [runs.md](runs.md) and [execution.md](execution.md). + Read [writes.md](writes.md) and [execution.md](execution.md). 5. **Recovery is part of the commit protocol.** Writers that can advance Lance HEAD before manifest publish must write `__recovery/{ulid}.json` sidecars. @@ -56,7 +56,7 @@ Use it this way: branch they read even when index coverage is partial. Expensive index work should converge from manifest state instead of extending the critical write path. Scalar staged index builds and vector inline residuals are documented - in [runs.md](runs.md) and [indexes.md](../user/indexes.md). + in [writes.md](writes.md) and [indexes.md](../user/indexes.md). 8. **Schema identity survives renames.** Accepted schema identity must remain stable across type and property renames. 
Rename support belongs in migration @@ -96,17 +96,25 @@ Use it this way: | Area | Current state | Source | |---|---|---| -| Multi-table commit | Manifest CAS plus recovery sidecars; not a single Lance primitive | [runs.md](runs.md), [architecture.md](architecture.md) | -| Constructive mutations | In-memory `MutationStaging`, one end-of-query table commit per touched table, then one manifest publish | [runs.md](runs.md), [execution.md](execution.md) | -| Deletes | Inline-commit residual; delete-only queries allowed, mixed insert/update/delete rejected by D2 | [query-language.md](../user/query-language.md), [runs.md](runs.md) | +| Multi-table commit | Manifest CAS plus recovery sidecars; not a single Lance primitive | [writes.md](writes.md), [architecture.md](architecture.md) | +| Constructive mutations | In-memory `MutationStaging`, one end-of-query table commit per touched table, then one manifest publish | [writes.md](writes.md), [execution.md](execution.md) | +| Deletes | Inline-commit residual; delete-only queries allowed, mixed insert/update/delete rejected by D2 | [query-language.md](../user/query-language.md), [writes.md](writes.md) | +| Branch delete | Manifest is the single authority, flipped atomically first; per-table forks + commit-graph branch are derived state, reclaimed best-effort (`force_delete_branch`) with the `cleanup` reconciler as the guaranteed backstop. 
Reusing a name whose reclaim failed before `cleanup` surfaces an actionable error | [branches-commits.md](../user/branches-commits.md), [maintenance.md](../user/maintenance.md) | | Schema validation | Type checks, required fields, defaults, edge endpoint checks, and edge cardinality are enforced on write paths | [schema-language.md](../user/schema-language.md), [execution.md](execution.md) | | Unique constraints | Intra-batch and write-path checks exist; full cross-version uniqueness is still a gap | [schema-language.md](../user/schema-language.md) | -| Storage trait | `TableStorage` exists as the sealed staged-write surface; full call-site migration and capability/stat surfaces are incomplete | [runs.md](runs.md), [architecture.md](architecture.md) | +| Storage trait | `TableStorage` exists as the sealed staged-write surface; full call-site migration and capability/stat surfaces are incomplete | [writes.md](writes.md), [architecture.md](architecture.md) | | Index lifecycle | `ensure_indices` is explicit today; reconciler-based convergence is roadmap | [indexes.md](../user/indexes.md), [maintenance.md](../user/maintenance.md) | | Traversal IDs | Runtime still builds `TypeIndex`; Lance stable row-id based graph IDs are roadmap | [architecture.md](architecture.md), [query-language.md](../user/query-language.md) | | Auth | Bearer token hashing and server-side actor resolution are implemented at the HTTP boundary | [server.md](../user/server.md), [policy.md](../user/policy.md) | | Tests | Tempdir-backed Lance tests are the current substrate; there is no `MemStorage` test backend | [testing.md](testing.md) | +The branch-delete reconciler is authority-derived: it reclaims orphaned forks +today and degrades to a no-op if Lance ships an atomic multi-dataset branch +operation, so the design composes with that future rather than blocking it. 
This +is the same shape as invariant 7 (indexes are derived state); prefer it over a +recovery-sidecar-style approach for any new multi-dataset metadata operation, +since the sidecar would be scaffolding to remove once the substrate closes the gap. + ## Known Gaps Do not hide these behind invariant wording. Either move them forward or keep @@ -122,6 +130,15 @@ them explicit. - **Deletes and vector indexes:** `delete_where` and vector index creation still advance Lance HEAD inline because the required public Lance APIs are missing. Keep D2 and recovery coverage in place until those residuals are removed. +- **Blob-column compaction:** Lance `compact_files` mis-decodes blob-v2 columns + under its forced `BlobHandling::AllBinary` read ("more fields in the schema + than provided column indices"), so `optimize` skips any table with a `Blob` + property — reporting `SkipReason::BlobColumnsUnsupportedByLance` (loud, not a + silent drop) behind the `LANCE_SUPPORTS_BLOB_COMPACTION` gate. Reads and writes + are unaffected; only space/fragment reclamation on blob tables is deferred. + Remove the skip when the upstream Lance fix lands — the + `lance_surface_guards.rs::compact_files_still_fails_on_blob_columns` guard + turns red on that bump to force it. - **Planner capability/stat surfaces:** cost-aware planning, complete capability advertisement, and explain-with-cost are roadmap. Do not describe them as implemented. diff --git a/docs/dev/lance.md b/docs/dev/lance.md index ef83f2c..9d2b990 100644 --- a/docs/dev/lance.md +++ b/docs/dev/lance.md @@ -175,7 +175,9 @@ Migration from Lance 4.0.0 → 6.0.1 landed in this cycle (DataFusion 52 → 53, - **Lance #6658 closed** (2026-05-14) but `DeleteBuilder::execute_uncommitted` did **not** ship in v6.0.1 — binary search across the release stream shows it first appears in `v7.0.0-beta.10` (the closing commits landed on main but didn't backport to the 6.x line). 
Tracked as MR-A: migrate `delete_where` to staged, retire the parse-time D2 mutation rule, extend recovery sidecar coverage. **Gated on the Lance v7.x bump**, not this PR. v7.0.0-rc.1 dropped 2026-05-21. - **Lance #6666 still open** (`build_index_metadata_from_segments` public): vector-index two-phase blocked; inline `create_vector_index` residual retained. - **Lance #6877 still open** (`MergeInsertBuilder` dup-rowid): PR #109's `SourceDedupeBehavior::FirstSeen` + `check_batch_unique_by_keys` precondition stay load-bearing. +- **`Dataset::force_delete_branch`** (`branches().delete(name, force=true)`, dataset.rs:524) tolerates a missing branch-*contents* ref (vs plain `delete_branch`'s `RefNotFound`), but on the local store still errors `NotFound` if the branch `tree/` directory is fully absent (`remove_dir_all`'s NotFound is not caught for Lance's native error variant, refs.rs:526-549). Both variants still refuse a branch with referencing descendants (`RefConflict`). `TableStore::force_delete_branch` wraps this to be fully idempotent (tolerates already-absent). The single-authority branch-delete redesign uses it for orphan reclamation (eager best-effort reclaim + cleanup reconciler). Pinned by `lance_surface_guards.rs::force_delete_branch_semantics`. Branch delete is "flip the ref atomically, then `remove_dir_all(tree/{branch})`"; branch-exclusive data lives under `tree/{branch}/` so a drop reclaims it immediately without touching `main`. +- **Lance blob-v2 `compact_files` bug** (no public issue found as of 2026-06): `compact_files` disables binary-copy for blob datasets and forces `BlobHandling::AllBinary` on the read side; the v2.1+ structural decoder then mis-counts column infos for the blob-v2 struct and fails with `Invalid user input: there were more fields in the schema than provided column indices / infos` (`lance-encoding/src/decoder.rs::ColumnInfoIter::expect_next`). 
This fails even a pristine uniform-V2_2 multi-fragment blob table; vector/list/scalar/ragged columns and mixed file versions all compact fine. Reads/queries use descriptor handling (`BlobHandling::default()`) and are unaffected. `optimize` skips blob-bearing tables behind `LANCE_SUPPORTS_BLOB_COMPACTION = false` (`db/omnigraph/optimize.rs`), reporting `SkipReason::BlobColumnsUnsupportedByLance`. Pinned by `lance_surface_guards.rs::compact_files_still_fails_on_blob_columns`, which turns red when the bug is fixed → flip the gate, remove the skip branch + the `maintenance.rs::optimize_skips_blob_table_and_reports_skip` skip assertions. -Surface guards added: `crates/omnigraph/tests/lance_surface_guards.rs` (8 named guards; 3 runtime + 5 compile-only). Future Lance bumps re-run this file first as the smoke check. Two additional guards from the original plan deferred to follow-up (`manifest_cas_returns_row_level_contention_variant` needs full publisher-race harness; `table_version_metadata_byte_compatible_with_v4` needs `pub(crate)` reach extension). +Surface guards added: `crates/omnigraph/tests/lance_surface_guards.rs` (10 named guards; 5 runtime + 5 compile-only). Future Lance bumps re-run this file first as the smoke check. Two additional guards from the original plan deferred to follow-up (`manifest_cas_returns_row_level_contention_variant` needs full publisher-race harness; `table_version_metadata_byte_compatible_with_v4` needs `pub(crate)` reach extension). Bump this date stanza on the next alignment pass. 
diff --git a/docs/dev/rfc-002-config-cli-architecture.md b/docs/dev/rfc-002-config-cli-architecture.md new file mode 100644 index 0000000..0a8e573 --- /dev/null +++ b/docs/dev/rfc-002-config-cli-architecture.md @@ -0,0 +1,590 @@ +# RFC: Config & CLI Architecture — Layered Config, Client Targeting, File Naming + +**Status:** Proposed +**Date:** 2026-05-30 +**Tickets:** MR-668 (multi-graph server, shipped — the dependency this builds on), MR-969 (stored queries + MCP — supplies the in-repo agent tool surface), MR-973 (quickstart / onboarding), MR-974 (agent setup surface), MR-981 (agent-friendly CLI hardening) +**Target release:** v0.8.x (tentative; phased — see Rollout) + +## Summary + +OmniGraph today has a single config file, `omnigraph.yaml`, read both by the CLI (operating the embedded engine) and by `omnigraph-server` (hosting graphs). There is **no client-side configuration that targets a *running server*** — to talk to a deployed `omnigraph-server` you drop to `curl` or the `omnigraph-ts` client. This is the one real gap in an otherwise coherent design (storage-URI addressing, multi-graph routing, per-graph policy). + +This RFC defines the config and CLI architecture that closes that gap, derived from first principles — *working backwards from what OmniGraph uniquely enables* rather than copying kubeconfig / `helix.toml`. The result: + +1. A **global-first layered config** — user-global (`~/.omnigraph/`) is the **primary, self-sufficient default**; per-project (`./omnigraph.yaml`) is an *optional* override + deployment manifest. One uniform schema, both layers optional; the CLI works from any directory with **no project file** (the `kubectl`/`aws`/`gh` posture), unlike today's project-anchored behavior. +2. A single unifying noun — the **target** — that resolves a name to a concrete `(locus, graph, sub-state, credential)` tuple, where the locus is **embedded (storage URI) XOR remote (server endpoint)**. +3. 
A **multi-server × multi-graph** client model (OmniGraph hosts N graphs per server and there are M servers — unlike Helix's one-cluster-one-graph). +4. **Credentials by reference, keyed by server name** (the AWS/gh/kube model) — OS keychain `omnigraph:` (preferred) → a `[]` profile in `~/.omnigraph/credentials` → `OMNIGRAPH_TOKEN[_]` env (CI). `servers.` is endpoint-only by default but may carry an explicit, secret-free `auth: { token: { env|file|command|keychain } }` source; no `credentials.yaml`; the shipped `bearer_token_env` + dotenv stay as a legacy compat path. Every committed/GitOps'd surface stays secret-free. +5. A **file-naming** decision: project and server config are **the same artifact, same name** (`omnigraph.yaml`); the only differently-named file is the user-global `config.yaml`, justified by **scope, not role**. + +The design optimizes jointly for **DX** (one command surface across embedded and remote; clone-and-go) and **AX** (agent experience: one flat resolved context, secrets structurally unreachable, branch-pinned reproducible reads, and a GitOps'd capability surface). + +## Reconciliation with shipped / planned CLI work + +Verified **against the code**, not ticket statuses (which are unreliable — e.g. MR-581 is marked done but is stale and unbuilt). Findings and the corrections they force: + +- **Noun is `graph`/`graphs`, NOT `target`/`targets`.** The config key is `graphs:` in `config.rs` and the flag is `--graph`. **This RFC uses `graphs:`/`--graph` throughout**; the unifying noun is a **`graphs:` entry** that is *embedded* (`storage:`, formerly `uri:`) XOR *remote* (`server:` + `graph_id:` defaulting to the entry key) — a typed locator (§1.1). Read any lingering `targets:`/`--target` below as `graphs:`/`--graph`. +- **`~/.omnigraph/` stands on its own merits** (Helix/aws/kube peer convention), **not** on precedent — there is **no `~/.omnigraph/` usage in the code** today. 
(MR-581 / MR-531 templates-into-`~/.omnigraph/` are *stale tickets, unbuilt*.) +- **Templates do not exist** in the code (no `template` command). The template mechanism is a *design question for this RFC / the init family*, not an existing foothold. +- **What actually exists in the CLI** (verified): `init, query(read), mutate(change), load, ingest, branch, schema, lint, snapshot, export, commit, policy, optimize, cleanup, graphs`. **Not built:** `serve, quickstart, template, prune, login`. `omnigraph init` exists (with `scaffold_config_if_missing`, `main.rs:1415`); the rest of the "init family" (`quickstart` MR-973, `serve` MR-970, `prune`/`init --force` MR-972/975, `mcp install`/skills MR-974, agent-mode MR-981) are **unbuilt tickets**, some stale. +- **Config still uses `aliases:`** (no `operations:` in code; MR-839 unbuilt). §6's reconciliation talks about `aliases:` as-is, noting `operations:` is a *proposed* rename. +- **`bearer_token_env` exists** (per-graph, `config.rs`); MR-971 flags a CLI-parity / server-side gap. The per-`servers.` extension lands on top of that. +- **A top-level `omnigraph lint` command exists** (verified). A stored-query *registry* validator must pick a verb that doesn't read as a competing lint/check. + +## Motivation + +Three problems, in priority order: + +- **No client→server targeting config.** The moment an operator stands up `omnigraph-server` — for bearer auth + Cedar at a network boundary + admission control + multi-graph routing — the CLI can't address it. `curl` is the fallback. There is no named, switchable, credential-carrying way to say "run this against `prod` on the team server." +- **Multi-server × multi-graph has no first-class expression.** OmniGraph genuinely runs N graphs per server across M servers. The same graph is **multi-homed** — `s3://b/prod` may be `prod` on server A, `production` on server B, and opened directly by the CLI. 
Today's flat `graphs:` map (name→storage-URI) can't express "graph `production` on server `prod-eu`." +- **Solo-first and embedded-first are unserved by the remote story.** A solo developer with no projects should define everything in `~`. A developer iterating locally (embedded, no server) and then pointing at staging (remote) should change *one word*, not learn a second command surface. + +MR-668 shipped the server side (multiple graphs per server). MR-969 ships the in-repo agent tool surface (stored queries / MCP). This RFC supplies the **client and config layer** that lets humans and agents target that surface coherently — the foundation under MR-973 / MR-974 / MR-981. + +## Non-Goals + +- **A control plane / dashboard for config.** Operators edit files and (for servers) restart. No runtime config-mutation API. Matches the MR-668 / MR-969 operational model. +- **Hot reload.** Restart-only for server-side config, matching MR-668 and MR-969. +- **Embedding secrets in any config file.** Credentials are by-reference; the git-ignored `auth.env_file` dotenv (or, later, the OS keychain) holds tokens. Never a committable `*.yaml`. +- **Renaming the project manifest by role.** No `omnigraph.server.yaml` / `omnigraph.client.yaml`. Role lives in sections, not filenames (see Design §3). +- **Dropping embedded mode.** Embedded-first is load-bearing for the file-naming decision; this RFC assumes it stays. +- **Cross-graph / cross-server tool listing in MCP.** Clients loop over per-graph catalogs (a MR-969 non-goal, restated). + +## Background + +OmniGraph runs on Lance 6.x: typed nodes/edges in per-type Lance datasets, atomic multi-table commits via a `__manifest` table, branchable and time-travelable. The CLI (`omnigraph`) operates the **embedded engine** directly against a storage URI — no HTTP client in its runtime dependencies. `omnigraph-server` (Axum) is a *separate* HTTP front-end over the same engine, with bearer auth + per-graph Cedar (MR-668). 
The two read the same `omnigraph.yaml` but never connect to each other. + +OmniGraph **already has a credentials-by-reference mechanism**, which this RFC builds on rather than replacing: `TargetConfig.bearer_token_env` names the env var holding a graph's bearer token, and `auth.env_file` points at a git-ignored dotenv (`.env.omni`) that the CLI auto-loads into the process (`load_env_file_into_process`) with real-env-vars-win precedence; `resolve_remote_bearer_token` resolves a token via env var then dotenv named lookup. `.env.omni` is already in `.gitignore`. + +The six **irreducible enablers** that drive the design (referenced as E1–E6 below): + +| # | Enabler | Consequence | +|---|---|---| +| E1 | A graph is a **self-contained storage URI**; the substrate (object store + manifest CAS) is the source of truth — no server required to read/write. | A graph is addressable **directly (embedded)**, not only via a server. | +| E2 | A server hosts **many graphs**; **many servers** exist. | The remote address space is **`{server} × {graph_id}`**. | +| E3 | The same graph is **multi-homed** under different per-locus names. | **Name ≠ identity.** Resolution is mandatory. | +| E4 | **Branch / commit / snapshot** are first-class addressable sub-state. | An address is *graph @ branch/snapshot*, not just graph. | +| E5 | Enforcement is **two-layered**: engine-layer Cedar (`_as` writers, works embedded) + HTTP-boundary bearer+Cedar (server only). | *How* you reach a graph determines *which* enforcement applies. | +| E6 | **Stored queries / MCP tools are a per-graph registry defined in the project config** (MR-969). | The **agent tool surface is version-controlled in the repo**. | + +Competitors collapse dimensions OmniGraph keeps live: **Helix** fuses E2+E3 (one cluster = one graph); **namidb** fuses E1+E3 into the URI (`s3://b?ns=prod`) and serves one namespace per process. 
OmniGraph has all of E1–E6 at once, so its config resolves a richer space — but the richness is *earned* by capability. + +## Design + +### 1. The address space and the `target` abstraction + +Every OmniGraph address is a tuple: + +``` +(locus, graph, sub-state, credential) + locus = embedded(URI) XOR remote(server-endpoint) # E1, E2 + graph = a URI (embedded) | a graph_id on a server (remote) # E3 + sub-state = branch | snapshot # E4 + credential = cloud-storage creds (embedded) | bearer token (remote) # E5 +``` + +The config's only job is **name → this tuple**. Define one noun — a **target** — that resolves to either shape: + +```yaml +targets: + dev: # embedded — substrate-direct (E1) + storage: s3://team-bucket/dev.omni + branch: main # sub-state (E4) + staging: # remote — resolves a server by reference (E2/E3) + server: staging # → looked up in `servers` + graph_id: prod # the graph's id on that server (defaults to the entry key) + branch: review +``` + +`--target staging` resolves: project `targets.staging` → `{server: staging, graph_id: prod, branch: review}` → `servers.staging` → `{endpoint, token-by-ref}` → final `(remote(https://…), prod, review, $TOKEN)`. Embedded targets skip the server hop and use cloud-storage credentials. + +**Two concepts, not kubeconfig's three.** kube splits cluster / user / context; that 3-way split is its most-cursed UX. A target *bundles* server+graph+branch+defaults under one name; the **only** thing split out is `servers`, because endpoints+credentials are shared across many targets and are secret-bearing (different ownership and rate-of-change; see §2). Result: **2 nouns — `servers` and `targets`.** Embedded `targets` (`storage:`) subsume today's `graphs:` entries. + +### 1.1 The resolved address is a typed *locator*, not a `uri` string + +The shipped config models a graph as a single `uri: String`, and code branches on `is_remote_uri(uri)`. 
That conflates two structurally different addresses: an **embedded** graph is a *complete, self-contained* address — one storage URI = one graph, opened directly via the embedded engine; a **remote** graph is a *server endpoint + a `graph_id`* — one server hosts N graphs. A bare server URL **is not a graph**; it lacks the `graph_id`. The cost of the string model, in the code today: + +- the CLI re-decides "server or file?" via `is_remote_uri` at ~16 call sites; +- `TargetConfig` (one `uri` field) **cannot express** multi-server × multi-graph or a multi-homed graph (E2/E3) — "graph `production` on server `prod-eu`" has no representation; +- the CLI **bails on remote URIs** for most operations, precisely because the string can't carry the `graph_id`; +- the `omnigraph-ts` SDK had to model `baseUrl` **+** `graphId` *separately* (rewriting `/graphs/{graphId}/…`) — it invented the structure the string lacks. + +So the *resolved* address is a **typed locator**, not a string: + +```rust +enum GraphLocator { + Embedded { storage: StorageUri }, // file:// , s3:// — a complete graph + Remote { server: ServerId, graph_id: GraphId }, // which server + which graph (+ bearer creds) +} +``` + +A `graphs:` entry resolves into this **once**; downstream code dispatches on the variant (the breadboard's `GraphConn = Embedded(engine) | Remote(http)`) instead of re-sniffing a scheme at each call site. The `uri` string becomes an *input format* for the embedded variant, never the address itself. 
+ +**YAML naming follows the locator — the *key* names the locus**, so neither the value's scheme nor a comment is load-bearing: + +| Locus | Key | Value | +|---|---|---| +| Embedded | **`storage:`** (shipped `uri:` is a deprecated alias) | a storage URI (`s3://…`, `file://…`) | +| Remote | **`server:`** | a name in `servers:` (its `endpoint` + creds resolve by name, §5) | +| Remote graph id | **`graph_id:`** | the id on that server — **defaults to the entry key**; set only when the local alias differs | + +An entry has `storage:` **xor** `server:` — the deserializer rejects *both* and *neither* (no silent ambiguity). This removes two prior confusions: `graphs:` (the map) vs `graph:` (the remote id), and `uri:`-might-be-a-server. + +```yaml +servers: + prod-eu: { endpoint: https://og-eu.internal:8080 } +graphs: + dev: { storage: s3://team-bucket/dev.omni } # embedded + production: { server: prod-eu } # remote — graph_id = "production" (the key) + staging: { server: prod-eu, graph_id: prod } # remote — alias ≠ server's id +``` + +### 1.2 Invalid configs are rejected by design + +The DX rule is: **a config field is either honored or rejected, never silently ignored**. The loader therefore has two phases: + +1. Parse YAML into a loose/raw shape that preserves origin (`base_dir`, layer, line/path when available). +2. Convert once into a typed, role-aware resolved config. Every command receives the resolved form, not the raw YAML structs. + +The typed graph shape is: + +```rust +enum GraphEntry { + Embedded(EmbeddedGraphEntry), + Remote(RemoteGraphEntry), +} + +struct EmbeddedGraphEntry { + storage: StorageUri, + branch: Option, + policy: Option, + queries: QueryRegistrySpec, +} + +struct RemoteGraphEntry { + server: ServerId, + graph_id: GraphId, + branch: Option, +} +``` + +That makes these rules structural rather than advisory: + +- A graph entry must specify **exactly one** locator: `storage:`/legacy `uri:` xor `server:`. 
+- `policy:` and `queries:` are valid only on `Embedded` graph entries, because they define the capability surface of a graph this process opens directly. A `Remote` graph entry points at a server; that server owns policy and stored-query definitions. +- `omnigraph-server` may serve only `Embedded` graph entries. A server manifest entry with `server:` is rejected: a server should not "host" a graph by proxying another server. +- A named graph uses its own graph entry. Top-level `policy:` / `queries:` are a legacy anonymous-bare-URI compatibility path only; if a named graph is selected while top-level blocks would be ignored, config validation errors with a migration hint. +- A client-defined remote graph discovers stored queries from the server (`GET /queries`) and invokes them (`POST /queries/{name}`); it does not define `queries:` locally for that remote graph. + +Examples that must fail fast: + +```yaml +graphs: + prod: + storage: s3://team-bucket/prod.omni + server: prod-us # invalid: storage xor server +``` + +```yaml +graphs: + prod: + server: prod-us + graph_id: production + policy: { file: ./policies/prod.yaml } # invalid: remote graph policy lives on the server + queries: + find_user: { file: ./queries/find_user.gq } # invalid: remote graph queries are discovered +``` + +`omnigraph config view --resolved --show-origin` is the user-facing debugger for this boundary: it shows the final `Embedded` or `Remote` graph and where every honored field came from. Fields that cannot be honored never make it into the resolved view; they fail validation first. + +### 2. 
Layered config — global-first, uniform schema, project-optional + +**Posture: global-first, project-optional.** OmniGraph's CLI is primarily a *client* (it operates against graphs and servers, embedded or remote), so it sits on the **global-first** side of the CLI-config axis — like `kubectl` / `aws` / `gh` / `docker`, and unlike *project-first* tools (`git` / `cargo` / `terraform`) whose primary config is per-repo. The **global user config is the primary, self-sufficient default**; the project file is an *optional* repo-scoped override (and, when present, the deployment manifest). `omnigraph query --target prod` must work from **any directory with no project file**, exactly as `kubectl get pods --context prod` works from anywhere. *(This is a deliberate flip from today, where the CLI reads `./omnigraph.yaml` and does not even walk parent dirs — i.e. today it is project-anchored.)* + +**Rule: the two layers share ONE raw schema, and each is fully self-sufficient** (the git-layering mechanism — same schema at both levels; you never need a repo to have a working config). Do **not** specialize the file format by layer. Instead, run the same role-aware validation everywhere (§1.2): the global and project layers may both define graph locators, defaults, servers, and aliases, but fields that are meaningless for a resolved graph variant are rejected rather than ignored. For example, `queries:` is valid for an embedded graph this config opens directly; it is invalid on a remote graph entry because remote stored queries are server-owned and discovered. + +This makes the **zero-project case the default, not an edge case**: a solo user (or an agent) defines everything needed for client work in `~/.omnigraph/config.yaml` — servers, embedded + remote graph locators, defaults, aliases, and optionally personal embedded-graph query registries — and **never creates a project file**. 
A team adds `./omnigraph.yaml` only when it wants repo-scoped overrides or a committed, GitOps'd deployment manifest. Global-first does **not** forbid project files; it stops *requiring* them (the kubectl model: `~/.kube/config` is sufficient and default; per-project kubeconfigs are opt-in via `KUBECONFIG`). + +| Layer | Required? | Typical use | Path | +|---|---|---|---| +| Global | no | **the default** — solo/agent's entire config; shared servers+creds for teams; even a personal server's graphs/queries | `~/.omnigraph/config.yaml` | +| Project | no | **opt-in** — repo-scoped overrides + the committed deployment manifest (graphs, queries, policy) | `./omnigraph.yaml` | + +**Precedence (low → high):** built-in defaults < global < project < env vars < CLI flags. With no project file it collapses to **built-in < global < env < flags** — the common global-only path. + +**Merge semantics — "closest layer wins, at the smallest meaningful unit"** (the field consensus: git / kubeconfig / cargo / Helm / VS Code): +- **Settings objects** (`defaults`, `auth`, `server`) → **deep-merge per field**: a project sets `defaults.graph` and *inherits* the global `defaults.output_format`. (VS Code / cargo behavior.) +- **Named-resource maps** (`servers`, `graphs` / compat `targets`, `queries`, `aliases`) → **union by key; on a collision the higher layer's entry REPLACES the lower wholesale** — *no field-level deep-merge within an entry*. (kubeconfig: union contexts by name.) The footgun this avoids: global `servers.prod = {endpoint, policy}`, project `servers.prod = {endpoint: other}` — deep-merge would silently retain the old fields; replace makes the project's `prod` self-contained and predictable. +- **Lists/arrays** → **replace, never append** (Helm convention; appending is order-sensitive and surprising). +- **Scalars** → higher layer wins. 
+- **Relative paths carry their origin's base_dir.** A `queries:` entry's `.gq` path, or a `policy.file`, resolves against the directory of the layer it was *defined in* β€” global entries under `~/.omnigraph/`, project entries under the project dir. +- **Inspectable (non-negotiable):** `omnigraph config view --resolved --show-origin` prints each final value *and which layer set it* (the `git config --show-origin` / `kubectl config view` rule). A layered config without origin-tracing is a debugging trap. + +### 3. Roles, and the file-naming decision (same name for project = server) + +`omnigraph.yaml` carries two *roles* that diverge in prod and collapse on a laptop: + +- **Server role** (read by `omnigraph-server`): `graphs:` entries that are **embedded storage locators**, per-graph `policy.file`, **`queries:` β€” the stored-query/MCP registry lives here**, plus serving knobs. Remote graph locators are rejected in this role. +- **Client role** (read by the CLI/agent): `servers:`, embedded or remote `graphs:` locators, `defaults:`, `aliases:`. A remote graph locator points at server-owned capabilities; it cannot define local `policy:` or `queries:`. + +**Project config and server config are the same artifact, hence the same name.** The server *serves the project*: the file that says "these graphs exist, with these stored queries and this policy" is simultaneously the project manifest and the server's deploy config. Role is distinguished by which *sections* are populated, never by filename. Readers ignore sections that are not theirs (today's file already does this with `cli:` vs `server:`). + +**Why not kube's role-split.** Two coherent models exist: (A) one project file with role-sections (Helix `helix.toml` holds both `[local.dev]` and `[enterprise.production]`; compose; Cargo), and (B) deployment-manifest strictly separate from client config (kubectl β€” you never put a context in `deployment.yaml`). 
kube is the sharpest topological analog (multi-server Γ— multi-graph, one client targeting many), so B has a real claim. The tiebreaker is **E1: OmniGraph is embedded-first.** In embedded mode the manifest's `graphs:` *is* the local target list β€” manifest and local-client-view are the same object, so splitting them (B) fights the grain and forces two files for local work. kube splits because it has **no** embedded mode (client always remote+global). So: take the half kube is right about β€” *remote* client targeting (`servers:`, endpoints, creds) is a separate concern in a separate **user-global** file (`config.yaml`, like `~/.kube/config`); reject the half it is wrong about for us β€” do **not** split the *project* layer by role. **The second name (`config.yaml`) is justified by scope (user-global), not role.** *(If OmniGraph ever dropped embedded mode and went pure-remote, model B's strict split would become cleanest.)* + +### 4. File naming + +Principles from the field: **one global dir** `~/.omnigraph/` (like `~/.aws`/`~/.kube`/`~/.helix`), with config/cache/state as **subdirectories** (separation without XDG's three-root scatter); **secrets keyed by server name in the OS keychain or a separate git-ignored profile file** (AWS/gh model, not a new `credentials.yaml`); **project-root manifest keeps the app-named file** (`Cargo.toml`, `package.json`); **`.yaml`, not `.yml`**; keep OmniGraph's established names. The genuinely *new* decisions are the **global** dir's existence and keyed-by-name resolution with an explicit `auth.token` override (MR-971); the shipped `bearer_token_env` + `auth.env_file` mechanism remains as legacy compat. + +| Artifact | Path / name | Why | +|---|---|---| +| Project = server config (one artifact) | `./omnigraph.yaml` | **Keep.** Root manifest like `Cargo.toml` / `compose.yaml` / `helix.toml`. Same name for both roles because it is one file. 
In prod the server's deploy repo and an app repo each have their own `omnigraph.yaml` β€” same name, different repos. | +| Global user config | `~/.omnigraph/config.yaml` | **One dir** (`~/.omnigraph/`, like `~/.aws`/`~/.kube`/`~/.helix`). Named `config.yaml` *not* `omnigraph.yaml` β€” the name signals scope (and `~/.aws/config`, `~/.kube/config`, `~/.helix/config` all do this). Holds the full schema so a solo user needs nothing else. | +| Credentials | OS keychain (`omnigraph:`, preferred) β†’ `~/.omnigraph/credentials` profile file (`[]`, `0600`, git-ignored). **Keyed by server name**, inside the one dir. | **Key by name, AWS/gh model** β€” `~/.aws/credentials [profile]`, `~/.kube/config users:`, `~/.helix/credentials`. *Not* a `credentials.yaml`, and *not* a per-server hand-named env var; the secret lives under the server name (no indirection). Legacy `bearer_token_env` + `.env.omni` dotenv remain as a compat path. See Β§5. | +| Cache / state | `~/.omnigraph/cache/`, `~/.omnigraph/state/` | Subdirs of the one dir (like `~/.aws/sso/cache/`, `~/.kube/cache/`) β€” cache is `rm -rf`-safe and backup-excludable without scattering across XDG roots. | +| Cedar policy | `./policies/.yaml` + `.tests.yaml` | **Keep.** Referenced by `policy.file`. | +| Schema | `./*.pg` (e.g. `schema.pg`) | **Keep.** | +| Stored queries | `./queries/*.gq` | **Keep.** `.gq` sources referenced by the `queries:` registry. | + +**Global dir: `~/.omnigraph/` β€” one place, with subdirectories.** Everything OmniGraph keeps for a user lives under a single `~/.omnigraph/` directory, matching the peer group (`~/.aws`, `~/.kube`, `~/.docker`) and the direct competitor (`~/.helix`). This is what DB/cloud-CLI users expect and the lowest-cognitive-load shape. + +*Separation and "one place" are not in conflict* β€” the decisive realization. 
The peer tools get config/cache/state separation via **subdirectories inside the one dir**, not via XDG's three scattered roots: `~/.aws/sso/cache/`, `~/.kube/cache/`. So OmniGraph keeps `~/.omnigraph/config.yaml`, `~/.omnigraph/credentials`, `~/.omnigraph/cache/` (catalogs — `rm -rf`-safe, backup-excludable), `~/.omnigraph/state/` (session, logs) — getting cache hygiene **and** a single discoverable location, without the XDG scatter. An earlier draft argued XDG on a false dichotomy (it assumed single-dir ⇒ mixed); subdirs dissolve it. `~/.omnigraph/` is canonical and documented; `$XDG_CONFIG_HOME` may optionally be honored if a user has set it, but XDG is not part of the mental model. + +**Env / override precedence (the `KUBECONFIG` analog):** +- `OMNIGRAPH_CONFIG=/path` — explicit config file, highest precedence. +- `OMNIGRAPH_HOME=/path` → the global dir (default `~/.omnigraph/`); `$XDG_CONFIG_HOME` optionally honored if a user has set it, but `~/.omnigraph/` is canonical. +- Cache and state are subdirs of the one dir: `~/.omnigraph/cache/` (cached remote catalogs), `~/.omnigraph/state/` (session, logs). +- Per-server token resolution: an explicit `auth: { token: {...} }` source (env/file/command/keychain) wins if set; otherwise **keyed by the server name** — `OMNIGRAPH_TOKEN_<SERVER>` (or `OMNIGRAPH_TOKEN` for the active server) → OS keychain `omnigraph:<server>` → the `[<server>]` profile in `~/.omnigraph/credentials`; legacy `bearer_token_env` still honored. See §5. + +### 5. 
Credentials, connection tiers, and bind portability (12-factor) + +**Credentials are by-reference everywhere, never inlined — and keyed by the *server name*, not by a hand-invented env-var name.** This is the one place the design departs from simply reusing the shipped `bearer_token_env` mechanism, because that mechanism is sub-optimal for a multi-server client: it forces the operator to invent and coordinate an env-var name per server (three steps to add a server: pick a var, name it in config, set it in the store). The peer group (AWS profiles, `gh` hosts, kubeconfig users, docker auths) instead keys the secret **by the server's name** — no indirection. OmniGraph should match that. + +**Resolution for server `<server>` (no config field required):** +1. **`OMNIGRAPH_TOKEN_<SERVER>`** env var (name-derived, upper-snake), else **`OMNIGRAPH_TOKEN`** for the active server — the CI/headless override (12-factor). +2. **OS keychain** entry `omnigraph:<server>` — the preferred interactive store (no plaintext on disk); written by `omnigraph login <server>`. +3. **`~/.omnigraph/credentials`** — an AWS-style profile file keyed by server name (mode `0600`, git-ignored), the fallback when no keychain: + ```ini + [prod-us] + token = … + [prod-eu] + token = … + ``` +So a `servers.<server>` with no token field resolves by name — adding a server is one step (`omnigraph login <server>`), and "multiple servers, multiple tokens" falls out for free. + +**But implicit must not be the *only* path — explicit sourcing is a first-class option** (the DX/AX lesson). Pure-convention is invisible (you must *know* `OMNIGRAPH_TOKEN_<SERVER>`), can't integrate with a secrets-manager's fixed var name, and can't do dynamic/short-lived tokens. So a server may declare an explicit `auth:` block — a **method-agnostic wrapper** (today only `token:` for bearer; `mtls:`/`oidc:` are the future siblings, so the credential model never has to be re-keyed) holding a tagged token *source*. 
Secrets are *still* never inlined (every source is a reference): + +```yaml +servers: + prod-us: + endpoint: https://og-us… + auth: { token: { env: OG_PROD_US_TOKEN } } # explicit env var β€” self-documenting (= legacy bearer_token_env) + prod-eu: + endpoint: https://og-eu… + auth: { token: { command: [vault, read, -field=token, secret/og] } } # dynamic / short-lived + edge: + endpoint: https://og-edge… + auth: { token: { file: /run/secrets/og-token } } # k8s/docker mounted secret + staging: + endpoint: https://og-staging… # no auth: β†’ implicit chain (below) +``` + +| `auth.token:` source | when | DX/AX value | +|---|---|---| +| *(auth omitted)* | the common case | zero-config; `omnigraph login` populates keychain `omnigraph:` | +| `{ env: VAR }` | secrets-manager / CI injects a fixed var | **self-documenting** β€” config states the source; = the legacy `bearer_token_env` | +| `{ file: PATH }` | k8s/docker secret mounted as a file | no env plumbing | +| `{ command: [...] }` | Vault, cloud IAM, `gh auth token` | **dynamic tokens** β€” first-class exec, the capability pure-env/keychain can't give (kube `exec` / AWS `credential_process`) | +| `{ keychain: ENTRY }` | pin a non-default keychain entry | explicit override of the name-derived default | + +**Resolution per server:** if `auth.token:` is set, use that source (no fallthrough). Else the **implicit chain**: `OMNIGRAPH_TOKEN_` (or `OMNIGRAPH_TOKEN` for the active server) β†’ keychain `omnigraph:` β†’ `[]` in `~/.omnigraph/credentials` (`0600`, git-ignored). `omnigraph login ` writes/rotates only that server's secret; per-server precedence is independent; sharing is opt-in (same env var or source). The `command` source runs locally with the operator's own privileges and is defined only in operator-owned config (never server-supplied), so it adds no remote-execution surface. 
The `auth:` wrapper is method-agnostic so adding mTLS/OIDC later is a new sibling key, not a breaking re-key (Hyrum's Law: the field name is a contract once shipped). There is **no `credentials.yaml`** and **no inlined secret**. *Convention for the floor, explicit for control β€” and explicit is legible to agents and never inlines a secret.* + +**Back-compat.** The shipped per-graph `bearer_token_env` + `auth.env_file` dotenv (`resolve_remote_bearer_token`, real-env-wins) keeps working unchanged for existing single-server setups; `bearer_token_env` is just the legacy flat alias for `auth: { token: { env } }`. Resolution tries an explicit `auth.token:` (or legacy `bearer_token_env`) first, then the keyed-by-name chain β€” so nothing breaks, but the zero-config default is the no-boilerplate keyed-by-name path. (MR-971 β€” the `bearer_token_env` parity gap β€” is where this resolver work lands.) + +**Three connection tiers** (Supabase/Prisma teach the zero-config floor): +1. **Env vars** β€” `OMNIGRAPH_SERVER=https://…` + `OMNIGRAPH_TOKEN=…`: zero-config remote, no file (the `DATABASE_URL` floor). +2. **Global `config.yaml`** β€” named `servers:` + `graphs:` for multi-server setups (the AWS-profiles convenience). +3. **Project `omnigraph.yaml`** β€” project-pinned targets/graphs, committed. + +**Keep `omnigraph.yaml` a *portable* manifest (12-factor).** Deploy-specific runtime that varies per environment β€” the **bind host/port**, worker counts β€” should be supplied by **`--bind` / `OMNIGRAPH_BIND` (flags/env)**, *not* a committed `server.bind:` baked into the manifest. A manifest that hardcodes `0.0.0.0:8080` is not portable across deploys and leaks an environment detail into a version-controlled file. The same-named `omnigraph.yaml` stays portable across deploys precisely because the volatile, per-environment knobs live in env/flags (12-factor config), while the stable, portable definition (graphs, queries, policy) lives in the file. 
This is the one concrete lesson taken from kube's model-B without adopting its file split: portability via env/flags, not via a second file. + +### 6. Where stored queries live: defined locally, invoked remotely + +A stored query splits across three axes; do not conflate them: +- **Definition** (`.gq` source + `queries:` entry) lives next to the **embedded graph entry that owns it**. For a hosted remote graph, that is the **deployment manifest** read by `omnigraph-server`; for a personal embedded graph, it may be the user's own config. It never lives on a client-side `Remote` graph entry. +- **Discovery** ("what tools exist for me?") is fetched from the **server** (Cedar-filtered `GET /queries` / MCP catalog) at connect time. +- **Invocation** is **remote** (client → server, HTTP/MCP) — or **embedded** (the CLI opens the graph directly and reads the same manifest). + +For remote use, the client carries *pointers to servers*, not query definitions; it **discovers and invokes**, never defines. This is the **capability-as-code guarantee for agents**: an agent can only invoke tools the server's *committed, reviewed* config exposes — it **cannot define a new tool at runtime**. Definition is structurally outside the agent's reach. + +`queries:` (graph-capability registry, Cedar-gated when served remotely, MCP-visible when exposed) and `aliases:` (client CLI shortcut) overlap — both can name `.gq`-backed operations. This RFC keeps them siblings (the MR-969 decision); the clean long-term is **one registry, two invocation surfaces** (embedded + remote), with `aliases:` subsumed. Out of scope here. + +#### Reconciling `aliases:` with the role model + +`aliases:` is the pre-MR-969, **client-role, embedded-only, ungated** ancestor of `queries:`. An alias bundles `command` (read/change), `query` (`.gq` path), `name` (symbol), `args` (positional param names), and `graph`/`branch`/`format` defaults; the CLI runs it embedded. The server never reads it. 
So: + +- **Role:** `aliases:` is **client-role** (CLI behavior) β†’ it may live in **both** the user-global `config.yaml` and the project manifest, layered. `queries:` is **graph-capability role** β†’ it lives only on an `Embedded` graph entry, and for remote server graphs that means the server deployment manifest. *Who opens the graph determines where query definitions can live.* +- **Difference:** `aliases:` = embedded invocation, no gating, explicit `command`, bundles client defaults + positional args. `queries:` = remote (+future embedded), Cedar + `mcp.expose`, **infers** read/mutate, bundles only MCP settings. +- **Convergence:** decompose an alias β€” *definition* (nameβ†’.gq+symbol) β†’ `queries:` (the superset: typed, validated, gated, multi-surface, no redundant `command`); *target/branch/format* β†’ client invocation context (`--target`/`--branch`/`--format` or `defaults:`), not baked per-query; *positional `args`* β†’ thin CLI sugar or dropped (agents/services use named JSON params). End-state: one `queries:` registry + the client config model subsumes `aliases:`. +- **Validation:** a file-backed alias (`query: ./foo.gq`) may target only an embedded graph. A remote graph shortcut must be explicit that it invokes a server-owned stored query, e.g. `invoke: find_user`, so the client cannot smuggle a new `.gq` definition into a remote capability surface. +- **v1:** keep `aliases:` unchanged. Footgun worth a load-time warn: an alias and a query with the same name in one manifest are different namespaces invoked differently (`--alias X` vs `POST /queries/X`). + +```yaml +aliases: + local_owner: + command: query + query: ./queries/owner.gq + name: owner + graph: dev # valid only if `dev` resolves Embedded + + remote_owner: + invoke: find_user + graph: prod # valid only if `prod` resolves Remote; source lives on the server + args: [name] +``` + +### 7. 
CLI surface + +- `omnigraph login <server>` — interactive auth; stores the token keyed by server name in the OS keychain (`omnigraph:<server>`) or the `[<server>]` profile of `~/.omnigraph/credentials` (0600). The `gh auth login` analog. +- `omnigraph use <graph>` — set the active graph (writes the appropriate layer). The `kubectl config use-context` analog. +- `omnigraph config view [--resolved] [--show-origin] [<graph>]` — print the merged config and, with `--resolved`, the final tuple **plus the origin layer of every field** (the `git config --show-origin` / `kubectl config view` analog). Resolution is never a mystery. +- All existing verbs (`query`, `mutate`, `load`, `schema`, `branch`, …) gain `--graph <name>`; resolution decides embedded vs remote transparently. + +### 7.5 Init, login, and bootstrap — three tiers (folds in the Q2 design) + +Scaffolding splits into three tiers by *scope* and *fatness*, mirroring the field (supabase `init` vs `login`; HelixDB thin `init` vs fat `chef`). Most of this lives in sibling tickets; this RFC owns only the **user route**. 
+ +| Tier | Command | Scope | What it does | Model | Status | +|---|---|---|---|---|---| +| **User route** | `omnigraph login []` | user (`~/.omnigraph/`) | auth + write `~/.omnigraph/config.yaml` / `credentials`; first-run global setup | gh / supabase `login` | **this RFC** (unbuilt) | +| **Thin project init** | `omnigraph init` | project, in-place | create graph + `scaffold_config_if_missing` (`omnigraph.yaml` + minimal `.pg`/`.gq`); refuse-if-exists or `--force` | `cargo init`, `prisma init` | exists; `--force` purge = MR-975 | +| **Fat bootstrap** | `omnigraph quickstart [--template ] [--auto]` | project, possibly new-dir | scaffold + seed data + `serve start` + agent prompt file | HelixDB `chef`, `create-next-app` | MR-973 (unbuilt) | + +**Design positions** (first-principles, since none of the fat tier is built): +- **Split `init` (project) from `login` (user)** β€” never one command writing to both `$HOME` and the project (the supabase line, not the dbt line). `init`=project scaffold; `login`=user credential + global config. +- **`init` is in-place + refuse-if-exists** (cargo/prisma/terraform default): don't clobber; adopt existing files; require `--force` to overwrite (and `--force` purges Lance state per MR-975). +- **Interactive for humans, `--auto`/agent-mode for automation** (npm `-y`, create-* `--CI`, MR-981 `--machine`). In `OMNIGRAPH_AGENT_MODE` any prompt β†’ fail with a repair hint. +- **Templates are a `--template ` flag on the fat tier** (create-vite model), with the *content* (schema + queries + seed) coming from a template source. Mechanism is a design question (bundled-in vs `og template pull` from a repo vs `npm create-*`-style delegation) β€” **not** an existing foothold (MR-581 stale). Lean: a small set of bundled templates first (generic `Personβ†’Knows`, plus promote `omnigraph-intel-bootstrap`), `--template ` later. 
+- **`init`/`quickstart` can scaffold the `graphs:` map with one or more entries**; "init with specific graphs" = the scaffolded `graphs:` block (embedded `storage:` locally; the agent/operator adds remote `server:` entries via `login` + editing). +- **Secrets-on-scaffold rule** (prisma/dbt/supabase all do this): anything that writes a token also keeps it out of VCS. `login` prefers the OS keychain (no file); the `~/.omnigraph/credentials` profile fallback is `0600` and git-ignored, and any project-local `.env`-shaped file gets a `.gitignore` entry. + +### 8. Concrete shape + +**Global** `~/.omnigraph/config.yaml` (per-user, secret-free): +```yaml +servers: # endpoint only β€” token is keyed by the server name + prod-us: { endpoint: https://og-us.internal:8080 } + prod-eu: { endpoint: https://og-eu.internal:8080 } + staging: { endpoint: https://og-staging.internal:8080 } +graphs: + personal: { storage: ~/graphs/personal.omni } +defaults: + graph: personal +aliases: + my_people: + command: query + query: ~/queries/people.gq + name: list_people + graph: personal +``` + +**Project client** `./omnigraph.yaml` (committed, secret-free, portable β€” no `server.bind`). Note the shipped noun is `graphs:` (MR-603); an entry is embedded (`storage:`) XOR remote (`server:` + `graph_id:`, Β§1.1): +```yaml +graphs: + dev: { storage: s3://team-bucket/dev.omni, branch: main } # embedded + staging: { server: staging, graph_id: prod, branch: review } # remote β†’ graph `prod` on server `staging` + prod-us: { server: prod-us, graph_id: production } + prod-eu: { server: prod-eu, graph_id: production } # multi-homed: same graph, another server +defaults: { graph: dev, output_format: table } +aliases: + owner: + command: query + query: ./queries/owner.gq + name: owner + args: [name] + graph: dev +``` +Select with `--graph ` (shipped flag, MR-603). + +**Server deployment** `./omnigraph.yaml` (committed in the deploy repo, read by `omnigraph-server`). 
Every served graph is an embedded storage locator; server-owned policy and stored-query definitions live here: +```yaml +graphs: + production: + storage: s3://team-bucket/prod.omni + policy: + file: ./policies/prod.yaml + queries: + find_user: + file: ./queries/find_user.gq + mcp: { expose: true, tool_name: lookup_user } + +server: + policy: + file: ./policies/server.yaml +``` + +**Credentials** are keyed by server name β€” `omnigraph login prod-us` writes the OS keychain entry `omnigraph:prod-us` (or a `[prod-us]` profile in `~/.omnigraph/credentials`, 0600, git-ignored); `OMNIGRAPH_TOKEN_PROD_US` overrides for CI. No token fields in any config file; no committable secrets. + +## DX + +1. **One command surface, two loci.** `query --graph dev` (embedded) and `--graph staging` (remote) are the same command; only resolution differs. Change one word, not a mental model. +2. **Clone-and-go.** Project config names servers+graphs; teammate runs `omnigraph login staging` once and every target resolves. The git + `gh auth login` model. +3. **Multi-server Γ— multi-graph is the default.** Remote graph entries reference `server` by name; `servers` is a global named map; graphs are per-server. `prod-us` and `prod-eu` both serving `production` is two graph entries β€” Helix cannot express this. +4. **Solo-first.** Everything in `~`, no project required. +5. **Laptop-to-fleet on one schema.** Local = one `omnigraph.yaml` (both roles); prod = role-split across repos. No second format to learn. + +## AX (agent experience) + +1. **One flat resolved context, never a config to navigate.** targetβ†’serverβ†’endpointβ†’token resolves *before* the agent sees anything. The agent reasons about tools, not topology (the LLM-safe-surface principle extended to config). +2. **Secrets are structurally outside the agent's reach.** The repo it operates in has no tokens; they are in the global layer / keychain, outside its view. 
An agent *cannot* exfiltrate a prod token from project config because it is not there. +3. **Branch/snapshot-pinned contexts** (E4) β€” hand an agent a `branch: review` / `--snapshot v42` target and its reads are reproducible and cannot see uncommitted main-line state. No kubeconfig analog. +4. **The agent's capabilities are a GitOps'd artifact** (E6) β€” which graphs exist, which stored-query tools it may call, and which Cedar rules gate them are all in the version-controlled server config. Powers change only via a reviewed PR, deployed by restart. Infrastructure-as-code for what the AI can do. +5. **Config + policy compose.** Config = "where am I pointed + which token"; Cedar = "what may I do there." Orthogonal; no enforcement logic leaks into config. + +## GitOps β€” three surfaces, secrets in none + +| Surface | Repo | Contents | Deploy | Secrets | +|---|---|---|---|---| +| Server deployment config | infra/deploy repo | `graphs:`, policy, **`queries:` + `.gq` files** | commit β†’ CI β†’ **server restart** (no hot reload) | none β€” by-reference | +| Project client config | app repo | `graphs:` β†’ embedded storage or remote server+graph | committed, read by CLI/agent | none | +| Global user config | **not GitOps'd** β€” machine-local `~` | `servers:` + creds-by-ref | `omnigraph login` writes it | refs only (like `~/.kube/config`) | + +## Comparison + +| Property | kubeconfig | Helix | git | compose | **OmniGraph (this RFC)** | +|---|---|---|---|---|---| +| Named remote endpoints + creds-by-ref | βœ… | βœ… | partial | partial | βœ… (global `servers`) | +| Global + project layering, uniform schema | βœ— | βœ— | βœ… | βœ— | βœ… | +| Embedded OR remote under one name | βœ— | βœ— | n/a | βœ— | βœ… (E1) | +| Multi-server Γ— multi-graph | βœ… | βœ— | n/a | n/a | βœ… (E2) | +| Branch/snapshot in the address | βœ— | βœ— | partial | βœ— | βœ… (E4) | +| Agent tool surface in the repo | βœ— | βœ— (separate bundle) | n/a | n/a | βœ… (E6) | +| Project manifest renamed by 
role | — | no | — | no | **no** | +| Concept count | 3 | 1 | 2 | 1 | **2 (servers/targets)** | + +## Migration / backwards compatibility + +- **Additive.** Today's `omnigraph.yaml` (`graphs:`, `cli:`, `server:`, `aliases:`, `policy:`) keeps working unchanged. `graphs:` entries are equivalent to embedded `targets:` with a `storage:` (shipped `uri:` is a deprecated alias); both resolve. +- **`targets:` is new** and optional. `servers:` is new and optional. Absent → today's behavior. +- **Global `~/.omnigraph/config.yaml` is new.** Absent → only project + env + flags, exactly as now. Its addition is the **global-first posture flip**: today the CLI is project-anchored (reads `./omnigraph.yaml`, no parent walk); the global config becomes the new primary discovery path so the CLI works with no project file. Existing project-only workflows are unchanged (project still overrides global); the flip is additive — it adds a fallback layer below the project file, it does not remove the project file. +- **`graphs:` → `targets:` is an evolution, not a break.** Both can coexist; `targets:` is the superset (adds remote + branch pinning). A future cleanup may alias `graphs:` to embedded `targets:`. +- **`server.bind` stays supported** but documentation steers operators to `--bind` / `OMNIGRAPH_BIND` for portability; no removal. +- **Credentials: keyed-by-name is new; `bearer_token_env` is the compat path.** The primary design (keychain / `[<server>]` profile / `OMNIGRAPH_TOKEN_<SERVER>`) is new resolver work (lands on MR-971). The shipped `bearer_token_env` + `auth.env_file` dotenv (`resolve_remote_bearer_token`) is **unchanged and still honored** — existing single-server dotenv setups keep working, and the resolver honors an explicit `auth: { token: {...} }` source (env/file/command/keychain) with `bearer_token_env` as its flat legacy alias. No `credentials.yaml`. 
+- **Validation tightens invalid mixes, not valid legacy use.** Top-level `policy:` / `queries:` remain only for anonymous bare-URI compatibility. Named graphs use per-entry fields. Remote graph entries with local `policy:` / `queries:` and server manifests with `server:` graph locators are rejected because there is no correct way to honor those fields. + +## Open questions + +- **`graphs:` vs `targets:` naming churn.** Do we rename `graphs:` β†’ `targets:` (with a deprecation alias) or keep `graphs:` for embedded and add `targets:` for remote? Leaning: keep both, document `targets:` as the superset. +- **Keychain integration scope.** Keychain is now the *primary* credential store (Β§5), so this is on the critical path, not optional: macOS Keychain first (matches operator practice) with the `0600` `[]` profile file as fallback; Linux Secret Service / `pass` later. Open: which keyring crate, and the exact `OMNIGRAPH_TOKEN_` name-derivation (upper-snake, non-alnum β†’ `_`). +- **Project-local `servers:`.** Allowed (e.g. a localhost dev server), merged with global. Confirm creds stay by-reference even for project-local servers (yes). +- **`aliases:` ⇄ `queries:` convergence.** Out of scope here; tracked separately. One registry with embedded + remote invocation surfaces is the target end state. +- **Single-file `KUBECONFIG`-style list.** Do we support `OMNIGRAPH_CONFIG` pointing at multiple files (colon-joined), or a single file only? Start single; revisit if demand appears. + +## Implementation β€” breadboard + slices (Shape A) + +Shaped via requirements + a fit check (Shape A β€” global-first layered config + unified `graphs:` entry + three-tier init β€” selected over a project-first minimal option and a Helix-clone). This section breadboards A and slices it. **Bold** = NEW. 
+ +### Places + +| # | Place | What | +|---|---|---| +| P1 | Disk | `~/.omnigraph/{config.yaml, credentials, cache/, state/}` + project `omnigraph.yaml` + `.env.omni` | +| P2 | Config resolution | runs on every command: load layers β†’ merge β†’ resolve `--graph` | +| P3 | Command execution | embedded engine OR remote HTTP client | +| P4 | Remote `omnigraph-server` | existing HTTP surface (`/query`, `/mutate`, `/queries/{name}`) | +| P5 | Scaffold | `login` / `init` / `quickstart` | + +### Affordances + +| # | Place | Affordance | NEW? | Wires | +|---|---|---|---|---| +| U1 | P1 | `~/.omnigraph/config.yaml` (operator edits) | **N** | β†’ N1 | +| U2 | P1 | project `./omnigraph.yaml` | β€” | β†’ N1 | +| U3 | P1 | `~/.omnigraph/credentials` / `.env.omni` dotenv (secrets, git-ignored) | β€” | β†’ N4 | +| U4 | P3 | `omnigraph --graph ` (any command) | β€” | β†’ N14 | +| U5 | P5 | `omnigraph login []` | **N** | β†’ N11 | +| U6 | P5 | `omnigraph init` / `quickstart [--template]` | partly | β†’ N12 / N13 | +| U7 | P2 | `omnigraph config view --resolved --show-origin` | **N** | β†’ N10 | +| N1 | P2 | `load_layered_config()` β€” global (N3) + project (cwd), serde each | **N** | β†’ N2 | +| N2 | P2 | **merge engine** β€” deep-merge settings; replace named-resource entries; replace lists; **retain provenance** and raw field origins | **N⚠️** | β†’ N5, β†’ S_merged | +| N3 | P2 | global-dir resolver β€” `OMNIGRAPH_HOME` else `~/.omnigraph/` | **N** | β†’ N1 | +| N4 | P2 | `load_env_file_into_process` β€” dotenv, real-env-wins (existing) | β€” | β†’ N9 | +| N5 | P2 | `resolve_graph(name, merged)` β†’ typed `Embedded`/`Remote` locator; rejects invalid role/field combinations before execution | **N⚠️** | β†’ N6 | +| N6 | P3 | `GraphConn` β€” `Embedded(engine)` \| `Remote(http)` dispatch | **N⚠️** | β†’ N7, β†’ N8 | +| N7 | P3 | embedded path β€” `Omnigraph::open(uri)` (existing) | β€” | β†’ engine | +| N8 | P3 | **HTTP-client path** β€” POST `/query`/`/mutate`/`/queries/{name}` | 
**N⚠️** | β†’ P4, β†’ N9 | +| N9 | P2 | `resolve_bearer_token(server)` β€” explicit `auth.token` source if set, else **keyed by name**: `OMNIGRAPH_TOKEN_`/`OMNIGRAPH_TOKEN` β†’ keychain `omnigraph:` β†’ `[]` profile; legacy `bearer_token_env`/dotenv (MR-971) | **N⚠️** | β†’ N8 | +| N10 | P2 | `config view` handler β€” merged + per-field origin (needs N2 provenance) | **N** | β†’ U7 | +| N11 | P5 | `login` handler β€” interactive auth β†’ write `config.yaml` + `credentials` (0600) + `.gitignore` | **N⚠️** | β†’ S_global | +| N12 | P5 | `init` handler β€” `scaffold_config_if_missing` + create graph; refuse-if-exists/`--force` purge (MR-975) | partly | β†’ S_project | +| N13 | P5 | `quickstart` handler β€” scaffold + `--template` + seed + `serve start` + agent prompt (MR-973; needs serve MR-970) | **N⚠️** | β†’ S_project | +| N14 | P3 | agent-mode wrapper β€” `--machine`/`OMNIGRAPH_AGENT_MODE`: JSON, structured errors, never-prompt, typed exit codes (MR-981) | **N⚠️** | β†’ N1 | +| S_global | P1 | `~/.omnigraph/config.yaml` + `credentials` | **N** | read by N1/N9 | +| S_project | P1 | `./omnigraph.yaml` + `.env.omni` | β€” | read by N1/N4 | +| S_merged | P2 | in-memory resolved config (per command, with provenance) | **N** | read by N5/N10 | +| S_cache | P1 | `~/.omnigraph/cache/` (remote catalogs) | **N** | read by N8 | + +```mermaid +flowchart TB + subgraph P1["P1: Disk"] + U1["U1: ~/.omnigraph/config.yaml"] + U2["U2: ./omnigraph.yaml"] + U3["U3: credentials dotenv"] + end + subgraph P2["P2: Config resolution"] + N3["N3: global-dir (OMNIGRAPH_HOME)"] + N1["N1: load_layered_config"] + N2["N2: merge engine (+provenance)"] + N4["N4: dotenv loader"] + N5["N5: resolve_graph(--graph)"] + N9["N9: resolve_bearer_token"] + N10["N10: config view"] + end + subgraph P3["P3: Command execution"] + U4["U4: omnigraph --graph"] + N14["N14: agent-mode wrapper"] + N6["N6: GraphConn embedded|remote"] + N7["N7: embedded Omnigraph::open"] + N8["N8: HTTP-client POST"] + end + subgraph 
P5["P5: Scaffold"] + U5["U5: login"]; U6["U6: init/quickstart"] + N11["N11: login handler"]; N12["N12: init"]; N13["N13: quickstart"] + end + P4["P4: remote omnigraph-server"] + U1-->N1; U2-->N1; N3-->N1; N1-->N2-->N5-->N6 + U3-->N4-->N9-->N8 + U4-->N14-->N1 + N6-->N7; N6-->N8-->P4 + N2-->N10-->U7["U7: config view --resolved"] + U5-->N11; U6-->N12; U6-->N13 + classDef ui fill:#ffb6c1,stroke:#d87093,color:#000 + classDef n fill:#d3d3d3,stroke:#808080,color:#000 + class U1,U2,U3,U4,U5,U6,U7 ui + class N1,N2,N3,N4,N5,N6,N7,N8,N9,N10,N11,N12,N13,N14 n +``` + +### Slices (vertical, each demo-able) + +| # | Slice | Parts/affordances | Demo | +|---|---|---|---| +| **V1** | **Global layer + merge + `config view`** | A1–A4 Β· N1,N2,N3,N10 Β· U1,U7,S_global,S_merged | Put config in `~/.omnigraph/`, run `omnigraph config view --resolved --show-origin` from any dir β†’ merged result with per-field origin; existing embedded commands work global-first with no project file | +| **V2** | **Remote graphs + HTTP client + creds** | A5–A7 Β· N5,N6,N8,N9 Β· S_cache | Define a `server:` graph entry; `omnigraph query --graph prod` hits the remote server (`curl`-free); embedded `--graph dev` still local | +| **V3** | **`omnigraph login`** | A8 Β· N11,U5 | `omnigraph login prod` writes `~/.omnigraph/credentials` (0600) + `.gitignore`; V2 remote query now works with no manual env | +| **V4** | **Thin-init hardening + quickstart + templates** | A9 Β· N12,N13,U6 (needs serve MR-970) | `omnigraph quickstart --template person-knows` scaffolds + seeds + serves; `init --force` purges (MR-975) | +| **V5** | **Agent-mode** | A10 Β· N14,U4 (MR-981) | `OMNIGRAPH_AGENT_MODE=1 omnigraph query …` β†’ JSON + structured errors + typed exit codes; never-prompt | + +V1 is the foundation (global-first + merge + view). V2 closes the substantive clientβ†’server gap. V3 is credential ergonomics. V4/V5 ride sibling tickets (MR-970/973/981). 
MR-969 (stored queries) ships independently and is reached by N8's `/queries/{name}` once V2 lands. + +## Rollout + +The slices above are the rollout order: **V1 (global layer + merge) → V2 (remote graphs + HTTP client) → V3 (login) → V4 (quickstart/templates, on MR-970) → V5 (agent-mode, MR-981).** V1–V2 close the substantive gap (global-first config + `curl`-free server access); V3–V5 are ergonomics that ride sibling tickets. Evaluate after V2 against early-adopter and agent-onboarding (MR-973 / MR-974) signal. The spikes (X1 HTTP-client, X2 merge engine, X3 resolver+provenance, X4 login) resolve before their owning slice. + +## Prior art + +- kubeconfig (clusters / users / contexts; `KUBECONFIG`; `kubectl config view`) +- Helix CLI v2 (`helix.toml` local+enterprise instance blocks; `~/.helix/config`; `~/.helix/credentials`) +- AWS CLI (`~/.aws/config` + `~/.aws/credentials` split; named profiles; `credential_process`) +- git (`~/.gitconfig` + `.git/config`; `--show-origin`) +- Cargo (`Cargo.toml` manifest + `~/.cargo/config.toml`) +- Supabase / Prisma (one project manifest; connection via `DATABASE_URL` env) +- 12-factor app (config that varies by deploy lives in the environment) diff --git a/docs/dev/rfc-003-mcp-server-surface.md b/docs/dev/rfc-003-mcp-server-surface.md new file mode 100644 index 0000000..32fbce5 --- /dev/null +++ b/docs/dev/rfc-003-mcp-server-surface.md @@ -0,0 +1,270 @@ +# RFC: MCP Server Surface for `omnigraph-server` — Full Tool Parity, Stored Queries, Modular Auth + +**Status:** Proposed +**Date:** 2026-06-01 +**Tickets:** MR-969 (stored queries + MCP exposure — the surface this completes), MR-956 (federated auth / WorkOS OAuth — the auth substrate this consumes), MR-971 (per-server credential resolver), MR-974 (agent setup surface — the installer that wires this), MR-668 (multi-graph server — shipped, the routing this builds on) +**Builds on:** [omnigraph#128](https://github.com/ModernRelay/omnigraph/pull/128)
(`ragnorc/stored-queries-mcp`) β€” the shipped stored-query registry, `GET /queries`, `POST /queries/{name}`, and the coarse `invoke_query` gate. +**Supersedes:** the MCP-transport portion of [rfc-001-queries-envelope-mcp.md](rfc-001-queries-envelope-mcp.md) (`/mcp/tools` + `/mcp/invoke`). See [Relationship to RFC-001](#relationship-to-rfc-001). +**Target release:** v0.8.x (phased β€” see Rollout) + +## Summary + +Add a first-class **MCP (Model Context Protocol) server surface to `omnigraph-server`**, exposed over **Streamable HTTP**, that projects the server's operations as MCP tools and resources for LLM clients (Claude Code/Desktop/web, Cursor, etc.). Two populations of tools share one projection path: + +1. **Built-in operational tools** β€” parity with the existing `@modernrelay/omnigraph-mcp` stdio package's **13 tools** (`health`, `snapshot`, `read`, `schema_get`, `branches_list`, `commits_list`, `commits_get`, `change`, `ingest`, `branches_create`, `branches_delete`, `branches_merge`, `schema_apply`) and its **2 resources** (`omnigraph://schema`, `omnigraph://branches`), plus a new server-scoped `graphs_list` tool and an `omnigraph://graphs` resource (multi-graph mode). +2. **Dynamic stored-query tools** β€” one MCP tool per `mcp.expose: true` entry in the `queries:` registry (MR-969 / #128), with parameters typed from the `.gq` declaration via the shipped `query_catalog_entry` / `param_descriptor` projection. + +Every tool is **authorized by the server's existing Cedar policy engine**. The MCP layer never implements its own authentication: it consumes an **already-resolved `ResolvedActor`** from the server's bearer middleware (`require_bearer_auth` today; the `TokenVerifier` seam when MR-956 lands), so the **same MCP endpoint serves on-prem (static or customer-OIDC tokens) and our cloud (WorkOS OAuth) by configuration only**. Cloud OAuth is an additive layer (RFC 9728 protected-resource metadata) that slots in with zero MCP changes. 
+ +The end-state collapses two diverging tool implementations into one: the in-server MCP is the canonical, Cedar-gated, remotely-reachable surface; the stdio package becomes a thin stdio↔HTTP proxy (local on-ramp) over it. + +> **Key caveat, stated up front (see Β§5.9 below):** the headline "a token scoped via Cedar to a *specific set* of stored queries" requires **per-query `invoke_query` scope**, which is *designed* (rfc-001) but **not yet implemented** β€” the shipped action is coarse (any stored query on the graph, or none). Per-actor Cedar curation works today for *built-in vs ad-hoc vs admin* tools and for *stored-vs-ad-hoc*; sub-selecting individual stored queries per actor is gated on a prerequisite (PR 0b). Until then, stored-query curation is graph-level (registry membership + `mcp.expose`). + +## Relationship to RFC-001 + +[rfc-001-queries-envelope-mcp.md](rfc-001-queries-envelope-mcp.md) (MR-656 / MR-976 / MR-969) is the parent design for stored queries + the response envelope + MCP. This RFC is the **detailed MCP-transport design** that #128 left for a follow-up, and it **revises rfc-001 in three places where the shipped code or the MCP wire protocol diverged from rfc-001's sketch**: + +1. **Transport shape.** rfc-001 sketched `GET /mcp/tools` + `POST /mcp/invoke` (a bespoke REST pair). **That is not the MCP wire protocol β€” real MCP clients cannot connect to it.** This RFC implements actual MCP JSON-RPC over Streamable HTTP and reuses `query_catalog_entry` as a *projection source*, not a parallel surface. (rfc-001's own Open Question already leaned toward Streamable HTTP.) +2. **Exposure config.** rfc-001 specified inline `.gq` pragmas (`@mcp(expose=…)`, default `expose=false`). **#128 shipped a different mechanism:** YAML `queries..mcp.expose` in `omnigraph.yaml`, **default `true`** (declaring a query in the manifest *is* the opt-in). This RFC builds on the shipped YAML form; the `.gq`-pragma design in rfc-001 is superseded for exposure. +3. 
**Schema introspection.** rfc-001 lists "Schema introspection through MCP" as a **non-goal** ("agents see types through declared return shapes"). This RFC **revises that**: the operational-parity tools include `schema_get` and `omnigraph://schema` β€” *because the shipped stdio package already exposes both*. The non-goal is achieved by *policy*, not omission: `schema_get`/`omnigraph://schema` are Cedar-gated by `Read`, and the recommended locked-down agent policy denies `Read`, so a curated agent still never sees the schema. (rfc-001's intent is preserved; the mechanism moves from "don't build it" to "build it, gate it.") + +Everything else in rfc-001 (two-paths-one-engine, per-query `invoke_query` *as the intended scope*, the response envelope, multi-graph per-graph endpoints) this RFC consumes unchanged. + +> **Numbering note:** the `TokenVerifier`/WorkOS auth design is referred to in code (`crates/omnigraph-server/src/identity.rs`) as "RFC 0001," which is a *different* document from this repo's `docs/dev/rfc-001-queries-envelope-mcp.md`. To avoid the collision this RFC cites the auth substrate as **MR-956** throughout, never "RFC 0001." + +## Reconciliation with shipped code (verified against `ragnorc/stored-queries-mcp` HEAD) + +Verified against `crates/omnigraph-server/src/{lib.rs,api.rs}` and `crates/omnigraph-policy/src/lib.rs` at the current branch head (not the #128 PR body, and not `api.rs` alone): + +- βœ… `GET /queries` returns the `mcp.expose == true` subset as `QueriesCatalogOutput { queries: [QueryCatalogEntry] }`, each with typed `ParamDescriptor`s, `tool_name`, `description`, `instruction`, and a `mutation` flag. **MCP-ready projection, but exposed as bespoke REST/JSON β€” not the MCP wire protocol.** +- βœ… `POST /queries/{name}` route exists (`server_invoke_query`, `lib.rs`). +- βœ… `query_catalog_entry()` / `param_descriptor()` with an exhaustive `ScalarType β†’ ParamKind` map (a new scalar is a compile error). 
+- βœ… `InvokeQuery` Cedar action defined in `omnigraph-policy`. +- βœ… **`InvokeQuery` IS enforced** at `POST /queries/{name}`: `server_invoke_query` calls `authorize(PolicyAction::InvokeQuery)` and **masks a denial to a 404 identical to "unknown query"** so the catalog isn't probeable (the denial-masking the previous draft of this RFC reported as missing is shipped β€” it lives in `lib.rs`, not `api.rs`). The stored-mutation path is already double-gated: `InvokeQuery` outer, then `Change` inside `run_mutate`. +- βœ… **Reuse path exists:** `run_query` / `run_mutate` are already decoupled from their HTTP request bodies and take registry-supplied `(source, name, params, branch/snapshot)`. MCP `tools/call` for both stored and ad-hoc tools delegates to these β€” no new business logic. +- ❌ **Per-query (`invoke_query[name]`) scope is NOT implemented.** `PolicyRequest` carries only `{action, branch, target_branch}` β€” **no query-name dimension** β€” and the action is documented coarse ("permits *any* stored query on the graph"). rfc-001 *designed* per-name scope; it is unbuilt. This RFC's per-query Cedar filtering (Β§5.4) and recommended agent policy (Β§5.9) depend on it β†’ tracked as **PR 0b**. +- ❌ No MCP protocol surface (`initialize`/`tools/list`/`tools/call`, JSON-RPC, transport). +- ❌ No `TokenVerifier` trait yet β€” `require_bearer_auth` resolves a `ResolvedActor` inline (static-hash). The trait/`OidcJwtVerifier` are MR-956 (draft). The MCP layer's only requirement β€” *consume `ResolvedActor`* β€” is satisfiable today. + +Stack (verified `Cargo.toml`): Axum + utoipa (OpenAPI) + `omnigraph-policy` (Cedar) + `futures` + `tokio`. **No MCP crate present.** `edition = "2024"`. 
+ +## Motivation + +- **One curated, safe, remotely-reachable tool surface.** MR-969's thesis: hand an LLM a token Cedar-scoped to a set of tools and it sees exactly those typed tools β€” cannot construct ad-hoc queries it isn't permitted, cannot read the schema it isn't permitted, cannot reach other graphs. Today the only MCP is the stdio package: local-only, full surface, ungated. +- **Parity, so the in-server MCP can be the single implementation.** Operators/agents already depend on the operational tools. Supporting them server-side behind one Cedar gate lets the stdio package degrade to a proxy and removes two diverging tool sets. +- **On-prem and cloud from one endpoint.** A managed cloud (WorkOS OAuth) and an on-prem/air-gapped deploy (static or customer-OIDC tokens) must serve the same MCP without forks or MCP-specific auth. +- **Foundation for the agent on-ramp (MR-974).** `omnigraph mcp install --agent ` needs a decided transport + a stable endpoint. + +## Goals + +- Project built-in tools + stored queries as MCP tools through **one** registry abstraction. +- `tools/list` and the callable set are **identical for argument-independent authorization**, both driven by Cedar (see Β§5.4 for the branch-scoped caveat). +- The MCP layer is **auth-method-agnostic**: it consumes `ResolvedActor`, never a raw token, never branches on how auth happened. +- The same endpoint works on-prem (static/OIDC) and cloud (WorkOS OAuth), switched by config; cloud OAuth is additive (RFC 9728). +- No new business logic: MCP tools delegate to the same `run_query`/`run_mutate`/branch/schema functions the HTTP routes call. +- Behaviour-neutral when unused: no MCP traffic = no change. + +## Non-Goals + +- **Building/hosting an OAuth authorization server.** The server is a Resource Server; WorkOS AuthKit+Connect is the AS (MR-956). The MCP endpoint validates tokens, never issues them, never holds client secrets. +- **OAuth/WorkOS implementation itself** β€” MR-956's work. 
This RFC leaves a clean RFC-9728 hook and consumes `ResolvedActor`. +- **MCP prompts, elicitation, `tools/list_changed`, resource subscriptions, server-initiated messages.** None needed β†’ enables a stateless POST-only transport (Β§5.6). +- **stdio transport inside the server.** stdio stays in the TS package (now a proxy). +- **Cross-graph tool listing.** Per-graph catalogs only (MR-969 + RFC-002 non-goal). +- **Hot reload of the query registry.** Restart-only (MR-969). + +## Background + +`omnigraph-server` (Axum) already implements every operation this RFC exposes as an authenticated HTTP route; each authorizes via a `PolicyAction` against the Cedar policy for a server-resolved actor and calls into the engine. The existing stdio MCP package is a *client* of these routes (it owns no business logic). MR-956 will introduce a `TokenVerifier` trait (`StaticHashTokenVerifier` today inline, `OidcJwtVerifier` for OIDC/WorkOS) producing the `ResolvedActor { actor_id, tenant_id: Option, scopes: Vec, source }` that already exists in `identity.rs` and is consumed by Cedar β€” token *validation* is offline (cached JWKS), so on-prem/air-gapped has no request-path dependency on the cloud. + +## Design + +### 5.1 One tool model: a `McpTool` trait, two populators + +Both built-in and stored-query tools implement one trait so `tools/list` / `tools/call` never special-case: + +```rust +trait McpTool: Send + Sync { + fn name(&self) -> &str; // MCP tool id (stable) + fn title(&self) -> Option<&str>; + fn description(&self) -> &str; + fn input_schema(&self) -> serde_json::Value; // JSON Schema (draft 2020-12) + fn annotations(&self) -> ToolAnnotations; // readOnlyHint / destructiveHint / idempotentHint + /// The Cedar request(s) this call requires, given parsed args. Used BOTH at + /// list-time (dry-run filter, default args) and call-time (enforce, real args). 
+ fn authorization(&self, args: &ToolArgs) -> Vec<PolicyRequest>; + async fn call(&self, ctx: &GraphCtx, args: ToolArgs) -> Result; +} +``` + +- **Built-ins**: ~14 static impls, each delegating to the *same* function its HTTP route calls (`run_query`, `run_mutate`, branch ops, `apply_schema_as`, …). `input_schema` authored once (or derived from each route's existing `utoipa`/`ToSchema` DTO). +- **Stored queries**: generated `McpTool` instances, one per `mcp.expose` entry; `input_schema` from `param_descriptor` (§5.3); `authorization` → `InvokeQuery` (coarse today; `InvokeQuery{name}` after PR 0b) then the inner `Read`/`Change`. + +`ToolRegistry` for a graph = the static built-ins + the dynamic stored-query tools resolved from that graph's `GraphHandle` registry. + +### 5.2 Tool catalog (parity) and Cedar mapping + +Each built-in **reuses the exact `PolicyAction` its HTTP route already enforces** — verified against the handlers in `lib.rs`, not invented: + +| MCP tool | Scope | Read/Mutate | Cedar action (verified from route) | +|---|---|---|---| +| `health` | server | read | none (liveness/version) | +| `graphs_list` *(new)* | server | read | `GraphList` | +| `snapshot` | graph | read | `Read` | +| `schema_get` | graph | read | `Read` | +| `branches_list` | graph | read | `Read` | +| `commits_list`, `commits_get` | graph | read | `Read` | +| `query` (ad-hoc `.gq`) / `read` *(deprecated alias)* | graph | read | `Read` | +| `mutate` (ad-hoc `.gq`) / `change` *(deprecated alias)* | graph | mutate | `Change` | +| `ingest` (NDJSON) | graph | mutate | `Change` (+ `BranchCreate` when forking a new branch) | +| `branches_create` | graph | mutate | `BranchCreate` | +| `branches_delete` | graph | mutate | `BranchDelete` | +| `branches_merge` | graph | mutate | `BranchMerge` | +| `schema_apply` (`allow_data_loss`) | graph | mutate | `SchemaApply` | +| **stored query** (`find_user`, …) | graph | inferred | `InvokeQuery` (coarse; `InvokeQuery{name}` after PR 0b) + inner `Read`/`Change` | + +There is **no
`Ingest` and no separate `snapshot`/`Export` action** — `ingest` enforces `Change`, `snapshot` enforces `Read`. (`Export` exists but maps to the `/export` route, which this RFC does not expose as a tool.) + +**Tool id parity vs. canonicalization.** The shipped stdio package uses tool ids **`read`/`change`** (and calls the deprecated `/read`,`/change` routes). The server HTTP surface canonicalized to `/query`,`/mutate` with `/read`,`/change` deprecated (MR-656). To keep existing package clients working *and* align with the server, the MCP exposes **`query`/`mutate` as canonical with `read`/`change` retained as deprecated-but-live aliases** (both dispatch to the same handler). Open Q7 asks whether to drop the aliases later. + +Resources (§5.5): `omnigraph://schema`, `omnigraph://branches` (parity), plus `omnigraph://graphs` *(new)* — each gated by the same action as its list/get route (`Read`, `Read`, `GraphList`). + +### 5.3 `ParamDescriptor → JSON Schema` (stored-query tools) + +| `ParamKind` | JSON Schema | Notes | +|---|---|---| +| String | `{"type":"string"}` | | +| Bool | `{"type":"boolean"}` | | +| Int (i32/u32) | `{"type":"integer"}` | | +| BigInt (i64/u64) | `{"type":"string","pattern":"^-?\\d+$"}` | JSON numbers lose precision >2⁵³ → string (matches the shipped `api.rs` rationale). (Open Q1) | +| Float (f32/f64) | `{"type":"number"}` | | +| Date | `{"type":"string","format":"date"}` | | +| DateTime | `{"type":"string","format":"date-time"}` | | +| Blob | `{"type":"string","contentEncoding":"base64"}` | | +| Vector | `{"type":"array","items":{"type":"number"},"minItems":dim,"maxItems":dim}` | uses `vector_dim` | +| List | `{"type":"array","items":<scalar>}` | scalar items only (grammar guarantees) | + +`nullable == false` → param is in `required`. Annotations: `mutation` → `{readOnlyHint:false, destructiveHint:true}`; else `{readOnlyHint:true}`. `description` → tool description; `instruction` → appended to description (or `_meta`).
(The shipped `check()` already warns when an `mcp.expose` query declares a `Vector` param an LLM can't supply.) + +For built-in tools the schema is hand-authored from the route DTO; e.g. `query` β†’ `{source: string, branch?: string, params?: object}`; `schema_apply` β†’ `{schema: string, allow_data_loss?: boolean}`; `ingest` β†’ `{ndjson: string, mode?: "merge"|"append"|"overwrite", branch?: string}`. + +### 5.4 `tools/list` (Cedar-filtered) and `tools/call` (dispatch + masking) + +- **`tools/list`**: build the `ToolRegistry`; for each tool evaluate `authorization(default_args)` against the actor's Cedar policy; **emit only tools that authorize**. Authz decisions memoized per request. Stored-query tools additionally require `mcp.expose: true`. + - **Exactness caveat (R7 is conditional):** the listed set equals the callable set **only for tools whose authorization is argument-independent** (`health`, `graphs_list`, `snapshot`, `schema_get`, `branches_list`, `commits_*`, ad-hoc `query`/`mutate`, and stored queries under the *coarse* action). For **branch-scoped tools** (`branches_create`/`merge` with `target_branch_scope`, and any branch-scoped `Read`/`Change` rule), list-time uses `default_args` (e.g. branch `main`) and cannot know the real target, so the listed set is a *best-effort approximation* of callability β€” a call may still be denied (or, rarely, a hidden tool would have been allowed). `tools/call` is always the authoritative gate. The contract is: **list never shows a tool the actor can't ever call; for branch-scoped tools it may show one the actor can call only on some branches.** +- **`tools/call`**: resolve `name` β†’ `McpTool` (masked-404 if unknown *or* `mcp.expose:false`); parse+validate args against `input_schema`; enforce `authorization(args)` (mutations stay double-gated: `InvokeQuery` then `Change`); on success `call`. 
**Denial masking** lives in one place (the dispatcher): an authz denial is returned identically to "unknown tool" (Β§5.10), reusing the same deny≑missing principle already shipped at `POST /queries/{name}`. + +### 5.5 Resources + +Advertise `resources` capability (`subscribe:false, listChanged:false`). `resources/list` β†’ the URIs the actor may read; `resources/read` β†’ schema `.pg` text / branches JSON / (multi-graph) graphs JSON, each gated by the corresponding action (`Read`, `Read`, `GraphList`). A locked-down agent denied `Read` simply never sees `omnigraph://schema` or `omnigraph://branches` β€” this is how rfc-001's "agents don't introspect schema" intent is met *by policy* (Β§Relationship-to-RFC-001). + +### 5.6 Transport: Streamable HTTP, stateless, POST-only + +- **Streamable HTTP** (MCP's current standard; we're already an HTTP server). One endpoint per scope (Β§5.7). +- Because the server emits **no** server-initiated messages, implement the **minimal conformant** shape: client `POST`s JSON-RPC, server replies `application/json`. **No SSE channel, no `Mcp-Session-Id`, stateless** β€” each request authenticated independently via the bearer middleware. Honour the `MCP-Protocol-Version` header. SSE/sessions can be added later if subscriptions land. +- **JSON-RPC methods:** `initialize` (advertise `{tools:{listChanged:false}, resources:{listChanged:false, subscribe:false}}` + serverInfo/version), `notifications/initialized` (no-op ack), `ping`, `tools/list`, `tools/call`, `resources/list`, `resources/read`. `prompts/list` returns empty if probed. +- **Library decision (Open Q2):** spike `rmcp` (official Rust MCP SDK) for conformance + Streamable-HTTP/Axum on edition 2024; **fall back to a hand-rolled ~150 LOC JSON-RPC-over-POST** (only the methods above) on friction. Given the tiny surface, hand-roll is an acceptable default. 
+ +### 5.7 Endpoint routing (server- vs graph-scoped) + +- **Single-graph mode:** `POST /mcp` β€” graph tools + server tools (`health`, `graphs_list`). +- **Multi-graph mode (MR-668):** `POST /graphs/{graph_id}/mcp` β€” graph-scoped tools for that graph; plus a server-level `POST /mcp` exposing only server-scoped tools (`health`, `graphs_list`). A per-graph endpoint never lists another graph's tools (isolation, tested). Mirrors the shipped `/graphs/{graph_id}/…` cluster routing. (Open Q5: confirm naming + whether server tools also appear on the per-graph endpoint.) + +### 5.8 Modular / decoupled auth (the cross-cutting requirement) + +**Invariant (load-bearing, satisfiable today):** the MCP handler receives an **already-resolved `ResolvedActor`** and **branches on nothing** about how the token was verified. No token parsing, no method check, no OAuth inside the MCP module. Today that actor comes from `require_bearer_auth`; when MR-956 lands it comes from a `TokenVerifier` β€” the MCP code is identical either way. + +``` +request β†’ [auth middleware: ResolvedActor] β†’ [MCP route] β†’ Cedar β†’ McpTool +``` + +**Server side β€” auth is config, not code:** + +| Deployment | Verifier | MCP change | +|---|---|---| +| On-prem, static bearer | `require_bearer_auth` / `StaticHashTokenVerifier` | none | +| On-prem, customer IdP | `OidcJwtVerifier` β†’ customer issuer (MR-956) | none | +| Our cloud | `OidcJwtVerifier` β†’ WorkOS, `tenant_id = Some(org_id)` (MR-956) | none | + +Token validation is offline (cached JWKS) β€” on-prem/air-gapped keeps working with no request-path cloud dependency. The MCP endpoint never terminates OAuth and never holds a client secret (Resource Server only). + +**Cloud client negotiation β€” additive, no MCP changes:** when MR-956 lands, the server publishes RFC 9728 `/.well-known/oauth-protected-resource` and returns `WWW-Authenticate: Bearer ..., resource_metadata="..."` on 401. 
A compliant MCP client (Claude) then auto-negotiates: static bearer to an on-prem endpoint; on a cloud 401 it discovers the WorkOS AS and runs OAuth/PKCE itself β€” **same endpoint URL, zero client-side branching.** This RFC only requires that MCP routes flow through the standard 401 path so that hook can be added later without touching MCP. + +**Multi-user identity pass-through (cloud):** the *caller's* token (a WorkOS JWT, audience-bound per-tenant) must reach the server so Cedar enforces per-user/per-tenant policy β€” never a shared service token. The MCP endpoint validates it offline and maps `org_id β†’ tenant_id`. This is why the **remote path is the in-server HTTP MCP that Claude connects to directly** (its token flows through), not a stdio bridge impersonating a user. + +**Client-side credential acquisition (CLI/SDK/proxy) β€” pluggable `CredentialSource`** (RFC-002 Β§5, MR-971), keyed by server name, so OAuth is a future *sibling key*, not a re-key: + +```yaml +servers: + onprem: { endpoint: https://og.internal:8080, auth: { token: { env: OG_TOKEN } } } + edge: { endpoint: https://og-edge, auth: { token: { command: [vault, read, -field=token, secret/og] } } } + cloud: { endpoint: https://api.omnigraph.cloud, auth: { oauth: { issuer: workos } } } # future sibling +``` + +Implicit chain when `auth:` omitted: `OMNIGRAPH_TOKEN_` β†’ keychain `omnigraph:` β†’ `[]` in `~/.omnigraph/credentials`; legacy `bearer_token_env` honoured. Secrets never inlined. + +### 5.9 Safety model β€” Cedar is the gate, default-deny is the floor + +With ad-hoc `query`/`mutate`/`schema_apply` present as tools, the **only** thing protecting an untrusted agent is the Cedar policy. Therefore: + +- **Default-deny when tokens are configured** (MR-723, shipped) is the floor β€” an actor with no grants sees an empty tool list. 
+- **What works today (coarse action):** a policy can hide all ad-hoc tools and admin tools per-actor (`deny Read, Change, SchemaApply, Branch*`) while allowing stored queries (`allow InvokeQuery`). That already reproduces "can't run ad-hoc, can't read schema, can only call stored queries" β€” the agent sees *every* exposed stored query plus nothing else. +- **What needs PR 0b (per-query scope):** selecting *which* stored queries an actor may call (`allow InvokeQuery [find_user, list_orders]`, deny the rest). The shipped `invoke_query` is coarse (all stored queries or none). Until PR 0b adds a query-name dimension to `PolicyRequest` + the Cedar schema (rfc-001's intended design), per-actor sub-selection of stored queries is **not expressible**; curation is graph-level (which `.gq` files are registered + `mcp.expose`). +- `schema_apply`, `branches_delete`, ad-hoc `mutate` require an explicit admin-tier grant; never in a default agent policy. +- (Open Q3) Optional `mcp.allow_adhoc` server switch defaulting **off** for the ad-hoc `query`/`mutate` tools β€” defence-in-depth independent of Cedar, and independent of PR 0b. + +### 5.10 Result shaping and error mapping + +- **Success:** `tools/call` returns `content: [{type:"text", text:}]` where `` is the route's existing output envelope (read rows / mutation summary, i.e. `ReadOutput` / `ChangeOutput`). (Open Q4: also emit `structuredContent` + `outputSchema` β€” defer; text-JSON for v1.) +- **Tool execution error** (bad params after schema validation, engine error): result with `isError:true` + a text content block. +- **Authorization denial / unknown tool / `mcp.expose:false`:** a single JSON-RPC error (`-32602`, message `"unknown tool"`) β€” identical for all three so policy isn't probeable (same principle as the shipped `POST /queries/{name}` 404 masking). 
+- **Auth failure** (bad/absent bearer): HTTP 401 from the middleware *before* MCP β€” carries `WWW-Authenticate` (the RFC 9728 hook), never masked as a tool error. (This is exactly the path the shipped `authorize`/`authorize_request` split preserves: operational failures keep their status; only *denials* are masked.) + +## Relationship to the `@modernrelay/omnigraph-mcp` stdio package + +Verified surface of the package (`omnigraph-ts`, pkg version `0.3.0`, `@modelcontextprotocol/sdk@^1.29.0`, **stdio only**): **13 tools** (`health`, `snapshot`, `read`, `schema_get`, `branches_list`, `commits_list`, `commits_get`, `change`, `ingest`, `branches_create`, `branches_delete`, `branches_merge`, `schema_apply`) and **2 resources** (`omnigraph://schema`, `omnigraph://branches`). It is a thin client over the SDK β†’ HTTP routes and **forwards the caller's bearer verbatim** (no inspection). + +Once parity lands, **collapse to one implementation**: the in-server MCP is canonical (Cedar-gated, remote-capable, the path that becomes a Claude-web connector via MR-956). The stdio package degrades to a **thin stdio↔HTTP proxy** forwarding JSON-RPC (and the incoming `Authorization`) to `/mcp` β€” staying the local on-ramp for Claude Code/Desktop while sharing one tool set, one Cedar gate. Transition: keep the current independent stdio package on its `0.3.x`/`0.6.x` line; ship proxy mode in a later TS minor once the server endpoint is GA. (Note: the package is currently several minors behind the server β€” its vendored `spec/openapi.json` predates the stored-query routes β€” so it needs the standard re-sync regardless of MCP work.) + +## Testing + +- **Protocol conformance:** `initialize` handshake + advertised capabilities; `tools/list` shape; `tools/call` happy path; JSON-RPC error envelopes (`-32601` unknown method, `-32602` invalid params / unknown tool); `resources/list` + `resources/read`. 
+- **Cedar filtering (coarse, today):** an actor with `allow InvokeQuery` + `deny Read/Change` sees *all* exposed stored queries but **not** `query`/`mutate`/`schema_get`; `tools/call query` returns masked "unknown tool"; an admin sees the full catalog. +- **Cedar filtering (per-query, gated on PR 0b):** actor scoped to `InvokeQuery [find_user]` sees *only* `find_user`; `tools/call list_orders` masks. **This test ships with PR 0b**, not PR 1 β€” it cannot pass against the coarse action. +- **Parity per built-in:** each tool round-trips against the same expectations as its HTTP route (reuse route tests); `read`/`change` aliases dispatch identically to `query`/`mutate`. +- **Double-gating:** a stored mutation requires both `InvokeQuery` and `Change`; `schema_apply` requires `SchemaApply`. +- **`mcp.expose:false`:** absent from `GET /queries` and MCP `tools/list`; still service-callable by name through `POST /queries/{name}` when the actor has `invoke_query`, but not MCP-callable. +- **Schema generation:** table-driven over every `ParamKind` incl. nullable / list / vector(dim). +- **Branch-scoped list approximation:** assert the documented R7 caveat β€” a branch-scoped policy lists `branches_create`, and `tools/call` is the authoritative gate (a denied target still 403s/masks). +- **Multi-graph isolation:** `/graphs/a/mcp` never lists graph `b`'s tools; server `/mcp` exposes only server tools. +- **Auth decoupling:** the MCP suite is green under the current `require_bearer_auth` and under a mock OIDC `ResolvedActor` source β€” proving verifier-agnosticism. A 401 carries `WWW-Authenticate`. +- **OpenAPI:** the JSON-RPC endpoint is not REST β€” document only the envelope in utoipa (or exclude); keep `openapi.json` drift test green (`OMNIGRAPH_UPDATE_OPENAPI=1` to regenerate on intentional change). +- **Cross-repo smoke (optional):** point `@modelcontextprotocol/sdk` (TS) at the HTTP endpoint in an `omnigraph-ts` integration test. 
+ +## Rollout β€” phased by risk + +- **PR 0a β€” extract the reusable invoke path (small).** The coarse `invoke_query` gate + 404 denial-masking are **already shipped** in `server_invoke_query`. Extract the read/mutate dispatch into `invoke_stored_query(handle, name, params, branch/snapshot, actor)` so MCP `tools/call` and the HTTP route share one path. No behaviour change. *(Replaces the previous draft's "PR 0 β€” wire the gate", which was already done.)* +- **PR 0b β€” per-query `invoke_query` scope (the safety prerequisite).** Add a query-name dimension to `PolicyRequest` + the Cedar schema (rfc-001's intended design), wire it at `POST /queries/{name}` and in the stored-query `McpTool::authorization`. Independently useful (the `allow InvokeQuery [find_user]` policy). **Gates the per-query Cedar-filtering test and Β§5.9's recommended agent policy.** +- **PR 1 β€” MCP transport + read-only parity + stored-query reads.** Endpoint(s), `initialize`/`tools/list`/`tools/call`/`resources/*`, the `McpTool` registry, Cedar-filtered listing, the read-only built-ins (`health`, `graphs_list`, `snapshot`, `read`/`query`, `schema_get`, `branches_list`, `commits_*`) + resources + stored-query *reads*. All auth-agnostic. +- **PR 2 β€” mutating parity + stored-query mutations.** `change`/`mutate`, `ingest`, `branches_create/delete/merge`, `schema_apply`, stored-query mutations + the `mcp.allow_adhoc` switch. +- **PR 3 β€” docs + agent on-ramp hook.** `docs/user/server.md` MCP section (incl. the recommended agent policy + the coarse-vs-per-query caveat), `openapi.json` sync, the `omnigraph mcp install` config target (MR-974), and the downstream `omnigraph-ts` re-sync/proxy follow-up. +- **Later (separate, MR-956):** RFC 9728 protected-resource metadata + WorkOS β€” slots in with zero MCP changes. +- **Later (TS minor):** stdio package β†’ proxy mode. + +## Migration / backwards compatibility + +- **Additive.** No `queries:` and no MCP traffic β†’ today's behaviour unchanged. 
New endpoints are new routes. +- **Cedar default-deny** (when tokens configured) means MCP exposes nothing until an actor is granted β€” safe by default. +- The stdio package keeps working unchanged; proxy mode is opt-in later. +- `openapi.json` only gains the documented MCP envelope; existing REST routes untouched. + +## Open Questions + +1. **BigInt/u64 as JSON string** (recommended, precision-safe) vs number. +2. **`rmcp` vs hand-rolled** JSON-RPC (spike `rmcp` on edition 2024; default to hand-roll on friction). +3. **Default-off `mcp.allow_adhoc`** for ad-hoc `query`/`mutate` (recommended) vs always-on + Cedar-only. +4. **`structuredContent` + `outputSchema`** now vs text-JSON v1 (recommend v1 text-JSON). +5. **Endpoint paths:** `/mcp` + `/graphs/{id}/mcp` β€” confirm naming and whether server-scoped tools also appear on the per-graph endpoint. +6. **Stateless POST-only** confirmed (no near-term server-initiated messages) β€” revisit only if subscriptions land. +7. **Legacy alias tools** (`read`/`change`): keep for client compat (the shipped package uses them), or drop and rely on `query`/`mutate`? +8. **PR 0b shape:** per-query scope as a Cedar *resource* (`StoredQuery::"find_user"`) vs a `query_name` *context attribute* + policy condition β€” affects how `allow InvokeQuery [list]` is authored. diff --git a/docs/dev/testing.md b/docs/dev/testing.md index e6989ba..8974a9f 100644 --- a/docs/dev/testing.md +++ b/docs/dev/testing.md @@ -20,7 +20,7 @@ The engine's `tests/` is the principal coverage surface; most graph-shaped behav | `end_to_end.rs` | Full init β†’ load β†’ query/mutate flow | | `branching.rs` | Branch create / list / delete, lazy fork | | `merge_truth_table.rs` | Merge-pair truth table (MR-786): all 9Γ—9 `(left_op, right_op)` cells from `{noop, addNode, removeNode, addEdge, removeEdge, setProperty, dropProperty, addLabel, removeLabel}`. 
Adding a new op to `OpVariant` forces a compile error in `build_case` until the new row + column are dispositioned. 36 executable cells run through real `branch_merge` with a structured oracle (`MergeOutcome` / `MergeConflictKind` + graph-state assert); 45 cells involving `dropProperty`/`addLabel`/`removeLabel` are recorded as `Unsupported` until the mutation grammar grows. | -| `runs.rs` | Direct-publish writes: cancellation, concurrent-writer CAS, multi-statement atomicity, MR-794 staged-write rewire (Dβ‚‚ rejection, insert+update coalesce, multi-append coalesce, partial-failure recovery, load RI/cardinality recovery) | +| `writes.rs` | Direct-publish writes: cancellation, concurrent-writer CAS, multi-statement atomicity, MR-794 staged-write rewire (Dβ‚‚ rejection, insert+update coalesce, multi-append coalesce, partial-failure recovery, load RI/cardinality recovery) | | `staged_writes.rs` | TableStore staged-write primitives (`stage_append`, `stage_merge_insert`, `commit_staged`, `scan_with_staged`, `count_rows_with_staged`) β€” primitive-level only; engine code uses the in-memory `MutationStaging` accumulator instead | | `lifecycle.rs` | Graph lifecycle, schema state | | `point_in_time.rs` | Snapshots, time travel (`snapshot_at_version`, `entity_at`) | @@ -34,10 +34,10 @@ The engine's `tests/` is the principal coverage surface; most graph-shaped behav | `s3_storage.rs` | S3-backed graph (skipped unless `OMNIGRAPH_S3_TEST_BUCKET` is set) | | `lance_version_columns.rs` | Per-row `_row_last_updated_at_version` behavior | | `validators.rs` | Schema constraint enforcement (enum, range, unique, cardinality) across JSONL, insert, update paths | -| `maintenance.rs` | `optimize` (compaction) + `cleanup` (version GC): empty/idempotent/no-op edges, policy validation, head preservation | -| `failpoints.rs` | Failure-injection coverage (gated on `failpoints` feature). 
Includes the four per-writer Phase B β†’ recovery integration tests (`recovery_rolls_forward_after_finalize_publisher_failure`, `schema_apply_phase_b_failure_recovered_on_next_open`, `branch_merge_phase_b_failure_recovered_on_next_open`, `ensure_indices_phase_b_failure_recovered_on_next_open`). | +| `maintenance.rs` | `optimize` (compaction) + `cleanup` (version GC): empty/idempotent/no-op edges, policy validation, head preservation; `optimize` publishes the compacted version so the manifest tracks the Lance HEAD and a subsequent schema apply succeeds (`optimize_publishes_compaction_to_manifest_so_schema_apply_succeeds`), and refuses to run while a `__recovery` sidecar is pending so optimize only ever operates on a recovered graph (`optimize_defers_when_recovery_sidecar_is_pending`) | +| `failpoints.rs` | Failure-injection coverage (gated on `failpoints` feature). Includes the five per-writer Phase B β†’ recovery integration tests (`recovery_rolls_forward_after_finalize_publisher_failure`, `schema_apply_phase_b_failure_recovered_on_next_open`, `branch_merge_phase_b_failure_recovered_on_next_open`, `ensure_indices_phase_b_failure_recovered_on_next_open`, `optimize_phase_b_failure_recovered_on_next_open`). | | `recovery.rs` | Open-time recovery sweep β€” sidecar I/O, classifier dispatch (NoMovement / RolledPastExpected / UnexpectedAtP1 / UnexpectedMultistep / InvariantViolation), all-or-nothing decision, roll-forward via `ManifestBatchPublisher::publish`, roll-back via `Dataset::restore`, audit row in `_graph_commit_recoveries.lance`, `OpenMode::ReadOnly` skip path | -| `composite_flow.rs` | Compositional/narrative end-to-end stories β€” multi-step flows that compose mechanics covered by other test files. Catches integration regressions where individual operations all pass their unit tests but their composition breaks (sequential merges, post-merge main writes, time-travel through merge DAG, reopen consistency over multi-merge histories). 
| +| `composite_flow.rs` | Compositional/narrative end-to-end stories — multi-step flows that compose mechanics covered by other test files. Catches integration regressions where individual operations all pass their unit tests but their composition breaks (sequential merges, post-merge main writes, time-travel through merge DAG, reopen consistency over multi-merge histories, post-optimize and post-cleanup strict writes). | ## Fixtures @@ -89,7 +89,7 @@ If introducing coverage tooling is in scope for your task, the natural first ste How to check: -1. **Map the change to an area** — use the engine integration-test table above (`branching.rs`, `runs.rs`, `search.rs`, etc.). The filename usually names the area. +1. **Map the change to an area** — use the engine integration-test table above (`branching.rs`, `writes.rs`, `search.rs`, etc.). The filename usually names the area. 2. **Open the file and skim every test fn name.** Test fn names are the index — read them all, not just the first few. 3. **Grep for the symbol or path you're changing.** `rg <symbol>` or `rg <path>` across all `tests/` directories surfaces existing coverage you might miss. 4. **Decide one of three outcomes**, in this order of preference: diff --git a/docs/dev/runs.md b/docs/dev/writes.md similarity index 87% rename from docs/dev/runs.md rename to docs/dev/writes.md index 816f2ac..d2c7c7e 100644 --- a/docs/dev/runs.md +++ b/docs/dev/writes.md @@ -1,7 +1,10 @@ -# Runs — REMOVED (MR-771) +# Direct-Publish Write Path -The Run state machine and `__run__` staging branches were removed in -MR-771. `mutate_as` and `load` now write **directly to the target table** +> History: the Run state machine and `__run__` staging branches were +> removed in MR-771 (shipped v0.4.0). Writes now go directly to the target +> table; this document specifies that direct-publish path.
+ +`mutate_as` and `load` write **directly to the target table** and call `ManifestBatchPublisher::publish` once at the end with `expected_table_versions` (the per-table manifest versions captured before the first write). Cross-table OCC is enforced inside the publisher; the @@ -11,8 +14,11 @@ publisher's row-level CAS on `__manifest` is the single fence. - No `RunRecord`, no `_graph_runs.lance`, no `_graph_run_actors.lance`. - No `omnigraph run *` CLI subcommands and no `/runs/*` HTTP endpoints. -- No `__run__` staging branches. (Legacy on-disk artifacts from - pre-MR-771 repos are inert; MR-770 sweeps them in production.) +- No `__run__` staging branches; `__run__*` is no longer a reserved + name. The branch-name guard was removed in MR-770, and any stale + `__run__*` branch on an upgraded graph is swept off `__manifest` by the + v2β†’v3 internal-schema migration on first read-write open. (The inert + `_graph_runs.lance` bytes remain until a `delete_prefix` primitive lands.) - Cancelled mutation futures leave **no graph-level state** β€” only orphaned Lance fragments, which the existing `omnigraph cleanup` pipe reclaims. @@ -151,10 +157,14 @@ are left at `Lance HEAD = manifest_pinned + 1`. **Recovery protocol** (lifecycle of every staged-write writer β€” `MutationStaging::finalize`, `schema_apply::apply_schema_with_lock`, -`branch_merge_on_current_target`, `ensure_indices_for_branch`): +`branch_merge_on_current_target`, `ensure_indices_for_branch`, +`optimize_all_tables`): 1. **Phase A**: writer writes a sidecar JSON to - `__recovery/{ulid}.json` BEFORE its first `commit_staged`. The + `__recovery/{ulid}.json` BEFORE its first HEAD-advancing commit + (`commit_staged`, or `compact_files` for `optimize_all_tables`, + which advances the Lance HEAD via a reserve-fragments + rewrite + commit rather than a staged write). The sidecar names every `(table_key, table_path, expected_version, post_commit_pin)` it intends to commit + the writer kind + actor_id. 
@@ -189,8 +199,13 @@ recovery sweep in `crates/omnigraph/src/db/manifest/recovery.rs`: otherwise full open-time recovery rolls them back and refresh-time recovery leaves them for the next read-write open. - Otherwise **roll back**: per-table `Dataset::restore` to the - manifest-pinned table version for that branch. Rollback records the - actual restore target in the audit row's `to_version`. + manifest-pinned table version, then a single `ManifestBatchPublisher::publish` + of the restored HEAD β€” symmetric with roll-forward, so `manifest == HEAD` + after recovery (no residual drift). This convergence is what lets a + failed-then-retried schema apply succeed instead of failing one version higher + each iteration. The audit row's `to_version` records the logical + rolled-back-to version (`manifest_pinned`); the manifest is published at the + restore commit (`manifest_pinned + 1`, same content). - After a successful roll-forward or roll-back, an audit row is recorded β€” `_graph_commits.lance` carries a commit tagged `actor_id = "omnigraph:recovery"`, and a sibling @@ -242,9 +257,14 @@ list`. ## Migration code -`db/manifest/migrations.rs` does not change. Active deletion of -`_graph_runs.lance` belongs in MR-770 (the production sweep) β€” this PR -stops *creating* run state but does not destroy legacy bytes on disk. +`db/manifest/migrations.rs` carries the v2β†’v3 internal-schema step (MR-770): +a one-time sweep that deletes legacy `__run__*` staging branches off +`__manifest`. It runs in `Omnigraph::open(ReadWrite)` (via +`manifest::migrate_on_open`, before the coordinator reads branch state) and +again on the publisher's write path; both are idempotent once the stamp is at +v3. Deleting the inert `_graph_runs.lance` / `_graph_run_actors.lance` dataset +*bytes* is still deferred β€” it needs a `StorageAdapter::delete_prefix` +primitive β€” but those bytes are invisible to graph-level state. 
## Mid-query partial failure: closed by MR-794 diff --git a/docs/releases/v0.4.0.md b/docs/releases/v0.4.0.md index efb2da7..d3a8244 100644 --- a/docs/releases/v0.4.0.md +++ b/docs/releases/v0.4.0.md @@ -65,7 +65,7 @@ manifest. The next mutation against that table fails with `ExpectedVersionMismatch`. Most validation runs before any Lance write, so single-statement mutations are unaffected; the narrow path is multi-statement queries with late-op failures. Tracked as a follow-up; -see [docs/dev/runs.md](../dev/runs.md#known-limitation-mid-query-partial-failure-on-the-same-table) +see [docs/dev/writes.md](../dev/writes.md#mid-query-partial-failure-closed-by-mr-794) for the workaround. ## Upgrade notes diff --git a/docs/releases/v0.4.1.md b/docs/releases/v0.4.1.md index 78211e4..4983015 100644 --- a/docs/releases/v0.4.1.md +++ b/docs/releases/v0.4.1.md @@ -19,7 +19,7 @@ mutation proceeds normally. HEAD on every staged table is untouched and the next mutation proceeds normally. A narrowed residual remains at the finalizeβ†’publisher boundary (multi-table `commit_staged` is not - atomic with the manifest commit) β€” see [docs/dev/runs.md](../dev/runs.md) + atomic with the manifest commit) β€” see [docs/dev/writes.md](../dev/writes.md) "Finalize β†’ publisher residual" for details. - **Dβ‚‚ parse-time rule**: a single mutation query is either insert/update-only or delete-only. Mixed β†’ rejected with a clear @@ -75,14 +75,14 @@ mutation proceeds normally. 
## Tests added -- `tests/runs.rs::partial_failure_leaves_target_queryable_and_unblocks_next_mutation` +- `tests/writes.rs::partial_failure_leaves_target_queryable_and_unblocks_next_mutation` (replaces the old `partial_failure_observably_rolls_back_but_blocks_next_mutation_on_same_table`) -- `tests/runs.rs::mutation_rejects_mixed_insert_and_delete_at_parse_time` -- `tests/runs.rs::mixed_insert_and_update_on_same_person_coalesces_to_one_merge` -- `tests/runs.rs::multiple_appends_to_same_edge_coalesce_to_one_append` -- `tests/runs.rs::multi_statement_inserts_publish_exactly_once` -- `tests/runs.rs::load_with_bad_edge_reference_unblocks_next_load` -- `tests/runs.rs::load_with_cardinality_violation_unblocks_next_load` +- `tests/writes.rs::mutation_rejects_mixed_insert_and_delete_at_parse_time` +- `tests/writes.rs::mixed_insert_and_update_on_same_person_coalesces_to_one_merge` +- `tests/writes.rs::multiple_appends_to_same_edge_coalesce_to_one_append` +- `tests/writes.rs::multi_statement_inserts_publish_exactly_once` +- `tests/writes.rs::load_with_bad_edge_reference_unblocks_next_load` +- `tests/writes.rs::load_with_cardinality_violation_unblocks_next_load` ## Files changed @@ -105,7 +105,7 @@ mutation proceeds normally. - `Cargo.toml` (workspace) + `crates/omnigraph/Cargo.toml` β€” added `datafusion = "52"` direct dep (transitively pulled by Lance already; required for `MemTable`). -- `docs/dev/runs.md` β€” removed "Known limitation" section; documented +- `docs/dev/writes.md` β€” removed "Known limitation" section; documented the new accumulator + Dβ‚‚ + LoadMode::Overwrite residual. - `docs/dev/invariants.md` β€” mutation atomicity / read-your-writes status flipped to `upheld for inserts/updates`. @@ -127,7 +127,7 @@ mutation proceeds normally. as legacy. - `docs/user/cli.md` β€” replaced the legacy `omnigraph run *` quickstart block with `omnigraph commit list/show`. 
-- `docs/dev/testing.md` β€” extended the `runs.rs` row to cover the new +- `docs/dev/testing.md` β€” extended the `writes.rs` row to cover the new staged-write contract tests; added the `staged_writes.rs` row. - `AGENTS.md` (CLAUDE.md symlink) β€” updated the atomic-per-query description and the L2 capability matrix row. diff --git a/docs/releases/v0.6.1.md b/docs/releases/v0.6.1.md new file mode 100644 index 0000000..0acc34b --- /dev/null +++ b/docs/releases/v0.6.1.md @@ -0,0 +1,28 @@ +# Omnigraph v0.6.1 + +v0.6.1 focuses on operational polish after v0.6.0: stored-query registries, safer branch cleanup, more complete release artifacts, and a Lance blob-compaction workaround. + +## Highlights + +- **Stored-query registries.** `omnigraph.yaml` can declare curated `queries:` blocks per graph. Servers load and type-check them at startup, `omnigraph queries validate` checks them offline, `omnigraph queries list` shows exposed queries and typed params, `GET /queries` exposes a typed catalog, and `POST /queries/{name}` invokes a stored query without accepting ad hoc `.gq` source from the client. +- **Stored-query policy gate.** New Cedar action `invoke_query` gates the stored-query invocation surface. Stored mutations are double-gated: `invoke_query` to reach the stored query and `change` for the actual write. +- **Safer branch deletion.** `branch_delete` now treats the manifest as the authority, flips branch visibility atomically, and reclaims per-table/commit-graph forks as derived state. If best-effort reclaim is interrupted, `cleanup` reconciles orphaned forks; reusing a branch name before cleanup reports an actionable error. +- **Legacy `__run__` cleanup (MR-770).** Removed the last functional remnant of the Run state machine (retired in v0.4.0): the `__run__` branch-name guard. A new v2β†’v3 `__manifest` internal-schema migration sweeps any stale `__run__*` staging branches on the first read-write open, so `__run__*` is no longer a reserved branch name. 
This closes the "unpromoted `__run__` branches block reads" condition behind the zombie-run cascade incident; the inert `_graph_runs.lance` row cleanup is tracked separately (it needs a `delete_prefix` primitive). +- **Blob-safe optimize.** `omnigraph optimize` skips tables with `Blob` properties instead of failing the whole sweep on Lance's blob-v2 compaction decode bug. Skips are visible in human output, `--json` as `skipped`, `TableOptimizeStats.skipped`, and logs; non-blob tables still compact normally. +- **Deployment improvements.** The container entrypoint now composes `OMNIGRAPH_TARGET_URI` with `OMNIGRAPH_CONFIG`, so operators can keep the graph URI in env while loading policy/query config from a mounted file. The local RustFS bootstrap pins RustFS beta.3 and allows the current insecure local-dev default credentials. +- **Windows release support.** Tagged and edge releases now publish Windows x86_64 archives containing `omnigraph.exe` and `omnigraph-server.exe`, with a PowerShell installer and Windows install docs. +- **Release tooling.** Homebrew formula generation was tightened to produce audit-clean formulas. + +## Compatibility Notes + +- A graph selected by name (`--target` or `server.graph`) now uses `graphs.<name>.policy` and `graphs.<name>.queries`. Top-level `policy` / `queries` blocks are only for anonymous bare-URI single-graph mode; using them with a named graph now fails loudly with migration guidance. +- `mcp.expose` defaults to `true` for stored-query registry entries. Set `mcp: { expose: false }` for service-only queries that should not appear in the catalog. +- `invoke_query` is graph-scoped, not branch-scoped. Branch/snapshot access remains enforced by the inner `read` / `change` gate. +- **Legacy `__run__` migration.** Graphs created before v0.4.0 are migrated automatically on the first **read-write** open by a v0.6.1 binary (one-time `__manifest` stamp v2→v3 sweep of stale `__run__*` branches). No action required.
Two caveats: (1) a graph opened **read-only** still lists any stale `__run__*` branch until its first read-write open, since the migration is write-path-only like all manifest migrations — long-lived read-only deployments should be opened read-write once after upgrading; (2) the inert `_graph_runs.lance` / `_graph_run_actors.lance` dataset bytes are left in place until a future `delete_prefix` primitive lands (they are invisible to graph-level state). +- Blob tables are not compacted until the upstream Lance fix lands, so fragment count and deleted-row space on blob tables are not reclaimed by `optimize`. Reads, writes, and query results are unaffected; no on-disk migration is required. +- `TableOptimizeStats` is now `#[non_exhaustive]` and gains a `skipped: Option<SkipReason>` field (the new `SkipReason` enum is also `#[non_exhaustive]`). This is a source-level change only for downstream code that built this returned result struct by literal — rare, since it is produced by `optimize` and consumed by reading its fields; field access is unaffected, and `#[non_exhaustive]` keeps future additions non-breaking. + +## Docs And Cleanup + +- Public docs were updated for stored queries, policy, server routes, deployment, Windows installation, branch deletion, maintenance, and the `runs` docs rename to `writes`. +- README copy and release documentation were refreshed; older release notes had small typo/wording fixes. diff --git a/docs/rfcs/0000-template.md b/docs/rfcs/0000-template.md new file mode 100644 index 0000000..48f4bda --- /dev/null +++ b/docs/rfcs/0000-template.md @@ -0,0 +1,54 @@ +# RFC NNNN: <title> + +| | | +|---|---| +| **Status** | Proposed | +| **Author(s)** | <your name / handle> | +| **Discussion** | <link to the originating Discussion, if any> | +| **Implementation** | <issue/PR links, filled in as work lands> | + +> Status is maintained by maintainers: `Proposed` while the PR is open, +> `Accepted` on merge, `Declined` on close, `Superseded by NNNN` later.
+ +## Summary + +One paragraph: what this changes, in plain terms. + +## Motivation + +What problem does this solve, and why is it worth the ongoing cost? Tie it to a +concrete need (a Discussion, a recurring issue, a user request). Per the +project's first principle, argue the *long-run liability*, not just the +short-term convenience. + +## Guide-level explanation + +Explain the change as you'd teach it to a user or contributor: new commands, +syntax, API shapes, behavior. Examples first. + +## Reference-level design + +The precise design: data structures, IR/AST/planner changes, storage/format +impact, migration path, error behavior. Enough that a reviewer can find the +holes. + +## Invariants & deny-list check + +Which Hard Invariants in [../dev/invariants.md](../dev/invariants.md) does this +touch? Does it brush against any deny-list item β€” and if so, why is this the +justified exception? State explicitly that no invariant is weakened, or which +Known Gap moves. + +## Drawbacks & alternatives + +What does this cost, what did you reject, and why. "Do nothing" is a valid +alternative to weigh. + +## Reversibility + +Is this reversible? On-disk/wire/format and substrate choices are near-permanent +and demand more evidence; a CLI flag or doc is cheap to undo. Say which this is. + +## Unresolved questions + +What's deliberately left open for review to settle. diff --git a/docs/rfcs/README.md b/docs/rfcs/README.md new file mode 100644 index 0000000..99cdd76 --- /dev/null +++ b/docs/rfcs/README.md @@ -0,0 +1,66 @@ +# RFCs + +Substantial changes to OmniGraph β€” new user-facing surface, format or protocol +changes, anything irreversible or cross-cutting β€” go through a lightweight RFC +so the design is agreed *as reviewable code* before implementation starts. This +is the public RFC track, open to **anyone, including external contributors**. 
+ +This complements the always-on review bar in +[../dev/invariants.md](../dev/invariants.md): the invariants say *what every +change must respect*; an RFC says *why this particular change is worth making and +how*. + +> **Two tracks, don't conflate them.** This `docs/rfcs/` directory is the +> **public contribution** track (anyone authors; maintainers accept). The +> maintainer-internal RFCs under `docs/dev/rfc-00N-*.md` are a separate, +> team-owned track for in-flight internal work. If you're an outside +> contributor, you're in the right place here. + +## When you need one + +- **RFC required:** new query/schema/CLI/HTTP surface; on-disk or wire-format + changes; a new substrate dependency; anything the deny-list in + [../dev/invariants.md](../dev/invariants.md) flags; anything irreversible + ("reversibility shapes evidence demand"). +- **RFC not required:** bug fixes for an `accepted` issue, and the trivial + fast-lane (typos, docs, deps) — see [../../CONTRIBUTING.md](../../CONTRIBUTING.md). + +If you're unsure, start a [Discussion](../../../discussions); a maintainer will +tell you whether it needs an RFC. + +## Lifecycle + +``` +Discussion (incubate, get rough consensus) + │ graduate + ▼ +RFC pull request → adds docs/rfcs/NNNN-title.md (Status: Proposed) + │ +maintainer review ──▢ changes requested / declined (PR closed, with rationale) + │ + ▼ +merged == Accepted (the merged file is the durable decision record) + │ + ▼ +Implementation PR(s) reference the accepted RFC +``` + +- **Author:** anyone. **Acceptance:** a maintainer decision, performed by + merging the RFC PR. Declining is closing it with rationale. +- The merged RFC *is* the accepted record — there is no separate sign-off step. +- Later reversals don't edit history: supersede with a new RFC that links back + and flip the old one's `Status` to `Superseded by NNNN`.
+ +## Numbering & naming + +- File: `docs/rfcs/NNNN-kebab-title.md`, where `NNNN` is the next free + zero-padded integer (`0001`, `0002`, …). `0000-template.md` is reserved. +- Pick the number when you open the PR; if it collides with another in-flight + RFC, the second to merge bumps theirs. + +## Status values + +`Proposed` (open PR) · `Accepted` (merged) · `Declined` (closed) · +`Superseded by NNNN` · `Implemented` (set once the work lands, optional). + +Copy [0000-template.md](0000-template.md) to start. diff --git a/docs/user/audit.md b/docs/user/audit.md index e8abe5b..ab028ac 100644 --- a/docs/user/audit.md +++ b/docs/user/audit.md @@ -4,4 +4,4 @@ - `_as` variants of every write API let callers override the actor: `mutate_as`, `ingest_as`, `branch_merge_as`, `apply_schema_as`, etc. - Actor IDs are persisted on `GraphCommit.actor_id` with split storage in `_graph_commit_actors.lance` (the commit graph is split into `_graph_commits.lance` for the linkage and `_graph_commit_actors.lance` for the actor map). - HTTP server uses the bearer-token actor automatically; CLI uses the local user / explicit env (no implicit actor). -- Pre-v0.4.0 graphs also stored actor IDs on `RunRecord.actor_id` in `_graph_runs.lance` / `_graph_run_actors.lance`. The Run state machine was removed in MR-771; those files are inert post-v0.4.0 and reclaimed by MR-770's production sweep. +- Pre-v0.4.0 graphs also stored actor IDs on `RunRecord.actor_id` in `_graph_runs.lance` / `_graph_run_actors.lance`. The Run state machine was removed in MR-771; those files are inert post-v0.4.0. The v2→v3 manifest migration sweeps any stale `__run__*` branches on the first read-write open (MR-770); the inert dataset bytes remain until a `delete_prefix` primitive lands.
diff --git a/docs/user/branches-commits.md b/docs/user/branches-commits.md index de6c653..a4044cb 100644 --- a/docs/user/branches-commits.md +++ b/docs/user/branches-commits.md @@ -8,10 +8,10 @@ Lance supports branching at the dataset level: a branch is a named lineage of ve OmniGraph builds *graph branches* on top by branching every sub-table coherently: -- `branch_create(name)` / `branch_create_from(target, name)` β€” disallowed name `main`; fails if branch exists; ensures the schema-apply lock is idle. -- `branch_list()` β€” returns public branches, **filters internal** `__run__…` and `__schema_apply_lock__` prefixes. -- `branch_delete(name)` β€” refuses if there are descendants or active runs on the branch; cleans up owned per-branch fragments. -- **Lazy forking**: a branch only forks a sub-table when that sub-table is first mutated on it. Pure-read branches share fragments with their source. +- `branch_create(name)` / `branch_create_from(target, name)` β€” disallowed name `main`; fails if branch exists; ensures the schema-apply lock is idle. Atomic and authority-first like `branch_delete`: it flips the `__manifest` branch (authority), then creates the derived commit-graph branch, force-dropping any orphaned commit-graph ref left by an incomplete prior delete (the manifest branch is fresh, so a same-named commit-graph branch is provably a zombie). If commit-graph creation fails, the manifest branch is rolled back so the name never half-exists. +- `branch_list()` β€” returns public branches, **filters the internal** `__schema_apply_lock__` branch. +- `branch_delete(name)` β€” refuses if there are descendants on the branch, or if it is the current branch. The manifest is the single authority for branch existence: deletion flips the `__manifest` branch ref first (one atomic op), after which the branch is gone from every snapshot. 
The owned per-table forks and the commit-graph branch are derived state, reclaimed best-effort with `force_delete_branch` after the flip. A failure during that reclaim (transient object-store error) does not fail the call or block the authority flip; the leftover forks are unreachable orphans that the [`cleanup`](maintenance.md) reconciler converges. One consequence: if a delete's best-effort reclaim fails, reusing that branch name before the next `cleanup` surfaces a clear error pointing at `cleanup` (the stale fork would otherwise collide on first write). +- **Lazy forking**: a branch only forks a sub-table when that sub-table is first mutated on it. Pure-read branches share fragments with their source. A fork collision is classified by the manifest authority, not by Lance branch versions: if the live manifest already records the fork on the active branch, a concurrent first-write won and the caller gets a retryable "refresh and retry"; if the manifest does not, a physical branch there is an orphan and the caller is pointed at `cleanup`. - `sync_branch(branch)` β€” re-binds the in-memory handle to the latest head of the branch. ## L2 β€” Commit graph (`db/commit_graph.rs`) @@ -51,13 +51,13 @@ Notes: ## L2 β€” Internal system branches -Filtered from `branch_list()` but visible to internals: +Internal or legacy branch refs: -- `__schema_apply_lock__` β€” serializes schema migrations. -- `__run__<run-id>` β€” legacy from the pre-v0.4.0 Run state machine (removed in MR-771). The branch-name guard predicate `is_internal_run_branch` is kept as defense-in-depth so users cannot create a branch matching the legacy prefix; the filter will be removed once production legacy branches are swept (MR-770). +- `__schema_apply_lock__` β€” serializes schema migrations; filtered from `branch_list()` but visible to internals. +- `__run__<run-id>` β€” legacy from the pre-v0.4.0 Run state machine (removed in MR-771). 
These are swept off `__manifest` on the first read-write open by the v2β†’v3 internal-schema migration (MR-770), and `__run__*` is no longer a reserved name. Known limitation: a pre-v0.4.0 graph opened **read-only** still surfaces any stale `__run__*` branch in `branch_list()` until its first read-write open (the migration is write-path-only, like all manifest migrations). ## L2 β€” Recovery audit trail -The four migrated writers (`MutationStaging::finalize`, `schema_apply`, `branch_merge`, `ensure_indices`) protect their multi-table commits with a sidecar at `__recovery/{ulid}.json` written before Phase B and deleted after Phase C. The next `Omnigraph::open` (gated on `OpenMode::ReadWrite`) runs the recovery sweep in `crates/omnigraph/src/db/manifest/recovery.rs`: classify per-table state, decide all-or-nothing per sidecar, roll forward / back, record an audit row. +The five migrated writers (`MutationStaging::finalize`, `schema_apply`, `branch_merge`, `ensure_indices`, `optimize_all_tables`) protect their multi-table commits with a sidecar at `__recovery/{ulid}.json` written before Phase B and deleted after Phase C. The next `Omnigraph::open` (gated on `OpenMode::ReadWrite`) runs the recovery sweep in `crates/omnigraph/src/db/manifest/recovery.rs`: classify per-table state, decide all-or-nothing per sidecar, roll forward / back, record an audit row. Audit rows live in `_graph_commit_recoveries.lance` (sibling to `_graph_commits.lance`) and reference the commit graph by `graph_commit_id`. The linked recovery commit is identified by that same `graph_commit_id`, and `actor_id="omnigraph:recovery"` is stored in `_graph_commit_actors.lance` (joined by `graph_commit_id`) β€” `_graph_commits.lance` itself does not carry the `actor_id` column. To find recoveries for a specific original actor: `omnigraph commit list --filter actor=omnigraph:recovery`, then join to `_graph_commit_recoveries.lance` by `graph_commit_id` to read `recovery_for_actor`. 
Schema: see `crates/omnigraph/src/db/recovery_audit.rs`. diff --git a/docs/user/cli-reference.md b/docs/user/cli-reference.md index 0326e64..8263919 100644 --- a/docs/user/cli-reference.md +++ b/docs/user/cli-reference.md @@ -20,10 +20,11 @@ A reference for the `omnigraph` binary's command surface and `omnigraph.yaml` sc | `run list \| show \| publish \| abort` | transactional run ops | | `schema plan \| apply \| show (alias: get)` | migrations | | `lint` (alias: `check`) | offline / graph-backed query validation. Replaces `query lint` / `query check`, which are kept as deprecated argv-level shims that print a one-line warning and rewrite to `omnigraph lint` | -| `optimize` | non-destructive Lance compaction | +| `queries validate \| list` | operate on the server-side stored-query registry (the `queries:` block). `validate` type-checks every stored query against the live schema offline (opens the selected graph; exits non-zero on any breakage), catching schema drift without restarting the server; `list` prints the selected registry's query names, MCP exposure, and typed params. For per-graph registries, pass `--target <graph>` or set `cli.graph`; with no graph selection, `list` shows only top-level `queries:`. Distinct from `lint`, which validates a single `.gq` file | +| `optimize` | non-destructive Lance compaction (skips tables with `Blob` columns; `--json` reports a `skipped` field) | | `cleanup --keep N --older-than 7d --confirm` | destructive version GC | | `embed` | offline JSONL embedding pipeline | -| `policy validate \| test \| explain` | Cedar tooling | +| `policy validate \| test \| explain` | Cedar tooling. 
Selects `cli.graph`, else `server.graph`, else top-level `policy.file` | | `version` / `-v` | print `omnigraph 0.3.x` | ## `omnigraph.yaml` schema @@ -34,6 +35,13 @@ graphs: <name>: uri: <local|s3://|http(s)://> bearer_token_env: <ENV_NAME> + queries: # per-graph stored-query registry (server-role; multi-graph mode) + <query-name>: # key MUST equal the `query <name>` symbol inside the .gq + file: <path-to-.gq> # relative to this config's directory + mcp: + expose: true # default true: listed in the MCP catalog (GET /queries); set false to hide (still HTTP-callable) + tool_name: <name> # optional MCP tool-name override (defaults to <query-name>; + # must be unique across exposed queries) server: graph: <name> bind: <ip:port> @@ -59,6 +67,8 @@ aliases: graph: <name> branch: <name> format: <output-format> +queries: # top-level registry β€” applies only to a bare-URI (anonymous) graph; a graph served by name uses its `graphs.<id>.queries`. Mirrors top-level `policy`. + <query-name>: { file: <path-to-.gq> } # mcp.expose defaults to true policy: file: ./policy.yaml ``` diff --git a/docs/user/constants.md b/docs/user/constants.md index 527aaea..210155e 100644 --- a/docs/user/constants.md +++ b/docs/user/constants.md @@ -4,13 +4,14 @@ |---|---|---| | `MANIFEST_DIR` | `__manifest` | `db/manifest/layout.rs` | | Commit graph dir | `_graph_commits.lance` | `db/commit_graph.rs` | -| Run registry dir (legacy, removed MR-771) | `_graph_runs.lance` | inert post-v0.4.0; reclaimed by MR-770 | -| Run branch prefix (legacy, removed MR-771) | `__run__` | filtered by `is_internal_run_branch` defense-in-depth | +| Run registry dir (legacy, removed MR-771) | `_graph_runs.lance` | inert post-v0.4.0; bytes remain until a `delete_prefix` primitive lands | +| Run branch prefix (legacy, removed MR-771/MR-770) | `__run__` | swept off `__manifest` by the v2β†’v3 migration; no longer a reserved name | | Schema apply lock | `__schema_apply_lock__` | `db/mod.rs` | | Manifest publisher retry budget 
| `PUBLISHER_RETRY_BUDGET = 5` | `db/manifest/publisher.rs` | -| Internal manifest schema version | `INTERNAL_MANIFEST_SCHEMA_VERSION = 2` | `db/manifest/migrations.rs` | +| Internal manifest schema version | `INTERNAL_MANIFEST_SCHEMA_VERSION = 3` | `db/manifest/migrations.rs` | | Merge stage batch | `MERGE_STAGE_BATCH_ROWS = 8192` | `exec/merge.rs` | | Maintenance concurrency | `OMNIGRAPH_MAINTENANCE_CONCURRENCY=8` | `db/omnigraph/optimize.rs` | +| Lance blob compaction support | `LANCE_SUPPORTS_BLOB_COMPACTION = false` | `db/omnigraph/optimize.rs` | | Graph index cache size | `8` (LRU) | `runtime_cache.rs` | | Default body limit | `1 MB` | `omnigraph-server/lib.rs` | | Ingest body limit | `32 MB` | `omnigraph-server/lib.rs` | diff --git a/docs/user/deployment.md b/docs/user/deployment.md index fc5ee08..9a4466c 100644 --- a/docs/user/deployment.md +++ b/docs/user/deployment.md @@ -20,6 +20,8 @@ Build or install: - `omnigraph` - `omnigraph-server` +On Windows, the binaries are `omnigraph.exe` and `omnigraph-server.exe`. + Run against a local graph: ```bash @@ -107,6 +109,35 @@ docker run --rm -p 8080:8080 \ --bind 0.0.0.0:8080 ``` +### Container entrypoint env vars + +When no positional args are given, the image entrypoint +(`docker/entrypoint.sh`) builds the server command from env vars: + +| Var | Effect | +|---|---| +| `OMNIGRAPH_TARGET_URI` | Graph URI, passed as the positional argument. | +| `OMNIGRAPH_CONFIG` | Path to an `omnigraph.yaml`, passed as `--config`. Used to supply a `policy.file` (Cedar authorization). The config file and any relative `policy.file` must be mounted into the container. | +| `OMNIGRAPH_TARGET` | Graph name to select from the config's `graphs:` block (with `OMNIGRAPH_CONFIG`, when no `OMNIGRAPH_TARGET_URI`). | +| `OMNIGRAPH_BIND` | Listen address (default `0.0.0.0:8080`). 
| + +`OMNIGRAPH_TARGET_URI` and `OMNIGRAPH_CONFIG` **compose**: set both to keep the +graph URI in the env var while loading policy from the config file (the +positional URI wins over any `graphs:` entry). To enable Cedar policy on a +container otherwise driven by `OMNIGRAPH_TARGET_URI`, mount the config dir and +add `OMNIGRAPH_CONFIG`: + +```bash +docker run --rm -p 8080:8080 \ + -e OMNIGRAPH_SERVER_BEARER_TOKEN="change-me" \ + -e OMNIGRAPH_TARGET_URI="s3://my-bucket/graphs/example/releases/2026-04-10-v0.1.0" \ + -e OMNIGRAPH_CONFIG="/etc/omnigraph/omnigraph.yaml" \ + -v "$PWD/config:/etc/omnigraph:ro" \ + omnigraph-server:local +# /etc/omnigraph/omnigraph.yaml contains `policy: { file: ./policy.yaml }`; +# policy.yaml (+ optional policy.tests.yaml) sit beside it in the mount. +``` + ## Auth The server can run unauthenticated for local development only when explicitly @@ -141,8 +172,10 @@ The server binary ships in two flavors: | **AWS** | `cargo build --release --features aws` | Adds AWS Secrets Manager backend for bearer tokens | Tagged release archives contain the default `omnigraph` and -`omnigraph-server` binaries. AWS-enabled server binaries are built from source -with `cargo build --release --features aws -p omnigraph-server` when needed. +`omnigraph-server` binaries on macOS / Linux, and `omnigraph.exe` plus +`omnigraph-server.exe` on Windows. AWS-enabled server binaries are built from +source with `cargo build --release --features aws -p omnigraph-server` when +needed. The AWS build adds ~150 transitive deps and ~30-60s of first-build compile time. Default builds don't pay that cost. 
diff --git a/docs/user/errors.md b/docs/user/errors.md index fd047eb..8373b0d 100644 --- a/docs/user/errors.md +++ b/docs/user/errors.md @@ -9,7 +9,7 @@ - `Manifest(ManifestError { kind: BadRequest|NotFound|Conflict|Internal, details: Option<ManifestConflictDetails>, … })` - `ManifestConflictDetails::ExpectedVersionMismatch { table_key, expected, actual }` — caller's `expected_table_versions` did not match the manifest's current latest non-tombstoned version (set by `OmniError::manifest_expected_version_mismatch`). - `ManifestConflictDetails::RowLevelCasContention` — Lance row-level CAS rejected the publish because a concurrent writer landed the same `object_id`. Retried internally by the publisher; only surfaces if the retry budget exhausts. - - **D₂ parse-time rejection** (MR-794): a single mutation query that mixes inserts/updates with deletes errors out *before any I/O* with kind `BadRequest`. Message: `mutation '<name>' on the same query mixes inserts/updates and deletes; split into separate mutations: (1) inserts and updates, then (2) deletes`. See [docs/user/query-language.md](query-language.md) for the rule and [docs/dev/runs.md](../dev/runs.md) for the underlying staged-write rationale. + - **D₂ parse-time rejection** (MR-794): a single mutation query that mixes inserts/updates with deletes errors out *before any I/O* with kind `BadRequest`. Message: `mutation '<name>' on the same query mixes inserts/updates and deletes; split into separate mutations: (1) inserts and updates, then (2) deletes`. See [docs/user/query-language.md](query-language.md) for the rule and [docs/dev/writes.md](../dev/writes.md) for the underlying staged-write rationale. - `MergeConflicts(Vec<MergeConflict>)` Compiler-side `NanoError` covers parse / catalog / type / storage / plan / execution / arrow / lance / IO / manifest / unique-constraint, each with structured spans (`SourceSpan { start, end }`) for ariadne-style diagnostics.
diff --git a/docs/user/install.md b/docs/user/install.md index ea9fb8c..4a11372 100644 --- a/docs/user/install.md +++ b/docs/user/install.md @@ -2,16 +2,29 @@ ## Quick Install +macOS / Linux: + ```bash curl -fsSL https://raw.githubusercontent.com/ModernRelay/omnigraph/main/scripts/install.sh | bash ``` +Windows PowerShell: + +```powershell +powershell -NoProfile -ExecutionPolicy Bypass -Command "iwr -UseBasicParsing https://raw.githubusercontent.com/ModernRelay/omnigraph/main/scripts/install.ps1 | iex" +``` + By default the installer places: - `omnigraph` - `omnigraph-server` -in `~/.local/bin`. +in `~/.local/bin` on macOS / Linux, or: + +- `omnigraph.exe` +- `omnigraph-server.exe` + +in `%USERPROFILE%\.local\bin` on Windows. The default installer is binary-only. It downloads a published release asset, verifies the SHA256 checksum, and unpacks it. It does not build from source. @@ -39,6 +52,13 @@ Rolling edge binaries from `main`: curl -fsSL https://raw.githubusercontent.com/ModernRelay/omnigraph/main/scripts/install.sh | RELEASE_CHANNEL=edge bash ``` +Windows rolling edge binaries: + +```powershell +iwr -UseBasicParsing https://raw.githubusercontent.com/ModernRelay/omnigraph/main/scripts/install.ps1 -OutFile install.ps1 +powershell -NoProfile -ExecutionPolicy Bypass -File .\install.ps1 -ReleaseChannel edge +``` + Install from source: ```bash @@ -53,12 +73,24 @@ Install to a different directory: curl -fsSL https://raw.githubusercontent.com/ModernRelay/omnigraph/main/scripts/install.sh | INSTALL_DIR="$HOME/bin" bash ``` +Windows: + +```powershell +powershell -NoProfile -ExecutionPolicy Bypass -File .\install.ps1 -InstallDir "$env:USERPROFILE\bin" +``` + Install a specific tag: ```bash curl -fsSL https://raw.githubusercontent.com/ModernRelay/omnigraph/main/scripts/install.sh | VERSION=v0.1.0 bash ``` +Windows: + +```powershell +powershell -NoProfile -ExecutionPolicy Bypass -File .\install.ps1 -Version v0.1.0 +``` + Build from a specific git ref: ```bash @@ -67,27 
+99,53 @@ curl -fsSL https://raw.githubusercontent.com/ModernRelay/omnigraph/main/scripts/ ## Manual Source Build +macOS / Linux: + ```bash cargo build --release --locked -p omnigraph-cli -p omnigraph-server install -m 0755 target/release/omnigraph ~/.local/bin/omnigraph install -m 0755 target/release/omnigraph-server ~/.local/bin/omnigraph-server ``` +Windows: + +```powershell +cargo build --release --locked -p omnigraph-cli -p omnigraph-server +New-Item -ItemType Directory -Force "$env:USERPROFILE\.local\bin" | Out-Null +Copy-Item target\release\omnigraph.exe "$env:USERPROFILE\.local\bin\omnigraph.exe" +Copy-Item target\release\omnigraph-server.exe "$env:USERPROFILE\.local\bin\omnigraph-server.exe" +``` + ## Release Assets Tagged releases are expected to publish: - `omnigraph-linux-x86_64.tar.gz` - `omnigraph-macos-arm64.tar.gz` +- `omnigraph-windows-x86_64.zip` -Each archive contains both binaries: +The macOS / Linux archives contain both binaries: - `omnigraph` - `omnigraph-server` +The Windows archive contains: + +- `omnigraph.exe` +- `omnigraph-server.exe` + ## Verify The Install +macOS / Linux: + ```bash omnigraph version omnigraph-server --help ``` + +Windows: + +```powershell +omnigraph.exe version +omnigraph-server.exe --help +``` diff --git a/docs/user/maintenance.md b/docs/user/maintenance.md index 08ae8da..a835799 100644 --- a/docs/user/maintenance.md +++ b/docs/user/maintenance.md @@ -4,19 +4,28 @@ ## `optimize_all_tables(db)` β€” non-destructive -- Lance `compact_files()` on every node + edge table on `main`. -- Rewrites small fragments into fewer large ones; old fragments remain reachable via older manifests. +- Lance `compact_files()` on every node + edge table on `main`, then **publishes the compacted version to the `__manifest`** so the manifest's `table_version` tracks the compacted Lance HEAD. 
Reads pin the manifest version, so without this publish compaction would be invisible to readers *and* would break the HEAD-vs-manifest precondition of the next schema apply / strict update/delete ("stale view … refresh and retry"). The publish advances the graph version (a system-attributed commit) only for tables that actually compacted. +- Rewrites small fragments into fewer large ones; old fragments remain reachable via older manifests until `cleanup` runs. +- Each table's compactβ†’publish runs under its per-`(table, main)` write queue (serializing with concurrent mutations β€” compaction is a Lance `Rewrite` op that retryable-conflicts with a concurrent merge/update/delete on overlapping fragments). The Lance-HEAD-before-manifest-publish gap is covered by a `SidecarKind::Optimize` recovery sidecar (loose-match): a crash in that window rolls the compacted version forward on the next `Omnigraph::open` (compaction is content-preserving, so roll-forward is always safe). +- **Requires a recovered graph.** `optimize` refuses (errors) when an unresolved recovery sidecar is present under `__recovery` β€” operating on an unrecovered graph could publish a partial write the open-time recovery sweep would roll back. Reopen the graph to run the recovery sweep, then re-run `optimize`. (Recovery roll-back now publishes its restored version, so a recovered graph always satisfies `manifest == Lance HEAD` going in; there is no leftover drift for `optimize` to interpret.) - Bounded by `OMNIGRAPH_MAINTENANCE_CONCURRENCY` (default 8). -- Returns `[TableOptimizeStats { table_key, fragments_removed, fragments_added, committed }]`. +- Returns `[TableOptimizeStats { table_key, fragments_removed, fragments_added, committed, skipped }]`. 
+- **Blob tables are skipped.** A table that declares any `Blob` property is not compacted: it is reported with `skipped: Some(BlobColumnsUnsupportedByLance)` (and logged via `tracing::warn`) instead of compacted, and the rest of the sweep proceeds normally. The current Lance `compact_files` mis-decodes blob-v2 columns under its forced `BlobHandling::AllBinary` read; **reads and writes are unaffected** β€” only compaction is. This is gated by `LANCE_SUPPORTS_BLOB_COMPACTION` (`db/omnigraph/optimize.rs`) and removed when the upstream Lance fix lands (see [docs/dev/lance.md](../dev/lance.md)). Consequence: fragment count and deleted-row space on blob tables are not reclaimed until then; query results are never affected. ## `cleanup_all_tables(db, options)` β€” destructive - Lance `cleanup_old_versions()` per table. - Removes manifests (and their unique fragments) older than the retention policy. - `CleanupPolicyOptions { keep_versions: Option<u32>, older_than: Option<Duration> }` β€” at least one is required. -- Returns `[TableCleanupStats { table_key, bytes_removed, old_versions_removed }]`. +- Returns `[TableCleanupStats { table_key, bytes_removed, old_versions_removed, error }]`. +- **Fault-isolated per table.** A single table's transient failure (version GC or + orphan reclaim) is recorded on that table's stats row (`error: Some(..)`, logged + via `tracing`) and never aborts the healthy tables β€” cleanup is the convergence + backstop, so it does as much as it can and converges on re-run. The CLI reports + any failed tables; rerun `cleanup` to retry them. - CLI guards with `--confirm`; without it, prints a preview line. - **Recovery floor:** `--keep < 3` may garbage-collect Lance versions that the open-time recovery sweep needs as a rollback target (the sweep restores to the branch's manifest-pinned table version, which is HEAD-1 in the typical Phase B β†’ Phase C drift case). Default `--keep 10` is safe. 
+- **Orphaned-branch reconciliation:** before the version GC, cleanup runs `reconcile_orphaned_branches`, which `force_delete_branch`es any per-table or commit-graph Lance branch absent from the manifest branch list. These orphans arise when a `branch_delete` flips the manifest authority but a downstream best-effort reclaim does not complete (see [branches-commits.md](branches-commits.md)). The reconciler is authority-derived and idempotent (it no-ops once nothing is orphaned), runs regardless of the `keep_versions` / `older_than` values (those gate version GC only), and never reclaims `main` or system-branch forks. Reclaimed forks are logged via `tracing::info`. ## Tombstones diff --git a/docs/user/policy.md b/docs/user/policy.md index 749d3be..ec0d214 100644 --- a/docs/user/policy.md +++ b/docs/user/policy.md @@ -14,10 +14,11 @@ Per-graph actions (bind to `Omnigraph::Graph::"<graph_id>"`): 6. `branch_delete` 7. `branch_merge` 8. `admin` β€” reserved for policy-management surfaces (hot reload, audit log, approvals). No call site today; see MR-724 for the reservation rationale. +9. `invoke_query` β€” gates invoking a server-side stored query (the `queries:` registry). Graph-scoped (like `admin`) β€” per-branch access is enforced by the inner `read` / `change` gate, so a rule that sets `branch_scope` on `invoke_query` is rejected. Coarse in this release: an `invoke_query` allow rule permits any stored query on the graph; a future, additive refinement adds an optional per-query-name scope without changing rules written against the coarse action. Enforced at `POST /queries/{name}` (see [server](server.md)). A stored *mutation* is double-gated: `invoke_query` to reach the tool, plus `change` for the write itself (the engine `_as` writers still enforce per the query body). Server-scoped action (v0.6.0+; binds to `Omnigraph::Server::"root"`): -9. `graph_list` β€” `GET /graphs` registry enumeration (multi-graph mode) +10. 
`graph_list` β€” `GET /graphs` registry enumeration (multi-graph mode) Server-scoped actions cannot use `branch_scope` or `target_branch_scope` β€” they operate on the registry, not on a graph's branches. A rule cannot mix server-scoped and per-graph actions; split into separate rules. (Runtime `graph_create` / `graph_delete` are reserved but not shipped in v0.6.0; operators add/remove graphs by editing `omnigraph.yaml` and restarting.) @@ -46,10 +47,15 @@ graphs: # no per-graph policy β†’ no engine-layer Cedar enforcement on beta ``` -Top-level `policy.file` is single-graph / CLI-local policy only. Multi-graph -server startup rejects it because applying one graph policy to every configured -graph is ambiguous. Move per-graph rules to `graphs.<graph_id>.policy.file` and -move `graph_list` rules to `server.policy.file`. +**Config follows graph identity, not server mode.** A graph served by **name** +(`--target <name>` or `server.graph`) uses its own `graphs.<name>.policy.file`, +exactly as in multi-graph mode. Top-level `policy.file` applies only to an +**anonymous** graph β€” one served by a bare `<URI>` with no `graphs:` entry. +Serving a **named** graph (single- or multi-graph mode) while top-level +`policy.file` (or `queries:`) is populated **refuses boot**, naming the block, +since the top-level value would otherwise be silently shadowed by the per-graph +block. Move per-graph rules to `graphs.<graph_id>.policy.file` and `graph_list` +rules to `server.policy.file`. Each graph's HTTP request flows through its own per-graph policy. The management endpoint (`GET /graphs`) flows through the server-level policy. When `server.policy.file` is unset, `GET /graphs` is denied in every runtime state, including `--unauthenticated`; with bearer tokens configured, it returns 403 after admission control because `graph_list` is not a `read`-equivalent action. The operator must explicitly authorize via `server-policy.yaml` to expose `/graphs`. @@ -92,6 +98,10 @@ bearer token. 
## CLI +Policy tooling resolves its graph like server single-mode policy: `cli.graph` +wins, otherwise `server.graph` is used, otherwise the top-level `policy.file` +is validated/tested/explained as the anonymous policy. + - `omnigraph policy validate` β€” parse + count actors, exit 1 on parse error. - `omnigraph policy test` β€” run cases in `policy.tests.yaml`, exit 1 on any expectation mismatch. - `omnigraph policy explain --actor … --action … [--branch …] [--target-branch …]` β€” show decision and matched rule. diff --git a/docs/user/query-language.md b/docs/user/query-language.md index 94528af..6c7516f 100644 --- a/docs/user/query-language.md +++ b/docs/user/query-language.md @@ -70,7 +70,7 @@ A single mutation query must be **either insert/update-only or delete-only**. Mi > `mutation '<name>' on the same query mixes inserts/updates and deletes; split into separate mutations: (1) inserts and updates, then (2) deletes. This restriction lifts when Lance exposes a two-phase delete API (tracked: MR-793 / Lance-upstream).` -Reason: under the staged-write rewire (MR-794), inserts and updates accumulate in memory and commit at end-of-query, while deletes still inline-commit (Lance 4.0.0 has no public two-phase delete). Mixing creates ordering hazards (same-row insertβ†’delete becomes a no-op because the staged insert isn't visible to delete; cascading deletes of just-inserted edges break referential integrity by silent design). Until Lance exposes `DeleteJob::execute_uncommitted`, the parse-time rejection keeps both paths atomic and correct. See [docs/dev/runs.md](../dev/runs.md) and [docs/dev/invariants.md](../dev/invariants.md). +Reason: under the staged-write rewire (MR-794), inserts and updates accumulate in memory and commit at end-of-query, while deletes still inline-commit (Lance 4.0.0 has no public two-phase delete). 
Mixing creates ordering hazards (same-row insert→delete becomes a no-op because the staged insert isn't visible to delete; cascading deletes of just-inserted edges break referential integrity by silent design). Until Lance exposes `DeleteJob::execute_uncommitted`, the parse-time rejection keeps both paths atomic and correct. See [docs/dev/writes.md](../dev/writes.md) and [docs/dev/invariants.md](../dev/invariants.md). ## IR (Intermediate Representation) diff --git a/docs/user/server.md b/docs/user/server.md index 6f55e16..67b5afe 100644 --- a/docs/user/server.md +++ b/docs/user/server.md @@ -6,7 +6,9 @@ Axum 0.8 + tokio + utoipa-generated OpenAPI. **Two modes** (v0.6.0+): single-gra ### Single-graph mode (legacy) -`omnigraph-server <URI>` or `omnigraph-server --target <name> --config omnigraph.yaml`. Routes are flat — `/snapshot`, `/read`, `/branches`, etc. Behavior unchanged from v0.6.0. +`omnigraph-server <URI>` or `omnigraph-server --target <name> --config omnigraph.yaml`. Routes are flat — `/snapshot`, `/read`, `/branches`, etc. + +**Config follows graph identity.** A bare `<URI>` is an *anonymous* graph and uses the **top-level** `policy.file` / `queries:`. A graph chosen by **name** (`--target` / `server.graph`) uses its own `graphs.<name>.{policy.file, queries}` — the same block multi-graph mode uses. ⚠️ *Changed from v0.6.0, which always used top-level config in single mode: a named-graph config that puts `policy`/`queries` at top-level now **refuses boot** and points you at `graphs.<name>.…` (move the block there). Bare-`<URI>` single mode is unchanged.* ### Multi-graph mode (v0.6.0+) @@ -20,6 +22,10 @@ Mode inference (four-rule matrix): 4. `--config` + non-empty `graphs:` + no single-mode selector → **multi** 5.
otherwise → error with migration hint +### Stored-query validation at startup + +If a graph declares a `queries:` registry (see [cli-reference](cli-reference.md)), the server **loads and type-checks every stored query against that graph's live schema at startup** and **refuses to boot** if any query references a type or property the schema lacks — the same fail-loud posture as a malformed policy file, so schema drift surfaces at the deploy boundary rather than at invocation. Two MCP-exposed queries claiming the same tool name is likewise a boot error. Non-blocking advisories (e.g. an MCP-exposed query with a vector parameter an agent cannot supply) are logged. Validate offline before deploying with `omnigraph queries validate`. Discover the exposed queries as a typed tool catalog with `GET /queries`, and invoke one over HTTP with `POST /queries/{name}` (both below). + ## Endpoint inventory Per-graph endpoints — same body shape across modes; URLs differ: @@ -34,6 +40,8 @@ Per-graph endpoints — same body shape across modes; URLs differ: | POST | `/export` | `/graphs/{id}/export` | bearer + `export` | NDJSON stream | `server_export` | | POST | `/mutate` | `/graphs/{id}/mutate` | bearer + `change` | mutation (canonical; `query`/`name`; accepts legacy `query_source`/`query_name` as serde aliases) | `server_mutate` | | POST | `/change` | `/graphs/{id}/change` | bearer + `change` | **deprecated** alias of `/mutate` (carries `Deprecation: true` + `Link: </mutate>; rel="successor-version"`) | `server_change` | +| GET | `/queries` | `/graphs/{id}/queries` | bearer + `read` | list the `mcp.expose` stored queries as a typed tool catalog | `server_list_queries` | +| POST | `/queries/{name}` | `/graphs/{id}/queries/{name}` | bearer + `invoke_query` (+ `change` for a stored mutation) | invoke a named query from the `queries:` registry; deny == 404 | `server_invoke_query` | | GET | `/schema` | `/graphs/{id}/schema` | bearer + `read` | get current `.pg` source |
`server_schema_get` | | POST | `/schema/apply` | `/graphs/{id}/schema/apply` | bearer + `schema_apply` (target=`main`) | migrate | `server_schema_apply` | | POST | `/ingest` | `/graphs/{id}/ingest` | bearer + `branch_create` (if new) + `change` | bulk load | `server_ingest` (32 MB body limit) | @@ -50,6 +58,23 @@ Server-level management endpoints (v0.6.0+): |---|---|---|---|---| | GET | `/graphs` | bearer + `graph_list` on `Server::"root"` | list registered graphs | `server_graphs_list` (405 in single mode) | +### Stored-query catalog (`GET /queries`) + +List the graph's **`mcp.expose`** stored queries as a typed tool catalog β€” enough for a client (e.g. an MCP server) to register each as a tool without fetching `.gq` source. Each entry: `{ name, tool_name, description, instruction, mutation, params }`, where each param is `{ name, kind, item_kind?, vector_dim?, nullable }`. `kind` is one of `string | bool | int | bigint | float | date | datetime | blob | vector | list` (decomposed so a consumer maps it with a closed `switch`, never re-parsing GQ type spelling). `bigint` (I64/U64), `date`, `datetime`, and `blob` are carried as JSON **strings** β€” a 64-bit integer loses precision as a JSON number, dates are ISO strings, and a blob is a URI string. + +- **Read-gated** (works in default-deny mode). The catalog is **graph-wide** (branch-independent; `read` is authorized against `main`). +- **`mcp.expose` defaults to `true`** β€” declaring a query in `queries:` lists it; set `mcp: { expose: false }` to keep it HTTP/service-callable but hidden from the catalog. +- **Not Cedar-filtered per query (yet).** A caller with `read` but not `invoke_query` can *list* a query they can't *invoke* (which would 404). Closing that gap is future per-query authorization; for now the catalog is a discovery surface and `invoke_query` remains the invocation gate. 
+ +### Stored-query invocation (`POST /queries/{name}`) + +Invoke a curated, server-side stored query by **name** β€” the source comes from the graph's `queries:` registry, so the client never sends `.gq`. The request body itself is optional; omit it for no-param queries, or send `{ "params": { … }, "branch": "main", "snapshot": null }`, where every field is optional and `params` keys match the query's declared parameters. The response is the **read envelope** (`ReadOutput`) for a stored read or the **mutation envelope** (`ChangeOutput`) for a stored mutation β€” serialized untagged, so the wire shape is identical to `/query` / `/mutate`. + +- **Gate:** `invoke_query` (per-graph, graph-scoped) at the boundary. A stored *mutation* is **double-gated** β€” it also passes the engine's `change` gate, so an actor with `invoke_query` but not `change` gets `403`. +- **Deny == unknown, for callers without `invoke_query`:** for a caller lacking the grant, an `invoke_query` denial and an unknown query name return the **same `404`** (identical body), so the catalog can't be probed. A caller that *holds* `invoke_query` may still get the inner gate's `403` for an existing query it can't `read`/`change` (the double-gate, above) β€” so existence is visible to grant-holders by design. +- **Requires an explicit policy grant when auth is on.** In default-deny mode (bearer tokens but no `policy.file`), only `read` is permitted, so *every* `/queries/{name}` call returns `404` until an `invoke_query` rule is configured. +- A stored mutation cannot target a `snapshot` (`400`); a parameter type error is a structured `400` naming the parameter. 
+ ## Adding and removing graphs (multi mode) Runtime add/remove via API is **not** exposed in v0.6.0 — neither diff --git a/docs/user/storage.md b/docs/user/storage.md index c22d4d6..2c57a92 100644 --- a/docs/user/storage.md +++ b/docs/user/storage.md @@ -22,7 +22,7 @@ OmniGraph is **not** a single Lance dataset; it is a *graph* of datasets coordin - `edges/{fnv1a64-hex(edge_type_name)}` — one Lance dataset per edge type - `__manifest/` — the catalog of all sub-tables and their published versions - `_graph_commits.lance` / `_graph_commit_actors.lance` — the commit graph and its actor map - - (legacy `_graph_runs.lance` / `_graph_run_actors.lance` from pre-v0.4.0 graphs are inert; the run state machine was removed in MR-771 and these files are cleaned up via MR-770's production sweep) + - (legacy `_graph_runs.lance` / `_graph_run_actors.lance` from pre-v0.4.0 graphs are inert; the run state machine was removed in MR-771. The v2→v3 manifest migration sweeps stale `__run__*` branches on first write-open; the inert dataset bytes themselves remain until a `delete_prefix` storage primitive lands) - **Manifest row schema** (`object_id, object_type, location, metadata, base_objects, table_key, table_version, table_branch, row_count`): - `object_type` ∈ `table | table_version | table_tombstone` - `table_key` ∈ `node:<TypeName> | edge:<EdgeName>` @@ -47,6 +47,7 @@ Adding a new on-disk shape change is one constant bump (`INTERNAL_MANIFEST_SCHEM |---|---| | v1 (implicit, pre-stamp) | `__manifest.object_id` had no PK annotation; publisher had no row-level CAS protection. | | v2 | `__manifest.object_id` carries `lance-schema:unenforced-primary-key=true`; row-level CAS engaged. Stamped as `omnigraph:internal_schema_version=2`. | +| v3 | One-time sweep of legacy `__run__*` staging branches (pre-v0.4.0 Run state machine, removed MR-771) off `__manifest`. Runs at `Omnigraph::open(ReadWrite)` and on publish. Stamped as `omnigraph:internal_schema_version=3`.
| ## On-disk layout @@ -91,9 +92,9 @@ flowchart TB - **Graph root** is one directory (or S3 prefix). Everything below is part of one OmniGraph graph. - **`__manifest/`** is a Lance dataset whose rows describe which sub-table version is published at which graph-branch. Reading a snapshot starts here. - **`nodes/`** and **`edges/`** are sibling directories holding one Lance dataset per declared type. Names are `fnv1a64-hex` of the type name to keep paths fixed-length and case-safe. -- **`_graph_commits.lance`** is an L2 dataset that records the graph-level commit DAG, with a paired `_graph_commit_actors.lance` for the actor map. (Pre-v0.4.0 graphs also have inert `_graph_runs.lance` / `_graph_run_actors.lance` from the removed Run state machine; MR-770 sweeps these in production.) +- **`_graph_commits.lance`** is an L2 dataset that records the graph-level commit DAG, with a paired `_graph_commit_actors.lance` for the actor map. (Pre-v0.4.0 graphs also have inert `_graph_runs.lance` / `_graph_run_actors.lance` from the removed Run state machine; the v2→v3 migration sweeps their stale `__run__*` branches, and the dataset bytes are reclaimed once `delete_prefix` lands.) - **`_graph_commit_recoveries.lance`** — one row per recovery sweep action. Joined to `_graph_commits.lance` by `graph_commit_id`; the linked commit row carries `actor_id=omnigraph:recovery`. Operators correlate recoveries with the original mutations they rolled forward / back via this join. See `crates/omnigraph/src/db/recovery_audit.rs`. -- **`__recovery/{ulid}.json`** — transient sidecar files written by the four migrated writers (`MutationStaging::finalize`, `schema_apply`, `branch_merge`, `ensure_indices`) before Phase B begins, deleted after Phase C succeeds. A sidecar persisting after process exit means the writer crashed in the Phase B → Phase C window; the next `Omnigraph::open` recovery sweep processes it. Steady-state directory is empty. 
See `crates/omnigraph/src/db/manifest/recovery.rs`. +- **`__recovery/{ulid}.json`** — transient sidecar files written by the five migrated writers (`MutationStaging::finalize`, `schema_apply`, `branch_merge`, `ensure_indices`, `optimize_all_tables`) before Phase B begins, deleted after Phase C succeeds. A sidecar persisting after process exit means the writer crashed in the Phase B → Phase C window; the next `Omnigraph::open` recovery sweep processes it. Steady-state directory is empty. See `crates/omnigraph/src/db/manifest/recovery.rs`. - **`_refs/branches/{name}.json`** is graph-level branch metadata — pointers from a branch name to the manifest version it heads. - **Inside each Lance dataset** (orange): the standard Lance directory layout. `_versions/{n}.manifest` records every commit; `data/` holds the actual Arrow fragments; `_indices/{uuid}/` holds index segments with their own `fragment_bitmap` for partial coverage; `_refs/` holds Lance-native per-dataset branches and tags. diff --git a/docs/user/transactions.md b/docs/user/transactions.md index e4ed485..d6c79f4 100644 --- a/docs/user/transactions.md +++ b/docs/user/transactions.md @@ -164,5 +164,5 @@ This is the workflow MR-797 / agentic loops are designed around: **branches are - [`docs/user/branches-commits.md`](branches-commits.md) — branch and commit-graph mechanics. - [`docs/dev/merge.md`](../dev/merge.md) — three-way merge details and conflict kinds. - [`docs/user/query-language.md`](query-language.md) — `.gq` syntax for the multi-statement queries used above. -- [`docs/dev/runs.md`](../dev/runs.md) — the per-query commit pipeline that gives single-query atomicity. +- [`docs/dev/writes.md`](../dev/writes.md) — the per-query commit pipeline that gives single-query atomicity. - [`docs/dev/invariants.md`](../dev/invariants.md) — the architectural rule. 
diff --git a/openapi.json b/openapi.json index d1fa337..aced64d 100644 --- a/openapi.json +++ b/openapi.json @@ -7,7 +7,7 @@ "name": "MIT", "identifier": "MIT" }, - "version": "0.6.0" + "version": "0.6.1" }, "paths": { "/branches": { @@ -829,6 +829,177 @@ ] } }, + "/queries": { + "get": { + "tags": [ + "queries" + ], + "summary": "List the graph's exposed stored queries as a typed tool catalog.", + "description": "Returns the `mcp.expose == true` subset of the `queries:` registry, each\nwith its MCP tool name, read/mutate flag, description/instruction, and\ntyped parameters β€” enough for a client to register them as tools without\nfetching `.gq` source. Read-gated; the catalog is graph-wide (branch\nindependent β€” `read` is authorized against `main`). **Not** Cedar-filtered\nper query yet, so it can list a query whose `invoke_query` the caller\nlacks (a known gap until per-query authorization lands).", + "operationId": "list_queries", + "responses": { + "200": { + "description": "Stored-query catalog (the mcp.expose subset, with typed params)", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/QueriesCatalogOutput" + } + } + } + }, + "401": { + "description": "Unauthorized", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorOutput" + } + } + } + }, + "403": { + "description": "Forbidden", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorOutput" + } + } + } + } + }, + "security": [ + { + "bearer_token": [] + } + ] + } + }, + "/queries/{name}": { + "post": { + "tags": [ + "queries" + ], + "summary": "Invoke a curated, server-side stored query by name.", + "description": "The query source comes from the graph's `queries:` registry, not the\nrequest body β€” callers send only runtime inputs (`params`, `branch`,\n`snapshot`). 
Gated by the `invoke_query` Cedar action at the boundary;\na stored *mutation* additionally passes the engine's `change` gate\n(double-gated). An actor **without** `invoke_query` cannot tell a denied\nquery from a missing one β€” both return the same 404, so the catalog\ncan't be probed without the grant. Once `invoke_query` is held, the\ninner `read`/`change` gate may surface a 403 for an existing query the\nactor can't run (the intended double-gate signal).", + "operationId": "invoke_query", + "parameters": [ + { + "name": "name", + "in": "path", + "description": "Stored query name (the registry key)", + "required": true, + "schema": { + "type": "string" + } + } + ], + "requestBody": { + "content": { + "application/json": { + "schema": { + "oneOf": [ + { + "type": "null" + }, + { + "$ref": "#/components/schemas/InvokeStoredQueryRequest" + } + ] + } + } + } + }, + "responses": { + "200": { + "description": "Read envelope (ReadOutput) or mutation envelope (ChangeOutput), serialized untagged", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/InvokeStoredQueryResponse" + } + } + } + }, + "400": { + "description": "Bad request (param type error; snapshot on a stored mutation)", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorOutput" + } + } + } + }, + "401": { + "description": "Unauthorized", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorOutput" + } + } + } + }, + "403": { + "description": "Forbidden (the inner `change` gate for a stored mutation)", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorOutput" + } + } + } + }, + "404": { + "description": "Unknown stored query, or `invoke_query` denied β€” indistinguishable to a caller without the grant", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorOutput" + } + } + } + }, + "409": { + "description": "Merge conflict", 
+ "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorOutput" + } + } + } + }, + "429": { + "description": "Per-actor admission cap exceeded; honor `Retry-After` header", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorOutput" + } + } + } + }, + "500": { + "description": "Policy evaluation error (a denial is reported as 404, not 500)", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorOutput" + } + } + } + } + }, + "security": [ + { + "bearer_token": [] + } + ] + } + }, "/query": { "post": { "tags": [ @@ -1628,6 +1799,40 @@ } } }, + "InvokeStoredQueryRequest": { + "type": "object", + "description": "Body for `POST /queries/{name}` β€” invokes the server-side stored query\nnamed in the path. The query source and name come from the registry,\nnever the body; only the runtime inputs are supplied here.", + "properties": { + "branch": { + "type": [ + "string", + "null" + ], + "description": "Branch to run against. Defaults to `main`; for a stored mutation the\nwrite targets this branch." + }, + "params": { + "description": "JSON object whose keys match the stored query's declared parameters." + }, + "snapshot": { + "type": [ + "string", + "null" + ], + "description": "Snapshot id to read from (read queries only β€” rejected for a stored\nmutation). Mutually exclusive with `branch`." + } + } + }, + "InvokeStoredQueryResponse": { + "oneOf": [ + { + "$ref": "#/components/schemas/ReadOutput" + }, + { + "$ref": "#/components/schemas/ChangeOutput" + } + ], + "description": "Response for `POST /queries/{name}`: the read envelope for a stored\nread, or the mutation envelope for a stored mutation. Serialized\n**untagged**, so the wire shape is exactly [`ReadOutput`] or\n[`ChangeOutput`] β€” classification follows the stored query, not a\nwrapper field." 
+ }, "LoadMode": { "type": "string", "description": "Shadow enum for documenting [`LoadMode`] in the OpenAPI schema.", @@ -1698,6 +1903,120 @@ } } }, + "ParamDescriptor": { + "type": "object", + "description": "One declared parameter of a stored query, projected for the catalog.", + "required": [ + "name", + "kind", + "nullable" + ], + "properties": { + "item_kind": { + "oneOf": [ + { + "type": "null" + }, + { + "$ref": "#/components/schemas/ParamKind", + "description": "Element kind when `kind == list` (always a scalar β€” the grammar\nforbids lists of vectors or nested lists)." + } + ] + }, + "kind": { + "$ref": "#/components/schemas/ParamKind" + }, + "name": { + "type": "string" + }, + "nullable": { + "type": "boolean", + "description": "`false` β†’ the caller must supply it; `true` β†’ optional." + }, + "vector_dim": { + "type": [ + "integer", + "null" + ], + "format": "int32", + "description": "Dimension when `kind == vector`.", + "minimum": 0 + } + } + }, + "ParamKind": { + "type": "string", + "description": "The kind of a stored-query parameter, decomposed so a client (e.g. an\nMCP server) can build a typed input schema with a closed `match` and\nnever re-parse omnigraph's type spelling. 
`bigint`/`date`/`datetime`/\n`blob` are carried as JSON strings on the wire: a 64-bit integer past\n2^53 loses precision as a JSON number, and Date/DateTime are ISO\nstrings, Blob a blob-URI string.", + "enum": [ + "string", + "bool", + "int", + "bigint", + "float", + "date", + "datetime", + "blob", + "vector", + "list" + ] + }, + "QueriesCatalogOutput": { + "type": "object", + "description": "Response for `GET /queries`: the `mcp.expose` subset of a graph's\nstored-query registry, each with typed parameters.", + "required": [ + "queries" + ], + "properties": { + "queries": { + "type": "array", + "items": { + "$ref": "#/components/schemas/QueryCatalogEntry" + } + } + } + }, + "QueryCatalogEntry": { + "type": "object", + "description": "One entry in the stored-query catalog (`GET /queries`).", + "required": [ + "name", + "tool_name", + "mutation", + "params" + ], + "properties": { + "description": { + "type": [ + "string", + "null" + ] + }, + "instruction": { + "type": [ + "string", + "null" + ] + }, + "mutation": { + "type": "boolean", + "description": "`true` for a stored mutation β†’ an MCP read-only hint of `false`." + }, + "name": { + "type": "string", + "description": "Registry key / invoke path segment (`POST /queries/{name}`)." + }, + "params": { + "type": "array", + "items": { + "$ref": "#/components/schemas/ParamDescriptor" + } + }, + "tool_name": { + "type": "string", + "description": "MCP tool id (the `tool_name` override, else `name`)." + } + } + }, "QueryRequest": { "type": "object", "description": "Inline read-query request for `POST /query`.\n\nFriendlier-named alternative to [`ReadRequest`] for ad-hoc reads and\nAI-agent integration. 
Mutations are rejected with 400 β€” use `POST\n/mutate` (or its deprecated alias `POST /change`) for write queries.\nField names are deliberately short (`query`, `name`) to match the GQ\nkeyword and the CLI `-e` flag.", diff --git a/scripts/check-agents-md.sh b/scripts/check-agents-md.sh index abc6469..02a177a 100755 --- a/scripts/check-agents-md.sh +++ b/scripts/check-agents-md.sh @@ -34,10 +34,15 @@ PY canonical=() while IFS= read -r line; do canonical+=("$line") -done < <(find docs -type f -name '*.md' ! -path 'docs/releases/*' ! -path 'docs/internal/*' | sort) +done < <(find docs -type f -name '*.md' ! -path 'docs/releases/*' ! -path 'docs/internal/*' ! -path 'docs/rfcs/*' | sort) if [[ -d docs/releases ]]; then canonical+=("docs/releases/") fi +# RFCs are a growing collection (like releases): represent the directory, not +# every per-RFC file. The dir must be linked from an audience index. +if [[ -d docs/rfcs ]]; then + canonical+=("docs/rfcs/") +fi linked=() for index_file in "${index_files[@]}"; do diff --git a/scripts/install.ps1 b/scripts/install.ps1 new file mode 100644 index 0000000..3bfd0f1 --- /dev/null +++ b/scripts/install.ps1 @@ -0,0 +1,151 @@ +param( + [string]$RepoSlug = "ModernRelay/omnigraph", + [string]$InstallDir = "$env:USERPROFILE\.local\bin", + [ValidateSet("stable", "edge")] + [string]$ReleaseChannel = "stable", + [string]$Version = "" +) + +$ErrorActionPreference = "Stop" + +$assetName = "omnigraph-windows-x86_64.zip" +$assetStem = "omnigraph-windows-x86_64" +$workDir = Join-Path ([System.IO.Path]::GetTempPath()) ("omnigraph-install-" + [System.Guid]::NewGuid().ToString("N")) +$selectedChannel = "" + +function Write-Log { + param([string]$Message) + Write-Host "==> $Message" +} + +function Get-ReleaseBaseUrl { + param([string]$Channel) + + if ($Version -ne "") { + return "https://github.com/$RepoSlug/releases/download/$Version" + } + + if ($Channel -eq "stable") { + return "https://github.com/$RepoSlug/releases/latest/download" + } + + if 
($Channel -eq "edge") { + return "https://github.com/$RepoSlug/releases/download/edge" + } + + throw "unsupported ReleaseChannel '$Channel' (expected stable or edge)" +} + +function Download-ReleaseFiles { + param( + [string]$BaseUrl, + [string]$ArchivePath, + [string]$ChecksumPath + ) + + try { + Invoke-WebRequest -UseBasicParsing -Uri "$BaseUrl/$assetName" -OutFile $ArchivePath + Invoke-WebRequest -UseBasicParsing -Uri "$BaseUrl/$assetStem.sha256" -OutFile $ChecksumPath + return $true + } catch { + return $false + } +} + +function Verify-Checksum { # Compares the archive's SHA256 with the first whitespace-delimited token of the checksum file; throws on an empty digest or a mismatch. + param( + [string]$ArchivePath, + [string]$ChecksumPath + ) + + $checksumText = ([string](Get-Content -Path $ChecksumPath -Raw)).Trim() # [string] cast: -Raw can yield $null for an empty file (NOTE(review): confirm per PowerShell version); the cast turns that into "" so the explicit digest check below reports the problem + $expected = ($checksumText -split "\s+")[0].ToLowerInvariant() + if ($expected -eq "") { + throw "checksum file did not contain a SHA256 digest" + } + + $actual = (Get-FileHash -Path $ArchivePath -Algorithm SHA256).Hash.ToLowerInvariant() + if ($actual -ne $expected) { + throw "checksum verification failed for $assetName" + } +} + +function Install-FromDirectory { + param([string]$SourceDir) + + New-Item -ItemType Directory -Force -Path $InstallDir | Out-Null + Copy-Item -Path (Join-Path $SourceDir "omnigraph.exe") -Destination (Join-Path $InstallDir "omnigraph.exe") -Force + Copy-Item -Path (Join-Path $SourceDir "omnigraph-server.exe") -Destination (Join-Path $InstallDir "omnigraph-server.exe") -Force +} + +function Install-FromRelease { + New-Item -ItemType Directory -Force -Path $workDir | Out-Null + + $archivePath = Join-Path $workDir $assetName + $checksumPath = Join-Path $workDir "$assetStem.sha256" + + if ($Version -ne "") { + $script:selectedChannel = $Version + $baseUrl = Get-ReleaseBaseUrl -Channel $ReleaseChannel + Write-Log "Downloading $assetName from $Version" + if (!(Download-ReleaseFiles -BaseUrl $baseUrl -ArchivePath $archivePath -ChecksumPath $checksumPath)) { + throw "no published binary found for $assetName at release $Version" + } + } else { + $script:selectedChannel = 
$ReleaseChannel + $baseUrl = Get-ReleaseBaseUrl -Channel $selectedChannel + Write-Log "Downloading $assetName from $selectedChannel" + if (!(Download-ReleaseFiles -BaseUrl $baseUrl -ArchivePath $archivePath -ChecksumPath $checksumPath)) { + if ($ReleaseChannel -ne "stable") { + throw "no published binary found for $assetName on channel $ReleaseChannel" + } + + Write-Log "Stable release binaries are not published yet; falling back to edge" + $script:selectedChannel = "edge" + $baseUrl = Get-ReleaseBaseUrl -Channel $selectedChannel + if (!(Download-ReleaseFiles -BaseUrl $baseUrl -ArchivePath $archivePath -ChecksumPath $checksumPath)) { + throw "no published binary found for $assetName on stable or edge; build from source" + } + } + } + + Verify-Checksum -ArchivePath $archivePath -ChecksumPath $checksumPath + + $extractDir = Join-Path $workDir "extract" + New-Item -ItemType Directory -Force -Path $extractDir | Out-Null + Expand-Archive -Path $archivePath -DestinationPath $extractDir -Force + Install-FromDirectory -SourceDir $extractDir +} + +function Print-Summary { + $omnigraphPath = Join-Path $InstallDir "omnigraph.exe" + $serverPath = Join-Path $InstallDir "omnigraph-server.exe" + + Write-Host "" + Write-Host "Installed:" + Write-Host " $omnigraphPath" + Write-Host " $serverPath" + Write-Host "" + Write-Host "Verify:" + Write-Host " $omnigraphPath version" + Write-Host " $serverPath --help" + Write-Host "" + + if ($selectedChannel -ne "") { + Write-Host "Installed from release channel: $selectedChannel" + } + + $pathParts = $env:Path -split [System.IO.Path]::PathSeparator + if ($pathParts -notcontains $InstallDir) { + Write-Host "Add $InstallDir to PATH if needed." 
+ } +} + +try { + Install-FromRelease + Print-Summary +} finally { + if (Test-Path $workDir) { + Remove-Item -Path $workDir -Recurse -Force + } +} diff --git a/scripts/local-rustfs-bootstrap.sh b/scripts/local-rustfs-bootstrap.sh index 6327f77..c4fdcbe 100755 --- a/scripts/local-rustfs-bootstrap.sh +++ b/scripts/local-rustfs-bootstrap.sh @@ -6,7 +6,14 @@ SOURCE_REF="${SOURCE_REF:-main}" RELEASE_CHANNEL="${RELEASE_CHANNEL:-edge}" WORKDIR="${WORKDIR:-$PWD/.omnigraph-rustfs-demo}" RUSTFS_CONTAINER_NAME="${RUSTFS_CONTAINER_NAME:-omnigraph-rustfs-demo}" -RUSTFS_IMAGE="${RUSTFS_IMAGE:-rustfs/rustfs:latest}" +# Pinned to 1.0.0-beta.3 (2026-05-14) β€” the last known-good tag, matching CI +# (.github/workflows/ci.yml). `rustfs/rustfs:latest` (1.0.0-beta.4, 2026-05-21) +# added a credentials-policy check that refuses to start when the access/secret +# keys are values it considers "default" (rustfsadmin/rustfsadmin here). This +# script still works on beta.4+ because it passes +# RUSTFS_ALLOW_INSECURE_DEFAULT_CREDENTIALS=true below β€” so overriding +# RUSTFS_IMAGE to a newer tag is safe. 
+RUSTFS_IMAGE="${RUSTFS_IMAGE:-rustfs/rustfs:1.0.0-beta.3}" RUSTFS_DATA_DIR="${RUSTFS_DATA_DIR:-$WORKDIR/rustfs-data}" BUCKET="${BUCKET:-omnigraph-local}" PREFIX="${PREFIX:-repos/context}" @@ -74,9 +81,6 @@ platform_asset_name() { Linux/x86_64) printf 'omnigraph-linux-x86_64.tar.gz\n' ;; - Darwin/x86_64) - printf 'omnigraph-macos-x86_64.tar.gz\n' - ;; Darwin/arm64) printf 'omnigraph-macos-arm64.tar.gz\n' ;; @@ -268,6 +272,7 @@ start_rustfs() { -v "$RUSTFS_DATA_DIR:/data" \ -e RUSTFS_ACCESS_KEY="$AWS_ACCESS_KEY_ID" \ -e RUSTFS_SECRET_KEY="$AWS_SECRET_ACCESS_KEY" \ + -e RUSTFS_ALLOW_INSECURE_DEFAULT_CREDENTIALS=true \ "$RUSTFS_IMAGE" \ /data >/dev/null } diff --git a/scripts/update-homebrew-formula.sh b/scripts/update-homebrew-formula.sh index 90a5dea..f2f0df9 100755 --- a/scripts/update-homebrew-formula.sh +++ b/scripts/update-homebrew-formula.sh @@ -64,20 +64,8 @@ cat >"$FORMULA_PATH" <<EOF class Omnigraph < Formula desc "Typed property graph database with Git-style workflows" homepage "https://github.com/${REPO_SLUG}" - license "MIT" version "${VERSION}" - - on_macos do - depends_on arch: :arm64 - url "${MACOS_ARM_URL}" - sha256 "${MACOS_ARM_SHA}" - end - - on_linux do - url "${LINUX_X86_URL}" - sha256 "${LINUX_X86_SHA}" - end - + license "MIT" head "https://github.com/${REPO_SLUG}.git", branch: "main" livecheck do @@ -85,6 +73,21 @@ class Omnigraph < Formula regex(/^v?(\\d+(?:\\.\\d+)+)$/i) end + on_macos do + depends_on arch: :arm64 + on_arm do + url "${MACOS_ARM_URL}" + sha256 "${MACOS_ARM_SHA}" + end + end + + on_linux do + on_intel do + url "${LINUX_X86_URL}" + sha256 "${LINUX_X86_SHA}" + end + end + def install bin.install "omnigraph", "omnigraph-server" end