commit 338289656a533b57bef66fd5437229f8e3d19968 Author: andrew Date: Fri Apr 10 20:49:41 2026 +0300 Initial public Omnigraph repository diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..ab6a1f8 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,4 @@ +** +!Dockerfile +!docker/entrypoint.sh +!target/release/omnigraph-server diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..8325681 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,123 @@ +name: CI + +on: + pull_request: + push: + branches: + - main + tags: + - "v*" + workflow_dispatch: + +concurrency: + group: ci-${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + test: + name: Test Workspace + runs-on: ubuntu-latest + timeout-minutes: 45 + permissions: + contents: read + env: + CARGO_TERM_COLOR: always + steps: + - name: Checkout source + uses: actions/checkout@v4 + + - name: Install system dependencies + run: | + sudo apt-get update + sudo apt-get install -y protobuf-compiler libprotobuf-dev + + - name: Install Rust stable + uses: dtolnay/rust-toolchain@stable + with: + toolchain: stable + + - name: Cache Rust build data + uses: Swatinem/rust-cache@v2 + with: + workspaces: | + . 
-> target + + - name: Run workspace tests + run: cargo test --workspace --locked + + rustfs_integration: + name: RustFS S3 Integration + needs: test + runs-on: ubuntu-latest + timeout-minutes: 45 + permissions: + contents: read + env: + AWS_ACCESS_KEY_ID: rustfsadmin + AWS_SECRET_ACCESS_KEY: rustfsadmin + AWS_REGION: us-east-1 + AWS_ENDPOINT_URL: http://127.0.0.1:9000 + AWS_ENDPOINT_URL_S3: http://127.0.0.1:9000 + AWS_ALLOW_HTTP: "true" + AWS_S3_FORCE_PATH_STYLE: "true" + OMNIGRAPH_S3_TEST_BUCKET: omnigraph-ci + OMNIGRAPH_S3_TEST_PREFIX: github-actions + CARGO_TERM_COLOR: always + steps: + - name: Checkout source + uses: actions/checkout@v4 + + - name: Install system dependencies + run: | + sudo apt-get update + sudo apt-get install -y protobuf-compiler libprotobuf-dev python3-pip + + - name: Install Rust stable + uses: dtolnay/rust-toolchain@stable + with: + toolchain: stable + + - name: Cache Rust build data + uses: Swatinem/rust-cache@v2 + with: + workspaces: | + . -> target + + - name: Start RustFS + run: | + docker rm -f rustfs >/dev/null 2>&1 || true + docker run -d \ + --name rustfs \ + -p 9000:9000 \ + -p 9001:9001 \ + -e RUSTFS_ACCESS_KEY="${AWS_ACCESS_KEY_ID}" \ + -e RUSTFS_SECRET_KEY="${AWS_SECRET_ACCESS_KEY}" \ + rustfs/rustfs:latest \ + /data + + - name: Install AWS CLI + run: | + python3 -m pip install --user awscli + echo "$HOME/.local/bin" >> "$GITHUB_PATH" + + - name: Create RustFS test bucket + run: | + for _ in $(seq 1 30); do + if aws --endpoint-url "${AWS_ENDPOINT_URL_S3}" s3api list-buckets >/dev/null 2>&1; then + break + fi + sleep 2 + done + aws --endpoint-url "${AWS_ENDPOINT_URL_S3}" \ + s3api create-bucket \ + --bucket "${OMNIGRAPH_S3_TEST_BUCKET}" >/dev/null 2>&1 || true + + - name: Run RustFS-backed repo tests + run: | + cargo test --locked -p omnigraph --test s3_storage -- --nocapture + cargo test --locked -p omnigraph-server --test server server_opens_s3_repo_directly_and_serves_snapshot_and_read -- --nocapture + cargo test --locked -p 
omnigraph-cli --test system_local local_cli_s3_end_to_end_init_load_read_flow -- --nocapture + + - name: Dump RustFS logs on failure + if: failure() + run: docker logs rustfs diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml new file mode 100644 index 0000000..ec2e3f6 --- /dev/null +++ b/.github/workflows/release.yml @@ -0,0 +1,68 @@ +name: Release + +on: + push: + tags: + - "v*" + workflow_dispatch: + +jobs: + build_release: + name: Build ${{ matrix.asset_name }} + runs-on: ${{ matrix.runner }} + permissions: + contents: write + strategy: + fail-fast: false + matrix: + include: + - runner: ubuntu-latest + asset_name: omnigraph-linux-x86_64 + - runner: macos-13 + asset_name: omnigraph-macos-x86_64 + - runner: macos-14 + asset_name: omnigraph-macos-arm64 + env: + CARGO_TERM_COLOR: always + steps: + - name: Checkout source + uses: actions/checkout@v4 + + - name: Install Linux dependencies + if: runner.os == 'Linux' + run: | + sudo apt-get update + sudo apt-get install -y protobuf-compiler libprotobuf-dev + + - name: Install macOS dependencies + if: runner.os == 'macOS' + run: brew install protobuf + + - name: Install Rust stable + uses: dtolnay/rust-toolchain@stable + with: + toolchain: stable + + - name: Cache Rust build data + uses: Swatinem/rust-cache@v2 + with: + workspaces: | + . 
-> target + + - name: Build release binaries + run: cargo build --release --locked -p omnigraph-cli -p omnigraph-server + + - name: Package release archive + run: | + mkdir -p release + install -m 0755 target/release/omnigraph release/omnigraph + install -m 0755 target/release/omnigraph-server release/omnigraph-server + tar -C release -czf "${{ matrix.asset_name }}.tar.gz" omnigraph omnigraph-server + shasum -a 256 "${{ matrix.asset_name }}.tar.gz" > "${{ matrix.asset_name }}.sha256" + + - name: Publish GitHub release assets + uses: softprops/action-gh-release@v2 + with: + files: | + ${{ matrix.asset_name }}.tar.gz + ${{ matrix.asset_name }}.sha256 diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..6f70bdc --- /dev/null +++ b/.gitignore @@ -0,0 +1,18 @@ +/target +**/target +*.lance +*.nano +*.nanograph +.DS_Store +.env +.env.* +*.tfstate +*.tfstate.* +.terraform/ +.terraform.lock.hcl +*.tfvars +!*.tfvars.example +__pycache__/ +*.pyc +demo/*.omni/ +.omnigraph-rustfs-demo/ diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000..653f297 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,13 @@ +# Code Of Conduct + +This project follows a simple rule: be direct, respectful, and constructive. + +Expected behavior: + +- focus on technical substance +- assume good intent +- give actionable feedback +- avoid harassment, personal attacks, and pile-ons + +Maintainers may remove comments, issues, or pull requests that make the project +harder to collaborate in productively. diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..65d1e24 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,23 @@ +# Contributing + +Small bug fixes and documentation improvements are welcome directly through pull +requests. + +For larger changes, please open an issue or design discussion first so the +proposed direction is clear before implementation starts. 
+ +## Development + +```bash +cargo build --workspace +cargo test --workspace +``` + +If you touch S3-backed flows, the CI model uses a local RustFS instance for +integration tests. + +## Pull Requests + +- keep changes focused +- include tests for behavior changes when practical +- update public docs when the user-facing surface changes diff --git a/Cargo.lock b/Cargo.lock new file mode 100644 index 0000000..4c235ad --- /dev/null +++ b/Cargo.lock @@ -0,0 +1,7646 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 4 + +[[package]] +name = "addr2line" +version = "0.25.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b5d307320b3181d6d7954e663bd7c774a838b8220fe0593c86d9fb09f498b4b" +dependencies = [ + "gimli", +] + +[[package]] +name = "adler2" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" + +[[package]] +name = "aes" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b169f7a6d4742236a0a00c541b845991d0ac43e546831af1249753ab4c3aa3a0" +dependencies = [ + "cfg-if", + "cipher", + "cpufeatures", +] + +[[package]] +name = "ahash" +version = "0.8.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" +dependencies = [ + "cfg-if", + "const-random", + "getrandom 0.3.4", + "once_cell", + "version_check", + "zerocopy", +] + +[[package]] +name = "aho-corasick" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddd31a130427c27518df266943a5308ed92d4b226cc639f5a8f1002816174301" +dependencies = [ + "memchr", +] + +[[package]] +name = "alloc-no-stdlib" +version = "2.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"cc7bb162ec39d46ab1ca8c77bf72e890535becd1751bb45f64c597edb4c8c6b3" + +[[package]] +name = "alloc-stdlib" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94fb8275041c72129eb51b7d0322c29b8387a0386127718b096429201a5d6ece" +dependencies = [ + "alloc-no-stdlib", +] + +[[package]] +name = "allocator-api2" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" + +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + +[[package]] +name = "anstream" +version = "0.6.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43d5b281e737544384e969a5ccad3f1cdd24b48086a0fc1b2a5262a26b8f4f4a" +dependencies = [ + "anstyle", + "anstyle-parse", + "anstyle-query", + "anstyle-wincon", + "colorchoice", + "is_terminal_polyfill", + "utf8parse", +] + +[[package]] +name = "anstyle" +version = "1.0.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5192cca8006f1fd4f7237516f40fa183bb07f8fbdfedaa0036de5ea9b0b45e78" + +[[package]] +name = "anstyle-parse" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e7644824f0aa2c7b9384579234ef10eb7efb6a0deb83f9630a49594dd9c15c2" +dependencies = [ + "utf8parse", +] + +[[package]] +name = "anstyle-query" +version = "1.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "40c48f72fd53cd289104fc64099abca73db4166ad86ea0b4341abe65af83dadc" +dependencies = [ + "windows-sys 0.61.2", +] + +[[package]] +name = "anstyle-wincon" +version = "3.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"291e6a250ff86cd4a820112fb8898808a366d8f9f58ce16d1f538353ad55747d" +dependencies = [ + "anstyle", + "once_cell_polyfill", + "windows-sys 0.61.2", +] + +[[package]] +name = "anyhow" +version = "1.0.102" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f202df86484c868dbad7eaa557ef785d5c66295e41b460ef922eca0723b842c" + +[[package]] +name = "ar_archive_writer" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7eb93bbb63b9c227414f6eb3a0adfddca591a8ce1e9b60661bb08969b87e340b" +dependencies = [ + "object", +] + +[[package]] +name = "arc-swap" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a07d1f37ff60921c83bdfc7407723bdefe89b44b98a9b772f225c8f9d67141a6" +dependencies = [ + "rustversion", +] + +[[package]] +name = "arrayref" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76a2e8124351fda1ef8aaaa3bbd7ebbcb486bbcd4225aca0aa0d84bb2db8fecb" + +[[package]] +name = "arrayvec" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "23b62fc65de8e4e7f52534fb52b0f3ed04746ae267519eef2a83941e8085068b" + +[[package]] +name = "arrayvec" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" + +[[package]] +name = "arrow" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e4754a624e5ae42081f464514be454b39711daae0458906dacde5f4c632f33a8" +dependencies = [ + "arrow-arith", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-csv", + "arrow-data", + "arrow-ipc", + "arrow-json", + "arrow-ord", + "arrow-row", + "arrow-schema", + "arrow-select", + "arrow-string", +] + +[[package]] +name = "arrow-arith" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"f7b3141e0ec5145a22d8694ea8b6d6f69305971c4fa1c1a13ef0195aef2d678b" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "chrono", + "num-traits", +] + +[[package]] +name = "arrow-array" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c8955af33b25f3b175ee10af580577280b4bd01f7e823d94c7cdef7cf8c9aef" +dependencies = [ + "ahash", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "chrono", + "chrono-tz", + "half", + "hashbrown 0.16.1", + "num-complex", + "num-integer", + "num-traits", +] + +[[package]] +name = "arrow-buffer" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c697ddca96183182f35b3a18e50b9110b11e916d7b7799cbfd4d34662f2c56c2" +dependencies = [ + "bytes", + "half", + "num-bigint", + "num-traits", +] + +[[package]] +name = "arrow-cast" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "646bbb821e86fd57189c10b4fcdaa941deaf4181924917b0daa92735baa6ada5" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-ord", + "arrow-schema", + "arrow-select", + "atoi", + "base64", + "chrono", + "comfy-table", + "half", + "lexical-core", + "num-traits", + "ryu", +] + +[[package]] +name = "arrow-csv" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8da746f4180004e3ce7b83c977daf6394d768332349d3d913998b10a120b790a" +dependencies = [ + "arrow-array", + "arrow-cast", + "arrow-schema", + "chrono", + "csv", + "csv-core", + "regex", +] + +[[package]] +name = "arrow-data" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fdd994a9d28e6365aa78e15da3f3950c0fdcea6b963a12fa1c391afb637b304" +dependencies = [ + "arrow-buffer", + "arrow-schema", + "half", + "num-integer", + "num-traits", +] + +[[package]] +name = "arrow-ipc" +version = "57.3.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "abf7df950701ab528bf7c0cf7eeadc0445d03ef5d6ffc151eaae6b38a58feff1" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "flatbuffers", + "lz4_flex 0.12.1", + "zstd", +] + +[[package]] +name = "arrow-json" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ff8357658bedc49792b13e2e862b80df908171275f8e6e075c460da5ee4bf86" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-schema", + "chrono", + "half", + "indexmap 2.13.0", + "itoa", + "lexical-core", + "memchr", + "num-traits", + "ryu", + "serde_core", + "serde_json", + "simdutf8", +] + +[[package]] +name = "arrow-ord" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7d8f1870e03d4cbed632959498bcc84083b5a24bded52905ae1695bd29da45b" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", +] + +[[package]] +name = "arrow-row" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "18228633bad92bff92a95746bbeb16e5fc318e8382b75619dec26db79e4de4c0" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "half", +] + +[[package]] +name = "arrow-schema" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c872d36b7bf2a6a6a2b40de9156265f0242910791db366a2c17476ba8330d68" +dependencies = [ + "bitflags", + "serde_core", + "serde_json", +] + +[[package]] +name = "arrow-select" +version = "57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68bf3e3efbd1278f770d67e5dc410257300b161b93baedb3aae836144edcaf4b" +dependencies = [ + "ahash", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "num-traits", +] + +[[package]] +name = "arrow-string" +version = 
"57.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85e968097061b3c0e9fe3079cf2e703e487890700546b5b0647f60fca1b5a8d8" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "memchr", + "num-traits", + "regex", + "regex-syntax", +] + +[[package]] +name = "ascii-canvas" +version = "4.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef1e3e699d84ab1b0911a1010c5c106aa34ae89aeac103be5ce0c3859db1e891" +dependencies = [ + "term", +] + +[[package]] +name = "assert_cmd" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a686bbee5efb88a82df0621b236e74d925f470e5445d3220a5648b892ec99c9" +dependencies = [ + "anstyle", + "bstr", + "libc", + "predicates", + "predicates-core", + "predicates-tree", + "wait-timeout", +] + +[[package]] +name = "async-channel" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "924ed96dd52d1b75e9c1a3e6275715fd320f5f9439fb5a4a11fa51f4221158d2" +dependencies = [ + "concurrent-queue", + "event-listener-strategy", + "futures-core", + "pin-project-lite", +] + +[[package]] +name = "async-compression" +version = "0.4.41" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0f9ee0f6e02ffd7ad5816e9464499fba7b3effd01123b515c41d1697c43dad1" +dependencies = [ + "compression-codecs", + "compression-core", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "async-lock" +version = "3.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "290f7f2596bd5b78a9fec8088ccd89180d7f9f55b94b0576823bbbdc72ee8311" +dependencies = [ + "event-listener", + "event-listener-strategy", + "pin-project-lite", +] + +[[package]] +name = "async-recursion" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"3b43422f69d8ff38f95f1b2bb76517c91589a924d1559a0e935d7c8ce0274c11" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.115", +] + +[[package]] +name = "async-trait" +version = "0.1.89" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9035ad2d096bed7955a320ee7e2230574d28fd3c3a0f186cbea1ff3c7eed5dbb" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.115", +] + +[[package]] +name = "async_cell" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "447ab28afbb345f5408b120702a44e5529ebf90b1796ec76e9528df8e288e6c2" +dependencies = [ + "loom", +] + +[[package]] +name = "atoi" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f28d99ec8bfea296261ca1af174f24225171fea9664ba9003cbebee704810528" +dependencies = [ + "num-traits", +] + +[[package]] +name = "atomic-waker" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1505bd5d3d116872e7271a6d4e16d81d0c8570876c8de68093a09ac269d8aac0" + +[[package]] +name = "autocfg" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c08606f8c3cbf4ce6ec8e28fb0014a2c086708fe954eaa885384a6165172e7e8" + +[[package]] +name = "aws-config" +version = "1.8.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a8fc176d53d6fe85017f230405e3255cedb4a02221cb55ed6d76dccbbb099b2" +dependencies = [ + "aws-credential-types", + "aws-runtime", + "aws-sdk-sso", + "aws-sdk-ssooidc", + "aws-sdk-sts", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-types", + "bytes", + "fastrand", + "hex", + "http 1.4.0", + "ring", + "time", + "tokio", + "tracing", + "url", + "zeroize", +] + +[[package]] +name = "aws-credential-types" +version = "1.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"8f20799b373a1be121fe3005fba0c2090af9411573878f224df44b42727fcaf7" +dependencies = [ + "aws-smithy-async", + "aws-smithy-runtime-api", + "aws-smithy-types", + "zeroize", +] + +[[package]] +name = "aws-lc-rs" +version = "1.16.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a054912289d18629dc78375ba2c3726a3afe3ff71b4edba9dedfca0e3446d1fc" +dependencies = [ + "aws-lc-sys", + "zeroize", +] + +[[package]] +name = "aws-lc-sys" +version = "0.39.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83a25cf98105baa966497416dbd42565ce3a8cf8dbfd59803ec9ad46f3126399" +dependencies = [ + "cc", + "cmake", + "dunce", + "fs_extra", +] + +[[package]] +name = "aws-runtime" +version = "1.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fc0651c57e384202e47153c1260b84a9936e19803d747615edf199dc3b98d17" +dependencies = [ + "aws-credential-types", + "aws-sigv4", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-types", + "bytes", + "bytes-utils", + "fastrand", + "http 1.4.0", + "http-body 1.0.1", + "percent-encoding", + "pin-project-lite", + "tracing", + "uuid", +] + +[[package]] +name = "aws-sdk-sso" +version = "1.97.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9aadc669e184501caaa6beafb28c6267fc1baef0810fb58f9b205485ca3f2567" +dependencies = [ + "aws-credential-types", + "aws-runtime", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-observability", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-types", + "bytes", + "fastrand", + "http 0.2.12", + "http 1.4.0", + "regex-lite", + "tracing", +] + +[[package]] +name = "aws-sdk-ssooidc" +version = "1.99.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1342a7db8f358d3de0aed2007a0b54e875458e39848d54cc1d46700b2bfcb0a8" +dependencies = [ + 
"aws-credential-types", + "aws-runtime", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-observability", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-types", + "bytes", + "fastrand", + "http 0.2.12", + "http 1.4.0", + "regex-lite", + "tracing", +] + +[[package]] +name = "aws-sdk-sts" +version = "1.101.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab41ad64e4051ecabeea802d6a17845a91e83287e1dd249e6963ea1ba78c428a" +dependencies = [ + "aws-credential-types", + "aws-runtime", + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-json", + "aws-smithy-observability", + "aws-smithy-query", + "aws-smithy-runtime", + "aws-smithy-runtime-api", + "aws-smithy-types", + "aws-smithy-xml", + "aws-types", + "fastrand", + "http 0.2.12", + "http 1.4.0", + "regex-lite", + "tracing", +] + +[[package]] +name = "aws-sigv4" +version = "1.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0b660013a6683ab23797778e21f1f854744fdf05f68204b4cca4c8c04b5d1f4" +dependencies = [ + "aws-credential-types", + "aws-smithy-http", + "aws-smithy-runtime-api", + "aws-smithy-types", + "bytes", + "form_urlencoded", + "hex", + "hmac", + "http 0.2.12", + "http 1.4.0", + "percent-encoding", + "sha2", + "time", + "tracing", +] + +[[package]] +name = "aws-smithy-async" +version = "1.2.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2ffcaf626bdda484571968400c326a244598634dc75fd451325a54ad1a59acfc" +dependencies = [ + "futures-util", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "aws-smithy-http" +version = "0.63.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba1ab2dc1c2c3749ead27180d333c42f11be8b0e934058fb4b2258ee8dbe5231" +dependencies = [ + "aws-smithy-runtime-api", + "aws-smithy-types", + "bytes", + "bytes-utils", + "futures-core", + "futures-util", + "http 1.4.0", + "http-body 1.0.1", + 
"http-body-util", + "percent-encoding", + "pin-project-lite", + "pin-utils", + "tracing", +] + +[[package]] +name = "aws-smithy-http-client" +version = "1.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a2f165a7feee6f263028b899d0a181987f4fa7179a6411a32a439fba7c5f769" +dependencies = [ + "aws-smithy-async", + "aws-smithy-runtime-api", + "aws-smithy-types", + "h2", + "http 1.4.0", + "hyper", + "hyper-rustls", + "hyper-util", + "pin-project-lite", + "rustls", + "rustls-native-certs", + "rustls-pki-types", + "tokio", + "tokio-rustls", + "tower", + "tracing", +] + +[[package]] +name = "aws-smithy-json" +version = "0.62.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9648b0bb82a2eedd844052c6ad2a1a822d1f8e3adee5fbf668366717e428856a" +dependencies = [ + "aws-smithy-types", +] + +[[package]] +name = "aws-smithy-observability" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a06c2315d173edbf1920da8ba3a7189695827002e4c0fc961973ab1c54abca9c" +dependencies = [ + "aws-smithy-runtime-api", +] + +[[package]] +name = "aws-smithy-query" +version = "0.60.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a56d79744fb3edb5d722ef79d86081e121d3b9422cb209eb03aea6aa4f21ebd" +dependencies = [ + "aws-smithy-types", + "urlencoding", +] + +[[package]] +name = "aws-smithy-runtime" +version = "1.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "028999056d2d2fd58a697232f9eec4a643cf73a71cf327690a7edad1d2af2110" +dependencies = [ + "aws-smithy-async", + "aws-smithy-http", + "aws-smithy-http-client", + "aws-smithy-observability", + "aws-smithy-runtime-api", + "aws-smithy-types", + "bytes", + "fastrand", + "http 0.2.12", + "http 1.4.0", + "http-body 0.4.6", + "http-body 1.0.1", + "http-body-util", + "pin-project-lite", + "pin-utils", + "tokio", + "tracing", +] + +[[package]] +name = "aws-smithy-runtime-api" 
+version = "1.11.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "876ab3c9c29791ba4ba02b780a3049e21ec63dabda09268b175272c3733a79e6" +dependencies = [ + "aws-smithy-async", + "aws-smithy-types", + "bytes", + "http 0.2.12", + "http 1.4.0", + "pin-project-lite", + "tokio", + "tracing", + "zeroize", +] + +[[package]] +name = "aws-smithy-types" +version = "1.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d73dbfbaa8e4bc57b9045137680b958d274823509a360abfd8e1d514d40c95c" +dependencies = [ + "base64-simd", + "bytes", + "bytes-utils", + "http 0.2.12", + "http 1.4.0", + "http-body 0.4.6", + "http-body 1.0.1", + "http-body-util", + "itoa", + "num-integer", + "pin-project-lite", + "pin-utils", + "ryu", + "serde", + "time", +] + +[[package]] +name = "aws-smithy-xml" +version = "0.60.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ce02add1aa3677d022f8adf81dcbe3046a95f17a1b1e8979c145cd21d3d22b3" +dependencies = [ + "xmlparser", +] + +[[package]] +name = "aws-types" +version = "1.3.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47c8323699dd9b3c8d5b3c13051ae9cdef58fd179957c882f8374dd8725962d9" +dependencies = [ + "aws-credential-types", + "aws-smithy-async", + "aws-smithy-runtime-api", + "aws-smithy-types", + "rustc_version", + "tracing", +] + +[[package]] +name = "axum" +version = "0.8.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b52af3cb4058c895d37317bb27508dccc8e5f2d39454016b297bf4a400597b8" +dependencies = [ + "axum-core", + "axum-macros", + "bytes", + "form_urlencoded", + "futures-util", + "http 1.4.0", + "http-body 1.0.1", + "http-body-util", + "hyper", + "hyper-util", + "itoa", + "matchit", + "memchr", + "mime", + "percent-encoding", + "pin-project-lite", + "serde_core", + "serde_json", + "serde_path_to_error", + "serde_urlencoded", + "sync_wrapper", + "tokio", + "tower", + "tower-layer", + 
"tower-service", + "tracing", +] + +[[package]] +name = "axum-core" +version = "0.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08c78f31d7b1291f7ee735c1c6780ccde7785daae9a9206026862dab7d8792d1" +dependencies = [ + "bytes", + "futures-core", + "http 1.4.0", + "http-body 1.0.1", + "http-body-util", + "mime", + "pin-project-lite", + "sync_wrapper", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "axum-macros" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "604fde5e028fea851ce1d8570bbdc034bec850d157f7569d10f347d06808c05c" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.115", +] + +[[package]] +name = "backon" +version = "1.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cffb0e931875b666fc4fcb20fee52e9bbd1ef836fd9e9e04ec21555f9f85f7ef" +dependencies = [ + "fastrand", + "gloo-timers", + "tokio", +] + +[[package]] +name = "backtrace" +version = "0.3.76" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb531853791a215d7c62a30daf0dde835f381ab5de4589cfe7c649d2cbe92bd6" +dependencies = [ + "addr2line", + "cfg-if", + "libc", + "miniz_oxide", + "object", + "rustc-demangle", + "windows-link", +] + +[[package]] +name = "base64" +version = "0.22.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" + +[[package]] +name = "base64-simd" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "339abbe78e73178762e23bea9dfd08e697eb3f3301cd4be981c0f78ba5859195" +dependencies = [ + "outref", + "vsimd", +] + +[[package]] +name = "base64ct" +version = "1.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2af50177e190e07a26ab74f8b1efbfe2ef87da2116221318cb1c2e82baf7de06" + +[[package]] +name = "bigdecimal" +version = "0.4.10" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "4d6867f1565b3aad85681f1015055b087fcfd840d6aeee6eee7f2da317603695" +dependencies = [ + "autocfg", + "libm", + "num-bigint", + "num-integer", + "num-traits", +] + +[[package]] +name = "bit-set" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3" +dependencies = [ + "bit-vec", +] + +[[package]] +name = "bit-vec" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" + +[[package]] +name = "bitflags" +version = "2.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "843867be96c8daad0d758b57df9392b6d8d271134fce549de6ce169ff98a92af" + +[[package]] +name = "bitpacking" +version = "0.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96a7139abd3d9cebf8cd6f920a389cf3dc9576172e32f4563f188cae3c3eb019" +dependencies = [ + "crunchy", +] + +[[package]] +name = "bitvec" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bc2832c24239b0141d5674bb9174f9d68a8b5b3f2753311927c172ca46f7e9c" +dependencies = [ + "funty", + "radium", + "tap", + "wyz", +] + +[[package]] +name = "blake2" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46502ad458c9a52b69d4d4d32775c788b7a1b85e8bc9d482d92250fc0e3f8efe" +dependencies = [ + "digest", +] + +[[package]] +name = "blake3" +version = "1.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2468ef7d57b3fb7e16b576e8377cdbde2320c60e1491e961d11da40fc4f02a2d" +dependencies = [ + "arrayref", + "arrayvec 0.7.6", + "cc", + "cfg-if", + "constant_time_eq", + "cpufeatures", +] + +[[package]] +name = "block-buffer" +version = "0.10.4" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "3078c7629b62d3f0439517fa394996acacc5cbc91c5a20d8c658e77abd503a71" +dependencies = [ + "generic-array", +] + +[[package]] +name = "block-padding" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8894febbff9f758034a5b8e12d87918f56dfc64a8e1fe757d65e29041538d93" +dependencies = [ + "generic-array", +] + +[[package]] +name = "bon" +version = "3.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f47dbe92550676ee653353c310dfb9cf6ba17ee70396e1f7cf0a2020ad49b2fe" +dependencies = [ + "bon-macros", + "rustversion", +] + +[[package]] +name = "bon-macros" +version = "3.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "519bd3116aeeb42d5372c29d982d16d0170d3d4a5ed85fc7dd91642ffff3c67c" +dependencies = [ + "darling", + "ident_case", + "prettyplease", + "proc-macro2", + "quote", + "rustversion", + "syn 2.0.115", +] + +[[package]] +name = "borsh" +version = "1.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfd1e3f8955a5d7de9fab72fc8373fade9fb8a703968cb200ae3dc6cf08e185a" +dependencies = [ + "bytes", + "cfg_aliases", +] + +[[package]] +name = "brotli" +version = "8.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4bd8b9603c7aa97359dbd97ecf258968c95f3adddd6db2f7e7a5bef101c84560" +dependencies = [ + "alloc-no-stdlib", + "alloc-stdlib", + "brotli-decompressor", +] + +[[package]] +name = "brotli-decompressor" +version = "5.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "874bb8112abecc98cbd6d81ea4fa7e94fb9449648c93cc89aa40c81c24d7de03" +dependencies = [ + "alloc-no-stdlib", + "alloc-stdlib", +] + +[[package]] +name = "bstr" +version = "1.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63044e1ae8e69f3b5a92c736ca6269b8d12fa7efe39bf34ddb06d102cf0e2cab" +dependencies = 
[ + "memchr", + "regex-automata", + "serde", +] + +[[package]] +name = "bumpalo" +version = "3.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46c5e41b57b8bba42a04676d81cb89e9ee8e859a1a66f80a5a72e1cb76b34d43" + +[[package]] +name = "bytemuck" +version = "1.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8efb64bd706a16a1bdde310ae86b351e4d21550d98d056f22f8a7f7a2183fec" + +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + +[[package]] +name = "bytes" +version = "1.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" + +[[package]] +name = "bytes-utils" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7dafe3a8757b027e2be6e4e5601ed563c55989fcf1546e933c66c8eb3a058d35" +dependencies = [ + "bytes", + "either", +] + +[[package]] +name = "cbc" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26b52a9543ae338f279b96b0b9fed9c8093744685043739079ce85cd58f289a6" +dependencies = [ + "cipher", +] + +[[package]] +name = "cc" +version = "1.2.56" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aebf35691d1bfb0ac386a69bac2fde4dd276fb618cf8bf4f5318fe285e821bb2" +dependencies = [ + "find-msvc-tools", + "jobserver", + "libc", + "shlex", +] + +[[package]] +name = "cedar-policy" +version = "4.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50368b44367cd7664627bbee9bfe5721d10ab2433daf77645833645e8eb746da" +dependencies = [ + "cedar-policy-core", + "cedar-policy-formatter", + "itertools 0.14.0", + "linked-hash-map", + "miette", + "ref-cast", + "semver", + "serde", + "serde_json", + "serde_with", + "smol_str", + 
"thiserror", +] + +[[package]] +name = "cedar-policy-core" +version = "4.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c9700d95c08701d5e43d30756ab0ec791649c2a93dee1274fac0fe8a17c7b24f" +dependencies = [ + "chrono", + "educe", + "either", + "itertools 0.14.0", + "lalrpop", + "lalrpop-util", + "linked-hash-map", + "linked_hash_set", + "miette", + "nonempty", + "ref-cast", + "regex", + "rustc-literal-escaper", + "serde", + "serde_json", + "serde_with", + "smol_str", + "stacker", + "thiserror", + "unicode-security", +] + +[[package]] +name = "cedar-policy-formatter" +version = "4.9.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "18c03e1d143e1c222d2ea48453ab4f4b11e545ac5a268a15bb163769fe568b90" +dependencies = [ + "cedar-policy-core", + "itertools 0.14.0", + "logos", + "miette", + "pretty", + "regex", + "smol_str", +] + +[[package]] +name = "census" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4f4c707c6a209cbe82d10abd08e1ea8995e9ea937d2550646e02798948992be0" + +[[package]] +name = "cfg-if" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" + +[[package]] +name = "cfg_aliases" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" + +[[package]] +name = "chrono" +version = "0.4.43" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fac4744fb15ae8337dc853fee7fb3f4e48c0fbaa23d0afe49c447b4fab126118" +dependencies = [ + "iana-time-zone", + "js-sys", + "num-traits", + "serde", + "wasm-bindgen", + "windows-link", +] + +[[package]] +name = "chrono-tz" +version = "0.10.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"a6139a8597ed92cf816dfb33f5dd6cf0bb93a6adc938f11039f371bc5bcd26c3" +dependencies = [ + "chrono", + "phf", +] + +[[package]] +name = "cipher" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "773f3b9af64447d2ce9850330c473515014aa235e6a783b02db81ff39e4a3dad" +dependencies = [ + "crypto-common", + "inout", +] + +[[package]] +name = "clap" +version = "4.5.58" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63be97961acde393029492ce0be7a1af7e323e6bae9511ebfac33751be5e6806" +dependencies = [ + "clap_builder", + "clap_derive", +] + +[[package]] +name = "clap_builder" +version = "4.5.58" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f13174bda5dfd69d7e947827e5af4b0f2f94a4a3ee92912fba07a66150f21e2" +dependencies = [ + "anstream", + "anstyle", + "clap_lex", + "strsim", +] + +[[package]] +name = "clap_derive" +version = "4.5.55" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a92793da1a46a5f2a02a6f4c46c6496b28c43638adea8306fcb0caa1634f24e5" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn 2.0.115", +] + +[[package]] +name = "clap_lex" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a822ea5bc7590f9d40f1ba12c0dc3c2760f3482c6984db1573ad11031420831" + +[[package]] +name = "cmake" +version = "0.1.58" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c0f78a02292a74a88ac736019ab962ece0bc380e3f977bf72e376c5d78ff0678" +dependencies = [ + "cc", +] + +[[package]] +name = "color-eyre" +version = "0.6.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5920befb47832a6d61ee3a3a846565cfa39b331331e68a3b1d1116630f2f26d" +dependencies = [ + "backtrace", + "color-spantrace", + "eyre", + "indenter", + "once_cell", + "owo-colors", + "tracing-error", +] + +[[package]] +name = "color-spantrace" +version = "0.3.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8b88ea9df13354b55bc7234ebcce36e6ef896aca2e42a15de9e10edce01b427" +dependencies = [ + "once_cell", + "owo-colors", + "tracing-core", + "tracing-error", +] + +[[package]] +name = "colorchoice" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" + +[[package]] +name = "comfy-table" +version = "7.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "958c5d6ecf1f214b4c2bbbbf6ab9523a864bd136dcf71a7e8904799acfe1ad47" +dependencies = [ + "unicode-segmentation", + "unicode-width 0.2.2", +] + +[[package]] +name = "compression-codecs" +version = "0.4.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eb7b51a7d9c967fc26773061ba86150f19c50c0d65c887cb1fbe295fd16619b7" +dependencies = [ + "compression-core", + "flate2", + "memchr", +] + +[[package]] +name = "compression-core" +version = "0.4.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75984efb6ed102a0d42db99afb6c1948f0380d1d91808d5529916e6c08b49d8d" + +[[package]] +name = "concurrent-queue" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ca0197aee26d1ae37445ee532fefce43251d24cc7c166799f4d46817f1d3973" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "const-oid" +version = "0.9.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8" + +[[package]] +name = "const-random" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87e00182fe74b066627d63b85fd550ac2998d4b0bd86bfed477a0ae4c7c71359" +dependencies = [ + "const-random-macro", +] + +[[package]] +name = "const-random-macro" +version = "0.1.16" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e" +dependencies = [ + "getrandom 0.2.17", + "once_cell", + "tiny-keccak", +] + +[[package]] +name = "constant_time_eq" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d52eff69cd5e647efe296129160853a42795992097e8af39800e1060caeea9b" + +[[package]] +name = "core-foundation" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2a6cd9ae233e7f62ba4e9353e81a88df7fc8a5987b8d445b4d90c879bd156f6" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "core-foundation-sys" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" + +[[package]] +name = "cpufeatures" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59ed5838eebb26a2bb2e58f6d5b5316989ae9d08bab10e0e6d103e656d1b0280" +dependencies = [ + "libc", +] + +[[package]] +name = "crc32c" +version = "0.6.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a47af21622d091a8f0fb295b88bc886ac74efcc613efc19f5d0b21de5c89e47" +dependencies = [ + "rustc_version", +] + +[[package]] +name = "crc32fast" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "crossbeam-channel" +version = "0.5.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "82b8f8f868b36967f9606790d1903570de9ceaf870a7bf9fbbd3016d636a2cb2" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b82ac4a3c2ca9c3460964f020e1402edd5753411d7737aa39c3714ad1b5420e" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-queue" +version = "0.3.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f58bbc28f91df819d0aa2a2c00cd19754769c2fad90579b3592b1c9ba7a3115" +dependencies = [ + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-skiplist" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df29de440c58ca2cc6e587ec3d22347551a32435fbde9d2bff64e78a9ffa151b" +dependencies = [ + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" + +[[package]] +name = "crunchy" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" + +[[package]] +name = "crypto-common" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78c8292055d1c1df0cce5d180393dc8cce0abec0a7102adb6c7b1eef6016d60a" +dependencies = [ + "generic-array", + "typenum", +] + +[[package]] +name = "csv" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52cd9d68cf7efc6ddfaaee42e7288d3a99d613d4b50f76ce9827ae0c6e14f938" +dependencies = [ + "csv-core", + "itoa", + "ryu", + "serde_core", +] + +[[package]] +name = "csv-core" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"704a3c26996a80471189265814dbc2c257598b96b8a7feae2d31ace646bb9782" +dependencies = [ + "memchr", +] + +[[package]] +name = "darling" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25ae13da2f202d56bd7f91c25fba009e7717a1e4a1cc98a76d844b65ae912e9d" +dependencies = [ + "darling_core", + "darling_macro", +] + +[[package]] +name = "darling_core" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9865a50f7c335f53564bb694ef660825eb8610e0a53d3e11bf1b0d3df31e03b0" +dependencies = [ + "ident_case", + "proc-macro2", + "quote", + "strsim", + "syn 2.0.115", +] + +[[package]] +name = "darling_macro" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3984ec7bd6cfa798e62b4a642426a5be0e68f9401cfc2a01e3fa9ea2fcdb8d" +dependencies = [ + "darling_core", + "quote", + "syn 2.0.115", +] + +[[package]] +name = "dashmap" +version = "6.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf" +dependencies = [ + "cfg-if", + "crossbeam-utils", + "hashbrown 0.14.5", + "lock_api", + "once_cell", + "parking_lot_core", +] + +[[package]] +name = "datafusion" +version = "52.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43c18ba387f9c05ac1f3be32a73f8f3cc6c1cfc43e5d4b7a8e5b0d3a5eb48dc7" +dependencies = [ + "arrow", + "arrow-schema", + "async-trait", + "bytes", + "chrono", + "datafusion-catalog", + "datafusion-catalog-listing", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-datasource-arrow", + "datafusion-datasource-csv", + "datafusion-datasource-json", + "datafusion-execution", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-functions", + "datafusion-functions-aggregate", + "datafusion-functions-nested", + "datafusion-functions-table", + 
"datafusion-functions-window", + "datafusion-optimizer", + "datafusion-physical-expr", + "datafusion-physical-expr-adapter", + "datafusion-physical-expr-common", + "datafusion-physical-optimizer", + "datafusion-physical-plan", + "datafusion-session", + "datafusion-sql", + "futures", + "itertools 0.14.0", + "log", + "object_store", + "parking_lot", + "rand 0.9.2", + "regex", + "sqlparser", + "tempfile", + "tokio", + "url", + "uuid", +] + +[[package]] +name = "datafusion-catalog" +version = "52.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c75a4ce672b27fb8423810efb92a3600027717a1664d06a2c307eeeabcec694" +dependencies = [ + "arrow", + "async-trait", + "dashmap", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-plan", + "datafusion-session", + "futures", + "itertools 0.14.0", + "log", + "object_store", + "parking_lot", + "tokio", +] + +[[package]] +name = "datafusion-catalog-listing" +version = "52.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c8b9a3795ffb46bf4957a34c67d89a67558b311ae455c8d4295ff2115eeea50" +dependencies = [ + "arrow", + "async-trait", + "datafusion-catalog", + "datafusion-common", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-expr-adapter", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "futures", + "itertools 0.14.0", + "log", + "object_store", +] + +[[package]] +name = "datafusion-common" +version = "52.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "205dc1e20441973f470e6b7ef87626a3b9187970e5106058fef1b713047f770c" +dependencies = [ + "ahash", + "arrow", + "arrow-ipc", + "chrono", + "half", + "hashbrown 0.16.1", + "indexmap 2.13.0", + "libc", + "log", + "object_store", + "paste", + "sqlparser", + "tokio", + 
"web-time", +] + +[[package]] +name = "datafusion-common-runtime" +version = "52.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8cf5880c02ff6f5f11fb5bc19211789fb32fd3c53d79b7d6cb2b12e401312ba0" +dependencies = [ + "futures", + "log", + "tokio", +] + +[[package]] +name = "datafusion-datasource" +version = "52.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc614d6e709450e29b7b032a42c1bdb705f166a6b2edef7bed7c7897eb905499" +dependencies = [ + "arrow", + "async-trait", + "bytes", + "chrono", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-physical-expr-adapter", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-session", + "futures", + "glob", + "itertools 0.14.0", + "log", + "object_store", + "rand 0.9.2", + "tokio", + "url", +] + +[[package]] +name = "datafusion-datasource-arrow" +version = "52.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e497d5fc48dac7ce86f6b4fb09a3a494385774af301ff20ec91aebfae9b05b4" +dependencies = [ + "arrow", + "arrow-ipc", + "async-trait", + "bytes", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-session", + "futures", + "itertools 0.14.0", + "object_store", + "tokio", +] + +[[package]] +name = "datafusion-datasource-csv" +version = "52.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0dfc250cad940d0327ca2e9109dc98830892d17a3d6b2ca11d68570e872cf379" +dependencies = [ + "arrow", + "async-trait", + "bytes", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + 
"datafusion-session", + "futures", + "object_store", + "regex", + "tokio", +] + +[[package]] +name = "datafusion-datasource-json" +version = "52.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c91e9677ed62833b0e8129dec0d1a8f3c9bb7590bd6dd714a43e4c3b663e4aa0" +dependencies = [ + "arrow", + "async-trait", + "bytes", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-datasource", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-session", + "futures", + "object_store", + "tokio", +] + +[[package]] +name = "datafusion-doc" +version = "52.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3e13e5fe3447baa0584b61ee8644086e007e1ef6e58f4be48bc8a72417854729" + +[[package]] +name = "datafusion-execution" +version = "52.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48a6cc03e34899a54546b229235f7b192634c8e832f78a267f0989b18216c56d" +dependencies = [ + "arrow", + "async-trait", + "chrono", + "dashmap", + "datafusion-common", + "datafusion-expr", + "futures", + "log", + "object_store", + "parking_lot", + "rand 0.9.2", + "tempfile", + "url", +] + +[[package]] +name = "datafusion-expr" +version = "52.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee3315d87eca7a7df58e52a1fb43b4c4171b545fd30ffc3102945c162a9f6ddb" +dependencies = [ + "arrow", + "async-trait", + "chrono", + "datafusion-common", + "datafusion-doc", + "datafusion-expr-common", + "datafusion-functions-aggregate-common", + "datafusion-functions-window-common", + "datafusion-physical-expr-common", + "indexmap 2.13.0", + "itertools 0.14.0", + "paste", + "serde_json", + "sqlparser", +] + +[[package]] +name = "datafusion-expr-common" +version = "52.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"98c6d83feae0753799f933a2c47dfd15980c6947960cb95ed60f5c1f885548b3" +dependencies = [ + "arrow", + "datafusion-common", + "indexmap 2.13.0", + "itertools 0.14.0", + "paste", +] + +[[package]] +name = "datafusion-functions" +version = "52.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49b82962015cc3db4d7662459c9f7fcda0591b5edacb8af1cf3bc3031f274800" +dependencies = [ + "arrow", + "arrow-buffer", + "base64", + "blake2", + "blake3", + "chrono", + "chrono-tz", + "datafusion-common", + "datafusion-doc", + "datafusion-execution", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-macros", + "hex", + "itertools 0.14.0", + "log", + "md-5", + "num-traits", + "rand 0.9.2", + "regex", + "sha2", + "unicode-segmentation", + "uuid", +] + +[[package]] +name = "datafusion-functions-aggregate" +version = "52.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4e42c227d9e55a6c8041785d4a8a117e4de531033d480aae10984247ac62e27e" +dependencies = [ + "ahash", + "arrow", + "datafusion-common", + "datafusion-doc", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions-aggregate-common", + "datafusion-macros", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "half", + "log", + "paste", +] + +[[package]] +name = "datafusion-functions-aggregate-common" +version = "52.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cead3cfed825b0b688700f4338d281cd7857e4907775a5b9554c083edd5f3f95" +dependencies = [ + "ahash", + "arrow", + "datafusion-common", + "datafusion-expr-common", + "datafusion-physical-expr-common", +] + +[[package]] +name = "datafusion-functions-nested" +version = "52.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "62ea99612970aebab8cf864d02eb3d296bbab7f4881e1023d282b57fe431b201" +dependencies = [ + "arrow", + "arrow-ord", + "datafusion-common", + "datafusion-doc", + "datafusion-execution", + 
"datafusion-expr", + "datafusion-expr-common", + "datafusion-functions", + "datafusion-functions-aggregate", + "datafusion-functions-aggregate-common", + "datafusion-macros", + "datafusion-physical-expr-common", + "itertools 0.14.0", + "log", + "paste", +] + +[[package]] +name = "datafusion-functions-table" +version = "52.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d83dbf3ab8b9af6f209b068825a7adbd3b88bf276f2a1ec14ba09567b97f5674" +dependencies = [ + "arrow", + "async-trait", + "datafusion-catalog", + "datafusion-common", + "datafusion-expr", + "datafusion-physical-plan", + "parking_lot", + "paste", +] + +[[package]] +name = "datafusion-functions-window" +version = "52.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "732edabe07496e2fc5a1e57a284d7a36edcea445a2821119770a0dea624b472c" +dependencies = [ + "arrow", + "datafusion-common", + "datafusion-doc", + "datafusion-expr", + "datafusion-functions-window-common", + "datafusion-macros", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "log", + "paste", +] + +[[package]] +name = "datafusion-functions-window-common" +version = "52.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e0c6e30e09700799bd52adce8c377ab03dda96e73a623e4803a31ad94fe7ce14" +dependencies = [ + "datafusion-common", + "datafusion-physical-expr-common", +] + +[[package]] +name = "datafusion-macros" +version = "52.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "402f2a8ed70fb99a18f71580a1fe338604222a3d32ddeac6e72c5b34feea2d4d" +dependencies = [ + "datafusion-doc", + "quote", + "syn 2.0.115", +] + +[[package]] +name = "datafusion-optimizer" +version = "52.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "99f32edb8ba12f08138f86c09b80fae3d4a320551262fa06b91d8a8cb3065a5b" +dependencies = [ + "arrow", + "chrono", + "datafusion-common", + "datafusion-expr", + 
"datafusion-expr-common", + "datafusion-physical-expr", + "indexmap 2.13.0", + "itertools 0.14.0", + "log", + "regex", + "regex-syntax", +] + +[[package]] +name = "datafusion-physical-expr" +version = "52.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "987c5e29e96186589301b42e25aa7d11bbe319a73eb02ef8d755edc55b5b89fc" +dependencies = [ + "ahash", + "arrow", + "datafusion-common", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-functions-aggregate-common", + "datafusion-physical-expr-common", + "half", + "hashbrown 0.16.1", + "indexmap 2.13.0", + "itertools 0.14.0", + "parking_lot", + "paste", + "petgraph 0.8.3", + "tokio", +] + +[[package]] +name = "datafusion-physical-expr-adapter" +version = "52.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1de89d0afa08b6686697bd8a6bac4ba2cd44c7003356e1bce6114d5a93f94b5c" +dependencies = [ + "arrow", + "datafusion-common", + "datafusion-expr", + "datafusion-functions", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "itertools 0.14.0", +] + +[[package]] +name = "datafusion-physical-expr-common" +version = "52.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "602d1970c0fe87f1c3a36665d131fbfe1c4379d35f8fc5ec43a362229ad2954d" +dependencies = [ + "ahash", + "arrow", + "chrono", + "datafusion-common", + "datafusion-expr-common", + "hashbrown 0.16.1", + "indexmap 2.13.0", + "itertools 0.14.0", + "parking_lot", +] + +[[package]] +name = "datafusion-physical-optimizer" +version = "52.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b24d704b6385ebe27c756a12e5ba15684576d3b47aeca79cc9fb09480236dc32" +dependencies = [ + "arrow", + "datafusion-common", + "datafusion-execution", + "datafusion-expr", + "datafusion-expr-common", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "datafusion-pruning", + "itertools 0.14.0", 
+] + +[[package]] +name = "datafusion-physical-plan" +version = "52.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c21d94141ea5043e98793f170798e9c1887095813b8291c5260599341e383a38" +dependencies = [ + "ahash", + "arrow", + "arrow-ord", + "arrow-schema", + "async-trait", + "datafusion-common", + "datafusion-common-runtime", + "datafusion-execution", + "datafusion-expr", + "datafusion-functions", + "datafusion-functions-aggregate-common", + "datafusion-functions-window-common", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "futures", + "half", + "hashbrown 0.16.1", + "indexmap 2.13.0", + "itertools 0.14.0", + "log", + "parking_lot", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "datafusion-pruning" +version = "52.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a68cce43d18c0dfac95cacd74e70565f7e2fb12b9ed41e2d312f0fa837626b1" +dependencies = [ + "arrow", + "datafusion-common", + "datafusion-datasource", + "datafusion-expr-common", + "datafusion-physical-expr", + "datafusion-physical-expr-common", + "datafusion-physical-plan", + "itertools 0.14.0", + "log", +] + +[[package]] +name = "datafusion-session" +version = "52.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6b4e1c40a0b1896aed4a4504145c2eb7fa9b9da13c2d04b40a4767a09f076199" +dependencies = [ + "async-trait", + "datafusion-common", + "datafusion-execution", + "datafusion-expr", + "datafusion-physical-plan", + "parking_lot", +] + +[[package]] +name = "datafusion-sql" +version = "52.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f1891e5b106d1d73c7fe403bd8a265d19c3977edc17f60808daf26c2fe65ffb" +dependencies = [ + "arrow", + "bigdecimal", + "chrono", + "datafusion-common", + "datafusion-expr", + "indexmap 2.13.0", + "log", + "regex", + "sqlparser", +] + +[[package]] +name = "deepsize" +version = "0.2.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "1cdb987ec36f6bf7bfbea3f928b75590b736fc42af8e54d97592481351b2b96c" +dependencies = [ + "deepsize_derive", +] + +[[package]] +name = "deepsize_derive" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "990101d41f3bc8c1a45641024377ee284ecc338e5ecf3ea0f0e236d897c72796" +dependencies = [ + "proc-macro2", + "quote", + "syn 1.0.109", +] + +[[package]] +name = "der" +version = "0.7.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7c1832837b905bbfb5101e07cc24c8deddf52f93225eee6ead5f4d63d53ddcb" +dependencies = [ + "const-oid", + "pem-rfc7468", + "zeroize", +] + +[[package]] +name = "deranged" +version = "0.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7cd812cc2bc1d69d4764bd80df88b4317eaef9e773c75226407d9bc0876b211c" +dependencies = [ + "powerfmt", + "serde_core", +] + +[[package]] +name = "difflib" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6184e33543162437515c2e2b48714794e37845ec9851711914eec9d308f6ebe8" + +[[package]] +name = "digest" +version = "0.10.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ed9a281f7bc9b7576e61468ba615a66a5c8cfdff42420a70aa82701a3b1e292" +dependencies = [ + "block-buffer", + "const-oid", + "crypto-common", + "subtle", +] + +[[package]] +name = "dirs" +version = "6.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c3e8aa94d75141228480295a7d0e7feb620b1a5ad9f12bc40be62411e38cce4e" +dependencies = [ + "dirs-sys", +] + +[[package]] +name = "dirs-sys" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e01a3366d27ee9890022452ee61b2b63a67e6f13f58900b651ff5665f0bb1fab" +dependencies = [ + "libc", + "option-ext", + "redox_users", + "windows-sys 0.61.2", +] + +[[package]] +name = "displaydoc" +version = 
"0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97369cbbc041bc366949bc74d34658d6cda5621039731c6310521892a3a20ae0" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.115", +] + +[[package]] +name = "dlv-list" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "442039f5147480ba31067cb00ada1adae6892028e40e45fc5de7b7df6dcc1b5f" +dependencies = [ + "const-random", +] + +[[package]] +name = "downcast-rs" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "117240f60069e65410b3ae1bb213295bd828f707b5bec6596a1afc8793ce0cbc" + +[[package]] +name = "dunce" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813" + +[[package]] +name = "dyn-clone" +version = "1.0.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0881ea181b1df73ff77ffaaf9c7544ecc11e82fba9b5f27b262a3c73a332555" + +[[package]] +name = "educe" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d7bc049e1bd8cdeb31b68bbd586a9464ecf9f3944af3958a7a9d0f8b9799417" +dependencies = [ + "enum-ordinalize", + "proc-macro2", + "quote", + "syn 2.0.115", +] + +[[package]] +name = "either" +version = "1.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "48c757948c5ede0e46177b7add2e67155f70e33c07fea8284df6576da70b3719" + +[[package]] +name = "ena" +version = "0.14.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eabffdaee24bd1bf95c5ef7cec31260444317e72ea56c4c91750e8b7ee58d5f1" +dependencies = [ + "log", +] + +[[package]] +name = "encoding_rs" +version = "0.8.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75030f3c4f45dafd7586dd6780965a8c7e8e285a5ecb86713e63a79c5b2766f3" +dependencies = [ + "cfg-if", +] + 
+[[package]] +name = "enum-ordinalize" +version = "4.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4a1091a7bb1f8f2c4b28f1fe2cef4980ca2d410a3d727d67ecc3178c9b0800f0" +dependencies = [ + "enum-ordinalize-derive", +] + +[[package]] +name = "enum-ordinalize-derive" +version = "4.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ca9601fb2d62598ee17836250842873a413586e5d7ed88b356e38ddbb0ec631" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.115", +] + +[[package]] +name = "equivalent" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + +[[package]] +name = "errno" +version = "0.3.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" +dependencies = [ + "libc", + "windows-sys 0.61.2", +] + +[[package]] +name = "ethnum" +version = "1.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca81e6b4777c89fd810c25a4be2b1bd93ea034fbe58e6a75216a34c6b82c539b" + +[[package]] +name = "event-listener" +version = "5.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e13b66accf52311f30a0db42147dadea9850cb48cd070028831ae5f5d4b856ab" +dependencies = [ + "concurrent-queue", + "parking", + "pin-project-lite", +] + +[[package]] +name = "event-listener-strategy" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8be9f3dfaaffdae2972880079a491a1a8bb7cbed0b8dd7a347f668b4150a3b93" +dependencies = [ + "event-listener", + "pin-project-lite", +] + +[[package]] +name = "eyre" +version = "0.6.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7cd915d99f24784cdc19fd37ef22b97e3ff0ae756c7e492e9fbfe897d61e2aec" +dependencies = [ + "indenter", + "once_cell", +] + +[[package]] 
+name = "fail" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fe5e43d0f78a42ad591453aedb1d7ae631ce7ee445c7643691055a9ed8d3b01c" +dependencies = [ + "log", + "once_cell", + "rand 0.8.5", +] + +[[package]] +name = "fast-float2" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8eb564c5c7423d25c886fb561d1e4ee69f72354d16918afa32c08811f6b6a55" + +[[package]] +name = "fastdivide" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9afc2bd4d5a73106dd53d10d73d3401c2f32730ba2c0b93ddb888a8983680471" + +[[package]] +name = "fastrand" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" + +[[package]] +name = "find-msvc-tools" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" + +[[package]] +name = "fixedbitset" +version = "0.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d674e81391d1e1ab681a28d99df07927c6d4aa5b027d7da16ba32d1d21ecd99" + +[[package]] +name = "flatbuffers" +version = "25.12.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "35f6839d7b3b98adde531effaf34f0c2badc6f4735d26fe74709d8e513a96ef3" +dependencies = [ + "bitflags", + "rustc_version", +] + +[[package]] +name = "flate2" +version = "1.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + +[[package]] +name = "float-cmp" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b09cf3155332e944990140d967ff5eceb70df778b34f77d8075db46e4704e6d8" +dependencies = [ + "num-traits", +] + 
+[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + +[[package]] +name = "foldhash" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9c4f5dac5e15c24eb999c26181a6ca40b39fe946cbe4c263c7209467bc83af2" + +[[package]] +name = "foldhash" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" + +[[package]] +name = "form_urlencoded" +version = "1.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb4cb245038516f5f85277875cdaa4f7d2c9a0fa0468de06ed190163b1581fcf" +dependencies = [ + "percent-encoding", +] + +[[package]] +name = "fs4" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7e180ac76c23b45e767bd7ae9579bc0bb458618c4bc71835926e098e61d15f8" +dependencies = [ + "rustix 0.38.44", + "windows-sys 0.52.0", +] + +[[package]] +name = "fs_extra" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" + +[[package]] +name = "fsst" +version = "4.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2195cc7f87e84bd695586137de99605e7e9579b26ec5e01b82960ddb4d0922f2" +dependencies = [ + "arrow-array", + "rand 0.9.2", +] + +[[package]] +name = "fst" +version = "0.4.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ab85b9b05e3978cc9a9cf8fea7f01b494e1a09ed3037e16ba39edc7a29eb61a" +dependencies = [ + "utf8-ranges", +] + +[[package]] +name = "funty" +version = "2.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6d5a32815ae3f33302d95fdcb2ce17862f8c65363dcfd29360480ba1001fc9c" + +[[package]] +name = "futures" 
+version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b147ee9d1f6d097cef9ce628cd2ee62288d963e16fb287bd9286455b241382d" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-channel" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "07bbe89c50d7a535e539b8c17bc0b49bdb77747034daa8087407d655f3f7cc1d" +dependencies = [ + "futures-core", + "futures-sink", +] + +[[package]] +name = "futures-core" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e3450815272ef58cec6d564423f6e755e25379b217b0bc688e295ba24df6b1d" + +[[package]] +name = "futures-executor" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf29c38818342a3b26b5b923639e7b1f4a61fc5e76102d4b1981c6dc7a7579d" +dependencies = [ + "futures-core", + "futures-task", + "futures-util", +] + +[[package]] +name = "futures-io" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cecba35d7ad927e23624b22ad55235f2239cfa44fd10428eecbeba6d6a717718" + +[[package]] +name = "futures-macro" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e835b70203e41293343137df5c0664546da5745f82ec9b84d40be8336958447b" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.115", +] + +[[package]] +name = "futures-sink" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c39754e157331b013978ec91992bde1ac089843443c49cbc7f46150b0fad0893" + +[[package]] +name = "futures-task" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "037711b3d59c33004d3856fbdc83b99d4ff37a24768fa1be9ce3538a1cde4393" + +[[package]] +name = "futures-util" +version = "0.3.32" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "389ca41296e6190b48053de0321d02a77f32f8a5d2461dd38762c0593805c6d6" +dependencies = [ + "futures-channel", + "futures-core", + "futures-io", + "futures-macro", + "futures-sink", + "futures-task", + "memchr", + "pin-project-lite", + "slab", +] + +[[package]] +name = "generator" +version = "0.8.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52f04ae4152da20c76fe800fa48659201d5cf627c5149ca0b707b69d7eef6cf9" +dependencies = [ + "cc", + "cfg-if", + "libc", + "log", + "rustversion", + "windows-link", + "windows-result", +] + +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + +[[package]] +name = "getrandom" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0" +dependencies = [ + "cfg-if", + "js-sys", + "libc", + "wasi", + "wasm-bindgen", +] + +[[package]] +name = "getrandom" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" +dependencies = [ + "cfg-if", + "js-sys", + "libc", + "r-efi 5.3.0", + "wasip2", + "wasm-bindgen", +] + +[[package]] +name = "getrandom" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" +dependencies = [ + "cfg-if", + "libc", + "r-efi 6.0.0", + "wasip2", + "wasip3", +] + +[[package]] +name = "gimli" +version = "0.32.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e629b9b98ef3dd8afe6ca2bd0f89306cec16d43d907889945bc5d6687f2f13c7" + +[[package]] +name = "glob" +version = 
"0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0cc23270f6e1808e30a928bdc84dea0b9b4136a8bc82338574f23baf47bbd280" + +[[package]] +name = "gloo-timers" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbb143cf96099802033e0d4f4963b19fd2e0b728bcf076cd9cf7f6634f092994" +dependencies = [ + "futures-channel", + "futures-core", + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "h2" +version = "0.4.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f44da3a8150a6703ed5d34e164b875fd14c2cdab9af1252a9a1020bde2bdc54" +dependencies = [ + "atomic-waker", + "bytes", + "fnv", + "futures-core", + "futures-sink", + "http 1.4.0", + "indexmap 2.13.0", + "slab", + "tokio", + "tokio-util", + "tracing", +] + +[[package]] +name = "half" +version = "2.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" +dependencies = [ + "cfg-if", + "crunchy", + "num-traits", + "zerocopy", +] + +[[package]] +name = "hashbrown" +version = "0.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" + +[[package]] +name = "hashbrown" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" + +[[package]] +name = "hashbrown" +version = "0.15.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9229cfe53dfd69f0609a49f65461bd93001ea1ef889cd5529dd176593f5338a1" +dependencies = [ + "allocator-api2", + "equivalent", + "foldhash 0.1.5", +] + +[[package]] +name = "hashbrown" +version = "0.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" +dependencies = [ + 
"allocator-api2", + "equivalent", + "foldhash 0.2.0", +] + +[[package]] +name = "heck" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2304e00983f87ffb38b55b444b5e3b60a884b5d30c0fca7d82fe33449bbe55ea" + +[[package]] +name = "hermit-abi" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fc0fef456e4baa96da950455cd02c081ca953b141298e41db3fc7e36b1da849c" + +[[package]] +name = "hex" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f24254aa9a54b5c858eaee2f5bccdb46aaf0e486a595ed5fd8f86ba55232a70" + +[[package]] +name = "hmac" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c49c37c09c17a53d937dfbb742eb3a961d65a994e6bcdcf37e7399d0cc8ab5e" +dependencies = [ + "digest", +] + +[[package]] +name = "home" +version = "0.5.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cc627f471c528ff0c4a49e1d5e60450c8f6461dd6d10ba9dcd3a61d3dff7728d" +dependencies = [ + "windows-sys 0.61.2", +] + +[[package]] +name = "htmlescape" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e9025058dae765dee5070ec375f591e2ba14638c63feff74f13805a72e523163" + +[[package]] +name = "http" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "601cbb57e577e2f5ef5be8e7b83f0f63994f25aa94d673e54a92d5c516d101f1" +dependencies = [ + "bytes", + "fnv", + "itoa", +] + +[[package]] +name = "http" +version = "1.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3ba2a386d7f85a81f119ad7498ebe444d2e22c2af0b86b069416ace48b3311a" +dependencies = [ + "bytes", + "itoa", +] + +[[package]] +name = "http-body" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ceab25649e9960c0311ea418d17bee82c0dcec1bd053b5f9a66e265a693bed2" 
+dependencies = [ + "bytes", + "http 0.2.12", + "pin-project-lite", +] + +[[package]] +name = "http-body" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" +dependencies = [ + "bytes", + "http 1.4.0", +] + +[[package]] +name = "http-body-util" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b021d93e26becf5dc7e1b75b1bed1fd93124b374ceb73f43d4d4eafec896a64a" +dependencies = [ + "bytes", + "futures-core", + "http 1.4.0", + "http-body 1.0.1", + "pin-project-lite", +] + +[[package]] +name = "httparse" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6dbf3de79e51f3d586ab4cb9d5c3e2c14aa28ed23d180cf89b4df0454a69cc87" + +[[package]] +name = "httpdate" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df3b46402a9d5adb4c86a0cf463f42e19994e3ee891101b1841f30a545cb49a9" + +[[package]] +name = "humantime" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "135b12329e5e3ce057a9f972339ea52bc954fe1e9358ef27f95e89716fbc5424" + +[[package]] +name = "hyper" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2ab2d4f250c3d7b1c9fcdff1cece94ea4e2dfbec68614f7b87cb205f24ca9d11" +dependencies = [ + "atomic-waker", + "bytes", + "futures-channel", + "futures-core", + "h2", + "http 1.4.0", + "http-body 1.0.1", + "httparse", + "httpdate", + "itoa", + "pin-project-lite", + "pin-utils", + "smallvec", + "tokio", + "want", +] + +[[package]] +name = "hyper-rustls" +version = "0.27.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3c93eb611681b207e1fe55d5a71ecf91572ec8a6705cdb6857f7d8d5242cf58" +dependencies = [ + "http 1.4.0", + "hyper", + "hyper-util", + "rustls", + "rustls-native-certs", + "rustls-pki-types", + "tokio", 
+ "tokio-rustls", + "tower-service", + "webpki-roots", +] + +[[package]] +name = "hyper-util" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96547c2556ec9d12fb1578c4eaf448b04993e7fb79cbaad930a656880a6bdfa0" +dependencies = [ + "base64", + "bytes", + "futures-channel", + "futures-util", + "http 1.4.0", + "http-body 1.0.1", + "hyper", + "ipnet", + "libc", + "percent-encoding", + "pin-project-lite", + "socket2", + "tokio", + "tower-service", + "tracing", +] + +[[package]] +name = "hyperloglogplus" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "621debdf94dcac33e50475fdd76d34d5ea9c0362a834b9db08c3024696c1fbe3" +dependencies = [ + "serde", +] + +[[package]] +name = "iana-time-zone" +version = "0.1.65" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e31bc9ad994ba00e440a8aa5c9ef0ec67d5cb5e5cb0cc7f8b744a35b389cc470" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "log", + "wasm-bindgen", + "windows-core", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" +dependencies = [ + "cc", +] + +[[package]] +name = "icu_collections" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c6b649701667bbe825c3b7e6388cb521c23d88644678e83c0c4d0a621a34b43" +dependencies = [ + "displaydoc", + "potential_utf", + "yoke", + "zerofrom", + "zerovec", +] + +[[package]] +name = "icu_locale_core" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "edba7861004dd3714265b4db54a3c390e880ab658fec5f7db895fae2046b5bb6" +dependencies = [ + "displaydoc", + "litemap", + "tinystr", + "writeable", + "zerovec", +] + +[[package]] +name = "icu_normalizer" +version = 
"2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f6c8828b67bf8908d82127b2054ea1b4427ff0230ee9141c54251934ab1b599" +dependencies = [ + "icu_collections", + "icu_normalizer_data", + "icu_properties", + "icu_provider", + "smallvec", + "zerovec", +] + +[[package]] +name = "icu_normalizer_data" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7aedcccd01fc5fe81e6b489c15b247b8b0690feb23304303a9e560f37efc560a" + +[[package]] +name = "icu_properties" +version = "2.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "020bfc02fe870ec3a66d93e677ccca0562506e5872c650f893269e08615d74ec" +dependencies = [ + "icu_collections", + "icu_locale_core", + "icu_properties_data", + "icu_provider", + "zerotrie", + "zerovec", +] + +[[package]] +name = "icu_properties_data" +version = "2.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "616c294cf8d725c6afcd8f55abc17c56464ef6211f9ed59cccffe534129c77af" + +[[package]] +name = "icu_provider" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85962cf0ce02e1e0a629cc34e7ca3e373ce20dda4c4d7294bbd0bf1fdb59e614" +dependencies = [ + "displaydoc", + "icu_locale_core", + "writeable", + "yoke", + "zerofrom", + "zerotrie", + "zerovec", +] + +[[package]] +name = "id-arena" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d3067d79b975e8844ca9eb072e16b31c3c1c36928edf9c6789548c524d0d954" + +[[package]] +name = "ident_case" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" + +[[package]] +name = "idna" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b0875f23caa03898994f6ddc501886a45c7d3d62d04d2d90788d47be1b1e4de" +dependencies = [ + "idna_adapter", + 
"smallvec", + "utf8_iter", +] + +[[package]] +name = "idna_adapter" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3acae9609540aa318d1bc588455225fb2085b9ed0c4f6bd0d9d5bcd86f1a0344" +dependencies = [ + "icu_normalizer", + "icu_properties", +] + +[[package]] +name = "indenter" +version = "0.3.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "964de6e86d545b246d84badc0fef527924ace5134f30641c203ef52ba83f58d5" + +[[package]] +name = "indexmap" +version = "1.9.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" +dependencies = [ + "autocfg", + "hashbrown 0.12.3", + "serde", +] + +[[package]] +name = "indexmap" +version = "2.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7714e70437a7dc3ac8eb7e6f8df75fd8eb422675fc7678aff7364301092b1017" +dependencies = [ + "equivalent", + "hashbrown 0.16.1", + "serde", + "serde_core", +] + +[[package]] +name = "inout" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "879f10e63c20629ecabbb64a8010319738c66a5cd0c29b02d63d272b03751d01" +dependencies = [ + "block-padding", + "generic-array", +] + +[[package]] +name = "ipnet" +version = "2.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130" + +[[package]] +name = "iri-string" +version = "0.7.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c91338f0783edbd6195decb37bae672fd3b165faffb89bf7b9e6942f8b1a731a" +dependencies = [ + "memchr", + "serde", +] + +[[package]] +name = "is_terminal_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a6cb138bb79a146c1bd460005623e142ef0181e3d0219cb493e02f7d08a35695" + +[[package]] +name = "itertools" +version = 
"0.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "413ee7dfc52ee1a4949ceeb7dbc8a33f2d6c088194d9f922fb8318faf1f01186" +dependencies = [ + "either", +] + +[[package]] +name = "itertools" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b192c782037fadd9cfa75548310488aabdbf3d2da73885b31bd0abd03351285" +dependencies = [ + "either", +] + +[[package]] +name = "itoa" +version = "1.0.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2" + +[[package]] +name = "jiff" +version = "0.2.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a3546dc96b6d42c5f24902af9e2538e82e39ad350b0c766eb3fbf2d8f3d8359" +dependencies = [ + "jiff-static", + "jiff-tzdb-platform", + "log", + "portable-atomic", + "portable-atomic-util", + "serde_core", + "windows-sys 0.61.2", +] + +[[package]] +name = "jiff-static" +version = "0.2.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a8c8b344124222efd714b73bb41f8b5120b27a7cc1c75593a6ff768d9d05aa4" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.115", +] + +[[package]] +name = "jiff-tzdb" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c900ef84826f1338a557697dc8fc601df9ca9af4ac137c7fb61d4c6f2dfd3076" + +[[package]] +name = "jiff-tzdb-platform" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "875a5a69ac2bab1a891711cf5eccbec1ce0341ea805560dcd90b7a2e925132e8" +dependencies = [ + "jiff-tzdb", +] + +[[package]] +name = "jobserver" +version = "0.1.34" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9afb3de4395d6b3e67a780b6de64b51c978ecf11cb9a462c66be7d4ca9039d33" +dependencies = [ + "getrandom 0.3.4", + "libc", +] + +[[package]] +name = "js-sys" +version = "0.3.85" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "8c942ebf8e95485ca0d52d97da7c5a2c387d0e7f0ba4c35e93bfcaee045955b3" +dependencies = [ + "once_cell", + "wasm-bindgen", +] + +[[package]] +name = "jsonb" +version = "0.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a901f06163d352fbe41c3c2ff5e08b75330a003cc941e988fb501022f5421e6" +dependencies = [ + "byteorder", + "ethnum", + "fast-float2", + "itoa", + "jiff", + "nom 8.0.0", + "num-traits", + "ordered-float", + "rand 0.9.2", + "ryu", + "serde", + "serde_json", +] + +[[package]] +name = "jsonwebtoken" +version = "9.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a87cc7a48537badeae96744432de36f4be2b4a34a05a5ef32e9dd8a1c169dde" +dependencies = [ + "base64", + "js-sys", + "pem", + "ring", + "serde", + "serde_json", + "simple_asn1", +] + +[[package]] +name = "keccak" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cb26cec98cce3a3d96cbb7bced3c4b16e3d13f27ec56dbd62cbc8f39cfb9d653" +dependencies = [ + "cpufeatures", +] + +[[package]] +name = "lalrpop" +version = "0.22.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba4ebbd48ce411c1d10fb35185f5a51a7bfa3d8b24b4e330d30c9e3a34129501" +dependencies = [ + "ascii-canvas", + "bit-set", + "ena", + "itertools 0.14.0", + "lalrpop-util", + "petgraph 0.7.1", + "pico-args", + "regex", + "regex-syntax", + "sha3", + "string_cache", + "term", + "unicode-xid", + "walkdir", +] + +[[package]] +name = "lalrpop-util" +version = "0.22.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b5baa5e9ff84f1aefd264e6869907646538a52147a755d494517a8007fb48733" +dependencies = [ + "regex-automata", + "rustversion", +] + +[[package]] +name = "lance" +version = "4.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"efe6c3ddd79cdfd2b7e1c23cafae52806906bc40fbd97de9e8cf2f8c7a75fc04" +dependencies = [ + "arrow", + "arrow-arith", + "arrow-array", + "arrow-buffer", + "arrow-ipc", + "arrow-ord", + "arrow-row", + "arrow-schema", + "arrow-select", + "async-recursion", + "async-trait", + "async_cell", + "aws-credential-types", + "byteorder", + "bytes", + "chrono", + "crossbeam-skiplist", + "dashmap", + "datafusion", + "datafusion-expr", + "datafusion-functions", + "datafusion-physical-expr", + "datafusion-physical-plan", + "deepsize", + "either", + "futures", + "half", + "humantime", + "itertools 0.13.0", + "lance-arrow", + "lance-core", + "lance-datafusion", + "lance-encoding", + "lance-file", + "lance-index", + "lance-io", + "lance-linalg", + "lance-namespace", + "lance-table", + "log", + "moka", + "object_store", + "permutation", + "pin-project", + "prost", + "prost-types", + "rand 0.9.2", + "roaring", + "semver", + "serde", + "serde_json", + "snafu", + "tantivy", + "tokio", + "tokio-stream", + "tokio-util", + "tracing", + "url", + "uuid", +] + +[[package]] +name = "lance-arrow" +version = "4.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d9f5d95bdda2a2b790f1fb8028b5b6dcf661abeb3133a8bca0f3d24b054af87" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-ord", + "arrow-schema", + "arrow-select", + "bytes", + "futures", + "getrandom 0.2.17", + "half", + "jsonb", + "num-traits", + "rand 0.9.2", +] + +[[package]] +name = "lance-bitpacking" +version = "4.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f827d6ab9f8f337a9509d5ad66a12f3314db8713868260521c344ef6135eb4e4" +dependencies = [ + "arrayref", + "paste", + "seq-macro", +] + +[[package]] +name = "lance-core" +version = "4.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0f1e25df6a79bf72ee6bcde0851f19b1cd36c5848c1b7db83340882d3c9fdecb" +dependencies = [ + "arrow-array", + 
"arrow-buffer", + "arrow-schema", + "async-trait", + "byteorder", + "bytes", + "chrono", + "datafusion-common", + "datafusion-sql", + "deepsize", + "futures", + "itertools 0.13.0", + "lance-arrow", + "libc", + "log", + "mock_instant", + "moka", + "num_cpus", + "object_store", + "pin-project", + "prost", + "rand 0.9.2", + "roaring", + "serde_json", + "snafu", + "tempfile", + "tokio", + "tokio-stream", + "tokio-util", + "tracing", + "url", +] + +[[package]] +name = "lance-datafusion" +version = "4.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93146de8ae720cb90edef81c2f2d0a1b065fc2f23ecff2419546f389b0fa70a4" +dependencies = [ + "arrow", + "arrow-array", + "arrow-buffer", + "arrow-ord", + "arrow-schema", + "arrow-select", + "async-trait", + "chrono", + "datafusion", + "datafusion-common", + "datafusion-functions", + "datafusion-physical-expr", + "futures", + "jsonb", + "lance-arrow", + "lance-core", + "lance-datagen", + "log", + "pin-project", + "prost", + "prost-build", + "snafu", + "tokio", + "tracing", +] + +[[package]] +name = "lance-datagen" +version = "4.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccec8ce4d8e0a87a99c431dab2364398029f2ffb649c1a693c60c79e05ed30dd" +dependencies = [ + "arrow", + "arrow-array", + "arrow-cast", + "arrow-schema", + "chrono", + "futures", + "half", + "hex", + "rand 0.9.2", + "rand_distr 0.5.1", + "rand_xoshiro", + "random_word", +] + +[[package]] +name = "lance-encoding" +version = "4.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c1aec0bbbac6bce829bc10f1ba066258126100596c375fb71908ecf11c2c2a5" +dependencies = [ + "arrow-arith", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-schema", + "arrow-select", + "bytemuck", + "byteorder", + "bytes", + "fsst", + "futures", + "hex", + "hyperloglogplus", + "itertools 0.13.0", + "lance-arrow", + "lance-bitpacking", + "lance-core", + "log", + "lz4", + 
"num-traits", + "prost", + "prost-build", + "prost-types", + "rand 0.9.2", + "snafu", + "strum", + "tokio", + "tracing", + "xxhash-rust", + "zstd", +] + +[[package]] +name = "lance-file" +version = "4.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14a8c548804f5b17486dc2d3282356ed1957095a852780283bc401fdd69e9075" +dependencies = [ + "arrow-arith", + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "async-recursion", + "async-trait", + "byteorder", + "bytes", + "datafusion-common", + "deepsize", + "futures", + "lance-arrow", + "lance-core", + "lance-encoding", + "lance-io", + "log", + "num-traits", + "object_store", + "prost", + "prost-build", + "prost-types", + "snafu", + "tokio", + "tracing", +] + +[[package]] +name = "lance-index" +version = "4.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2da212f0090ea59f79ac3686660f596520c167fe1cb5f408900cf71d215f0e03" +dependencies = [ + "arrow", + "arrow-arith", + "arrow-array", + "arrow-ord", + "arrow-schema", + "arrow-select", + "async-channel", + "async-recursion", + "async-trait", + "bitpacking", + "bitvec", + "bytes", + "chrono", + "crossbeam-queue", + "datafusion", + "datafusion-common", + "datafusion-expr", + "datafusion-physical-expr", + "datafusion-sql", + "deepsize", + "dirs", + "fst", + "futures", + "half", + "itertools 0.13.0", + "jsonb", + "lance-arrow", + "lance-core", + "lance-datafusion", + "lance-datagen", + "lance-encoding", + "lance-file", + "lance-io", + "lance-linalg", + "lance-table", + "libm", + "log", + "ndarray", + "num-traits", + "object_store", + "prost", + "prost-build", + "prost-types", + "rand 0.9.2", + "rand_distr 0.5.1", + "rangemap", + "rayon", + "roaring", + "serde", + "serde_json", + "smallvec", + "snafu", + "tantivy", + "tempfile", + "tokio", + "tracing", + "twox-hash", + "uuid", +] + +[[package]] +name = "lance-io" +version = "4.0.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d958eb4b56f03bbe0f5f85eb2b4e9657882812297b6f711f201ffc995f259f" +dependencies = [ + "arrow", + "arrow-arith", + "arrow-array", + "arrow-buffer", + "arrow-cast", + "arrow-data", + "arrow-schema", + "arrow-select", + "async-recursion", + "async-trait", + "aws-config", + "aws-credential-types", + "byteorder", + "bytes", + "chrono", + "deepsize", + "futures", + "http 1.4.0", + "lance-arrow", + "lance-core", + "lance-namespace", + "log", + "object_store", + "object_store_opendal", + "opendal", + "path_abs", + "pin-project", + "prost", + "rand 0.9.2", + "serde", + "snafu", + "tempfile", + "tokio", + "tracing", + "url", +] + +[[package]] +name = "lance-linalg" +version = "4.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0285b70da35def7ed95e150fae1d5308089554e1290470403ed3c50cb235bc5e" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-schema", + "cc", + "deepsize", + "half", + "lance-arrow", + "lance-core", + "num-traits", + "rand 0.9.2", +] + +[[package]] +name = "lance-namespace" +version = "4.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f78e2a828b654e062a495462c6e3eb4fcf0e7e907d761b8f217fc09ccd3ceac" +dependencies = [ + "arrow", + "async-trait", + "bytes", + "lance-core", + "lance-namespace-reqwest-client", + "serde", + "snafu", +] + +[[package]] +name = "lance-namespace-impls" +version = "4.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2392314f3da38f00d166295e44244208a65ccfc256e274fa8631849fc3f4d94" +dependencies = [ + "arrow", + "arrow-ipc", + "arrow-schema", + "async-trait", + "bytes", + "chrono", + "futures", + "lance", + "lance-core", + "lance-index", + "lance-io", + "lance-namespace", + "lance-table", + "log", + "object_store", + "rand 0.9.2", + "serde_json", + "snafu", + "tokio", + "url", +] + +[[package]] +name = "lance-namespace-reqwest-client" +version = "0.6.1" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee2e48de899e2931afb67fcddd0a08e439bf5d8b6ea2a2ed9cb8f4df669bd5cc" +dependencies = [ + "reqwest", + "serde", + "serde_json", + "serde_repr", + "url", +] + +[[package]] +name = "lance-table" +version = "4.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3df9c4adca3eb2074b3850432a9fb34248a3d90c3d6427d158b13ff9355664ee" +dependencies = [ + "arrow", + "arrow-array", + "arrow-buffer", + "arrow-ipc", + "arrow-schema", + "async-trait", + "byteorder", + "bytes", + "chrono", + "deepsize", + "futures", + "lance-arrow", + "lance-core", + "lance-file", + "lance-io", + "log", + "object_store", + "prost", + "prost-build", + "prost-types", + "rand 0.9.2", + "rangemap", + "roaring", + "semver", + "serde", + "serde_json", + "snafu", + "tokio", + "tracing", + "url", + "uuid", +] + +[[package]] +name = "lazy_static" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" +dependencies = [ + "spin", +] + +[[package]] +name = "leb128fmt" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09edd9e8b54e49e587e4f6295a7d29c3ea94d469cb40ab8ca70b288248a81db2" + +[[package]] +name = "levenshtein_automata" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c2cdeb66e45e9f36bfad5bbdb4d2384e70936afbee843c6f6543f0c551ebb25" + +[[package]] +name = "lexical-core" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7d8d125a277f807e55a77304455eb7b1cb52f2b18c143b60e766c120bd64a594" +dependencies = [ + "lexical-parse-float", + "lexical-parse-integer", + "lexical-util", + "lexical-write-float", + "lexical-write-integer", +] + +[[package]] +name = "lexical-parse-float" +version = "1.0.6" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "52a9f232fbd6f550bc0137dcb5f99ab674071ac2d690ac69704593cb4abbea56" +dependencies = [ + "lexical-parse-integer", + "lexical-util", +] + +[[package]] +name = "lexical-parse-integer" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a7a039f8fb9c19c996cd7b2fcce303c1b2874fe1aca544edc85c4a5f8489b34" +dependencies = [ + "lexical-util", +] + +[[package]] +name = "lexical-util" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2604dd126bb14f13fb5d1bd6a66155079cb9fa655b37f875b3a742c705dbed17" + +[[package]] +name = "lexical-write-float" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50c438c87c013188d415fbabbb1dceb44249ab81664efbd31b14ae55dabb6361" +dependencies = [ + "lexical-util", + "lexical-write-integer", +] + +[[package]] +name = "lexical-write-integer" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "409851a618475d2d5796377cad353802345cba92c867d9fbcde9cf4eac4e14df" +dependencies = [ + "lexical-util", +] + +[[package]] +name = "libc" +version = "0.2.182" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6800badb6cb2082ffd7b6a67e6125bb39f18782f793520caee8cb8846be06112" + +[[package]] +name = "libm" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981" + +[[package]] +name = "libredox" +version = "0.1.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1744e39d1d6a9948f4f388969627434e31128196de472883b39f148769bfe30a" +dependencies = [ + "libc", +] + +[[package]] +name = "linked-hash-map" +version = "0.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"0717cef1bc8b636c6e1c1bbdefc09e6322da8a9321966e8928ef80d20f7f770f" +dependencies = [ + "serde", +] + +[[package]] +name = "linked_hash_set" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "984fb35d06508d1e69fc91050cceba9c0b748f983e6739fa2c7a9237154c52c8" +dependencies = [ + "linked-hash-map", +] + +[[package]] +name = "linux-raw-sys" +version = "0.4.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab" + +[[package]] +name = "linux-raw-sys" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a66949e030da00e8c7d4434b251670a91556f4144941d37452769c25d58a53" + +[[package]] +name = "litemap" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6373607a59f0be73a39b6fe456b8192fcc3585f602af20751600e974dd455e77" + +[[package]] +name = "lock_api" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965" +dependencies = [ + "scopeguard", +] + +[[package]] +name = "log" +version = "0.4.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" + +[[package]] +name = "logos" +version = "0.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eb2c55a318a87600ea870ff8c2012148b44bf18b74fad48d0f835c38c7d07c5f" +dependencies = [ + "logos-derive", +] + +[[package]] +name = "logos-codegen" +version = "0.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "58b3ffaa284e1350d017a57d04ada118c4583cf260c8fb01e0fe28a2e9cf8970" +dependencies = [ + "fnv", + "proc-macro2", + "quote", + "regex-automata", + "regex-syntax", + "syn 2.0.115", +] + +[[package]] +name = "logos-derive" +version = "0.16.1" +source 
= "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52d3a9855747c17eaf4383823f135220716ab49bea5fbea7dd42cc9a92f8aa31" +dependencies = [ + "logos-codegen", +] + +[[package]] +name = "loom" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "419e0dc8046cb947daa77eb95ae174acfbddb7673b4151f56d1eed8e93fbfaca" +dependencies = [ + "cfg-if", + "generator", + "scoped-tls", + "tracing", + "tracing-subscriber", +] + +[[package]] +name = "lru" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "234cf4f4a04dc1f57e24b96cc0cd600cf2af460d4161ac5ecdd0af8e1f3b2a38" +dependencies = [ + "hashbrown 0.15.5", +] + +[[package]] +name = "lru-slab" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154" + +[[package]] +name = "lz4" +version = "1.28.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a20b523e860d03443e98350ceaac5e71c6ba89aea7d960769ec3ce37f4de5af4" +dependencies = [ + "lz4-sys", +] + +[[package]] +name = "lz4-sys" +version = "1.11.1+lz4-1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6bd8c0d6c6ed0cd30b3652886bb8711dc4bb01d637a68105a3d5158039b418e6" +dependencies = [ + "cc", + "libc", +] + +[[package]] +name = "lz4_flex" +version = "0.11.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "373f5eceeeab7925e0c1098212f2fbc4d416adec9d35051a6ab251e824c1854a" + +[[package]] +name = "lz4_flex" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "98c23545df7ecf1b16c303910a69b079e8e251d60f7dd2cc9b4177f2afaf1746" +dependencies = [ + "twox-hash", +] + +[[package]] +name = "matchers" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"d1525a2a28c7f4fa0fc98bb91ae755d1e2d1505079e05539e35bc876b5d65ae9" +dependencies = [ + "regex-automata", +] + +[[package]] +name = "matchit" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47e1ffaa40ddd1f3ed91f717a33c8c0ee23fff369e3aa8772b9605cc1d22f4c3" + +[[package]] +name = "matrixmultiply" +version = "0.3.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a06de3016e9fae57a36fd14dba131fccf49f74b40b7fbdb472f96e361ec71a08" +dependencies = [ + "autocfg", + "num_cpus", + "once_cell", + "rawpointer", + "thread-tree", +] + +[[package]] +name = "md-5" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d89e7ee0cfbedfc4da3340218492196241d89eefb6dab27de5df917a6d2e78cf" +dependencies = [ + "cfg-if", + "digest", +] + +[[package]] +name = "measure_time" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "51c55d61e72fc3ab704396c5fa16f4c184db37978ae4e94ca8959693a235fc0e" +dependencies = [ + "log", +] + +[[package]] +name = "memchr" +version = "2.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8ca58f447f06ed17d5fc4043ce1b10dd205e060fb3ce5b979b8ed8e59ff3f79" + +[[package]] +name = "memmap2" +version = "0.9.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "714098028fe011992e1c3962653c96b2d578c4b4bce9036e15ff220319b1e0e3" +dependencies = [ + "libc", +] + +[[package]] +name = "miette" +version = "7.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5f98efec8807c63c752b5bd61f862c165c115b0a35685bdcfd9238c7aeb592b7" +dependencies = [ + "cfg-if", + "miette-derive", + "serde", + "unicode-width 0.1.14", +] + +[[package]] +name = "miette-derive" +version = "7.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db5b29714e950dbb20d5e6f74f9dcec4edbcc1067bb7f8ed198c097b8c1a818b" 
+dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.115", +] + +[[package]] +name = "mime" +version = "0.3.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" + +[[package]] +name = "mime_guess" +version = "2.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f7c44f8e672c00fe5308fa235f821cb4198414e1c77935c1ab6948d3fd78550e" +dependencies = [ + "mime", + "unicase", +] + +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + +[[package]] +name = "miniz_oxide" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" +dependencies = [ + "adler2", + "simd-adler32", +] + +[[package]] +name = "mio" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a69bcab0ad47271a0234d9422b131806bf3968021e5dc9328caf2d4cd58557fc" +dependencies = [ + "libc", + "wasi", + "windows-sys 0.61.2", +] + +[[package]] +name = "mock_instant" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dce6dd36094cac388f119d2e9dc82dc730ef91c32a6222170d630e5414b956e6" + +[[package]] +name = "moka" +version = "0.12.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "957228ad12042ee839f93c8f257b62b4c0ab5eaae1d4fa60de53b27c9d7c5046" +dependencies = [ + "async-lock", + "crossbeam-channel", + "crossbeam-epoch", + "crossbeam-utils", + "equivalent", + "event-listener", + "futures-util", + "parking_lot", + "portable-atomic", + "smallvec", + "tagptr", + "uuid", +] + +[[package]] +name = "multimap" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"1d87ecb2933e8aeadb3e3a02b828fed80a7528047e68b4f424523a0981a3a084" + +[[package]] +name = "murmurhash32" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2195bf6aa996a481483b29d62a7663eed3fe39600c460e323f8ff41e90bdd89b" + +[[package]] +name = "ndarray" +version = "0.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "882ed72dce9365842bf196bdeedf5055305f11fc8c03dee7bb0194a6cad34841" +dependencies = [ + "matrixmultiply", + "num-complex", + "num-integer", + "num-traits", + "portable-atomic", + "portable-atomic-util", + "rawpointer", +] + +[[package]] +name = "new_debug_unreachable" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086" + +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + +[[package]] +name = "nom" +version = "8.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df9761775871bdef83bee530e60050f7e54b1105350d6884eb0fb4f46c2f9405" +dependencies = [ + "memchr", +] + +[[package]] +name = "nonempty" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9737e026353e5cd0736f98eddae28665118eb6f6600902a7f50db585621fecb6" +dependencies = [ + "serde", +] + +[[package]] +name = "normalize-line-endings" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61807f77802ff30975e01f4f071c8ba10c022052f98b3294119f3e615d13e5be" + +[[package]] +name = "nu-ansi-term" +version = "0.50.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" +dependencies = [ + "windows-sys 0.61.2", +] + 
+[[package]] +name = "num-bigint" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9" +dependencies = [ + "num-integer", + "num-traits", +] + +[[package]] +name = "num-bigint-dig" +version = "0.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e661dda6640fad38e827a6d4a310ff4763082116fe217f279885c97f511bb0b7" +dependencies = [ + "lazy_static", + "libm", + "num-integer", + "num-iter", + "num-traits", + "rand 0.8.5", + "smallvec", + "zeroize", +] + +[[package]] +name = "num-complex" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-conv" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf97ec579c3c42f953ef76dbf8d55ac91fb219dde70e49aa4a6b7d74e9919050" + +[[package]] +name = "num-integer" +version = "0.1.46" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f" +dependencies = [ + "num-traits", +] + +[[package]] +name = "num-iter" +version = "0.1.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1429034a0490724d0075ebb2bc9e875d6503c3cf69e235a8941aa757d83ef5bf" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", + "libm", +] + +[[package]] +name = "num_cpus" +version = "1.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91df4bbde75afed763b708b7eee1e8e7651e02d97f6d5dd763e89367e957b23b" +dependencies = [ + 
"hermit-abi", + "libc", +] + +[[package]] +name = "object" +version = "0.37.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff76201f031d8863c38aa7f905eca4f53abbfa15f609db4277d44cd8938f33fe" +dependencies = [ + "memchr", +] + +[[package]] +name = "object_store" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fbfbfff40aeccab00ec8a910b57ca8ecf4319b335c542f2edcd19dd25a1e2a00" +dependencies = [ + "async-trait", + "base64", + "bytes", + "chrono", + "form_urlencoded", + "futures", + "http 1.4.0", + "http-body-util", + "httparse", + "humantime", + "hyper", + "itertools 0.14.0", + "md-5", + "parking_lot", + "percent-encoding", + "quick-xml 0.38.4", + "rand 0.9.2", + "reqwest", + "ring", + "rustls-pemfile", + "serde", + "serde_json", + "serde_urlencoded", + "thiserror", + "tokio", + "tracing", + "url", + "walkdir", + "wasm-bindgen-futures", + "web-time", +] + +[[package]] +name = "object_store_opendal" +version = "0.55.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "113ab0769e972eee585e57407b98de08bda5354fa28e8ba4d89038d6cb6a8991" +dependencies = [ + "async-trait", + "bytes", + "chrono", + "futures", + "object_store", + "opendal", + "pin-project", + "tokio", +] + +[[package]] +name = "omnigraph" +version = "0.4.0" +dependencies = [ + "arrow-array", + "arrow-cast", + "arrow-ord", + "arrow-schema", + "arrow-select", + "async-trait", + "base64", + "fail", + "futures", + "lance", + "lance-datafusion", + "lance-file", + "lance-index", + "lance-linalg", + "lance-namespace", + "lance-namespace-impls", + "lance-table", + "object_store", + "omnigraph-compiler", + "regex", + "reqwest", + "serde", + "serde_json", + "serial_test", + "tempfile", + "thiserror", + "time", + "tokio", + "tracing", + "ulid", + "url", +] + +[[package]] +name = "omnigraph-cli" +version = "0.4.0" +dependencies = [ + "assert_cmd", + "clap", + "color-eyre", + "omnigraph", + "omnigraph-compiler", + 
"omnigraph-server", + "predicates", + "reqwest", + "serde", + "serde_json", + "serde_yaml", + "tempfile", + "tokio", +] + +[[package]] +name = "omnigraph-compiler" +version = "0.4.0" +dependencies = [ + "ahash", + "arrow-array", + "arrow-cast", + "arrow-ipc", + "arrow-ord", + "arrow-schema", + "arrow-select", + "pest", + "pest_derive", + "reqwest", + "serde", + "serde_json", + "sha2", + "thiserror", + "tokio", +] + +[[package]] +name = "omnigraph-server" +version = "0.4.0" +dependencies = [ + "axum", + "cedar-policy", + "clap", + "color-eyre", + "omnigraph", + "omnigraph-compiler", + "serde", + "serde_json", + "serde_yaml", + "serial_test", + "tempfile", + "tokio", + "tower", + "tower-http", + "tracing", + "tracing-subscriber", +] + +[[package]] +name = "once_cell" +version = "1.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42f5e15c9953c5e4ccceeb2e7382a716482c34515315f7b03532b8b4e8393d2d" + +[[package]] +name = "once_cell_polyfill" +version = "1.70.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "384b8ab6d37215f3c5301a95a4accb5d64aa607f1fcb26a11b5303878451b4fe" + +[[package]] +name = "oneshot" +version = "0.1.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "269bca4c2591a28585d6bf10d9ed0332b7d76900a1b02bec41bdc3a2cdcda107" + +[[package]] +name = "opendal" +version = "0.55.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d075ab8a203a6ab4bc1bce0a4b9fe486a72bf8b939037f4b78d95386384bc80a" +dependencies = [ + "anyhow", + "backon", + "base64", + "bytes", + "crc32c", + "futures", + "getrandom 0.2.17", + "http 1.4.0", + "http-body 1.0.1", + "jiff", + "log", + "md-5", + "percent-encoding", + "quick-xml 0.38.4", + "reqsign", + "reqwest", + "serde", + "serde_json", + "sha2", + "tokio", + "url", + "uuid", +] + +[[package]] +name = "openssl-probe" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"7c87def4c32ab89d880effc9e097653c8da5d6ef28e6b539d313baaacfbafcbe" + +[[package]] +name = "option-ext" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" + +[[package]] +name = "ordered-float" +version = "5.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f4779c6901a562440c3786d08192c6fbda7c1c2060edd10006b05ee35d10f2d" +dependencies = [ + "num-traits", +] + +[[package]] +name = "ordered-multimap" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49203cdcae0030493bad186b28da2fa25645fa276a51b6fec8010d281e02ef79" +dependencies = [ + "dlv-list", + "hashbrown 0.14.5", +] + +[[package]] +name = "outref" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a80800c0488c3a21695ea981a54918fbb37abf04f4d0720c453632255e2ff0e" + +[[package]] +name = "ownedbytes" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2fbd56f7631767e61784dc43f8580f403f4475bd4aaa4da003e6295e1bab4a7e" +dependencies = [ + "stable_deref_trait", +] + +[[package]] +name = "owo-colors" +version = "4.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c6901729fa79e91a0913333229e9ca5dc725089d1c363b2f4b4760709dc4a52" + +[[package]] +name = "parking" +version = "2.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f38d5652c16fde515bb1ecef450ab0f6a219d619a7274976324d5e377f7dceba" + +[[package]] +name = "parking_lot" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93857453250e3077bd71ff98b6a65ea6621a19bb0f559a85248955ac12c45a1a" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.12" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-link", +] + +[[package]] +name = "paste" +version = "1.0.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" + +[[package]] +name = "path_abs" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05ef02f6342ac01d8a93b65f96db53fe68a92a15f41144f97fb00a9e669633c3" +dependencies = [ + "serde", + "serde_derive", + "std_prelude", + "stfu8", +] + +[[package]] +name = "pbkdf2" +version = "0.12.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8ed6a7761f76e3b9f92dfb0a60a6a6477c61024b775147ff0973a02653abaf2" +dependencies = [ + "digest", + "hmac", +] + +[[package]] +name = "pem" +version = "3.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d30c53c26bc5b31a98cd02d20f25a7c8567146caf63ed593a9d87b2775291be" +dependencies = [ + "base64", + "serde_core", +] + +[[package]] +name = "pem-rfc7468" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "88b39c9bfcfc231068454382784bb460aae594343fb030d46e9f50a645418412" +dependencies = [ + "base64ct", +] + +[[package]] +name = "percent-encoding" +version = "2.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" + +[[package]] +name = "permutation" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df202b0b0f5b8e389955afd5f27b007b00fb948162953f1db9c70d2c7e3157d7" + +[[package]] +name = "pest" +version = "2.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e0848c601009d37dfa3430c4666e147e49cdcf1b92ecd3e63657d8a5f19da662" +dependencies = [ + "memchr", + 
"ucd-trie", +] + +[[package]] +name = "pest_derive" +version = "2.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "11f486f1ea21e6c10ed15d5a7c77165d0ee443402f0780849d1768e7d9d6fe77" +dependencies = [ + "pest", + "pest_generator", +] + +[[package]] +name = "pest_generator" +version = "2.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8040c4647b13b210a963c1ed407c1ff4fdfa01c31d6d2a098218702e6664f94f" +dependencies = [ + "pest", + "pest_meta", + "proc-macro2", + "quote", + "syn 2.0.115", +] + +[[package]] +name = "pest_meta" +version = "2.8.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "89815c69d36021a140146f26659a81d6c2afa33d216d736dd4be5381a7362220" +dependencies = [ + "pest", + "sha2", +] + +[[package]] +name = "petgraph" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3672b37090dbd86368a4145bc067582552b29c27377cad4e0a306c97f9bd7772" +dependencies = [ + "fixedbitset", + "indexmap 2.13.0", +] + +[[package]] +name = "petgraph" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8701b58ea97060d5e5b155d383a69952a60943f0e6dfe30b04c287beb0b27455" +dependencies = [ + "fixedbitset", + "hashbrown 0.15.5", + "indexmap 2.13.0", + "serde", +] + +[[package]] +name = "phf" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "913273894cec178f401a31ec4b656318d95473527be05c0752cc41cdc32be8b7" +dependencies = [ + "phf_shared 0.12.1", +] + +[[package]] +name = "phf_shared" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5" +dependencies = [ + "siphasher", +] + +[[package]] +name = "phf_shared" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"06005508882fb681fd97892ecff4b7fd0fee13ef1aa569f8695dae7ab9099981" +dependencies = [ + "siphasher", +] + +[[package]] +name = "pico-args" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5be167a7af36ee22fe3115051bc51f6e6c7054c9348e28deb4f49bd6f705a315" + +[[package]] +name = "pin-project" +version = "1.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1749c7ed4bcaf4c3d0a3efc28538844fb29bcdd7d2b67b2be7e20ba861ff517" +dependencies = [ + "pin-project-internal", +] + +[[package]] +name = "pin-project-internal" +version = "1.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9b20ed30f105399776b9c883e68e536ef602a16ae6f596d2c473591d6ad64c6" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.115", +] + +[[package]] +name = "pin-project-lite" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b3cff922bd51709b605d9ead9aa71031d81447142d828eb4a6eba76fe619f9b" + +[[package]] +name = "pin-utils" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" + +[[package]] +name = "pkcs1" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8ffb9f10fa047879315e6625af03c164b16962a5368d724ed16323b68ace47f" +dependencies = [ + "der", + "pkcs8", + "spki", +] + +[[package]] +name = "pkcs5" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e847e2c91a18bfa887dd028ec33f2fe6f25db77db3619024764914affe8b69a6" +dependencies = [ + "aes", + "cbc", + "der", + "pbkdf2", + "scrypt", + "sha2", + "spki", +] + +[[package]] +name = "pkcs8" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f950b2377845cebe5cf8b5165cb3cc1a5e0fa5cfa3e1f7f55707d8fd82e0a7b7" +dependencies = [ + "der", + 
"pkcs5", + "rand_core 0.6.4", + "spki", +] + +[[package]] +name = "pkg-config" +version = "0.3.32" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7edddbd0b52d732b21ad9a5fab5c704c14cd949e5e9a1ec5929a24fded1b904c" + +[[package]] +name = "portable-atomic" +version = "1.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c33a9471896f1c69cecef8d20cbe2f7accd12527ce60845ff44c153bb2a21b49" + +[[package]] +name = "portable-atomic-util" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "091397be61a01d4be58e7841595bd4bfedb15f1cd54977d79b8271e94ed799a3" +dependencies = [ + "portable-atomic", +] + +[[package]] +name = "potential_utf" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b73949432f5e2a09657003c25bca5e19a0e9c84f8058ca374f49e0ebe605af77" +dependencies = [ + "zerovec", +] + +[[package]] +name = "powerfmt" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" + +[[package]] +name = "ppv-lite86" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85eae3c4ed2f50dcfe72643da4befc30deadb458a9b590d720cde2f2b1e97da9" +dependencies = [ + "zerocopy", +] + +[[package]] +name = "precomputed-hash" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" + +[[package]] +name = "predicates" +version = "3.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ada8f2932f28a27ee7b70dd6c1c39ea0675c55a36879ab92f3a715eaa1e63cfe" +dependencies = [ + "anstyle", + "difflib", + "float-cmp", + "normalize-line-endings", + "predicates-core", + "regex", +] + +[[package]] +name = "predicates-core" +version = "1.0.10" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "cad38746f3166b4031b1a0d39ad9f954dd291e7854fcc0eed52ee41a0b50d144" + +[[package]] +name = "predicates-tree" +version = "1.0.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d0de1b847b39c8131db0467e9df1ff60e6d0562ab8e9a16e568ad0fdb372e2f2" +dependencies = [ + "predicates-core", + "termtree", +] + +[[package]] +name = "pretty" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d22152487193190344590e4f30e219cf3fe140d9e7a3fdb683d82aa2c5f4156" +dependencies = [ + "arrayvec 0.5.2", + "typed-arena", + "unicode-width 0.2.2", +] + +[[package]] +name = "prettyplease" +version = "0.2.37" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "479ca8adacdd7ce8f1fb39ce9ecccbfe93a3f1344b3d0d97f20bc0196208f62b" +dependencies = [ + "proc-macro2", + "syn 2.0.115", +] + +[[package]] +name = "proc-macro2" +version = "1.0.106" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fd00f0bb2e90d81d1044c2b32617f68fcb9fa3bb7640c23e9c748e53fb30934" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "prost" +version = "0.14.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d2ea70524a2f82d518bce41317d0fae74151505651af45faf1ffbd6fd33f0568" +dependencies = [ + "bytes", + "prost-derive", +] + +[[package]] +name = "prost-build" +version = "0.14.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "343d3bd7056eda839b03204e68deff7d1b13aba7af2b2fd16890697274262ee7" +dependencies = [ + "heck", + "itertools 0.14.0", + "log", + "multimap", + "petgraph 0.8.3", + "prettyplease", + "prost", + "prost-types", + "regex", + "syn 2.0.115", + "tempfile", +] + +[[package]] +name = "prost-derive" +version = "0.14.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"27c6023962132f4b30eb4c172c91ce92d933da334c59c23cddee82358ddafb0b" +dependencies = [ + "anyhow", + "itertools 0.14.0", + "proc-macro2", + "quote", + "syn 2.0.115", +] + +[[package]] +name = "prost-types" +version = "0.14.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8991c4cbdb8bc5b11f0b074ffe286c30e523de90fee5ba8132f1399f23cb3dd7" +dependencies = [ + "prost", +] + +[[package]] +name = "psm" +version = "0.1.30" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3852766467df634d74f0b2d7819bf8dc483a0eb2e3b0f50f756f9cfe8b0d18d8" +dependencies = [ + "ar_archive_writer", + "cc", +] + +[[package]] +name = "quick-xml" +version = "0.37.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "331e97a1af0bf59823e6eadffe373d7b27f485be8748f71471c662c1f269b7fb" +dependencies = [ + "memchr", + "serde", +] + +[[package]] +name = "quick-xml" +version = "0.38.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b66c2058c55a409d601666cffe35f04333cf1013010882cec174a7467cd4e21c" +dependencies = [ + "memchr", + "serde", +] + +[[package]] +name = "quinn" +version = "0.11.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e20a958963c291dc322d98411f541009df2ced7b5a4f2bd52337638cfccf20" +dependencies = [ + "bytes", + "cfg_aliases", + "pin-project-lite", + "quinn-proto", + "quinn-udp", + "rustc-hash", + "rustls", + "socket2", + "thiserror", + "tokio", + "tracing", + "web-time", +] + +[[package]] +name = "quinn-proto" +version = "0.11.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1906b49b0c3bc04b5fe5d86a77925ae6524a19b816ae38ce1e426255f1d8a31" +dependencies = [ + "bytes", + "getrandom 0.3.4", + "lru-slab", + "rand 0.9.2", + "ring", + "rustc-hash", + "rustls", + "rustls-pki-types", + "slab", + "thiserror", + "tinyvec", + "tracing", + "web-time", +] + +[[package]] +name = "quinn-udp" +version = "0.5.14" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "addec6a0dcad8a8d96a771f815f0eaf55f9d1805756410b39f5fa81332574cbd" +dependencies = [ + "cfg_aliases", + "libc", + "once_cell", + "socket2", + "tracing", + "windows-sys 0.60.2", +] + +[[package]] +name = "quote" +version = "1.0.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "21b2ebcf727b7760c461f091f9f0f539b77b8e87f2fd88131e7f1b433b3cece4" +dependencies = [ + "proc-macro2", +] + +[[package]] +name = "r-efi" +version = "5.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" + +[[package]] +name = "r-efi" +version = "6.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f8dcc9c7d52a811697d2151c701e0d08956f92b0e24136cf4cf27b57a6a0d9bf" + +[[package]] +name = "radium" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc33ff2d4973d518d823d61aa239014831e521c75da58e3df4840d3f47749d09" + +[[package]] +name = "rand" +version = "0.8.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" +dependencies = [ + "libc", + "rand_chacha 0.3.1", + "rand_core 0.6.4", +] + +[[package]] +name = "rand" +version = "0.9.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6db2770f06117d490610c7488547d543617b21bfa07796d7a12f6f1bd53850d1" +dependencies = [ + "rand_chacha 0.9.0", + "rand_core 0.9.5", +] + +[[package]] +name = "rand_chacha" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" +dependencies = [ + "ppv-lite86", + "rand_core 0.6.4", +] + +[[package]] +name = "rand_chacha" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" +dependencies = [ + "ppv-lite86", + "rand_core 0.9.5", +] + +[[package]] +name = "rand_core" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" +dependencies = [ + "getrandom 0.2.17", +] + +[[package]] +name = "rand_core" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c" +dependencies = [ + "getrandom 0.3.4", +] + +[[package]] +name = "rand_distr" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32cb0b9bc82b0a0876c2dd994a7e7a2683d3e7390ca40e6886785ef0c7e3ee31" +dependencies = [ + "num-traits", + "rand 0.8.5", +] + +[[package]] +name = "rand_distr" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a8615d50dcf34fa31f7ab52692afec947c4dd0ab803cc87cb3b0b4570ff7463" +dependencies = [ + "num-traits", + "rand 0.9.2", +] + +[[package]] +name = "rand_xoshiro" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f703f4665700daf5512dcca5f43afa6af89f09db47fb56be587f80636bda2d41" +dependencies = [ + "rand_core 0.9.5", +] + +[[package]] +name = "random_word" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e47a395bdb55442b883c89062d6bcff25dc90fa5f8369af81e0ac6d49d78cf81" +dependencies = [ + "ahash", + "brotli", + "paste", + "rand 0.9.2", + "unicase", +] + +[[package]] +name = "rangemap" +version = "1.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "973443cf09a9c8656b574a866ab68dfa19f0867d0340648c7d2f6a71b8a8ea68" + +[[package]] +name = "rawpointer" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"60a357793950651c4ed0f3f52338f53b2f809f32d83a07f72909fa13e4c6c1e3" + +[[package]] +name = "rayon" +version = "1.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "368f01d005bf8fd9b1206fb6fa653e6c4a81ceb1466406b81792d87c5677a58f" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22e18b0f0062d30d4230b2e85ff77fdfe4326feb054b9783a3460d8435c8ab91" +dependencies = [ + "crossbeam-deque", + "crossbeam-utils", +] + +[[package]] +name = "redox_syscall" +version = "0.5.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" +dependencies = [ + "bitflags", +] + +[[package]] +name = "redox_users" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4e608c6638b9c18977b00b475ac1f28d14e84b27d8d42f70e0bf1e3dec127ac" +dependencies = [ + "getrandom 0.2.17", + "libredox", + "thiserror", +] + +[[package]] +name = "ref-cast" +version = "1.0.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f354300ae66f76f1c85c5f84693f0ce81d747e2c3f21a45fef496d89c960bf7d" +dependencies = [ + "ref-cast-impl", +] + +[[package]] +name = "ref-cast-impl" +version = "1.0.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7186006dcb21920990093f30e3dea63b7d6e977bf1256be20c3563a5db070da" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.115", +] + +[[package]] +name = "regex" +version = "1.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + +[[package]] +name = "regex-automata" +version = "0.4.14" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e1dd4122fc1595e8162618945476892eefca7b88c52820e74af6262213cae8f" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] + +[[package]] +name = "regex-lite" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cab834c73d247e67f4fae452806d17d3c7501756d98c8808d7c9c7aa7d18f973" + +[[package]] +name = "regex-syntax" +version = "0.8.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" + +[[package]] +name = "reqsign" +version = "0.16.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "43451dbf3590a7590684c25fb8d12ecdcc90ed3ac123433e500447c7d77ed701" +dependencies = [ + "anyhow", + "async-trait", + "base64", + "chrono", + "form_urlencoded", + "getrandom 0.2.17", + "hex", + "hmac", + "home", + "http 1.4.0", + "jsonwebtoken", + "log", + "once_cell", + "percent-encoding", + "quick-xml 0.37.5", + "rand 0.8.5", + "reqwest", + "rsa", + "rust-ini", + "serde", + "serde_json", + "sha1", + "sha2", + "tokio", +] + +[[package]] +name = "reqwest" +version = "0.12.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eddd3ca559203180a307f12d114c268abf583f59b03cb906fd0b3ff8646c1147" +dependencies = [ + "base64", + "bytes", + "encoding_rs", + "futures-channel", + "futures-core", + "futures-util", + "h2", + "http 1.4.0", + "http-body 1.0.1", + "http-body-util", + "hyper", + "hyper-rustls", + "hyper-util", + "js-sys", + "log", + "mime", + "mime_guess", + "percent-encoding", + "pin-project-lite", + "quinn", + "rustls", + "rustls-native-certs", + "rustls-pki-types", + "serde", + "serde_json", + "serde_urlencoded", + "sync_wrapper", + "tokio", + "tokio-rustls", + "tokio-util", + "tower", + "tower-http", + "tower-service", + "url", + "wasm-bindgen", + "wasm-bindgen-futures", + "wasm-streams", + "web-sys", + 
"webpki-roots", +] + +[[package]] +name = "ring" +version = "0.17.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7" +dependencies = [ + "cc", + "cfg-if", + "getrandom 0.2.17", + "libc", + "untrusted", + "windows-sys 0.52.0", +] + +[[package]] +name = "roaring" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ba9ce64a8f45d7fc86358410bb1a82e8c987504c0d4900e9141d69a9f26c885" +dependencies = [ + "bytemuck", + "byteorder", +] + +[[package]] +name = "rsa" +version = "0.9.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8573f03f5883dcaebdfcf4725caa1ecb9c15b2ef50c43a07b816e06799bb12d" +dependencies = [ + "const-oid", + "digest", + "num-bigint-dig", + "num-integer", + "num-traits", + "pkcs1", + "pkcs8", + "rand_core 0.6.4", + "sha2", + "signature", + "spki", + "subtle", + "zeroize", +] + +[[package]] +name = "rust-ini" +version = "0.21.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "796e8d2b6696392a43bea58116b667fb4c29727dc5abd27d6acf338bb4f688c7" +dependencies = [ + "cfg-if", + "ordered-multimap", +] + +[[package]] +name = "rust-stemmers" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e46a2036019fdb888131db7a4c847a1063a7493f971ed94ea82c67eada63ca54" +dependencies = [ + "serde", + "serde_derive", +] + +[[package]] +name = "rustc-demangle" +version = "0.1.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b50b8869d9fc858ce7266cce0194bd74df58b9d0e3f6df3a9fc8eb470d95c09d" + +[[package]] +name = "rustc-hash" +version = "2.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "357703d41365b4b27c590e3ed91eabb1b663f07c4c084095e60cbed4362dff0d" + +[[package]] +name = "rustc-literal-escaper" +version = "0.0.7" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "8be87abb9e40db7466e0681dc8ecd9dcfd40360cb10b4c8fe24a7c4c3669b198" + +[[package]] +name = "rustc_version" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfcb3a22ef46e85b45de6ee7e79d063319ebb6594faafcf1c225ea92ab6e9b92" +dependencies = [ + "semver", +] + +[[package]] +name = "rustix" +version = "0.38.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys 0.4.15", + "windows-sys 0.59.0", +] + +[[package]] +name = "rustix" +version = "1.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6fe4565b9518b83ef4f91bb47ce29620ca828bd32cb7e408f0062e9930ba190" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys 0.12.1", + "windows-sys 0.61.2", +] + +[[package]] +name = "rustls" +version = "0.23.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c665f33d38cea657d9614f766881e4d510e0eda4239891eea56b4cadcf01801b" +dependencies = [ + "aws-lc-rs", + "once_cell", + "ring", + "rustls-pki-types", + "rustls-webpki", + "subtle", + "zeroize", +] + +[[package]] +name = "rustls-native-certs" +version = "0.8.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "612460d5f7bea540c490b2b6395d8e34a953e52b491accd6c86c8164c5932a63" +dependencies = [ + "openssl-probe", + "rustls-pki-types", + "schannel", + "security-framework", +] + +[[package]] +name = "rustls-pemfile" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dce314e5fee3f39953d46bb63bb8a46d40c2f8fb7cc5a3b6cab2bde9721d6e50" +dependencies = [ + "rustls-pki-types", +] + +[[package]] +name = "rustls-pki-types" +version = "1.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"be040f8b0a225e40375822a563fa9524378b9d63112f53e19ffff34df5d33fdd" +dependencies = [ + "web-time", + "zeroize", +] + +[[package]] +name = "rustls-webpki" +version = "0.103.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7df23109aa6c1567d1c575b9952556388da57401e4ace1d15f79eedad0d8f53" +dependencies = [ + "aws-lc-rs", + "ring", + "rustls-pki-types", + "untrusted", +] + +[[package]] +name = "rustversion" +version = "1.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b39cdef0fa800fc44525c84ccb54a029961a8215f9619753635a9c0d2538d46d" + +[[package]] +name = "ryu" +version = "1.0.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9774ba4a74de5f7b1c1451ed6cd5285a32eddb5cccb8cc655a4e50009e06477f" + +[[package]] +name = "salsa20" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97a22f5af31f73a954c10289c93e8a50cc23d971e80ee446f1f6f7137a088213" +dependencies = [ + "cipher", +] + +[[package]] +name = "same-file" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93fc1dc3aaa9bfed95e02e6eadabb4baf7e3078b0bd1b4d7b6b0b68378900502" +dependencies = [ + "winapi-util", +] + +[[package]] +name = "scc" +version = "2.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "46e6f046b7fef48e2660c57ed794263155d713de679057f2d0c169bfc6e756cc" +dependencies = [ + "sdd", +] + +[[package]] +name = "schannel" +version = "0.1.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91c1b7e4904c873ef0710c1f407dde2e6287de2bebc1bbbf7d430bb7cbffd939" +dependencies = [ + "windows-sys 0.61.2", +] + +[[package]] +name = "schemars" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4cd191f9397d57d581cddd31014772520aa448f65ef991055d7f61582c65165f" +dependencies = [ + "dyn-clone", + "ref-cast", + "serde", + 
"serde_json", +] + +[[package]] +name = "schemars" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2b42f36aa1cd011945615b92222f6bf73c599a102a300334cd7f8dbeec726cc" +dependencies = [ + "dyn-clone", + "ref-cast", + "serde", + "serde_json", +] + +[[package]] +name = "scoped-tls" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e1cf6437eb19a8f4a6cc0f7dca544973b0b78843adbfeb3683d1a94a0024a294" + +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + +[[package]] +name = "scrypt" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0516a385866c09368f0b5bcd1caff3366aace790fcd46e2bb032697bb172fd1f" +dependencies = [ + "pbkdf2", + "salsa20", + "sha2", +] + +[[package]] +name = "sdd" +version = "3.0.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "490dcfcbfef26be6800d11870ff2df8774fa6e86d047e3e8c8a76b25655e41ca" + +[[package]] +name = "security-framework" +version = "3.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7f4bc775c73d9a02cde8bf7b2ec4c9d12743edf609006c7facc23998404cd1d" +dependencies = [ + "bitflags", + "core-foundation", + "core-foundation-sys", + "libc", + "security-framework-sys", +] + +[[package]] +name = "security-framework-sys" +version = "2.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ce2691df843ecc5d231c0b14ece2acc3efb62c0a398c7e1d875f3983ce020e3" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "semver" +version = "1.0.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d767eb0aabc880b29956c35734170f26ed551a859dbd361d140cdbeca61ab1e2" + +[[package]] +name = "seq-macro" +version = "0.3.6" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "1bc711410fbe7399f390ca1c3b60ad0f53f80e95c5eb935e52268a0e2cd49acc" + +[[package]] +name = "serde" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a8e94ea7f378bd32cbbd37198a4a91436180c5bb472411e48b5ec2e2124ae9e" +dependencies = [ + "serde_core", + "serde_derive", +] + +[[package]] +name = "serde_core" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "41d385c7d4ca58e59fc732af25c3983b67ac852c1a25000afe1175de458b67ad" +dependencies = [ + "serde_derive", +] + +[[package]] +name = "serde_derive" +version = "1.0.228" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d540f220d3187173da220f885ab66608367b6574e925011a9353e4badda91d79" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.115", +] + +[[package]] +name = "serde_json" +version = "1.0.149" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86" +dependencies = [ + "indexmap 2.13.0", + "itoa", + "memchr", + "serde", + "serde_core", + "zmij", +] + +[[package]] +name = "serde_path_to_error" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "10a9ff822e371bb5403e391ecd83e182e0e77ba7f6fe0160b795797109d1b457" +dependencies = [ + "itoa", + "serde", + "serde_core", +] + +[[package]] +name = "serde_repr" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "175ee3e80ae9982737ca543e96133087cbd9a485eecc3bc4de9c1a37b47ea59c" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.115", +] + +[[package]] +name = "serde_urlencoded" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3491c14715ca2294c4d6a88f15e84739788c1d030eed8c110436aafdaa2f3fd" +dependencies = [ + "form_urlencoded", + "itoa", + "ryu", 
+ "serde", +] + +[[package]] +name = "serde_with" +version = "3.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dd5414fad8e6907dbdd5bc441a50ae8d6e26151a03b1de04d89a5576de61d01f" +dependencies = [ + "base64", + "chrono", + "hex", + "indexmap 1.9.3", + "indexmap 2.13.0", + "schemars 0.9.0", + "schemars 1.2.1", + "serde_core", + "serde_json", + "serde_with_macros", + "time", +] + +[[package]] +name = "serde_with_macros" +version = "3.18.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3db8978e608f1fe7357e211969fd9abdcae80bac1ba7a3369bb7eb6b404eb65" +dependencies = [ + "darling", + "proc-macro2", + "quote", + "syn 2.0.115", +] + +[[package]] +name = "serde_yaml" +version = "0.9.34+deprecated" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a8b1a1a2ebf674015cc02edccce75287f1a0130d394307b36743c2f5d504b47" +dependencies = [ + "indexmap 2.13.0", + "itoa", + "ryu", + "serde", + "unsafe-libyaml", +] + +[[package]] +name = "serial_test" +version = "3.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "911bd979bf1070a3f3aa7b691a3b3e9968f339ceeec89e08c280a8a22207a32f" +dependencies = [ + "futures-executor", + "futures-util", + "log", + "once_cell", + "parking_lot", + "scc", + "serial_test_derive", +] + +[[package]] +name = "serial_test_derive" +version = "3.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0a7d91949b85b0d2fb687445e448b40d322b6b3e4af6b44a29b21d9a5f33e6d9" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.115", +] + +[[package]] +name = "sha1" +version = "0.10.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "sha2" +version = "0.10.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum 
= "a7507d819769d01a365ab707794a4084392c824f54a7a6a7862f8c3d0892b283" +dependencies = [ + "cfg-if", + "cpufeatures", + "digest", +] + +[[package]] +name = "sha3" +version = "0.10.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "75872d278a8f37ef87fa0ddbda7802605cb18344497949862c0d4dcb291eba60" +dependencies = [ + "digest", + "keccak", +] + +[[package]] +name = "sharded-slab" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6" +dependencies = [ + "lazy_static", +] + +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + +[[package]] +name = "signal-hook-registry" +version = "1.4.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4db69cba1110affc0e9f7bcd48bbf87b3f4fc7c61fc9155afd4c469eb3d6c1b" +dependencies = [ + "errno", + "libc", +] + +[[package]] +name = "signature" +version = "2.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77549399552de45a898a580c1b41d445bf730df867cc44e6c0233bbc4b8329de" +dependencies = [ + "digest", + "rand_core 0.6.4", +] + +[[package]] +name = "simd-adler32" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e320a6c5ad31d271ad523dcf3ad13e2767ad8b1cb8f047f75a8aeaf8da139da2" + +[[package]] +name = "simdutf8" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e" + +[[package]] +name = "simple_asn1" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d585997b0ac10be3c5ee635f1bab02d512760d14b7c468801ac8a01d9ae5f1d" +dependencies = [ + "num-bigint", + "num-traits", + "thiserror", + "time", +] + 
+[[package]] +name = "siphasher" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2aa850e253778c88a04c3d7323b043aeda9d3e30d5971937c1855769763678e" + +[[package]] +name = "sketches-ddsketch" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c6f73aeb92d671e0cc4dca167e59b2deb6387c375391bc99ee743f326994a2b" +dependencies = [ + "serde", +] + +[[package]] +name = "slab" +version = "0.4.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c790de23124f9ab44544d7ac05d60440adc586479ce501c1d6d7da3cd8c9cf5" + +[[package]] +name = "smallvec" +version = "1.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" + +[[package]] +name = "smol_str" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4aaa7368fcf4852a4c2dd92df0cace6a71f2091ca0a23391ce7f3a31833f1523" +dependencies = [ + "borsh", + "serde_core", +] + +[[package]] +name = "snafu" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1d4bced6a69f90b2056c03dcff2c4737f98d6fb9e0853493996e1d253ca29c6" +dependencies = [ + "snafu-derive", +] + +[[package]] +name = "snafu-derive" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "54254b8531cafa275c5e096f62d48c81435d1015405a91198ddb11e967301d40" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "syn 2.0.115", +] + +[[package]] +name = "socket2" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86f4aa3ad99f2088c990dfa82d367e19cb29268ed67c574d10d0a4bfe71f07e0" +dependencies = [ + "libc", + "windows-sys 0.60.2", +] + +[[package]] +name = "spin" +version = "0.9.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67" + +[[package]] +name = "spki" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d91ed6c858b01f942cd56b37a94b3e0a1798290327d1236e4d9cf4eaca44d29d" +dependencies = [ + "base64ct", + "der", +] + +[[package]] +name = "sqlparser" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4591acadbcf52f0af60eafbb2c003232b2b4cd8de5f0e9437cb8b1b59046cc0f" +dependencies = [ + "log", + "sqlparser_derive", +] + +[[package]] +name = "sqlparser_derive" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da5fc6819faabb412da764b99d3b713bb55083c11e7e0c00144d386cd6a1939c" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.115", +] + +[[package]] +name = "stable_deref_trait" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" + +[[package]] +name = "stacker" +version = "0.1.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d74a23609d509411d10e2176dc2a4346e3b4aea2e7b1869f19fdedbc71c013" +dependencies = [ + "cc", + "cfg-if", + "libc", + "psm", + "windows-sys 0.59.0", +] + +[[package]] +name = "std_prelude" +version = "0.2.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8207e78455ffdf55661170876f88daf85356e4edd54e0a3dbc79586ca1e50cbe" + +[[package]] +name = "stfu8" +version = "0.2.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e51f1e89f093f99e7432c491c382b88a6860a5adbe6bf02574bf0a08efff1978" + +[[package]] +name = "string_cache" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf776ba3fa74f83bf4b63c3dcbbf82173db2632ed8452cb2d891d33f459de70f" +dependencies = [ + "new_debug_unreachable", + "parking_lot", + "phf_shared 
0.11.3", + "precomputed-hash", +] + +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + +[[package]] +name = "strum" +version = "0.26.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8fec0f0aef304996cf250b31b5a10dee7980c85da9d759361292b8bca5a18f06" +dependencies = [ + "strum_macros", +] + +[[package]] +name = "strum_macros" +version = "0.26.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" +dependencies = [ + "heck", + "proc-macro2", + "quote", + "rustversion", + "syn 2.0.115", +] + +[[package]] +name = "subtle" +version = "2.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13c2bddecc57b384dee18652358fb23172facb8a2c51ccc10d74c157bdea3292" + +[[package]] +name = "syn" +version = "1.0.109" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "syn" +version = "2.0.115" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e614ed320ac28113fa64972c4262d5dbc89deacdfd00c34a3e4cea073243c12" +dependencies = [ + "proc-macro2", + "quote", + "unicode-ident", +] + +[[package]] +name = "sync_wrapper" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0bf256ce5efdfa370213c1dabab5935a12e49f2c58d15e9eac2870d3b4f27263" +dependencies = [ + "futures-core", +] + +[[package]] +name = "synstructure" +version = "0.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "728a70f3dbaf5bab7f0c4b1ac8d7ae5ea60a4b5549c8a5914361c99147a709d2" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.115", +] + 
+[[package]] +name = "tagptr" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7b2093cf4c8eb1e67749a6762251bc9cd836b6fc171623bd0a9d324d37af2417" + +[[package]] +name = "tantivy" +version = "0.24.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64a966cb0e76e311f09cf18507c9af192f15d34886ee43d7ba7c7e3803660c43" +dependencies = [ + "aho-corasick", + "arc-swap", + "base64", + "bitpacking", + "bon", + "byteorder", + "census", + "crc32fast", + "crossbeam-channel", + "downcast-rs", + "fastdivide", + "fnv", + "fs4", + "htmlescape", + "hyperloglogplus", + "itertools 0.14.0", + "levenshtein_automata", + "log", + "lru", + "lz4_flex 0.11.6", + "measure_time", + "memmap2", + "once_cell", + "oneshot", + "rayon", + "regex", + "rust-stemmers", + "rustc-hash", + "serde", + "serde_json", + "sketches-ddsketch", + "smallvec", + "tantivy-bitpacker", + "tantivy-columnar", + "tantivy-common", + "tantivy-fst", + "tantivy-query-grammar", + "tantivy-stacker", + "tantivy-tokenizer-api", + "tempfile", + "thiserror", + "time", + "uuid", + "winapi", +] + +[[package]] +name = "tantivy-bitpacker" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1adc286a39e089ae9938935cd488d7d34f14502544a36607effd2239ff0e2494" +dependencies = [ + "bitpacking", +] + +[[package]] +name = "tantivy-columnar" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6300428e0c104c4f7db6f95b466a6f5c1b9aece094ec57cdd365337908dc7344" +dependencies = [ + "downcast-rs", + "fastdivide", + "itertools 0.14.0", + "serde", + "tantivy-bitpacker", + "tantivy-common", + "tantivy-sstable", + "tantivy-stacker", +] + +[[package]] +name = "tantivy-common" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e91b6ea6090ce03dc72c27d0619e77185d26cc3b20775966c346c6d4f7e99d7f" +dependencies = [ + "async-trait", + "byteorder", + 
"ownedbytes", + "serde", + "time", +] + +[[package]] +name = "tantivy-fst" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d60769b80ad7953d8a7b2c70cdfe722bbcdcac6bccc8ac934c40c034d866fc18" +dependencies = [ + "byteorder", + "regex-syntax", + "utf8-ranges", +] + +[[package]] +name = "tantivy-query-grammar" +version = "0.24.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e810cdeeebca57fc3f7bfec5f85fdbea9031b2ac9b990eb5ff49b371d52bbe6a" +dependencies = [ + "nom 7.1.3", + "serde", + "serde_json", +] + +[[package]] +name = "tantivy-sstable" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "709f22c08a4c90e1b36711c1c6cad5ae21b20b093e535b69b18783dd2cb99416" +dependencies = [ + "futures-util", + "itertools 0.14.0", + "tantivy-bitpacker", + "tantivy-common", + "tantivy-fst", + "zstd", +] + +[[package]] +name = "tantivy-stacker" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2bcdebb267671311d1e8891fd9d1301803fdb8ad21ba22e0a30d0cab49ba59c1" +dependencies = [ + "murmurhash32", + "rand_distr 0.4.3", + "tantivy-common", +] + +[[package]] +name = "tantivy-tokenizer-api" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dfa942fcee81e213e09715bbce8734ae2180070b97b33839a795ba1de201547d" +dependencies = [ + "serde", +] + +[[package]] +name = "tap" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" + +[[package]] +name = "tempfile" +version = "3.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32497e9a4c7b38532efcdebeef879707aa9f794296a4f0244f6f69e9bc8574bd" +dependencies = [ + "fastrand", + "getrandom 0.4.2", + "once_cell", + "rustix 1.1.4", + "windows-sys 0.61.2", +] + +[[package]] +name = "term" +version = 
"1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d8c27177b12a6399ffc08b98f76f7c9a1f4fe9fc967c784c5a071fa8d93cf7e1" +dependencies = [ + "windows-sys 0.61.2", +] + +[[package]] +name = "termtree" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f50febec83f5ee1df3015341d8bd429f2d1cc62bcba7ea2076759d315084683" + +[[package]] +name = "thiserror" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.115", +] + +[[package]] +name = "thread-tree" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ffbd370cb847953a25954d9f63e14824a36113f8c72eecf6eccef5dc4b45d630" +dependencies = [ + "crossbeam-channel", +] + +[[package]] +name = "thread_local" +version = "1.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f60246a4944f24f6e018aa17cdeffb7818b76356965d03b07d6a9886e8962185" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "time" +version = "0.3.47" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "743bd48c283afc0388f9b8827b976905fb217ad9e647fae3a379a9283c4def2c" +dependencies = [ + "deranged", + "itoa", + "num-conv", + "powerfmt", + "serde_core", + "time-core", + "time-macros", +] + +[[package]] +name = "time-core" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7694e1cfe791f8d31026952abf09c69ca6f6fa4e1a1229e18988f06a04a12dca" + +[[package]] +name = "time-macros" +version = "0.2.27" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "2e70e4c5a0e0a8a4823ad65dfe1a6930e4f4d756dcd9dd7939022b5e8c501215" +dependencies = [ + "num-conv", + "time-core", +] + +[[package]] +name = "tiny-keccak" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237" +dependencies = [ + "crunchy", +] + +[[package]] +name = "tinystr" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42d3e9c45c09de15d06dd8acf5f4e0e399e85927b7f00711024eb7ae10fa4869" +dependencies = [ + "displaydoc", + "zerovec", +] + +[[package]] +name = "tinyvec" +version = "1.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa5fdc3bce6191a1dbc8c02d5c8bffcf557bafa17c124c5264a458f1b0613fa" +dependencies = [ + "tinyvec_macros", +] + +[[package]] +name = "tinyvec_macros" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" + +[[package]] +name = "tokio" +version = "1.49.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72a2903cd7736441aac9df9d7688bd0ce48edccaadf181c3b90be801e81d3d86" +dependencies = [ + "bytes", + "libc", + "mio", + "parking_lot", + "pin-project-lite", + "signal-hook-registry", + "socket2", + "tokio-macros", + "windows-sys 0.61.2", +] + +[[package]] +name = "tokio-macros" +version = "2.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af407857209536a95c8e56f8231ef2c2e2aff839b22e07a1ffcbc617e9db9fa5" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.115", +] + +[[package]] +name = "tokio-rustls" +version = "0.26.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1729aa945f29d91ba541258c8df89027d5792d85a8841fb65e8bf0f4ede4ef61" +dependencies = [ + "rustls", + "tokio", +] + 
+[[package]] +name = "tokio-stream" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32da49809aab5c3bc678af03902d4ccddea2a87d028d86392a4b1560c6906c70" +dependencies = [ + "futures-core", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "tokio-util" +version = "0.7.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ae9cec805b01e8fc3fd2fe289f89149a9b66dd16786abd8b19cfa7b48cb0098" +dependencies = [ + "bytes", + "futures-core", + "futures-sink", + "pin-project-lite", + "tokio", +] + +[[package]] +name = "tower" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebe5ef63511595f1344e2d5cfa636d973292adc0eec1f0ad45fae9f0851ab1d4" +dependencies = [ + "futures-core", + "futures-util", + "pin-project-lite", + "sync_wrapper", + "tokio", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tower-http" +version = "0.6.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4e6559d53cc268e5031cd8429d05415bc4cb4aefc4aa5d6cc35fbf5b924a1f8" +dependencies = [ + "async-compression", + "bitflags", + "bytes", + "futures-core", + "futures-util", + "http 1.4.0", + "http-body 1.0.1", + "http-body-util", + "iri-string", + "pin-project-lite", + "tokio", + "tokio-util", + "tower", + "tower-layer", + "tower-service", + "tracing", +] + +[[package]] +name = "tower-layer" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e" + +[[package]] +name = "tower-service" +version = "0.3.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3" + +[[package]] +name = "tracing" +version = "0.1.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"63e71662fa4b2a2c3a26f570f037eb95bb1f85397f3cd8076caed2f026a6d100" +dependencies = [ + "log", + "pin-project-lite", + "tracing-attributes", + "tracing-core", +] + +[[package]] +name = "tracing-attributes" +version = "0.1.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7490cfa5ec963746568740651ac6781f701c9c5ea257c58e057f3ba8cf69e8da" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.115", +] + +[[package]] +name = "tracing-core" +version = "0.1.36" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db97caf9d906fbde555dd62fa95ddba9eecfd14cb388e4f491a66d74cd5fb79a" +dependencies = [ + "once_cell", + "valuable", +] + +[[package]] +name = "tracing-error" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b1581020d7a273442f5b45074a6a57d5757ad0a47dac0e9f0bd57b81936f3db" +dependencies = [ + "tracing", + "tracing-subscriber", +] + +[[package]] +name = "tracing-log" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3" +dependencies = [ + "log", + "once_cell", + "tracing-core", +] + +[[package]] +name = "tracing-subscriber" +version = "0.3.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f30143827ddab0d256fd843b7a66d164e9f271cfa0dde49142c5ca0ca291f1e" +dependencies = [ + "matchers", + "nu-ansi-term", + "once_cell", + "regex-automata", + "sharded-slab", + "smallvec", + "thread_local", + "tracing", + "tracing-core", + "tracing-log", +] + +[[package]] +name = "try-lock" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" + +[[package]] +name = "twox-hash" +version = "2.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"9ea3136b675547379c4bd395ca6b938e5ad3c3d20fad76e7fe85f9e0d011419c" +dependencies = [ + "rand 0.9.2", +] + +[[package]] +name = "typed-arena" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6af6ae20167a9ece4bcb41af5b80f8a1f1df981f6391189ce00fd257af04126a" + +[[package]] +name = "typenum" +version = "1.19.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "562d481066bde0658276a35467c4af00bdc6ee726305698a55b86e61d7ad82bb" + +[[package]] +name = "ucd-trie" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2896d95c02a80c6d6a5d6e953d479f5ddf2dfdb6a244441010e373ac0fb88971" + +[[package]] +name = "ulid" +version = "1.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "470dbf6591da1b39d43c14523b2b469c86879a53e8b758c8e090a470fe7b1fbe" +dependencies = [ + "rand 0.9.2", + "web-time", +] + +[[package]] +name = "unicase" +version = "2.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dbc4bc3a9f746d862c45cb89d705aa10f187bb96c76001afab07a0d35ce60142" + +[[package]] +name = "unicode-ident" +version = "1.0.23" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "537dd038a89878be9b64dd4bd1b260315c1bb94f4d784956b81e27a088d9a09e" + +[[package]] +name = "unicode-normalization" +version = "0.1.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fd4f6878c9cb28d874b009da9e8d183b5abc80117c40bbd187a1fde336be6e8" +dependencies = [ + "tinyvec", +] + +[[package]] +name = "unicode-script" +version = "0.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "383ad40bb927465ec0ce7720e033cb4ca06912855fc35db31b5755d0de75b1ee" + +[[package]] +name = "unicode-security" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"2e4ddba1535dd35ed8b61c52166b7155d7f4e4b8847cec6f48e71dc66d8b5e50" +dependencies = [ + "unicode-normalization", + "unicode-script", +] + +[[package]] +name = "unicode-segmentation" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" + +[[package]] +name = "unicode-width" +version = "0.1.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af" + +[[package]] +name = "unicode-width" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254" + +[[package]] +name = "unicode-xid" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" + +[[package]] +name = "unsafe-libyaml" +version = "0.2.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "673aac59facbab8a9007c7f6108d11f63b603f7cabff99fabf650fea5c32b861" + +[[package]] +name = "untrusted" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" + +[[package]] +name = "url" +version = "2.5.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff67a8a4397373c3ef660812acab3268222035010ab8680ec4215f38ba3d0eed" +dependencies = [ + "form_urlencoded", + "idna", + "percent-encoding", + "serde", +] + +[[package]] +name = "urlencoding" +version = "2.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "daf8dba3b7eb870caf1ddeed7bc9d2a049f3cfdfae7cb521b087cc33ae4c49da" + +[[package]] +name = "utf8-ranges" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"7fcfc827f90e53a02eaef5e535ee14266c1d569214c6aa70133a624d8a3164ba" + +[[package]] +name = "utf8_iter" +version = "1.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6c140620e7ffbb22c2dee59cafe6084a59b5ffc27a8859a5f0d494b5d52b6be" + +[[package]] +name = "utf8parse" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" + +[[package]] +name = "uuid" +version = "1.22.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a68d3c8f01c0cfa54a75291d83601161799e4a89a39e0929f4b0354d88757a37" +dependencies = [ + "getrandom 0.4.2", + "js-sys", + "serde_core", + "wasm-bindgen", +] + +[[package]] +name = "valuable" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba73ea9cf16a25df0c8caa16c51acb937d5712a8429db78a3ee29d5dcacd3a65" + +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + +[[package]] +name = "vsimd" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c3082ca00d5a5ef149bb8b555a72ae84c9c59f7250f013ac822ac2e49b19c64" + +[[package]] +name = "wait-timeout" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ac3b126d3914f9849036f826e054cbabdc8519970b8998ddaf3b5bd3c65f11" +dependencies = [ + "libc", +] + +[[package]] +name = "walkdir" +version = "2.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "29790946404f91d9c5d06f9874efddea1dc06c5efe94541a7d6863108e3a5e4b" +dependencies = [ + "same-file", + "winapi-util", +] + +[[package]] +name = "want" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"bfa7760aed19e106de2c7c0b581b509f2f25d3dacaf737cb82ac61bc6d760b0e" +dependencies = [ + "try-lock", +] + +[[package]] +name = "wasi" +version = "0.11.1+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" + +[[package]] +name = "wasip2" +version = "1.0.2+wasi-0.2.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9517f9239f02c069db75e65f174b3da828fe5f5b945c4dd26bd25d89c03ebcf5" +dependencies = [ + "wit-bindgen", +] + +[[package]] +name = "wasip3" +version = "0.4.0+wasi-0.3.0-rc-2026-01-06" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5428f8bf88ea5ddc08faddef2ac4a67e390b88186c703ce6dbd955e1c145aca5" +dependencies = [ + "wit-bindgen", +] + +[[package]] +name = "wasm-bindgen" +version = "0.2.108" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "64024a30ec1e37399cf85a7ffefebdb72205ca1c972291c51512360d90bd8566" +dependencies = [ + "cfg-if", + "once_cell", + "rustversion", + "wasm-bindgen-macro", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-futures" +version = "0.4.58" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "70a6e77fd0ae8029c9ea0063f87c46fde723e7d887703d74ad2616d792e51e6f" +dependencies = [ + "cfg-if", + "futures-util", + "js-sys", + "once_cell", + "wasm-bindgen", + "web-sys", +] + +[[package]] +name = "wasm-bindgen-macro" +version = "0.2.108" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "008b239d9c740232e71bd39e8ef6429d27097518b6b30bdf9086833bd5b6d608" +dependencies = [ + "quote", + "wasm-bindgen-macro-support", +] + +[[package]] +name = "wasm-bindgen-macro-support" +version = "0.2.108" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5256bae2d58f54820e6490f9839c49780dff84c65aeab9e772f15d5f0e913a55" +dependencies = [ + "bumpalo", + 
"proc-macro2", + "quote", + "syn 2.0.115", + "wasm-bindgen-shared", +] + +[[package]] +name = "wasm-bindgen-shared" +version = "0.2.108" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f01b580c9ac74c8d8f0c0e4afb04eeef2acf145458e52c03845ee9cd23e3d12" +dependencies = [ + "unicode-ident", +] + +[[package]] +name = "wasm-encoder" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "990065f2fe63003fe337b932cfb5e3b80e0b4d0f5ff650e6985b1048f62c8319" +dependencies = [ + "leb128fmt", + "wasmparser", +] + +[[package]] +name = "wasm-metadata" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bb0e353e6a2fbdc176932bbaab493762eb1255a7900fe0fea1a2f96c296cc909" +dependencies = [ + "anyhow", + "indexmap 2.13.0", + "wasm-encoder", + "wasmparser", +] + +[[package]] +name = "wasm-streams" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "15053d8d85c7eccdbefef60f06769760a563c7f0a9d6902a13d35c7800b0ad65" +dependencies = [ + "futures-util", + "js-sys", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", +] + +[[package]] +name = "wasmparser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47b807c72e1bac69382b3a6fb3dbe8ea4c0ed87ff5629b8685ae6b9a611028fe" +dependencies = [ + "bitflags", + "hashbrown 0.15.5", + "indexmap 2.13.0", + "semver", +] + +[[package]] +name = "web-sys" +version = "0.3.85" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "312e32e551d92129218ea9a2452120f4aabc03529ef03e4d0d82fb2780608598" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "web-time" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = 
"webpki-roots" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22cfaf3c063993ff62e73cb4311efde4db1efb31ab78a3e5c457939ad5cc0bed" +dependencies = [ + "rustls-pki-types", +] + +[[package]] +name = "winapi" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" +dependencies = [ + "winapi-i686-pc-windows-gnu", + "winapi-x86_64-pc-windows-gnu", +] + +[[package]] +name = "winapi-i686-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" + +[[package]] +name = "winapi-util" +version = "0.1.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" +dependencies = [ + "windows-sys 0.61.2", +] + +[[package]] +name = "winapi-x86_64-pc-windows-gnu" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" + +[[package]] +name = "windows-core" +version = "0.62.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8e83a14d34d0623b51dce9581199302a221863196a1dde71a7663a4c2be9deb" +dependencies = [ + "windows-implement", + "windows-interface", + "windows-link", + "windows-result", + "windows-strings", +] + +[[package]] +name = "windows-implement" +version = "0.60.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "053e2e040ab57b9dc951b72c264860db7eb3b0200ba345b4e4c3b14f67855ddf" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.115", +] + +[[package]] +name = "windows-interface" +version = "0.59.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f316c4a2570ba26bbec722032c4099d8c8bc095efccdc15688708623367e358" 
+dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.115", +] + +[[package]] +name = "windows-link" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" + +[[package]] +name = "windows-result" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7781fa89eaf60850ac3d2da7af8e5242a5ea78d1a11c49bf2910bb5a73853eb5" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-strings" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7837d08f69c77cf6b07689544538e017c1bfcf57e34b4c0ff58e6c2cd3b37091" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-sys" +version = "0.60.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2f500e4d28234f72040990ec9d39e3a6b950f9f22d3dba18416c35882612bcb" +dependencies = [ + "windows-targets 0.53.5", +] + +[[package]] +name = "windows-sys" +version = "0.61.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm 
0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", + "windows_i686_gnullvm 0.52.6", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", +] + +[[package]] +name = "windows-targets" +version = "0.53.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4945f9f551b88e0d65f3db0bc25c33b8acea4d9e41163edf90dcd0b19f9069f3" +dependencies = [ + "windows-link", + "windows_aarch64_gnullvm 0.53.1", + "windows_aarch64_msvc 0.53.1", + "windows_i686_gnu 0.53.1", + "windows_i686_gnullvm 0.53.1", + "windows_i686_msvc 0.53.1", + "windows_x86_64_gnu 0.53.1", + "windows_x86_64_gnullvm 0.53.1", + "windows_x86_64_msvc 0.53.1", +] + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a9d8416fa8b42f5c947f8482c43e7d89e73a173cead56d044f6a56104a6d1b53" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + +[[package]] +name = "windows_aarch64_msvc" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9d782e804c2f632e395708e99a94275910eb9100b2114651e04744e9b125006" + +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnu" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"960e6da069d81e09becb0ca57a65220ddff016ff2d6af6a223cf372a506593a3" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa7359d10048f68ab8b09fa71c3daccfb0e9b559aed648a8f95469c27057180c" + +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + +[[package]] +name = "windows_i686_msvc" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e7ac75179f18232fe9c285163565a57ef8d3c89254a30685b57d83a38d326c2" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + +[[package]] +name = "windows_x86_64_gnu" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9c3842cdd74a865a8066ab39c8a7a473c0778a3f29370b5fd6b4b9aa7df4a499" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ffa179e2d07eee8ad8f57493436566c7cc30ac536a3379fdf008f47f6bb7ae1" + +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + +[[package]] +name = 
"windows_x86_64_msvc" +version = "0.53.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d6bbff5f0aada427a1e5a6da5f1f98158182f26556f345ac9e04d36d0ebed650" + +[[package]] +name = "wit-bindgen" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d7249219f66ced02969388cf2bb044a09756a083d0fab1e566056b04d9fbcaa5" +dependencies = [ + "wit-bindgen-rust-macro", +] + +[[package]] +name = "wit-bindgen-core" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc" +dependencies = [ + "anyhow", + "heck", + "wit-parser", +] + +[[package]] +name = "wit-bindgen-rust" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" +dependencies = [ + "anyhow", + "heck", + "indexmap 2.13.0", + "prettyplease", + "syn 2.0.115", + "wasm-metadata", + "wit-bindgen-core", + "wit-component", +] + +[[package]] +name = "wit-bindgen-rust-macro" +version = "0.51.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c0f9bfd77e6a48eccf51359e3ae77140a7f50b1e2ebfe62422d8afdaffab17a" +dependencies = [ + "anyhow", + "prettyplease", + "proc-macro2", + "quote", + "syn 2.0.115", + "wit-bindgen-core", + "wit-bindgen-rust", +] + +[[package]] +name = "wit-component" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9d66ea20e9553b30172b5e831994e35fbde2d165325bec84fc43dbf6f4eb9cb2" +dependencies = [ + "anyhow", + "bitflags", + "indexmap 2.13.0", + "log", + "serde", + "serde_derive", + "serde_json", + "wasm-encoder", + "wasm-metadata", + "wasmparser", + "wit-parser", +] + +[[package]] +name = "wit-parser" +version = "0.244.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"ecc8ac4bc1dc3381b7f59c34f00b67e18f910c2c0f50015669dde7def656a736" +dependencies = [ + "anyhow", + "id-arena", + "indexmap 2.13.0", + "log", + "semver", + "serde", + "serde_derive", + "serde_json", + "unicode-xid", + "wasmparser", +] + +[[package]] +name = "writeable" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9edde0db4769d2dc68579893f2306b26c6ecfbe0ef499b013d731b7b9247e0b9" + +[[package]] +name = "wyz" +version = "0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "05f360fc0b24296329c78fda852a1e9ae82de9cf7b27dae4b7f62f118f77b9ed" +dependencies = [ + "tap", +] + +[[package]] +name = "xmlparser" +version = "0.13.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "66fee0b777b0f5ac1c69bb06d361268faafa61cd4682ae064a171c16c433e9e4" + +[[package]] +name = "xxhash-rust" +version = "0.8.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdd20c5420375476fbd4394763288da7eb0cc0b8c11deed431a91562af7335d3" + +[[package]] +name = "yoke" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72d6e5c6afb84d73944e5cedb052c4680d5657337201555f9f2a16b7406d4954" +dependencies = [ + "stable_deref_trait", + "yoke-derive", + "zerofrom", +] + +[[package]] +name = "yoke-derive" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b659052874eb698efe5b9e8cf382204678a0086ebf46982b79d6ca3182927e5d" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.115", + "synstructure", +] + +[[package]] +name = "zerocopy" +version = "0.8.39" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "db6d35d663eadb6c932438e763b262fe1a70987f9ae936e60158176d710cae4a" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.8.39" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"4122cd3169e94605190e77839c9a40d40ed048d305bfdc146e7df40ab0f3e517" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.115", +] + +[[package]] +name = "zerofrom" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "50cc42e0333e05660c3587f3bf9d0478688e15d870fab3346451ce7f8c9fbea5" +dependencies = [ + "zerofrom-derive", +] + +[[package]] +name = "zerofrom-derive" +version = "0.1.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d71e5d6e06ab090c67b5e44993ec16b72dcbaabc526db883a360057678b48502" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.115", + "synstructure", +] + +[[package]] +name = "zeroize" +version = "1.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b97154e67e32c85465826e8bcc1c59429aaaf107c1e4a9e53c8d8ccd5eff88d0" + +[[package]] +name = "zerotrie" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a59c17a5562d507e4b54960e8569ebee33bee890c70aa3fe7b97e85a9fd7851" +dependencies = [ + "displaydoc", + "yoke", + "zerofrom", +] + +[[package]] +name = "zerovec" +version = "0.11.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c28719294829477f525be0186d13efa9a3c602f7ec202ca9e353d310fb9a002" +dependencies = [ + "yoke", + "zerofrom", + "zerovec-derive", +] + +[[package]] +name = "zerovec-derive" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eadce39539ca5cb3985590102671f2567e659fca9666581ad3411d59207951f3" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.115", +] + +[[package]] +name = "zmij" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" + +[[package]] +name = "zstd" +version = "0.13.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"e91ee311a569c327171651566e07972200e76fcfe2242a4fa446149a3881c08a" +dependencies = [ + "zstd-safe", +] + +[[package]] +name = "zstd-safe" +version = "7.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8f49c4d5f0abb602a93fb8736af2a4f4dd9512e36f7f570d66e65ff867ed3b9d" +dependencies = [ + "zstd-sys", +] + +[[package]] +name = "zstd-sys" +version = "2.0.16+zstd.1.5.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e19ebc2adc8f83e43039e79776e3fda8ca919132d68a1fed6a5faca2683748" +dependencies = [ + "cc", + "pkg-config", +] diff --git a/Cargo.toml b/Cargo.toml new file mode 100644 index 0000000..91861ce --- /dev/null +++ b/Cargo.toml @@ -0,0 +1,79 @@ +[workspace] +resolver = "2" +members = [ + "crates/omnigraph-compiler", + "crates/omnigraph", + "crates/omnigraph-cli", + "crates/omnigraph-server", +] +default-members = [ + "crates/omnigraph", + "crates/omnigraph-cli", + "crates/omnigraph-server", +] + +[workspace.dependencies] +arrow-array = "57" +arrow-ipc = "57" +arrow-schema = "57" +arrow-select = "57" +arrow-cast = { version = "57", features = ["prettyprint"] } +arrow-ord = "57" + +datafusion-physical-plan = "52" +datafusion-physical-expr = "52" +datafusion-execution = "52" +datafusion-common = "52" +datafusion-expr = "52" +datafusion-functions-aggregate = "52" + +lance = { version = "4.0.0", default-features = false, features = ["aws"] } +lance-datafusion = "4.0.0" +lance-file = "4.0.0" +lance-index = "4.0.0" +lance-linalg = "4.0.0" +lance-namespace = "4.0.0" +lance-namespace-impls = "4.0.0" +lance-table = "4.0.0" + +ulid = "1" +futures = "0.3" +async-trait = "0.1" +pest = "2" +pest_derive = "2" +thiserror = "2" +tokio = { version = "1", features = ["rt-multi-thread", "macros", "time", "net", "signal", "sync"] } +clap = { version = "4", features = ["derive"] } +serde = { version = "1", features = ["derive"] } +serde_json = "1" +serde_yaml = "0.9" +tracing = "0.1" +tracing-subscriber = { version = 
"0.3", features = ["env-filter", "fmt"] } +tower = "0.5" +tower-http = { version = "0.6", features = ["trace"] } +color-eyre = "0.6" +tempfile = "3" +ahash = "0.8" +base64 = "0.22" +ariadne = "0.4" +regex = "1" +reqwest = { version = "0.12", default-features = false, features = ["json", "rustls-tls"] } +object_store = { version = "0.12.5", default-features = false, features = ["aws"] } +fail = "0.5" +time = { version = "0.3", features = ["formatting"] } +axum = { version = "0.8", features = ["json", "macros"] } +url = "2" +cedar-policy = "4.9" +sha2 = "0.10" + +[profile.dev] +debug = 0 + +[profile.dev.package."*"] +opt-level = 2 + +[profile.release] +opt-level = 2 +lto = "thin" +codegen-units = 16 +strip = true diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..67dd0eb --- /dev/null +++ b/Dockerfile @@ -0,0 +1,25 @@ +FROM debian:bookworm-slim + +RUN apt-get update \ + && apt-get install -y --no-install-recommends ca-certificates curl \ + && rm -rf /var/lib/apt/lists/* + +RUN groupadd --system omnigraph \ + && useradd --system --gid omnigraph --create-home --home-dir /var/lib/omnigraph omnigraph + +COPY target/release/omnigraph-server /usr/local/bin/omnigraph-server +COPY docker/entrypoint.sh /usr/local/bin/omnigraph-entrypoint + +RUN chmod 0755 /usr/local/bin/omnigraph-server /usr/local/bin/omnigraph-entrypoint + +ENV OMNIGRAPH_BIND=0.0.0.0:8080 + +WORKDIR /var/lib/omnigraph +USER omnigraph:omnigraph + +EXPOSE 8080 + +HEALTHCHECK --interval=30s --timeout=5s --start-period=10s --retries=3 \ + CMD curl -fsS http://127.0.0.1:8080/healthz >/dev/null || exit 1 + +ENTRYPOINT ["/usr/local/bin/omnigraph-entrypoint"] diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..03c5baf --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2026 NanoGraph Contributors + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in 
the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/README.md b/README.md new file mode 100644 index 0000000..70dcc82 --- /dev/null +++ b/README.md @@ -0,0 +1,154 @@ +# Omnigraph + +Omnigraph is a typed property graph database built on Lance. It combines +schema-first graph modeling, typed queries and mutations, Git-style graph +workflows, and storage that runs equally well on a local directory or an +`s3://` URI. + +## Quick Install + +```bash +curl -fsSL https://raw.githubusercontent.com/ModernRelay/omnigraph-public/main/scripts/install.sh | bash +``` + +This installs `omnigraph` and `omnigraph-server` into `~/.local/bin`. If no +tagged release exists for your platform yet, the installer falls back to a +source build. + +## One-Command Local RustFS Bootstrap + +```bash +curl -fsSL https://raw.githubusercontent.com/ModernRelay/omnigraph-public/main/scripts/local-rustfs-bootstrap.sh | bash +``` + +That bootstrap: + +- starts RustFS on `127.0.0.1:9000` +- creates a bucket and S3-backed repo +- loads the checked-in context fixture +- launches `omnigraph-server` on `127.0.0.1:8080` + +Docker must be installed and running first. 
+ +## Good Fit For + +- Team knowledge graphs and internal context graphs +- Research, decisions, and evidence tracking +- Collaborative knowledge systems with reviewable changes +- Private self-hosted graph backends for local or on-prem AI tooling + +## Why Omnigraph + +- Typed schema, typed queries, and typed mutations +- Git-style graph workflows: branches, commits, merges, and transactional runs +- Local-first and S3-native storage with snapshot-pinned reads +- Graph traversal plus text, fuzzy, BM25, vector, and RRF search in one runtime +- Policy as code for server-side access control + +## Quick Start + +From a checkout of this repo: + +```bash +cargo build --workspace + +cargo run -p omnigraph-cli -- init \ + --schema crates/omnigraph/tests/fixtures/test.pg \ + ./repo.omni + +cargo run -p omnigraph-cli -- load \ + --data crates/omnigraph/tests/fixtures/test.jsonl \ + ./repo.omni + +cargo run -p omnigraph-cli -- read \ + ./repo.omni \ + --query crates/omnigraph/tests/fixtures/test.gq \ + --name friends_of \ + --params '{"name":"Alice"}' +``` + +`init` also scaffolds an `omnigraph.yaml` next to the repo if one does not +already exist. + +## Run A Server + +Serve the same repo over HTTP: + +```bash +cargo run -p omnigraph-server -- ./repo.omni --bind 127.0.0.1:8080 +``` + +Then query it remotely: + +```bash +cargo run -p omnigraph-cli -- read \ + --target http://127.0.0.1:8080 \ + --query crates/omnigraph/tests/fixtures/test.gq \ + --name get_person \ + --params '{"name":"Alice"}' +``` + +Server routes include `/healthz`, `/snapshot`, `/export`, `/read`, `/change`, +`/ingest`, `/branches`, `/runs`, and `/commits`. + +To require auth, set `OMNIGRAPH_SERVER_BEARER_TOKEN` on the server and set the +matching bearer token env var in your CLI target config. 
+ +## Common Commands + +Core repo flow: + +```bash +omnigraph init --schema ./schema.pg ./repo.omni +omnigraph load --data ./data.jsonl --mode overwrite ./repo.omni +omnigraph snapshot ./repo.omni --branch main --json +omnigraph read ./repo.omni --query ./queries.gq --name get_person --params '{"name":"Alice"}' +omnigraph change ./repo.omni --query ./queries.gq --name insert_person --params '{"name":"Mina","age":28}' +omnigraph branch create --uri ./repo.omni --from main feature-x +omnigraph branch merge --uri ./repo.omni feature-x --into main +``` + +More CLI examples, config patterns, and admin commands live in +[docs/cli.md](docs/cli.md). + +## Production Features + +- Branches, commits, merge-base-aware graph merges, and transactional runs +- Snapshot-pinned reads across local and S3-backed repos +- Traversal plus text, fuzzy, BM25, vector, and RRF search +- Axum server for reads, changes, export, branches, commits, and runs +- Cedar-based server-side authorization + +## Docs + +- [Install guide](docs/install.md) +- [CLI guide](docs/cli.md) +- [Deployment guide](docs/deployment.md) + +## Build And Test + +```bash +cargo build --workspace +cargo check --workspace +cargo test --workspace +``` + +Notes: + +- Rust stable toolchain, edition 2024 +- CI runs `cargo test --workspace --locked` +- Full CI and some local test flows require `protobuf-compiler` +- S3 integration tests expect an S3-compatible endpoint such as RustFS + +## Workspace Crates + +- `crates/omnigraph-compiler`: shared schema/query parser, typechecker, catalog, and IR lowering +- `crates/omnigraph`: storage/runtime, branching, merge, change detection, and query execution +- `crates/omnigraph-cli`: CLI for init/load/ingest/read/change/branch/snapshot/export/policy operations +- `crates/omnigraph-server`: Axum HTTP server for remote reads, changes, ingest, export, branches, commits, and runs + +## Contributing + +Please open an issue, spec, or design discussion before sending large code +changes. 
Design feedback and concrete problem statements are the fastest way to +collaborate on the roadmap. diff --git a/SECURITY.md b/SECURITY.md new file mode 100644 index 0000000..bf94797 --- /dev/null +++ b/SECURITY.md @@ -0,0 +1,14 @@ +# Security Policy + +Please do not report security issues through public GitHub issues. + +If GitHub private vulnerability reporting is enabled for this repository, use +that channel. Otherwise, contact the maintainers directly through a private +channel before publishing details. + +When reporting an issue, include: + +- affected version or commit +- impact +- reproduction steps +- any proposed mitigation diff --git a/crates/omnigraph-cli/Cargo.toml b/crates/omnigraph-cli/Cargo.toml new file mode 100644 index 0000000..380fed2 --- /dev/null +++ b/crates/omnigraph-cli/Cargo.toml @@ -0,0 +1,28 @@ +[package] +name = "omnigraph-cli" +version = "0.4.0" +edition = "2024" +description = "CLI for the Omnigraph graph database." +license = "MIT" + +[[bin]] +name = "omnigraph" +path = "src/main.rs" + +[dependencies] +omnigraph = { path = "../omnigraph", version = "0.4.0" } +omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.4.0" } +omnigraph-server = { path = "../omnigraph-server", version = "0.4.0" } +clap = { workspace = true } +color-eyre = { workspace = true } +serde = { workspace = true } +serde_json = { workspace = true } +serde_yaml = { workspace = true } +tokio = { workspace = true } +reqwest = { workspace = true, features = ["blocking"] } + +[dev-dependencies] +assert_cmd = "2" +predicates = "3" +serde_json = { workspace = true } +tempfile = { workspace = true } diff --git a/crates/omnigraph-cli/src/embed.rs b/crates/omnigraph-cli/src/embed.rs new file mode 100644 index 0000000..2e1c6d9 --- /dev/null +++ b/crates/omnigraph-cli/src/embed.rs @@ -0,0 +1,586 @@ +use std::collections::{BTreeMap, HashSet}; +use std::fs::{self, File}; +use std::io::{BufRead, BufReader, BufWriter, Write}; +use std::path::{Path, PathBuf}; + +use 
clap::Args; +use color_eyre::eyre::{Result, bail, eyre}; +use omnigraph::embedding::EmbeddingClient; +use serde::{Deserialize, Serialize}; +use serde_json::{Map, Value, json}; + +const DEFAULT_EMBED_MODEL: &str = "gemini-embedding-2-preview"; + +#[derive(Debug, Args, Clone)] +pub(crate) struct EmbedArgs { + /// Seed manifest path + #[arg(long, conflicts_with_all = ["input", "output", "spec"])] + pub seed: Option, + /// Raw seed JSONL input path + #[arg(long, requires_all = ["output", "spec"], conflicts_with = "seed")] + pub input: Option, + /// Embedded JSONL output path + #[arg(long)] + pub output: Option, + /// Embedding spec JSON path + #[arg(long, requires_all = ["input", "output"], conflicts_with = "seed")] + pub spec: Option, + /// Remove embedding fields instead of generating embeddings + #[arg(long, conflicts_with = "reembed_all")] + pub clean: bool, + /// Regenerate embeddings for all matching rows + #[arg(long, conflicts_with = "clean")] + pub reembed_all: bool, + /// Restrict processing to these type names + #[arg(long = "type")] + pub types: Vec, + /// Reembed or clean matching rows only. 
Syntax: Type:field=value or field=value + #[arg(long = "select")] + pub selectors: Vec, + /// Print JSON summary + #[arg(long)] + pub json: bool, +} + +#[derive(Debug, Clone, Serialize)] +pub(crate) struct EmbedOutput { + pub input: String, + pub output: String, + pub rows: usize, + pub selected_rows: usize, + pub embedded_rows: usize, + pub cleaned_rows: usize, + pub mode: &'static str, + pub dimension: usize, + pub model: String, +} + +#[derive(Debug, Clone)] +pub(crate) struct EmbedJob { + input: PathBuf, + output: PathBuf, + spec: EmbedSpec, + mode: EmbedMode, + type_filter: HashSet, + selectors: Vec, +} + +#[derive(Debug, Clone, Copy)] +enum EmbedMode { + FillMissing, + ReembedAll, + Clean, +} + +impl EmbedMode { + fn as_str(self, selectors_present: bool) -> &'static str { + match self { + Self::FillMissing if selectors_present => "reembed_selected", + Self::FillMissing => "fill_missing", + Self::ReembedAll => "reembed_all", + Self::Clean => "clean", + } + } +} + +#[derive(Debug, Clone, Deserialize)] +struct EmbedSpec { + #[serde(default = "default_embed_model")] + model: String, + dimension: usize, + types: BTreeMap, +} + +#[derive(Debug, Clone, Deserialize)] +struct EmbedTypeSpec { + target: String, + fields: Vec, +} + +#[derive(Debug, Clone, Deserialize)] +struct SeedManifest { + #[serde(default)] + sources: Option, + #[serde(default)] + artifacts: Option, + #[serde(default)] + embeddings: Option, + #[serde(default)] + seed: Option, +} + +#[derive(Debug, Clone, Deserialize)] +struct SeedSources { + raw_seed: PathBuf, +} + +#[derive(Debug, Clone, Deserialize)] +struct SeedArtifacts { + embedded_seed: PathBuf, +} + +#[derive(Debug, Clone, Deserialize)] +struct LegacySeed { + data: PathBuf, +} + +#[derive(Debug, Clone)] +struct RowSelector { + type_name: Option, + field: String, + expected: String, +} + +#[derive(Debug)] +enum EmbedRow { + Entity { + type_name: String, + data: Map, + root: Map, + }, + Passthrough(Map), +} + +pub(crate) fn 
resolve_embed_job(args: &EmbedArgs) -> Result { + let mode = if args.clean { + EmbedMode::Clean + } else if args.reembed_all { + EmbedMode::ReembedAll + } else { + EmbedMode::FillMissing + }; + let selectors = args + .selectors + .iter() + .map(|selector| RowSelector::parse(selector)) + .collect::>>()?; + let type_filter = args.types.iter().cloned().collect::>(); + + let (input, output, spec) = if let Some(seed_path) = &args.seed { + let manifest = load_seed_manifest(seed_path)?; + ( + manifest.raw_seed, + args.output.clone().unwrap_or(manifest.embedded_seed), + manifest.spec, + ) + } else { + let input = args + .input + .clone() + .ok_or_else(|| eyre!("--input is required when --seed is not provided"))?; + let output = args + .output + .clone() + .ok_or_else(|| eyre!("--output is required when --seed is not provided"))?; + let spec_path = args + .spec + .clone() + .ok_or_else(|| eyre!("--spec is required when --seed is not provided"))?; + let spec = load_embed_spec(&spec_path)?; + (input, output, spec) + }; + + if spec.model != DEFAULT_EMBED_MODEL { + bail!( + "only {} is supported for explicit seed embeddings right now", + DEFAULT_EMBED_MODEL + ); + } + + Ok(EmbedJob { + input, + output, + spec, + mode, + type_filter, + selectors, + }) +} + +pub(crate) async fn execute_embed(args: &EmbedArgs) -> Result { + let job = resolve_embed_job(args)?; + run_embed_job(&job).await +} + +pub(crate) async fn run_embed_job(job: &EmbedJob) -> Result { + if !job.input.exists() { + bail!("seed input does not exist: {}", job.input.display()); + } + + if let Some(parent) = job.output.parent() { + fs::create_dir_all(parent)?; + } + + let temp_output = temp_output_path(&job.output); + let mut reader = BufReader::new(File::open(&job.input)?); + let mut writer = BufWriter::new(File::create(&temp_output)?); + let client = match job.mode { + EmbedMode::Clean => None, + _ => Some(EmbeddingClient::from_env()?), + }; + + let mut line = String::new(); + let mut rows = 0usize; + let mut 
selected_rows = 0usize; + let mut embedded_rows = 0usize; + let mut cleaned_rows = 0usize; + + loop { + line.clear(); + let bytes = reader.read_line(&mut line)?; + if bytes == 0 { + break; + } + let raw = line.trim(); + if raw.is_empty() { + continue; + } + rows += 1; + let mut row = parse_row(raw, rows)?; + let selected = row_matches_selection(&row, &job.type_filter, &job.selectors); + if selected { + selected_rows += 1; + } + + if let Some(type_spec) = row + .type_name() + .and_then(|type_name| job.spec.types.get(type_name)) + { + match job.mode { + EmbedMode::Clean => { + if selected + && row + .data_mut() + .is_some_and(|data| data.remove(&type_spec.target).is_some()) + { + cleaned_rows += 1; + } + } + EmbedMode::ReembedAll => { + if selected { + embed_row( + &mut row, + type_spec, + job.spec.dimension, + client.as_ref().unwrap(), + ) + .await?; + embedded_rows += 1; + } + } + EmbedMode::FillMissing => { + let reembed_selected = !job.selectors.is_empty(); + if selected + && (reembed_selected + || embedding_missing( + row.data().and_then(|data| data.get(&type_spec.target)), + )) + { + embed_row( + &mut row, + type_spec, + job.spec.dimension, + client.as_ref().unwrap(), + ) + .await?; + embedded_rows += 1; + } + } + } + } + + writer.write_all(serde_json::to_string(&row.into_value())?.as_bytes())?; + writer.write_all(b"\n")?; + } + + writer.flush()?; + fs::rename(&temp_output, &job.output)?; + + Ok(EmbedOutput { + input: job.input.display().to_string(), + output: job.output.display().to_string(), + rows, + selected_rows, + embedded_rows, + cleaned_rows, + mode: job.mode.as_str(!job.selectors.is_empty()), + dimension: job.spec.dimension, + model: job.spec.model.clone(), + }) +} + +fn temp_output_path(output: &Path) -> PathBuf { + let mut temp = output.as_os_str().to_os_string(); + temp.push(".tmp"); + PathBuf::from(temp) +} + +fn default_embed_model() -> String { + DEFAULT_EMBED_MODEL.to_string() +} + +fn load_embed_spec(path: &Path) -> Result { + 
Ok(serde_json::from_str(&fs::read_to_string(path)?)?) +} + +struct ResolvedSeedManifest { + raw_seed: PathBuf, + embedded_seed: PathBuf, + spec: EmbedSpec, +} + +fn load_seed_manifest(path: &Path) -> Result { + let base_dir = path + .parent() + .map(Path::to_path_buf) + .unwrap_or(std::env::current_dir()?); + let manifest: SeedManifest = serde_yaml::from_str(&fs::read_to_string(path)?)?; + let raw_seed = manifest + .sources + .as_ref() + .map(|sources| sources.raw_seed.clone()) + .or_else(|| manifest.seed.as_ref().map(|seed| seed.data.clone())) + .ok_or_else(|| eyre!("seed manifest is missing sources.raw_seed"))?; + let embedded_seed = manifest + .artifacts + .as_ref() + .map(|artifacts| artifacts.embedded_seed.clone()) + .unwrap_or_else(|| PathBuf::from("./build/seed.embedded.jsonl")); + let spec = manifest + .embeddings + .ok_or_else(|| eyre!("seed manifest is missing embeddings"))?; + + Ok(ResolvedSeedManifest { + raw_seed: base_dir.join(raw_seed), + embedded_seed: base_dir.join(embedded_seed), + spec, + }) +} + +impl RowSelector { + fn parse(value: &str) -> Result { + let (lhs, expected) = value + .split_once('=') + .ok_or_else(|| eyre!("selector must be field=value or Type:field=value"))?; + let (type_name, field) = if let Some((type_name, field)) = lhs.split_once(':') { + ( + Some(type_name.trim().to_string()).filter(|value| !value.is_empty()), + field.trim().to_string(), + ) + } else { + (None, lhs.trim().to_string()) + }; + + if field.is_empty() { + bail!("selector field cannot be empty"); + } + + Ok(Self { + type_name, + field, + expected: expected.trim().to_string(), + }) + } + + fn matches(&self, type_name: &str, data: &Map) -> bool { + if self + .type_name + .as_deref() + .is_some_and(|expected| expected != type_name) + { + return false; + } + + data.get(&self.field) + .map(render_value) + .is_some_and(|value| value == self.expected) + } +} + +fn parse_row(raw: &str, line_number: usize) -> Result { + let mut root = serde_json::from_str::>(raw) + 
.map_err(|err| eyre!("line {} is not valid JSON: {}", line_number, err))?; + let Some(type_name) = root.get("type").and_then(Value::as_str).map(str::to_string) else { + return Ok(EmbedRow::Passthrough(root)); + }; + let data = root + .remove("data") + .and_then(|value| value.as_object().cloned()) + .ok_or_else(|| eyre!("line {} is missing object field 'data'", line_number))?; + + Ok(EmbedRow::Entity { + type_name, + data, + root, + }) +} + +impl EmbedRow { + fn into_value(self) -> Value { + match self { + Self::Entity { + type_name, + data, + mut root, + } => { + root.insert("type".to_string(), Value::String(type_name)); + root.insert("data".to_string(), Value::Object(data)); + Value::Object(root) + } + Self::Passthrough(root) => Value::Object(root), + } + } + + fn type_name(&self) -> Option<&str> { + match self { + Self::Entity { type_name, .. } => Some(type_name.as_str()), + Self::Passthrough(_) => None, + } + } + + fn data(&self) -> Option<&Map> { + match self { + Self::Entity { data, .. } => Some(data), + Self::Passthrough(_) => None, + } + } + + fn data_mut(&mut self) -> Option<&mut Map> { + match self { + Self::Entity { data, .. 
} => Some(data), + Self::Passthrough(_) => None, + } + } +} + +fn row_matches_selection( + row: &EmbedRow, + type_filter: &HashSet, + selectors: &[RowSelector], +) -> bool { + let Some(type_name) = row.type_name() else { + return false; + }; + let Some(data) = row.data() else { + return false; + }; + + let matches_type = type_filter.is_empty() || type_filter.contains(type_name); + if !matches_type { + return false; + } + if selectors.is_empty() { + return true; + } + selectors + .iter() + .any(|selector| selector.matches(type_name, data)) +} + +fn embedding_missing(value: Option<&Value>) -> bool { + match value { + None | Some(Value::Null) => true, + Some(Value::Array(values)) => values.is_empty(), + _ => false, + } +} + +fn render_value(value: &Value) -> String { + match value { + Value::Null => String::new(), + Value::String(value) => value.trim().to_string(), + Value::Bool(value) => { + if *value { + "true".to_string() + } else { + "false".to_string() + } + } + Value::Number(value) => value.to_string(), + Value::Array(values) => values + .iter() + .map(render_value) + .filter(|value| !value.is_empty()) + .collect::>() + .join(", "), + other => other.to_string(), + } +} + +fn build_embedding_text(type_name: &str, data: &Map, fields: &[String]) -> String { + let mut parts = vec![format!("type: {}", type_name)]; + for field in fields { + if let Some(value) = data.get(field) { + let rendered = render_value(value); + if !rendered.is_empty() { + parts.push(format!("{}: {}", field, rendered)); + } + } + } + parts.join("\n") +} + +async fn embed_row( + row: &mut EmbedRow, + spec: &EmbedTypeSpec, + dimension: usize, + client: &EmbeddingClient, +) -> Result<()> { + let type_name = row + .type_name() + .ok_or_else(|| eyre!("cannot embed non-entity seed rows"))? 
+ .to_string(); + let data = row + .data_mut() + .ok_or_else(|| eyre!("cannot embed non-entity seed rows"))?; + let text = build_embedding_text(&type_name, data, &spec.fields); + if text.trim().is_empty() { + return Ok(()); + } + let embedding = client.embed_document_text(&text, dimension).await?; + data.insert(spec.target.clone(), json!(embedding)); + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::{RowSelector, build_embedding_text, render_value}; + use serde_json::json; + + #[test] + fn selector_parses_type_and_field_forms() { + let typed = RowSelector::parse("Decision:slug=dec-1").unwrap(); + assert_eq!(typed.type_name.as_deref(), Some("Decision")); + assert_eq!(typed.field, "slug"); + assert_eq!(typed.expected, "dec-1"); + + let plain = RowSelector::parse("slug=dec-2").unwrap(); + assert_eq!(plain.type_name, None); + assert_eq!(plain.field, "slug"); + assert_eq!(plain.expected, "dec-2"); + } + + #[test] + fn render_value_handles_lists_and_scalars() { + assert_eq!(render_value(&json!(["a", "b"])), "a, b"); + assert_eq!(render_value(&json!(true)), "true"); + assert_eq!(render_value(&json!(3)), "3"); + } + + #[test] + fn build_embedding_text_prefixes_type_and_fields() { + let data = json!({ + "slug": "dec-1", + "intent": "Ship it" + }); + let object = data.as_object().unwrap(); + let text = build_embedding_text( + "Decision", + object, + &["slug".to_string(), "intent".to_string()], + ); + assert!(text.contains("type: Decision")); + assert!(text.contains("slug: dec-1")); + assert!(text.contains("intent: Ship it")); + } +} diff --git a/crates/omnigraph-cli/src/main.rs b/crates/omnigraph-cli/src/main.rs new file mode 100644 index 0000000..9a74a7f --- /dev/null +++ b/crates/omnigraph-cli/src/main.rs @@ -0,0 +1,2410 @@ +use std::fs; +use std::path::Path; +use std::path::PathBuf; + +use clap::{Arg, ArgAction, Args, CommandFactory, FromArgMatches, Parser, Subcommand, ValueEnum}; +use color_eyre::eyre::{Result, bail}; +use omnigraph::db::{Omnigraph, ReadTarget, 
RunId, SnapshotId}; +use omnigraph::loader::LoadMode; +use omnigraph_compiler::json_params_to_param_map; +use omnigraph_compiler::query::parser::parse_query; +use omnigraph_compiler::{JsonParamMode, ParamMap, SchemaMigrationPlan, SchemaMigrationStep}; +use omnigraph_server::api::{ + BranchCreateOutput, BranchCreateRequest, BranchDeleteOutput, BranchListOutput, + BranchMergeOutput, BranchMergeRequest, ChangeOutput, ChangeRequest, CommitListOutput, + CommitOutput, ErrorOutput, ExportRequest, IngestOutput, IngestRequest, ReadOutput, ReadRequest, + RunListOutput, RunOutput, SnapshotOutput, SnapshotTableOutput, commit_output, ingest_output, + read_output, run_output, snapshot_payload, +}; +use omnigraph_server::{ + AliasCommand, OmnigraphConfig, PolicyAction, PolicyDecision, PolicyEngine, PolicyRequest, + PolicyTestConfig, ReadOutputFormat, load_config, +}; +use reqwest::Method; +use reqwest::header::AUTHORIZATION; +use serde::Serialize; +use serde::de::DeserializeOwned; +use serde_json::Value; + +mod embed; +mod read_format; + +use embed::{EmbedArgs, EmbedOutput, execute_embed}; +use read_format::{ReadRenderOptions, render_read}; + +const DEFAULT_BEARER_TOKEN_ENV: &str = "OMNIGRAPH_BEARER_TOKEN"; + +#[derive(Debug, Parser)] +#[command(name = "omnigraph")] +#[command(about = "Omnigraph graph database CLI")] +#[command(version = env!("CARGO_PKG_VERSION"), disable_version_flag = true)] +struct Cli { + #[command(subcommand)] + command: Command, +} + +#[derive(Debug, Subcommand)] +enum Command { + /// Print the CLI version + Version, + /// Generate, clean, or refresh explicit seed embeddings + Embed(EmbedArgs), + /// Initialize a new repo from a schema + Init { + #[arg(long)] + schema: PathBuf, + /// Repo URI (local path or s3://) + uri: String, + }, + /// Load data into a repo + Load { + /// Repo URI + uri: Option, + #[arg(long)] + target: Option, + #[arg(long)] + config: Option, + #[arg(long)] + data: PathBuf, + #[arg(long)] + branch: Option, + #[arg(long, default_value = 
"overwrite")] + mode: CliLoadMode, + #[arg(long)] + json: bool, + }, + /// Ingest data into a reviewable named branch + Ingest { + /// Repo URI + uri: Option, + #[arg(long)] + target: Option, + #[arg(long)] + config: Option, + #[arg(long)] + data: PathBuf, + #[arg(long)] + branch: Option, + #[arg(long)] + from: Option, + #[arg(long, default_value = "merge")] + mode: CliLoadMode, + #[arg(long)] + json: bool, + }, + /// Branch operations + Branch { + #[command(subcommand)] + command: BranchCommand, + }, + /// Schema planning operations + Schema { + #[command(subcommand)] + command: SchemaCommand, + }, + /// Show repo snapshot + Snapshot { + /// Repo URI + uri: Option, + #[arg(long)] + target: Option, + #[arg(long)] + config: Option, + #[arg(long)] + branch: Option, + #[arg(long)] + json: bool, + }, + /// Export a full graph snapshot as JSONL + Export { + /// Repo URI + uri: Option, + #[arg(long)] + target: Option, + #[arg(long)] + config: Option, + #[arg(long)] + branch: Option, + #[arg(long)] + jsonl: bool, + #[arg(long = "type")] + type_names: Vec, + #[arg(long = "table")] + table_keys: Vec, + }, + /// Run operations + Run { + #[command(subcommand)] + command: RunCommand, + }, + /// Commit history operations + Commit { + #[command(subcommand)] + command: CommitCommand, + }, + /// Execute a read query against a branch or snapshot + Read { + /// Repo URI + uri: Option, + #[arg(long)] + target: Option, + #[arg(long)] + config: Option, + #[arg(long)] + alias: Option, + #[arg(long)] + query: Option, + #[arg(long)] + name: Option, + #[command(flatten)] + params: ParamsArgs, + #[arg(long, conflicts_with = "snapshot")] + branch: Option, + #[arg(long, conflicts_with = "branch")] + snapshot: Option, + #[arg(long, conflicts_with = "json")] + format: Option, + #[arg(long, conflicts_with = "format")] + json: bool, + #[arg()] + alias_args: Vec, + }, + /// Execute a graph change query against a branch + Change { + /// Repo URI + uri: Option, + #[arg(long)] + target: Option, + 
#[arg(long)] + config: Option, + #[arg(long)] + alias: Option, + #[arg(long)] + query: Option, + #[arg(long)] + name: Option, + #[command(flatten)] + params: ParamsArgs, + #[arg(long)] + branch: Option, + #[arg(long)] + json: bool, + #[arg()] + alias_args: Vec, + }, + /// Policy administration and diagnostics + Policy { + #[command(subcommand)] + command: PolicyCommand, + }, +} + +#[derive(Debug, Subcommand)] +enum BranchCommand { + /// Create a new branch + Create { + /// Repo URI + #[arg(long)] + uri: Option, + #[arg(long)] + target: Option, + #[arg(long)] + config: Option, + #[arg(long)] + from: Option, + name: String, + #[arg(long)] + json: bool, + }, + /// List branches + List { + /// Repo URI + #[arg(long)] + uri: Option, + #[arg(long)] + target: Option, + #[arg(long)] + config: Option, + #[arg(long)] + json: bool, + }, + /// Delete a branch + Delete { + /// Repo URI + #[arg(long)] + uri: Option, + #[arg(long)] + target: Option, + #[arg(long)] + config: Option, + name: String, + #[arg(long)] + json: bool, + }, + /// Merge a source branch into a target branch + Merge { + /// Repo URI + #[arg(long)] + uri: Option, + #[arg(long)] + target: Option, + #[arg(long)] + config: Option, + source: String, + #[arg(long)] + into: Option, + #[arg(long)] + json: bool, + }, +} + +#[derive(Debug, Subcommand)] +enum SchemaCommand { + /// Plan a schema migration against the accepted persisted schema + Plan { + /// Repo URI + uri: Option, + #[arg(long)] + target: Option, + #[arg(long)] + config: Option, + #[arg(long)] + schema: PathBuf, + #[arg(long)] + json: bool, + }, +} + +#[derive(Debug, Subcommand)] +enum RunCommand { + /// List transactional runs + List { + /// Repo URI + uri: Option, + #[arg(long)] + target: Option, + #[arg(long)] + config: Option, + #[arg(long)] + json: bool, + }, + /// Show a transactional run + Show { + /// Repo URI + #[arg(long)] + uri: Option, + #[arg(long)] + target: Option, + #[arg(long)] + config: Option, + run_id: String, + #[arg(long)] + json: 
bool, + }, + /// Publish a transactional run + Publish { + /// Repo URI + #[arg(long)] + uri: Option, + #[arg(long)] + target: Option, + #[arg(long)] + config: Option, + run_id: String, + #[arg(long)] + json: bool, + }, + /// Abort a transactional run + Abort { + /// Repo URI + #[arg(long)] + uri: Option, + #[arg(long)] + target: Option, + #[arg(long)] + config: Option, + run_id: String, + #[arg(long)] + json: bool, + }, +} + +#[derive(Debug, Subcommand)] +enum CommitCommand { + /// List graph commits + List { + /// Repo URI + uri: Option, + #[arg(long)] + target: Option, + #[arg(long)] + config: Option, + #[arg(long)] + branch: Option, + #[arg(long)] + json: bool, + }, + /// Show a graph commit + Show { + /// Repo URI + uri: Option, + #[arg(long)] + target: Option, + #[arg(long)] + config: Option, + commit_id: String, + #[arg(long)] + json: bool, + }, +} + +#[derive(Debug, Subcommand)] +enum PolicyCommand { + /// Validate policy YAML and compiled Cedar policy state + Validate { + #[arg(long)] + config: Option, + }, + /// Run declarative policy tests from policy.tests.yaml + Test { + #[arg(long)] + config: Option, + }, + /// Explain one policy decision locally + Explain { + #[arg(long)] + config: Option, + #[arg(long)] + actor: String, + #[arg(long)] + action: PolicyAction, + #[arg(long)] + branch: Option, + #[arg(long = "target-branch")] + target_branch: Option, + }, +} + +#[derive(Debug, Args, Clone)] +struct ParamsArgs { + #[arg(long, conflicts_with = "params_file")] + params: Option, + #[arg(long, conflicts_with = "params")] + params_file: Option, +} + +#[derive(Clone, Copy, Debug, Eq, PartialEq, Serialize, ValueEnum)] +#[serde(rename_all = "snake_case")] +enum CliLoadMode { + Overwrite, + Append, + Merge, +} + +impl From for LoadMode { + fn from(value: CliLoadMode) -> Self { + match value { + CliLoadMode::Overwrite => LoadMode::Overwrite, + CliLoadMode::Append => LoadMode::Append, + CliLoadMode::Merge => LoadMode::Merge, + } + } +} + +impl CliLoadMode { + fn 
as_str(self) -> &'static str { + match self { + CliLoadMode::Overwrite => "overwrite", + CliLoadMode::Append => "append", + CliLoadMode::Merge => "merge", + } + } +} + +#[derive(Debug, Serialize)] +struct LoadOutput<'a> { + uri: &'a str, + branch: &'a str, + mode: &'a str, + nodes_loaded: usize, + edges_loaded: usize, +} + +#[derive(Debug, Serialize)] +struct SchemaPlanOutput<'a> { + uri: &'a str, + supported: bool, + step_count: usize, + steps: &'a [SchemaMigrationStep], +} + +fn ensure_local_repo_parent(uri: &str) -> Result<()> { + if !uri.contains("://") { + fs::create_dir_all(uri)?; + } + Ok(()) +} + +fn print_json(value: &T) -> Result<()> { + println!("{}", serde_json::to_string_pretty(value)?); + Ok(()) +} + +fn is_remote_uri(uri: &str) -> bool { + uri.starts_with("http://") || uri.starts_with("https://") +} + +fn remote_url(base: &str, path: &str) -> String { + format!("{}{}", base.trim_end_matches('/'), path) +} + +fn remote_branch_url(base: &str, branch: &str) -> Result { + let mut url = reqwest::Url::parse(&format!("{}/", base.trim_end_matches('/')))?; + url.path_segments_mut() + .map_err(|_| color_eyre::eyre::eyre!("invalid remote base url"))? 
+ .extend(["branches", branch]); + Ok(url.to_string()) +} + +fn normalize_bearer_token(value: Option) -> Option { + value + .map(|value| value.trim().to_string()) + .filter(|value| !value.is_empty()) +} + +fn bearer_token_from_env(var_name: &str) -> Option { + normalize_bearer_token(std::env::var(var_name).ok()) +} + +fn parse_env_assignment(line: &str) -> Option<(String, String)> { + let line = line.trim(); + if line.is_empty() || line.starts_with('#') { + return None; + } + + let line = line.strip_prefix("export ").unwrap_or(line).trim(); + let (name, value) = line.split_once('=')?; + let name = name.trim(); + if name.is_empty() { + return None; + } + + let value = value.trim(); + let value = if value.len() >= 2 + && ((value.starts_with('"') && value.ends_with('"')) + || (value.starts_with('\'') && value.ends_with('\''))) + { + &value[1..value.len() - 1] + } else { + value + }; + + Some((name.to_string(), value.to_string())) +} + +fn bearer_token_from_env_file(path: &Path, var_name: &str) -> Result> { + if !path.exists() { + return Ok(None); + } + + for line in fs::read_to_string(path)?.lines() { + let Some((name, value)) = parse_env_assignment(line) else { + continue; + }; + if name == var_name { + return Ok(normalize_bearer_token(Some(value))); + } + } + + Ok(None) +} + +fn load_env_file_into_process(path: &Path) -> Result<()> { + if !path.exists() { + return Ok(()); + } + + for line in fs::read_to_string(path)?.lines() { + let Some((name, value)) = parse_env_assignment(line) else { + continue; + }; + if std::env::var_os(&name).is_none() { + unsafe { + std::env::set_var(name, value); + } + } + } + + Ok(()) +} + +fn load_cli_config(config_path: Option<&PathBuf>) -> Result { + let config = load_config(config_path)?; + if let Some(path) = config.resolve_auth_env_file() { + load_env_file_into_process(&path)?; + } + Ok(config) +} + +fn resolve_policy_engine(config: &OmnigraphConfig) -> Result { + let policy_file = config + .resolve_policy_file() + .ok_or_else(|| 
color_eyre::eyre::eyre!("policy.file must be set in omnigraph.yaml"))?;
+    PolicyEngine::load(&policy_file, &policy_repo_id(config))
+}
+
+/// Returns the policy tests file resolved by `config`, or an error when
+/// `resolve_policy_tests_file` yields nothing.
+fn resolve_policy_tests_path(config: &OmnigraphConfig) -> Result {
+    config.resolve_policy_tests_file().ok_or_else(|| {
+        color_eyre::eyre::eyre!(
+            "policy.tests.yaml requires policy.file to be set in omnigraph.yaml"
+        )
+    })
+}
+
+/// Chooses the repository identifier handed to the policy engine.
+///
+/// Order of preference: the configured project name, the URI of the server
+/// target, the URI of the CLI target, and finally the literal `"default"`.
+fn policy_repo_id(config: &OmnigraphConfig) -> String {
+    if let Some(name) = &config.project.name {
+        return name.clone();
+    }
+    config
+        .resolve_target_uri(None, None, config.server_target_name())
+        .or_else(|_| config.resolve_target_uri(None, None, config.cli_target_name()))
+        .unwrap_or_else(|_| "default".to_string())
+}
+
+/// Finds the bearer token to send to a remote target.
+///
+/// Candidate variable names are the target-scoped name from the configuration
+/// (if any) followed by `DEFAULT_BEARER_TOKEN_ENV` (unless it is already the
+/// scoped name). For each candidate the process environment is checked first,
+/// then the configured auth env file. The first non-empty token wins;
+/// `Ok(None)` means no token was found.
+fn resolve_remote_bearer_token(
+    config: &OmnigraphConfig,
+    explicit_uri: Option<&str>,
+    explicit_target: Option<&str>,
+) -> Result> {
+    let scoped_env =
+        config.target_bearer_token_env(explicit_uri, explicit_target, config.cli_target_name());
+    let mut env_names = Vec::new();
+    if let Some(name) = scoped_env {
+        env_names.push(name.to_string());
+    }
+    // Append the default variable only when it is not already in the list.
+    if env_names
+        .iter()
+        .all(|name| name != DEFAULT_BEARER_TOKEN_ENV)
+    {
+        env_names.push(DEFAULT_BEARER_TOKEN_ENV.to_string());
+    }
+
+    let env_file = config.resolve_auth_env_file();
+    for env_name in env_names {
+        if let Some(token) = bearer_token_from_env(&env_name) {
+            return Ok(Some(token));
+        }
+        if let Some(path) = env_file.as_ref() {
+            if let Some(token) = bearer_token_from_env_file(path, &env_name)?
{
+                return Ok(Some(token));
+            }
+        }
+    }
+
+    Ok(None)
+}
+
+/// Returns a `reqwest::Client` built with `Client::new()`.
+fn build_http_client() -> Result {
+    Ok(reqwest::Client::new())
+}
+
+/// Adds an `Authorization: Bearer <token>` header when `token` is present;
+/// otherwise returns the request builder unchanged.
+fn apply_bearer_token(
+    request: reqwest::RequestBuilder,
+    token: Option<&str>,
+) -> reqwest::RequestBuilder {
+    if let Some(token) = token {
+        request.header(AUTHORIZATION, format!("Bearer {}", token))
+    } else {
+        request
+    }
+}
+
+/// Sends `method url` with an optional JSON `body` and optional bearer token,
+/// then deserializes the response body from JSON.
+///
+/// On a non-success status the call fails: with the `error` field of the body
+/// when the body parses as the server's error payload, otherwise with
+/// "server returned <status>: <body>".
+async fn remote_json(
+    client: &reqwest::Client,
+    method: Method,
+    url: String,
+    body: Option,
+    bearer_token: Option<&str>,
+) -> Result {
+    let request = apply_bearer_token(client.request(method, url), bearer_token);
+    let request = if let Some(body) = body {
+        request.json(&body)
+    } else {
+        request
+    };
+    let response = request.send().await?;
+    // Capture the status before `text()` consumes the response.
+    let status = response.status();
+    let text = response.text().await?;
+    if !status.is_success() {
+        if let Ok(error) = serde_json::from_str::(&text) {
+            bail!(error.error);
+        }
+        bail!("server returned {}: {}", status, text);
+    }
+    Ok(serde_json::from_str(&text)?)
+}
+
+/// Same request and error handling as `remote_json`, but returns the raw
+/// response body text instead of deserializing it.
+async fn remote_text(
+    client: &reqwest::Client,
+    method: Method,
+    url: String,
+    body: Option,
+    bearer_token: Option<&str>,
+) -> Result {
+    let request = apply_bearer_token(client.request(method, url), bearer_token);
+    let request = if let Some(body) = body {
+        request.json(&body)
+    } else {
+        request
+    };
+    let response = request.send().await?;
+    // Capture the status before `text()` consumes the response.
+    let status = response.status();
+    let text = response.text().await?;
+    if !status.is_success() {
+        if let Ok(error) = serde_json::from_str::(&text) {
+            bail!(error.error);
+        }
+        bail!("server returned {}: {}", status, text);
+    }
+    Ok(text)
+}
+
+/// Resolves the repo URI from the explicit CLI URI, the explicit target name,
+/// and the configured CLI target, delegating to `resolve_target_uri`.
+fn resolve_uri(
+    config: &OmnigraphConfig,
+    cli_uri: Option,
+    cli_target: Option<&str>,
+) -> Result {
+    config.resolve_target_uri(cli_uri, cli_target, config.cli_target_name())
+}
+
+/// Like `resolve_uri`, but rejects remote (`http://` / `https://`) URIs with an
+/// error that names `operation`.
+fn resolve_local_uri(
+    config: &OmnigraphConfig,
+    cli_uri: Option,
+    cli_target: Option<&str>,
+    operation: &str,
+) -> Result {
+    let uri = resolve_uri(config, cli_uri, cli_target)?;
+    if is_remote_uri(&uri) {
+
bail!(
+            "{} is only supported against local repo URIs in this milestone",
+            operation
+        );
+    }
+    Ok(uri)
+}
+
+/// Picks the branch to operate on. Precedence: CLI flag, alias branch,
+/// `cli.branch` from the configuration, then `default_branch`.
+fn resolve_branch(
+    config: &OmnigraphConfig,
+    cli_branch: Option,
+    alias_branch: Option,
+    default_branch: &str,
+) -> String {
+    cli_branch
+        .or(alias_branch)
+        .or_else(|| config.cli.branch.clone())
+        .unwrap_or_else(|| default_branch.to_string())
+}
+
+/// Builds the read target for a query.
+///
+/// Passing both a CLI branch and a CLI snapshot is an error. Otherwise the
+/// branch candidate (CLI flag, alias branch, then `cli.branch`) and the
+/// snapshot are handed to `read_target_from_cli`.
+fn resolve_read_target(
+    config: &OmnigraphConfig,
+    cli_branch: Option,
+    cli_snapshot: Option,
+    alias_branch: Option,
+) -> Result {
+    if cli_branch.is_some() && cli_snapshot.is_some() {
+        bail!("read target may specify branch or snapshot, not both");
+    }
+    Ok(read_target_from_cli(
+        cli_branch
+            .or(alias_branch)
+            .or_else(|| config.cli.branch.clone()),
+        cli_snapshot,
+    ))
+}
+
+/// Reads the query file text. The explicit `--query` path takes precedence
+/// over the alias query path; the chosen path is resolved through
+/// `config.resolve_query_path`. Fails when neither path is supplied.
+fn resolve_query_source(
+    config: &OmnigraphConfig,
+    explicit_query: Option<&PathBuf>,
+    alias_query: Option<&str>,
+) -> Result {
+    let query_path = explicit_query
+        .map(PathBuf::from)
+        .or_else(|| alias_query.map(PathBuf::from))
+        .ok_or_else(|| {
+            color_eyre::eyre::eyre!("exactly one of --query or --alias must be provided")
+        })?;
+    Ok(fs::read_to_string(config.resolve_query_path(&query_path)?)?)
+}
+
+/// Interprets a positional alias argument: valid JSON is used as-is, anything
+/// else becomes a JSON string holding the raw text.
+fn parse_alias_value(value: &str) -> Value {
+    serde_json::from_str(value).unwrap_or_else(|_| Value::String(value.to_string()))
+}
+
+/// Merges positional alias arguments with explicitly supplied params.
+///
+/// Positional values are paired with `alias_arg_names` in order; supplying
+/// more values than names is an error. Keys from `explicit` are inserted
+/// afterwards, so they override positional values of the same name.
+/// `explicit` must be a JSON object when present. Returns `Ok(None)` when the
+/// merged map is empty.
+fn merged_params_json(
+    alias_name: Option<&str>,
+    alias_arg_names: &[String],
+    alias_arg_values: &[String],
+    explicit: Option,
+) -> Result> {
+    if alias_arg_values.len() > alias_arg_names.len() {
+        let alias = alias_name.unwrap_or("");
+        bail!(
+            "alias '{}' expects at most {} args but got {}",
+            alias,
+            alias_arg_names.len(),
+            alias_arg_values.len()
+        );
+    }
+
+    let mut merged = serde_json::Map::new();
+    // `zip` stops at the shorter list, so trailing names without a value are
+    // simply left unset.
+    for (arg_name, arg_value) in alias_arg_names.iter().zip(alias_arg_values.iter()) {
+        merged.insert(arg_name.clone(), parse_alias_value(arg_value));
+    }
+
+    match explicit {
+        Some(Value::Object(object)) => {
+            for (key, value) in object {
+                merged.insert(key, value);
+            }
+        }
+        Some(_) => bail!("params JSON must be an object"),
+        None => {}
+    }
+
+    if merged.is_empty() {
+        Ok(None)
+    } else {
+        Ok(Some(Value::Object(merged)))
+    }
+}
+
+/// Prints the one-line human-readable summary of a load.
+fn print_load_human(
+    uri: &str,
+    branch: &str,
+    mode: CliLoadMode,
+    nodes_loaded: usize,
+    edges_loaded: usize,
+) {
+    println!(
+        "loaded {} on branch {} with {}: {} node types, {} edge types",
+        uri,
+        branch,
+        mode.as_str(),
+        nodes_loaded,
+        edges_loaded
+    );
+}
+
+/// Prints the human-readable summary of an ingest: a headline, one line per
+/// table with its loaded row count, and the actor id when present.
+fn print_ingest_human(output: &IngestOutput) {
+    println!(
+        "ingested {} into branch {} from {} with {} ({})",
+        output.uri,
+        output.branch,
+        output.base_branch,
+        output.mode.as_str(),
+        if output.branch_created {
+            "branch created"
+        } else {
+            "branch exists"
+        }
+    );
+    for table in &output.tables {
+        println!("{} rows_loaded={}", table.table_key, table.rows_loaded);
+    }
+    if let Some(actor_id) = &output.actor_id {
+        println!("actor_id: {}", actor_id);
+    }
+}
+
+/// Prints a schema migration plan for humans: header, support flag, then
+/// either "no schema changes" or one bullet per step.
+fn print_schema_plan_human(uri: &str, plan: &SchemaMigrationPlan) {
+    println!("schema plan for {}", uri);
+    println!("supported: {}", if plan.supported { "yes" } else { "no" });
+    if plan.steps.is_empty() {
+        println!("no schema changes");
+        return;
+    }
+    for step in &plan.steps {
+        println!("- {}", render_schema_plan_step(step));
+    }
+}
+
+/// Renders one schema migration step as a single human-readable line.
+/// Every variant of `SchemaMigrationStep` matched here has its own wording.
+fn render_schema_plan_step(step: &SchemaMigrationStep) -> String {
+    match step {
+        SchemaMigrationStep::AddType { type_kind, name } => {
+            format!("add {} type '{}'", schema_type_kind_label(*type_kind), name)
+        }
+        SchemaMigrationStep::RenameType {
+            type_kind,
+            from,
+            to,
+        } => format!(
+            "rename {} type '{}' -> '{}'",
+            schema_type_kind_label(*type_kind),
+            from,
+            to
+        ),
+        SchemaMigrationStep::AddProperty {
+            type_kind,
+            type_name,
+            property_name,
+            property_type,
+        } => format!(
+            "add property '{}.{}' ({}) on {} '{}'",
+            type_name,
+            property_name,
+            render_prop_type(property_type),
+            schema_type_kind_label(*type_kind),
+            type_name
+        ),
+        SchemaMigrationStep::RenameProperty {
+            type_kind,
+            type_name,
+            from,
+            to,
+        } => format!(
+            "rename property '{}.{}' -> '{}.{}' on {} '{}'",
+            type_name,
+            from,
+            type_name,
+            to,
+            schema_type_kind_label(*type_kind),
+            type_name
+        ),
+        SchemaMigrationStep::AddConstraint {
+            type_kind,
+            type_name,
+            constraint,
+        } => format!(
+            "add constraint {} on {} '{}'",
+            render_constraint(constraint),
+            schema_type_kind_label(*type_kind),
+            type_name
+        ),
+        SchemaMigrationStep::UpdateTypeMetadata {
+            type_kind,
+            name,
+            annotations,
+        } => format!(
+            "update metadata on {} '{}' ({})",
+            schema_type_kind_label(*type_kind),
+            name,
+            render_annotations(annotations)
+        ),
+        SchemaMigrationStep::UpdatePropertyMetadata {
+            type_kind,
+            type_name,
+            property_name,
+            annotations,
+        } => format!(
+            "update metadata on property '{}.{}' of {} '{}' ({})",
+            type_name,
+            property_name,
+            schema_type_kind_label(*type_kind),
+            type_name,
+            render_annotations(annotations)
+        ),
+        SchemaMigrationStep::UnsupportedChange { entity, reason } => {
+            format!("unsupported change on {}: {}", entity, reason)
+        }
+    }
+}
+
+/// Maps a schema type kind to the lowercase noun used in plan output.
+fn schema_type_kind_label(kind: omnigraph_compiler::SchemaTypeKind) -> &'static str {
+    match kind {
+
omnigraph_compiler::SchemaTypeKind::Interface => "interface",
+        omnigraph_compiler::SchemaTypeKind::Node => "node",
+        omnigraph_compiler::SchemaTypeKind::Edge => "edge",
+    }
+}
+
+/// Renders a property type for display: `Enum(a|b)` or the scalar name,
+/// wrapped in `[...]` when it is a list, with a trailing `?` when nullable.
+fn render_prop_type(prop_type: &omnigraph_compiler::PropType) -> String {
+    let base = if let Some(values) = &prop_type.enum_values {
+        format!("Enum({})", values.join("|"))
+    } else {
+        prop_type.scalar.to_string()
+    };
+    let base = if prop_type.list {
+        format!("[{}]", base)
+    } else {
+        base
+    };
+    if prop_type.nullable {
+        format!("{}?", base)
+    } else {
+        base
+    }
+}
+
+/// Renders a schema constraint in annotation-like syntax (`@key(...)`,
+/// `@unique(...)`, `@index(...)`, `@range(...)`, `@check(...)`). Range bounds
+/// and the check pattern are printed with their `Debug` representation.
+fn render_constraint(constraint: &omnigraph_compiler::schema::ast::Constraint) -> String {
+    match constraint {
+        omnigraph_compiler::schema::ast::Constraint::Key(columns) => {
+            format!("@key({})", columns.join(", "))
+        }
+        omnigraph_compiler::schema::ast::Constraint::Unique(columns) => {
+            format!("@unique({})", columns.join(", "))
+        }
+        omnigraph_compiler::schema::ast::Constraint::Index(columns) => {
+            format!("@index({})", columns.join(", "))
+        }
+        omnigraph_compiler::schema::ast::Constraint::Range { property, min, max } => {
+            format!("@range({}, {:?}, {:?})", property, min, max)
+        }
+        omnigraph_compiler::schema::ast::Constraint::Check { property, pattern } => {
+            format!("@check({}, {:?})", property, pattern)
+        }
+    }
+}
+
+/// Renders annotations as a comma-separated list of `@name` or `@name(value)`.
+fn render_annotations(annotations: &[omnigraph_compiler::schema::ast::Annotation]) -> String {
+    annotations
+        .iter()
+        .map(|annotation| match &annotation.value {
+            Some(value) => format!("@{}({})", annotation.name, value),
+            None => format!("@{}", annotation.name),
+        })
+        .collect::>()
+        .join(", ")
+}
+
+/// Prints the one-line human-readable summary of an embed run.
+fn print_embed_human(output: &EmbedOutput) {
+    println!(
+        "embedded {} rows (selected {}, cleaned {}) from {} -> {} [{} {}d]",
+        output.embedded_rows,
+        output.selected_rows,
+        output.cleaned_rows,
+        output.input,
+        output.output,
+        output.mode,
+        output.dimension
+    );
+}
+
+/// Prints a snapshot for humans: branch, manifest version, then one line per
+/// table. A table without its own branch is shown as `main`.
+fn print_snapshot_human(branch: &str, manifest_version: u64, entries: &[SnapshotTableOutput]) {
+    println!("branch: {}", branch);
+    println!("manifest_version: {}", manifest_version);
+    for entry in entries {
+        println!(
+            "{} v{} branch={} rows={}",
+            entry.table_key,
+            entry.table_version,
+            entry.table_branch.as_deref().unwrap_or("main"),
+            entry.row_count
+        );
+    }
+}
+
+/// Renders a read result with `render_read`, using the table width and cell
+/// layout taken from `config`, and prints it to stdout.
+fn print_read_output(
+    output: &ReadOutput,
+    format: ReadOutputFormat,
+    config: &OmnigraphConfig,
+) -> Result<()> {
+    println!(
+        "{}",
+        render_read(
+            output,
+            format,
+            &ReadRenderOptions {
+                max_column_width: config.table_max_column_width(),
+                cell_layout: config.table_cell_layout(),
+            },
+        )?
+    );
+    Ok(())
+}
+
+/// Prints the human-readable summary of a change (mutation) and, when
+/// present, the actor id.
+fn print_change_human(output: &ChangeOutput) {
+    println!(
+        "changed {} via {}: {} nodes, {} edges",
+        output.branch, output.query_name, output.affected_nodes, output.affected_edges
+    );
+    if let Some(actor_id) = &output.actor_id {
+        println!("actor_id: {}", actor_id);
+    }
+}
+
+/// Prints one line per run; ` actor=<id>` is appended only when the run has an
+/// actor id.
+fn print_run_list_human(runs: &[RunOutput]) {
+    for run in runs {
+        println!(
+            "{} {} target={} branch={}{}",
+            run.run_id,
+            run.status,
+            run.target_branch,
+            run.run_branch,
+            run.actor_id
+                .as_deref()
+                .map(|actor| format!(" actor={}", actor))
+                .unwrap_or_default()
+        );
+    }
+}
+
+/// Prints all fields of a single run as `key: value` lines; optional fields
+/// are omitted when absent.
+fn print_run_human(run: &RunOutput) {
+    println!("run_id: {}", run.run_id);
+    println!("status: {}", run.status);
+    println!("target_branch: {}", run.target_branch);
+    println!("run_branch: {}", run.run_branch);
+    println!("base_snapshot_id: {}", run.base_snapshot_id);
+    println!("base_manifest_version: {}", run.base_manifest_version);
+    if let Some(actor_id) = &run.actor_id {
+        println!("actor_id: {}", actor_id);
+    }
+    if let Some(operation_hash) = &run.operation_hash {
+        println!("operation_hash: {}", operation_hash);
+    }
+    if let Some(snapshot_id) = &run.published_snapshot_id {
+        println!("published_snapshot_id: {}", snapshot_id);
+    }
+    println!("created_at: {}", run.created_at);
+    println!("updated_at: {}", run.updated_at);
+}
+
+/// Prints one line per commit; a commit without a manifest branch is shown as
+/// `main`, and ` actor=<id>` is appended only when an actor id is present.
+fn print_commit_list_human(commits: &[CommitOutput]) {
+    for commit in commits {
+        let branch =
commit.manifest_branch.as_deref().unwrap_or("main");
+        println!(
+            "{} branch={} version={}{}",
+            commit.graph_commit_id,
+            branch,
+            commit.manifest_version,
+            commit
+                .actor_id
+                .as_deref()
+                .map(|actor| format!(" actor={}", actor))
+                .unwrap_or_default()
+        );
+    }
+}
+
+/// Prints all fields of a single commit as `key: value` lines. A missing
+/// manifest branch is shown as `main`; optional ids are omitted when absent.
+fn print_commit_human(commit: &CommitOutput) {
+    println!("graph_commit_id: {}", commit.graph_commit_id);
+    println!(
+        "manifest_branch: {}",
+        commit.manifest_branch.as_deref().unwrap_or("main")
+    );
+    println!("manifest_version: {}", commit.manifest_version);
+    if let Some(parent_commit_id) = &commit.parent_commit_id {
+        println!("parent_commit_id: {}", parent_commit_id);
+    }
+    if let Some(merged_parent_commit_id) = &commit.merged_parent_commit_id {
+        println!("merged_parent_commit_id: {}", merged_parent_commit_id);
+    }
+    if let Some(actor_id) = &commit.actor_id {
+        println!("actor_id: {}", actor_id);
+    }
+    println!("created_at: {}", commit.created_at);
+}
+
+/// Prints a policy decision together with the request it was evaluated for:
+/// allow/deny, actor, action, optional branch fields, the matched rule id when
+/// present, and the decision message.
+fn print_policy_explain(decision: &PolicyDecision, request: &PolicyRequest) {
+    println!(
+        "decision: {}",
+        if decision.allowed { "allow" } else { "deny" }
+    );
+    println!("actor: {}", request.actor_id);
+    println!("action: {}", request.action);
+    if let Some(branch) = &request.branch {
+        println!("branch: {}", branch);
+    }
+    if let Some(target_branch) = &request.target_branch {
+        println!("target_branch: {}", target_branch);
+    }
+    if let Some(rule_id) = &decision.matched_rule_id {
+        println!("matched_rule: {}", rule_id);
+    }
+    println!("message: {}", decision.message);
+}
+
+/// Chooses the output format for a read. `json == true` always wins; otherwise
+/// the precedence is CLI format, alias format, configured CLI output format.
+fn resolve_read_format(
+    config: &OmnigraphConfig,
+    cli_format: Option,
+    json: bool,
+    alias_format: Option,
+) -> ReadOutputFormat {
+    if json {
+        ReadOutputFormat::Json
+    } else {
+        cli_format
+            .or(alias_format)
+            .unwrap_or_else(|| config.cli_output_format())
+    }
+}
+
+/// Looks up `alias_name` in the configuration and checks that the alias is
+/// declared for the `expected` command. `Ok(None)` means no alias was
+/// requested.
+fn resolve_alias<'a>(
+    config: &'a OmnigraphConfig,
+    alias_name: Option<&'a str>,
+    expected: AliasCommand,
+) -> Result> {
+    let Some(alias_name) = alias_name else {
+        return Ok(None);
+
};
+    let alias = config.alias(alias_name)?;
+    if alias.command != expected {
+        bail!(
+            "alias '{}' is a {:?} alias, not a {:?} alias",
+            alias_name,
+            alias.command,
+            expected
+        );
+    }
+    Ok(Some((alias_name, alias)))
+}
+
+/// Decides whether the positional `uri` argument is really the first alias
+/// argument.
+///
+/// The value is moved to the front of `alias_args` (and the URI becomes `None`)
+/// only when all of these hold: an alias is in use, a target is available
+/// (explicit or default), the value is not an http(s) URI, it contains no path
+/// separator, and no file or directory with that name exists. Otherwise the
+/// value is kept as the URI.
+fn normalize_alias_args(
+    uri: Option,
+    target: Option<&str>,
+    default_target_present: bool,
+    alias_name: Option<&str>,
+    mut alias_args: Vec,
+) -> (Option, Vec) {
+    let Some(candidate) = uri else {
+        return (None, alias_args);
+    };
+
+    if alias_name.is_some()
+        && (target.is_some() || default_target_present)
+        && !is_remote_uri(&candidate)
+        && !candidate.contains(std::path::MAIN_SEPARATOR)
+        && !Path::new(&candidate).exists()
+    {
+        alias_args.insert(0, candidate);
+        return (None, alias_args);
+    }
+
+    (Some(candidate), alias_args)
+}
+
+/// Writes a starter configuration file next to the repo when none exists yet.
+/// An existing file is never overwritten. `uri` is embedded as a single-quoted
+/// YAML scalar.
+fn scaffold_config_if_missing(uri: &str) -> Result<()> {
+    let path = inferred_config_path(uri)?;
+    if path.exists() {
+        return Ok(());
+    }
+
+    fs::write(
+        path,
+        format!(
+            "\
+project:
+  name: Omnigraph Project
+
+targets:
+  local:
+    uri: {}
+    # bearer_token_env: OMNIGRAPH_BEARER_TOKEN
+
+server:
+  target: local
+  bind: 127.0.0.1:8080
+
+cli:
+  target: local
+  branch: main
+  output_format: table
+  table_max_column_width: 80
+  table_cell_layout: truncate
+
+query:
+  roots:
+    - queries
+    - .
+
+aliases:
+  # owner:
+  #   command: read
+  #   query: context.gq
+  #   name: decision_owner
+  #   args: [slug]
+  #   target: local
+  #   branch: main
+  #   format: kv
+  #
+  # attach_trace:
+  #   command: change
+  #   query: mutations.gq
+  #   name: attach_trace
+  #   args: [decision_slug, trace_slug]
+  #   target: local
+  #   branch: main
+
+# auth:
+#   env_file: ./.env.omni
+#
+# policy:
+#   file: ./policy.yaml
+",
+            yaml_string(uri),
+        ),
+    )?;
+    Ok(())
+}
+
+/// Quotes `value` as a single-quoted YAML scalar, doubling embedded `'`.
+fn yaml_string(value: &str) -> String {
+    format!("'{}'", value.replace('\'', "''"))
+}
+
+/// Infers where the configuration file for `uri` lives.
+///
+/// URIs containing `://` map to the default config path. For filesystem paths
+/// the config file sits in the parent directory of the repo path; relative
+/// paths are resolved against the current directory.
+fn inferred_config_path(uri: &str) -> Result {
+    if uri.contains("://") {
+        return Ok(omnigraph_server::config::default_config_path());
+    }
+
+    let path = Path::new(uri);
+    let base = if path.is_absolute() {
+        // Query the current directory only when the path has no parent (the
+        // filesystem root), so an unavailable cwd cannot fail an absolute path
+        // that never needed it.
+        match path.parent() {
+            Some(parent) => parent.to_path_buf(),
+            None => std::env::current_dir()?,
+        }
+    } else {
+        std::env::current_dir()?.join(path.parent().unwrap_or_else(|| Path::new(".")))
+    };
+    Ok(base.join(omnigraph_server::config::DEFAULT_CONFIG_FILE))
+}
+
+/// Builds a read target: a snapshot when `snapshot` is given, otherwise the
+/// given branch, falling back to `main`.
+fn read_target_from_cli(branch: Option, snapshot: Option) -> ReadTarget {
+    if let Some(snapshot) = snapshot {
+        ReadTarget::snapshot(SnapshotId::new(snapshot))
+    } else {
+        ReadTarget::branch(branch.unwrap_or_else(|| "main".to_string()))
+    }
+}
+
+/// Loads query params from `--params` (inline JSON) or `--params-file`.
+/// Supplying both is an error; supplying neither yields `Ok(None)`.
+fn load_params_json(params: &ParamsArgs) -> Result> {
+    match (&params.params, &params.params_file) {
+        (Some(inline), None) => Ok(Some(serde_json::from_str(inline)?)),
+        (None, Some(path)) => Ok(Some(serde_json::from_str(&fs::read_to_string(path)?)?)),
+        (None, None) => Ok(None),
+        (Some(_), Some(_)) => bail!("only one of --params or --params-file may be provided"),
+    }
+}
+
+/// Parses `query_source` and selects one query from it, returning the query's
+/// name and declared params. With `requested_name` the query of that name is
+/// chosen.
+fn select_named_query(
+    query_source: &str,
+    requested_name: Option<&str>,
+) -> Result<(String, Vec)> {
+    let parsed = parse_query(query_source)?;
+    let query = if let Some(name) = requested_name {
+        parsed
+            .queries
+            .into_iter()
+            .find(|query| query.name == name)
+            .ok_or_else(|| color_eyre::eyre::eyre!("query '{}' not found", name))?
+    } else {
+        // Without an explicit name the file must hold exactly one query.
+        // An empty file gets its own message instead of being reported as
+        // "multiple queries".
+        match parsed.queries.len() {
+            0 => bail!("query file contains no queries"),
+            1 => parsed.queries.into_iter().next().unwrap(),
+            _ => bail!("query file contains multiple queries; pass --name"),
+        }
+    };
+
+    Ok((query.name, query.params))
+}
+
+/// Converts the JSON params into the parameter map for `query_params` using
+/// `JsonParamMode::Standard`; conversion errors are re-wrapped by message.
+fn query_params_from_json(
+    query_params: &[omnigraph_compiler::query::ast::Param],
+    params_json: Option<&Value>,
+) -> Result {
+    json_params_to_param_map(params_json, query_params, JsonParamMode::Standard)
+        .map_err(|err| color_eyre::eyre::eyre!(err.to_string()))
+}
+
+/// Runs a read query against the repo at `uri`: selects the query, converts
+/// the params, opens the repo, executes against `target`, and wraps the result
+/// with `read_output`.
+async fn execute_read(
+    uri: &str,
+    query_source: &str,
+    query_name: Option<&str>,
+    target: ReadTarget,
+    params_json: Option<&Value>,
+) -> Result {
+    let (selected_name, query_params) = select_named_query(query_source, query_name)?;
+    let params = query_params_from_json(&query_params, params_json)?;
+    let db = Omnigraph::open(uri).await?;
+    let result = db
+        .query(target.clone(), query_source, &selected_name, &params)
+        .await?;
+    Ok(read_output(selected_name, &target, result))
+}
+
+/// Runs a read query on a remote server by POSTing a `ReadRequest` to
+/// `<uri>/read`. Exactly one of `branch` / `snapshot` is set in the request,
+/// depending on the kind of `target`.
+async fn execute_read_remote(
+    client: &reqwest::Client,
+    uri: &str,
+    query_source: &str,
+    query_name: Option<&str>,
+    target: ReadTarget,
+    params_json: Option<&Value>,
+    bearer_token: Option<&str>,
+) -> Result {
+    let (branch, snapshot) = match &target {
+        ReadTarget::Branch(branch) => (Some(branch.clone()), None),
+        ReadTarget::Snapshot(snapshot) => (None, Some(snapshot.as_str().to_string())),
+    };
+    remote_json(
+        client,
+        Method::POST,
+        remote_url(uri, "/read"),
+        Some(serde_json::to_value(ReadRequest {
+            query_source: query_source.to_string(),
+            query_name: query_name.map(ToOwned::to_owned),
+            params: params_json.cloned(),
+            branch,
+            snapshot,
+        })?),
+        bearer_token,
+    )
+    .await
+}
+
+/// Runs a mutation query on `branch` of the repo at `uri` and reports the
+/// affected node and edge counts.
+async fn execute_change(
+    uri: &str,
+    query_source: &str,
+    query_name: Option<&str>,
+    branch: &str,
+    params_json: Option<&Value>,
+) -> Result {
+    let (selected_name, query_params) = select_named_query(query_source, query_name)?;
+    let params = query_params_from_json(&query_params, params_json)?;
+    let mut db =
Omnigraph::open(uri).await?; + let result = db + .mutate(branch, query_source, &selected_name, ¶ms) + .await?; + Ok(ChangeOutput { + branch: branch.to_string(), + query_name: selected_name, + affected_nodes: result.affected_nodes, + affected_edges: result.affected_edges, + actor_id: None, + }) +} + +async fn execute_change_remote( + client: &reqwest::Client, + uri: &str, + query_source: &str, + query_name: Option<&str>, + branch: &str, + params_json: Option<&Value>, + bearer_token: Option<&str>, +) -> Result { + remote_json( + client, + Method::POST, + remote_url(uri, "/change"), + Some(serde_json::to_value(ChangeRequest { + query_source: query_source.to_string(), + query_name: query_name.map(ToOwned::to_owned), + params: params_json.cloned(), + branch: Some(branch.to_string()), + })?), + bearer_token, + ) + .await +} + +async fn execute_export( + uri: &str, + branch: &str, + type_names: &[String], + table_keys: &[String], +) -> Result { + let db = Omnigraph::open(uri).await?; + Ok(db.export_jsonl(branch, type_names, table_keys).await?) +} + +async fn execute_export_remote( + client: &reqwest::Client, + uri: &str, + branch: &str, + type_names: &[String], + table_keys: &[String], + bearer_token: Option<&str>, +) -> Result { + remote_text( + client, + Method::POST, + remote_url(uri, "/export"), + Some(serde_json::to_value(ExportRequest { + branch: Some(branch.to_string()), + type_names: type_names.to_vec(), + table_keys: table_keys.to_vec(), + })?), + bearer_token, + ) + .await +} + +#[tokio::main] +async fn main() -> Result<()> { + color_eyre::install()?; + let cli = { + let matches = Cli::command() + .arg( + Arg::new("version") + .short('v') + .long("version") + .action(ArgAction::Version) + .help("Print version"), + ) + .get_matches(); + Cli::from_arg_matches(&matches)? 
+ }; + let http_client = build_http_client()?; + match cli.command { + Command::Version => { + println!("omnigraph {}", env!("CARGO_PKG_VERSION")); + } + Command::Embed(args) => { + let output = execute_embed(&args).await?; + if args.json { + print_json(&output)?; + } else { + print_embed_human(&output); + } + } + Command::Init { schema, uri } => { + let schema_source = fs::read_to_string(&schema)?; + ensure_local_repo_parent(&uri)?; + Omnigraph::init(&uri, &schema_source).await?; + scaffold_config_if_missing(&uri)?; + println!("initialized {}", uri); + } + Command::Load { + uri, + target, + config, + data, + branch, + mode, + json, + } => { + let config = load_cli_config(config.as_ref())?; + let uri = resolve_local_uri(&config, uri, target.as_deref(), "load")?; + let branch = resolve_branch(&config, branch, None, "main"); + let mut db = Omnigraph::open(&uri).await?; + let result = db + .load_file(&branch, &data.to_string_lossy(), mode.into()) + .await?; + let payload = LoadOutput { + uri: &uri, + branch: &branch, + mode: mode.as_str(), + nodes_loaded: result.nodes_loaded.len(), + edges_loaded: result.edges_loaded.len(), + }; + if json { + print_json(&payload)?; + } else { + print_load_human( + &uri, + &branch, + mode, + payload.nodes_loaded, + payload.edges_loaded, + ); + } + } + Command::Ingest { + uri, + target, + config, + data, + branch, + from, + mode, + json, + } => { + let config = load_cli_config(config.as_ref())?; + let bearer_token = + resolve_remote_bearer_token(&config, uri.as_deref(), target.as_deref())?; + let uri = resolve_uri(&config, uri, target.as_deref())?; + let branch = resolve_branch(&config, branch, None, "main"); + let from = resolve_branch(&config, from, None, "main"); + let payload = if is_remote_uri(&uri) { + let data = fs::read_to_string(&data)?; + remote_json::( + &http_client, + Method::POST, + remote_url(&uri, "/ingest"), + Some(serde_json::to_value(IngestRequest { + branch: Some(branch.clone()), + from: Some(from.clone()), + mode: 
Some(mode.into()), + data, + })?), + bearer_token.as_deref(), + ) + .await? + } else { + let mut db = Omnigraph::open(&uri).await?; + let result = db + .ingest_file(&branch, Some(&from), &data.to_string_lossy(), mode.into()) + .await?; + ingest_output(&uri, &result, None) + }; + if json { + print_json(&payload)?; + } else { + print_ingest_human(&payload); + } + } + Command::Branch { command } => match command { + BranchCommand::Create { + uri, + target, + config, + from, + name, + json, + } => { + let config = load_cli_config(config.as_ref())?; + let bearer_token = + resolve_remote_bearer_token(&config, uri.as_deref(), target.as_deref())?; + let uri = resolve_uri(&config, uri, target.as_deref())?; + let from = resolve_branch(&config, from, None, "main"); + let payload = if is_remote_uri(&uri) { + remote_json::( + &http_client, + Method::POST, + remote_url(&uri, "/branches"), + Some(serde_json::to_value(BranchCreateRequest { + from: Some(from.clone()), + name: name.clone(), + })?), + bearer_token.as_deref(), + ) + .await? + } else { + let mut db = Omnigraph::open(&uri).await?; + db.branch_create_from(ReadTarget::branch(&from), &name) + .await?; + BranchCreateOutput { + uri: uri.clone(), + from: from.clone(), + name: name.clone(), + actor_id: None, + } + }; + if json { + print_json(&payload)?; + } else { + println!("created branch {} from {}", payload.name, payload.from); + } + } + BranchCommand::List { + uri, + target, + config, + json, + } => { + let config = load_cli_config(config.as_ref())?; + let bearer_token = + resolve_remote_bearer_token(&config, uri.as_deref(), target.as_deref())?; + let uri = resolve_uri(&config, uri, target.as_deref())?; + let payload = if is_remote_uri(&uri) { + remote_json::( + &http_client, + Method::GET, + remote_url(&uri, "/branches"), + None, + bearer_token.as_deref(), + ) + .await? 
+ } else { + let db = Omnigraph::open(&uri).await?; + let mut branches = db.branch_list().await?; + branches.sort(); + BranchListOutput { branches } + }; + if json { + print_json(&payload)?; + } else { + for branch in payload.branches { + println!("{}", branch); + } + } + } + BranchCommand::Delete { + uri, + target, + config, + name, + json, + } => { + let config = load_cli_config(config.as_ref())?; + let bearer_token = + resolve_remote_bearer_token(&config, uri.as_deref(), target.as_deref())?; + let uri = resolve_uri(&config, uri, target.as_deref())?; + let payload = if is_remote_uri(&uri) { + remote_json::( + &http_client, + Method::DELETE, + remote_branch_url(&uri, &name)?, + None, + bearer_token.as_deref(), + ) + .await? + } else { + let mut db = Omnigraph::open(&uri).await?; + db.branch_delete(&name).await?; + BranchDeleteOutput { + uri: uri.clone(), + name: name.clone(), + actor_id: None, + } + }; + if json { + print_json(&payload)?; + } else { + println!("deleted branch {}", payload.name); + } + } + BranchCommand::Merge { + uri, + target, + config, + source, + into, + json, + } => { + let config = load_cli_config(config.as_ref())?; + let bearer_token = + resolve_remote_bearer_token(&config, uri.as_deref(), target.as_deref())?; + let uri = resolve_uri(&config, uri, target.as_deref())?; + let into = resolve_branch(&config, into, None, "main"); + let payload = if is_remote_uri(&uri) { + remote_json::( + &http_client, + Method::POST, + remote_url(&uri, "/branches/merge"), + Some(serde_json::to_value(BranchMergeRequest { + source: source.clone(), + target: Some(into.clone()), + })?), + bearer_token.as_deref(), + ) + .await? 
+ } else { + let mut db = Omnigraph::open(&uri).await?; + let outcome = db.branch_merge(&source, &into).await?; + BranchMergeOutput { + source: source.clone(), + target: into.clone(), + outcome: outcome.into(), + actor_id: None, + } + }; + if json { + print_json(&payload)?; + } else { + println!( + "merged {} into {}: {}", + payload.source, + payload.target, + payload.outcome.as_str() + ); + } + } + }, + Command::Commit { command } => match command { + CommitCommand::List { + uri, + target, + config, + branch, + json, + } => { + let config = load_cli_config(config.as_ref())?; + let bearer_token = + resolve_remote_bearer_token(&config, uri.as_deref(), target.as_deref())?; + let uri = resolve_uri(&config, uri, target.as_deref())?; + let commits = if is_remote_uri(&uri) { + remote_json::( + &http_client, + Method::GET, + if let Some(branch) = branch.as_deref() { + format!("{}?branch={}", remote_url(&uri, "/commits"), branch) + } else { + remote_url(&uri, "/commits") + }, + None, + bearer_token.as_deref(), + ) + .await? + .commits + } else { + let db = Omnigraph::open(&uri).await?; + db.list_commits(branch.as_deref()) + .await? + .iter() + .map(commit_output) + .collect::>() + }; + if json { + print_json(&CommitListOutput { commits })?; + } else { + print_commit_list_human(&commits); + } + } + CommitCommand::Show { + uri, + target, + config, + commit_id, + json, + } => { + let config = load_cli_config(config.as_ref())?; + let bearer_token = + resolve_remote_bearer_token(&config, uri.as_deref(), target.as_deref())?; + let uri = resolve_uri(&config, uri, target.as_deref())?; + let commit = if is_remote_uri(&uri) { + remote_json::( + &http_client, + Method::GET, + remote_url(&uri, &format!("/commits/{}", commit_id)), + None, + bearer_token.as_deref(), + ) + .await? + } else { + let db = Omnigraph::open(&uri).await?; + commit_output(&db.get_commit(&commit_id).await?) 
+ }; + if json { + print_json(&commit)?; + } else { + print_commit_human(&commit); + } + } + }, + Command::Schema { command } => match command { + SchemaCommand::Plan { + uri, + target, + config, + schema, + json, + } => { + let config = load_cli_config(config.as_ref())?; + let uri = resolve_local_uri(&config, uri, target.as_deref(), "schema plan")?; + let schema_source = fs::read_to_string(&schema)?; + let db = Omnigraph::open(&uri).await?; + let plan = db.plan_schema(&schema_source).await?; + let output = SchemaPlanOutput { + uri: &uri, + supported: plan.supported, + step_count: plan.steps.len(), + steps: &plan.steps, + }; + if json { + print_json(&output)?; + } else { + print_schema_plan_human(&uri, &plan); + } + } + }, + Command::Snapshot { + uri, + target, + config, + branch, + json, + } => { + let config = load_cli_config(config.as_ref())?; + let bearer_token = + resolve_remote_bearer_token(&config, uri.as_deref(), target.as_deref())?; + let uri = resolve_uri(&config, uri, target.as_deref())?; + let branch = resolve_branch(&config, branch, None, "main"); + let payload = if is_remote_uri(&uri) { + remote_json::( + &http_client, + Method::GET, + format!("{}?branch={}", remote_url(&uri, "/snapshot"), branch), + None, + bearer_token.as_deref(), + ) + .await? 
+ } else { + let db = Omnigraph::open(&uri).await?; + let snapshot = db.snapshot_of(ReadTarget::branch(branch.as_str())).await?; + snapshot_payload(&branch, &snapshot) + }; + + if json { + print_json(&payload)?; + } else { + print_snapshot_human(&payload.branch, payload.manifest_version, &payload.tables); + } + } + Command::Export { + uri, + target, + config, + branch, + jsonl: _, + type_names, + table_keys, + } => { + let config = load_cli_config(config.as_ref())?; + let bearer_token = + resolve_remote_bearer_token(&config, uri.as_deref(), target.as_deref())?; + let uri = resolve_uri(&config, uri, target.as_deref())?; + let branch = resolve_branch(&config, branch, None, "main"); + let output = if is_remote_uri(&uri) { + execute_export_remote( + &http_client, + &uri, + &branch, + &type_names, + &table_keys, + bearer_token.as_deref(), + ) + .await? + } else { + execute_export(&uri, &branch, &type_names, &table_keys).await? + }; + print!("{output}"); + } + Command::Run { command } => match command { + RunCommand::List { + uri, + target, + config, + json, + } => { + let config = load_cli_config(config.as_ref())?; + let bearer_token = + resolve_remote_bearer_token(&config, uri.as_deref(), target.as_deref())?; + let uri = resolve_uri(&config, uri, target.as_deref())?; + let runs = if is_remote_uri(&uri) { + remote_json::( + &http_client, + Method::GET, + remote_url(&uri, "/runs"), + None, + bearer_token.as_deref(), + ) + .await? + .runs + } else { + let db = Omnigraph::open(&uri).await?; + db.list_runs() + .await? 
+ .iter() + .map(run_output) + .collect::>() + }; + if json { + print_json(&RunListOutput { runs })?; + } else { + print_run_list_human(&runs); + } + } + RunCommand::Show { + uri, + target, + config, + run_id, + json, + } => { + let config = load_cli_config(config.as_ref())?; + let bearer_token = + resolve_remote_bearer_token(&config, uri.as_deref(), target.as_deref())?; + let uri = resolve_uri(&config, uri, target.as_deref())?; + let run = if is_remote_uri(&uri) { + remote_json::( + &http_client, + Method::GET, + remote_url(&uri, &format!("/runs/{}", run_id)), + None, + bearer_token.as_deref(), + ) + .await? + } else { + let db = Omnigraph::open(&uri).await?; + run_output(&db.get_run(&RunId::new(run_id)).await?) + }; + if json { + print_json(&run)?; + } else { + print_run_human(&run); + } + } + RunCommand::Publish { + uri, + target, + config, + run_id, + json, + } => { + let config = load_cli_config(config.as_ref())?; + let bearer_token = + resolve_remote_bearer_token(&config, uri.as_deref(), target.as_deref())?; + let uri = resolve_uri(&config, uri, target.as_deref())?; + let run = if is_remote_uri(&uri) { + remote_json::( + &http_client, + Method::POST, + remote_url(&uri, &format!("/runs/{}/publish", run_id)), + Some(serde_json::json!({})), + bearer_token.as_deref(), + ) + .await? + } else { + let mut db = Omnigraph::open(&uri).await?; + db.publish_run(&RunId::new(run_id.clone())).await?; + run_output(&db.get_run(&RunId::new(run_id)).await?) 
+ }; + if json { + print_json(&run)?; + } else { + print_run_human(&run); + } + } + RunCommand::Abort { + uri, + target, + config, + run_id, + json, + } => { + let config = load_cli_config(config.as_ref())?; + let bearer_token = + resolve_remote_bearer_token(&config, uri.as_deref(), target.as_deref())?; + let uri = resolve_uri(&config, uri, target.as_deref())?; + let run = if is_remote_uri(&uri) { + remote_json::( + &http_client, + Method::POST, + remote_url(&uri, &format!("/runs/{}/abort", run_id)), + Some(serde_json::json!({})), + bearer_token.as_deref(), + ) + .await? + } else { + let mut db = Omnigraph::open(&uri).await?; + run_output(&db.abort_run(&RunId::new(run_id)).await?) + }; + if json { + print_json(&run)?; + } else { + print_run_human(&run); + } + } + }, + Command::Read { + uri, + target, + config, + alias, + query, + name, + params, + branch, + snapshot, + format, + json, + alias_args, + } => { + if alias.is_some() == query.is_some() { + bail!("exactly one of --alias or --query must be provided"); + } + + let config = load_cli_config(config.as_ref())?; + let alias = resolve_alias(&config, alias.as_deref(), AliasCommand::Read)?; + let alias_name = alias.as_ref().map(|(name, _)| *name); + let alias_config = alias.as_ref().map(|(_, alias)| *alias); + let (uri, alias_args) = normalize_alias_args( + uri, + target.as_deref(), + config.cli_target_name().is_some(), + alias_name, + alias_args, + ); + let target_name = target + .as_deref() + .or_else(|| alias_config.and_then(|alias| alias.target.as_deref())); + let bearer_token = resolve_remote_bearer_token(&config, uri.as_deref(), target_name)?; + let uri = resolve_uri(&config, uri, target_name)?; + let query_source = resolve_query_source( + &config, + query.as_ref(), + alias_config.map(|a| a.query.as_str()), + )?; + let params_json = merged_params_json( + alias_name, + alias_config + .map(|alias| alias.args.as_slice()) + .unwrap_or(&[]), + &alias_args, + load_params_json(¶ms)?, + )?; + let target = 
resolve_read_target( + &config, + branch, + snapshot, + alias_config.and_then(|alias| alias.branch.clone()), + )?; + let query_name = name.or_else(|| alias_config.and_then(|alias| alias.name.clone())); + let output = if is_remote_uri(&uri) { + execute_read_remote( + &http_client, + &uri, + &query_source, + query_name.as_deref(), + target, + params_json.as_ref(), + bearer_token.as_deref(), + ) + .await? + } else { + execute_read( + &uri, + &query_source, + query_name.as_deref(), + target, + params_json.as_ref(), + ) + .await? + }; + let format = resolve_read_format( + &config, + format, + json, + alias_config.and_then(|alias| alias.format), + ); + print_read_output(&output, format, &config)?; + } + Command::Change { + uri, + target, + config, + alias, + query, + name, + params, + branch, + json, + alias_args, + } => { + if alias.is_some() == query.is_some() { + bail!("exactly one of --alias or --query must be provided"); + } + + let config = load_cli_config(config.as_ref())?; + let alias = resolve_alias(&config, alias.as_deref(), AliasCommand::Change)?; + let alias_name = alias.as_ref().map(|(name, _)| *name); + let alias_config = alias.as_ref().map(|(_, alias)| *alias); + let (uri, alias_args) = normalize_alias_args( + uri, + target.as_deref(), + config.cli_target_name().is_some(), + alias_name, + alias_args, + ); + let target_name = target + .as_deref() + .or_else(|| alias_config.and_then(|alias| alias.target.as_deref())); + let bearer_token = resolve_remote_bearer_token(&config, uri.as_deref(), target_name)?; + let uri = resolve_uri(&config, uri, target_name)?; + let query_source = resolve_query_source( + &config, + query.as_ref(), + alias_config.map(|a| a.query.as_str()), + )?; + let params_json = merged_params_json( + alias_name, + alias_config + .map(|alias| alias.args.as_slice()) + .unwrap_or(&[]), + &alias_args, + load_params_json(¶ms)?, + )?; + let branch = resolve_branch( + &config, + branch, + alias_config.and_then(|alias| alias.branch.clone()), + "main", 
+ ); + let query_name = name.or_else(|| alias_config.and_then(|alias| alias.name.clone())); + let output = if is_remote_uri(&uri) { + execute_change_remote( + &http_client, + &uri, + &query_source, + query_name.as_deref(), + &branch, + params_json.as_ref(), + bearer_token.as_deref(), + ) + .await? + } else { + execute_change( + &uri, + &query_source, + query_name.as_deref(), + &branch, + params_json.as_ref(), + ) + .await? + }; + if json { + print_json(&output)?; + } else { + print_change_human(&output); + } + } + Command::Policy { command } => match command { + PolicyCommand::Validate { config } => { + let config = load_cli_config(config.as_ref())?; + let engine = resolve_policy_engine(&config)?; + let policy_file = config + .resolve_policy_file() + .expect("policy file should exist after resolve_policy_engine"); + println!( + "policy valid: {} [{} actors]", + policy_file.display(), + engine.known_actor_count() + ); + } + PolicyCommand::Test { config } => { + let config = load_cli_config(config.as_ref())?; + let engine = resolve_policy_engine(&config)?; + let tests_path = resolve_policy_tests_path(&config)?; + let tests = PolicyTestConfig::load(&tests_path)?; + engine.run_tests(&tests)?; + println!("policy tests passed: {} cases", tests.cases.len()); + } + PolicyCommand::Explain { + config, + actor, + action, + branch, + target_branch, + } => { + let config = load_cli_config(config.as_ref())?; + let engine = resolve_policy_engine(&config)?; + let request = PolicyRequest { + actor_id: actor, + action, + branch, + target_branch, + }; + let decision = engine.authorize(&request)?; + print_policy_explain(&decision, &request); + } + }, + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use std::fs; + + use super::{ + DEFAULT_BEARER_TOKEN_ENV, apply_bearer_token, bearer_token_from_env_file, load_cli_config, + load_env_file_into_process, normalize_bearer_token, parse_env_assignment, + resolve_remote_bearer_token, + }; + use omnigraph_server::load_config; + use 
reqwest::header::AUTHORIZATION; + use tempfile::tempdir; + + #[test] + fn apply_bearer_token_adds_header_when_configured() { + let client = reqwest::Client::new(); + let request = apply_bearer_token(client.get("http://example.com"), Some("demo-token")) + .build() + .unwrap(); + assert_eq!( + request + .headers() + .get(AUTHORIZATION) + .and_then(|value| value.to_str().ok()), + Some("Bearer demo-token") + ); + } + + #[test] + fn apply_bearer_token_leaves_request_unchanged_when_not_configured() { + let client = reqwest::Client::new(); + let request = apply_bearer_token(client.get("http://example.com"), None) + .build() + .unwrap(); + assert!(request.headers().get(AUTHORIZATION).is_none()); + } + + #[test] + fn normalize_bearer_token_trims_and_filters_blank_values() { + assert_eq!(normalize_bearer_token(None), None); + assert_eq!(normalize_bearer_token(Some(" ".to_string())), None); + assert_eq!( + normalize_bearer_token(Some(" demo-token ".to_string())).as_deref(), + Some("demo-token") + ); + } + + #[test] + fn parse_env_assignment_supports_plain_and_exported_values() { + assert_eq!( + parse_env_assignment("DEMO_TOKEN=demo-token"), + Some(("DEMO_TOKEN".to_string(), "demo-token".to_string())) + ); + assert_eq!( + parse_env_assignment("export DEMO_TOKEN=\"quoted-token\""), + Some(("DEMO_TOKEN".to_string(), "quoted-token".to_string())) + ); + assert_eq!(parse_env_assignment("# comment"), None); + assert_eq!(parse_env_assignment(" "), None); + } + + #[test] + fn bearer_token_from_env_file_reads_named_value() { + let temp = tempdir().unwrap(); + let env_file = temp.path().join(".env.omni"); + fs::write( + &env_file, + "FIRST=ignore\nexport DEMO_TOKEN=\" demo-token \"\n", + ) + .unwrap(); + + assert_eq!( + bearer_token_from_env_file(&env_file, "DEMO_TOKEN") + .unwrap() + .as_deref(), + Some("demo-token") + ); + assert_eq!( + bearer_token_from_env_file(&env_file, "MISSING").unwrap(), + None + ); + } + + #[test] + fn 
load_env_file_into_process_sets_missing_values_without_overriding_existing_ones() { + let temp = tempdir().unwrap(); + let env_file = temp.path().join(".env.omni"); + fs::write( + &env_file, + "AUTOLOAD_ONLY=from-file\nAUTOLOAD_PRESET=from-file\n", + ) + .unwrap(); + + let missing_key = "AUTOLOAD_ONLY"; + let preset_key = "AUTOLOAD_PRESET"; + let previous_missing = std::env::var_os(missing_key); + let previous_preset = std::env::var_os(preset_key); + + unsafe { + std::env::remove_var(missing_key); + std::env::set_var(preset_key, "from-env"); + } + + load_env_file_into_process(&env_file).unwrap(); + + assert_eq!(std::env::var(missing_key).unwrap(), "from-file"); + assert_eq!(std::env::var(preset_key).unwrap(), "from-env"); + + unsafe { + if let Some(value) = previous_missing { + std::env::set_var(missing_key, value); + } else { + std::env::remove_var(missing_key); + } + + if let Some(value) = previous_preset { + std::env::set_var(preset_key, value); + } else { + std::env::remove_var(preset_key); + } + } + } + + #[test] + fn resolve_remote_bearer_token_uses_scoped_env_file_with_global_fallback() { + let temp = tempdir().unwrap(); + fs::write( + temp.path().join("omnigraph.yaml"), + r#" +targets: + demo: + uri: https://example.com + bearer_token_env: DEMO_TOKEN +auth: + env_file: .env.omni +cli: + target: demo +"#, + ) + .unwrap(); + fs::write( + temp.path().join(".env.omni"), + "DEMO_TOKEN=scoped-token\nOMNIGRAPH_BEARER_TOKEN=global-token\n", + ) + .unwrap(); + + let previous = std::env::var_os(DEFAULT_BEARER_TOKEN_ENV); + unsafe { + std::env::remove_var(DEFAULT_BEARER_TOKEN_ENV); + } + + let config_path = temp.path().join("omnigraph.yaml"); + let config = load_config(Some(&config_path)).unwrap(); + + assert_eq!( + resolve_remote_bearer_token(&config, None, Some("demo")) + .unwrap() + .as_deref(), + Some("scoped-token") + ); + assert_eq!( + resolve_remote_bearer_token(&config, Some("https://override.example.com"), None) + .unwrap() + .as_deref(), + 
Some("global-token") + ); + + unsafe { + if let Some(value) = previous { + std::env::set_var(DEFAULT_BEARER_TOKEN_ENV, value); + } else { + std::env::remove_var(DEFAULT_BEARER_TOKEN_ENV); + } + } + } + + #[test] + fn load_cli_config_autoloads_env_file_into_process() { + let temp = tempdir().unwrap(); + fs::write( + temp.path().join("omnigraph.yaml"), + r#" +auth: + env_file: .env.omni +targets: + demo: + uri: s3://bucket/prefix +"#, + ) + .unwrap(); + fs::write( + temp.path().join(".env.omni"), + "AUTOLOAD_FROM_CONFIG=loaded\n", + ) + .unwrap(); + + let key = "AUTOLOAD_FROM_CONFIG"; + let previous = std::env::var_os(key); + unsafe { + std::env::remove_var(key); + } + + let config_path = temp.path().join("omnigraph.yaml"); + let config = load_cli_config(Some(&config_path)).unwrap(); + + assert_eq!( + config.resolve_target_uri(None, Some("demo"), None).unwrap(), + "s3://bucket/prefix" + ); + assert_eq!(std::env::var(key).unwrap(), "loaded"); + + unsafe { + if let Some(value) = previous { + std::env::set_var(key, value); + } else { + std::env::remove_var(key); + } + } + } +} diff --git a/crates/omnigraph-cli/src/read_format.rs b/crates/omnigraph-cli/src/read_format.rs new file mode 100644 index 0000000..b205b19 --- /dev/null +++ b/crates/omnigraph-cli/src/read_format.rs @@ -0,0 +1,356 @@ +use color_eyre::eyre::Result; +use omnigraph_server::ReadOutputFormat; +use omnigraph_server::api::ReadOutput; +use omnigraph_server::config::TableCellLayout; +use serde_json::{Map, Value}; + +pub struct ReadRenderOptions { + pub max_column_width: usize, + pub cell_layout: TableCellLayout, +} + +pub fn render_read( + output: &ReadOutput, + format: ReadOutputFormat, + options: &ReadRenderOptions, +) -> Result { + match format { + ReadOutputFormat::Json => Ok(serde_json::to_string_pretty(output)?), + ReadOutputFormat::Jsonl => render_jsonl(output), + ReadOutputFormat::Csv => render_csv(output), + ReadOutputFormat::Kv => Ok(render_kv(output)), + ReadOutputFormat::Table => 
Ok(render_table(output, options)), + } +} + +fn render_jsonl(output: &ReadOutput) -> Result { + let mut lines = Vec::new(); + lines.push(serde_json::to_string(&serde_json::json!({ + "kind": "metadata", + "query_name": output.query_name, + "target": output.target, + "row_count": output.row_count, + }))?); + for row in rows(output) { + lines.push(serde_json::to_string(&row)?); + } + Ok(lines.join("\n")) +} + +fn render_csv(output: &ReadOutput) -> Result { + let rows = rows(output); + let columns = columns(output, &rows); + let mut lines = Vec::new(); + lines.push( + columns + .iter() + .map(|column| csv_escape(column)) + .collect::>() + .join(","), + ); + for row in rows { + lines.push( + columns + .iter() + .map(|column| csv_escape(&stringify_value(row.get(column).unwrap_or(&Value::Null)))) + .collect::>() + .join(","), + ); + } + Ok(lines.join("\n")) +} + +fn render_kv(output: &ReadOutput) -> String { + let mut lines = vec![header_line(output)]; + let rows = rows(output); + if rows.is_empty() { + lines.push("(no rows)".to_string()); + return lines.join("\n"); + } + + for (idx, row) in rows.iter().enumerate() { + if idx > 0 { + lines.push(String::new()); + } + lines.push(format!("row {}", idx + 1)); + for column in columns(output, &rows) { + lines.push(format!( + "{}: {}", + column, + stringify_value(row.get(&column).unwrap_or(&Value::Null)) + )); + } + } + lines.join("\n") +} + +fn render_table(output: &ReadOutput, options: &ReadRenderOptions) -> String { + let mut lines = vec![header_line(output)]; + let rows = rows(output); + let columns = columns(output, &rows); + + if columns.is_empty() { + lines.push("(no rows)".to_string()); + return lines.join("\n"); + } + + let widths = columns + .iter() + .map(|column| { + let mut width = column.chars().count(); + for row in &rows { + let rendered = + normalize_cell(&stringify_value(row.get(column).unwrap_or(&Value::Null))); + let longest = rendered + .lines() + .map(|line| line.chars().count()) + .max() + .unwrap_or(0); + 
width = width.max(longest.min(options.max_column_width)); + } + width.min(options.max_column_width.max(8)) + }) + .collect::>(); + + lines.push(render_table_line(&columns, &widths)); + lines.push( + widths + .iter() + .map(|width| "-".repeat(*width)) + .collect::>() + .join("-+-"), + ); + + for row in rows { + let cell_lines = columns + .iter() + .zip(widths.iter()) + .map(|(column, width)| { + split_cell( + &normalize_cell(&stringify_value(row.get(column).unwrap_or(&Value::Null))), + *width, + options.cell_layout, + ) + }) + .collect::>(); + let line_count = cell_lines.iter().map(Vec::len).max().unwrap_or(1); + for line_idx in 0..line_count { + let rendered = cell_lines + .iter() + .zip(widths.iter()) + .map(|(segments, width)| { + let segment = segments.get(line_idx).cloned().unwrap_or_default(); + pad_to_width(&segment, *width) + }) + .collect::>(); + lines.push(rendered.join(" | ")); + } + } + + lines.join("\n") +} + +fn render_table_line(columns: &[String], widths: &[usize]) -> String { + columns + .iter() + .zip(widths.iter()) + .map(|(column, width)| pad_to_width(column, *width)) + .collect::>() + .join(" | ") +} + +fn header_line(output: &ReadOutput) -> String { + format!( + "{} rows from {} via {}", + output.row_count, + output + .target + .snapshot + .as_deref() + .map(|id| format!("snapshot {}", id)) + .or_else(|| { + output + .target + .branch + .as_deref() + .map(|branch| format!("branch {}", branch)) + }) + .unwrap_or_else(|| "target".to_string()), + output.query_name + ) +} + +fn rows(output: &ReadOutput) -> Vec> { + output + .rows + .as_array() + .into_iter() + .flatten() + .map(|row| match row { + Value::Object(map) => map.clone(), + other => { + let mut map = Map::new(); + map.insert("value".to_string(), other.clone()); + map + } + }) + .collect() +} + +fn columns(output: &ReadOutput, rows: &[Map]) -> Vec { + if !output.columns.is_empty() { + return output.columns.clone(); + } + + let mut columns = rows + .iter() + .flat_map(|row| 
row.keys().cloned()) + .collect::>(); + columns.sort(); + columns.dedup(); + columns +} + +fn stringify_value(value: &Value) -> String { + match value { + Value::Null => "null".to_string(), + Value::String(text) => text.clone(), + Value::Bool(boolean) => boolean.to_string(), + Value::Number(number) => number.to_string(), + other => serde_json::to_string(other).unwrap_or_else(|_| "".to_string()), + } +} + +fn normalize_cell(value: &str) -> String { + value.replace('\n', "\\n") +} + +fn split_cell(value: &str, width: usize, layout: TableCellLayout) -> Vec { + if value.is_empty() { + return vec![String::new()]; + } + if value.chars().count() <= width { + return vec![value.to_string()]; + } + match layout { + TableCellLayout::Truncate => vec![truncate(value, width)], + TableCellLayout::Wrap => wrap(value, width), + } +} + +fn truncate(value: &str, width: usize) -> String { + if width <= 1 { + return value.chars().take(width).collect(); + } + let keep = width.saturating_sub(1); + let mut out = value.chars().take(keep).collect::(); + out.push('…'); + out +} + +fn wrap(value: &str, width: usize) -> Vec { + let chars = value.chars().collect::>(); + chars + .chunks(width.max(1)) + .map(|chunk| chunk.iter().collect::()) + .collect() +} + +fn pad_to_width(value: &str, width: usize) -> String { + let value_width = value.chars().count(); + if value_width >= width { + value.to_string() + } else { + format!("{}{}", value, " ".repeat(width - value_width)) + } +} + +fn csv_escape(value: &str) -> String { + if value.contains(',') || value.contains('"') || value.contains('\n') || value.contains('\r') { + format!("\"{}\"", value.replace('"', "\"\"")) + } else { + value.to_string() + } +} + +#[cfg(test)] +mod tests { + use omnigraph_server::api::{ReadOutput, ReadTargetOutput}; + + use super::*; + + fn sample_output() -> ReadOutput { + ReadOutput { + query_name: "get_person".to_string(), + target: ReadTargetOutput { + branch: Some("main".to_string()), + snapshot: None, + }, + row_count: 
1, + columns: vec!["name".to_string(), "age".to_string()], + rows: serde_json::json!([{ "name": "Alice", "age": 30 }]), + } + } + + #[test] + fn csv_format_outputs_header_and_rows() { + let rendered = render_read( + &sample_output(), + ReadOutputFormat::Csv, + &ReadRenderOptions { + max_column_width: 80, + cell_layout: TableCellLayout::Truncate, + }, + ) + .unwrap(); + + assert!(rendered.lines().next().unwrap().contains("name,age")); + assert!(rendered.contains("Alice,30")); + } + + #[test] + fn jsonl_format_emits_metadata_first() { + let rendered = render_read( + &sample_output(), + ReadOutputFormat::Jsonl, + &ReadRenderOptions { + max_column_width: 80, + cell_layout: TableCellLayout::Truncate, + }, + ) + .unwrap(); + + let first = rendered.lines().next().unwrap(); + assert!(first.contains("\"kind\":\"metadata\"")); + assert!( + rendered + .lines() + .nth(1) + .unwrap() + .contains("\"name\":\"Alice\"") + ); + } + + #[test] + fn render_falls_back_to_discovered_columns_for_legacy_payloads() { + let mut output = sample_output(); + output.columns.clear(); + + let rendered = render_read( + &output, + ReadOutputFormat::Csv, + &ReadRenderOptions { + max_column_width: 80, + cell_layout: TableCellLayout::Truncate, + }, + ) + .unwrap(); + + assert!(rendered.lines().next().unwrap().contains("age,name")); + } + + #[test] + fn csv_quotes_carriage_returns() { + assert_eq!(csv_escape("hello\rworld"), "\"hello\rworld\""); + } +} diff --git a/crates/omnigraph-cli/tests/cli.rs b/crates/omnigraph-cli/tests/cli.rs new file mode 100644 index 0000000..62aa16a --- /dev/null +++ b/crates/omnigraph-cli/tests/cli.rs @@ -0,0 +1,1408 @@ +use std::fs; + +use omnigraph::db::{Omnigraph, ReadTarget}; +use serde_json::Value; +use tempfile::tempdir; + +mod support; + +use support::*; + +const POLICY_YAML: &str = r#" +version: 1 +groups: + team: [act-andrew, act-bruno] + admins: [act-andrew] +protected_branches: [main] +rules: + - id: team-read + allow: + actors: { group: team } + actions: [read] 
+ branch_scope: any + - id: team-write + allow: + actors: { group: team } + actions: [change] + branch_scope: unprotected + - id: admins-promote + allow: + actors: { group: admins } + actions: [branch_merge, run_publish] + target_branch_scope: protected +"#; + +const POLICY_TESTS_YAML: &str = r#" +version: 1 +cases: + - id: allow-feature-write + actor: act-andrew + action: change + branch: feature + expect: allow + - id: deny-main-write + actor: act-bruno + action: change + branch: main + expect: deny +"#; + +fn manifest_dataset_version(repo: &std::path::Path) -> u64 { + tokio::runtime::Runtime::new().unwrap().block_on(async { + Omnigraph::open(repo.to_string_lossy().as_ref()) + .await + .unwrap() + .snapshot_of(ReadTarget::branch("main")) + .await + .unwrap() + .version() + }) +} + +fn write_policy_config_fixture(root: &std::path::Path) -> (std::path::PathBuf, std::path::PathBuf) { + let config = root.join("omnigraph.yaml"); + let policy = root.join("policy.yaml"); + fs::write( + &config, + r#" +project: + name: policy-test-repo +policy: + file: ./policy.yaml +"#, + ) + .unwrap(); + fs::write(&policy, POLICY_YAML).unwrap(); + fs::write(root.join("policy.tests.yaml"), POLICY_TESTS_YAML).unwrap(); + (config, policy) +} + +#[test] +fn version_command_prints_current_cli_version() { + let output = output_success(cli().arg("version")); + let stdout = stdout_string(&output); + + assert_eq!( + stdout.trim(), + format!("omnigraph {}", env!("CARGO_PKG_VERSION")) + ); +} + +#[test] +fn short_version_flag_prints_current_cli_version() { + let output = output_success(cli().arg("-v")); + let stdout = stdout_string(&output); + + assert_eq!( + stdout.trim(), + format!("omnigraph {}", env!("CARGO_PKG_VERSION")) + ); +} + +#[test] +fn long_version_flag_prints_current_cli_version() { + let output = output_success(cli().arg("--version")); + let stdout = stdout_string(&output); + + assert_eq!( + stdout.trim(), + format!("omnigraph {}", env!("CARGO_PKG_VERSION")) + ); +} + +#[test] +fn 
embed_seed_fills_missing_and_preserves_existing_vectors_by_default() { + let temp = tempdir().unwrap(); + let seed = write_seed_fixture(temp.path()); + + let output = output_success( + cli() + .env("OMNIGRAPH_EMBEDDINGS_MOCK", "1") + .arg("embed") + .arg("--seed") + .arg(&seed) + .arg("--json"), + ); + let payload: Value = serde_json::from_slice(&output.stdout).unwrap(); + assert_eq!(payload["mode"], "fill_missing"); + assert_eq!(payload["embedded_rows"], 1); + assert_eq!(payload["selected_rows"], 2); + + let embedded = read_embedded_rows(temp.path().join("build/seed.embedded.jsonl")); + assert_eq!( + embedded[0]["data"]["embedding"].as_array().unwrap().len(), + 4 + ); + assert_eq!( + embedded[1]["data"]["embedding"], + serde_json::json!([0.1, 0.2]) + ); +} + +#[test] +fn embed_clean_removes_selected_embeddings() { + let temp = tempdir().unwrap(); + let seed = write_seed_fixture(temp.path()); + + let output = output_success( + cli() + .arg("embed") + .arg("--seed") + .arg(&seed) + .arg("--clean") + .arg("--select") + .arg("Decision:slug=dec-beta") + .arg("--json"), + ); + let payload: Value = serde_json::from_slice(&output.stdout).unwrap(); + assert_eq!(payload["mode"], "clean"); + assert_eq!(payload["cleaned_rows"], 1); + + let embedded = read_embedded_rows(temp.path().join("build/seed.embedded.jsonl")); + assert!(embedded[0]["data"].get("embedding").is_none()); + assert!(embedded[1]["data"].get("embedding").is_none()); +} + +#[test] +fn embed_select_reembeds_only_matching_rows() { + let temp = tempdir().unwrap(); + let seed = write_seed_fixture(temp.path()); + + let output = output_success( + cli() + .env("OMNIGRAPH_EMBEDDINGS_MOCK", "1") + .arg("embed") + .arg("--seed") + .arg(&seed) + .arg("--select") + .arg("Decision:slug=dec-beta") + .arg("--json"), + ); + let payload: Value = serde_json::from_slice(&output.stdout).unwrap(); + assert_eq!(payload["mode"], "reembed_selected"); + assert_eq!(payload["embedded_rows"], 1); + assert_eq!(payload["selected_rows"], 1); 
+ + let embedded = read_embedded_rows(temp.path().join("build/seed.embedded.jsonl")); + assert!(embedded[0]["data"].get("embedding").is_none()); + assert_ne!( + embedded[1]["data"]["embedding"], + serde_json::json!([0.1, 0.2]) + ); + assert_eq!( + embedded[1]["data"]["embedding"].as_array().unwrap().len(), + 4 + ); +} + +#[test] +fn embed_seed_preserves_non_entity_rows() { + let temp = tempdir().unwrap(); + let seed = write_seed_fixture_with_edge(temp.path()); + + let output = output_success( + cli() + .env("OMNIGRAPH_EMBEDDINGS_MOCK", "1") + .arg("embed") + .arg("--seed") + .arg(&seed) + .arg("--json"), + ); + let payload: Value = serde_json::from_slice(&output.stdout).unwrap(); + assert_eq!(payload["rows"], 3); + assert_eq!(payload["embedded_rows"], 1); + + let embedded = read_embedded_rows(temp.path().join("build/seed.embedded.jsonl")); + assert_eq!(embedded.len(), 3); + assert_eq!(embedded[2]["edge"], "Triggered"); + assert_eq!(embedded[2]["from"], "sig-alpha"); + assert_eq!(embedded[2]["to"], "dec-alpha"); +} + +#[test] +fn init_creates_repo_successfully_on_missing_local_directory() { + let temp = tempdir().unwrap(); + let repo = repo_path(temp.path()); + let schema = fixture("test.pg"); + + let output = output_success(cli().arg("init").arg("--schema").arg(&schema).arg(&repo)); + let stdout = stdout_string(&output); + + assert!(stdout.contains("initialized")); + assert!(repo.join("_schema.pg").exists()); + assert!(repo.join("__manifest").exists()); + assert!(temp.path().join("omnigraph.yaml").exists()); +} + +#[test] +fn schema_plan_json_reports_supported_additive_change() { + let temp = tempdir().unwrap(); + let repo = repo_path(temp.path()); + let schema_path = temp.path().join("next.pg"); + init_repo(&repo); + + let next_schema = fs::read_to_string(fixture("test.pg")).unwrap().replace( + " age: I32?\n}", + " age: I32?\n nickname: String?\n}", + ); + fs::write(&schema_path, next_schema).unwrap(); + + let output = output_success( + cli() + .arg("schema") + 
.arg("plan") + .arg("--schema") + .arg(&schema_path) + .arg("--json") + .arg(&repo), + ); + let payload: Value = serde_json::from_slice(&output.stdout).unwrap(); + + assert_eq!(payload["supported"], true); + assert_eq!(payload["step_count"], 1); + assert_eq!(payload["steps"][0]["kind"], "add_property"); + assert_eq!(payload["steps"][0]["type_kind"], "node"); + assert_eq!(payload["steps"][0]["type_name"], "Person"); + assert_eq!(payload["steps"][0]["property_name"], "nickname"); +} + +#[test] +fn schema_plan_json_reports_unsupported_type_change() { + let temp = tempdir().unwrap(); + let repo = repo_path(temp.path()); + let schema_path = temp.path().join("breaking.pg"); + init_repo(&repo); + + let breaking_schema = fs::read_to_string(fixture("test.pg")) + .unwrap() + .replace("age: I32?", "age: I64?"); + fs::write(&schema_path, breaking_schema).unwrap(); + + let output = output_success( + cli() + .arg("schema") + .arg("plan") + .arg("--schema") + .arg(&schema_path) + .arg("--json") + .arg(&repo), + ); + let payload: Value = serde_json::from_slice(&output.stdout).unwrap(); + + assert_eq!(payload["supported"], false); + assert!(payload["steps"].as_array().unwrap().iter().any(|step| { + step["kind"] == "unsupported_change" + && step["entity"] + .as_str() + .unwrap_or_default() + .contains("Person.age") + })); +} + +#[test] +fn load_json_outputs_summary_for_main_branch() { + let temp = tempdir().unwrap(); + let repo = repo_path(temp.path()); + init_repo(&repo); + let data = fixture("test.jsonl"); + + let output = output_success( + cli() + .arg("load") + .arg("--data") + .arg(&data) + .arg("--json") + .arg(&repo), + ); + let payload: Value = serde_json::from_slice(&output.stdout).unwrap(); + + assert_eq!(payload["branch"], "main"); + assert_eq!(payload["mode"], "overwrite"); + assert_eq!(payload["nodes_loaded"], 2); + assert_eq!(payload["edges_loaded"], 2); +} + +#[test] +fn load_into_feature_branch_with_merge_mode_succeeds() { + let temp = tempdir().unwrap(); + let repo 
= repo_path(temp.path()); + init_repo(&repo); + load_fixture(&repo); + + output_success( + cli() + .arg("branch") + .arg("create") + .arg("--uri") + .arg(&repo) + .arg("--from") + .arg("main") + .arg("feature"), + ); + + let feature_data = temp.path().join("feature.jsonl"); + write_jsonl( + &feature_data, + r#"{"type":"Person","data":{"name":"Alice","age":31}}"#, + ); + + let output = output_success( + cli() + .arg("load") + .arg("--data") + .arg(&feature_data) + .arg("--branch") + .arg("feature") + .arg("--mode") + .arg("merge") + .arg(&repo), + ); + let stdout = stdout_string(&output); + + assert!(stdout.contains("branch feature")); + assert!(stdout.contains("with merge")); + assert!(stdout.contains("1 node types")); +} + +#[test] +fn read_json_outputs_rows_for_named_query() { + let temp = tempdir().unwrap(); + let repo = repo_path(temp.path()); + init_repo(&repo); + load_fixture(&repo); + let queries = fixture("test.gq"); + + let output = output_success( + cli() + .arg("read") + .arg(&repo) + .arg("--query") + .arg(&queries) + .arg("--name") + .arg("get_person") + .arg("--params") + .arg(r#"{"name":"Alice"}"#) + .arg("--json"), + ); + let payload: Value = serde_json::from_slice(&output.stdout).unwrap(); + + assert_eq!(payload["query_name"], "get_person"); + assert_eq!(payload["target"]["branch"], "main"); + assert_eq!(payload["row_count"], 1); + assert_eq!(payload["rows"][0]["p.name"], "Alice"); +} + +#[test] +fn export_jsonl_outputs_source_rows_for_selected_branch_and_type() { + let temp = tempdir().unwrap(); + let repo = repo_path(temp.path()); + init_repo(&repo); + load_fixture(&repo); + + output_success( + cli() + .arg("branch") + .arg("create") + .arg("--uri") + .arg(&repo) + .arg("--from") + .arg("main") + .arg("feature"), + ); + + let feature_data = temp.path().join("feature-export.jsonl"); + write_jsonl( + &feature_data, + r#"{"type":"Person","data":{"name":"Eve","age":29}}"#, + ); + output_success( + cli() + .arg("load") + .arg("--data") + 
.arg(&feature_data) + .arg("--branch") + .arg("feature") + .arg("--mode") + .arg("append") + .arg(&repo), + ); + + let output = output_success( + cli() + .arg("export") + .arg(&repo) + .arg("--branch") + .arg("feature") + .arg("--type") + .arg("Person") + .arg("--jsonl"), + ); + let rows = stdout_string(&output) + .lines() + .map(|line| serde_json::from_str::(line).unwrap()) + .collect::>(); + + assert_eq!(rows.len(), 5); + assert!(rows.iter().all(|row| row["type"] == "Person")); + assert!(rows.iter().all(|row| row.get("edge").is_none())); + assert!( + rows.iter() + .any(|row| row["data"]["name"].as_str() == Some("Eve")) + ); +} + +#[test] +fn policy_validate_accepts_valid_policy_file() { + let temp = tempdir().unwrap(); + let (config, _) = write_policy_config_fixture(temp.path()); + + let output = output_success( + cli() + .arg("policy") + .arg("validate") + .arg("--config") + .arg(&config), + ); + let stdout = stdout_string(&output); + + assert!(stdout.contains("policy valid:")); + assert!(stdout.contains("policy.yaml")); + assert!(stdout.contains("[2 actors]")); +} + +#[test] +fn policy_validate_fails_for_invalid_policy_file() { + let temp = tempdir().unwrap(); + let config = temp.path().join("omnigraph.yaml"); + let policy = temp.path().join("policy.yaml"); + fs::write( + &config, + r#" +project: + name: policy-test-repo +policy: + file: ./policy.yaml +"#, + ) + .unwrap(); + fs::write( + &policy, + r#" +version: 1 +groups: + team: [act-andrew] +rules: + - id: duplicate + allow: + actors: { group: team } + actions: [read] + branch_scope: any + - id: duplicate + allow: + actors: { group: team } + actions: [export] + branch_scope: any +"#, + ) + .unwrap(); + + let output = output_failure( + cli() + .arg("policy") + .arg("validate") + .arg("--config") + .arg(&config), + ); + let stderr = String::from_utf8(output.stderr).unwrap(); + assert!(stderr.contains("duplicate policy rule id")); +} + +#[test] +fn policy_test_runs_declarative_cases() { + let temp = 
tempdir().unwrap(); + let (config, _) = write_policy_config_fixture(temp.path()); + + let output = output_success(cli().arg("policy").arg("test").arg("--config").arg(&config)); + let stdout = stdout_string(&output); + + assert!(stdout.contains("policy tests passed: 2 cases")); +} + +#[test] +fn policy_explain_reports_decision_and_matched_rule() { + let temp = tempdir().unwrap(); + let (config, _) = write_policy_config_fixture(temp.path()); + + let allow = output_success( + cli() + .arg("policy") + .arg("explain") + .arg("--config") + .arg(&config) + .arg("--actor") + .arg("act-andrew") + .arg("--action") + .arg("change") + .arg("--branch") + .arg("feature"), + ); + let allow_stdout = stdout_string(&allow); + assert!(allow_stdout.contains("decision: allow")); + assert!(allow_stdout.contains("matched_rule: team-write")); + + let deny = output_success( + cli() + .arg("policy") + .arg("explain") + .arg("--config") + .arg(&config) + .arg("--actor") + .arg("act-bruno") + .arg("--action") + .arg("change") + .arg("--branch") + .arg("main"), + ); + let deny_stdout = stdout_string(&deny); + assert!(deny_stdout.contains("decision: deny")); + assert!(deny_stdout.contains("message: policy denied action 'change' on branch 'main'")); +} + +#[test] +fn read_can_resolve_uri_from_config() { + let temp = tempdir().unwrap(); + let repo = repo_path(temp.path()); + let config = temp.path().join("omnigraph.yaml"); + init_repo(&repo); + load_fixture(&repo); + write_config(&config, &local_yaml_config(&repo)); + + let output = output_success( + cli() + .arg("read") + .arg("--config") + .arg(&config) + .arg("--query") + .arg(fixture("test.gq")) + .arg("--name") + .arg("get_person") + .arg("--params") + .arg(r#"{"name":"Alice"}"#) + .arg("--json"), + ); + let payload: Value = serde_json::from_slice(&output.stdout).unwrap(); + assert_eq!(payload["row_count"], 1); +} + +#[test] +fn read_alias_from_yaml_config_runs_with_kv_output() { + let temp = tempdir().unwrap(); + let repo = 
repo_path(temp.path()); + let config = temp.path().join("omnigraph.yaml"); + let query = temp.path().join("aliases.gq"); + init_repo(&repo); + load_fixture(&repo); + write_query_file( + &query, + &std::fs::read_to_string(fixture("test.gq")).unwrap(), + ); + write_config( + &config, + &format!( + "{}aliases:\n owner:\n command: read\n query: aliases.gq\n name: get_person\n args: [name]\n format: kv\n", + local_yaml_config(&repo) + ), + ); + + let output = output_success( + cli() + .arg("read") + .arg("--config") + .arg(&config) + .arg("--alias") + .arg("owner") + .arg("Alice"), + ); + let stdout = stdout_string(&output); + + assert!(stdout.contains("row 1")); + assert!(stdout.contains("p.name: Alice")); +} + +#[test] +fn change_alias_from_yaml_config_persists_changes() { + let temp = tempdir().unwrap(); + let repo = repo_path(temp.path()); + let config = temp.path().join("omnigraph.yaml"); + let query = temp.path().join("mutations.gq"); + init_repo(&repo); + load_fixture(&repo); + write_query_file( + &query, + r#" +query insert_person($name: String, $age: I32) { + insert Person { name: $name, age: $age } +} +"#, + ); + write_config( + &config, + &format!( + "{}aliases:\n add_person:\n command: change\n query: mutations.gq\n name: insert_person\n args: [name, age]\n", + local_yaml_config(&repo) + ), + ); + + let output = output_success( + cli() + .arg("change") + .arg("--config") + .arg(&config) + .arg("--alias") + .arg("add_person") + .arg("Eve") + .arg("29") + .arg("--json"), + ); + let payload: Value = serde_json::from_slice(&output.stdout).unwrap(); + assert_eq!(payload["affected_nodes"], 1); + + let verify = output_success( + cli() + .arg("read") + .arg(&repo) + .arg("--query") + .arg(fixture("test.gq")) + .arg("--name") + .arg("get_person") + .arg("--params") + .arg(r#"{"name":"Eve"}"#) + .arg("--json"), + ); + let verify_payload: Value = serde_json::from_slice(&verify.stdout).unwrap(); + assert_eq!(verify_payload["row_count"], 1); +} + +#[test] +fn 
read_csv_format_outputs_header_and_row_values() { + let temp = tempdir().unwrap(); + let repo = repo_path(temp.path()); + init_repo(&repo); + load_fixture(&repo); + + let output = output_success( + cli() + .arg("read") + .arg(&repo) + .arg("--query") + .arg(fixture("test.gq")) + .arg("--name") + .arg("get_person") + .arg("--params") + .arg(r#"{"name":"Alice"}"#) + .arg("--format") + .arg("csv"), + ); + let stdout = stdout_string(&output); + + assert!(stdout.lines().next().unwrap().contains("p.name")); + assert!(stdout.contains("Alice")); +} + +#[test] +fn read_jsonl_format_outputs_metadata_header_first() { + let temp = tempdir().unwrap(); + let repo = repo_path(temp.path()); + init_repo(&repo); + load_fixture(&repo); + + let output = output_success( + cli() + .arg("read") + .arg(&repo) + .arg("--query") + .arg(fixture("test.gq")) + .arg("--name") + .arg("get_person") + .arg("--params") + .arg(r#"{"name":"Alice"}"#) + .arg("--format") + .arg("jsonl"), + ); + let stdout = stdout_string(&output); + let mut lines = stdout.lines(); + assert!(lines.next().unwrap().contains("\"kind\":\"metadata\"")); + assert!(lines.next().unwrap().contains("\"p.name\":\"Alice\"")); +} + +#[test] +fn change_json_outputs_affected_counts_and_persists() { + let temp = tempdir().unwrap(); + let repo = repo_path(temp.path()); + init_repo(&repo); + load_fixture(&repo); + let mutation_file = temp.path().join("mutations.gq"); + write_query_file( + &mutation_file, + r#" +query insert_person($name: String, $age: I32) { + insert Person { name: $name, age: $age } +} +"#, + ); + + let output = output_success( + cli() + .arg("change") + .arg(&repo) + .arg("--query") + .arg(&mutation_file) + .arg("--params") + .arg(r#"{"name":"Eve","age":29}"#) + .arg("--json"), + ); + let payload: Value = serde_json::from_slice(&output.stdout).unwrap(); + assert_eq!(payload["branch"], "main"); + assert_eq!(payload["query_name"], "insert_person"); + assert_eq!(payload["affected_nodes"], 1); + 
assert_eq!(payload["affected_edges"], 0); + + let verify = output_success( + cli() + .arg("read") + .arg(&repo) + .arg("--query") + .arg(fixture("test.gq")) + .arg("--name") + .arg("get_person") + .arg("--params") + .arg(r#"{"name":"Eve"}"#) + .arg("--json"), + ); + let verify_payload: Value = serde_json::from_slice(&verify.stdout).unwrap(); + assert_eq!(verify_payload["row_count"], 1); + assert_eq!(verify_payload["rows"][0]["p.name"], "Eve"); +} + +#[test] +fn change_can_resolve_uri_and_branch_from_config() { + let temp = tempdir().unwrap(); + let repo = repo_path(temp.path()); + let config = temp.path().join("omnigraph.yaml"); + init_repo(&repo); + load_fixture(&repo); + write_config(&config, &local_yaml_config(&repo)); + let mutation_file = temp.path().join("config-mutations.gq"); + write_query_file( + &mutation_file, + r#" +query insert_person($name: String, $age: I32) { + insert Person { name: $name, age: $age } +} +"#, + ); + + let output = output_success( + cli() + .arg("change") + .arg("--config") + .arg(&config) + .arg("--query") + .arg(&mutation_file) + .arg("--params") + .arg(r#"{"name":"Mia","age":30}"#) + .arg("--json"), + ); + let payload: Value = serde_json::from_slice(&output.stdout).unwrap(); + assert_eq!(payload["branch"], "main"); + assert_eq!(payload["affected_nodes"], 1); +} + +#[test] +fn read_requires_name_for_multi_query_files() { + let temp = tempdir().unwrap(); + let repo = repo_path(temp.path()); + init_repo(&repo); + load_fixture(&repo); + + let output = output_failure( + cli() + .arg("read") + .arg(&repo) + .arg("--query") + .arg(fixture("test.gq")), + ); + let stderr = String::from_utf8(output.stderr).unwrap(); + assert!(stderr.contains("multiple queries")); +} + +#[test] +fn branch_create_json_outputs_source_and_name() { + let temp = tempdir().unwrap(); + let repo = repo_path(temp.path()); + init_repo(&repo); + + let output = output_success( + cli() + .arg("branch") + .arg("create") + .arg("--uri") + .arg(&repo) + .arg("--from") + 
.arg("main") + .arg("feature") + .arg("--json"), + ); + let payload: Value = serde_json::from_slice(&output.stdout).unwrap(); + + assert_eq!(payload["from"], "main"); + assert_eq!(payload["name"], "feature"); + assert_eq!(payload["uri"], repo.to_string_lossy().as_ref()); +} + +#[test] +fn branch_list_outputs_sorted_branches() { + let temp = tempdir().unwrap(); + let repo = repo_path(temp.path()); + init_repo(&repo); + + output_success( + cli() + .arg("branch") + .arg("create") + .arg("--uri") + .arg(&repo) + .arg("--from") + .arg("main") + .arg("zeta"), + ); + output_success( + cli() + .arg("branch") + .arg("create") + .arg("--uri") + .arg(&repo) + .arg("--from") + .arg("main") + .arg("alpha"), + ); + + let output = output_success(cli().arg("branch").arg("list").arg("--uri").arg(&repo)); + let stdout = stdout_string(&output); + let lines = stdout + .lines() + .map(str::trim) + .filter(|line| !line.is_empty()) + .collect::>(); + + assert_eq!(lines, vec!["alpha", "main", "zeta"]); +} + +#[test] +fn branch_delete_json_outputs_name_and_removes_branch() { + let temp = tempdir().unwrap(); + let repo = repo_path(temp.path()); + init_repo(&repo); + + output_success( + cli() + .arg("branch") + .arg("create") + .arg("--uri") + .arg(&repo) + .arg("--from") + .arg("main") + .arg("feature"), + ); + + let output = output_success( + cli() + .arg("branch") + .arg("delete") + .arg("--uri") + .arg(&repo) + .arg("feature") + .arg("--json"), + ); + let payload: Value = serde_json::from_slice(&output.stdout).unwrap(); + assert_eq!(payload["name"], "feature"); + assert_eq!(payload["uri"], repo.to_string_lossy().as_ref()); + + let listed = output_success(cli().arg("branch").arg("list").arg("--uri").arg(&repo)); + let stdout = stdout_string(&listed); + let lines = stdout + .lines() + .map(str::trim) + .filter(|line| !line.is_empty()) + .collect::>(); + assert_eq!(lines, vec!["main"]); +} + +#[test] +fn branch_delete_rejects_main() { + let temp = tempdir().unwrap(); + let repo = 
repo_path(temp.path()); + init_repo(&repo); + + let output = output_failure( + cli() + .arg("branch") + .arg("delete") + .arg("--uri") + .arg(&repo) + .arg("main"), + ); + let stderr = String::from_utf8(output.stderr).unwrap(); + assert!(stderr.contains("cannot delete branch 'main'")); +} + +#[test] +fn branch_merge_defaults_target_to_main() { + let temp = tempdir().unwrap(); + let repo = repo_path(temp.path()); + init_repo(&repo); + load_fixture(&repo); + + output_success( + cli() + .arg("branch") + .arg("create") + .arg("--uri") + .arg(&repo) + .arg("--from") + .arg("main") + .arg("feature"), + ); + + let feature_data = temp.path().join("feature.jsonl"); + write_jsonl( + &feature_data, + r#"{"type":"Person","data":{"name":"Eve","age":29}}"#, + ); + output_success( + cli() + .arg("load") + .arg("--data") + .arg(&feature_data) + .arg("--branch") + .arg("feature") + .arg("--mode") + .arg("append") + .arg(&repo), + ); + + let merge_output = output_success( + cli() + .arg("branch") + .arg("merge") + .arg("--uri") + .arg(&repo) + .arg("feature") + .arg("--json"), + ); + let merge_payload: Value = serde_json::from_slice(&merge_output.stdout).unwrap(); + assert_eq!(merge_payload["source"], "feature"); + assert_eq!(merge_payload["target"], "main"); + assert_eq!(merge_payload["outcome"], "fast_forward"); + + let snapshot_output = output_success( + cli() + .arg("snapshot") + .arg(&repo) + .arg("--branch") + .arg("main") + .arg("--json"), + ); + let snapshot: Value = serde_json::from_slice(&snapshot_output.stdout).unwrap(); + let person_row_count = snapshot["tables"] + .as_array() + .unwrap() + .iter() + .find(|table| table["table_key"] == "node:Person") + .unwrap()["row_count"] + .as_u64() + .unwrap(); + assert_eq!(person_row_count, 5); +} + +#[test] +fn branch_merge_supports_explicit_target() { + let temp = tempdir().unwrap(); + let repo = repo_path(temp.path()); + init_repo(&repo); + load_fixture(&repo); + + output_success( + cli() + .arg("branch") + .arg("create") + 
.arg("--uri") + .arg(&repo) + .arg("--from") + .arg("main") + .arg("feature"), + ); + output_success( + cli() + .arg("branch") + .arg("create") + .arg("--uri") + .arg(&repo) + .arg("--from") + .arg("main") + .arg("experiment"), + ); + + let feature_data = temp.path().join("feature-explicit.jsonl"); + write_jsonl( + &feature_data, + r#"{"type":"Person","data":{"name":"Frank","age":41}}"#, + ); + output_success( + cli() + .arg("load") + .arg("--data") + .arg(&feature_data) + .arg("--branch") + .arg("feature") + .arg("--mode") + .arg("append") + .arg(&repo), + ); + + let merge_output = output_success( + cli() + .arg("branch") + .arg("merge") + .arg("--uri") + .arg(&repo) + .arg("feature") + .arg("--into") + .arg("experiment") + .arg("--json"), + ); + let merge_payload: Value = serde_json::from_slice(&merge_output.stdout).unwrap(); + assert_eq!(merge_payload["target"], "experiment"); + assert_eq!(merge_payload["outcome"], "fast_forward"); +} + +#[test] +fn snapshot_json_returns_manifest_version_and_tables() { + let temp = tempdir().unwrap(); + let repo = repo_path(temp.path()); + init_repo(&repo); + load_fixture(&repo); + + let output = output_success(cli().arg("snapshot").arg(&repo).arg("--json")); + let payload: Value = serde_json::from_slice(&output.stdout).unwrap(); + + assert_eq!(payload["branch"], "main"); + assert_eq!( + payload["manifest_version"].as_u64().unwrap(), + manifest_dataset_version(&repo) + ); + assert!(payload["tables"].as_array().unwrap().len() >= 4); +} + +fn write_seed_fixture(root: &std::path::Path) -> std::path::PathBuf { + fs::create_dir_all(root.join("data")).unwrap(); + fs::create_dir_all(root.join("build")).unwrap(); + let raw_seed = root.join("data/seed.jsonl"); + let seed = root.join("seed.yaml"); + + fs::write( + &raw_seed, + concat!( + "{\"type\":\"Decision\",\"data\":{\"slug\":\"dec-alpha\",\"intent\":\"Alpha ship\"}}\n", + "{\"type\":\"Decision\",\"data\":{\"slug\":\"dec-beta\",\"intent\":\"Beta ship\",\"embedding\":[0.1,0.2]}}\n" + 
), + ) + .unwrap(); + + fs::write( + &seed, + concat!( + "graph:\n", + " slug: mr-context-graph\n", + "sources:\n", + " raw_seed: ./data/seed.jsonl\n", + "artifacts:\n", + " embedded_seed: ./build/seed.embedded.jsonl\n", + "embeddings:\n", + " model: gemini-embedding-2-preview\n", + " dimension: 4\n", + " types:\n", + " Decision:\n", + " target: embedding\n", + " fields: [slug, intent]\n" + ), + ) + .unwrap(); + + seed +} + +fn write_seed_fixture_with_edge(root: &std::path::Path) -> std::path::PathBuf { + let seed = write_seed_fixture(root); + let raw_seed = root.join("data/seed.jsonl"); + fs::write( + &raw_seed, + concat!( + "{\"type\":\"Decision\",\"data\":{\"slug\":\"dec-alpha\",\"intent\":\"Alpha ship\"}}\n", + "{\"type\":\"Decision\",\"data\":{\"slug\":\"dec-beta\",\"intent\":\"Beta ship\",\"embedding\":[0.1,0.2]}}\n", + "{\"edge\":\"Triggered\",\"from\":\"sig-alpha\",\"to\":\"dec-alpha\"}\n" + ), + ) + .unwrap(); + seed +} + +fn read_embedded_rows(path: std::path::PathBuf) -> Vec { + fs::read_to_string(path) + .unwrap() + .lines() + .filter(|line| !line.trim().is_empty()) + .map(|line| serde_json::from_str(line).unwrap()) + .collect() +} + +#[test] +fn snapshot_can_resolve_uri_from_config() { + let temp = tempdir().unwrap(); + let repo = repo_path(temp.path()); + let config = temp.path().join("omnigraph.yaml"); + init_repo(&repo); + load_fixture(&repo); + write_config(&config, &local_yaml_config(&repo)); + + let output = output_success( + cli() + .arg("snapshot") + .arg("--config") + .arg(&config) + .arg("--json"), + ); + let payload: Value = serde_json::from_slice(&output.stdout).unwrap(); + assert_eq!(payload["branch"], "main"); +} + +#[test] +fn snapshot_human_output_includes_branch_and_table_summaries() { + let temp = tempdir().unwrap(); + let repo = repo_path(temp.path()); + init_repo(&repo); + load_fixture(&repo); + + let output = output_success(cli().arg("snapshot").arg(&repo)); + let stdout = stdout_string(&output); + + 
assert!(stdout.contains("branch: main")); + assert!(stdout.contains("manifest_version:")); + assert!(stdout.contains("node:Person v")); + assert!(stdout.contains("edge:Knows v")); +} + +#[test] +fn cli_fails_for_missing_repo() { + let temp = tempdir().unwrap(); + let repo = repo_path(temp.path()); + + let output = output_failure(cli().arg("snapshot").arg(&repo)); + let stderr = String::from_utf8(output.stderr).unwrap(); + assert!( + stderr.contains("_schema.pg") + || stderr.contains("No such file") + || stderr.contains("not found") + ); +} + +#[test] +fn cli_fails_for_missing_schema_or_data_file() { + let temp = tempdir().unwrap(); + let repo = repo_path(temp.path()); + let missing_schema = temp.path().join("missing.pg"); + let missing_data = temp.path().join("missing.jsonl"); + + let init_output = output_failure( + cli() + .arg("init") + .arg("--schema") + .arg(&missing_schema) + .arg(&repo), + ); + assert!( + String::from_utf8(init_output.stderr) + .unwrap() + .contains("No such file") + ); + + init_repo(&repo); + let load_output = output_failure( + cli() + .arg("load") + .arg("--data") + .arg(&missing_data) + .arg(&repo), + ); + assert!( + String::from_utf8(load_output.stderr) + .unwrap() + .contains("No such file") + ); +} + +#[test] +fn cli_fails_for_invalid_merge_requests() { + let temp = tempdir().unwrap(); + let repo = repo_path(temp.path()); + init_repo(&repo); + load_fixture(&repo); + + let missing_branch = output_failure( + cli() + .arg("branch") + .arg("merge") + .arg("--uri") + .arg(&repo) + .arg("missing"), + ); + let missing_branch_stderr = String::from_utf8(missing_branch.stderr).unwrap(); + assert!( + missing_branch_stderr.contains("missing") + || missing_branch_stderr.contains("head commit") + || missing_branch_stderr.contains("not found") + ); + + let same_branch = output_failure( + cli() + .arg("branch") + .arg("merge") + .arg("--uri") + .arg(&repo) + .arg("main") + .arg("--into") + .arg("main"), + ); + assert!( + 
String::from_utf8(same_branch.stderr) + .unwrap() + .contains("distinct source and target") + ); +} + +#[test] +fn run_list_and_show_report_published_runs() { + let temp = tempdir().unwrap(); + let repo = repo_path(temp.path()); + init_repo(&repo); + load_fixture(&repo); + + let list_output = output_success(cli().arg("run").arg("list").arg(&repo).arg("--json")); + let list_payload: Value = serde_json::from_slice(&list_output.stdout).unwrap(); + let runs = list_payload["runs"].as_array().unwrap(); + assert_eq!(runs.len(), 1); + assert_eq!(runs[0]["status"], "published"); + let run_id = runs[0]["run_id"].as_str().unwrap(); + + let show_output = output_success( + cli() + .arg("run") + .arg("show") + .arg("--uri") + .arg(&repo) + .arg(run_id) + .arg("--json"), + ); + let show_payload: Value = serde_json::from_slice(&show_output.stdout).unwrap(); + assert_eq!(show_payload["run_id"], run_id); + assert_eq!(show_payload["status"], "published"); + assert_eq!(show_payload["target_branch"], "main"); +} + +#[test] +fn run_list_can_resolve_uri_from_config() { + let temp = tempdir().unwrap(); + let repo = repo_path(temp.path()); + let config = temp.path().join("omnigraph.yaml"); + init_repo(&repo); + load_fixture(&repo); + write_config(&config, &local_yaml_config(&repo)); + + let output = output_success( + cli() + .arg("run") + .arg("list") + .arg("--config") + .arg(&config) + .arg("--json"), + ); + let payload: Value = serde_json::from_slice(&output.stdout).unwrap(); + assert_eq!(payload["runs"].as_array().unwrap().len(), 1); +} + +#[test] +fn run_publish_promotes_manual_running_run() { + let runtime = tokio::runtime::Runtime::new().unwrap(); + let temp = tempdir().unwrap(); + let repo = repo_path(temp.path()); + init_repo(&repo); + load_fixture(&repo); + + let run_id = runtime.block_on(begin_manual_run(&repo, "main")); + + let publish_output = output_success( + cli() + .arg("run") + .arg("publish") + .arg("--uri") + .arg(&repo) + .arg(&run_id) + .arg("--json"), + ); + let 
payload: Value = serde_json::from_slice(&publish_output.stdout).unwrap(); + assert_eq!(payload["run_id"], run_id); + assert_eq!(payload["status"], "published"); + assert!(payload["published_snapshot_id"].is_string()); + + let runtime = tokio::runtime::Runtime::new().unwrap(); + runtime.block_on(async { + let db = Omnigraph::open(repo.to_str().unwrap()).await.unwrap(); + let result = db + .query( + ReadTarget::branch("main"), + include_str!("../../omnigraph/tests/fixtures/test.gq"), + "get_person", + &omnigraph_compiler::ir::ParamMap::from([( + "name".to_string(), + omnigraph_compiler::query::ast::Literal::String("Eve".to_string()), + )]), + ) + .await + .unwrap(); + assert_eq!(result.num_rows(), 1); + }); +} + +#[test] +fn run_abort_marks_manual_running_run_aborted() { + let runtime = tokio::runtime::Runtime::new().unwrap(); + let temp = tempdir().unwrap(); + let repo = repo_path(temp.path()); + init_repo(&repo); + load_fixture(&repo); + + let run_id = runtime.block_on(begin_manual_run(&repo, "main")); + + let abort_output = output_success( + cli() + .arg("run") + .arg("abort") + .arg("--uri") + .arg(&repo) + .arg(&run_id) + .arg("--json"), + ); + let payload: Value = serde_json::from_slice(&abort_output.stdout).unwrap(); + assert_eq!(payload["run_id"], run_id); + assert_eq!(payload["status"], "aborted"); +} diff --git a/crates/omnigraph-cli/tests/support/mod.rs b/crates/omnigraph-cli/tests/support/mod.rs new file mode 100644 index 0000000..8e38ee4 --- /dev/null +++ b/crates/omnigraph-cli/tests/support/mod.rs @@ -0,0 +1,292 @@ +#![allow(dead_code)] + +use std::fs; +use std::net::TcpListener; +use std::path::{Path, PathBuf}; +use std::process::{Child, Command as StdCommand, Output, Stdio}; +use std::thread::sleep; +use std::time::Duration; + +use assert_cmd::Command; +use omnigraph::db::Omnigraph; +use omnigraph::loader::LoadMode; +use reqwest::blocking::Client; +use serde_json::Value; +use tempfile::{TempDir, tempdir}; + +pub fn cli() -> Command { + 
Command::cargo_bin("omnigraph").unwrap() +} + +pub fn cli_process() -> StdCommand { + StdCommand::new(assert_cmd::cargo::cargo_bin("omnigraph")) +} + +fn server_process() -> StdCommand { + if let Some(path) = std::env::var_os("CARGO_BIN_EXE_omnigraph-server") { + StdCommand::new(path) + } else if let Some(path) = built_server_binary() { + StdCommand::new(path) + } else { + let cargo = std::env::var_os("CARGO").unwrap_or_else(|| "cargo".into()); + let mut cmd = StdCommand::new(cargo); + cmd.arg("run") + .arg("--quiet") + .arg("-p") + .arg("omnigraph-server") + .arg("--"); + cmd + } +} + +fn built_server_binary() -> Option { + let workspace_root = PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../.."); + let candidate = workspace_root + .join("target") + .join("debug") + .join(format!("omnigraph-server{}", std::env::consts::EXE_SUFFIX)); + candidate.exists().then_some(candidate) +} + +pub fn fixture(name: &str) -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("../omnigraph/tests/fixtures") + .join(name) +} + +pub fn repo_path(root: &Path) -> PathBuf { + root.join("demo.omni") +} + +pub fn output_success(cmd: &mut Command) -> Output { + let output = cmd.output().unwrap(); + assert!( + output.status.success(), + "command failed\nstdout:\n{}\nstderr:\n{}", + String::from_utf8_lossy(&output.stdout), + String::from_utf8_lossy(&output.stderr) + ); + output +} + +pub fn output_failure(cmd: &mut Command) -> Output { + let output = cmd.output().unwrap(); + assert!( + !output.status.success(), + "command unexpectedly succeeded\nstdout:\n{}\nstderr:\n{}", + String::from_utf8_lossy(&output.stdout), + String::from_utf8_lossy(&output.stderr) + ); + output +} + +pub fn stdout_string(output: &Output) -> String { + String::from_utf8(output.stdout.clone()).unwrap() +} + +pub fn parse_stdout_json(output: &Output) -> Value { + serde_json::from_slice(&output.stdout).unwrap() +} + +pub fn init_repo(repo: &Path) { + let schema = fixture("test.pg"); + 
output_success(cli().arg("init").arg("--schema").arg(&schema).arg(repo)); +} + +pub fn load_fixture(repo: &Path) { + let data = fixture("test.jsonl"); + output_success(cli().arg("load").arg("--data").arg(&data).arg(repo)); +} + +pub fn write_jsonl(path: &Path, rows: &str) { + fs::write(path, rows).unwrap(); +} + +pub fn write_query_file(path: &Path, source: &str) { + fs::write(path, source).unwrap(); +} + +pub fn write_config(path: &Path, source: &str) { + fs::write(path, source).unwrap(); +} + +fn yaml_string(value: &str) -> String { + format!("'{}'", value.replace('\'', "''")) +} + +pub fn local_yaml_config(repo: &Path) -> String { + format!( + "\ +targets: + local: + uri: {} +cli: + target: local + branch: main +query: + roots: + - . +policy: {{}} +", + yaml_string(&repo.to_string_lossy()) + ) +} + +pub fn remote_yaml_config(url: &str) -> String { + format!( + "\ +targets: + dev: + uri: {} +cli: + target: dev + branch: main +query: + roots: + - . +policy: {{}} +", + yaml_string(url) + ) +} + +pub struct TestServer { + child: Child, + pub base_url: String, +} + +impl Drop for TestServer { + fn drop(&mut self) { + let _ = self.child.kill(); + let _ = self.child.wait(); + } +} + +fn free_port() -> u16 { + let listener = TcpListener::bind("127.0.0.1:0").unwrap(); + let port = listener.local_addr().unwrap().port(); + drop(listener); + port +} + +fn spawn_server_process(mut command: StdCommand) -> TestServer { + let port = free_port(); + let bind = format!("127.0.0.1:{}", port); + let mut child = command + .arg("--bind") + .arg(&bind) + .stdout(Stdio::null()) + .stderr(Stdio::null()) + .spawn() + .unwrap(); + let base_url = format!("http://{}", bind); + let client = Client::new(); + for _ in 0..300 { + if client + .get(format!("{}/healthz", base_url)) + .send() + .map(|response| response.status().is_success()) + .unwrap_or(false) + { + return TestServer { child, base_url }; + } + if let Some(status) = child.try_wait().unwrap() { + panic!("server exited before becoming 
healthy: {status}"); + } + sleep(Duration::from_millis(100)); + } + panic!("server did not become healthy"); +} + +pub fn spawn_server(repo: &Path) -> TestServer { + let mut command = server_process(); + command.arg(repo); + spawn_server_process(command) +} + +pub fn spawn_server_with_config(config: &Path) -> TestServer { + let mut command = server_process(); + command.arg("--config").arg(config); + spawn_server_process(command) +} + +pub fn spawn_server_with_config_env(config: &Path, envs: &[(&str, &str)]) -> TestServer { + let mut command = server_process(); + command.arg("--config").arg(config); + for (name, value) in envs { + command.env(name, value); + } + spawn_server_process(command) +} + +pub async fn begin_manual_run(repo: &Path, target_branch: &str) -> String { + let mut db = Omnigraph::open(repo.to_str().unwrap()).await.unwrap(); + let run = db + .begin_run(target_branch, Some("cli-test-run")) + .await + .unwrap(); + db.load( + &run.run_branch, + r#"{"type":"Person","data":{"name":"Eve","age":29}}"#, + LoadMode::Append, + ) + .await + .unwrap(); + run.run_id.as_str().to_string() +} + +pub struct SystemRepo { + _temp: TempDir, + repo: PathBuf, +} + +impl SystemRepo { + pub fn initialized() -> Self { + let temp = tempdir().unwrap(); + let repo = repo_path(temp.path()); + init_repo(&repo); + Self { _temp: temp, repo } + } + + pub fn loaded() -> Self { + let temp = tempdir().unwrap(); + let repo = repo_path(temp.path()); + init_repo(&repo); + load_fixture(&repo); + Self { _temp: temp, repo } + } + + pub fn path(&self) -> &Path { + &self.repo + } + + pub fn write_query(&self, name: &str, source: &str) -> PathBuf { + let path = self.repo.parent().unwrap().join(name); + write_query_file(&path, source); + path + } + + pub fn write_jsonl(&self, name: &str, rows: &str) -> PathBuf { + let path = self.repo.parent().unwrap().join(name); + write_jsonl(&path, rows); + path + } + + pub fn write_config(&self, name: &str, source: &str) -> PathBuf { + let path = 
self.repo.parent().unwrap().join(name); + write_config(&path, source); + path + } + + pub fn spawn_server(&self) -> TestServer { + spawn_server(&self.repo) + } + + pub fn spawn_server_with_config(&self, config: &Path) -> TestServer { + spawn_server_with_config(config) + } + + pub fn spawn_server_with_config_env(&self, config: &Path, envs: &[(&str, &str)]) -> TestServer { + spawn_server_with_config_env(config, envs) + } +} diff --git a/crates/omnigraph-cli/tests/system_local.rs b/crates/omnigraph-cli/tests/system_local.rs new file mode 100644 index 0000000..8be599a --- /dev/null +++ b/crates/omnigraph-cli/tests/system_local.rs @@ -0,0 +1,1162 @@ +mod support; + +use std::env; +use std::fs; +use std::process::Stdio; +use std::thread::sleep; +use std::time::Duration; + +use omnigraph::db::Omnigraph; +use omnigraph::loader::LoadMode; +use reqwest::blocking::Client; +use serde_json::Value; + +use support::*; + +const POLICY_E2E_YAML: &str = r#" +version: 1 +groups: + team: [act-bruno] + admins: [act-ragnor] +protected_branches: [main] +rules: + - id: team-read + allow: + actors: { group: team } + actions: [read] + branch_scope: any + - id: team-write-unprotected + allow: + actors: { group: team } + actions: [change] + branch_scope: unprotected + - id: admins-promote + allow: + actors: { group: admins } + actions: [branch_merge, run_publish] + target_branch_scope: protected +"#; + +const POLICY_E2E_TESTS_YAML: &str = r#" +version: 1 +cases: + - id: deny-main-change + actor: act-bruno + action: change + branch: main + expect: deny + - id: allow-feature-change + actor: act-bruno + action: change + branch: feature + expect: allow +"#; + +fn yaml_string(value: &str) -> String { + format!("'{}'", value.replace('\'', "''")) +} + +fn local_policy_config(repo: &SystemRepo) -> String { + format!( + "\ +project: + name: policy-e2e-local +targets: + local: + uri: {} +cli: + target: local + branch: main +query: + roots: + - . 
+policy: + file: ./policy.yaml +", + yaml_string(&repo.path().to_string_lossy()) + ) +} + +fn insert_person_query(repo: &SystemRepo, name: &str) -> std::path::PathBuf { + repo.write_query( + name, + r#" +query insert_person($name: String, $age: I32) { + insert Person { name: $name, age: $age } +} +"#, + ) +} + +fn add_friend_query(repo: &SystemRepo, name: &str) -> std::path::PathBuf { + repo.write_query( + name, + r#" +query add_friend($from: String, $to: String) { + insert Knows { from: $from, to: $to } +} +"#, + ) +} + +fn snapshot_table_row_count(repo: &SystemRepo, table_key: &str) -> u64 { + snapshot_table_row_count_at(repo.path(), table_key) +} + +fn snapshot_table_row_count_at(repo: &std::path::Path, table_key: &str) -> u64 { + let payload = parse_stdout_json(&output_success( + cli().arg("snapshot").arg(repo).arg("--json"), + )); + payload["tables"] + .as_array() + .unwrap() + .iter() + .find(|table| table["table_key"] == table_key) + .unwrap()["row_count"] + .as_u64() + .unwrap() +} + +fn wait_for_running_run(repo: &SystemRepo) -> String { + let runtime = tokio::runtime::Runtime::new().unwrap(); + for _ in 0..200 { + let running = runtime.block_on(async { + let db = Omnigraph::open(repo.path().to_str().unwrap()) + .await + .unwrap(); + db.list_runs() + .await + .unwrap() + .into_iter() + .find(|run| run.target_branch == "main" && run.status.as_str() == "running") + .map(|run| run.run_id.to_string()) + }); + if let Some(run_id) = running { + return run_id; + } + sleep(Duration::from_millis(50)); + } + + panic!("timed out waiting for running run"); +} + +fn bulk_people_jsonl(count: usize) -> String { + let mut rows = String::new(); + for index in 0..count { + rows.push_str(&format!( + r#"{{"type":"Person","data":{{"name":"Bulk{:05}","age":{}}}}}"#, + index, + 20 + (index % 50) + )); + rows.push('\n'); + } + rows +} + +fn gemini_base_url() -> String { + env::var("OMNIGRAPH_GEMINI_BASE_URL") + .ok() + .filter(|value| !value.trim().is_empty()) + 
.unwrap_or_else(|| "https://generativelanguage.googleapis.com/v1beta".to_string()) +} + +fn embed_text_with_gemini(text: &str, dim: usize) -> Vec { + let api_key = env::var("GEMINI_API_KEY").expect("GEMINI_API_KEY must be set"); + let client = Client::new(); + let response = client + .post(format!( + "{}/models/gemini-embedding-2-preview:embedContent", + gemini_base_url().trim_end_matches('/') + )) + .header("x-goog-api-key", api_key) + .json(&serde_json::json!({ + "model": "models/gemini-embedding-2-preview", + "content": { + "parts": [ + { + "text": text + } + ] + }, + "taskType": "RETRIEVAL_QUERY", + "outputDimensionality": dim, + })) + .send() + .unwrap() + .error_for_status() + .unwrap() + .json::() + .unwrap(); + + response["embedding"]["values"] + .as_array() + .unwrap() + .iter() + .map(|value| value.as_f64().unwrap() as f32) + .collect() +} + +fn format_vector(values: &[f32]) -> String { + values + .iter() + .map(|value| format!("{:.8}", value)) + .collect::>() + .join(", ") +} + +fn s3_test_repo_uri(suite: &str) -> Option { + let bucket = env::var("OMNIGRAPH_S3_TEST_BUCKET").ok()?; + let prefix = env::var("OMNIGRAPH_S3_TEST_PREFIX") + .ok() + .filter(|value| !value.trim().is_empty()) + .unwrap_or_else(|| "omnigraph-itests".to_string()); + let unique = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .ok()? 
+ .as_nanos(); + Some(format!("s3://{}/{}/{}/{}", bucket, prefix, suite, unique)) +} + +#[test] +fn local_cli_end_to_end_init_load_read_change_read_flow() { + let repo = SystemRepo::initialized(); + let mutation_file = insert_person_query(&repo, "system-local-init-change.gq"); + + output_success( + cli() + .arg("load") + .arg("--data") + .arg(fixture("test.jsonl")) + .arg(repo.path()), + ); + + let read_before = parse_stdout_json(&output_success( + cli() + .arg("read") + .arg(repo.path()) + .arg("--query") + .arg(fixture("test.gq")) + .arg("--name") + .arg("get_person") + .arg("--params") + .arg(r#"{"name":"Alice"}"#) + .arg("--json"), + )); + assert_eq!(read_before["row_count"], 1); + assert_eq!(read_before["rows"][0]["p.name"], "Alice"); + + let change_payload = parse_stdout_json(&output_success( + cli() + .arg("change") + .arg(repo.path()) + .arg("--query") + .arg(&mutation_file) + .arg("--params") + .arg(r#"{"name":"Eve","age":29}"#) + .arg("--json"), + )); + assert_eq!(change_payload["branch"], "main"); + assert_eq!(change_payload["affected_nodes"], 1); + + let read_after = parse_stdout_json(&output_success( + cli() + .arg("read") + .arg(repo.path()) + .arg("--query") + .arg(fixture("test.gq")) + .arg("--name") + .arg("get_person") + .arg("--params") + .arg(r#"{"name":"Eve"}"#) + .arg("--json"), + )); + assert_eq!(read_after["row_count"], 1); + assert_eq!(read_after["rows"][0]["p.name"], "Eve"); +} + +#[test] +fn local_cli_end_to_end_branch_change_merge_flow() { + let repo = SystemRepo::loaded(); + let mutation_file = insert_person_query(&repo, "system-local-change.gq"); + + output_success( + cli() + .arg("branch") + .arg("create") + .arg("--uri") + .arg(repo.path()) + .arg("--from") + .arg("main") + .arg("feature"), + ); + + let change_payload = parse_stdout_json(&output_success( + cli() + .arg("change") + .arg(repo.path()) + .arg("--query") + .arg(&mutation_file) + .arg("--branch") + .arg("feature") + .arg("--params") + .arg(r#"{"name":"Zoe","age":33}"#) + 
.arg("--json"), + )); + assert_eq!(change_payload["branch"], "feature"); + assert_eq!(change_payload["affected_nodes"], 1); + + let feature_read = parse_stdout_json(&output_success( + cli() + .arg("read") + .arg(repo.path()) + .arg("--query") + .arg(fixture("test.gq")) + .arg("--name") + .arg("get_person") + .arg("--branch") + .arg("feature") + .arg("--params") + .arg(r#"{"name":"Zoe"}"#) + .arg("--json"), + )); + assert_eq!(feature_read["row_count"], 1); + assert_eq!(feature_read["rows"][0]["p.name"], "Zoe"); + + let merge_payload = parse_stdout_json(&output_success( + cli() + .arg("branch") + .arg("merge") + .arg("--uri") + .arg(repo.path()) + .arg("feature") + .arg("--json"), + )); + assert_eq!(merge_payload["target"], "main"); + + let main_read = parse_stdout_json(&output_success( + cli() + .arg("read") + .arg(repo.path()) + .arg("--query") + .arg(fixture("test.gq")) + .arg("--name") + .arg("get_person") + .arg("--params") + .arg(r#"{"name":"Zoe"}"#) + .arg("--json"), + )); + assert_eq!(main_read["row_count"], 1); + assert_eq!(main_read["rows"][0]["p.name"], "Zoe"); + + let runs_payload = parse_stdout_json(&output_success( + cli().arg("run").arg("list").arg(repo.path()).arg("--json"), + )); + let runs = runs_payload["runs"].as_array().unwrap(); + assert!(runs.len() >= 2); + assert!( + runs.iter() + .any(|run| run["target_branch"] == "feature" && run["status"] == "published") + ); +} + +#[test] +fn local_cli_ingest_creates_review_branch_and_keeps_it_readable() { + let repo = SystemRepo::loaded(); + let ingest_data = repo.write_jsonl( + "system-local-ingest.jsonl", + r#"{"type":"Person","data":{"name":"Zoe","age":33}} +{"type":"Person","data":{"name":"Bob","age":26}}"#, + ); + + let ingest_payload = parse_stdout_json(&output_success( + cli() + .arg("ingest") + .arg("--data") + .arg(&ingest_data) + .arg("--branch") + .arg("feature-ingest") + .arg(repo.path()) + .arg("--json"), + )); + assert_eq!(ingest_payload["branch"], "feature-ingest"); + 
assert_eq!(ingest_payload["base_branch"], "main"); + assert_eq!(ingest_payload["branch_created"], true); + assert_eq!(ingest_payload["mode"], "merge"); + assert_eq!(ingest_payload["tables"][0]["table_key"], "node:Person"); + assert_eq!(ingest_payload["tables"][0]["rows_loaded"], 2); + + let feature_snapshot = parse_stdout_json(&output_success( + cli() + .arg("snapshot") + .arg(repo.path()) + .arg("--branch") + .arg("feature-ingest") + .arg("--json"), + )); + assert_eq!(feature_snapshot["branch"], "feature-ingest"); + + let zoe = parse_stdout_json(&output_success( + cli() + .arg("read") + .arg(repo.path()) + .arg("--query") + .arg(fixture("test.gq")) + .arg("--name") + .arg("get_person") + .arg("--branch") + .arg("feature-ingest") + .arg("--params") + .arg(r#"{"name":"Zoe"}"#) + .arg("--json"), + )); + assert_eq!(zoe["row_count"], 1); + assert_eq!(zoe["rows"][0]["p.name"], "Zoe"); + + let bob = parse_stdout_json(&output_success( + cli() + .arg("read") + .arg(repo.path()) + .arg("--query") + .arg(fixture("test.gq")) + .arg("--name") + .arg("get_person") + .arg("--branch") + .arg("feature-ingest") + .arg("--params") + .arg(r#"{"name":"Bob"}"#) + .arg("--json"), + )); + assert_eq!(bob["row_count"], 1); + assert_eq!(bob["rows"][0]["p.age"], 26); +} + +#[test] +fn local_cli_export_round_trips_full_branch_graph() { + let repo = SystemRepo::loaded(); + + output_success( + cli() + .arg("branch") + .arg("create") + .arg("--uri") + .arg(repo.path()) + .arg("--from") + .arg("main") + .arg("feature"), + ); + + let feature_data = repo.write_jsonl( + "system-local-export-feature.jsonl", + r#"{"type":"Person","data":{"name":"Eve","age":29}} +{"edge":"Knows","from":"Alice","to":"Eve"}"#, + ); + output_success( + cli() + .arg("load") + .arg("--data") + .arg(&feature_data) + .arg("--branch") + .arg("feature") + .arg("--mode") + .arg("append") + .arg(repo.path()), + ); + + let exported = stdout_string(&output_success( + cli() + .arg("export") + .arg(repo.path()) + .arg("--branch") + 
.arg("feature") + .arg("--jsonl"), + )); + let export_path = repo.write_jsonl("system-local-exported.jsonl", &exported); + let imported_repo = repo.path().parent().unwrap().join("imported-export.omni"); + + output_success( + cli() + .arg("init") + .arg("--schema") + .arg(fixture("test.pg")) + .arg(&imported_repo), + ); + output_success( + cli() + .arg("load") + .arg("--data") + .arg(&export_path) + .arg(&imported_repo), + ); + + assert_eq!( + snapshot_table_row_count_at(&imported_repo, "node:Person"), + 5 + ); + assert_eq!( + snapshot_table_row_count_at(&imported_repo, "node:Company"), + 2 + ); + assert_eq!(snapshot_table_row_count_at(&imported_repo, "edge:Knows"), 4); + assert_eq!( + snapshot_table_row_count_at(&imported_repo, "edge:WorksAt"), + 2 + ); + + let eve = parse_stdout_json(&output_success( + cli() + .arg("read") + .arg(&imported_repo) + .arg("--query") + .arg(fixture("test.gq")) + .arg("--name") + .arg("get_person") + .arg("--params") + .arg(r#"{"name":"Eve"}"#) + .arg("--json"), + )); + assert_eq!(eve["row_count"], 1); + assert_eq!(eve["rows"][0]["p.name"], "Eve"); + + let friends = parse_stdout_json(&output_success( + cli() + .arg("read") + .arg(&imported_repo) + .arg("--query") + .arg(fixture("test.gq")) + .arg("--name") + .arg("friends_of") + .arg("--params") + .arg(r#"{"name":"Alice"}"#) + .arg("--json"), + )); + assert_eq!(friends["row_count"], 3); +} + +#[test] +fn local_cli_s3_end_to_end_init_load_read_flow() { + let Some(repo_uri) = s3_test_repo_uri("cli-local") else { + eprintln!("skipping s3 cli test: OMNIGRAPH_S3_TEST_BUCKET is not set"); + return; + }; + + let temp = tempfile::tempdir().unwrap(); + let query_root = temp.path(); + let config = query_root.join("omnigraph.yaml"); + let query = query_root.join("test.gq"); + fs::copy(fixture("test.gq"), &query).unwrap(); + write_config( + &config, + &format!( + "\ +targets: + rustfs: + uri: '{}' +cli: + target: rustfs + branch: main +query: + roots: + - . 
+policy: {{}} +", + repo_uri + ), + ); + + output_success( + cli() + .arg("init") + .arg("--schema") + .arg(fixture("test.pg")) + .arg(&repo_uri), + ); + output_success( + cli() + .arg("load") + .arg("--data") + .arg(fixture("test.jsonl")) + .arg(&repo_uri), + ); + + let read = parse_stdout_json(&output_success( + cli() + .current_dir(query_root) + .arg("read") + .arg("--config") + .arg(&config) + .arg("--query") + .arg("test.gq") + .arg("--name") + .arg("get_person") + .arg("--params") + .arg(r#"{"name":"Alice"}"#) + .arg("--json"), + )); + assert_eq!(read["row_count"], 1); + assert_eq!(read["rows"][0]["p.name"], "Alice"); + + let snapshot = parse_stdout_json(&output_success( + cli() + .current_dir(query_root) + .arg("snapshot") + .arg("--config") + .arg(&config) + .arg("--json"), + )); + assert!(snapshot["tables"].is_array()); +} + +#[test] +fn local_cli_failed_load_keeps_target_state_unchanged() { + let repo = SystemRepo::loaded(); + let bad_data = repo.write_jsonl( + "system-bad-load.jsonl", + r#"{"edge":"Knows","from":"Alice","to":"Missing"}"#, + ); + let person_rows_before = snapshot_table_row_count(&repo, "node:Person"); + let knows_rows_before = snapshot_table_row_count(&repo, "edge:Knows"); + + let output = output_failure( + cli() + .arg("load") + .arg("--data") + .arg(&bad_data) + .arg("--mode") + .arg("append") + .arg(repo.path()), + ); + let stderr = String::from_utf8(output.stderr).unwrap(); + assert!(stderr.contains("not found") || stderr.contains("Missing")); + + assert_eq!( + snapshot_table_row_count(&repo, "node:Person"), + person_rows_before + ); + assert_eq!( + snapshot_table_row_count(&repo, "edge:Knows"), + knows_rows_before + ); + + let runs_payload = parse_stdout_json(&output_success( + cli().arg("run").arg("list").arg(repo.path()).arg("--json"), + )); + assert!( + runs_payload["runs"] + .as_array() + .unwrap() + .iter() + .any(|run| run["target_branch"] == "main" && run["status"] == "failed") + ); +} + +#[test] +fn 
local_cli_failed_change_keeps_target_state_unchanged() { + let repo = SystemRepo::loaded(); + let mutation_file = add_friend_query(&repo, "system-invalid-change.gq"); + + let output = output_failure( + cli() + .arg("change") + .arg(repo.path()) + .arg("--query") + .arg(&mutation_file) + .arg("--params") + .arg(r#"{"from":"Alice","to":"Missing"}"#), + ); + let stderr = String::from_utf8(output.stderr).unwrap(); + assert!(stderr.contains("not found") || stderr.contains("Missing")); + + let friends_payload = parse_stdout_json(&output_success( + cli() + .arg("read") + .arg(repo.path()) + .arg("--query") + .arg(fixture("test.gq")) + .arg("--name") + .arg("friends_of") + .arg("--params") + .arg(r#"{"name":"Alice"}"#) + .arg("--json"), + )); + assert_eq!(friends_payload["row_count"], 2); + + let runs_payload = parse_stdout_json(&output_success( + cli().arg("run").arg("list").arg(repo.path()).arg("--json"), + )); + assert!( + runs_payload["runs"] + .as_array() + .unwrap() + .iter() + .any(|run| run["target_branch"] == "main" && run["status"] == "failed") + ); +} + +#[test] +fn local_cli_resolves_relative_query_against_config_base_dir() { + let repo = SystemRepo::loaded(); + let root = repo.path().parent().unwrap(); + let config_dir = root.join("config"); + let query_dir = config_dir.join("queries"); + let ambient_dir = root.join("ambient"); + fs::create_dir_all(&query_dir).unwrap(); + fs::create_dir_all(&ambient_dir).unwrap(); + + let config = config_dir.join("omnigraph.yaml"); + write_config( + &config, + &format!( + "\ +targets: + local: + uri: '{}' +cli: + target: local + branch: main +query: + roots: + - queries +policy: {{}} +", + repo.path().display() + ), + ); + write_query_file( + &query_dir.join("local.gq"), + r#" +query get_person($name: String) { + match { + $p: Person { name: $name } + } + return { $p.age, $p.name } +} +"#, + ); + write_query_file( + &ambient_dir.join("local.gq"), + r#" +query get_person($name: String) { + match { + $p: Person { name: $name } + 
} + return { $p.name } +} +"#, + ); + + let payload = parse_stdout_json(&output_success( + cli() + .current_dir(&ambient_dir) + .arg("read") + .arg("--config") + .arg(&config) + .arg("--query") + .arg("local.gq") + .arg("--name") + .arg("get_person") + .arg("--params") + .arg(r#"{"name":"Alice"}"#) + .arg("--json"), + )); + let columns = payload["columns"] + .as_array() + .unwrap() + .iter() + .map(|value| value.as_str().unwrap()) + .collect::>(); + assert_eq!(columns, vec!["p.age", "p.name"]); + assert_eq!(payload["rows"][0]["p.age"], 30); + assert_eq!(payload["rows"][0]["p.name"], "Alice"); +} + +#[test] +fn local_cli_datetime_and_list_types_round_trip_through_load_read_and_change() { + let temp = tempfile::tempdir().unwrap(); + let repo = repo_path(temp.path()); + let schema = temp.path().join("datatypes.pg"); + let data = temp.path().join("datatypes.jsonl"); + let queries = temp.path().join("datatypes.gq"); + + write_query_file( + &schema, + r#" +node Task { + slug: String @key + title: String + due_at: DateTime + tags: [String] + scores: [I32]? + active_days: [Date]? 
+} +"#, + ); + write_jsonl( + &data, + r#"{"type":"Task","data":{"slug":"alpha","title":"Launch prep","due_at":"2026-04-01T08:30:00Z","tags":["launch","priority"],"scores":[1,2],"active_days":["2026-03-30","2026-03-31"]}} +{"type":"Task","data":{"slug":"beta","title":"Archive","due_at":"2026-05-01T12:00:00Z","tags":["backlog"],"scores":[5],"active_days":["2026-04-01"]}}"#, + ); + write_query_file( + &queries, + r#" +query due_with_tag($deadline: DateTime, $tag: String) { + match { + $t: Task + $t.due_at <= $deadline + $t.tags contains $tag + } + return { $t.slug, $t.due_at, $t.tags, $t.scores, $t.active_days } +} + +query insert_task( + $slug: String, + $title: String, + $due_at: DateTime, + $tags: [String], + $scores: [I32], + $active_days: [Date] +) { + insert Task { + slug: $slug, + title: $title, + due_at: $due_at, + tags: $tags, + scores: $scores, + active_days: $active_days + } +} + +query update_task( + $slug: String, + $due_at: DateTime, + $tags: [String], + $scores: [I32], + $active_days: [Date] +) { + update Task set { + due_at: $due_at, + tags: $tags, + scores: $scores, + active_days: $active_days + } where slug = $slug +} + +query get_task($slug: String) { + match { $t: Task { slug: $slug } } + return { $t.slug, $t.due_at, $t.tags, $t.scores, $t.active_days } +} +"#, + ); + + output_success(cli().arg("init").arg("--schema").arg(&schema).arg(&repo)); + output_success(cli().arg("load").arg("--data").arg(&data).arg(&repo)); + + let filtered = parse_stdout_json(&output_success( + cli() + .arg("read") + .arg(&repo) + .arg("--query") + .arg(&queries) + .arg("--name") + .arg("due_with_tag") + .arg("--params") + .arg(r#"{"deadline":"2026-04-02T00:00:00Z","tag":"launch"}"#) + .arg("--json"), + )); + assert_eq!(filtered["row_count"], 1); + assert_eq!(filtered["rows"][0]["t.slug"], "alpha"); + assert_eq!(filtered["rows"][0]["t.due_at"], "2026-04-01T08:30:00.000Z"); + assert_eq!( + filtered["rows"][0]["t.tags"], + serde_json::json!(["launch", "priority"]) + ); + 
assert_eq!(filtered["rows"][0]["t.scores"], serde_json::json!([1, 2])); + assert_eq!( + filtered["rows"][0]["t.active_days"], + serde_json::json!(["2026-03-30", "2026-03-31"]) + ); + + let insert_payload = parse_stdout_json(&output_success( + cli() + .arg("change") + .arg(&repo) + .arg("--query") + .arg(&queries) + .arg("--name") + .arg("insert_task") + .arg("--params") + .arg( + r#"{"slug":"gamma","title":"Embed prep","due_at":"2026-04-03T09:15:00Z","tags":["embed","launch"],"scores":[3,8],"active_days":["2026-04-02","2026-04-03"]}"#, + ) + .arg("--json"), + )); + assert_eq!(insert_payload["affected_nodes"], 1); + + let update_payload = parse_stdout_json(&output_success( + cli() + .arg("change") + .arg(&repo) + .arg("--query") + .arg(&queries) + .arg("--name") + .arg("update_task") + .arg("--params") + .arg(r#"{"slug":"gamma","due_at":"2026-04-04T10:45:00Z","tags":["embed","released"],"scores":[13,21],"active_days":["2026-04-04","2026-04-05"]}"#) + .arg("--json"), + )); + assert_eq!(update_payload["affected_nodes"], 1); + + let gamma = parse_stdout_json(&output_success( + cli() + .arg("read") + .arg(&repo) + .arg("--query") + .arg(&queries) + .arg("--name") + .arg("get_task") + .arg("--params") + .arg(r#"{"slug":"gamma"}"#) + .arg("--json"), + )); + assert_eq!(gamma["row_count"], 1); + assert_eq!(gamma["rows"][0]["t.slug"], "gamma"); + assert_eq!(gamma["rows"][0]["t.due_at"], "2026-04-04T10:45:00.000Z"); + assert_eq!( + gamma["rows"][0]["t.tags"], + serde_json::json!(["embed", "released"]) + ); + assert_eq!(gamma["rows"][0]["t.scores"], serde_json::json!([13, 21])); + assert_eq!( + gamma["rows"][0]["t.active_days"], + serde_json::json!(["2026-04-04", "2026-04-05"]) + ); +} + +#[test] +#[ignore = "requires GEMINI_API_KEY and network access"] +fn local_cli_real_gemini_string_nearest_query_returns_expected_match() { + let temp = tempfile::tempdir().unwrap(); + let repo = repo_path(temp.path()); + let schema = temp.path().join("gemini.pg"); + let data = 
temp.path().join("gemini.jsonl"); + let queries = temp.path().join("gemini.gq"); + + write_query_file( + &schema, + r#" +node Doc { + slug: String @key + title: String + embedding: Vector(4) @index +} +"#, + ); + + let alpha = embed_text_with_gemini("alpha", 4); + let beta = embed_text_with_gemini("beta", 4); + let gamma = embed_text_with_gemini("gamma", 4); + write_jsonl( + &data, + &format!( + r#"{{"type":"Doc","data":{{"slug":"alpha-doc","title":"alpha","embedding":[{}]}}}} +{{"type":"Doc","data":{{"slug":"beta-doc","title":"beta","embedding":[{}]}}}} +{{"type":"Doc","data":{{"slug":"gamma-doc","title":"gamma","embedding":[{}]}}}}"#, + format_vector(&alpha), + format_vector(&beta), + format_vector(&gamma), + ), + ); + write_query_file( + &queries, + r#" +query vector_search($q: String) { + match { $d: Doc } + return { $d.slug, $d.title } + order { nearest($d.embedding, $q) } + limit 3 +} +"#, + ); + + output_success(cli().arg("init").arg("--schema").arg(&schema).arg(&repo)); + output_success(cli().arg("load").arg("--data").arg(&data).arg(&repo)); + + let result = parse_stdout_json(&output_success( + cli() + .arg("read") + .arg(&repo) + .arg("--query") + .arg(&queries) + .arg("--name") + .arg("vector_search") + .arg("--params") + .arg(r#"{"q":"alpha"}"#) + .arg("--json"), + )); + + assert_eq!(result["row_count"], 3); + assert_eq!(result["rows"][0]["d.slug"], "alpha-doc"); +} + +#[test] +fn local_cli_transactional_load_drift_fails_without_partial_publish() { + let repo = SystemRepo::loaded(); + let large_data = repo.write_jsonl("system-large-load.jsonl", &bulk_people_jsonl(250_000)); + let person_rows_before = snapshot_table_row_count(&repo, "node:Person"); + + let mut load = cli_process(); + load.arg("load") + .arg("--data") + .arg(&large_data) + .arg("--mode") + .arg("merge") + .arg(repo.path()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()); + let child = load.spawn().unwrap(); + + let run_id = wait_for_running_run(&repo); + + 
tokio::runtime::Runtime::new().unwrap().block_on(async { + let mut db = Omnigraph::open(repo.path().to_str().unwrap()) + .await + .unwrap(); + let interloper = db + .begin_run("main", Some("system-test-interloper")) + .await + .unwrap(); + db.load( + interloper.run_branch.as_str(), + r#"{"type":"Person","data":{"name":"Interloper","age":41}}"#, + LoadMode::Append, + ) + .await + .unwrap(); + db.publish_run(&interloper.run_id).await.unwrap(); + }); + + let output = child.wait_with_output().unwrap(); + assert!( + !output.status.success(), + "load unexpectedly succeeded\nstdout:\n{}\nstderr:\n{}", + String::from_utf8_lossy(&output.stdout), + String::from_utf8_lossy(&output.stderr) + ); + let stderr = String::from_utf8(output.stderr).unwrap(); + assert!( + stderr.contains("advanced during transactional load") + || stderr.contains("version drift") + || stderr.contains("retry"), + "unexpected load failure: {stderr}" + ); + + let run_payload = parse_stdout_json(&output_success( + cli() + .arg("run") + .arg("show") + .arg("--uri") + .arg(repo.path()) + .arg(&run_id) + .arg("--json"), + )); + assert_eq!(run_payload["status"], "failed"); + + assert_eq!( + snapshot_table_row_count(&repo, "node:Person"), + person_rows_before + 1 + ); + + let interloper = parse_stdout_json(&output_success( + cli() + .arg("read") + .arg(repo.path()) + .arg("--query") + .arg(fixture("test.gq")) + .arg("--name") + .arg("get_person") + .arg("--params") + .arg(r#"{"name":"Interloper"}"#) + .arg("--json"), + )); + assert_eq!(interloper["row_count"], 1); + + let bulk_row = parse_stdout_json(&output_success( + cli() + .arg("read") + .arg(repo.path()) + .arg("--query") + .arg(fixture("test.gq")) + .arg("--name") + .arg("get_person") + .arg("--params") + .arg(r#"{"name":"Bulk00000"}"#) + .arg("--json"), + )); + assert_eq!(bulk_row["row_count"], 0); +} + +#[test] +fn local_cli_policy_tooling_is_end_to_end_while_local_writes_stay_unenforced() { + let repo = SystemRepo::loaded(); + let config = 
repo.write_config("omnigraph-policy.yaml", &local_policy_config(&repo)); + repo.write_config("policy.yaml", POLICY_E2E_YAML); + repo.write_config("policy.tests.yaml", POLICY_E2E_TESTS_YAML); + let mutation_file = insert_person_query(&repo, "system-local-policy-change.gq"); + + let validate = output_success( + cli() + .arg("policy") + .arg("validate") + .arg("--config") + .arg(&config), + ); + assert!(stdout_string(&validate).contains("policy valid:")); + + let tests = output_success(cli().arg("policy").arg("test").arg("--config").arg(&config)); + assert!(stdout_string(&tests).contains("policy tests passed: 2 cases")); + + let explain = output_success( + cli() + .arg("policy") + .arg("explain") + .arg("--config") + .arg(&config) + .arg("--actor") + .arg("act-bruno") + .arg("--action") + .arg("change") + .arg("--branch") + .arg("main"), + ); + let explain_stdout = stdout_string(&explain); + assert!(explain_stdout.contains("decision: deny")); + assert!(explain_stdout.contains("branch: main")); + + let local_change = parse_stdout_json(&output_success( + cli() + .arg("change") + .arg("--config") + .arg(&config) + .arg("--query") + .arg(&mutation_file) + .arg("--params") + .arg(r#"{"name":"PolicyLocal","age":44}"#) + .arg("--json"), + )); + assert_eq!(local_change["branch"], "main"); + assert_eq!(local_change["affected_nodes"], 1); + + let verify = parse_stdout_json(&output_success( + cli() + .arg("read") + .arg(repo.path()) + .arg("--query") + .arg(fixture("test.gq")) + .arg("--name") + .arg("get_person") + .arg("--params") + .arg(r#"{"name":"PolicyLocal"}"#) + .arg("--json"), + )); + assert_eq!(verify["row_count"], 1); + assert_eq!(verify["rows"][0]["p.name"], "PolicyLocal"); +} diff --git a/crates/omnigraph-cli/tests/system_remote.rs b/crates/omnigraph-cli/tests/system_remote.rs new file mode 100644 index 0000000..dc7af37 --- /dev/null +++ b/crates/omnigraph-cli/tests/system_remote.rs @@ -0,0 +1,810 @@ +mod support; + +use std::fs; + +use reqwest::blocking::Client; 
+use serde_json::json; + +use support::*; + +const REMOTE_POLICY_E2E_YAML: &str = r#" +version: 1 +groups: + team: [act-bruno] + admins: [act-ragnor] +protected_branches: [main] +rules: + - id: team-read + allow: + actors: { group: team } + actions: [read] + branch_scope: any + - id: team-branch-create + allow: + actors: { group: team } + actions: [branch_create] + target_branch_scope: unprotected + - id: team-write-unprotected + allow: + actors: { group: team } + actions: [change] + branch_scope: unprotected + - id: admins-promote + allow: + actors: { group: admins } + actions: [branch_merge, run_publish] + target_branch_scope: protected +"#; + +fn yaml_string(value: &str) -> String { + format!("'{}'", value.replace('\'', "''")) +} + +fn remote_policy_server_config(repo: &SystemRepo) -> String { + format!( + "\ +project: + name: remote-policy-e2e +targets: + local: + uri: {} +server: + target: local +policy: + file: ./policy.yaml +", + yaml_string(&repo.path().to_string_lossy()) + ) +} + +fn remote_policy_client_config(url: &str) -> String { + format!( + "\ +targets: + dev: + uri: {} + bearer_token_env: POLICY_TEST_TOKEN +cli: + target: dev + branch: main +query: + roots: + - . 
+auth: + env_file: ./.env.omni +", + yaml_string(url) + ) +} + +#[test] +#[ignore = "requires loopback socket permissions in sandboxed runners"] +fn remote_server_and_cli_end_to_end_flow() { + let repo = SystemRepo::loaded(); + let server = repo.spawn_server(); + let config = repo.write_config("omnigraph.yaml", &remote_yaml_config(&server.base_url)); + let mutation_file = repo.write_query( + "system-remote-change.gq", + r#" +query insert_person($name: String, $age: I32) { + insert Person { name: $name, age: $age } +} +"#, + ); + let client = Client::new(); + + let health = client + .get(format!("{}/healthz", server.base_url)) + .send() + .unwrap() + .error_for_status() + .unwrap() + .json::() + .unwrap(); + assert_eq!(health["status"], "ok"); + + let local_snapshot = parse_stdout_json(&output_success( + cli().arg("snapshot").arg(repo.path()).arg("--json"), + )); + let snapshot = parse_stdout_json(&output_success( + cli() + .arg("snapshot") + .arg("--config") + .arg(&config) + .arg("--json"), + )); + assert_eq!(snapshot["branch"], "main"); + assert_eq!(snapshot["tables"], local_snapshot["tables"]); + + let local_read = parse_stdout_json(&output_success( + cli() + .arg("read") + .arg(repo.path()) + .arg("--query") + .arg(fixture("test.gq")) + .arg("--name") + .arg("get_person") + .arg("--params") + .arg(r#"{"name":"Alice"}"#) + .arg("--json"), + )); + let read_payload = parse_stdout_json(&output_success( + cli() + .arg("read") + .arg("--config") + .arg(&config) + .arg("--query") + .arg(fixture("test.gq")) + .arg("--name") + .arg("get_person") + .arg("--params") + .arg(r#"{"name":"Alice"}"#) + .arg("--json"), + )); + assert_eq!(read_payload, local_read); + assert_eq!(read_payload["row_count"], 1); + assert_eq!(read_payload["rows"][0]["p.name"], "Alice"); + + let change_payload = parse_stdout_json(&output_success( + cli() + .arg("change") + .arg("--config") + .arg(&config) + .arg("--query") + .arg(&mutation_file) + .arg("--params") + .arg(r#"{"name":"Mina","age":28}"#) 
+ .arg("--json"), + )); + assert_eq!(change_payload["affected_nodes"], 1); + + let query_source = fs::read_to_string(fixture("test.gq")).unwrap(); + let http_read = client + .post(format!("{}/read", server.base_url)) + .json(&json!({ + "branch": "main", + "query_source": query_source, + "query_name": "get_person", + "params": { "name": "Mina" } + })) + .send() + .unwrap() + .error_for_status() + .unwrap() + .json::() + .unwrap(); + assert_eq!(http_read["row_count"], 1); + assert_eq!(http_read["rows"][0]["p.name"], "Mina"); + + let local_verify = parse_stdout_json(&output_success( + cli() + .arg("read") + .arg(repo.path()) + .arg("--query") + .arg(fixture("test.gq")) + .arg("--name") + .arg("get_person") + .arg("--params") + .arg(r#"{"name":"Mina"}"#) + .arg("--json"), + )); + assert_eq!(local_verify["row_count"], 1); + assert_eq!(local_verify["rows"][0]["p.name"], "Mina"); + + let manual_run = tokio::runtime::Runtime::new() + .unwrap() + .block_on(begin_manual_run(repo.path(), "main")); + let publish_payload = parse_stdout_json(&output_success( + cli() + .arg("run") + .arg("publish") + .arg("--config") + .arg(&config) + .arg(&manual_run) + .arg("--json"), + )); + assert_eq!(publish_payload["run_id"], manual_run); + assert_eq!(publish_payload["status"], "published"); + + let runs_payload = parse_stdout_json(&output_success( + cli() + .arg("run") + .arg("list") + .arg("--config") + .arg(&config) + .arg("--json"), + )); + assert!(runs_payload["runs"].as_array().unwrap().len() >= 2); +} + +#[test] +#[ignore = "requires loopback socket permissions in sandboxed runners"] +fn remote_read_preserves_projection_order_in_json_and_csv() { + let repo = SystemRepo::loaded(); + let server = repo.spawn_server(); + let config = repo.write_config("omnigraph.yaml", &remote_yaml_config(&server.base_url)); + let ordered_query = repo.write_query( + "ordered-remote.gq", + r#" +query ordered_person($name: String) { + match { + $p: Person { name: $name } + } + return { $p.age, $p.name } +} 
+"#, + ); + + let json_payload = parse_stdout_json(&output_success( + cli() + .arg("read") + .arg("--config") + .arg(&config) + .arg("--query") + .arg(&ordered_query) + .arg("--name") + .arg("ordered_person") + .arg("--params") + .arg(r#"{"name":"Alice"}"#) + .arg("--json"), + )); + let columns = json_payload["columns"] + .as_array() + .unwrap() + .iter() + .map(|value| value.as_str().unwrap()) + .collect::>(); + assert_eq!(columns, vec!["p.age", "p.name"]); + + let csv = stdout_string(&output_success( + cli() + .arg("read") + .arg("--config") + .arg(&config) + .arg("--query") + .arg(&ordered_query) + .arg("--name") + .arg("ordered_person") + .arg("--params") + .arg(r#"{"name":"Alice"}"#) + .arg("--format") + .arg("csv"), + )); + let mut lines = csv.lines(); + assert_eq!(lines.next().unwrap(), "p.age,p.name"); + assert_eq!(lines.next().unwrap(), "30,Alice"); +} + +#[test] +#[ignore = "requires loopback socket permissions in sandboxed runners"] +fn remote_branch_create_list_merge_flow() { + let repo = SystemRepo::loaded(); + let server = repo.spawn_server(); + let config = repo.write_config("omnigraph.yaml", &remote_yaml_config(&server.base_url)); + let mutation_file = repo.write_query( + "system-remote-branch-change.gq", + r#" +query insert_person($name: String, $age: I32) { + insert Person { name: $name, age: $age } +} +"#, + ); + + let initial = parse_stdout_json(&output_success( + cli() + .arg("branch") + .arg("list") + .arg("--config") + .arg(&config) + .arg("--json"), + )); + assert_eq!(initial["branches"], json!(["main"])); + + let created = parse_stdout_json(&output_success( + cli() + .arg("branch") + .arg("create") + .arg("--config") + .arg(&config) + .arg("--from") + .arg("main") + .arg("feature") + .arg("--json"), + )); + assert_eq!(created["from"], "main"); + assert_eq!(created["name"], "feature"); + + let listed = parse_stdout_json(&output_success( + cli() + .arg("branch") + .arg("list") + .arg("--config") + .arg(&config) + .arg("--json"), + )); + 
assert_eq!(listed["branches"], json!(["feature", "main"])); + + let changed = parse_stdout_json(&output_success( + cli() + .arg("change") + .arg("--config") + .arg(&config) + .arg("--query") + .arg(&mutation_file) + .arg("--branch") + .arg("feature") + .arg("--params") + .arg(r#"{"name":"Zoe","age":33}"#) + .arg("--json"), + )); + assert_eq!(changed["branch"], "feature"); + assert_eq!(changed["affected_nodes"], 1); + + let merged = parse_stdout_json(&output_success( + cli() + .arg("branch") + .arg("merge") + .arg("--config") + .arg(&config) + .arg("feature") + .arg("--into") + .arg("main") + .arg("--json"), + )); + assert_eq!(merged["source"], "feature"); + assert_eq!(merged["target"], "main"); + assert_eq!(merged["outcome"], "fast_forward"); + + let verify = parse_stdout_json(&output_success( + cli() + .arg("read") + .arg("--config") + .arg(&config) + .arg("--query") + .arg(fixture("test.gq")) + .arg("--name") + .arg("get_person") + .arg("--params") + .arg(r#"{"name":"Zoe"}"#) + .arg("--json"), + )); + assert_eq!(verify["row_count"], 1); + assert_eq!(verify["rows"][0]["p.name"], "Zoe"); +} + +#[test] +#[ignore = "requires loopback socket permissions in sandboxed runners"] +fn remote_branch_delete_removes_branch() { + let repo = SystemRepo::loaded(); + let server = repo.spawn_server(); + let config = repo.write_config("omnigraph.yaml", &remote_yaml_config(&server.base_url)); + + parse_stdout_json(&output_success( + cli() + .arg("branch") + .arg("create") + .arg("--config") + .arg(&config) + .arg("--from") + .arg("main") + .arg("feature") + .arg("--json"), + )); + + let deleted = parse_stdout_json(&output_success( + cli() + .arg("branch") + .arg("delete") + .arg("--config") + .arg(&config) + .arg("feature") + .arg("--json"), + )); + assert_eq!(deleted["name"], "feature"); + + let listed = parse_stdout_json(&output_success( + cli() + .arg("branch") + .arg("list") + .arg("--config") + .arg(&config) + .arg("--json"), + )); + assert_eq!(listed["branches"], 
json!(["main"])); +} + +#[test] +#[ignore = "requires loopback socket permissions in sandboxed runners"] +fn remote_export_round_trips_full_branch_graph() { + let repo = SystemRepo::loaded(); + let server = repo.spawn_server(); + let config = repo.write_config("omnigraph.yaml", &remote_yaml_config(&server.base_url)); + let mutation_file = repo.write_query( + "system-remote-export-change.gq", + r#" +query insert_person($name: String, $age: I32) { + insert Person { name: $name, age: $age } +} + +query add_friend($from: String, $to: String) { + insert Knows { from: $from, to: $to } +} +"#, + ); + + output_success( + cli() + .arg("branch") + .arg("create") + .arg("--config") + .arg(&config) + .arg("--from") + .arg("main") + .arg("feature"), + ); + + output_success( + cli() + .arg("change") + .arg("--config") + .arg(&config) + .arg("--query") + .arg(&mutation_file) + .arg("--name") + .arg("insert_person") + .arg("--branch") + .arg("feature") + .arg("--params") + .arg(r#"{"name":"Eve","age":29}"#) + .arg("--json"), + ); + output_success( + cli() + .arg("change") + .arg("--config") + .arg(&config) + .arg("--query") + .arg(&mutation_file) + .arg("--name") + .arg("add_friend") + .arg("--branch") + .arg("feature") + .arg("--params") + .arg(r#"{"from":"Alice","to":"Eve"}"#) + .arg("--json"), + ); + + let exported = stdout_string(&output_success( + cli() + .arg("export") + .arg("--config") + .arg(&config) + .arg("--branch") + .arg("feature") + .arg("--jsonl"), + )); + let export_path = repo.write_jsonl("system-remote-exported.jsonl", &exported); + let imported_repo = repo + .path() + .parent() + .unwrap() + .join("imported-remote-export.omni"); + + output_success( + cli() + .arg("init") + .arg("--schema") + .arg(fixture("test.pg")) + .arg(&imported_repo), + ); + output_success( + cli() + .arg("load") + .arg("--data") + .arg(&export_path) + .arg(&imported_repo), + ); + + let snapshot = parse_stdout_json(&output_success( + cli().arg("snapshot").arg(&imported_repo).arg("--json"), 
+ )); + assert_eq!( + snapshot["tables"] + .as_array() + .unwrap() + .iter() + .find(|table| table["table_key"] == "node:Person") + .unwrap()["row_count"], + 5 + ); + assert_eq!( + snapshot["tables"] + .as_array() + .unwrap() + .iter() + .find(|table| table["table_key"] == "edge:Knows") + .unwrap()["row_count"], + 4 + ); + + let eve = parse_stdout_json(&output_success( + cli() + .arg("read") + .arg(&imported_repo) + .arg("--query") + .arg(fixture("test.gq")) + .arg("--name") + .arg("get_person") + .arg("--params") + .arg(r#"{"name":"Eve"}"#) + .arg("--json"), + )); + assert_eq!(eve["row_count"], 1); + assert_eq!(eve["rows"][0]["p.name"], "Eve"); +} + +#[test] +#[ignore = "requires loopback socket permissions in sandboxed runners"] +fn remote_ingest_creates_review_branch_and_keeps_it_readable() { + let repo = SystemRepo::loaded(); + let server = repo.spawn_server(); + let config = repo.write_config("omnigraph.yaml", &remote_yaml_config(&server.base_url)); + let ingest_data = repo.write_jsonl( + "system-remote-ingest.jsonl", + r#"{"type":"Person","data":{"name":"Zoe","age":33}} +{"type":"Person","data":{"name":"Bob","age":26}}"#, + ); + + let ingest_payload = parse_stdout_json(&output_success( + cli() + .arg("ingest") + .arg("--config") + .arg(&config) + .arg("--data") + .arg(&ingest_data) + .arg("--branch") + .arg("feature-ingest") + .arg("--json"), + )); + assert_eq!(ingest_payload["branch"], "feature-ingest"); + assert_eq!(ingest_payload["base_branch"], "main"); + assert_eq!(ingest_payload["branch_created"], true); + assert_eq!(ingest_payload["mode"], "merge"); + assert_eq!(ingest_payload["tables"][0]["table_key"], "node:Person"); + assert_eq!(ingest_payload["tables"][0]["rows_loaded"], 2); + + let feature_snapshot = parse_stdout_json(&output_success( + cli() + .arg("snapshot") + .arg("--config") + .arg(&config) + .arg("--branch") + .arg("feature-ingest") + .arg("--json"), + )); + assert_eq!(feature_snapshot["branch"], "feature-ingest"); + + let zoe = 
parse_stdout_json(&output_success( + cli() + .arg("read") + .arg("--config") + .arg(&config) + .arg("--query") + .arg(fixture("test.gq")) + .arg("--name") + .arg("get_person") + .arg("--branch") + .arg("feature-ingest") + .arg("--params") + .arg(r#"{"name":"Zoe"}"#) + .arg("--json"), + )); + assert_eq!(zoe["row_count"], 1); + assert_eq!(zoe["rows"][0]["p.name"], "Zoe"); +} + +#[test] +#[ignore = "requires loopback socket permissions in sandboxed runners"] +fn remote_ingest_reuses_existing_branch_and_merges_updates() { + let repo = SystemRepo::loaded(); + let server = repo.spawn_server(); + let config = repo.write_config("omnigraph.yaml", &remote_yaml_config(&server.base_url)); + + output_success( + cli() + .arg("branch") + .arg("create") + .arg("--config") + .arg(&config) + .arg("--from") + .arg("main") + .arg("feature-ingest"), + ); + + let ingest_data = repo.write_jsonl( + "system-remote-ingest-merge.jsonl", + r#"{"type":"Person","data":{"name":"Bob","age":26}} +{"type":"Person","data":{"name":"Zoe","age":33}}"#, + ); + + let ingest_payload = parse_stdout_json(&output_success( + cli() + .arg("ingest") + .arg("--config") + .arg(&config) + .arg("--data") + .arg(&ingest_data) + .arg("--branch") + .arg("feature-ingest") + .arg("--from") + .arg("missing-base") + .arg("--json"), + )); + assert_eq!(ingest_payload["branch"], "feature-ingest"); + assert_eq!(ingest_payload["base_branch"], "missing-base"); + assert_eq!(ingest_payload["branch_created"], false); + assert_eq!(ingest_payload["mode"], "merge"); + assert_eq!(ingest_payload["tables"][0]["table_key"], "node:Person"); + assert_eq!(ingest_payload["tables"][0]["rows_loaded"], 2); + + let bob = parse_stdout_json(&output_success( + cli() + .arg("read") + .arg("--config") + .arg(&config) + .arg("--query") + .arg(fixture("test.gq")) + .arg("--name") + .arg("get_person") + .arg("--branch") + .arg("feature-ingest") + .arg("--params") + .arg(r#"{"name":"Bob"}"#) + .arg("--json"), + )); + assert_eq!(bob["row_count"], 1); + 
assert_eq!(bob["rows"][0]["p.age"], 26); + + let zoe = parse_stdout_json(&output_success( + cli() + .arg("read") + .arg("--config") + .arg(&config) + .arg("--query") + .arg(fixture("test.gq")) + .arg("--name") + .arg("get_person") + .arg("--branch") + .arg("feature-ingest") + .arg("--params") + .arg(r#"{"name":"Zoe"}"#) + .arg("--json"), + )); + assert_eq!(zoe["row_count"], 1); + assert_eq!(zoe["rows"][0]["p.name"], "Zoe"); +} + +#[test] +#[ignore = "requires loopback socket permissions in sandboxed runners"] +fn remote_policy_enforces_branch_first_cli_workflow() { + let repo = SystemRepo::loaded(); + let server_config = + repo.write_config("server-policy.yaml", &remote_policy_server_config(&repo)); + repo.write_config("policy.yaml", REMOTE_POLICY_E2E_YAML); + let server = repo.spawn_server_with_config_env( + &server_config, + &[( + "OMNIGRAPH_SERVER_BEARER_TOKENS_JSON", + r#"{"act-bruno":"team-token","act-ragnor":"admin-token"}"#, + )], + ); + let client_config = repo.write_config( + "omnigraph-policy.yaml", + &remote_policy_client_config(&server.base_url), + ); + repo.write_config(".env.omni", "POLICY_TEST_TOKEN=team-token\n"); + let mutation_file = repo.write_query( + "system-remote-policy-change.gq", + r#" +query insert_person($name: String, $age: I32) { + insert Person { name: $name, age: $age } +} +"#, + ); + + let snapshot = parse_stdout_json(&output_success( + cli() + .arg("snapshot") + .arg("--config") + .arg(&client_config) + .arg("--json"), + )); + assert_eq!(snapshot["branch"], "main"); + + let denied_main_change = output_failure( + cli() + .arg("change") + .arg("--config") + .arg(&client_config) + .arg("--query") + .arg(&mutation_file) + .arg("--params") + .arg(r#"{"name":"PolicyRemote","age":41}"#) + .arg("--json"), + ); + let denied_main_stderr = String::from_utf8(denied_main_change.stderr).unwrap(); + assert!(denied_main_stderr.contains("policy denied action 'change' on branch 'main'")); + + let created = parse_stdout_json(&output_success( + cli() + 
.arg("branch") + .arg("create") + .arg("--config") + .arg(&client_config) + .arg("--from") + .arg("main") + .arg("feature") + .arg("--json"), + )); + assert_eq!(created["name"], "feature"); + + let changed = parse_stdout_json(&output_success( + cli() + .arg("change") + .arg("--config") + .arg(&client_config) + .arg("--query") + .arg(&mutation_file) + .arg("--branch") + .arg("feature") + .arg("--params") + .arg(r#"{"name":"PolicyRemote","age":41}"#) + .arg("--json"), + )); + assert_eq!(changed["branch"], "feature"); + assert_eq!(changed["affected_nodes"], 1); + + let denied_merge = output_failure( + cli() + .arg("branch") + .arg("merge") + .arg("--config") + .arg(&client_config) + .arg("feature") + .arg("--into") + .arg("main") + .arg("--json"), + ); + let denied_merge_stderr = String::from_utf8(denied_merge.stderr).unwrap(); + assert!(denied_merge_stderr.contains("policy denied action 'branch_merge'")); + + let merged = parse_stdout_json(&output_success( + cli() + .env("POLICY_TEST_TOKEN", "admin-token") + .arg("branch") + .arg("merge") + .arg("--config") + .arg(&client_config) + .arg("feature") + .arg("--into") + .arg("main") + .arg("--json"), + )); + assert_eq!(merged["target"], "main"); + + let verify = parse_stdout_json(&output_success( + cli() + .arg("read") + .arg("--config") + .arg(&client_config) + .arg("--query") + .arg(fixture("test.gq")) + .arg("--name") + .arg("get_person") + .arg("--params") + .arg(r#"{"name":"PolicyRemote"}"#) + .arg("--json"), + )); + assert_eq!(verify["row_count"], 1); + assert_eq!(verify["rows"][0]["p.name"], "PolicyRemote"); +} diff --git a/crates/omnigraph-compiler/Cargo.toml b/crates/omnigraph-compiler/Cargo.toml new file mode 100644 index 0000000..c94e324 --- /dev/null +++ b/crates/omnigraph-compiler/Cargo.toml @@ -0,0 +1,26 @@ +[package] +name = "omnigraph-compiler" +version = "0.4.0" +edition = "2024" +description = "Schema/query compiler for Omnigraph. Zero Lance dependency." 
+license = "MIT" + +[dependencies] +arrow-array = { workspace = true } +arrow-ipc = { workspace = true } +arrow-schema = { workspace = true } +arrow-select = { workspace = true } +arrow-cast = { workspace = true } +arrow-ord = { workspace = true } +pest = { workspace = true } +pest_derive = { workspace = true } +thiserror = { workspace = true } +serde = { workspace = true } +serde_json = { workspace = true } +reqwest = { workspace = true } +ahash = { workspace = true } +tokio = { workspace = true } +sha2 = { workspace = true } + +[dev-dependencies] +tokio = { workspace = true } diff --git a/crates/omnigraph-compiler/src/catalog/mod.rs b/crates/omnigraph-compiler/src/catalog/mod.rs new file mode 100644 index 0000000..18ba3d9 --- /dev/null +++ b/crates/omnigraph-compiler/src/catalog/mod.rs @@ -0,0 +1,594 @@ +pub mod schema_ir; +pub mod schema_plan; + +use std::collections::{HashMap, HashSet}; +use std::sync::Arc; + +use arrow_schema::{DataType, Field, Schema, SchemaRef}; + +use crate::error::{NanoError, Result}; +use crate::schema::ast::{Cardinality, Constraint, ConstraintBound, SchemaDecl, SchemaFile}; +use crate::types::{PropType, ScalarType}; + +#[derive(Debug, Clone)] +pub struct Catalog { + pub node_types: HashMap, + pub edge_types: HashMap, + /// Maps normalized lowercase edge name -> EdgeType key (e.g. "knows" -> "Knows") + pub edge_name_index: HashMap, + /// Interface declarations (for Phase 2 polymorphic queries) + pub interfaces: HashMap, +} + +#[derive(Debug, Clone)] +pub struct InterfaceType { + pub name: String, + pub properties: HashMap, +} + +#[derive(Debug, Clone)] +pub struct NodeType { + pub name: String, + /// Interface names this type implements + pub implements: Vec, + pub properties: HashMap, + /// Key property names (from `@key` or `@key(name)`). Usually 0 or 1 element. 
+ pub key: Option>, + /// Uniqueness constraints (each entry is a list of column names) + pub unique_constraints: Vec>, + /// Index declarations (each entry is a list of column names) + pub indices: Vec>, + /// Value range constraints + pub range_constraints: Vec, + /// Regex check constraints + pub check_constraints: Vec, + /// Maps @embed target property -> source text property + pub embed_sources: HashMap, + pub blob_properties: HashSet, + pub arrow_schema: SchemaRef, +} + +impl NodeType { + /// Backward-compatible accessor: returns the first (and typically only) key property name. + pub fn key_property(&self) -> Option<&str> { + self.key + .as_ref() + .and_then(|v| v.first()) + .map(|s| s.as_str()) + } +} + +#[derive(Debug, Clone)] +pub struct RangeConstraint { + pub property: String, + pub min: Option, + pub max: Option, +} + +#[derive(Debug, Clone)] +pub enum LiteralValue { + Integer(i64), + Float(f64), +} + +#[derive(Debug, Clone)] +pub struct CheckConstraint { + pub property: String, + pub pattern: String, +} + +#[derive(Debug, Clone)] +pub struct EdgeType { + pub name: String, + pub from_type: String, + pub to_type: String, + pub cardinality: Cardinality, + pub properties: HashMap, + /// Uniqueness constraints on edge columns (e.g. 
`@unique(src, dst)`) + pub unique_constraints: Vec>, + /// Index declarations on edge properties + pub indices: Vec>, + pub blob_properties: HashSet, + pub arrow_schema: SchemaRef, +} + +impl Catalog { + pub fn lookup_edge_by_name(&self, name: &str) -> Option<&EdgeType> { + if let Some(et) = self.edge_types.get(name) { + return Some(et); + } + if let Some(key) = self.edge_name_index.get(&normalize_edge_name(name)) { + return self.edge_types.get(key); + } + None + } +} + +fn normalize_edge_name(name: &str) -> String { + name.to_lowercase() +} + +fn bound_to_literal(b: &ConstraintBound) -> LiteralValue { + match b { + ConstraintBound::Integer(n) => LiteralValue::Integer(*n), + ConstraintBound::Float(f) => LiteralValue::Float(*f), + } +} + +pub fn build_catalog(schema: &SchemaFile) -> Result { + let mut node_types = HashMap::new(); + let mut edge_types = HashMap::new(); + let mut edge_name_index = HashMap::new(); + let mut interfaces = HashMap::new(); + + // Pass 0: collect interfaces + for decl in &schema.declarations { + if let SchemaDecl::Interface(iface) = decl { + let mut properties = HashMap::new(); + for prop in &iface.properties { + properties.insert(prop.name.clone(), prop.prop_type.clone()); + } + interfaces.insert( + iface.name.clone(), + InterfaceType { + name: iface.name.clone(), + properties, + }, + ); + } + } + + // Pass 1: collect node types + for decl in &schema.declarations { + if let SchemaDecl::Node(node) = decl { + if node_types.contains_key(&node.name) { + return Err(NanoError::Catalog(format!( + "duplicate node type: {}", + node.name + ))); + } + + let mut properties = HashMap::new(); + let mut embed_sources = HashMap::new(); + let mut blob_properties = HashSet::new(); + for prop in &node.properties { + properties.insert(prop.name.clone(), prop.prop_type.clone()); + if matches!(prop.prop_type.scalar, ScalarType::Blob) { + blob_properties.insert(prop.name.clone()); + } + // Extract @embed from property annotations (stays as annotation) + if let 
Some(source_prop) = prop + .annotations + .iter() + .find(|ann| ann.name == "embed") + .and_then(|ann| ann.value.clone()) + { + embed_sources.insert(prop.name.clone(), source_prop); + } + } + + // Extract constraints from the typed Constraint enum + let mut key: Option> = None; + let mut unique_constraints = Vec::new(); + let mut indices = Vec::new(); + let mut range_constraints = Vec::new(); + let mut check_constraints = Vec::new(); + + for constraint in &node.constraints { + match constraint { + Constraint::Key(cols) => { + key = Some(cols.clone()); + // @key implies index on key columns + indices.push(cols.clone()); + } + Constraint::Unique(cols) => { + unique_constraints.push(cols.clone()); + } + Constraint::Index(cols) => { + indices.push(cols.clone()); + } + Constraint::Range { property, min, max } => { + range_constraints.push(RangeConstraint { + property: property.clone(), + min: min.as_ref().map(bound_to_literal), + max: max.as_ref().map(bound_to_literal), + }); + } + Constraint::Check { property, pattern } => { + check_constraints.push(CheckConstraint { + property: property.clone(), + pattern: pattern.clone(), + }); + } + } + } + + // Build Arrow schema: id: Utf8 + all properties + let mut fields = vec![Field::new("id", DataType::Utf8, false)]; + for prop in &node.properties { + fields.push(Field::new( + &prop.name, + prop.prop_type.to_arrow(), + prop.prop_type.nullable, + )); + } + let arrow_schema = Arc::new(Schema::new(fields)); + + node_types.insert( + node.name.clone(), + NodeType { + name: node.name.clone(), + implements: node.implements.clone(), + properties, + key, + unique_constraints, + indices, + range_constraints, + check_constraints, + embed_sources, + blob_properties, + arrow_schema, + }, + ); + } + } + + // Pass 2: collect edge types, validate endpoints + for decl in &schema.declarations { + if let SchemaDecl::Edge(edge) = decl { + if edge_types.contains_key(&edge.name) { + return Err(NanoError::Catalog(format!( + "duplicate edge type: {}", 
+ edge.name + ))); + } + if !node_types.contains_key(&edge.from_type) { + return Err(NanoError::Catalog(format!( + "edge {} references unknown source type: {}", + edge.name, edge.from_type + ))); + } + if !node_types.contains_key(&edge.to_type) { + return Err(NanoError::Catalog(format!( + "edge {} references unknown target type: {}", + edge.name, edge.to_type + ))); + } + + let mut properties = HashMap::new(); + let mut blob_properties = HashSet::new(); + let mut fields = vec![ + Field::new("id", DataType::Utf8, false), + Field::new("src", DataType::Utf8, false), + Field::new("dst", DataType::Utf8, false), + ]; + for prop in &edge.properties { + properties.insert(prop.name.clone(), prop.prop_type.clone()); + if matches!(prop.prop_type.scalar, ScalarType::Blob) { + blob_properties.insert(prop.name.clone()); + } + fields.push(Field::new( + &prop.name, + prop.prop_type.to_arrow(), + prop.prop_type.nullable, + )); + } + + // Extract edge constraints + let mut unique_constraints = Vec::new(); + let mut edge_indices = Vec::new(); + for constraint in &edge.constraints { + match constraint { + Constraint::Unique(cols) => unique_constraints.push(cols.clone()), + Constraint::Index(cols) => edge_indices.push(cols.clone()), + _ => {} // Key/Range/Check validated at parse time to not appear on edges + } + } + + let normalized_name = normalize_edge_name(&edge.name); + if let Some(existing) = edge_name_index.get(&normalized_name) + && existing != &edge.name + { + return Err(NanoError::Catalog(format!( + "edge name collision after case folding: '{}' conflicts with '{}'", + edge.name, existing + ))); + } + edge_name_index.insert(normalized_name, edge.name.clone()); + + edge_types.insert( + edge.name.clone(), + EdgeType { + name: edge.name.clone(), + from_type: edge.from_type.clone(), + to_type: edge.to_type.clone(), + cardinality: edge.cardinality.clone(), + properties, + unique_constraints, + indices: edge_indices, + blob_properties, + arrow_schema: Arc::new(Schema::new(fields)), 
+ }, + ); + } + } + + Ok(Catalog { + node_types, + edge_types, + edge_name_index, + interfaces, + }) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::schema::ast::{EdgeDecl, NodeDecl}; + use crate::schema::parser::parse_schema; + use crate::types::PropType; + + fn test_schema() -> &'static str { + r#" +node Person { + name: String + age: I32? +} +node Company { + name: String +} +edge Knows: Person -> Person { + since: Date? +} +edge WorksAt: Person -> Company { + title: String? +} +"# + } + + #[test] + fn test_build_catalog() { + let schema = parse_schema(test_schema()).unwrap(); + let catalog = build_catalog(&schema).unwrap(); + assert_eq!(catalog.node_types.len(), 2); + assert_eq!(catalog.edge_types.len(), 2); + assert!(catalog.node_types.contains_key("Person")); + assert!(catalog.node_types.contains_key("Company")); + } + + #[test] + fn test_edge_lookup() { + let schema = parse_schema(test_schema()).unwrap(); + let catalog = build_catalog(&schema).unwrap(); + let edge = catalog.lookup_edge_by_name("knows").unwrap(); + assert_eq!(edge.from_type, "Person"); + assert_eq!(edge.to_type, "Person"); + let upper = catalog.lookup_edge_by_name("KNOWS").unwrap(); + assert_eq!(upper.name, "Knows"); + } + + #[test] + fn test_node_arrow_schema() { + let schema = parse_schema(test_schema()).unwrap(); + let catalog = build_catalog(&schema).unwrap(); + let person = &catalog.node_types["Person"]; + assert_eq!(person.arrow_schema.fields().len(), 3); // id, name, age + } + + #[test] + fn test_duplicate_node_error() { + let input = r#" +node Person { name: String } +node Person { age: I32 } +"#; + let schema = parse_schema(input).unwrap(); + assert!(build_catalog(&schema).is_err()); + } + + #[test] + fn test_bad_edge_endpoint() { + let input = r#" +node Person { name: String } +edge Knows: Person -> Alien +"#; + let schema = parse_schema(input).unwrap(); + assert!(build_catalog(&schema).is_err()); + } + + #[test] + fn test_id_fields_are_utf8() { + let schema = 
parse_schema(test_schema()).unwrap(); + let catalog = build_catalog(&schema).unwrap(); + let person = &catalog.node_types["Person"]; + assert_eq!( + person + .arrow_schema + .field_with_name("id") + .unwrap() + .data_type(), + &DataType::Utf8 + ); + let knows = &catalog.edge_types["Knows"]; + assert_eq!( + knows + .arrow_schema + .field_with_name("id") + .unwrap() + .data_type(), + &DataType::Utf8 + ); + assert_eq!( + knows + .arrow_schema + .field_with_name("src") + .unwrap() + .data_type(), + &DataType::Utf8 + ); + assert_eq!( + knows + .arrow_schema + .field_with_name("dst") + .unwrap() + .data_type(), + &DataType::Utf8 + ); + } + + #[test] + fn test_key_property_tracking() { + let input = r#" +node Signal { + slug: String @key + title: String +} +node Person { + name: String +} +edge Emits: Person -> Signal +"#; + let schema = parse_schema(input).unwrap(); + let catalog = build_catalog(&schema).unwrap(); + assert_eq!(catalog.node_types["Signal"].key_property(), Some("slug")); + assert_eq!(catalog.node_types["Person"].key_property(), None); + } + + #[test] + fn test_edge_lookup_handles_non_ascii_leading_character() { + let schema = SchemaFile { + declarations: vec![ + SchemaDecl::Node(NodeDecl { + name: "Person".to_string(), + annotations: vec![], + implements: vec![], + properties: vec![crate::schema::ast::PropDecl { + name: "name".to_string(), + prop_type: PropType::scalar(ScalarType::String, false), + annotations: vec![], + }], + constraints: vec![], + }), + SchemaDecl::Edge(EdgeDecl { + name: "Édges".to_string(), + from_type: "Person".to_string(), + to_type: "Person".to_string(), + cardinality: Default::default(), + annotations: vec![], + properties: vec![], + constraints: vec![], + }), + ], + }; + let catalog = build_catalog(&schema).unwrap(); + assert!(catalog.lookup_edge_by_name("édges").is_some()); + } + + #[test] + fn test_edge_lookup_rejects_case_fold_collisions() { + let input = r#" +node Person { name: String } +edge Knows: Person -> Person +edge 
KNOWS: Person -> Person +"#; + let schema = parse_schema(input).unwrap(); + let err = build_catalog(&schema).unwrap_err(); + assert!(err.to_string().contains("case folding")); + } + + #[test] + fn test_catalog_composite_unique() { + let input = r#" +node Person { + first: String + last: String + @unique(first, last) +} +"#; + let schema = parse_schema(input).unwrap(); + let catalog = build_catalog(&schema).unwrap(); + let person = &catalog.node_types["Person"]; + assert!( + person + .unique_constraints + .contains(&vec!["first".to_string(), "last".to_string()]) + ); + } + + #[test] + fn test_catalog_composite_index() { + let input = r#" +node Event { + category: String + date: Date + @index(category, date) +} +"#; + let schema = parse_schema(input).unwrap(); + let catalog = build_catalog(&schema).unwrap(); + let event = &catalog.node_types["Event"]; + assert!( + event + .indices + .contains(&vec!["category".to_string(), "date".to_string()]) + ); + } + + #[test] + fn test_catalog_edge_cardinality() { + let input = r#" +node Person { name: String } +node Company { name: String } +edge WorksAt: Person -> Company @card(0..1) +"#; + let schema = parse_schema(input).unwrap(); + let catalog = build_catalog(&schema).unwrap(); + let edge = &catalog.edge_types["WorksAt"]; + assert_eq!(edge.cardinality.min, 0); + assert_eq!(edge.cardinality.max, Some(1)); + } + + #[test] + fn test_catalog_interfaces_stored() { + let input = r#" +interface Named { + name: String +} +node Person implements Named { + age: I32? +} +"#; + let schema = parse_schema(input).unwrap(); + let catalog = build_catalog(&schema).unwrap(); + assert!(catalog.interfaces.contains_key("Named")); + assert!(catalog.interfaces["Named"].properties.contains_key("name")); + } + + #[test] + fn test_catalog_node_implements() { + let input = r#" +interface Named { + name: String +} +node Person implements Named { + age: I32? 
+} +"#; + let schema = parse_schema(input).unwrap(); + let catalog = build_catalog(&schema).unwrap(); + assert_eq!(catalog.node_types["Person"].implements, vec!["Named"]); + } + + #[test] + fn test_key_implies_index() { + let input = r#" +node Signal { + slug: String @key + title: String +} +"#; + let schema = parse_schema(input).unwrap(); + let catalog = build_catalog(&schema).unwrap(); + let signal = &catalog.node_types["Signal"]; + assert!(signal.indices.contains(&vec!["slug".to_string()])); + } +} diff --git a/crates/omnigraph-compiler/src/catalog/schema_ir.rs b/crates/omnigraph-compiler/src/catalog/schema_ir.rs new file mode 100644 index 0000000..d90539e --- /dev/null +++ b/crates/omnigraph-compiler/src/catalog/schema_ir.rs @@ -0,0 +1,393 @@ +use std::collections::HashMap; + +use serde::{Deserialize, Serialize}; +use sha2::{Digest, Sha256}; + +use crate::catalog::{Catalog, build_catalog}; +use crate::error::{NanoError, Result}; +use crate::schema::ast::{Annotation, Cardinality, Constraint, PropDecl, SchemaDecl, SchemaFile}; +use crate::types::PropType; + +const SCHEMA_IR_VERSION: u32 = 1; + +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct SchemaIR { + pub ir_version: u32, + pub interfaces: Vec, + pub nodes: Vec, + pub edges: Vec, +} + +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct InterfaceIR { + pub name: String, + pub type_id: u32, + pub properties: Vec, +} + +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct NodeIR { + pub name: String, + pub type_id: u32, + pub annotations: Vec, + pub implements: Vec, + pub properties: Vec, + pub constraints: Vec, +} + +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct EdgeIR { + pub name: String, + pub type_id: u32, + pub from_type: String, + pub to_type: String, + pub cardinality: Cardinality, + pub annotations: Vec, + pub properties: Vec, + pub constraints: Vec, +} + +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub 
struct PropertyIR { + pub name: String, + pub prop_id: u32, + pub prop_type: PropType, + pub annotations: Vec, +} + +pub fn build_schema_ir(schema: &SchemaFile) -> Result { + let mut seen_type_ids = HashMap::::new(); + let mut interfaces = Vec::new(); + let mut nodes = Vec::new(); + let mut edges = Vec::new(); + + for decl in &schema.declarations { + match decl { + SchemaDecl::Interface(interface) => { + let type_id = stable_type_id("interface", &interface.name); + check_type_id_collision(&mut seen_type_ids, type_id, &interface.name)?; + interfaces.push(InterfaceIR { + name: interface.name.clone(), + type_id, + properties: canonical_properties( + "interface", + &interface.name, + &interface.properties, + )?, + }); + } + SchemaDecl::Node(node) => { + let type_id = stable_type_id("node", &node.name); + check_type_id_collision(&mut seen_type_ids, type_id, &node.name)?; + nodes.push(NodeIR { + name: node.name.clone(), + type_id, + annotations: canonical_annotations(&node.annotations), + implements: canonical_strings(&node.implements), + properties: canonical_properties("node", &node.name, &node.properties)?, + constraints: canonical_constraints(&node.constraints), + }); + } + SchemaDecl::Edge(edge) => { + let type_id = stable_type_id("edge", &edge.name); + check_type_id_collision(&mut seen_type_ids, type_id, &edge.name)?; + edges.push(EdgeIR { + name: edge.name.clone(), + type_id, + from_type: edge.from_type.clone(), + to_type: edge.to_type.clone(), + cardinality: edge.cardinality.clone(), + annotations: canonical_annotations(&edge.annotations), + properties: canonical_properties("edge", &edge.name, &edge.properties)?, + constraints: canonical_constraints(&edge.constraints), + }); + } + } + } + + interfaces.sort_by(|a, b| a.name.cmp(&b.name)); + nodes.sort_by(|a, b| a.name.cmp(&b.name)); + edges.sort_by(|a, b| a.name.cmp(&b.name)); + + Ok(SchemaIR { + ir_version: SCHEMA_IR_VERSION, + interfaces, + nodes, + edges, + }) +} + +pub fn build_catalog_from_ir(ir: &SchemaIR) 
-> Result { + if ir.ir_version != SCHEMA_IR_VERSION { + return Err(NanoError::Catalog(format!( + "unsupported schema ir_version {} (expected {})", + ir.ir_version, SCHEMA_IR_VERSION + ))); + } + + let schema = SchemaFile { + declarations: ir + .interfaces + .iter() + .map(|interface| { + SchemaDecl::Interface(crate::schema::ast::InterfaceDecl { + name: interface.name.clone(), + properties: interface + .properties + .iter() + .map(property_decl_from_ir) + .collect(), + }) + }) + .chain(ir.nodes.iter().map(|node| { + SchemaDecl::Node(crate::schema::ast::NodeDecl { + name: node.name.clone(), + annotations: node.annotations.clone(), + implements: node.implements.clone(), + properties: node.properties.iter().map(property_decl_from_ir).collect(), + constraints: node.constraints.clone(), + }) + })) + .chain(ir.edges.iter().map(|edge| { + SchemaDecl::Edge(crate::schema::ast::EdgeDecl { + name: edge.name.clone(), + from_type: edge.from_type.clone(), + to_type: edge.to_type.clone(), + cardinality: edge.cardinality.clone(), + annotations: edge.annotations.clone(), + properties: edge.properties.iter().map(property_decl_from_ir).collect(), + constraints: edge.constraints.clone(), + }) + })) + .collect(), + }; + + build_catalog(&schema) +} + +pub fn schema_ir_json(ir: &SchemaIR) -> Result { + serde_json::to_string(ir) + .map_err(|err| NanoError::Catalog(format!("serialize schema ir error: {}", err))) +} + +pub fn schema_ir_pretty_json(ir: &SchemaIR) -> Result { + serde_json::to_string_pretty(ir) + .map_err(|err| NanoError::Catalog(format!("serialize schema ir error: {}", err))) +} + +pub fn schema_ir_hash(ir: &SchemaIR) -> Result { + let json = schema_ir_json(ir)?; + let mut hasher = Sha256::new(); + hasher.update(json.as_bytes()); + Ok(format!("sha256:{:x}", hasher.finalize())) +} + +fn property_decl_from_ir(property: &PropertyIR) -> PropDecl { + PropDecl { + name: property.name.clone(), + prop_type: property.prop_type.clone(), + annotations: property.annotations.clone(), + } 
+} + +fn canonical_strings(values: &[String]) -> Vec { + let mut values = values.to_vec(); + values.sort(); + values.dedup(); + values +} + +fn canonical_annotations(annotations: &[Annotation]) -> Vec { + let mut annotations = annotations.to_vec(); + annotations.sort_by(|left, right| { + left.name + .cmp(&right.name) + .then_with(|| left.value.cmp(&right.value)) + }); + annotations +} + +fn canonical_prop_type(prop_type: &PropType) -> PropType { + let mut normalized = prop_type.clone(); + if let Some(values) = &mut normalized.enum_values { + values.sort(); + values.dedup(); + } + normalized +} + +fn canonical_properties( + kind: &str, + owner_name: &str, + properties: &[PropDecl], +) -> Result> { + let mut seen_prop_ids = HashMap::::new(); + let owner_key = format!("{}:{}", kind, owner_name); + let mut canonical = properties + .iter() + .map(|property| { + let prop_id = stable_prop_id(&owner_key, &property.name); + if let Some(previous) = seen_prop_ids.insert(prop_id, property.name.clone()) { + return Err(NanoError::Catalog(format!( + "property id collision on {}: '{}' and '{}' both hash to {}", + owner_name, previous, property.name, prop_id + ))); + } + Ok(PropertyIR { + name: property.name.clone(), + prop_id, + prop_type: canonical_prop_type(&property.prop_type), + annotations: canonical_annotations(&property.annotations), + }) + }) + .collect::>>()?; + canonical.sort_by(|a, b| a.name.cmp(&b.name)); + Ok(canonical) +} + +fn canonical_constraints(constraints: &[Constraint]) -> Vec { + let mut constraints = constraints + .iter() + .cloned() + .map(normalize_constraint) + .collect::>(); + constraints.sort_by_key(constraint_sort_key); + constraints +} + +fn normalize_constraint(constraint: Constraint) -> Constraint { + match constraint { + Constraint::Key(mut columns) => { + columns.sort(); + Constraint::Key(columns) + } + Constraint::Unique(mut columns) => { + columns.sort(); + Constraint::Unique(columns) + } + Constraint::Index(mut columns) => { + columns.sort(); + 
Constraint::Index(columns) + } + other => other, + } +} + +fn constraint_sort_key(constraint: &Constraint) -> String { + match constraint { + Constraint::Key(columns) => format!("key:{}", columns.join(",")), + Constraint::Unique(columns) => format!("unique:{}", columns.join(",")), + Constraint::Index(columns) => format!("index:{}", columns.join(",")), + Constraint::Range { property, min, max } => { + format!("range:{}:{:?}:{:?}", property, min, max) + } + Constraint::Check { property, pattern } => format!("check:{}:{}", property, pattern), + } +} + +fn stable_type_id(kind: &str, name: &str) -> u32 { + fnv1a_u32(&format!("{}:{}", kind, name)) +} + +fn stable_prop_id(owner: &str, name: &str) -> u32 { + fnv1a_u32(&format!("{}:{}", owner, name)) +} + +fn fnv1a_u32(value: &str) -> u32 { + let mut hash: u32 = 2_166_136_261; + for byte in value.bytes() { + hash ^= u32::from(byte); + hash = hash.wrapping_mul(16_777_619); + } + if hash == 0 { 1 } else { hash } +} + +fn check_type_id_collision( + seen_type_ids: &mut HashMap, + type_id: u32, + name: &str, +) -> Result<()> { + if let Some(previous) = seen_type_ids.insert(type_id, name.to_string()) { + return Err(NanoError::Catalog(format!( + "type id collision: '{}' and '{}' both hash to {}", + previous, name, type_id + ))); + } + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::catalog::build_catalog; + use crate::schema::parser::parse_schema; + + #[test] + fn schema_ir_hash_is_stable_across_source_ordering_noise() { + let schema_a = parse_schema( + r#" +node Person { + age: I32? + name: String @key +} + +edge Knows: Person -> Person { + since: Date? +} +"#, + ) + .unwrap(); + let schema_b = parse_schema( + r#" +edge Knows: Person -> Person { + since: Date? +} + +node Person { + name: String @key + age: I32? 
+} +"#, + ) + .unwrap(); + + let ir_a = build_schema_ir(&schema_a).unwrap(); + let ir_b = build_schema_ir(&schema_b).unwrap(); + assert_eq!(ir_a, ir_b); + assert_eq!( + schema_ir_hash(&ir_a).unwrap(), + schema_ir_hash(&ir_b).unwrap() + ); + } + + #[test] + fn build_catalog_from_ir_round_trips_core_catalog_fields() { + let schema = parse_schema( + r#" +node Person @description("person") { + name: String @key + age: I32? @description("age") +} + +edge Knows: Person -> Person @instruction("friendship") { + since: Date? +} +"#, + ) + .unwrap(); + let direct = build_catalog(&schema).unwrap(); + let ir = build_schema_ir(&schema).unwrap(); + let rebuilt = build_catalog_from_ir(&ir).unwrap(); + + assert_eq!(direct.node_types.len(), rebuilt.node_types.len()); + assert_eq!(direct.edge_types.len(), rebuilt.edge_types.len()); + assert_eq!( + direct.node_types["Person"].key_property(), + rebuilt.node_types["Person"].key_property() + ); + assert_eq!( + direct.edge_types["Knows"].cardinality, + rebuilt.edge_types["Knows"].cardinality + ); + } +} diff --git a/crates/omnigraph-compiler/src/catalog/schema_plan.rs b/crates/omnigraph-compiler/src/catalog/schema_plan.rs new file mode 100644 index 0000000..50334ae --- /dev/null +++ b/crates/omnigraph-compiler/src/catalog/schema_plan.rs @@ -0,0 +1,895 @@ +use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; + +use serde::{Deserialize, Serialize}; + +use crate::error::Result; +use crate::schema::ast::{Annotation, Constraint}; +use crate::types::PropType; + +use super::schema_ir::{EdgeIR, InterfaceIR, NodeIR, PropertyIR, SchemaIR}; + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum SchemaTypeKind { + Interface, + Node, + Edge, +} + +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct SchemaMigrationPlan { + pub supported: bool, + pub steps: Vec, +} + +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +#[serde(tag = "kind", rename_all = 
"snake_case")] +pub enum SchemaMigrationStep { + AddType { + type_kind: SchemaTypeKind, + name: String, + }, + RenameType { + type_kind: SchemaTypeKind, + from: String, + to: String, + }, + AddProperty { + type_kind: SchemaTypeKind, + type_name: String, + property_name: String, + property_type: PropType, + }, + RenameProperty { + type_kind: SchemaTypeKind, + type_name: String, + from: String, + to: String, + }, + AddConstraint { + type_kind: SchemaTypeKind, + type_name: String, + constraint: Constraint, + }, + UpdateTypeMetadata { + type_kind: SchemaTypeKind, + name: String, + annotations: Vec, + }, + UpdatePropertyMetadata { + type_kind: SchemaTypeKind, + type_name: String, + property_name: String, + annotations: Vec, + }, + UnsupportedChange { + entity: String, + reason: String, + }, +} + +pub fn plan_schema_migration( + accepted: &SchemaIR, + desired: &SchemaIR, +) -> Result { + let mut steps = Vec::new(); + let interface_renames = plan_interfaces(&accepted.interfaces, &desired.interfaces, &mut steps); + let node_renames = plan_nodes( + &accepted.nodes, + &desired.nodes, + &interface_renames, + &mut steps, + ); + plan_edges(&accepted.edges, &desired.edges, &node_renames, &mut steps); + + Ok(SchemaMigrationPlan { + supported: !steps + .iter() + .any(|step| matches!(step, SchemaMigrationStep::UnsupportedChange { .. 
})), + steps, + }) +} + +fn plan_interfaces( + accepted: &[InterfaceIR], + desired: &[InterfaceIR], + steps: &mut Vec, +) -> HashMap { + let accepted_by_name = accepted + .iter() + .map(|interface| (interface.name.as_str(), interface)) + .collect::>(); + let mut consumed = HashSet::new(); + + for interface in desired { + if let Some(existing) = accepted_by_name.get(interface.name.as_str()) { + consumed.insert(existing.name.clone()); + let _property_renames = plan_properties( + SchemaTypeKind::Interface, + &interface.name, + &existing.properties, + &interface.properties, + steps, + ); + continue; + } + + steps.push(SchemaMigrationStep::AddType { + type_kind: SchemaTypeKind::Interface, + name: interface.name.clone(), + }); + } + + for leftover in accepted + .iter() + .filter(|interface| !consumed.contains(&interface.name)) + { + steps.push(SchemaMigrationStep::UnsupportedChange { + entity: format!("interface:{}", leftover.name), + reason: format!( + "removing interface '{}' is not supported in schema migration v1", + leftover.name + ), + }); + } + + HashMap::new() +} + +fn plan_nodes( + accepted: &[NodeIR], + desired: &[NodeIR], + interface_renames: &HashMap, + steps: &mut Vec, +) -> HashMap { + let accepted_by_name = accepted + .iter() + .map(|node| (node.name.as_str(), node)) + .collect::>(); + let mut consumed = HashSet::new(); + let mut renames = HashMap::new(); + + for node in desired { + let rename_from = rename_from_value(&node.annotations); + let matched = accepted_by_name + .get(node.name.as_str()) + .copied() + .or_else(|| { + rename_from.and_then(|from| { + accepted_by_name + .get(from) + .copied() + .filter(|candidate| candidate.name != node.name) + }) + }); + + let Some(existing) = matched else { + if let Some(from) = rename_from { + steps.push(SchemaMigrationStep::UnsupportedChange { + entity: format!("node:{}", node.name), + reason: format!( + "node '{}' declares @rename_from(\"{}\") but no accepted node with that name exists", + node.name, from + ), + 
}); + } else { + steps.push(SchemaMigrationStep::AddType { + type_kind: SchemaTypeKind::Node, + name: node.name.clone(), + }); + } + continue; + }; + + consumed.insert(existing.name.clone()); + if existing.name != node.name { + renames.insert(existing.name.clone(), node.name.clone()); + steps.push(SchemaMigrationStep::RenameType { + type_kind: SchemaTypeKind::Node, + from: existing.name.clone(), + to: node.name.clone(), + }); + } + + if normalize_strings(&existing.implements, interface_renames) + != normalize_strings(&node.implements, &HashMap::new()) + { + steps.push(SchemaMigrationStep::UnsupportedChange { + entity: format!("node:{}", node.name), + reason: format!( + "changing implemented interfaces on node '{}' is not supported in schema migration v1", + node.name + ), + }); + } + + plan_type_metadata( + SchemaTypeKind::Node, + &node.name, + &existing.annotations, + &node.annotations, + steps, + ); + let property_renames = plan_properties( + SchemaTypeKind::Node, + &node.name, + &existing.properties, + &node.properties, + steps, + ); + plan_constraints( + SchemaTypeKind::Node, + &node.name, + &existing.constraints, + &node.constraints, + &property_renames, + steps, + ); + } + + for leftover in accepted + .iter() + .filter(|node| !consumed.contains(&node.name)) + { + steps.push(SchemaMigrationStep::UnsupportedChange { + entity: format!("node:{}", leftover.name), + reason: format!( + "removing node type '{}' is not supported in schema migration v1", + leftover.name + ), + }); + } + + renames +} + +fn plan_edges( + accepted: &[EdgeIR], + desired: &[EdgeIR], + node_renames: &HashMap, + steps: &mut Vec, +) { + let accepted_by_name = accepted + .iter() + .map(|edge| (edge.name.as_str(), edge)) + .collect::>(); + let mut consumed = HashSet::new(); + + for edge in desired { + let rename_from = rename_from_value(&edge.annotations); + let matched = accepted_by_name + .get(edge.name.as_str()) + .copied() + .or_else(|| { + rename_from.and_then(|from| { + accepted_by_name + 
.get(from) + .copied() + .filter(|candidate| candidate.name != edge.name) + }) + }); + + let Some(existing) = matched else { + if let Some(from) = rename_from { + steps.push(SchemaMigrationStep::UnsupportedChange { + entity: format!("edge:{}", edge.name), + reason: format!( + "edge '{}' declares @rename_from(\"{}\") but no accepted edge with that name exists", + edge.name, from + ), + }); + } else { + steps.push(SchemaMigrationStep::AddType { + type_kind: SchemaTypeKind::Edge, + name: edge.name.clone(), + }); + } + continue; + }; + + consumed.insert(existing.name.clone()); + if existing.name != edge.name { + steps.push(SchemaMigrationStep::RenameType { + type_kind: SchemaTypeKind::Edge, + from: existing.name.clone(), + to: edge.name.clone(), + }); + } + + let normalized_from = normalize_type_ref(&existing.from_type, node_renames); + let normalized_to = normalize_type_ref(&existing.to_type, node_renames); + if normalized_from != edge.from_type || normalized_to != edge.to_type { + steps.push(SchemaMigrationStep::UnsupportedChange { + entity: format!("edge:{}", edge.name), + reason: format!( + "changing edge endpoints on '{}' is not supported in schema migration v1", + edge.name + ), + }); + } + if existing.cardinality != edge.cardinality { + steps.push(SchemaMigrationStep::UnsupportedChange { + entity: format!("edge:{}", edge.name), + reason: format!( + "changing cardinality on edge '{}' is not supported in schema migration v1", + edge.name + ), + }); + } + + plan_type_metadata( + SchemaTypeKind::Edge, + &edge.name, + &existing.annotations, + &edge.annotations, + steps, + ); + let property_renames = plan_properties( + SchemaTypeKind::Edge, + &edge.name, + &existing.properties, + &edge.properties, + steps, + ); + plan_constraints( + SchemaTypeKind::Edge, + &edge.name, + &existing.constraints, + &edge.constraints, + &property_renames, + steps, + ); + } + + for leftover in accepted + .iter() + .filter(|edge| !consumed.contains(&edge.name)) + { + 
steps.push(SchemaMigrationStep::UnsupportedChange { + entity: format!("edge:{}", leftover.name), + reason: format!( + "removing edge type '{}' is not supported in schema migration v1", + leftover.name + ), + }); + } +} + +fn plan_properties( + type_kind: SchemaTypeKind, + type_name: &str, + accepted: &[PropertyIR], + desired: &[PropertyIR], + steps: &mut Vec, +) -> HashMap { + let accepted_by_name = accepted + .iter() + .map(|property| (property.name.as_str(), property)) + .collect::>(); + let mut consumed = HashSet::new(); + let mut renames = HashMap::new(); + + for property in desired { + let rename_from = rename_from_value(&property.annotations); + let matched = accepted_by_name + .get(property.name.as_str()) + .copied() + .or_else(|| { + rename_from.and_then(|from| { + accepted_by_name + .get(from) + .copied() + .filter(|candidate| candidate.name != property.name) + }) + }); + + let Some(existing) = matched else { + if let Some(from) = rename_from { + steps.push(SchemaMigrationStep::UnsupportedChange { + entity: format!( + "{}:{}.{}", + schema_type_kind_key(type_kind), + type_name, + property.name + ), + reason: format!( + "property '{}.{}' declares @rename_from(\"{}\") but no accepted property with that name exists", + type_name, property.name, from + ), + }); + } else if property.prop_type.nullable { + steps.push(SchemaMigrationStep::AddProperty { + type_kind, + type_name: type_name.to_string(), + property_name: property.name.clone(), + property_type: property.prop_type.clone(), + }); + } else { + steps.push(SchemaMigrationStep::UnsupportedChange { + entity: format!( + "{}:{}.{}", + schema_type_kind_key(type_kind), + type_name, + property.name + ), + reason: format!( + "adding required property '{}.{}' requires a backfill and is not supported in schema migration v1", + type_name, property.name + ), + }); + } + continue; + }; + + consumed.insert(existing.name.clone()); + if existing.name != property.name { + renames.insert(existing.name.clone(), 
property.name.clone()); + steps.push(SchemaMigrationStep::RenameProperty { + type_kind, + type_name: type_name.to_string(), + from: existing.name.clone(), + to: property.name.clone(), + }); + } + + if existing.prop_type != property.prop_type { + steps.push(SchemaMigrationStep::UnsupportedChange { + entity: format!( + "{}:{}.{}", + schema_type_kind_key(type_kind), + type_name, + property.name + ), + reason: format!( + "changing property type for '{}.{}' is not supported in schema migration v1", + type_name, property.name + ), + }); + } + + plan_property_metadata( + type_kind, + type_name, + &property.name, + &existing.annotations, + &property.annotations, + steps, + ); + } + + for leftover in accepted + .iter() + .filter(|property| !consumed.contains(&property.name)) + { + steps.push(SchemaMigrationStep::UnsupportedChange { + entity: format!( + "{}:{}.{}", + schema_type_kind_key(type_kind), + type_name, + leftover.name + ), + reason: format!( + "removing property '{}.{}' is not supported in schema migration v1", + type_name, leftover.name + ), + }); + } + + renames +} + +fn plan_constraints( + type_kind: SchemaTypeKind, + type_name: &str, + accepted: &[Constraint], + desired: &[Constraint], + property_renames: &HashMap, + steps: &mut Vec, +) { + let accepted = accepted + .iter() + .cloned() + .map(|constraint| rename_constraint_properties(constraint, property_renames)) + .collect::>(); + let desired_map = desired + .iter() + .cloned() + .map(|constraint| (constraint_key(&constraint), constraint)) + .collect::>(); + let accepted_map = accepted + .into_iter() + .map(|constraint| (constraint_key(&constraint), constraint)) + .collect::>(); + + let removed = accepted_map + .keys() + .filter(|key| !desired_map.contains_key(*key)) + .cloned() + .collect::>(); + if !removed.is_empty() { + steps.push(SchemaMigrationStep::UnsupportedChange { + entity: format!("{}:{}", schema_type_kind_key(type_kind), type_name), + reason: format!( + "removing constraints from '{}' is not 
supported in schema migration v1", + type_name + ), + }); + } + + for (key, constraint) in desired_map { + if accepted_map.contains_key(&key) { + continue; + } + match constraint { + Constraint::Index(_) => steps.push(SchemaMigrationStep::AddConstraint { + type_kind, + type_name: type_name.to_string(), + constraint, + }), + _ => steps.push(SchemaMigrationStep::UnsupportedChange { + entity: format!("{}:{}", schema_type_kind_key(type_kind), type_name), + reason: format!( + "adding constraint '{}' to '{}' is not supported in schema migration v1", + key, type_name + ), + }), + } + } +} + +fn plan_type_metadata( + type_kind: SchemaTypeKind, + name: &str, + accepted: &[Annotation], + desired: &[Annotation], + steps: &mut Vec, +) { + match annotation_change_kind(accepted, desired) { + AnnotationChangeKind::None => {} + AnnotationChangeKind::MetadataOnly(metadata) => { + steps.push(SchemaMigrationStep::UpdateTypeMetadata { + type_kind, + name: name.to_string(), + annotations: metadata, + }); + } + AnnotationChangeKind::Unsupported(reason) => { + steps.push(SchemaMigrationStep::UnsupportedChange { + entity: format!("{}:{}", schema_type_kind_key(type_kind), name), + reason, + }); + } + } +} + +fn plan_property_metadata( + type_kind: SchemaTypeKind, + type_name: &str, + property_name: &str, + accepted: &[Annotation], + desired: &[Annotation], + steps: &mut Vec, +) { + match annotation_change_kind(accepted, desired) { + AnnotationChangeKind::None => {} + AnnotationChangeKind::MetadataOnly(metadata) => { + steps.push(SchemaMigrationStep::UpdatePropertyMetadata { + type_kind, + type_name: type_name.to_string(), + property_name: property_name.to_string(), + annotations: metadata, + }); + } + AnnotationChangeKind::Unsupported(reason) => { + steps.push(SchemaMigrationStep::UnsupportedChange { + entity: format!( + "{}:{}.{}", + schema_type_kind_key(type_kind), + type_name, + property_name + ), + reason, + }); + } + } +} + +enum AnnotationChangeKind { + None, + MetadataOnly(Vec), + 
Unsupported(String), +} + +fn annotation_change_kind(accepted: &[Annotation], desired: &[Annotation]) -> AnnotationChangeKind { + let accepted_non_metadata = strip_metadata_annotations(accepted); + let desired_non_metadata = strip_metadata_annotations(desired); + if accepted_non_metadata != desired_non_metadata { + return AnnotationChangeKind::Unsupported( + "changing annotations beyond @description/@instruction is not supported in schema migration v1" + .to_string(), + ); + } + + let accepted_metadata = metadata_annotations(accepted); + let desired_metadata = metadata_annotations(desired); + if accepted_metadata == desired_metadata { + AnnotationChangeKind::None + } else { + AnnotationChangeKind::MetadataOnly(desired_metadata) + } +} + +fn strip_metadata_annotations(annotations: &[Annotation]) -> Vec { + annotations + .iter() + .filter(|annotation| { + !matches!( + annotation.name.as_str(), + "description" | "instruction" | "rename_from" | "key" | "unique" | "index" + ) + }) + .cloned() + .collect() +} + +fn metadata_annotations(annotations: &[Annotation]) -> Vec { + annotations + .iter() + .filter(|annotation| matches!(annotation.name.as_str(), "description" | "instruction")) + .cloned() + .collect() +} + +fn normalize_strings(values: &[String], renames: &HashMap) -> BTreeSet { + values + .iter() + .map(|value| normalize_type_ref(value, renames)) + .collect() +} + +fn normalize_type_ref(value: &str, renames: &HashMap) -> String { + renames + .get(value) + .cloned() + .unwrap_or_else(|| value.to_string()) +} + +fn rename_constraint_properties( + constraint: Constraint, + property_renames: &HashMap, +) -> Constraint { + match constraint { + Constraint::Key(columns) => { + Constraint::Key(rename_constraint_columns(columns, property_renames)) + } + Constraint::Unique(columns) => { + Constraint::Unique(rename_constraint_columns(columns, property_renames)) + } + Constraint::Index(columns) => { + Constraint::Index(rename_constraint_columns(columns, property_renames)) + 
} + Constraint::Range { property, min, max } => Constraint::Range { + property: normalize_property_ref(&property, property_renames), + min, + max, + }, + Constraint::Check { property, pattern } => Constraint::Check { + property: normalize_property_ref(&property, property_renames), + pattern, + }, + } +} + +fn rename_constraint_columns( + columns: Vec, + property_renames: &HashMap, +) -> Vec { + let mut columns = columns + .into_iter() + .map(|column| normalize_property_ref(&column, property_renames)) + .collect::>(); + columns.sort(); + columns +} + +fn normalize_property_ref(value: &str, renames: &HashMap) -> String { + renames + .get(value) + .cloned() + .unwrap_or_else(|| value.to_string()) +} + +fn constraint_key(constraint: &Constraint) -> String { + match constraint { + Constraint::Key(columns) => format!("key:{}", columns.join(",")), + Constraint::Unique(columns) => format!("unique:{}", columns.join(",")), + Constraint::Index(columns) => format!("index:{}", columns.join(",")), + Constraint::Range { property, min, max } => { + format!("range:{}:{:?}:{:?}", property, min, max) + } + Constraint::Check { property, pattern } => format!("check:{}:{}", property, pattern), + } +} + +fn rename_from_value(annotations: &[Annotation]) -> Option<&str> { + annotations + .iter() + .find(|annotation| annotation.name == "rename_from") + .and_then(|annotation| annotation.value.as_deref()) +} + +fn schema_type_kind_key(kind: SchemaTypeKind) -> &'static str { + match kind { + SchemaTypeKind::Interface => "interface", + SchemaTypeKind::Node => "node", + SchemaTypeKind::Edge => "edge", + } +} + +#[cfg(test)] +mod tests { + use crate::catalog::schema_ir::build_schema_ir; + use crate::schema::parser::parse_schema; + + use super::SchemaMigrationStep::{ + AddConstraint, AddProperty, RenameProperty, RenameType, UnsupportedChange, + UpdateTypeMetadata, + }; + use super::*; + + #[test] + fn plan_supports_additive_nullable_property_and_index() { + let accepted = build_schema_ir( + 
&parse_schema( + r#" +node Person { + name: String @key + age: I32? +} +"#, + ) + .unwrap(), + ) + .unwrap(); + let desired = build_schema_ir( + &parse_schema( + r#" +node Person { + name: String @key + age: I32? @index + nickname: String? +} +"#, + ) + .unwrap(), + ) + .unwrap(); + + let plan = plan_schema_migration(&accepted, &desired).unwrap(); + assert!(plan.supported); + assert!(plan.steps.contains(&AddProperty { + type_kind: SchemaTypeKind::Node, + type_name: "Person".to_string(), + property_name: "nickname".to_string(), + property_type: PropType::scalar(crate::types::ScalarType::String, true), + })); + assert!(plan.steps.contains(&AddConstraint { + type_kind: SchemaTypeKind::Node, + type_name: "Person".to_string(), + constraint: Constraint::Index(vec!["age".to_string()]), + })); + } + + #[test] + fn plan_supports_explicit_type_and_property_rename() { + let accepted = build_schema_ir( + &parse_schema( + r#" +node User { + name: String @key +} +"#, + ) + .unwrap(), + ) + .unwrap(); + let desired = build_schema_ir( + &parse_schema( + r#" +node Account @rename_from("User") { + full_name: String @key @rename_from("name") +} +"#, + ) + .unwrap(), + ) + .unwrap(); + + let plan = plan_schema_migration(&accepted, &desired).unwrap(); + assert!(plan.supported); + assert!(plan.steps.contains(&RenameType { + type_kind: SchemaTypeKind::Node, + from: "User".to_string(), + to: "Account".to_string(), + })); + assert!(plan.steps.contains(&RenameProperty { + type_kind: SchemaTypeKind::Node, + type_name: "Account".to_string(), + from: "name".to_string(), + to: "full_name".to_string(), + })); + } + + #[test] + fn plan_rejects_required_property_addition() { + let accepted = build_schema_ir( + &parse_schema( + r#" +node Person { + name: String @key +} +"#, + ) + .unwrap(), + ) + .unwrap(); + let desired = build_schema_ir( + &parse_schema( + r#" +node Person { + name: String @key + age: I32 +} +"#, + ) + .unwrap(), + ) + .unwrap(); + + let plan = plan_schema_migration(&accepted, 
&desired).unwrap(); + assert!(!plan.supported); + assert!(plan.steps.iter().any(|step| matches!( + step, + UnsupportedChange { entity, reason } + if entity.contains("Person.age") + && reason.contains("adding required property") + ))); + } + + #[test] + fn plan_supports_metadata_only_annotation_changes() { + let accepted = build_schema_ir( + &parse_schema( + r#" +node Person @description("old") { + name: String @key +} +"#, + ) + .unwrap(), + ) + .unwrap(); + let desired = build_schema_ir( + &parse_schema( + r#" +node Person @description("new") { + name: String @key +} +"#, + ) + .unwrap(), + ) + .unwrap(); + + let plan = plan_schema_migration(&accepted, &desired).unwrap(); + assert!(plan.supported); + assert!(plan.steps.contains(&UpdateTypeMetadata { + type_kind: SchemaTypeKind::Node, + name: "Person".to_string(), + annotations: vec![Annotation { + name: "description".to_string(), + value: Some("new".to_string()), + }], + })); + } +} diff --git a/crates/omnigraph-compiler/src/embedding.rs b/crates/omnigraph-compiler/src/embedding.rs new file mode 100644 index 0000000..6c9e6f3 --- /dev/null +++ b/crates/omnigraph-compiler/src/embedding.rs @@ -0,0 +1,379 @@ +#![allow(dead_code)] + +use std::time::Duration; + +use reqwest::Client; +use serde::Deserialize; +use tokio::time::sleep; + +use crate::error::{NanoError, Result}; + +const DEFAULT_EMBED_MODEL: &str = "text-embedding-3-small"; +const DEFAULT_OPENAI_BASE_URL: &str = "https://api.openai.com/v1"; +const DEFAULT_TIMEOUT_MS: u64 = 30_000; +const DEFAULT_RETRY_ATTEMPTS: usize = 4; +const DEFAULT_RETRY_BACKOFF_MS: u64 = 200; + +#[derive(Clone)] +enum EmbeddingTransport { + Mock, + OpenAi { + api_key: String, + base_url: String, + http: Client, + }, +} + +#[derive(Clone)] +pub(crate) struct EmbeddingClient { + model: String, + retry_attempts: usize, + retry_backoff_ms: u64, + transport: EmbeddingTransport, +} + +struct EmbedCallError { + message: String, + retryable: bool, +} + +#[derive(Debug, Deserialize)] +struct 
OpenAiEmbeddingResponse { + data: Vec, +} + +#[derive(Debug, Deserialize)] +struct OpenAiEmbeddingDatum { + index: usize, + embedding: Vec, +} + +#[derive(Debug, Deserialize)] +struct OpenAiErrorEnvelope { + error: OpenAiErrorBody, +} + +#[derive(Debug, Deserialize)] +struct OpenAiErrorBody { + message: String, +} + +impl EmbeddingClient { + pub(crate) fn from_env() -> Result { + let model = std::env::var("NANOGRAPH_EMBED_MODEL") + .ok() + .map(|v| v.trim().to_string()) + .filter(|v| !v.is_empty()) + .unwrap_or_else(|| DEFAULT_EMBED_MODEL.to_string()); + let retry_attempts = + parse_env_usize("NANOGRAPH_EMBED_RETRY_ATTEMPTS", DEFAULT_RETRY_ATTEMPTS); + let retry_backoff_ms = + parse_env_u64("NANOGRAPH_EMBED_RETRY_BACKOFF_MS", DEFAULT_RETRY_BACKOFF_MS); + + if env_flag("NANOGRAPH_EMBEDDINGS_MOCK") { + return Ok(Self { + model, + retry_attempts, + retry_backoff_ms, + transport: EmbeddingTransport::Mock, + }); + } + + let api_key = std::env::var("OPENAI_API_KEY") + .ok() + .map(|v| v.trim().to_string()) + .filter(|v| !v.is_empty()) + .ok_or_else(|| { + NanoError::Execution( + "OPENAI_API_KEY is required when an embedding call is needed".to_string(), + ) + })?; + let base_url = std::env::var("OPENAI_BASE_URL") + .ok() + .map(|v| v.trim_end_matches('/').to_string()) + .filter(|v| !v.is_empty()) + .unwrap_or_else(|| DEFAULT_OPENAI_BASE_URL.to_string()); + let timeout_ms = parse_env_u64("NANOGRAPH_EMBED_TIMEOUT_MS", DEFAULT_TIMEOUT_MS); + let http = Client::builder() + .timeout(Duration::from_millis(timeout_ms)) + .build() + .map_err(|e| { + NanoError::Execution(format!("failed to initialize HTTP client: {}", e)) + })?; + + Ok(Self { + model, + retry_attempts, + retry_backoff_ms, + transport: EmbeddingTransport::OpenAi { + api_key, + base_url, + http, + }, + }) + } + + #[cfg(test)] + pub(crate) fn mock_for_tests() -> Self { + Self { + model: DEFAULT_EMBED_MODEL.to_string(), + retry_attempts: DEFAULT_RETRY_ATTEMPTS, + retry_backoff_ms: DEFAULT_RETRY_BACKOFF_MS, + 
transport: EmbeddingTransport::Mock, + } + } + + pub(crate) fn model(&self) -> &str { + &self.model + } + + pub(crate) async fn embed_text(&self, input: &str, expected_dim: usize) -> Result> { + let mut vectors = self.embed_texts(&[input.to_string()], expected_dim).await?; + vectors.pop().ok_or_else(|| { + NanoError::Execution("embedding provider returned no vector".to_string()) + }) + } + + pub(crate) async fn embed_texts( + &self, + inputs: &[String], + expected_dim: usize, + ) -> Result>> { + if expected_dim == 0 { + return Err(NanoError::Execution( + "embedding dimension must be greater than zero".to_string(), + )); + } + if inputs.is_empty() { + return Ok(Vec::new()); + } + + match &self.transport { + EmbeddingTransport::Mock => Ok(inputs + .iter() + .map(|input| mock_embedding(input, expected_dim)) + .collect()), + EmbeddingTransport::OpenAi { .. } => { + self.embed_texts_openai_with_retry(inputs, expected_dim) + .await + } + } + } + + async fn embed_texts_openai_with_retry( + &self, + inputs: &[String], + expected_dim: usize, + ) -> Result>> { + let max_attempt = self.retry_attempts.max(1); + let mut attempt = 0usize; + loop { + attempt += 1; + match self.embed_texts_openai_once(inputs, expected_dim).await { + Ok(vectors) => return Ok(vectors), + Err(err) => { + if !err.retryable || attempt >= max_attempt { + return Err(NanoError::Execution(err.message)); + } + let shift = (attempt - 1).min(10) as u32; + let delay = self.retry_backoff_ms.saturating_mul(1u64 << shift); + sleep(Duration::from_millis(delay)).await; + } + } + } + } + + async fn embed_texts_openai_once( + &self, + inputs: &[String], + expected_dim: usize, + ) -> std::result::Result>, EmbedCallError> { + let (api_key, base_url, http) = match &self.transport { + EmbeddingTransport::OpenAi { + api_key, + base_url, + http, + } => (api_key, base_url, http), + EmbeddingTransport::Mock => unreachable!("mock transport should not call OpenAI"), + }; + + let request = serde_json::json!({ + "model": 
self.model, + "input": inputs, + "dimensions": expected_dim, + }); + let url = format!("{}/embeddings", base_url); + let response = http + .post(&url) + .bearer_auth(api_key) + .json(&request) + .send() + .await; + + let response = match response { + Ok(resp) => resp, + Err(err) => { + let retryable = err.is_timeout() || err.is_connect() || err.is_request(); + return Err(EmbedCallError { + message: format!("embedding request failed: {}", err), + retryable, + }); + } + }; + + let status = response.status(); + let body = match response.text().await { + Ok(body) => body, + Err(err) => { + return Err(EmbedCallError { + message: format!( + "embedding response read failed (status {}): {}", + status, err + ), + retryable: status.is_server_error() || status.as_u16() == 429, + }); + } + }; + + if !status.is_success() { + let message = parse_openai_error_message(&body).unwrap_or_else(|| body.clone()); + return Err(EmbedCallError { + message: format!( + "embedding request failed with status {}: {}", + status, message + ), + retryable: status.is_server_error() || status.as_u16() == 429, + }); + } + + let mut parsed: OpenAiEmbeddingResponse = + serde_json::from_str(&body).map_err(|err| EmbedCallError { + message: format!("embedding response decode failed: {}", err), + retryable: false, + })?; + + if parsed.data.len() != inputs.len() { + return Err(EmbedCallError { + message: format!( + "embedding response size mismatch: expected {}, got {}", + inputs.len(), + parsed.data.len() + ), + retryable: false, + }); + } + + parsed.data.sort_by_key(|item| item.index); + let mut vectors = Vec::with_capacity(parsed.data.len()); + for (idx, item) in parsed.data.into_iter().enumerate() { + if item.index != idx { + return Err(EmbedCallError { + message: format!( + "embedding response index mismatch at position {}: got {}", + idx, item.index + ), + retryable: false, + }); + } + if item.embedding.len() != expected_dim { + return Err(EmbedCallError { + message: format!( + "embedding dimension 
mismatch: expected {}, got {}", + expected_dim, + item.embedding.len() + ), + retryable: false, + }); + } + vectors.push(item.embedding); + } + Ok(vectors) + } +} + +fn parse_openai_error_message(body: &str) -> Option { + serde_json::from_str::(body) + .ok() + .map(|e| e.error.message) + .filter(|msg| !msg.trim().is_empty()) +} + +fn parse_env_usize(name: &str, default: usize) -> usize { + std::env::var(name) + .ok() + .and_then(|v| v.parse::().ok()) + .filter(|v| *v > 0) + .unwrap_or(default) +} + +fn parse_env_u64(name: &str, default: u64) -> u64 { + std::env::var(name) + .ok() + .and_then(|v| v.parse::().ok()) + .filter(|v| *v > 0) + .unwrap_or(default) +} + +fn env_flag(name: &str) -> bool { + std::env::var(name) + .ok() + .map(|v| { + let s = v.trim().to_ascii_lowercase(); + s == "1" || s == "true" || s == "yes" || s == "on" + }) + .unwrap_or(false) +} + +fn mock_embedding(input: &str, dim: usize) -> Vec { + let mut seed = fnv1a64(input.as_bytes()); + let mut out = Vec::with_capacity(dim); + for _ in 0..dim { + seed = xorshift64(seed); + let ratio = (seed as f64 / u64::MAX as f64) as f32; + out.push((ratio * 2.0) - 1.0); + } + + let norm = out + .iter() + .map(|v| (*v as f64) * (*v as f64)) + .sum::() + .sqrt() as f32; + if norm > f32::EPSILON { + for value in &mut out { + *value /= norm; + } + } + out +} + +fn fnv1a64(bytes: &[u8]) -> u64 { + let mut hash = 14695981039346656037u64; + for byte in bytes { + hash ^= *byte as u64; + hash = hash.wrapping_mul(1099511628211u64); + } + hash +} + +fn xorshift64(mut x: u64) -> u64 { + x ^= x << 13; + x ^= x >> 7; + x ^= x << 17; + x +} + +#[cfg(test)] +mod tests { + use super::*; + + #[tokio::test] + async fn mock_embeddings_are_deterministic() { + let client = EmbeddingClient::mock_for_tests(); + let a = client.embed_text("alpha", 8).await.unwrap(); + let b = client.embed_text("alpha", 8).await.unwrap(); + let c = client.embed_text("beta", 8).await.unwrap(); + assert_eq!(a, b); + assert_ne!(a, c); + 
assert_eq!(a.len(), 8); + } +} diff --git a/crates/omnigraph-compiler/src/error.rs b/crates/omnigraph-compiler/src/error.rs new file mode 100644 index 0000000..ea48759 --- /dev/null +++ b/crates/omnigraph-compiler/src/error.rs @@ -0,0 +1,146 @@ +use thiserror::Error; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub struct SourceSpan { + pub start: usize, + pub end: usize, +} + +impl SourceSpan { + pub fn new(start: usize, end: usize) -> Self { + Self { start, end } + } +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct ParseDiagnostic { + pub message: String, + pub span: Option, +} + +impl ParseDiagnostic { + pub fn new(message: String, span: Option) -> Self { + Self { message, span } + } +} + +impl std::fmt::Display for ParseDiagnostic { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}", self.message) + } +} + +impl std::error::Error for ParseDiagnostic {} + +pub fn render_span(span: SourceSpan) -> SourceSpan { + SourceSpan { + start: span.start, + end: span.end.max(span.start.saturating_add(1)), + } +} + +pub fn decode_string_literal(raw: &str) -> Result { + let inner = raw + .strip_prefix('"') + .and_then(|inner| inner.strip_suffix('"')) + .unwrap_or(raw); + + let mut decoded = String::with_capacity(inner.len()); + let mut chars = inner.chars(); + while let Some(ch) = chars.next() { + if ch != '\\' { + decoded.push(ch); + continue; + } + + let escaped = chars + .next() + .ok_or_else(|| NanoError::Parse("unterminated escape sequence".to_string()))?; + match escaped { + '"' => decoded.push('"'), + '\\' => decoded.push('\\'), + 'n' => decoded.push('\n'), + 'r' => decoded.push('\r'), + 't' => decoded.push('\t'), + other => { + return Err(NanoError::Parse(format!( + "unsupported escape sequence: \\{}", + other + ))); + } + } + } + + Ok(decoded) +} + +#[derive(Debug, Error)] +pub enum NanoError { + #[error("parse error: {0}")] + Parse(String), + + #[error("catalog error: {0}")] + Catalog(String), + + #[error("type error: 
{0}")] + Type(String), + + #[error("storage error: {0}")] + Storage(String), + + #[error( + "@unique constraint violation on {type_name}.{property}: duplicate value '{value}' at rows {first_row} and {second_row}" + )] + UniqueConstraint { + type_name: String, + property: String, + value: String, + first_row: usize, + second_row: usize, + }, + + #[error("plan error: {0}")] + Plan(String), + + #[error("execution error: {0}")] + Execution(String), + + #[error(transparent)] + Arrow(#[from] arrow_schema::ArrowError), + + #[error("io error: {0}")] + Io(#[from] std::io::Error), + + #[error("lance error: {0}")] + Lance(String), + + #[error("manifest error: {0}")] + Manifest(String), +} + +pub type Result = std::result::Result; + +#[cfg(test)] +mod tests { + use super::{SourceSpan, decode_string_literal, render_span}; + + #[test] + fn source_span_preserves_zero_width() { + let span = SourceSpan::new(7, 7); + assert_eq!(span.start, 7); + assert_eq!(span.end, 7); + } + + #[test] + fn render_span_widens_zero_width_for_diagnostics() { + let rendered = render_span(SourceSpan::new(7, 7)); + assert_eq!(rendered.start, 7); + assert_eq!(rendered.end, 8); + } + + #[test] + fn decode_string_literal_supports_common_escapes() { + let decoded = decode_string_literal("\"a\\n\\r\\t\\\\\\\"b\"").unwrap(); + assert_eq!(decoded, "a\n\r\t\\\"b"); + } +} diff --git a/crates/omnigraph-compiler/src/ir/lower.rs b/crates/omnigraph-compiler/src/ir/lower.rs new file mode 100644 index 0000000..c7a4fb8 --- /dev/null +++ b/crates/omnigraph-compiler/src/ir/lower.rs @@ -0,0 +1,657 @@ +use std::collections::HashSet; + +use crate::catalog::Catalog; +use crate::error::Result; +use crate::query::ast::*; +use crate::query::typecheck::TypeContext; +use crate::types::Direction; + +use super::*; + +pub fn lower_query( + catalog: &Catalog, + query: &QueryDecl, + type_ctx: &TypeContext, +) -> Result { + if query.mutation.is_some() { + return Err(crate::error::NanoError::Plan( + "cannot lower mutation query with 
read-query lowerer".to_string(), + )); + } + let param_names: HashSet = query.params.iter().map(|p| p.name.clone()).collect(); + + let mut pipeline = Vec::new(); + let mut bound_vars = HashSet::new(); + + lower_clauses( + catalog, + &query.match_clause, + type_ctx, + &mut pipeline, + &mut bound_vars, + ¶m_names, + )?; + + let return_exprs: Vec = query + .return_clause + .iter() + .map(|p| IRProjection { + expr: lower_expr(&p.expr, ¶m_names), + alias: p.alias.clone(), + }) + .collect(); + + let order_by: Vec = query + .order_clause + .iter() + .map(|o| IROrdering { + expr: lower_expr(&o.expr, ¶m_names), + descending: o.descending, + }) + .collect(); + + Ok(QueryIR { + name: query.name.clone(), + params: query.params.clone(), + pipeline, + return_exprs, + order_by, + limit: query.limit, + }) +} + +pub fn lower_mutation_query(query: &QueryDecl) -> Result { + let mutation = query.mutation.as_ref().ok_or_else(|| { + crate::error::NanoError::Plan("query does not contain a mutation body".to_string()) + })?; + let param_names: HashSet = query.params.iter().map(|p| p.name.clone()).collect(); + + let op = match mutation { + Mutation::Insert(insert) => MutationOpIR::Insert { + type_name: insert.type_name.clone(), + assignments: insert + .assignments + .iter() + .map(|a| IRAssignment { + property: a.property.clone(), + value: lower_match_value(&a.value, ¶m_names), + }) + .collect(), + }, + Mutation::Update(update) => MutationOpIR::Update { + type_name: update.type_name.clone(), + assignments: update + .assignments + .iter() + .map(|a| IRAssignment { + property: a.property.clone(), + value: lower_match_value(&a.value, ¶m_names), + }) + .collect(), + predicate: IRMutationPredicate { + property: update.predicate.property.clone(), + op: update.predicate.op, + value: lower_match_value(&update.predicate.value, ¶m_names), + }, + }, + Mutation::Delete(delete) => MutationOpIR::Delete { + type_name: delete.type_name.clone(), + predicate: IRMutationPredicate { + property: 
delete.predicate.property.clone(), + op: delete.predicate.op, + value: lower_match_value(&delete.predicate.value, ¶m_names), + }, + }, + }; + + Ok(MutationIR { + name: query.name.clone(), + params: query.params.clone(), + op, + }) +} + +fn lower_clauses( + catalog: &Catalog, + clauses: &[Clause], + type_ctx: &TypeContext, + pipeline: &mut Vec, + bound_vars: &mut HashSet, + param_names: &HashSet, +) -> Result<()> { + // Separate clause types for ordering: bindings first, then traversals, then filters + let mut bindings = Vec::new(); + let mut traversals = Vec::new(); + let mut filters = Vec::new(); + let mut negations = Vec::new(); + + for clause in clauses { + match clause { + Clause::Binding(b) => bindings.push(b), + Clause::Traversal(t) => traversals.push(t), + Clause::Filter(f) => filters.push(f), + Clause::Negation(inner) => negations.push(inner), + } + } + + // Lower bindings into NodeScan ops + for binding in &bindings { + let node_type = catalog + .node_types + .get(&binding.type_name) + .expect("binding type was validated during typecheck"); + // Collect inline filters from prop matches + let mut scan_filters = Vec::new(); + for pm in &binding.prop_matches { + let prop = node_type + .properties + .get(&pm.prop_name) + .expect("binding property was validated during typecheck"); + let op = if prop.list { + CompOp::Contains + } else { + CompOp::Eq + }; + match &pm.value { + MatchValue::Literal(lit) => { + scan_filters.push(IRFilter { + left: IRExpr::PropAccess { + variable: binding.variable.clone(), + property: pm.prop_name.clone(), + }, + op, + right: IRExpr::Literal(lit.clone()), + }); + } + MatchValue::Now => { + scan_filters.push(IRFilter { + left: IRExpr::PropAccess { + variable: binding.variable.clone(), + property: pm.prop_name.clone(), + }, + op, + right: IRExpr::Param(NOW_PARAM_NAME.to_string()), + }); + } + MatchValue::Variable(v) => { + let right = if param_names.contains(v) { + IRExpr::Param(v.clone()) + } else { + IRExpr::Variable(v.clone()) + }; 
+ scan_filters.push(IRFilter { + left: IRExpr::PropAccess { + variable: binding.variable.clone(), + property: pm.prop_name.clone(), + }, + op, + right, + }); + } + } + } + + pipeline.push(IROp::NodeScan { + variable: binding.variable.clone(), + type_name: binding.type_name.clone(), + filters: scan_filters, + }); + bound_vars.insert(binding.variable.clone()); + } + + // Lower traversals into Expand ops + // Handle "cycle closing" — if both src and dst are already bound, use a filter + for traversal in &traversals { + let edge = catalog + .lookup_edge_by_name(&traversal.edge_name) + .ok_or_else(|| { + crate::error::NanoError::Plan(format!( + "lowering traversal referenced missing edge '{}' after typecheck", + traversal.edge_name + )) + })?; + + // Determine direction from type context + let direction = type_ctx + .traversals + .iter() + .find(|rt| { + rt.src == traversal.src && rt.dst == traversal.dst && rt.edge_type == edge.name + }) + .map(|rt| rt.direction) + .unwrap_or(Direction::Out); + + let dst_type = match direction { + Direction::Out => edge.to_type.clone(), + Direction::In => edge.from_type.clone(), + }; + + if bound_vars.contains(&traversal.src) && bound_vars.contains(&traversal.dst) { + // Cycle closing: emit expand to a temp var, then filter temp.id = dst.id + let temp_var = format!("__temp_{}", traversal.dst); + pipeline.push(IROp::Expand { + src_var: traversal.src.clone(), + dst_var: temp_var.clone(), + edge_type: edge.name.clone(), + direction, + dst_type, + min_hops: traversal.min_hops, + max_hops: traversal.max_hops, + }); + pipeline.push(IROp::Filter(IRFilter { + left: IRExpr::PropAccess { + variable: temp_var, + property: "id".to_string(), + }, + op: CompOp::Eq, + right: IRExpr::PropAccess { + variable: traversal.dst.clone(), + property: "id".to_string(), + }, + })); + } else if !bound_vars.contains(&traversal.src) && bound_vars.contains(&traversal.dst) { + // Reverse expand: dst is bound, src is not. 
+ // Swap direction and expand from dst to discover src. + let reverse_dir = match direction { + Direction::Out => Direction::In, + Direction::In => Direction::Out, + }; + let src_type = match direction { + Direction::Out => edge.from_type.clone(), + Direction::In => edge.to_type.clone(), + }; + pipeline.push(IROp::Expand { + src_var: traversal.dst.clone(), + dst_var: traversal.src.clone(), + edge_type: edge.name.clone(), + direction: reverse_dir, + dst_type: src_type, + min_hops: traversal.min_hops, + max_hops: traversal.max_hops, + }); + if traversal.src != "_" { + bound_vars.insert(traversal.src.clone()); + } + } else { + pipeline.push(IROp::Expand { + src_var: traversal.src.clone(), + dst_var: traversal.dst.clone(), + edge_type: edge.name.clone(), + direction, + dst_type, + min_hops: traversal.min_hops, + max_hops: traversal.max_hops, + }); + if traversal.dst != "_" { + bound_vars.insert(traversal.dst.clone()); + } + } + } + + // Lower explicit filters + for filter in &filters { + pipeline.push(IROp::Filter(IRFilter { + left: lower_expr(&filter.left, param_names), + op: filter.op, + right: lower_expr(&filter.right, param_names), + })); + } + + // Lower negations into AntiJoin ops + for neg_clauses in &negations { + // Find outer-bound variable referenced in the negation + let outer_var = find_outer_var(neg_clauses, bound_vars); + + let mut inner_pipeline = Vec::new(); + let mut inner_bound = bound_vars.clone(); + lower_clauses( + catalog, + neg_clauses, + type_ctx, + &mut inner_pipeline, + &mut inner_bound, + param_names, + )?; + + pipeline.push(IROp::AntiJoin { + outer_var: outer_var.unwrap_or_default(), + inner: inner_pipeline, + }); + } + + Ok(()) +} + +fn find_outer_var(clauses: &[Clause], outer_bound: &HashSet) -> Option { + for clause in clauses { + match clause { + Clause::Traversal(t) => { + if outer_bound.contains(&t.src) { + return Some(t.src.clone()); + } + if outer_bound.contains(&t.dst) { + return Some(t.dst.clone()); + } + } + Clause::Filter(f) => 
{ + if let Some(v) = expr_var(&f.left) + && outer_bound.contains(&v) + { + return Some(v); + } + if let Some(v) = expr_var(&f.right) + && outer_bound.contains(&v) + { + return Some(v); + } + } + Clause::Binding(b) => { + if outer_bound.contains(&b.variable) { + return Some(b.variable.clone()); + } + } + _ => {} + } + } + None +} + +fn expr_var(expr: &Expr) -> Option { + match expr { + Expr::Now => None, + Expr::PropAccess { variable, .. } => Some(variable.clone()), + Expr::Variable(v) => Some(v.clone()), + Expr::Nearest { variable, .. } => Some(variable.clone()), + Expr::Search { field, query } => expr_var(field).or_else(|| expr_var(query)), + Expr::Fuzzy { + field, + query, + max_edits, + } => expr_var(field) + .or_else(|| expr_var(query)) + .or_else(|| max_edits.as_deref().and_then(expr_var)), + Expr::MatchText { field, query } => expr_var(field).or_else(|| expr_var(query)), + Expr::Bm25 { field, query } => expr_var(field).or_else(|| expr_var(query)), + Expr::Rrf { + primary, + secondary, + k, + } => expr_var(primary) + .or_else(|| expr_var(secondary)) + .or_else(|| k.as_deref().and_then(expr_var)), + Expr::Aggregate { arg, .. 
} => expr_var(arg), + _ => None, + } +} + +fn lower_expr(expr: &Expr, param_names: &HashSet) -> IRExpr { + match expr { + Expr::Now => IRExpr::Param(NOW_PARAM_NAME.to_string()), + Expr::PropAccess { variable, property } => IRExpr::PropAccess { + variable: variable.clone(), + property: property.clone(), + }, + Expr::Nearest { + variable, + property, + query, + } => IRExpr::Nearest { + variable: variable.clone(), + property: property.clone(), + query: Box::new(lower_expr(query, param_names)), + }, + Expr::Search { field, query } => IRExpr::Search { + field: Box::new(lower_expr(field, param_names)), + query: Box::new(lower_expr(query, param_names)), + }, + Expr::Fuzzy { + field, + query, + max_edits, + } => IRExpr::Fuzzy { + field: Box::new(lower_expr(field, param_names)), + query: Box::new(lower_expr(query, param_names)), + max_edits: max_edits + .as_ref() + .map(|expr| Box::new(lower_expr(expr, param_names))), + }, + Expr::MatchText { field, query } => IRExpr::MatchText { + field: Box::new(lower_expr(field, param_names)), + query: Box::new(lower_expr(query, param_names)), + }, + Expr::Bm25 { field, query } => IRExpr::Bm25 { + field: Box::new(lower_expr(field, param_names)), + query: Box::new(lower_expr(query, param_names)), + }, + Expr::Rrf { + primary, + secondary, + k, + } => IRExpr::Rrf { + primary: Box::new(lower_expr(primary, param_names)), + secondary: Box::new(lower_expr(secondary, param_names)), + k: k.as_ref() + .map(|expr| Box::new(lower_expr(expr, param_names))), + }, + Expr::Variable(v) => { + if param_names.contains(v) { + IRExpr::Param(v.clone()) + } else { + IRExpr::Variable(v.clone()) + } + } + Expr::Literal(l) => IRExpr::Literal(l.clone()), + Expr::Aggregate { func, arg } => IRExpr::Aggregate { + func: *func, + arg: Box::new(lower_expr(arg, param_names)), + }, + Expr::AliasRef(name) => IRExpr::AliasRef(name.clone()), + } +} + +fn lower_match_value(value: &MatchValue, param_names: &HashSet) -> IRExpr { + match value { + MatchValue::Now => 
IRExpr::Param(NOW_PARAM_NAME.to_string()), + MatchValue::Literal(l) => IRExpr::Literal(l.clone()), + MatchValue::Variable(v) => { + if param_names.contains(v) { + IRExpr::Param(v.clone()) + } else { + IRExpr::Variable(v.clone()) + } + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::catalog::build_catalog; + use crate::query::parser::parse_query; + use crate::query::typecheck::{CheckedQuery, typecheck_query, typecheck_query_decl}; + use crate::schema::parser::parse_schema; + + fn setup() -> Catalog { + let schema = parse_schema( + r#" +node Person { name: String age: I32? } +node Company { name: String } +edge Knows: Person -> Person { since: Date? } +edge WorksAt: Person -> Company +"#, + ) + .unwrap(); + build_catalog(&schema).unwrap() + } + + #[test] + fn test_lower_basic() { + let catalog = setup(); + let qf = parse_query( + r#" +query q($name: String) { + match { + $p: Person { name: $name } + $p knows $f + } + return { $f.name, $f.age } +} +"#, + ) + .unwrap(); + let tc = typecheck_query(&catalog, &qf.queries[0]).unwrap(); + let ir = lower_query(&catalog, &qf.queries[0], &tc).unwrap(); + + assert_eq!(ir.pipeline.len(), 2); // NodeScan + Expand + assert_eq!(ir.return_exprs.len(), 2); + } + + #[test] + fn test_lower_negation() { + let catalog = setup(); + let qf = parse_query( + r#" +query q() { + match { + $p: Person + not { $p worksAt $_ } + } + return { $p.name } +} +"#, + ) + .unwrap(); + let tc = typecheck_query(&catalog, &qf.queries[0]).unwrap(); + let ir = lower_query(&catalog, &qf.queries[0], &tc).unwrap(); + + assert_eq!(ir.pipeline.len(), 2); // NodeScan + AntiJoin + assert!(matches!(&ir.pipeline[1], IROp::AntiJoin { .. 
})); + } + + #[test] + fn test_lower_mutation_update() { + let catalog = setup(); + let qf = parse_query( + r#" +query q($name: String, $age: I32) { + update Person set { age: $age } where name = $name +} +"#, + ) + .unwrap(); + let checked = typecheck_query_decl(&catalog, &qf.queries[0]).unwrap(); + assert!(matches!(checked, CheckedQuery::Mutation(_))); + + let ir = lower_mutation_query(&qf.queries[0]).unwrap(); + match ir.op { + MutationOpIR::Update { + type_name, + assignments, + predicate, + } => { + assert_eq!(type_name, "Person"); + assert_eq!(assignments.len(), 1); + assert_eq!(assignments[0].property, "age"); + assert_eq!(predicate.property, "name"); + } + _ => panic!("expected update mutation op"), + } + } + + #[test] + fn test_lower_bounded_traversal() { + let catalog = setup(); + let qf = parse_query( + r#" +query q() { + match { + $p: Person + $p knows{1,3} $f + } + return { $f.name } +} +"#, + ) + .unwrap(); + let tc = typecheck_query(&catalog, &qf.queries[0]).unwrap(); + let ir = lower_query(&catalog, &qf.queries[0], &tc).unwrap(); + let expand = ir + .pipeline + .iter() + .find_map(|op| match op { + IROp::Expand { + min_hops, max_hops, .. + } => Some((*min_hops, *max_hops)), + _ => None, + }) + .expect("expected expand op"); + assert_eq!(expand.0, 1); + assert_eq!(expand.1, Some(3)); + } + + #[test] + fn test_lower_now_uses_reserved_runtime_param() { + let catalog = setup(); + let qf = parse_query( + r#" +query stamp() { + match { $p: Person } + return { now() as ts } +} +"#, + ) + .unwrap(); + let tc = typecheck_query(&catalog, &qf.queries[0]).unwrap(); + let ir = lower_query(&catalog, &qf.queries[0], &tc).unwrap(); + + assert!(matches!( + ir.return_exprs[0].expr, + IRExpr::Param(ref name) if name == NOW_PARAM_NAME + )); + } + + #[test] + fn test_lower_mutation_now_uses_reserved_runtime_param() { + let catalog = build_catalog( + &parse_schema( + r#" +node Event { + slug: String @key + updated_at: DateTime? 
+} +"#, + ) + .unwrap(), + ) + .unwrap(); + let qf = parse_query( + r#" +query stamp() { + update Event set { updated_at: now() } where updated_at = now() +} +"#, + ) + .unwrap(); + let checked = typecheck_query_decl(&catalog, &qf.queries[0]).unwrap(); + assert!(matches!(checked, CheckedQuery::Mutation(_))); + + let ir = lower_mutation_query(&qf.queries[0]).unwrap(); + match ir.op { + MutationOpIR::Update { + assignments, + predicate, + .. + } => { + assert!(matches!( + assignments[0].value, + IRExpr::Param(ref name) if name == NOW_PARAM_NAME + )); + assert!(matches!( + predicate.value, + IRExpr::Param(ref name) if name == NOW_PARAM_NAME + )); + } + _ => panic!("expected update mutation op"), + } + } +} diff --git a/crates/omnigraph-compiler/src/ir/mod.rs b/crates/omnigraph-compiler/src/ir/mod.rs new file mode 100644 index 0000000..7768b1b --- /dev/null +++ b/crates/omnigraph-compiler/src/ir/mod.rs @@ -0,0 +1,143 @@ +pub(crate) mod lower; + +use std::collections::HashMap; + +use crate::query::ast::{AggFunc, CompOp, Literal, Param}; +use crate::types::Direction; + +#[derive(Debug, Clone)] +pub struct QueryIR { + pub name: String, + pub params: Vec, + pub pipeline: Vec, + pub return_exprs: Vec, + pub order_by: Vec, + pub limit: Option, +} + +#[derive(Debug, Clone)] +pub struct MutationIR { + pub name: String, + pub params: Vec, + pub op: MutationOpIR, +} + +#[derive(Debug, Clone)] +pub enum MutationOpIR { + Insert { + type_name: String, + assignments: Vec, + }, + Update { + type_name: String, + assignments: Vec, + predicate: IRMutationPredicate, + }, + Delete { + type_name: String, + predicate: IRMutationPredicate, + }, +} + +#[derive(Debug, Clone)] +pub struct IRAssignment { + pub property: String, + pub value: IRExpr, +} + +#[derive(Debug, Clone)] +pub struct IRMutationPredicate { + pub property: String, + pub op: CompOp, + pub value: IRExpr, +} + +/// Resolved runtime parameters: param name → literal value. 
+pub type ParamMap = HashMap; + +#[derive(Debug, Clone)] +pub enum IROp { + NodeScan { + variable: String, + type_name: String, + filters: Vec, + }, + Expand { + src_var: String, + dst_var: String, + edge_type: String, + direction: Direction, + dst_type: String, + min_hops: u32, + max_hops: Option, + }, + Filter(IRFilter), + AntiJoin { + /// The outer variable whose id is used for the join key + outer_var: String, + /// The inner pipeline that produces rows to anti-join against + inner: Vec, + }, +} + +#[derive(Debug, Clone)] +pub struct IRFilter { + pub left: IRExpr, + pub op: CompOp, + pub right: IRExpr, +} + +#[derive(Debug, Clone)] +pub enum IRExpr { + PropAccess { + variable: String, + property: String, + }, + Nearest { + variable: String, + property: String, + query: Box, + }, + Search { + field: Box, + query: Box, + }, + Fuzzy { + field: Box, + query: Box, + max_edits: Option>, + }, + MatchText { + field: Box, + query: Box, + }, + Bm25 { + field: Box, + query: Box, + }, + Rrf { + primary: Box, + secondary: Box, + k: Option>, + }, + Variable(String), + Param(String), + Literal(Literal), + Aggregate { + func: AggFunc, + arg: Box, + }, + AliasRef(String), +} + +#[derive(Debug, Clone)] +pub struct IRProjection { + pub expr: IRExpr, + pub alias: Option, +} + +#[derive(Debug, Clone)] +pub struct IROrdering { + pub expr: IRExpr, + pub descending: bool, +} diff --git a/crates/omnigraph-compiler/src/json_output.rs b/crates/omnigraph-compiler/src/json_output.rs new file mode 100644 index 0000000..9ebc1c6 --- /dev/null +++ b/crates/omnigraph-compiler/src/json_output.rs @@ -0,0 +1,352 @@ +use arrow_array::{ + Array, ArrayRef, BooleanArray, Date32Array, Date64Array, FixedSizeListArray, Float32Array, + Float64Array, Int32Array, Int64Array, ListArray, RecordBatch, StringArray, StructArray, + UInt32Array, UInt64Array, +}; +use arrow_schema::DataType; + +pub const JS_MAX_SAFE_INTEGER_I64: i64 = 9_007_199_254_740_991; +pub const JS_MAX_SAFE_INTEGER_U64: u64 = 
9_007_199_254_740_991; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum JsonIntegerMode { + JavaScript, + Native, +} + +pub fn is_js_safe_integer_i64(value: i64) -> bool { + (-JS_MAX_SAFE_INTEGER_I64..=JS_MAX_SAFE_INTEGER_I64).contains(&value) +} + +/// Convert Arrow RecordBatches into a Vec of JSON objects (one per row). +pub fn record_batches_to_json_rows(results: &[RecordBatch]) -> Vec { + record_batches_to_json_rows_with_mode(results, JsonIntegerMode::JavaScript) +} + +/// Convert Arrow RecordBatches into JSON rows without JS-safe integer coercion. +pub fn record_batches_to_rust_json_rows(results: &[RecordBatch]) -> Vec { + record_batches_to_json_rows_with_mode(results, JsonIntegerMode::Native) +} + +fn record_batches_to_json_rows_with_mode( + results: &[RecordBatch], + integer_mode: JsonIntegerMode, +) -> Vec { + let total_rows = results.iter().map(RecordBatch::num_rows).sum(); + let mut out = Vec::with_capacity(total_rows); + for batch in results { + let schema = batch.schema(); + for row in 0..batch.num_rows() { + let mut map = serde_json::Map::new(); + for (col_idx, field) in schema.fields().iter().enumerate() { + let col_arr = batch.column(col_idx); + map.insert( + field.name().clone(), + array_value_to_json_with_mode(col_arr, row, integer_mode), + ); + } + out.push(serde_json::Value::Object(map)); + } + } + out +} + +/// Convert a single cell from an Arrow array to a serde_json::Value. 
+pub fn array_value_to_json(array: &ArrayRef, row: usize) -> serde_json::Value { + array_value_to_json_with_mode(array, row, JsonIntegerMode::JavaScript) +} + +fn array_value_to_json_with_mode( + array: &ArrayRef, + row: usize, + integer_mode: JsonIntegerMode, +) -> serde_json::Value { + if array.is_null(row) { + return serde_json::Value::Null; + } + + match array.data_type() { + DataType::Utf8 => array + .as_any() + .downcast_ref::() + .map(|a| serde_json::Value::String(a.value(row).to_string())) + .unwrap_or(serde_json::Value::Null), + DataType::Boolean => array + .as_any() + .downcast_ref::() + .map(|a| serde_json::Value::Bool(a.value(row))) + .unwrap_or(serde_json::Value::Null), + DataType::Int32 => array + .as_any() + .downcast_ref::() + .map(|a| serde_json::Value::Number((a.value(row) as i64).into())) + .unwrap_or(serde_json::Value::Null), + DataType::Int64 => array + .as_any() + .downcast_ref::() + .map(|a| { + let value = a.value(row); + match integer_mode { + JsonIntegerMode::JavaScript if !is_js_safe_integer_i64(value) => { + serde_json::Value::String(value.to_string()) + } + JsonIntegerMode::JavaScript | JsonIntegerMode::Native => { + serde_json::Value::Number(value.into()) + } + } + }) + .unwrap_or(serde_json::Value::Null), + DataType::UInt32 => array + .as_any() + .downcast_ref::() + .map(|a| serde_json::Value::Number((a.value(row) as u64).into())) + .unwrap_or(serde_json::Value::Null), + DataType::UInt64 => array + .as_any() + .downcast_ref::() + .map(|a| { + let value = a.value(row); + match integer_mode { + JsonIntegerMode::JavaScript if value > JS_MAX_SAFE_INTEGER_U64 => { + serde_json::Value::String(value.to_string()) + } + JsonIntegerMode::JavaScript | JsonIntegerMode::Native => { + serde_json::Value::Number(value.into()) + } + } + }) + .unwrap_or(serde_json::Value::Null), + DataType::Float32 => array + .as_any() + .downcast_ref::() + .map(|a| json_float_value(a.value(row) as f64)) + .unwrap_or(serde_json::Value::Null), + DataType::Float64 => 
array + .as_any() + .downcast_ref::() + .map(|a| json_float_value(a.value(row))) + .unwrap_or(serde_json::Value::Null), + DataType::Date32 => array + .as_any() + .downcast_ref::() + .map(|a| { + let days = a.value(row); + arrow_array::temporal_conversions::date32_to_datetime(days) + .map(|dt| serde_json::Value::String(dt.format("%Y-%m-%d").to_string())) + .unwrap_or_else(|| serde_json::Value::Number((days as i64).into())) + }) + .unwrap_or(serde_json::Value::Null), + DataType::Date64 => array + .as_any() + .downcast_ref::() + .map(|a| { + let ms = a.value(row); + arrow_array::temporal_conversions::date64_to_datetime(ms) + .map(|dt| { + serde_json::Value::String(dt.format("%Y-%m-%dT%H:%M:%S%.3fZ").to_string()) + }) + .unwrap_or_else(|| serde_json::Value::Number(ms.into())) + }) + .unwrap_or(serde_json::Value::Null), + DataType::List(_) => array + .as_any() + .downcast_ref::() + .map(|a| { + let values = a.value(row); + serde_json::Value::Array( + (0..values.len()) + .map(|idx| array_value_to_json_with_mode(&values, idx, integer_mode)) + .collect(), + ) + }) + .unwrap_or(serde_json::Value::Null), + DataType::FixedSizeList(_, _) => array + .as_any() + .downcast_ref::() + .map(|a| fixed_size_list_value_to_json(a, row, integer_mode)) + .unwrap_or(serde_json::Value::Null), + DataType::Struct(_) => array + .as_any() + .downcast_ref::() + .map(|struct_arr| { + let mut obj = serde_json::Map::new(); + for (i, field) in struct_arr.fields().iter().enumerate() { + let col = struct_arr.column(i); + obj.insert( + field.name().clone(), + array_value_to_json_with_mode(col, row, integer_mode), + ); + } + serde_json::Value::Object(obj) + }) + .unwrap_or(serde_json::Value::Null), + _ => { + let display = + arrow_cast::display::array_value_to_string(array, row).unwrap_or_default(); + serde_json::Value::String(display) + } + } +} + +fn json_float_value(value: f64) -> serde_json::Value { + if value.is_nan() { + return serde_json::Value::String("NaN".to_string()); + } + if value == 
f64::INFINITY { + return serde_json::Value::String("Infinity".to_string()); + } + if value == f64::NEG_INFINITY { + return serde_json::Value::String("-Infinity".to_string()); + } + + serde_json::Number::from_f64(value) + .map(serde_json::Value::Number) + .unwrap_or(serde_json::Value::Null) +} + +fn fixed_size_list_value_to_json( + array: &FixedSizeListArray, + row: usize, + integer_mode: JsonIntegerMode, +) -> serde_json::Value { + let value_len = array.value_length() as usize; + let values = array.values(); + if let Some(float_values) = values.as_any().downcast_ref::() { + let start = row.saturating_mul(value_len); + return float32_json_array(float_values, start, value_len); + } + + let values = array.value(row); + serde_json::Value::Array( + (0..values.len()) + .map(|idx| array_value_to_json_with_mode(&values, idx, integer_mode)) + .collect(), + ) +} + +fn float32_json_array(values: &Float32Array, start: usize, len: usize) -> serde_json::Value { + let mut out = Vec::with_capacity(len); + let end = start.saturating_add(len).min(values.len()); + for idx in start..end { + if values.is_null(idx) { + out.push(serde_json::Value::Null); + continue; + } + let value = values.value(idx) as f64; + out.push( + serde_json::Number::from_f64(value) + .map(serde_json::Value::Number) + .unwrap_or(serde_json::Value::Null), + ); + } + serde_json::Value::Array(out) +} + +#[cfg(test)] +mod tests { + use super::{array_value_to_json, record_batches_to_rust_json_rows}; + use std::sync::Arc; + + use arrow_array::builder::{FixedSizeListBuilder, Float32Builder}; + use arrow_array::{ArrayRef, Float64Array, Int64Array, RecordBatch, UInt64Array}; + use arrow_schema::{DataType, Field, Schema}; + + #[test] + fn int64_outside_js_safe_range_is_stringified() { + let values: ArrayRef = Arc::new(Int64Array::from(vec![Some(9_007_199_254_740_992)])); + assert_eq!( + array_value_to_json(&values, 0), + serde_json::Value::String("9007199254740992".to_string()) + ); + } + + #[test] + fn 
uint64_outside_js_safe_range_is_stringified() { + let values: ArrayRef = Arc::new(UInt64Array::from(vec![Some(9_007_199_254_740_992)])); + assert_eq!( + array_value_to_json(&values, 0), + serde_json::Value::String("9007199254740992".to_string()) + ); + } + + #[test] + fn uint64_within_js_safe_range_stays_numeric() { + let values: ArrayRef = Arc::new(UInt64Array::from(vec![Some(9_007_199_254_740_991)])); + assert_eq!( + array_value_to_json(&values, 0), + serde_json::json!(9_007_199_254_740_991u64) + ); + } + + #[test] + fn rust_json_rows_preserve_full_width_integers() { + let schema = Arc::new(Schema::new(vec![ + Field::new("signed", DataType::Int64, false), + Field::new("unsigned", DataType::UInt64, false), + ])); + let batch = RecordBatch::try_new( + schema, + vec![ + Arc::new(Int64Array::from(vec![i64::MIN])), + Arc::new(UInt64Array::from(vec![u64::MAX])), + ], + ) + .expect("batch"); + + assert_eq!( + record_batches_to_rust_json_rows(&[batch]), + vec![serde_json::json!({ + "signed": i64::MIN, + "unsigned": u64::MAX, + })] + ); + } + + #[test] + fn fixed_size_float32_vectors_serialize_without_recursive_dispatch() { + let mut builder = FixedSizeListBuilder::new(Float32Builder::new(), 3); + builder.values().append_value(0.25); + builder.values().append_value(0.5); + builder.values().append_value(0.75); + builder.append(true); + + for _ in 0..3 { + builder.values().append_null(); + } + builder.append(false); + + builder.values().append_value(1.0); + builder.values().append_value(2.0); + builder.values().append_value(3.0); + builder.append(true); + + let values: ArrayRef = Arc::new(builder.finish()); + assert_eq!( + array_value_to_json(&values, 0), + serde_json::json!([0.25, 0.5, 0.75]) + ); + assert_eq!(array_value_to_json(&values, 1), serde_json::Value::Null); + assert_eq!( + array_value_to_json(&values, 2), + serde_json::json!([1.0, 2.0, 3.0]) + ); + } + + #[test] + fn non_finite_floats_are_stringified() { + let values: ArrayRef = Arc::new(Float64Array::from(vec![ 
+ Some(f64::NAN), + Some(f64::INFINITY), + Some(f64::NEG_INFINITY), + ])); + assert_eq!(array_value_to_json(&values, 0), serde_json::json!("NaN")); + assert_eq!( + array_value_to_json(&values, 1), + serde_json::json!("Infinity") + ); + assert_eq!( + array_value_to_json(&values, 2), + serde_json::json!("-Infinity") + ); + } +} diff --git a/crates/omnigraph-compiler/src/lib.rs b/crates/omnigraph-compiler/src/lib.rs new file mode 100644 index 0000000..3c63367 --- /dev/null +++ b/crates/omnigraph-compiler/src/lib.rs @@ -0,0 +1,28 @@ +pub mod catalog; +pub mod embedding; +pub mod error; +pub mod ir; +pub mod json_output; +pub mod query; +pub mod query_input; +pub mod result; +pub mod schema; +pub mod types; + +pub use catalog::build_catalog; +pub use catalog::schema_ir::{ + SchemaIR, build_catalog_from_ir, build_schema_ir, schema_ir_hash, schema_ir_json, + schema_ir_pretty_json, +}; +pub use catalog::schema_plan::{ + SchemaMigrationPlan, SchemaMigrationStep, SchemaTypeKind, plan_schema_migration, +}; +pub use ir::ParamMap; +pub use ir::lower::{lower_mutation_query, lower_query}; +pub use query::ast::Literal; +pub use query_input::{ + JsonParamMode, RunInputError, RunInputResult, ToParam, find_named_query, + json_params_to_param_map, +}; +pub use result::{MutationExecResult, MutationResult, QueryResult, RunResult}; +pub use types::{Direction, PropType, ScalarType}; diff --git a/crates/omnigraph-compiler/src/query/ast.rs b/crates/omnigraph-compiler/src/query/ast.rs new file mode 100644 index 0000000..4f62688 --- /dev/null +++ b/crates/omnigraph-compiler/src/query/ast.rs @@ -0,0 +1,221 @@ +pub const NOW_PARAM_NAME: &str = "__nanograph_now"; + +#[derive(Debug, Clone)] +pub struct QueryFile { + pub queries: Vec, +} + +#[derive(Debug, Clone)] +pub struct QueryDecl { + pub name: String, + pub description: Option, + pub instruction: Option, + pub params: Vec, + pub match_clause: Vec, + pub return_clause: Vec, + pub order_clause: Vec, + pub limit: Option, + pub mutation: Option, 
+} + +#[derive(Debug, Clone)] +pub struct Param { + pub name: String, + pub type_name: String, + pub nullable: bool, +} + +#[derive(Debug, Clone)] +pub enum Clause { + Binding(Binding), + Traversal(Traversal), + Filter(Filter), + Negation(Vec), +} + +#[derive(Debug, Clone)] +pub struct Binding { + pub variable: String, + pub type_name: String, + pub prop_matches: Vec, +} + +#[derive(Debug, Clone)] +pub struct PropMatch { + pub prop_name: String, + pub value: MatchValue, +} + +#[derive(Debug, Clone)] +pub enum MatchValue { + Literal(Literal), + Variable(String), + Now, +} + +#[derive(Debug, Clone)] +pub struct Traversal { + pub src: String, + pub edge_name: String, + pub dst: String, + pub min_hops: u32, + pub max_hops: Option, +} + +#[derive(Debug, Clone)] +pub struct Filter { + pub left: Expr, + pub op: CompOp, + pub right: Expr, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum CompOp { + Eq, + Ne, + Gt, + Lt, + Ge, + Le, + Contains, +} + +impl std::fmt::Display for CompOp { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Eq => write!(f, "="), + Self::Ne => write!(f, "!="), + Self::Gt => write!(f, ">"), + Self::Lt => write!(f, "<"), + Self::Ge => write!(f, ">="), + Self::Le => write!(f, "<="), + Self::Contains => write!(f, "contains"), + } + } +} + +#[derive(Debug, Clone)] +pub enum Expr { + Now, + PropAccess { + variable: String, + property: String, + }, + Nearest { + variable: String, + property: String, + query: Box, + }, + Search { + field: Box, + query: Box, + }, + Fuzzy { + field: Box, + query: Box, + max_edits: Option>, + }, + MatchText { + field: Box, + query: Box, + }, + Bm25 { + field: Box, + query: Box, + }, + Rrf { + primary: Box, + secondary: Box, + k: Option>, + }, + Variable(String), + Literal(Literal), + Aggregate { + func: AggFunc, + arg: Box, + }, + AliasRef(String), +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum AggFunc { + Count, + Sum, + Avg, + Min, + Max, +} + +impl 
std::fmt::Display for AggFunc { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match self { + Self::Count => write!(f, "count"), + Self::Sum => write!(f, "sum"), + Self::Avg => write!(f, "avg"), + Self::Min => write!(f, "min"), + Self::Max => write!(f, "max"), + } + } +} + +#[derive(Debug, Clone)] +pub enum Literal { + String(String), + Integer(i64), + Float(f64), + Bool(bool), + Date(String), + DateTime(String), + List(Vec), +} + +#[derive(Debug, Clone)] +pub struct Projection { + pub expr: Expr, + pub alias: Option, +} + +#[derive(Debug, Clone)] +pub struct Ordering { + pub expr: Expr, + pub descending: bool, +} + +#[derive(Debug, Clone)] +pub enum Mutation { + Insert(InsertMutation), + Update(UpdateMutation), + Delete(DeleteMutation), +} + +#[derive(Debug, Clone)] +pub struct InsertMutation { + pub type_name: String, + pub assignments: Vec, +} + +#[derive(Debug, Clone)] +pub struct UpdateMutation { + pub type_name: String, + pub assignments: Vec, + pub predicate: MutationPredicate, +} + +#[derive(Debug, Clone)] +pub struct DeleteMutation { + pub type_name: String, + pub predicate: MutationPredicate, +} + +#[derive(Debug, Clone)] +pub struct MutationAssignment { + pub property: String, + pub value: MatchValue, +} + +#[derive(Debug, Clone)] +pub struct MutationPredicate { + pub property: String, + pub op: CompOp, + pub value: MatchValue, +} diff --git a/crates/omnigraph-compiler/src/query/mod.rs b/crates/omnigraph-compiler/src/query/mod.rs new file mode 100644 index 0000000..7592221 --- /dev/null +++ b/crates/omnigraph-compiler/src/query/mod.rs @@ -0,0 +1,3 @@ +pub mod ast; +pub mod parser; +pub mod typecheck; diff --git a/crates/omnigraph-compiler/src/query/parser.rs b/crates/omnigraph-compiler/src/query/parser.rs new file mode 100644 index 0000000..52f0668 --- /dev/null +++ b/crates/omnigraph-compiler/src/query/parser.rs @@ -0,0 +1,1689 @@ +use pest::Parser; +use pest::error::InputLocation; +use pest_derive::Parser; + +use 
crate::error::{ + NanoError, ParseDiagnostic, Result, SourceSpan, decode_string_literal, render_span, +}; + +use super::ast::*; + +#[derive(Parser)] +#[grammar = "query/query.pest"] +struct QueryParser; + +pub fn parse_query(input: &str) -> Result { + parse_query_diagnostic(input).map_err(|e| NanoError::Parse(e.to_string())) +} + +pub fn parse_query_diagnostic(input: &str) -> std::result::Result { + let pairs = QueryParser::parse(Rule::query_file, input).map_err(pest_error_to_diagnostic)?; + + let mut queries = Vec::new(); + for pair in pairs { + if let Rule::query_file = pair.as_rule() { + for inner in pair.into_inner() { + if let Rule::query_decl = inner.as_rule() { + queries.push(parse_query_decl(inner).map_err(nano_error_to_diagnostic)?); + } + } + } + } + Ok(QueryFile { queries }) +} + +fn pest_error_to_diagnostic(err: pest::error::Error) -> ParseDiagnostic { + let span = match err.location { + InputLocation::Pos(pos) => Some(render_span(SourceSpan::new(pos, pos))), + InputLocation::Span((start, end)) => Some(render_span(SourceSpan::new(start, end))), + }; + ParseDiagnostic::new(err.to_string(), span) +} + +fn nano_error_to_diagnostic(err: NanoError) -> ParseDiagnostic { + ParseDiagnostic::new(err.to_string(), None) +} + +fn parse_query_decl(pair: pest::iterators::Pair) -> Result { + let mut inner = pair.into_inner(); + let name = inner.next().unwrap().as_str().to_string(); + + let mut description = None; + let mut instruction = None; + let mut params = Vec::new(); + let mut match_clause = Vec::new(); + let mut return_clause = Vec::new(); + let mut order_clause = Vec::new(); + let mut limit = None; + let mut mutation = None; + + for item in inner { + match item.as_rule() { + Rule::param_list => { + for p in item.into_inner() { + if let Rule::param = p.as_rule() { + params.push(parse_param(p)?); + } + } + } + Rule::query_annotation => { + let (annotation_name, value) = parse_query_annotation(item)?; + match annotation_name { + "description" => { + if 
description.replace(value).is_some() { + return Err(NanoError::Parse(format!( + "query `{}` cannot include duplicate @description annotations", + name + ))); + } + } + "instruction" => { + if instruction.replace(value).is_some() { + return Err(NanoError::Parse(format!( + "query `{}` cannot include duplicate @instruction annotations", + name + ))); + } + } + other => { + return Err(NanoError::Parse(format!( + "unsupported query annotation: @{}", + other + ))); + } + } + } + Rule::query_body => { + let body = item + .into_inner() + .next() + .ok_or_else(|| NanoError::Parse("query body cannot be empty".to_string()))?; + match body.as_rule() { + Rule::read_query_body => { + for section in body.into_inner() { + match section.as_rule() { + Rule::match_clause => { + for c in section.into_inner() { + if let Rule::clause = c.as_rule() { + match_clause.push(parse_clause(c)?); + } + } + } + Rule::return_clause => { + for proj in section.into_inner() { + if let Rule::projection = proj.as_rule() { + return_clause.push(parse_projection(proj)?); + } + } + } + Rule::order_clause => { + for ord in section.into_inner() { + if let Rule::ordering = ord.as_rule() { + order_clause.push(parse_ordering(ord)?); + } + } + } + Rule::limit_clause => { + let int_pair = section.into_inner().next().unwrap(); + limit = + Some(int_pair.as_str().parse::().map_err(|e| { + NanoError::Parse(format!("invalid limit: {}", e)) + })?); + } + _ => {} + } + } + } + Rule::mutation_stmt => { + let stmt = body.into_inner().next().ok_or_else(|| { + NanoError::Parse("mutation statement cannot be empty".to_string()) + })?; + mutation = Some(parse_mutation_stmt(stmt)?); + } + _ => {} + } + } + _ => {} + } + } + + Ok(QueryDecl { + name, + description, + instruction, + params, + match_clause, + return_clause, + order_clause, + limit, + mutation, + }) +} + +fn parse_query_annotation(pair: pest::iterators::Pair) -> Result<(&'static str, String)> { + let inner = pair + .into_inner() + .next() + .ok_or_else(|| 
NanoError::Parse("query annotation cannot be empty".to_string()))?; + match inner.as_rule() { + Rule::description_annotation => { + let value = inner + .into_inner() + .next() + .ok_or_else(|| { + NanoError::Parse("@description requires a string literal".to_string()) + }) + .map(|value| parse_string_lit(value.as_str()))??; + Ok(("description", value)) + } + Rule::instruction_annotation => { + let value = inner + .into_inner() + .next() + .ok_or_else(|| { + NanoError::Parse("@instruction requires a string literal".to_string()) + }) + .map(|value| parse_string_lit(value.as_str()))??; + Ok(("instruction", value)) + } + other => Err(NanoError::Parse(format!( + "unexpected query annotation rule: {:?}", + other + ))), + } +} + +fn parse_param(pair: pest::iterators::Pair) -> Result { + let mut inner = pair.into_inner(); + let var = inner.next().unwrap().as_str(); + let name = var.strip_prefix('$').unwrap_or(var).to_string(); + let type_ref = inner.next().unwrap(); + let nullable = type_ref.as_str().trim_end().ends_with('?'); + let mut type_inner = type_ref.into_inner(); + let core = type_inner + .next() + .ok_or_else(|| NanoError::Parse("parameter type is missing".to_string()))?; + let base = match core.as_rule() { + Rule::base_type => core.as_str().to_string(), + Rule::list_type => { + let inner = core + .into_inner() + .next() + .ok_or_else(|| NanoError::Parse("list type missing item type".to_string()))?; + format!("[{}]", inner.as_str().trim()) + } + Rule::vector_type => { + let vector = core + .into_inner() + .next() + .ok_or_else(|| NanoError::Parse("Vector type missing dimension".to_string()))?; + format!("Vector({})", vector.as_str().trim()) + } + other => { + return Err(NanoError::Parse(format!( + "unexpected param type rule: {:?}", + other + ))); + } + }; + + Ok(Param { + name, + type_name: base, + nullable, + }) +} + +fn parse_clause(pair: pest::iterators::Pair) -> Result { + let inner = pair.into_inner().next().unwrap(); + match inner.as_rule() { + 
Rule::binding => Ok(Clause::Binding(parse_binding(inner)?)), + Rule::traversal => Ok(Clause::Traversal(parse_traversal(inner)?)), + Rule::filter => Ok(Clause::Filter(parse_filter(inner)?)), + Rule::text_search_clause => Ok(parse_text_search_clause(inner)?), + Rule::negation => { + let mut clauses = Vec::new(); + for c in inner.into_inner() { + if let Rule::clause = c.as_rule() { + clauses.push(parse_clause(c)?); + } + } + Ok(Clause::Negation(clauses)) + } + _ => Err(NanoError::Parse(format!( + "unexpected clause rule: {:?}", + inner.as_rule() + ))), + } +} + +fn parse_text_search_clause(pair: pest::iterators::Pair) -> Result { + let inner = pair + .into_inner() + .next() + .ok_or_else(|| NanoError::Parse("text search clause cannot be empty".to_string()))?; + let expr = match inner.as_rule() { + Rule::search_call => parse_search_call(inner)?, + Rule::fuzzy_call => parse_fuzzy_call(inner)?, + Rule::match_text_call => parse_match_text_call(inner)?, + other => { + return Err(NanoError::Parse(format!( + "unexpected text search clause rule: {:?}", + other + ))); + } + }; + + Ok(Clause::Filter(Filter { + left: expr, + op: CompOp::Eq, + right: Expr::Literal(Literal::Bool(true)), + })) +} + +fn parse_binding(pair: pest::iterators::Pair) -> Result { + let mut inner = pair.into_inner(); + let var = inner.next().unwrap().as_str(); + let variable = var.strip_prefix('$').unwrap_or(var).to_string(); + let type_name = inner.next().unwrap().as_str().to_string(); + + let mut prop_matches = Vec::new(); + for item in inner { + if let Rule::prop_match_list = item.as_rule() { + for pm in item.into_inner() { + if let Rule::prop_match = pm.as_rule() { + prop_matches.push(parse_prop_match(pm)?); + } + } + } + } + + Ok(Binding { + variable, + type_name, + prop_matches, + }) +} + +fn parse_prop_match(pair: pest::iterators::Pair) -> Result { + let mut inner = pair.into_inner(); + let prop_name = inner.next().unwrap().as_str().to_string(); + let value_pair = inner.next().unwrap(); + let value 
= parse_match_value(value_pair)?; + + Ok(PropMatch { prop_name, value }) +} + +fn parse_mutation_stmt(pair: pest::iterators::Pair) -> Result { + match pair.as_rule() { + Rule::insert_stmt => parse_insert_mutation(pair).map(Mutation::Insert), + Rule::update_stmt => parse_update_mutation(pair).map(Mutation::Update), + Rule::delete_stmt => parse_delete_mutation(pair).map(Mutation::Delete), + other => Err(NanoError::Parse(format!( + "unexpected mutation statement rule: {:?}", + other + ))), + } +} + +fn parse_insert_mutation(pair: pest::iterators::Pair) -> Result { + let mut inner = pair.into_inner(); + let type_name = inner.next().unwrap().as_str().to_string(); + let mut assignments = Vec::new(); + for item in inner { + if let Rule::mutation_assignment = item.as_rule() { + assignments.push(parse_mutation_assignment(item)?); + } + } + Ok(InsertMutation { + type_name, + assignments, + }) +} + +fn parse_update_mutation(pair: pest::iterators::Pair) -> Result { + let mut inner = pair.into_inner(); + let type_name = inner.next().unwrap().as_str().to_string(); + + let mut assignments = Vec::new(); + let mut predicate = None; + + for item in inner { + match item.as_rule() { + Rule::mutation_assignment => assignments.push(parse_mutation_assignment(item)?), + Rule::mutation_predicate => predicate = Some(parse_mutation_predicate(item)?), + _ => {} + } + } + + let predicate = predicate.ok_or_else(|| { + NanoError::Parse("update mutation requires a where predicate".to_string()) + })?; + + Ok(UpdateMutation { + type_name, + assignments, + predicate, + }) +} + +fn parse_delete_mutation(pair: pest::iterators::Pair) -> Result { + let mut inner = pair.into_inner(); + let type_name = inner.next().unwrap().as_str().to_string(); + let predicate = inner + .next() + .ok_or_else(|| NanoError::Parse("delete mutation requires a where predicate".to_string())) + .and_then(parse_mutation_predicate)?; + Ok(DeleteMutation { + type_name, + predicate, + }) +} + +fn parse_mutation_assignment(pair: 
pest::iterators::Pair) -> Result { + let mut inner = pair.into_inner(); + let property = inner.next().unwrap().as_str().to_string(); + let value = parse_match_value(inner.next().unwrap())?; + Ok(MutationAssignment { property, value }) +} + +fn parse_mutation_predicate(pair: pest::iterators::Pair) -> Result { + let mut inner = pair.into_inner(); + let property = inner.next().unwrap().as_str().to_string(); + let op = parse_comp_op(inner.next().unwrap())?; + let value = parse_match_value(inner.next().unwrap())?; + Ok(MutationPredicate { + property, + op, + value, + }) +} + +fn parse_match_value(pair: pest::iterators::Pair) -> Result { + let value_inner = pair.into_inner().next().unwrap(); + match value_inner.as_rule() { + Rule::variable => { + let v = value_inner.as_str(); + Ok(MatchValue::Variable( + v.strip_prefix('$').unwrap_or(v).to_string(), + )) + } + Rule::now_call => Ok(MatchValue::Now), + Rule::literal => Ok(MatchValue::Literal(parse_literal(value_inner)?)), + _ => Err(NanoError::Parse(format!( + "unexpected match value: {:?}", + value_inner.as_rule() + ))), + } +} + +fn parse_traversal(pair: pest::iterators::Pair) -> Result { + let mut inner = pair.into_inner(); + let src_var = inner.next().unwrap().as_str(); + let src = src_var.strip_prefix('$').unwrap_or(src_var).to_string(); + let edge_name = inner.next().unwrap().as_str().to_string(); + let mut min_hops = 1u32; + let mut max_hops = Some(1u32); + + let next = inner.next().unwrap(); + let dst_pair = if let Rule::traversal_bounds = next.as_rule() { + let (min, max) = parse_traversal_bounds(next)?; + min_hops = min; + max_hops = max; + inner + .next() + .ok_or_else(|| NanoError::Parse("traversal missing destination variable".to_string()))? 
+ } else { + next + }; + + let dst_var = dst_pair.as_str(); + let dst = dst_var.strip_prefix('$').unwrap_or(dst_var).to_string(); + + Ok(Traversal { + src, + edge_name, + dst, + min_hops, + max_hops, + }) +} + +fn parse_traversal_bounds(pair: pest::iterators::Pair) -> Result<(u32, Option)> { + let mut inner = pair.into_inner(); + let min = inner + .next() + .ok_or_else(|| NanoError::Parse("traversal bound missing min hop".to_string()))? + .as_str() + .parse::() + .map_err(|e| NanoError::Parse(format!("invalid traversal min bound: {}", e)))?; + let max = inner + .next() + .map(|p| { + p.as_str() + .parse::() + .map_err(|e| NanoError::Parse(format!("invalid traversal max bound: {}", e))) + }) + .transpose()?; + Ok((min, max)) +} + +fn parse_filter(pair: pest::iterators::Pair) -> Result { + let mut inner = pair.into_inner(); + let left = parse_expr(inner.next().unwrap())?; + let op = parse_filter_op(inner.next().unwrap())?; + let right = parse_expr(inner.next().unwrap())?; + + Ok(Filter { left, op, right }) +} + +fn parse_expr(pair: pest::iterators::Pair) -> Result { + let inner = pair.into_inner().next().unwrap(); + match inner.as_rule() { + Rule::now_call => Ok(Expr::Now), + Rule::prop_access => { + let mut parts = inner.into_inner(); + let var = parts.next().unwrap().as_str(); + let variable = var.strip_prefix('$').unwrap_or(var).to_string(); + let property = parts.next().unwrap().as_str().to_string(); + Ok(Expr::PropAccess { variable, property }) + } + Rule::variable => { + let v = inner.as_str(); + Ok(Expr::Variable(v.strip_prefix('$').unwrap_or(v).to_string())) + } + Rule::literal => Ok(Expr::Literal(parse_literal(inner)?)), + Rule::agg_call => { + let mut parts = inner.into_inner(); + let func = match parts.next().unwrap().as_str() { + "count" => AggFunc::Count, + "sum" => AggFunc::Sum, + "avg" => AggFunc::Avg, + "min" => AggFunc::Min, + "max" => AggFunc::Max, + other => return Err(NanoError::Parse(format!("unknown aggregate: {}", other))), + }; + let arg = 
parse_expr(parts.next().unwrap())?; + Ok(Expr::Aggregate { + func, + arg: Box::new(arg), + }) + } + Rule::search_call => parse_search_call(inner), + Rule::fuzzy_call => parse_fuzzy_call(inner), + Rule::match_text_call => parse_match_text_call(inner), + Rule::nearest_ordering => parse_nearest_ordering(inner), + Rule::bm25_call => parse_bm25_call(inner), + Rule::rrf_call => parse_rrf_call(inner), + Rule::ident => Ok(Expr::AliasRef(inner.as_str().to_string())), + _ => Err(NanoError::Parse(format!( + "unexpected expr rule: {:?}", + inner.as_rule() + ))), + } +} + +fn parse_search_call(pair: pest::iterators::Pair) -> Result { + let mut args = pair.into_inner(); + let field = args + .next() + .ok_or_else(|| NanoError::Parse("search() missing field argument".to_string()))?; + let query = args + .next() + .ok_or_else(|| NanoError::Parse("search() missing query argument".to_string()))?; + if args.next().is_some() { + return Err(NanoError::Parse( + "search() accepts exactly 2 arguments".to_string(), + )); + } + Ok(Expr::Search { + field: Box::new(parse_expr(field)?), + query: Box::new(parse_expr(query)?), + }) +} + +fn parse_fuzzy_call(pair: pest::iterators::Pair) -> Result { + let mut args = pair.into_inner(); + let field = args + .next() + .ok_or_else(|| NanoError::Parse("fuzzy() missing field argument".to_string()))?; + let query = args + .next() + .ok_or_else(|| NanoError::Parse("fuzzy() missing query argument".to_string()))?; + let max_edits = args.next().map(parse_expr).transpose()?.map(Box::new); + if args.next().is_some() { + return Err(NanoError::Parse( + "fuzzy() accepts at most 3 arguments".to_string(), + )); + } + Ok(Expr::Fuzzy { + field: Box::new(parse_expr(field)?), + query: Box::new(parse_expr(query)?), + max_edits, + }) +} + +fn parse_match_text_call(pair: pest::iterators::Pair) -> Result { + let mut args = pair.into_inner(); + let field = args + .next() + .ok_or_else(|| NanoError::Parse("match_text() missing field argument".to_string()))?; + let query = 
args + .next() + .ok_or_else(|| NanoError::Parse("match_text() missing query argument".to_string()))?; + if args.next().is_some() { + return Err(NanoError::Parse( + "match_text() accepts exactly 2 arguments".to_string(), + )); + } + Ok(Expr::MatchText { + field: Box::new(parse_expr(field)?), + query: Box::new(parse_expr(query)?), + }) +} + +fn parse_bm25_call(pair: pest::iterators::Pair) -> Result { + let mut args = pair.into_inner(); + let field = args + .next() + .ok_or_else(|| NanoError::Parse("bm25() missing field argument".to_string()))?; + let query = args + .next() + .ok_or_else(|| NanoError::Parse("bm25() missing query argument".to_string()))?; + if args.next().is_some() { + return Err(NanoError::Parse( + "bm25() accepts exactly 2 arguments".to_string(), + )); + } + Ok(Expr::Bm25 { + field: Box::new(parse_expr(field)?), + query: Box::new(parse_expr(query)?), + }) +} + +fn parse_rank_expr(pair: pest::iterators::Pair) -> Result { + let inner = if pair.as_rule() == Rule::rank_expr { + pair.into_inner() + .next() + .ok_or_else(|| NanoError::Parse("rank expression cannot be empty".to_string()))? + } else { + pair + }; + match inner.as_rule() { + Rule::nearest_ordering => parse_nearest_ordering(inner), + Rule::bm25_call => parse_bm25_call(inner), + other => Err(NanoError::Parse(format!( + "rrf() rank expression must be nearest(...) 
or bm25(...), got {:?}", + other + ))), + } +} + +fn parse_rrf_call(pair: pest::iterators::Pair) -> Result { + let mut args = pair.into_inner(); + let primary = args + .next() + .ok_or_else(|| NanoError::Parse("rrf() missing primary rank expression".to_string()))?; + let secondary = args + .next() + .ok_or_else(|| NanoError::Parse("rrf() missing secondary rank expression".to_string()))?; + let k = args.next().map(parse_expr).transpose()?.map(Box::new); + if args.next().is_some() { + return Err(NanoError::Parse( + "rrf() accepts at most 3 arguments".to_string(), + )); + } + Ok(Expr::Rrf { + primary: Box::new(parse_rank_expr(primary)?), + secondary: Box::new(parse_rank_expr(secondary)?), + k, + }) +} + +fn parse_comp_op(pair: pest::iterators::Pair) -> Result { + match pair.as_str() { + "=" => Ok(CompOp::Eq), + "!=" => Ok(CompOp::Ne), + ">" => Ok(CompOp::Gt), + "<" => Ok(CompOp::Lt), + ">=" => Ok(CompOp::Ge), + "<=" => Ok(CompOp::Le), + other => Err(NanoError::Parse(format!("unknown operator: {}", other))), + } +} + +fn parse_filter_op(pair: pest::iterators::Pair) -> Result { + match pair.as_str() { + "contains" => Ok(CompOp::Contains), + _ => parse_comp_op(pair), + } +} + +fn parse_literal(pair: pest::iterators::Pair) -> Result { + let inner = pair.into_inner().next().unwrap(); + match inner.as_rule() { + Rule::string_lit => Ok(Literal::String(parse_string_lit(inner.as_str())?)), + Rule::integer => { + let n: i64 = inner + .as_str() + .parse() + .map_err(|e| NanoError::Parse(format!("invalid integer: {}", e)))?; + Ok(Literal::Integer(n)) + } + Rule::float_lit => { + let f: f64 = inner + .as_str() + .parse() + .map_err(|e| NanoError::Parse(format!("invalid float: {}", e)))?; + Ok(Literal::Float(f)) + } + Rule::bool_lit => { + let b = match inner.as_str() { + "true" => true, + "false" => false, + other => { + return Err(NanoError::Parse(format!( + "invalid boolean literal: {}", + other + ))); + } + }; + Ok(Literal::Bool(b)) + } + Rule::date_lit => { + let date_str = 
inner + .into_inner() + .next() + .map(|s| parse_string_lit(s.as_str())) + .ok_or_else(|| NanoError::Parse("date literal requires a string".to_string()))?; + Ok(Literal::Date(date_str?)) + } + Rule::datetime_lit => { + let dt_str = inner + .into_inner() + .next() + .map(|s| parse_string_lit(s.as_str())) + .ok_or_else(|| { + NanoError::Parse("datetime literal requires a string".to_string()) + })?; + Ok(Literal::DateTime(dt_str?)) + } + Rule::list_lit => { + let mut items = Vec::new(); + for item in inner.into_inner() { + if item.as_rule() == Rule::literal { + items.push(parse_literal(item)?); + } + } + Ok(Literal::List(items)) + } + _ => Err(NanoError::Parse(format!( + "unexpected literal: {:?}", + inner.as_rule() + ))), + } +} + +fn parse_string_lit(raw: &str) -> Result { + decode_string_literal(raw) +} + +fn parse_projection(pair: pest::iterators::Pair) -> Result { + let mut inner = pair.into_inner(); + let expr = parse_expr(inner.next().unwrap())?; + let alias = inner.next().map(|p| p.as_str().to_string()); + + Ok(Projection { expr, alias }) +} + +fn parse_ordering(pair: pest::iterators::Pair) -> Result { + let mut inner = pair.into_inner(); + let first = inner + .next() + .ok_or_else(|| NanoError::Parse("ordering cannot be empty".to_string()))?; + let (expr, descending) = match first.as_rule() { + Rule::nearest_ordering => (parse_nearest_ordering(first)?, false), + Rule::expr => { + let expr = parse_expr(first)?; + let direction = inner.next().map(|p| p.as_str().to_string()); + if matches!(expr, Expr::Nearest { .. 
}) && direction.is_some() { + return Err(NanoError::Parse( + "nearest() ordering does not accept asc/desc modifiers".to_string(), + )); + } + let descending = matches!(direction.as_deref(), Some("desc")); + (expr, descending) + } + other => { + return Err(NanoError::Parse(format!( + "unexpected ordering rule: {:?}", + other + ))); + } + }; + + Ok(Ordering { expr, descending }) +} + +fn parse_nearest_ordering(pair: pest::iterators::Pair) -> Result { + let mut inner = pair.into_inner(); + let prop = inner + .next() + .ok_or_else(|| NanoError::Parse("nearest() missing property".to_string()))?; + let mut prop_parts = prop.into_inner(); + let var = prop_parts + .next() + .ok_or_else(|| NanoError::Parse("nearest() missing variable".to_string()))? + .as_str(); + let variable = var.strip_prefix('$').unwrap_or(var).to_string(); + let property = prop_parts + .next() + .ok_or_else(|| NanoError::Parse("nearest() missing property name".to_string()))? + .as_str() + .to_string(); + + let query = inner + .next() + .ok_or_else(|| NanoError::Parse("nearest() missing query expression".to_string()))?; + Ok(Expr::Nearest { + variable, + property, + query: Box::new(parse_expr(query)?), + }) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_basic_query() { + let input = r#" +query get_person($name: String) { + match { + $p: Person { name: $name } + } + return { $p.name, $p.age } +} +"#; + let qf = parse_query(input).unwrap(); + assert_eq!(qf.queries.len(), 1); + let q = &qf.queries[0]; + assert_eq!(q.name, "get_person"); + assert_eq!(q.params.len(), 1); + assert_eq!(q.params[0].name, "name"); + assert_eq!(q.match_clause.len(), 1); + assert_eq!(q.return_clause.len(), 2); + } + + #[test] + fn test_parse_query_metadata_annotations() { + let input = r#" +query semantic_search($q: String) + @description("Find semantically similar documents.") + @instruction("Use for conceptual search; prefer keyword_search for exact terms.") +{ + match { + $d: Doc + } + return { 
$d.slug } +} +"#; + let qf = parse_query(input).unwrap(); + let q = &qf.queries[0]; + assert_eq!( + q.description.as_deref(), + Some("Find semantically similar documents.") + ); + assert_eq!( + q.instruction.as_deref(), + Some("Use for conceptual search; prefer keyword_search for exact terms.") + ); + } + + #[test] + fn test_duplicate_query_description_is_rejected() { + let input = r#" +query q() + @description("one") + @description("two") +{ + match { + $p: Person + } + return { $p.name } +} +"#; + let err = parse_query(input).unwrap_err(); + assert!(err.to_string().contains("duplicate @description")); + } + + #[test] + fn test_parse_no_params() { + let input = r#" +query adults() { + match { + $p: Person + $p.age > 30 + } + return { $p.name, $p.age } + order { $p.age desc } +} +"#; + let qf = parse_query(input).unwrap(); + let q = &qf.queries[0]; + assert_eq!(q.name, "adults"); + assert!(q.params.is_empty()); + assert_eq!(q.match_clause.len(), 2); + assert_eq!(q.order_clause.len(), 1); + assert!(q.order_clause[0].descending); + } + + #[test] + fn test_parse_traversal() { + let input = r#" +query friends_of($name: String) { + match { + $p: Person { name: $name } + $p knows $f + } + return { $f.name, $f.age } +} +"#; + let qf = parse_query(input).unwrap(); + let q = &qf.queries[0]; + assert_eq!(q.match_clause.len(), 2); + match &q.match_clause[1] { + Clause::Traversal(t) => { + assert_eq!(t.src, "p"); + assert_eq!(t.edge_name, "knows"); + assert_eq!(t.dst, "f"); + assert_eq!(t.min_hops, 1); + assert_eq!(t.max_hops, Some(1)); + } + _ => panic!("expected Traversal"), + } + } + + #[test] + fn test_parse_negation() { + let input = r#" +query unemployed() { + match { + $p: Person + not { $p worksAt $_ } + } + return { $p.name } +} +"#; + let qf = parse_query(input).unwrap(); + let q = &qf.queries[0]; + assert_eq!(q.match_clause.len(), 2); + match &q.match_clause[1] { + Clause::Negation(clauses) => { + assert_eq!(clauses.len(), 1); + match &clauses[0] { + 
Clause::Traversal(t) => { + assert_eq!(t.src, "p"); + assert_eq!(t.edge_name, "worksAt"); + assert_eq!(t.dst, "_"); + assert_eq!(t.min_hops, 1); + assert_eq!(t.max_hops, Some(1)); + } + _ => panic!("expected Traversal inside negation"), + } + } + _ => panic!("expected Negation"), + } + } + + #[test] + fn test_parse_aggregation() { + let input = r#" +query friend_counts() { + match { + $p: Person + $p knows $f + } + return { + $p.name + count($f) as friends + } + order { friends desc } + limit 20 +} +"#; + let qf = parse_query(input).unwrap(); + let q = &qf.queries[0]; + assert_eq!(q.return_clause.len(), 2); + match &q.return_clause[1].expr { + Expr::Aggregate { func, .. } => { + assert_eq!(*func, AggFunc::Count); + } + _ => panic!("expected Aggregate"), + } + assert_eq!(q.return_clause[1].alias.as_deref(), Some("friends")); + assert_eq!(q.limit, Some(20)); + } + + #[test] + fn test_parse_two_hop() { + let input = r#" +query friends_of_friends($name: String) { + match { + $p: Person { name: $name } + $p knows $mid + $mid knows $fof + } + return { $fof.name } +} +"#; + let qf = parse_query(input).unwrap(); + let q = &qf.queries[0]; + assert_eq!(q.match_clause.len(), 3); + } + + #[test] + fn test_parse_reverse_traversal() { + let input = r#" +query employees_of($company: String) { + match { + $c: Company { name: $company } + $p worksAt $c + } + return { $p.name } +} +"#; + let qf = parse_query(input).unwrap(); + let q = &qf.queries[0]; + assert_eq!(q.match_clause.len(), 2); + match &q.match_clause[1] { + Clause::Traversal(t) => { + assert_eq!(t.src, "p"); + assert_eq!(t.edge_name, "worksAt"); + assert_eq!(t.dst, "c"); + assert_eq!(t.min_hops, 1); + assert_eq!(t.max_hops, Some(1)); + } + _ => panic!("expected Traversal"), + } + } + + #[test] + fn test_parse_bounded_traversal() { + let input = r#" +query q() { + match { + $a: Person + $a knows{1,3} $b + } + return { $b.name } +} +"#; + let qf = parse_query(input).unwrap(); + let q = &qf.queries[0]; + match 
&q.match_clause[1] { + Clause::Traversal(t) => { + assert_eq!(t.min_hops, 1); + assert_eq!(t.max_hops, Some(3)); + } + _ => panic!("expected Traversal"), + } + } + + #[test] + fn test_parse_unbounded_traversal() { + let input = r#" +query q() { + match { + $a: Person + $a knows{1,} $b + } + return { $b.name } +} +"#; + let qf = parse_query(input).unwrap(); + let q = &qf.queries[0]; + match &q.match_clause[1] { + Clause::Traversal(t) => { + assert_eq!(t.min_hops, 1); + assert_eq!(t.max_hops, None); + } + _ => panic!("expected Traversal"), + } + } + + #[test] + fn test_parse_multi_query_file() { + let input = r#" +query q1() { + match { $p: Person } + return { $p.name } +} +query q2() { + match { $c: Company } + return { $c.name } +} +"#; + let qf = parse_query(input).unwrap(); + assert_eq!(qf.queries.len(), 2); + } + + #[test] + fn test_parse_complex_negation() { + let input = r#" +query knows_alice_not_bob() { + match { + $a: Person { name: "Alice" } + $b: Person { name: "Bob" } + $p: Person + $p knows $a + not { $p knows $b } + } + return { $p.name } +} +"#; + let qf = parse_query(input).unwrap(); + let q = &qf.queries[0]; + assert_eq!(q.match_clause.len(), 5); + } + + #[test] + fn test_parse_filter_string() { + let input = r#" +query test() { + match { + $p: Person + $p.name != "Bob" + } + return { $p.name } +} +"#; + let qf = parse_query(input).unwrap(); + let q = &qf.queries[0]; + match &q.match_clause[1] { + Clause::Filter(f) => { + assert_eq!(f.op, CompOp::Ne); + } + _ => panic!("expected Filter"), + } + } + + #[test] + fn test_parse_filter_string_decodes_escapes() { + let input = r#" +query test() { + match { + $p: Person + $p.name = "Bob\n\"Builder\"\t\\" + } + return { $p.name } +} +"#; + let qf = parse_query(input).unwrap(); + let q = &qf.queries[0]; + match &q.match_clause[1] { + Clause::Filter(f) => match &f.right { + Expr::Literal(Literal::String(value)) => { + assert_eq!(value, "Bob\n\"Builder\"\t\\"); + } + other => panic!("expected string literal, 
got {:?}", other), + }, + _ => panic!("expected Filter"), + } + } + + #[test] + fn test_parse_string_literal_rejects_unknown_escape() { + let input = r#" +query test() { + match { + $p: Person + $p.name = "Bob\q" + } + return { $p.name } +} +"#; + let err = parse_query(input).unwrap_err(); + assert!(err.to_string().contains("unsupported escape sequence")); + } + + #[test] + fn test_parse_bool_literals() { + let input = r#" +query flags() { + match { + $p: Person + $p.active = true + $p.active != false + } + return { $p.name } +} +"#; + let qf = parse_query(input).unwrap(); + let q = &qf.queries[0]; + match &q.match_clause[1] { + Clause::Filter(f) => match &f.right { + Expr::Literal(Literal::Bool(value)) => assert!(*value), + other => panic!("expected bool literal, got {:?}", other), + }, + _ => panic!("expected Filter"), + } + match &q.match_clause[2] { + Clause::Filter(f) => match &f.right { + Expr::Literal(Literal::Bool(value)) => assert!(!*value), + other => panic!("expected bool literal, got {:?}", other), + }, + _ => panic!("expected Filter"), + } + } + + #[test] + fn test_parse_contains_filter() { + let input = r#" +query tagged($tag: String) { + match { + $p: Person + $p.tags contains $tag + } + return { $p.name } +} +"#; + let qf = parse_query(input).unwrap(); + let q = &qf.queries[0]; + match &q.match_clause[1] { + Clause::Filter(f) => { + assert_eq!(f.op, CompOp::Contains); + assert!(matches!( + &f.left, + Expr::PropAccess { variable, property } if variable == "p" && property == "tags" + )); + assert!(matches!(&f.right, Expr::Variable(v) if v == "tag")); + } + _ => panic!("expected Filter"), + } + } + + #[test] + fn test_parse_contains_is_rejected_in_mutation_predicate() { + let input = r#" +query drop_person($tag: String) { + delete Person where tags contains $tag +} +"#; + assert!(parse_query(input).is_err()); + } + + #[test] + fn test_parse_triangle() { + let input = r#" +query triangles($name: String) { + match { + $a: Person { name: $name } + $a 
knows $b + $b knows $c + $c knows $a + } + return { $b.name, $c.name } +} +"#; + let qf = parse_query(input).unwrap(); + let q = &qf.queries[0]; + assert_eq!(q.match_clause.len(), 4); + } + + #[test] + fn test_parse_avg_aggregation() { + let input = r#" +query avg_age_by_company() { + match { + $p: Person + $p worksAt $c + } + return { + $c.name + avg($p.age) as avg_age + count($p) as headcount + } + order { headcount desc } +} +"#; + let qf = parse_query(input).unwrap(); + let q = &qf.queries[0]; + assert_eq!(q.return_clause.len(), 3); + } + + #[test] + fn test_parse_insert_mutation() { + let input = r#" +query add_person($name: String, $age: I32) { + insert Person { + name: $name + age: $age + } +} +"#; + let qf = parse_query(input).unwrap(); + let q = &qf.queries[0]; + match q.mutation.as_ref().expect("expected mutation") { + Mutation::Insert(ins) => { + assert_eq!(ins.type_name, "Person"); + assert_eq!(ins.assignments.len(), 2); + } + _ => panic!("expected Insert mutation"), + } + } + + #[test] + fn test_parse_update_mutation() { + let input = r#" +query set_age($name: String, $age: I32) { + update Person set { + age: $age + } where name = $name +} +"#; + let qf = parse_query(input).unwrap(); + let q = &qf.queries[0]; + match q.mutation.as_ref().expect("expected mutation") { + Mutation::Update(upd) => { + assert_eq!(upd.type_name, "Person"); + assert_eq!(upd.assignments.len(), 1); + assert_eq!(upd.predicate.property, "name"); + assert_eq!(upd.predicate.op, CompOp::Eq); + } + _ => panic!("expected Update mutation"), + } + } + + #[test] + fn test_parse_delete_mutation() { + let input = r#" +query drop_person($name: String) { + delete Person where name = $name +} +"#; + let qf = parse_query(input).unwrap(); + let q = &qf.queries[0]; + match q.mutation.as_ref().expect("expected mutation") { + Mutation::Delete(del) => { + assert_eq!(del.type_name, "Person"); + assert_eq!(del.predicate.property, "name"); + assert_eq!(del.predicate.op, CompOp::Eq); + } + _ => 
panic!("expected Delete mutation"), + } + } + + #[test] + fn test_parse_date_and_datetime_literals() { + let input = r#" +query dated() { + match { + $e: Event + $e.on = date("2026-02-14") + $e.at >= datetime("2026-02-14T10:00:00Z") + } + return { $e.id } +} +"#; + let qf = parse_query(input).unwrap(); + let q = &qf.queries[0]; + match &q.match_clause[1] { + Clause::Filter(f) => match &f.right { + Expr::Literal(Literal::Date(v)) => assert_eq!(v, "2026-02-14"), + other => panic!("expected date literal, got {:?}", other), + }, + _ => panic!("expected Filter"), + } + match &q.match_clause[2] { + Clause::Filter(f) => match &f.right { + Expr::Literal(Literal::DateTime(v)) => assert_eq!(v, "2026-02-14T10:00:00Z"), + other => panic!("expected datetime literal, got {:?}", other), + }, + _ => panic!("expected Filter"), + } + } + + #[test] + fn test_parse_now_expression_and_mutation_value() { + let input = r#" +query clock() { + match { + $e: Event + $e.at <= now() + } + return { now() as ts } +} +"#; + let qf = parse_query(input).unwrap(); + let q = &qf.queries[0]; + match &q.match_clause[1] { + Clause::Filter(f) => assert!(matches!(f.right, Expr::Now)), + _ => panic!("expected Filter"), + } + assert!(matches!(q.return_clause[0].expr, Expr::Now)); + + let mutation = parse_query( + r#" +query stamp() { + update Event set { updated_at: now() } where created_at <= now() +} +"#, + ) + .unwrap(); + match mutation.queries[0].mutation.as_ref().unwrap() { + Mutation::Update(update) => { + assert!(matches!(update.assignments[0].value, MatchValue::Now)); + assert!(matches!(update.predicate.value, MatchValue::Now)); + } + _ => panic!("expected update mutation"), + } + } + + #[test] + fn test_parse_list_literal() { + let input = r#" +query listy() { + match { $p: Person { tags: ["rust", "db"] } } + return { $p.tags } +} +"#; + let qf = parse_query(input).unwrap(); + let q = &qf.queries[0]; + match &q.match_clause[0] { + Clause::Binding(b) => match &b.prop_matches[0].value { + 
MatchValue::Literal(Literal::List(items)) => { + assert_eq!(items.len(), 2); + } + other => panic!("expected list literal, got {:?}", other), + }, + _ => panic!("expected Binding"), + } + } + + #[test] + fn test_parse_nearest_ordering_and_vector_param_type() { + let input = r#" +query similar($q: Vector(3)) { + match { $d: Doc } + return { $d.id } + order { nearest($d.embedding, $q) } + limit 5 +} +"#; + let qf = parse_query(input).unwrap(); + let q = &qf.queries[0]; + assert_eq!(q.params[0].type_name, "Vector(3)"); + assert_eq!(q.order_clause.len(), 1); + assert!(!q.order_clause[0].descending); + match &q.order_clause[0].expr { + Expr::Nearest { + variable, + property, + query, + } => { + assert_eq!(variable, "d"); + assert_eq!(property, "embedding"); + assert!(matches!(query.as_ref(), Expr::Variable(v) if v == "q")); + } + other => panic!("expected nearest ordering, got {:?}", other), + } + } + + #[test] + fn test_parse_nearest_with_spaced_vector_param_type() { + let input = r#" +query similar($q: Vector( 3 ) ?) 
{ + match { $d: Doc } + return { $d.id } + order { nearest($d.embedding, $q) } + limit 5 +} +"#; + let qf = parse_query(input).unwrap(); + let q = &qf.queries[0]; + assert_eq!(q.params[0].type_name, "Vector(3)"); + assert!(q.params[0].nullable); + } + + #[test] + fn test_parse_list_and_datetime_param_types() { + let input = r#" +query tasks($tags: [String], $days: [Date]?, $due_at: DateTime) { + match { $t: Task } + return { $t.slug } +} +"#; + let qf = parse_query(input).unwrap(); + let q = &qf.queries[0]; + assert_eq!(q.params[0].type_name, "[String]"); + assert!(!q.params[0].nullable); + assert_eq!(q.params[1].type_name, "[Date]"); + assert!(q.params[1].nullable); + assert_eq!(q.params[2].type_name, "DateTime"); + } + + #[test] + fn test_parse_nearest_rejects_direction_modifier() { + let input = r#" +query similar($q: Vector(3)) { + match { $d: Doc } + return { $d.id } + order { nearest($d.embedding, $q) desc } + limit 5 +} +"#; + assert!(parse_query(input).is_err()); + } + + #[test] + fn test_parse_nearest_expression_in_return_projection() { + let input = r#" +query similar($q: Vector(3)) { + match { $d: Doc } + return { $d.id, nearest($d.embedding, $q) as score } + order { nearest($d.embedding, $q) } + limit 5 +} +"#; + let qf = parse_query(input).unwrap(); + let q = &qf.queries[0]; + assert_eq!(q.return_clause.len(), 2); + match &q.return_clause[1].expr { + Expr::Nearest { + variable, + property, + query, + } => { + assert_eq!(variable, "d"); + assert_eq!(property, "embedding"); + assert!(matches!(query.as_ref(), Expr::Variable(v) if v == "q")); + } + other => panic!( + "expected nearest expression in return projection, got {:?}", + other + ), + } + assert_eq!(q.return_clause[1].alias.as_deref(), Some("score")); + } + + #[test] + fn test_parse_search_clause_sugar() { + let input = r#" +query q($q: String) { + match { + $s: Signal + search($s.summary, $q) + } + return { $s.slug } +} +"#; + let qf = parse_query(input).unwrap(); + let q = &qf.queries[0]; + 
assert_eq!(q.match_clause.len(), 2); + match &q.match_clause[1] { + Clause::Filter(Filter { left, op, right }) => { + assert_eq!(*op, CompOp::Eq); + assert!(matches!(right, Expr::Literal(Literal::Bool(true)))); + match left { + Expr::Search { field, query } => { + assert!(matches!( + field.as_ref(), + Expr::PropAccess { variable, property } if variable == "s" && property == "summary" + )); + assert!(matches!(query.as_ref(), Expr::Variable(v) if v == "q")); + } + other => panic!("expected search expression, got {:?}", other), + } + } + other => panic!("expected filter clause, got {:?}", other), + } + } + + #[test] + fn test_parse_fuzzy_clause_with_max_edits() { + let input = r#" +query q($q: String) { + match { + $s: Signal + fuzzy($s.summary, $q, 2) + } + return { $s.slug } +} +"#; + let qf = parse_query(input).unwrap(); + let q = &qf.queries[0]; + assert_eq!(q.match_clause.len(), 2); + match &q.match_clause[1] { + Clause::Filter(Filter { left, op, right }) => { + assert_eq!(*op, CompOp::Eq); + assert!(matches!(right, Expr::Literal(Literal::Bool(true)))); + match left { + Expr::Fuzzy { + field, + query, + max_edits, + } => { + assert!(matches!( + field.as_ref(), + Expr::PropAccess { variable, property } if variable == "s" && property == "summary" + )); + assert!(matches!(query.as_ref(), Expr::Variable(v) if v == "q")); + assert!(matches!( + max_edits.as_deref(), + Some(Expr::Literal(Literal::Integer(2))) + )); + } + other => panic!("expected fuzzy expression, got {:?}", other), + } + } + other => panic!("expected filter clause, got {:?}", other), + } + } + + #[test] + fn test_parse_match_text_clause_sugar() { + let input = r#" +query q($q: String) { + match { + $s: Signal + match_text($s.summary, $q) + } + return { $s.slug } +} +"#; + let qf = parse_query(input).unwrap(); + let q = &qf.queries[0]; + assert_eq!(q.match_clause.len(), 2); + match &q.match_clause[1] { + Clause::Filter(Filter { left, op, right }) => { + assert_eq!(*op, CompOp::Eq); + 
assert!(matches!(right, Expr::Literal(Literal::Bool(true)))); + match left { + Expr::MatchText { field, query } => { + assert!(matches!( + field.as_ref(), + Expr::PropAccess { variable, property } if variable == "s" && property == "summary" + )); + assert!(matches!(query.as_ref(), Expr::Variable(v) if v == "q")); + } + other => panic!("expected match_text expression, got {:?}", other), + } + } + other => panic!("expected filter clause, got {:?}", other), + } + } + + #[test] + fn test_parse_bm25_expression_in_order() { + let input = r#" +query q($q: String) { + match { $s: Signal } + return { $s.slug, bm25($s.summary, $q) as score } + order { bm25($s.summary, $q) desc } + limit 5 +} +"#; + let qf = parse_query(input).unwrap(); + let q = &qf.queries[0]; + assert_eq!(q.return_clause.len(), 2); + match &q.return_clause[1].expr { + Expr::Bm25 { field, query } => { + assert!(matches!( + field.as_ref(), + Expr::PropAccess { variable, property } if variable == "s" && property == "summary" + )); + assert!(matches!(query.as_ref(), Expr::Variable(v) if v == "q")); + } + other => panic!("expected bm25 expression, got {:?}", other), + } + assert_eq!(q.order_clause.len(), 1); + assert!(q.order_clause[0].descending); + } + + #[test] + fn test_parse_rrf_ordering_with_nearest_and_bm25() { + let input = r#" +query q($vq: Vector(3), $tq: String) { + match { $s: Signal } + return { $s.slug } + order { rrf(nearest($s.embedding, $vq), bm25($s.summary, $tq), 60) desc } + limit 5 +} +"#; + let qf = parse_query(input).unwrap(); + let q = &qf.queries[0]; + assert_eq!(q.order_clause.len(), 1); + assert!(q.order_clause[0].descending); + match &q.order_clause[0].expr { + Expr::Rrf { + primary, + secondary, + k, + } => { + assert!(matches!(primary.as_ref(), Expr::Nearest { .. })); + assert!(matches!(secondary.as_ref(), Expr::Bm25 { .. 
})); + assert!(matches!( + k.as_deref(), + Some(Expr::Literal(Literal::Integer(60))) + )); + } + other => panic!("expected rrf expression, got {:?}", other), + } + } + + #[test] + fn test_parse_error_diagnostic_has_span() { + let input = r#" +query q() { + match { + $p: Person + } + return { $p.name +} +"#; + let err = parse_query_diagnostic(input).unwrap_err(); + assert!(err.span.is_some()); + } +} diff --git a/crates/omnigraph-compiler/src/query/query.pest b/crates/omnigraph-compiler/src/query/query.pest new file mode 100644 index 0000000..4aba619 --- /dev/null +++ b/crates/omnigraph-compiler/src/query/query.pest @@ -0,0 +1,114 @@ +// NanoGraph Query Grammar (.gq files) + +WHITESPACE = _{ " " | "\t" | "\r" | "\n" } +COMMENT = _{ LINE_COMMENT | BLOCK_COMMENT } +LINE_COMMENT = _{ "//" ~ (!"\n" ~ ANY)* } +BLOCK_COMMENT = _{ "/*" ~ (!"*/" ~ ANY)* ~ "*/" } + +query_file = { SOI ~ query_decl* ~ EOI } + +query_decl = { + "query" ~ ident ~ "(" ~ param_list? ~ ")" ~ query_annotation* ~ "{" + ~ query_body + ~ "}" +} +query_annotation = { description_annotation | instruction_annotation } +description_annotation = { "@description" ~ "(" ~ string_lit ~ ")" } +instruction_annotation = { "@instruction" ~ "(" ~ string_lit ~ ")" } + +query_body = { read_query_body | mutation_stmt } +read_query_body = { + match_clause + ~ return_clause + ~ order_clause? + ~ limit_clause? +} + +mutation_stmt = { insert_stmt | update_stmt | delete_stmt } +insert_stmt = { "insert" ~ type_name ~ "{" ~ mutation_assignment+ ~ "}" } +update_stmt = { "update" ~ type_name ~ "set" ~ "{" ~ mutation_assignment+ ~ "}" ~ "where" ~ mutation_predicate } +delete_stmt = { "delete" ~ type_name ~ "where" ~ mutation_predicate } +mutation_assignment = { ident ~ ":" ~ match_value ~ ","? } +mutation_predicate = { ident ~ comp_op ~ match_value } + +param_list = { param ~ ("," ~ param)* } +param = { variable ~ ":" ~ type_ref } + +type_ref = { (list_type | base_type | vector_type) ~ "?"? 
} +list_type = { "[" ~ base_type ~ "]" } +vector_type = { "Vector" ~ "(" ~ integer ~ ")" } +base_type = { "String" | "Blob" | "Bool" | "I32" | "I64" | "U32" | "U64" | "F32" | "F64" | "DateTime" | "Date" } + +match_clause = { "match" ~ "{" ~ clause+ ~ "}" } + +clause = { negation | binding | traversal | filter | text_search_clause } +text_search_clause = { search_call | fuzzy_call | match_text_call } + +// Binding: $p: Person { name: "Alice" } +binding = { variable ~ ":" ~ type_name ~ ("{" ~ prop_match_list ~ "}")? } + +prop_match_list = { prop_match ~ ("," ~ prop_match)* ~ ","? } +prop_match = { ident ~ ":" ~ match_value } +match_value = { literal | variable | now_call } + +// Traversal: $p knows $f +traversal = { variable ~ edge_ident ~ traversal_bounds? ~ variable } +traversal_bounds = { "{" ~ integer ~ "," ~ integer? ~ "}" } + +// Filter: $f.age > 25 +filter = { expr ~ filter_op ~ expr } + +// Negation: not { ... } +negation = { "not" ~ "{" ~ clause+ ~ "}" } + +// Return clause — projections separated by commas or newlines +return_clause = { "return" ~ "{" ~ projection+ ~ "}" } +projection = { expr ~ ("as" ~ ident)? ~ ","? } + +// Order clause +order_clause = { "order" ~ "{" ~ ordering ~ ("," ~ ordering)* ~ "}" } +ordering = { nearest_ordering | (expr ~ order_dir?) } +nearest_ordering = { "nearest" ~ "(" ~ prop_access ~ "," ~ expr ~ ")" } +order_dir = { "asc" | "desc" } + +// Limit clause +limit_clause = { "limit" ~ integer } + +// Expressions +expr = { now_call | nearest_ordering | search_call | fuzzy_call | match_text_call | bm25_call | rrf_call | agg_call | prop_access | variable | literal | ident } +now_call = { "now" ~ "(" ~ ")" } +search_call = { "search" ~ "(" ~ expr ~ "," ~ expr ~ ")" } +fuzzy_call = { "fuzzy" ~ "(" ~ expr ~ "," ~ expr ~ ("," ~ expr)? 
~ ")" } +match_text_call = { "match_text" ~ "(" ~ expr ~ "," ~ expr ~ ")" } +bm25_call = { "bm25" ~ "(" ~ expr ~ "," ~ expr ~ ")" } +rank_expr = { nearest_ordering | bm25_call } +rrf_call = { "rrf" ~ "(" ~ rank_expr ~ "," ~ rank_expr ~ ("," ~ expr)? ~ ")" } + +prop_access = { variable ~ "." ~ ident } + +agg_call = { agg_func ~ "(" ~ expr ~ ")" } +agg_func = { "count" | "sum" | "avg" | "min" | "max" } + +comp_op = { ">=" | "<=" | "!=" | ">" | "<" | "=" } +filter_op = { "contains" | comp_op } + +// Terminals +variable = @{ "$" ~ (ident_chars | "_") } +ident_chars = @{ (ASCII_ALPHA_LOWER | "_") ~ (ASCII_ALPHANUMERIC | "_")* } + +// Edge identifier — lowercase start, same as ident but used in traversal context +// Must not match keywords +edge_ident = @{ !("not" ~ !ASCII_ALPHANUMERIC) ~ (ASCII_ALPHA_LOWER | "_") ~ (ASCII_ALPHANUMERIC | "_")* } + +type_name = @{ ASCII_ALPHA_UPPER ~ (ASCII_ALPHANUMERIC | "_")* } +ident = @{ (ASCII_ALPHA_LOWER | "_") ~ (ASCII_ALPHANUMERIC | "_")* } + +literal = { list_lit | datetime_lit | date_lit | string_lit | float_lit | integer | bool_lit } +date_lit = { "date" ~ "(" ~ string_lit ~ ")" } +datetime_lit = { "datetime" ~ "(" ~ string_lit ~ ")" } +list_lit = { "[" ~ (literal ~ ("," ~ literal)*)? ~ "]" } +string_lit = @{ "\"" ~ string_char* ~ "\"" } +string_char = @{ !("\"" | "\\") ~ ANY | "\\" ~ ANY } +float_lit = @{ ASCII_DIGIT+ ~ "." 
~ ASCII_DIGIT+ } +integer = @{ ASCII_DIGIT+ } +bool_lit = { "true" | "false" } diff --git a/crates/omnigraph-compiler/src/query/typecheck.rs b/crates/omnigraph-compiler/src/query/typecheck.rs new file mode 100644 index 0000000..3f5bc00 --- /dev/null +++ b/crates/omnigraph-compiler/src/query/typecheck.rs @@ -0,0 +1,2776 @@ +use std::collections::{HashMap, HashSet}; +use std::sync::Arc; + +use arrow_schema::{DataType, Field, Schema, SchemaRef}; + +use crate::catalog::Catalog; +use crate::error::{NanoError, Result}; +use crate::types::{Direction, PropType, ScalarType}; + +use super::ast::*; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum BindingKind { + Node, + Edge, +} + +#[derive(Debug, Clone)] +pub struct BoundVariable { + pub var_name: String, + pub type_name: String, + pub kind: BindingKind, +} + +#[derive(Debug, Clone)] +pub struct TypeContext { + pub bindings: HashMap, + pub aliases: HashMap, + pub traversals: Vec, +} + +#[derive(Debug, Clone)] +pub struct ResolvedTraversal { + pub src: String, + pub dst: String, + pub edge_type: String, + pub direction: Direction, + pub min_hops: u32, + pub max_hops: Option, +} + +#[derive(Debug, Clone, PartialEq)] +pub enum ResolvedType { + Scalar(PropType), + Node(String), + Aggregate, +} + +impl ResolvedType { + fn display_name(&self) -> String { + match self { + Self::Scalar(prop) => prop.display_name(), + Self::Node(type_name) => format!("node `{}`", type_name), + Self::Aggregate => "aggregate".to_string(), + } + } +} + +#[derive(Debug, Clone)] +pub struct MutationTypeContext { + pub target_type: String, +} + +#[derive(Debug, Clone)] +pub enum CheckedQuery { + Read(TypeContext), + Mutation(MutationTypeContext), +} + +pub fn typecheck_query_decl(catalog: &Catalog, query: &QueryDecl) -> Result { + if let Some(mutation) = &query.mutation { + let target_type = typecheck_mutation(catalog, mutation, &query.params)?; + Ok(CheckedQuery::Mutation(MutationTypeContext { target_type })) + } else { + 
Ok(CheckedQuery::Read(typecheck_read_query(catalog, query)?)) + } +} + +pub fn typecheck_query(catalog: &Catalog, query: &QueryDecl) -> Result { + if query.mutation.is_some() { + return Err(NanoError::Type( + "mutation query cannot be typechecked with read-query API".to_string(), + )); + } + typecheck_read_query(catalog, query) +} + +pub fn infer_query_result_schema( + catalog: &Catalog, + query: &QueryDecl, + ctx: &TypeContext, +) -> Result { + let params = parse_declared_param_types(&query.params)?; + let mut fields = Vec::with_capacity(query.return_clause.len()); + + for projection in &query.return_clause { + let field = infer_projection_field( + catalog, + &projection.expr, + projection.alias.as_deref(), + ctx, + ¶ms, + )?; + fields.push(field); + } + + Ok(Arc::new(Schema::new(fields))) +} + +fn parse_declared_param_types(params: &[Param]) -> Result> { + let mut out = HashMap::with_capacity(params.len()); + for p in params { + if p.name == NOW_PARAM_NAME { + return Err(NanoError::Type(format!( + "parameter name `${}` is reserved for runtime timestamp injection", + NOW_PARAM_NAME + ))); + } + let prop_type = + PropType::from_param_type_name(&p.type_name, p.nullable).ok_or_else(|| { + NanoError::Type(format!( + "unknown parameter type `{}` for `${}`", + p.type_name, p.name + )) + })?; + out.insert(p.name.clone(), prop_type); + } + Ok(out) +} + +fn typecheck_read_query(catalog: &Catalog, query: &QueryDecl) -> Result { + let mut ctx = TypeContext { + bindings: HashMap::new(), + aliases: HashMap::new(), + traversals: Vec::new(), + }; + let mut alias_exprs: HashMap = HashMap::new(); + + let params = parse_declared_param_types(&query.params)?; + + // Typecheck match clauses + typecheck_clauses(catalog, &query.match_clause, &mut ctx, ¶ms, false)?; + + // Typecheck return projections + for proj in &query.return_clause { + let resolved = resolve_expr_type(catalog, &proj.expr, &ctx, ¶ms)?; + if let Some(alias) = &proj.alias { + ctx.aliases.insert(alias.clone(), resolved); 
+ alias_exprs.insert(alias.clone(), &proj.expr); + } + } + + // Typecheck order expressions + for ord in &query.order_clause { + resolve_expr_type(catalog, &ord.expr, &ctx, ¶ms)?; + } + + let has_standalone_nearest = query + .order_clause + .iter() + .any(|ord| expr_contains_standalone_nearest_with_aliases(&ord.expr, &alias_exprs)); + let has_rrf = query + .order_clause + .iter() + .any(|ord| expr_contains_rrf_with_aliases(&ord.expr, &alias_exprs)); + if has_rrf && query.limit.is_none() { + return Err(NanoError::Type( + "T21: rrf ordering requires a limit clause".to_string(), + )); + } + if has_standalone_nearest && query.limit.is_none() { + return Err(NanoError::Type( + "T17: nearest ordering requires a limit clause".to_string(), + )); + } + if has_standalone_nearest + && query + .order_clause + .iter() + .any(|ord| matches!(ord.expr, Expr::AliasRef(_))) + { + return Err(NanoError::Type( + "T18: alias-based ordering is not supported together with nearest in phase 1" + .to_string(), + )); + } + + Ok(ctx) +} + +fn typecheck_mutation(catalog: &Catalog, mutation: &Mutation, params: &[Param]) -> Result { + let param_types = parse_declared_param_types(params)?; + + match mutation { + Mutation::Insert(insert) => { + if insert.assignments.is_empty() { + return Err(NanoError::Type( + "T10: insert mutation requires at least one assignment".to_string(), + )); + } + + ensure_no_duplicate_assignment_names(&insert.assignments)?; + + if let Some(node_type) = catalog.node_types.get(&insert.type_name) { + for assignment in &insert.assignments { + let prop_type = + node_type + .properties + .get(&assignment.property) + .ok_or_else(|| { + NanoError::Type(format!( + "T11: type `{}` has no property `{}`", + insert.type_name, assignment.property + )) + })?; + check_match_value_type( + &assignment.value, + ¶m_types, + prop_type, + &assignment.property, + )?; + } + + let assigned_props: HashSet<&str> = insert + .assignments + .iter() + .map(|assignment| assignment.property.as_str()) + 
.collect(); + for (prop_name, prop_type) in &node_type.properties { + if prop_type.nullable { + continue; + } + if assigned_props.contains(prop_name.as_str()) { + continue; + } + + if let Some(source_prop) = node_type.embed_sources.get(prop_name) { + if assigned_props.contains(source_prop.as_str()) { + continue; + } + return Err(NanoError::Type(format!( + "T12: insert for `{}` must provide non-nullable property `{}` or @embed source `{}`", + insert.type_name, prop_name, source_prop + ))); + } + + return Err(NanoError::Type(format!( + "T12: insert for `{}` must provide non-nullable property `{}`", + insert.type_name, prop_name + ))); + } + return Ok(insert.type_name.clone()); + } + + if let Some(edge_type) = catalog.edge_types.get(&insert.type_name) { + let mut has_from = false; + let mut has_to = false; + + for assignment in &insert.assignments { + match assignment.property.as_str() { + "from" => { + has_from = true; + check_match_value_type( + &assignment.value, + ¶m_types, + &PropType::scalar(ScalarType::String, false), + "from", + )?; + } + "to" => { + has_to = true; + check_match_value_type( + &assignment.value, + ¶m_types, + &PropType::scalar(ScalarType::String, false), + "to", + )?; + } + _ => { + let prop_type = edge_type + .properties + .get(&assignment.property) + .ok_or_else(|| { + NanoError::Type(format!( + "T11: type `{}` has no property `{}`", + insert.type_name, assignment.property + )) + })?; + check_match_value_type( + &assignment.value, + ¶m_types, + prop_type, + &assignment.property, + )?; + } + } + } + + if !has_from { + return Err(NanoError::Type(format!( + "T12: insert for `{}` must provide required endpoint `from`", + insert.type_name + ))); + } + if !has_to { + return Err(NanoError::Type(format!( + "T12: insert for `{}` must provide required endpoint `to`", + insert.type_name + ))); + } + + for (prop_name, prop_type) in &edge_type.properties { + if prop_type.nullable { + continue; + } + if !insert.assignments.iter().any(|a| &a.property == 
prop_name) { + return Err(NanoError::Type(format!( + "T12: insert for `{}` must provide non-nullable property `{}`", + insert.type_name, prop_name + ))); + } + } + return Ok(insert.type_name.clone()); + } + + Err(NanoError::Type(format!( + "T10: unknown node/edge type `{}`", + insert.type_name + ))) + } + Mutation::Update(update) => { + let node_type = if let Some(node_type) = catalog.node_types.get(&update.type_name) { + node_type + } else if catalog.edge_types.contains_key(&update.type_name) { + return Err(NanoError::Type(format!( + "T16: update mutation for edge type `{}` is not supported", + update.type_name + ))); + } else { + return Err(NanoError::Type(format!( + "T10: unknown node/edge type `{}`", + update.type_name + ))); + }; + + if update.assignments.is_empty() { + return Err(NanoError::Type( + "T10: update mutation requires at least one assignment".to_string(), + )); + } + ensure_no_duplicate_assignment_names(&update.assignments)?; + + for assignment in &update.assignments { + let prop_type = + node_type + .properties + .get(&assignment.property) + .ok_or_else(|| { + NanoError::Type(format!( + "T11: type `{}` has no property `{}`", + update.type_name, assignment.property + )) + })?; + check_match_value_type( + &assignment.value, + ¶m_types, + prop_type, + &assignment.property, + )?; + } + + typecheck_mutation_predicate( + &update.type_name, + &update.predicate, + node_type, + ¶m_types, + )?; + Ok(update.type_name.clone()) + } + Mutation::Delete(delete) => { + if let Some(node_type) = catalog.node_types.get(&delete.type_name) { + typecheck_mutation_predicate( + &delete.type_name, + &delete.predicate, + node_type, + ¶m_types, + )?; + Ok(delete.type_name.clone()) + } else if let Some(edge_type) = catalog.edge_types.get(&delete.type_name) { + typecheck_edge_mutation_predicate( + &delete.type_name, + &delete.predicate, + edge_type, + ¶m_types, + )?; + Ok(delete.type_name.clone()) + } else { + Err(NanoError::Type(format!( + "T10: unknown node/edge type `{}`", 
+ delete.type_name + ))) + } + } + } +} + +fn ensure_no_duplicate_assignment_names(assignments: &[MutationAssignment]) -> Result<()> { + let mut seen = std::collections::HashSet::new(); + for assignment in assignments { + if !seen.insert(&assignment.property) { + return Err(NanoError::Type(format!( + "T13: duplicate assignment for property `{}`", + assignment.property + ))); + } + } + Ok(()) +} + +fn typecheck_mutation_predicate( + type_name: &str, + predicate: &MutationPredicate, + node_type: &crate::catalog::NodeType, + param_types: &HashMap, +) -> Result<()> { + let prop_type = node_type + .properties + .get(&predicate.property) + .ok_or_else(|| { + NanoError::Type(format!( + "T11: type `{}` has no property `{}`", + type_name, predicate.property + )) + })?; + if matches!(prop_type.scalar, ScalarType::Blob) { + return Err(NanoError::Type(format!( + "T11: blob property `{}` cannot be used in WHERE predicates", + predicate.property + ))); + } + check_match_value_type( + &predicate.value, + param_types, + prop_type, + &predicate.property, + )?; + Ok(()) +} + +fn typecheck_edge_mutation_predicate( + type_name: &str, + predicate: &MutationPredicate, + edge_type: &crate::catalog::EdgeType, + param_types: &HashMap, +) -> Result<()> { + if predicate.property == "from" || predicate.property == "to" { + return check_match_value_type( + &predicate.value, + param_types, + &PropType::scalar(ScalarType::String, false), + &predicate.property, + ); + } + + let prop_type = edge_type + .properties + .get(&predicate.property) + .ok_or_else(|| { + NanoError::Type(format!( + "T11: type `{}` has no property `{}`", + type_name, predicate.property + )) + })?; + check_match_value_type( + &predicate.value, + param_types, + prop_type, + &predicate.property, + )?; + Ok(()) +} + +fn check_match_value_type( + value: &MatchValue, + params: &HashMap, + expected: &PropType, + property: &str, +) -> Result<()> { + match value { + MatchValue::Literal(lit) => check_literal_type(lit, expected, 
property), + MatchValue::Variable(v) => { + let Some(actual) = params.get(v) else { + return Err(NanoError::Type(format!( + "T14: mutation variable `${}` must be a declared query parameter", + v + ))); + }; + // Allow String param → Blob property (URI assignment) + let compatible = types_compatible(actual, expected) + || (matches!(expected.scalar, ScalarType::Blob) + && matches!(actual.scalar, ScalarType::String) + && !actual.list); + if !compatible { + return Err(NanoError::Type(format!( + "T7: cannot assign/compare {} with {} for property `{}`", + actual.display_name(), + expected.display_name(), + property + ))); + } + Ok(()) + } + MatchValue::Now => check_now_match_value_type(expected, property), + } +} + +fn check_now_match_value_type(expected: &PropType, property: &str) -> Result<()> { + if expected.list || expected.scalar != ScalarType::DateTime { + return Err(NanoError::Type(format!( + "T7: cannot assign/compare DateTime with {} for property `{}`", + expected.display_name(), + property + ))); + } + Ok(()) +} + +fn typecheck_clauses( + catalog: &Catalog, + clauses: &[Clause], + ctx: &mut TypeContext, + params: &HashMap, + _in_negation: bool, +) -> Result<()> { + for clause in clauses { + match clause { + Clause::Binding(b) => typecheck_binding(catalog, b, ctx, params)?, + Clause::Traversal(t) => typecheck_traversal(catalog, t, ctx)?, + Clause::Filter(f) => typecheck_filter(catalog, f, ctx, params)?, + Clause::Negation(inner) => { + // T9: at least one variable in the negation block must be bound outside + let outer_vars: Vec = ctx.bindings.keys().cloned().collect(); + + // Typecheck inner clauses in a copy of ctx + let mut inner_ctx = ctx.clone(); + typecheck_clauses(catalog, inner, &mut inner_ctx, params, true)?; + + // Check T9 + let mut has_outer = false; + for clause in inner { + match clause { + Clause::Traversal(t) => { + if outer_vars.contains(&t.src) || outer_vars.contains(&t.dst) { + has_outer = true; + } + } + Clause::Filter(f) => { + if 
expr_references_any(&f.left, &outer_vars) + || expr_references_any(&f.right, &outer_vars) + { + has_outer = true; + } + } + Clause::Binding(b) => { + if outer_vars.contains(&b.variable) { + has_outer = true; + } + } + _ => {} + } + } + if !has_outer { + return Err(NanoError::Type( + "T9: negation block must reference at least one outer-bound variable" + .to_string(), + )); + } + } + } + } + Ok(()) +} + +fn typecheck_binding( + catalog: &Catalog, + binding: &Binding, + ctx: &mut TypeContext, + params: &HashMap, +) -> Result<()> { + // T1: binding type must exist in catalog + if !catalog.node_types.contains_key(&binding.type_name) { + return Err(NanoError::Type(format!( + "T1: unknown node type `{}`", + binding.type_name + ))); + } + + let node_type = &catalog.node_types[&binding.type_name]; + + // T2 + T3: property match fields must exist and have correct types + for pm in &binding.prop_matches { + let prop = node_type.properties.get(&pm.prop_name).ok_or_else(|| { + NanoError::Type(format!( + "T2: type `{}` has no property `{}`", + binding.type_name, pm.prop_name + )) + })?; + + if matches!(prop.scalar, ScalarType::Blob) { + return Err(NanoError::Type(format!( + "T3: blob property `{}.{}` cannot be used in match patterns", + binding.type_name, pm.prop_name + ))); + } + + // T3: check value type matches property type + match &pm.value { + MatchValue::Literal(lit) => { + check_binding_literal_type(lit, prop, &pm.prop_name)?; + } + MatchValue::Variable(v) => { + if let Some(actual) = params.get(v) { + check_binding_variable_type(actual, prop, &pm.prop_name)?; + } + } + MatchValue::Now => check_now_match_value_type(prop, &pm.prop_name)?, + } + } + + // Don't overwrite if already bound to same type (re-binding same var is OK) + if let Some(existing) = ctx.bindings.get(&binding.variable) + && existing.type_name != binding.type_name + { + return Err(NanoError::Type(format!( + "variable `${}` already bound to type `{}`, cannot rebind to `{}`", + binding.variable, 
existing.type_name, binding.type_name + ))); + } + + ctx.bindings.insert( + binding.variable.clone(), + BoundVariable { + var_name: binding.variable.clone(), + type_name: binding.type_name.clone(), + kind: BindingKind::Node, + }, + ); + + Ok(()) +} + +fn check_binding_literal_type(lit: &Literal, expected: &PropType, property: &str) -> Result<()> { + if expected.list { + let lit_type = literal_type(lit)?; + if lit_type.list { + return Err(NanoError::Type(format!( + "T3: list equality is not supported for property `{}`; use a scalar value to match list membership", + property + ))); + } + + let expected_member = PropType::scalar(expected.scalar, expected.nullable); + if !types_compatible(&lit_type, &expected_member) { + return Err(NanoError::Type(format!( + "T3: property `{}` has type {} but membership match got {}", + property, + expected.display_name(), + lit_type.display_name() + ))); + } + return Ok(()); + } + + check_literal_type(lit, expected, property) +} + +fn check_binding_variable_type( + actual: &PropType, + expected: &PropType, + property: &str, +) -> Result<()> { + if expected.list { + if actual.list { + return Err(NanoError::Type(format!( + "T7: list equality is not supported for property `{}`; use a scalar parameter for membership matching", + property + ))); + } + + let expected_member = PropType::scalar(expected.scalar, expected.nullable); + if !types_compatible(actual, &expected_member) { + return Err(NanoError::Type(format!( + "T7: cannot compare {} membership against {} for property `{}`", + actual.display_name(), + expected.display_name(), + property + ))); + } + return Ok(()); + } + + if !types_compatible(actual, expected) { + return Err(NanoError::Type(format!( + "T7: cannot assign/compare {} with {} for property `{}`", + actual.display_name(), + expected.display_name(), + property + ))); + } + Ok(()) +} + +fn typecheck_traversal( + catalog: &Catalog, + traversal: &Traversal, + ctx: &mut TypeContext, +) -> Result<()> { + // T4: edge must exist 
+ let edge = catalog + .lookup_edge_by_name(&traversal.edge_name) + .ok_or_else(|| { + NanoError::Type(format!("T4: unknown edge type `{}`", traversal.edge_name)) + })?; + + if traversal.min_hops == 0 { + return Err(NanoError::Type( + "T15: traversal min hop bound must be >= 1".to_string(), + )); + } + if let Some(max_hops) = traversal.max_hops { + if max_hops < traversal.min_hops { + return Err(NanoError::Type(format!( + "T15: invalid traversal bounds {{{},{}}}; max must be >= min", + traversal.min_hops, max_hops + ))); + } + } else { + return Err(NanoError::Type( + "T15: unbounded traversal is disabled; use bounded traversal {min,max}".to_string(), + )); + } + + // Determine direction based on bound variables and edge endpoints + let src_bound = ctx.bindings.get(&traversal.src); + let dst_bound = ctx.bindings.get(&traversal.dst); + + let direction; + + if let Some(src_bv) = src_bound { + // T5: src type must match one endpoint of the edge + if src_bv.type_name == edge.from_type { + direction = Direction::Out; + // dst should be edge.to_type + bind_traversal_endpoint(ctx, &traversal.dst, &edge.to_type, edge)?; + } else if src_bv.type_name == edge.to_type { + direction = Direction::In; + // dst should be edge.from_type + bind_traversal_endpoint(ctx, &traversal.dst, &edge.from_type, edge)?; + } else { + return Err(NanoError::Type(format!( + "T5: variable `${}` has type `{}`, which is not an endpoint of edge `{}: {} -> {}`", + traversal.src, src_bv.type_name, edge.name, edge.from_type, edge.to_type + ))); + } + } else if let Some(dst_bv) = dst_bound { + // dst is bound, infer direction from it + if dst_bv.type_name == edge.to_type { + direction = Direction::Out; + bind_traversal_endpoint(ctx, &traversal.src, &edge.from_type, edge)?; + } else if dst_bv.type_name == edge.from_type { + direction = Direction::In; + bind_traversal_endpoint(ctx, &traversal.src, &edge.to_type, edge)?; + } else { + return Err(NanoError::Type(format!( + "T5: variable `${}` has type `{}`, 
which is not an endpoint of edge `{}: {} -> {}`", + traversal.dst, dst_bv.type_name, edge.name, edge.from_type, edge.to_type + ))); + } + } else { + // Neither bound — default Out direction, bind both + direction = Direction::Out; + bind_traversal_endpoint(ctx, &traversal.src, &edge.from_type, edge)?; + bind_traversal_endpoint(ctx, &traversal.dst, &edge.to_type, edge)?; + } + + ctx.traversals.push(ResolvedTraversal { + src: traversal.src.clone(), + dst: traversal.dst.clone(), + edge_type: edge.name.clone(), + direction, + min_hops: traversal.min_hops, + max_hops: traversal.max_hops, + }); + + Ok(()) +} + +fn bind_traversal_endpoint( + ctx: &mut TypeContext, + var: &str, + expected_type: &str, + edge: &crate::catalog::EdgeType, +) -> Result<()> { + if var == "_" { + return Ok(()); // anonymous variable + } + if let Some(existing) = ctx.bindings.get(var) { + if existing.type_name != expected_type { + return Err(NanoError::Type(format!( + "T5: variable `${}` has type `{}` but edge `{}` expects `{}`", + var, existing.type_name, edge.name, expected_type + ))); + } + } else { + ctx.bindings.insert( + var.to_string(), + BoundVariable { + var_name: var.to_string(), + type_name: expected_type.to_string(), + kind: BindingKind::Node, + }, + ); + } + Ok(()) +} + +fn typecheck_filter( + catalog: &Catalog, + filter: &Filter, + ctx: &TypeContext, + params: &HashMap, +) -> Result<()> { + let left_type = resolve_expr_type(catalog, &filter.left, ctx, params)?; + let right_type = resolve_expr_type(catalog, &filter.right, ctx, params)?; + + if let (ResolvedType::Scalar(l), ResolvedType::Scalar(r)) = (&left_type, &right_type) { + if filter.op == CompOp::Contains { + if !l.list { + return Err(NanoError::Type(format!( + "T7: contains requires a list property on the left, got {}", + l.display_name() + ))); + } + if r.list { + return Err(NanoError::Type( + "T7: contains requires a scalar right operand".to_string(), + )); + } + if matches!(l.scalar, ScalarType::Vector(_)) + || 
matches!(r.scalar, ScalarType::Vector(_)) + { + return Err(NanoError::Type( + "T7: vector membership filters are not supported".to_string(), + )); + } + + let expected_member = PropType::scalar(l.scalar, l.nullable); + if !types_compatible(&expected_member, r) { + return Err(NanoError::Type(format!( + "T7: cannot test membership of {} in {}", + r.display_name(), + l.display_name() + ))); + } + return Ok(()); + } + + // T7: check type compatibility + if l.list || r.list { + return Err(NanoError::Type( + "T7: list comparisons in filters are not supported; use `contains` for list membership".to_string(), + )); + } + if matches!(l.scalar, ScalarType::Vector(_)) || matches!(r.scalar, ScalarType::Vector(_)) { + return Err(NanoError::Type( + "T7: vector comparisons in filters are not supported".to_string(), + )); + } + if matches!(l.scalar, ScalarType::Blob) || matches!(r.scalar, ScalarType::Blob) { + return Err(NanoError::Type( + "T7: blob comparisons in filters are not supported".to_string(), + )); + } + if !types_compatible(l, r) { + return Err(NanoError::Type(format!( + "T7: cannot compare {} with {}", + l.display_name(), + r.display_name() + ))); + } + } else { + return Err(NanoError::Type(format!( + "T7: filter comparisons require scalar operands, got {} and {}", + left_type.display_name(), + right_type.display_name() + ))); + } + + Ok(()) +} + +fn resolve_expr_type( + catalog: &Catalog, + expr: &Expr, + ctx: &TypeContext, + params: &HashMap, +) -> Result { + match expr { + Expr::Now => Ok(ResolvedType::Scalar(PropType::scalar( + ScalarType::DateTime, + false, + ))), + Expr::PropAccess { variable, property } => { + // T6: variable must be bound and property must exist + let bv = ctx.bindings.get(variable).ok_or_else(|| { + NanoError::Type(format!("T6: variable `${}` is not bound", variable)) + })?; + + let node_type = catalog.node_types.get(&bv.type_name).ok_or_else(|| { + NanoError::Type(format!("T6: type `{}` not found in catalog", bv.type_name)) + })?; + + let 
prop = node_type.properties.get(property).ok_or_else(|| { + NanoError::Type(format!( + "T6: type `{}` has no property `{}`", + bv.type_name, property + )) + })?; + + Ok(ResolvedType::Scalar(prop.clone())) + } + Expr::Nearest { + variable, + property, + query, + } => { + let node_binding = ctx.bindings.get(variable).ok_or_else(|| { + NanoError::Type(format!("T15: variable `${}` is not bound", variable)) + })?; + let node_type = catalog + .node_types + .get(&node_binding.type_name) + .ok_or_else(|| { + NanoError::Type(format!( + "T15: type `{}` not found in catalog", + node_binding.type_name + )) + })?; + let prop_type = node_type.properties.get(property).ok_or_else(|| { + NanoError::Type(format!( + "T15: type `{}` has no property `{}`", + node_binding.type_name, property + )) + })?; + let vector_dim = match prop_type.scalar { + ScalarType::Vector(dim) => dim, + _ => { + return Err(NanoError::Type(format!( + "T15: nearest requires a Vector property, got {}.{}: {}", + node_binding.type_name, + property, + prop_type.display_name() + ))); + } + }; + if prop_type.list { + return Err(NanoError::Type( + "T15: nearest does not support list-wrapped vectors".to_string(), + )); + } + + if let Expr::Literal(lit) = query.as_ref() + && let Some(dim) = numeric_vector_literal_dim(lit) + { + if dim != vector_dim { + return Err(NanoError::Type(format!( + "T15: nearest vector dimension mismatch: property is Vector({}), query literal has {} elements", + vector_dim, dim + ))); + } + return Ok(ResolvedType::Scalar(PropType::scalar( + ScalarType::F64, + false, + ))); + } + + let query_type = resolve_expr_type(catalog, query, ctx, params)?; + match query_type { + ResolvedType::Scalar(s) if matches!(s.scalar, ScalarType::Vector(_)) && !s.list => { + let qdim = match s.scalar { + ScalarType::Vector(dim) => dim, + _ => unreachable!(), + }; + if qdim != vector_dim { + return Err(NanoError::Type(format!( + "T15: nearest vector dimension mismatch: property is Vector({}), query is Vector({})", + 
vector_dim, qdim + ))); + } + } + ResolvedType::Scalar(s) if s.scalar == ScalarType::String && !s.list => { + // query-time string embedding is supported by the runtime executor + } + ResolvedType::Scalar(s) => { + return Err(NanoError::Type(format!( + "T15: nearest query must be Vector({}) or String, got {}", + vector_dim, + s.display_name() + ))); + } + _ => { + return Err(NanoError::Type( + "T15: nearest query must be a scalar expression".to_string(), + )); + } + } + + Ok(ResolvedType::Scalar(PropType::scalar( + ScalarType::F64, + false, + ))) + } + Expr::Search { field, query } => { + let field_type = resolve_expr_type(catalog, field, ctx, params)?; + match field_type { + ResolvedType::Scalar(s) if s.scalar == ScalarType::String && !s.list => {} + ResolvedType::Scalar(s) => { + return Err(NanoError::Type(format!( + "T19: search field must be String, got {}", + s.display_name() + ))); + } + _ => { + return Err(NanoError::Type( + "T19: search field must be a scalar String expression".to_string(), + )); + } + } + + let query_type = resolve_expr_type(catalog, query, ctx, params)?; + match query_type { + ResolvedType::Scalar(s) if s.scalar == ScalarType::String && !s.list => {} + ResolvedType::Scalar(s) => { + return Err(NanoError::Type(format!( + "T19: search query must be String, got {}", + s.display_name() + ))); + } + _ => { + return Err(NanoError::Type( + "T19: search query must be a scalar String expression".to_string(), + )); + } + } + + Ok(ResolvedType::Scalar(PropType::scalar( + ScalarType::Bool, + false, + ))) + } + Expr::Fuzzy { + field, + query, + max_edits, + } => { + let field_type = resolve_expr_type(catalog, field, ctx, params)?; + match field_type { + ResolvedType::Scalar(s) if s.scalar == ScalarType::String && !s.list => {} + ResolvedType::Scalar(s) => { + return Err(NanoError::Type(format!( + "T19: fuzzy field must be String, got {}", + s.display_name() + ))); + } + _ => { + return Err(NanoError::Type( + "T19: fuzzy field must be a scalar String 
expression".to_string(), + )); + } + } + + let query_type = resolve_expr_type(catalog, query, ctx, params)?; + match query_type { + ResolvedType::Scalar(s) if s.scalar == ScalarType::String && !s.list => {} + ResolvedType::Scalar(s) => { + return Err(NanoError::Type(format!( + "T19: fuzzy query must be String, got {}", + s.display_name() + ))); + } + _ => { + return Err(NanoError::Type( + "T19: fuzzy query must be a scalar String expression".to_string(), + )); + } + } + + if let Some(max_edits_expr) = max_edits { + let max_edits_type = resolve_expr_type(catalog, max_edits_expr, ctx, params)?; + match max_edits_type { + ResolvedType::Scalar(s) + if !s.list + && matches!( + s.scalar, + ScalarType::I32 + | ScalarType::I64 + | ScalarType::U32 + | ScalarType::U64 + ) => {} + ResolvedType::Scalar(s) => { + return Err(NanoError::Type(format!( + "T19: fuzzy max_edits must be an integer scalar, got {}", + s.display_name() + ))); + } + _ => { + return Err(NanoError::Type( + "T19: fuzzy max_edits must be an integer scalar expression".to_string(), + )); + } + } + } + + Ok(ResolvedType::Scalar(PropType::scalar( + ScalarType::Bool, + false, + ))) + } + Expr::MatchText { field, query } => { + let field_type = resolve_expr_type(catalog, field, ctx, params)?; + match field_type { + ResolvedType::Scalar(s) if s.scalar == ScalarType::String && !s.list => {} + ResolvedType::Scalar(s) => { + return Err(NanoError::Type(format!( + "T20: match_text field must be String, got {}", + s.display_name() + ))); + } + _ => { + return Err(NanoError::Type( + "T20: match_text field must be a scalar String expression".to_string(), + )); + } + } + + let query_type = resolve_expr_type(catalog, query, ctx, params)?; + match query_type { + ResolvedType::Scalar(s) if s.scalar == ScalarType::String && !s.list => {} + ResolvedType::Scalar(s) => { + return Err(NanoError::Type(format!( + "T20: match_text query must be String, got {}", + s.display_name() + ))); + } + _ => { + return Err(NanoError::Type( + 
"T20: match_text query must be a scalar String expression".to_string(), + )); + } + } + + Ok(ResolvedType::Scalar(PropType::scalar( + ScalarType::Bool, + false, + ))) + } + Expr::Bm25 { field, query } => { + let field_type = resolve_expr_type(catalog, field, ctx, params)?; + match field_type { + ResolvedType::Scalar(s) if s.scalar == ScalarType::String && !s.list => {} + ResolvedType::Scalar(s) => { + return Err(NanoError::Type(format!( + "T20: bm25 field must be String, got {}", + s.display_name() + ))); + } + _ => { + return Err(NanoError::Type( + "T20: bm25 field must be a scalar String expression".to_string(), + )); + } + } + + let query_type = resolve_expr_type(catalog, query, ctx, params)?; + match query_type { + ResolvedType::Scalar(s) if s.scalar == ScalarType::String && !s.list => {} + ResolvedType::Scalar(s) => { + return Err(NanoError::Type(format!( + "T20: bm25 query must be String, got {}", + s.display_name() + ))); + } + _ => { + return Err(NanoError::Type( + "T20: bm25 query must be a scalar String expression".to_string(), + )); + } + } + + Ok(ResolvedType::Scalar(PropType::scalar( + ScalarType::F64, + false, + ))) + } + Expr::Rrf { + primary, + secondary, + k, + } => { + if !matches!(primary.as_ref(), Expr::Nearest { .. } | Expr::Bm25 { .. }) { + return Err(NanoError::Type( + "T21: rrf primary expression must be nearest(...) or bm25(...)".to_string(), + )); + } + if !matches!(secondary.as_ref(), Expr::Nearest { .. } | Expr::Bm25 { .. }) { + return Err(NanoError::Type( + "T21: rrf secondary expression must be nearest(...) 
or bm25(...)".to_string(), + )); + } + + let primary_ty = resolve_expr_type(catalog, primary, ctx, params)?; + let secondary_ty = resolve_expr_type(catalog, secondary, ctx, params)?; + + for ty in [primary_ty, secondary_ty] { + match ty { + ResolvedType::Scalar(s) if s.scalar == ScalarType::F64 && !s.list => {} + ResolvedType::Scalar(s) => { + return Err(NanoError::Type(format!( + "T21: rrf rank expressions must evaluate to F64, got {}", + s.display_name() + ))); + } + _ => { + return Err(NanoError::Type( + "T21: rrf rank expressions must be scalar numeric expressions" + .to_string(), + )); + } + } + } + + if let Some(k_expr) = k { + let k_type = resolve_expr_type(catalog, k_expr, ctx, params)?; + match k_type { + ResolvedType::Scalar(s) + if !s.list + && matches!( + s.scalar, + ScalarType::I32 + | ScalarType::I64 + | ScalarType::U32 + | ScalarType::U64 + ) => {} + ResolvedType::Scalar(s) => { + return Err(NanoError::Type(format!( + "T21: rrf k must be an integer scalar, got {}", + s.display_name() + ))); + } + _ => { + return Err(NanoError::Type( + "T21: rrf k must be an integer scalar expression".to_string(), + )); + } + } + if let Expr::Literal(Literal::Integer(v)) = k_expr.as_ref() + && *v <= 0 + { + return Err(NanoError::Type( + "T21: rrf k must be greater than 0".to_string(), + )); + } + } + + Ok(ResolvedType::Scalar(PropType::scalar( + ScalarType::F64, + false, + ))) + } + Expr::Variable(name) => { + // Could be a query parameter or a bound variable + if let Some(prop_type) = params.get(name) { + Ok(ResolvedType::Scalar(prop_type.clone())) + } else if let Some(bv) = ctx.bindings.get(name) { + Ok(ResolvedType::Node(bv.type_name.clone())) + } else { + Err(NanoError::Type(format!( + "variable `${}` is not bound", + name + ))) + } + } + Expr::Literal(lit) => Ok(ResolvedType::Scalar(literal_type(lit)?)), + Expr::Aggregate { func, arg } => { + let arg_type = resolve_expr_type(catalog, arg, ctx, params)?; + + // T8: sum/avg/min/max require numeric + match func { + 
AggFunc::Sum | AggFunc::Avg | AggFunc::Min | AggFunc::Max => { + if let ResolvedType::Scalar(s) = &arg_type + && (s.list || !s.scalar.is_numeric()) + { + return Err(NanoError::Type(format!( + "T8: {} requires numeric type, got {}", + func, + s.display_name() + ))); + } + } + _ => {} // count works on any type + } + + Ok(ResolvedType::Aggregate) + } + Expr::AliasRef(name) => { + // Check if it's a known alias from return clause + if let Some(resolved) = ctx.aliases.get(name) { + Ok(resolved.clone()) + } else { + // Might be an alias not yet registered (forward reference in order) + Ok(ResolvedType::Aggregate) + } + } + } +} + +fn infer_projection_field( + catalog: &Catalog, + expr: &Expr, + alias: Option<&str>, + ctx: &TypeContext, + params: &HashMap, +) -> Result { + let name = projection_name(expr, alias); + match expr { + Expr::Aggregate { func, arg } => { + let (data_type, nullable) = match func { + AggFunc::Count => (DataType::Int64, true), + AggFunc::Avg => (DataType::Float64, true), + _ => { + let resolved = resolve_expr_type(catalog, arg, ctx, params)?; + let (data_type, _) = resolved_type_to_field_shape(catalog, &resolved)?; + (data_type, true) + } + }; + Ok(Field::new(name, data_type, nullable)) + } + _ => { + let resolved = resolve_expr_type(catalog, expr, ctx, params)?; + let (data_type, nullable) = resolved_type_to_field_shape(catalog, &resolved)?; + Ok(Field::new(name, data_type, nullable)) + } + } +} + +fn projection_name(expr: &Expr, alias: Option<&str>) -> String { + if let Some(alias) = alias { + return alias.to_string(); + } + + match expr { + Expr::Now => "now".to_string(), + Expr::PropAccess { property, .. } => property.clone(), + Expr::Variable(variable) => variable.clone(), + Expr::Literal(_) => "literal".to_string(), + Expr::Nearest { .. } => "nearest".to_string(), + Expr::Search { .. } => "search".to_string(), + Expr::Fuzzy { .. } => "fuzzy".to_string(), + Expr::MatchText { .. } => "match_text".to_string(), + Expr::Bm25 { .. 
} => "bm25".to_string(), + Expr::Rrf { .. } => "rrf".to_string(), + Expr::Aggregate { func, .. } => func.to_string(), + Expr::AliasRef(name) => name.clone(), + } +} + +fn resolved_type_to_field_shape( + catalog: &Catalog, + resolved: &ResolvedType, +) -> Result<(DataType, bool)> { + match resolved { + ResolvedType::Scalar(prop_type) => Ok((prop_type.to_arrow(), prop_type.nullable)), + ResolvedType::Node(type_name) => { + let node_type = catalog.node_types.get(type_name).ok_or_else(|| { + NanoError::Type(format!("type `{}` not found in catalog", type_name)) + })?; + let fields: Vec = node_type + .arrow_schema + .fields() + .iter() + .map(|field| field.as_ref().clone()) + .collect(); + Ok((DataType::Struct(fields.into()), false)) + } + ResolvedType::Aggregate => Ok((DataType::Int64, true)), + } +} + +fn literal_type(lit: &Literal) -> Result { + match lit { + Literal::String(_) => Ok(PropType::scalar(ScalarType::String, false)), + Literal::Integer(_) => Ok(PropType::scalar(ScalarType::I64, false)), + Literal::Float(_) => Ok(PropType::scalar(ScalarType::F64, false)), + Literal::Bool(_) => Ok(PropType::scalar(ScalarType::Bool, false)), + Literal::Date(_) => Ok(PropType::scalar(ScalarType::Date, false)), + Literal::DateTime(_) => Ok(PropType::scalar(ScalarType::DateTime, false)), + Literal::List(items) => { + if items.is_empty() { + return Ok(PropType::list_of(ScalarType::String, false)); + } + let first = literal_type(&items[0])?; + if first.list { + return Err(NanoError::Type( + "nested list literals are not supported".to_string(), + )); + } + for item in items.iter().skip(1) { + let item_type = literal_type(item)?; + if item_type.list || !types_compatible(&first, &item_type) { + return Err(NanoError::Type( + "list literal elements must share a compatible scalar type".to_string(), + )); + } + } + Ok(PropType::list_of(first.scalar, false)) + } + } +} + +fn check_literal_type(lit: &Literal, expected: &PropType, prop_name: &str) -> Result<()> { + if !expected.list + && 
let ScalarType::Vector(expected_dim) = expected.scalar + && let Some(actual_dim) = numeric_vector_literal_dim(lit) + { + if actual_dim == expected_dim { + return Ok(()); + } + return Err(NanoError::Type(format!( + "T3: property `{}` has type Vector({}) but got vector literal with {} elements", + prop_name, expected_dim, actual_dim + ))); + } + + let lit_type = literal_type(lit)?; + if !types_compatible(&lit_type, expected) { + return Err(NanoError::Type(format!( + "T3: property `{}` has type {} but got {}", + prop_name, + expected.display_name(), + lit_type.display_name() + ))); + } + if expected.is_enum() { + let allowed = expected.enum_values.as_ref().cloned().unwrap_or_default(); + match lit { + Literal::String(v) => { + if !allowed.contains(v) { + return Err(NanoError::Type(format!( + "T3: property `{}` expects one of [{}], got '{}'", + prop_name, + allowed.join(", "), + v + ))); + } + } + Literal::List(items) if expected.list => { + for item in items { + match item { + Literal::String(v) if allowed.contains(v) => {} + Literal::String(v) => { + return Err(NanoError::Type(format!( + "T3: property `{}` expects one of [{}], got '{}'", + prop_name, + allowed.join(", "), + v + ))); + } + _ => {} + } + } + } + _ => {} + } + } + Ok(()) +} + +fn types_compatible(a: &PropType, b: &PropType) -> bool { + if a.list != b.list { + return false; + } + if a.scalar == b.scalar { + return true; + } + // Numeric types are mutually compatible for comparison + if a.scalar.is_numeric() && b.scalar.is_numeric() { + return true; + } + false +} + +fn numeric_vector_literal_dim(lit: &Literal) -> Option { + let items = match lit { + Literal::List(items) => items, + _ => return None, + }; + if items.is_empty() { + return None; + } + if items + .iter() + .all(|v| matches!(v, Literal::Integer(_) | Literal::Float(_))) + { + Some(items.len() as u32) + } else { + None + } +} + +fn expr_references_any(expr: &Expr, vars: &[String]) -> bool { + match expr { + Expr::PropAccess { variable, .. 
} => vars.contains(variable), + Expr::Nearest { + variable, query, .. + } => vars.contains(variable) || expr_references_any(query, vars), + Expr::Search { field, query } => { + expr_references_any(field, vars) || expr_references_any(query, vars) + } + Expr::Fuzzy { + field, + query, + max_edits, + } => { + expr_references_any(field, vars) + || expr_references_any(query, vars) + || max_edits + .as_deref() + .is_some_and(|m| expr_references_any(m, vars)) + } + Expr::MatchText { field, query } => { + expr_references_any(field, vars) || expr_references_any(query, vars) + } + Expr::Bm25 { field, query } => { + expr_references_any(field, vars) || expr_references_any(query, vars) + } + Expr::Rrf { + primary, + secondary, + k, + } => { + expr_references_any(primary, vars) + || expr_references_any(secondary, vars) + || k.as_deref() + .is_some_and(|expr| expr_references_any(expr, vars)) + } + Expr::Variable(v) => vars.contains(v), + Expr::Aggregate { arg, .. } => expr_references_any(arg, vars), + _ => false, + } +} + +fn expr_contains_standalone_nearest_with_aliases( + expr: &Expr, + alias_exprs: &HashMap, +) -> bool { + expr_contains_standalone_nearest_inner(expr, alias_exprs, &mut HashSet::new()) +} + +fn expr_contains_standalone_nearest_inner( + expr: &Expr, + alias_exprs: &HashMap, + seen_aliases: &mut HashSet, +) -> bool { + match expr { + Expr::Nearest { .. } => true, + Expr::Aggregate { arg, .. 
} => { + expr_contains_standalone_nearest_inner(arg, alias_exprs, seen_aliases) + } + Expr::Search { field, query } + | Expr::MatchText { field, query } + | Expr::Bm25 { field, query } => { + expr_contains_standalone_nearest_inner(field, alias_exprs, seen_aliases) + || expr_contains_standalone_nearest_inner(query, alias_exprs, seen_aliases) + } + Expr::Fuzzy { + field, + query, + max_edits, + } => { + expr_contains_standalone_nearest_inner(field, alias_exprs, seen_aliases) + || expr_contains_standalone_nearest_inner(query, alias_exprs, seen_aliases) + || max_edits.as_deref().is_some_and(|expr| { + expr_contains_standalone_nearest_inner(expr, alias_exprs, seen_aliases) + }) + } + Expr::AliasRef(name) => { + if !seen_aliases.insert(name.clone()) { + return false; + } + let found = alias_exprs.get(name).is_some_and(|expr| { + expr_contains_standalone_nearest_inner(expr, alias_exprs, seen_aliases) + }); + seen_aliases.remove(name); + found + } + // nearest() nested under rrf() is handled by T21 and should not trigger T17/T18 checks. + Expr::Rrf { .. } => false, + _ => false, + } +} + +fn expr_contains_rrf_with_aliases(expr: &Expr, alias_exprs: &HashMap) -> bool { + expr_contains_rrf_inner(expr, alias_exprs, &mut HashSet::new()) +} + +fn expr_contains_rrf_inner( + expr: &Expr, + alias_exprs: &HashMap, + seen_aliases: &mut HashSet, +) -> bool { + match expr { + Expr::Rrf { .. } => true, + Expr::Aggregate { arg, .. 
} => expr_contains_rrf_inner(arg, alias_exprs, seen_aliases), + Expr::Search { field, query } + | Expr::MatchText { field, query } + | Expr::Bm25 { field, query } => { + expr_contains_rrf_inner(field, alias_exprs, seen_aliases) + || expr_contains_rrf_inner(query, alias_exprs, seen_aliases) + } + Expr::Fuzzy { + field, + query, + max_edits, + } => { + expr_contains_rrf_inner(field, alias_exprs, seen_aliases) + || expr_contains_rrf_inner(query, alias_exprs, seen_aliases) + || max_edits + .as_deref() + .is_some_and(|expr| expr_contains_rrf_inner(expr, alias_exprs, seen_aliases)) + } + Expr::AliasRef(name) => { + if !seen_aliases.insert(name.clone()) { + return false; + } + let found = alias_exprs + .get(name) + .is_some_and(|expr| expr_contains_rrf_inner(expr, alias_exprs, seen_aliases)); + seen_aliases.remove(name); + found + } + _ => false, + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::catalog::build_catalog; + use crate::query::parser::parse_query; + use crate::schema::parser::parse_schema; + + fn setup() -> Catalog { + let schema = parse_schema( + r#" +node Person { + name: String + age: I32? +} +node Company { + name: String +} +edge Knows: Person -> Person { + since: Date? +} +edge WorksAt: Person -> Company { + title: String? +} +"#, + ) + .unwrap(); + build_catalog(&schema).unwrap() + } + + fn setup_vector() -> Catalog { + let schema = parse_schema( + r#" +node Doc { + id_str: String + embedding: Vector(3) +} +"#, + ) + .unwrap(); + build_catalog(&schema).unwrap() + } + + fn setup_list() -> Catalog { + let schema = parse_schema( + r#" +node Person { + name: String + tags: [String]? +} +"#, + ) + .unwrap(); + build_catalog(&schema).unwrap() + } + + fn setup_embed_vector() -> Catalog { + let schema = parse_schema( + r#" +node Doc { + slug: String + body: String? 
+ embedding: Vector(3) @embed(body) +} +"#, + ) + .unwrap(); + build_catalog(&schema).unwrap() + } + + #[test] + fn test_basic_binding() { + let catalog = setup(); + let qf = parse_query( + r#" +query q() { + match { $p: Person } + return { $p.name } +} +"#, + ) + .unwrap(); + let ctx = typecheck_query(&catalog, &qf.queries[0]).unwrap(); + assert!(ctx.bindings.contains_key("p")); + } + + #[test] + fn test_t1_unknown_type() { + let catalog = setup(); + let qf = parse_query( + r#" +query q() { + match { $p: Foo } + return { $p.name } +} +"#, + ) + .unwrap(); + let err = typecheck_query(&catalog, &qf.queries[0]).unwrap_err(); + assert!(err.to_string().contains("T1")); + } + + #[test] + fn test_t2_unknown_property_match() { + let catalog = setup(); + let qf = parse_query( + r#" +query q() { + match { $p: Person { salary: 100 } } + return { $p.name } +} +"#, + ) + .unwrap(); + let err = typecheck_query(&catalog, &qf.queries[0]).unwrap_err(); + assert!(err.to_string().contains("T2")); + } + + #[test] + fn test_t3_wrong_type_in_match() { + let catalog = setup(); + let qf = parse_query( + r#" +query q() { + match { $p: Person { age: "old" } } + return { $p.name } +} +"#, + ) + .unwrap(); + let err = typecheck_query(&catalog, &qf.queries[0]).unwrap_err(); + assert!(err.to_string().contains("T3")); + } + + #[test] + fn test_list_membership_match_accepts_scalar_literal() { + let catalog = setup_list(); + let qf = parse_query( + r#" +query q() { + match { $p: Person { tags: "rust" } } + return { $p.name } +} +"#, + ) + .unwrap(); + let ctx = typecheck_query(&catalog, &qf.queries[0]).unwrap(); + assert!(ctx.bindings.contains_key("p")); + } + + #[test] + fn test_list_membership_match_accepts_scalar_param() { + let catalog = setup_list(); + let qf = parse_query( + r#" +query q($tag: String) { + match { $p: Person { tags: $tag } } + return { $p.name } +} +"#, + ) + .unwrap(); + let ctx = typecheck_query(&catalog, &qf.queries[0]).unwrap(); + assert!(ctx.bindings.contains_key("p")); 
+ } + + #[test] + fn test_list_equality_match_is_rejected() { + let catalog = setup_list(); + let qf = parse_query( + r#" +query q() { + match { $p: Person { tags: ["rust"] } } + return { $p.name } +} +"#, + ) + .unwrap(); + let err = typecheck_query(&catalog, &qf.queries[0]).unwrap_err(); + let msg = err.to_string(); + assert!(msg.contains("list equality is not supported")); + assert!(msg.contains("membership")); + } + + #[test] + fn test_contains_filter_accepts_list_membership() { + let catalog = setup_list(); + let qf = parse_query( + r#" +query q($tag: String) { + match { + $p: Person + $p.tags contains $tag + } + return { $p.name } +} +"#, + ) + .unwrap(); + let ctx = typecheck_query(&catalog, &qf.queries[0]).unwrap(); + assert!(ctx.bindings.contains_key("p")); + } + + #[test] + fn test_declared_list_params_typecheck() { + let catalog = setup_list(); + let qf = parse_query( + r#" +query q($tags: [String], $days: [Date]?) { + match { + $p: Person + $p.tags contains "friend" + } + return { $p.tags, $tags, $days } +} +"#, + ) + .unwrap(); + assert!(typecheck_query(&catalog, &qf.queries[0]).is_ok()); + } + + #[test] + fn test_contains_filter_requires_list_left_operand() { + let catalog = setup(); + let qf = parse_query( + r#" +query q() { + match { + $p: Person + $p.name contains "Al" + } + return { $p.name } +} +"#, + ) + .unwrap(); + let err = typecheck_query(&catalog, &qf.queries[0]).unwrap_err(); + assert!( + err.to_string() + .contains("contains requires a list property on the left") + ); + } + + #[test] + fn test_contains_filter_rejects_list_right_operand() { + let catalog = setup_list(); + let qf = parse_query( + r#" +query q() { + match { + $p: Person + $p.tags contains ["rust"] + } + return { $p.name } +} +"#, + ) + .unwrap(); + let err = typecheck_query(&catalog, &qf.queries[0]).unwrap_err(); + assert!( + err.to_string() + .contains("contains requires a scalar right operand") + ); + } + + #[test] + fn test_t4_unknown_edge() { + let catalog = setup(); + 
let qf = parse_query( + r#" +query q() { + match { + $p: Person + $p likes $f + } + return { $p.name } +} +"#, + ) + .unwrap(); + let err = typecheck_query(&catalog, &qf.queries[0]).unwrap_err(); + assert!(err.to_string().contains("T4")); + } + + #[test] + fn test_t5_bad_endpoints() { + let catalog = setup(); + let qf = parse_query( + r#" +query q() { + match { + $c: Company + $c knows $f + } + return { $c.name } +} +"#, + ) + .unwrap(); + let err = typecheck_query(&catalog, &qf.queries[0]).unwrap_err(); + assert!(err.to_string().contains("T5")); + } + + #[test] + fn test_t6_bad_property() { + let catalog = setup(); + let qf = parse_query( + r#" +query q() { + match { + $p: Person + $p.salary > 100 + } + return { $p.name } +} +"#, + ) + .unwrap(); + let err = typecheck_query(&catalog, &qf.queries[0]).unwrap_err(); + assert!(err.to_string().contains("T6")); + } + + #[test] + fn test_t7_bad_comparison() { + let catalog = setup(); + let qf = parse_query( + r#" +query q() { + match { + $p: Person + $p.age > "old" + } + return { $p.name } +} +"#, + ) + .unwrap(); + let err = typecheck_query(&catalog, &qf.queries[0]).unwrap_err(); + assert!(err.to_string().contains("T7")); + } + + #[test] + fn test_t7_rejects_non_scalar_comparison() { + let catalog = setup(); + let qf = parse_query( + r#" +query q() { + match { + $p: Person + $p != 5 + } + return { $p.name } +} +"#, + ) + .unwrap(); + let err = typecheck_query(&catalog, &qf.queries[0]).unwrap_err(); + assert!(err.to_string().contains("scalar operands")); + } + + #[test] + fn test_nearest_requires_limit() { + let catalog = setup_vector(); + let qf = parse_query( + r#" +query q($q: Vector(3)) { + match { $d: Doc } + return { $d.id_str } + order { nearest($d.embedding, $q) } +} +"#, + ) + .unwrap(); + let err = typecheck_query(&catalog, &qf.queries[0]).unwrap_err(); + assert!(err.to_string().contains("T17")); + } + + #[test] + fn test_nearest_vector_dim_mismatch() { + let catalog = setup_vector(); + let qf = parse_query( + 
r#" +query q($q: Vector(2)) { + match { $d: Doc } + return { $d.id_str } + order { nearest($d.embedding, $q) } + limit 3 +} +"#, + ) + .unwrap(); + let err = typecheck_query(&catalog, &qf.queries[0]).unwrap_err(); + assert!(err.to_string().contains("T15")); + } + + #[test] + fn test_nearest_vector_param_ok() { + let catalog = setup_vector(); + let qf = parse_query( + r#" +query q($q: Vector(3)) { + match { $d: Doc } + return { $d.id_str } + order { nearest($d.embedding, $q) } + limit 3 +} +"#, + ) + .unwrap(); + let ctx = typecheck_query(&catalog, &qf.queries[0]).unwrap(); + assert!(ctx.bindings.contains_key("d")); + } + + #[test] + fn test_nearest_string_param_ok() { + let catalog = setup_vector(); + let qf = parse_query( + r#" +query q($q: String) { + match { $d: Doc } + return { $d.id_str } + order { nearest($d.embedding, $q) } + limit 3 +} +"#, + ) + .unwrap(); + let ctx = typecheck_query(&catalog, &qf.queries[0]).unwrap(); + assert!(ctx.bindings.contains_key("d")); + } + + #[test] + fn test_search_string_param_ok() { + let catalog = setup(); + let qf = parse_query( + r#" +query q($q: String) { + match { + $p: Person + search($p.name, $q) + } + return { $p.name } +} +"#, + ) + .unwrap(); + let ctx = typecheck_query(&catalog, &qf.queries[0]).unwrap(); + assert!(ctx.bindings.contains_key("p")); + } + + #[test] + fn test_fuzzy_max_edits_param_ok() { + let catalog = setup(); + let qf = parse_query( + r#" +query q($q: String, $m: I64) { + match { + $p: Person + fuzzy($p.name, $q, $m) + } + return { $p.name } +} +"#, + ) + .unwrap(); + let ctx = typecheck_query(&catalog, &qf.queries[0]).unwrap(); + assert!(ctx.bindings.contains_key("p")); + } + + #[test] + fn test_fuzzy_rejects_non_integer_max_edits() { + let catalog = setup(); + let qf = parse_query( + r#" +query q($q: String, $m: F64) { + match { + $p: Person + fuzzy($p.name, $q, $m) + } + return { $p.name } +} +"#, + ) + .unwrap(); + let err = typecheck_query(&catalog, &qf.queries[0]).unwrap_err(); + 
assert!(err.to_string().contains("T19")); + } + + #[test] + fn test_match_text_string_param_ok() { + let catalog = setup(); + let qf = parse_query( + r#" +query q($q: String) { + match { + $p: Person + match_text($p.name, $q) + } + return { $p.name } +} +"#, + ) + .unwrap(); + let ctx = typecheck_query(&catalog, &qf.queries[0]).unwrap(); + assert!(ctx.bindings.contains_key("p")); + } + + #[test] + fn test_bm25_string_param_ok() { + let catalog = setup(); + let qf = parse_query( + r#" +query q($q: String) { + match { $p: Person } + return { $p.name, bm25($p.name, $q) as score } + order { bm25($p.name, $q) desc } +} +"#, + ) + .unwrap(); + let ctx = typecheck_query(&catalog, &qf.queries[0]).unwrap(); + assert!(ctx.bindings.contains_key("p")); + } + + #[test] + fn test_bm25_rejects_non_string_query() { + let catalog = setup(); + let qf = parse_query( + r#" +query q($q: I64) { + match { $p: Person } + return { bm25($p.name, $q) as score } +} +"#, + ) + .unwrap(); + let err = typecheck_query(&catalog, &qf.queries[0]).unwrap_err(); + assert!(err.to_string().contains("T20")); + } + + #[test] + fn test_rrf_requires_limit_in_order() { + let catalog = setup_vector(); + let qf = parse_query( + r#" +query q($vq: Vector(3), $tq: String) { + match { $d: Doc } + return { $d.id_str } + order { rrf(nearest($d.embedding, $vq), bm25($d.id_str, $tq), 60) desc } +} +"#, + ) + .unwrap(); + let err = typecheck_query(&catalog, &qf.queries[0]).unwrap_err(); + assert!(err.to_string().contains("T21")); + } + + #[test] + fn test_rrf_ordering_ok_with_limit() { + let catalog = setup_vector(); + let qf = parse_query( + r#" +query q($vq: Vector(3), $tq: String) { + match { $d: Doc } + return { $d.id_str } + order { rrf(nearest($d.embedding, $vq), bm25($d.id_str, $tq), 60) desc } + limit 5 +} +"#, + ) + .unwrap(); + let ctx = typecheck_query(&catalog, &qf.queries[0]).unwrap(); + assert!(ctx.bindings.contains_key("d")); + } + + #[test] + fn test_rrf_ordering_ok_with_string_nearest_limit() { + let 
catalog = setup_vector(); + let qf = parse_query( + r#" +query q($vq: String, $tq: String) { + match { $d: Doc } + return { $d.id_str } + order { rrf(nearest($d.embedding, $vq), bm25($d.id_str, $tq), 60) desc } + limit 5 +} +"#, + ) + .unwrap(); + let ctx = typecheck_query(&catalog, &qf.queries[0]).unwrap(); + assert!(ctx.bindings.contains_key("d")); + } + + #[test] + fn test_rrf_with_nearest_allows_alias_ordering() { + let catalog = setup_vector(); + let qf = parse_query( + r#" +query q($vq: Vector(3), $tq: String) { + match { $d: Doc } + return { + $d.id_str, + rrf(nearest($d.embedding, $vq), bm25($d.id_str, $tq), 60) as score + } + order { + rrf(nearest($d.embedding, $vq), bm25($d.id_str, $tq), 60) desc, + score desc + } + limit 5 +} +"#, + ) + .unwrap(); + let ctx = typecheck_query(&catalog, &qf.queries[0]).unwrap(); + assert!(ctx.bindings.contains_key("d")); + } + + #[test] + fn test_rrf_alias_ordering_requires_limit() { + let catalog = setup_vector(); + let qf = parse_query( + r#" +query q($vq: Vector(3), $tq: String) { + match { $d: Doc } + return { + $d.id_str, + rrf(nearest($d.embedding, $vq), bm25($d.id_str, $tq), 60) as score + } + order { score desc } +} +"#, + ) + .unwrap(); + let err = typecheck_query(&catalog, &qf.queries[0]).unwrap_err(); + assert!(err.to_string().contains("T21")); + } + + #[test] + fn test_rrf_alias_ordering_with_limit_is_valid() { + let catalog = setup_vector(); + let qf = parse_query( + r#" +query q($vq: Vector(3), $tq: String) { + match { $d: Doc } + return { + $d.id_str, + rrf(nearest($d.embedding, $vq), bm25($d.id_str, $tq), 60) as score + } + order { score desc } + limit 5 +} +"#, + ) + .unwrap(); + let ctx = typecheck_query(&catalog, &qf.queries[0]).unwrap(); + assert!(ctx.bindings.contains_key("d")); + } + + #[test] + fn test_standalone_nearest_with_alias_ordering_still_rejected() { + let catalog = setup_vector(); + let qf = parse_query( + r#" +query q($vq: Vector(3)) { + match { $d: Doc } + return { + $d.id_str as score + 
} + order { + nearest($d.embedding, $vq), + score desc + } + limit 5 +} +"#, + ) + .unwrap(); + let err = typecheck_query(&catalog, &qf.queries[0]).unwrap_err(); + assert!(err.to_string().contains("T18")); + } + + #[test] + fn test_rrf_rejects_non_rank_expression_argument() { + let parse = parse_query( + r#" +query q($q: String) { + match { $d: Doc } + return { $d.id_str } + order { rrf(bm25($d.id_str, $q), search($d.id_str, $q), 60) desc } + limit 5 +} +"#, + ); + assert!(parse.is_err()); + } + + #[test] + fn test_rrf_rejects_non_positive_k_literal() { + let catalog = setup_vector(); + let qf = parse_query( + r#" +query q($vq: Vector(3), $tq: String) { + match { $d: Doc } + return { $d.id_str } + order { rrf(nearest($d.embedding, $vq), bm25($d.id_str, $tq), 0) desc } + limit 5 +} +"#, + ) + .unwrap(); + let err = typecheck_query(&catalog, &qf.queries[0]).unwrap_err(); + assert!(err.to_string().contains("T21")); + } + + #[test] + fn test_t8_sum_on_string() { + let catalog = setup(); + let qf = parse_query( + r#" +query q() { + match { $p: Person } + return { sum($p.name) as s } +} +"#, + ) + .unwrap(); + let err = typecheck_query(&catalog, &qf.queries[0]).unwrap_err(); + assert!(err.to_string().contains("T8")); + } + + #[test] + fn test_traversal_direction_out() { + let catalog = setup(); + let qf = parse_query( + r#" +query q() { + match { + $p: Person { name: "Alice" } + $p knows $f + } + return { $f.name } +} +"#, + ) + .unwrap(); + let ctx = typecheck_query(&catalog, &qf.queries[0]).unwrap(); + assert_eq!(ctx.traversals[0].direction, Direction::Out); + assert_eq!(ctx.bindings["f"].type_name, "Person"); + } + + #[test] + fn test_traversal_direction_in() { + let catalog = setup(); + let qf = parse_query( + r#" +query q() { + match { + $c: Company { name: "Acme" } + $p worksAt $c + } + return { $p.name } +} +"#, + ) + .unwrap(); + let ctx = typecheck_query(&catalog, &qf.queries[0]).unwrap(); + // $c is Company (to_type), $p is src — direction should be Out + // 
because $p (Person=from_type) worksAt $c (Company=to_type) is forward + assert_eq!(ctx.traversals[0].direction, Direction::Out); + } + + #[test] + fn test_bounded_traversal_typecheck() { + let catalog = setup(); + let qf = parse_query( + r#" +query q() { + match { + $p: Person + $p knows{1,3} $f + } + return { $f.name } +} +"#, + ) + .unwrap(); + let ctx = typecheck_query(&catalog, &qf.queries[0]).unwrap(); + assert_eq!(ctx.traversals[0].min_hops, 1); + assert_eq!(ctx.traversals[0].max_hops, Some(3)); + } + + #[test] + fn test_bounded_traversal_invalid_bounds() { + let catalog = setup(); + let qf = parse_query( + r#" +query q() { + match { + $p: Person + $p knows{3,1} $f + } + return { $f.name } +} +"#, + ) + .unwrap(); + let err = typecheck_query(&catalog, &qf.queries[0]).unwrap_err(); + assert!(err.to_string().contains("T15")); + } + + #[test] + fn test_unbounded_traversal_is_disabled() { + let catalog = setup(); + let qf = parse_query( + r#" +query q() { + match { + $p: Person + $p knows{1,} $f + } + return { $f.name } +} +"#, + ) + .unwrap(); + let err = typecheck_query(&catalog, &qf.queries[0]).unwrap_err(); + assert!(err.to_string().contains("unbounded traversal is disabled")); + } + + #[test] + fn test_negation_typecheck() { + let catalog = setup(); + let qf = parse_query( + r#" +query q() { + match { + $p: Person + not { $p worksAt $_ } + } + return { $p.name } +} +"#, + ) + .unwrap(); + let ctx = typecheck_query(&catalog, &qf.queries[0]).unwrap(); + assert!(ctx.bindings.contains_key("p")); + } + + #[test] + fn test_aggregation_typecheck() { + let catalog = setup(); + let qf = parse_query( + r#" +query q() { + match { + $p: Person + $p knows $f + } + return { + $p.name + count($f) as friends + } +} +"#, + ) + .unwrap(); + typecheck_query(&catalog, &qf.queries[0]).unwrap(); + } + + #[test] + fn test_valid_two_hop() { + let catalog = setup(); + let qf = parse_query( + r#" +query q($name: String) { + match { + $p: Person { name: $name } + $p knows $mid + $mid 
knows $fof + } + return { $fof.name } +} +"#, + ) + .unwrap(); + let ctx = typecheck_query(&catalog, &qf.queries[0]).unwrap(); + assert!(ctx.bindings.contains_key("mid")); + assert!(ctx.bindings.contains_key("fof")); + } + + #[test] + fn test_mutation_insert_typecheck_ok() { + let catalog = setup(); + let qf = parse_query( + r#" +query add_person($name: String, $age: I32) { + insert Person { + name: $name + age: $age + } +} +"#, + ) + .unwrap(); + let checked = typecheck_query_decl(&catalog, &qf.queries[0]).unwrap(); + match checked { + CheckedQuery::Mutation(ctx) => assert_eq!(ctx.target_type, "Person"), + _ => panic!("expected mutation typecheck result"), + } + } + + #[test] + fn test_mutation_insert_missing_required_property() { + let catalog = setup(); + let qf = parse_query( + r#" +query add_person($age: I32) { + insert Person { age: $age } +} +"#, + ) + .unwrap(); + let err = typecheck_query_decl(&catalog, &qf.queries[0]).unwrap_err(); + assert!(err.to_string().contains("T12")); + } + + #[test] + fn test_mutation_insert_allows_embed_target_omission_when_source_present() { + let catalog = setup_embed_vector(); + let qf = parse_query( + r#" +query add_doc($slug: String, $body: String) { + insert Doc { + slug: $slug + body: $body + } +} +"#, + ) + .unwrap(); + let checked = typecheck_query_decl(&catalog, &qf.queries[0]).unwrap(); + match checked { + CheckedQuery::Mutation(ctx) => assert_eq!(ctx.target_type, "Doc"), + _ => panic!("expected mutation typecheck result"), + } + } + + #[test] + fn test_mutation_insert_requires_embed_source_when_target_omitted() { + let catalog = setup_embed_vector(); + let qf = parse_query( + r#" +query add_doc($slug: String) { + insert Doc { + slug: $slug + } +} +"#, + ) + .unwrap(); + let err = typecheck_query_decl(&catalog, &qf.queries[0]).unwrap_err(); + let msg = err.to_string(); + assert!(msg.contains("T12")); + assert!(msg.contains("embedding")); + assert!(msg.contains("body")); + } + + #[test] + fn 
test_mutation_update_bad_property() { + let catalog = setup(); + let qf = parse_query( + r#" +query update_person($name: String) { + update Person set { salary: 100 } where name = $name +} +"#, + ) + .unwrap(); + let err = typecheck_query_decl(&catalog, &qf.queries[0]).unwrap_err(); + assert!(err.to_string().contains("T11")); + } + + #[test] + fn test_mutation_delete_bad_type() { + let catalog = setup(); + let qf = parse_query( + r#" +query del($name: String) { + delete Unknown where name = $name +} +"#, + ) + .unwrap(); + let err = typecheck_query_decl(&catalog, &qf.queries[0]).unwrap_err(); + assert!(err.to_string().contains("T10")); + } + + #[test] + fn test_mutation_insert_edge_typecheck_ok() { + let catalog = setup(); + let qf = parse_query( + r#" +query add_knows($from: String, $to: String) { + insert Knows { + from: $from + to: $to + } +} +"#, + ) + .unwrap(); + let checked = typecheck_query_decl(&catalog, &qf.queries[0]).unwrap(); + match checked { + CheckedQuery::Mutation(ctx) => assert_eq!(ctx.target_type, "Knows"), + _ => panic!("expected mutation typecheck result"), + } + } + + #[test] + fn test_mutation_insert_edge_requires_from_and_to() { + let catalog = setup(); + let qf = parse_query( + r#" +query add_knows($from: String) { + insert Knows { + from: $from + } +} +"#, + ) + .unwrap(); + let err = typecheck_query_decl(&catalog, &qf.queries[0]).unwrap_err(); + assert!(err.to_string().contains("T12")); + } + + #[test] + fn test_mutation_delete_edge_typecheck_ok() { + let catalog = setup(); + let qf = parse_query( + r#" +query del_knows($from: String) { + delete Knows where from = $from +} +"#, + ) + .unwrap(); + let checked = typecheck_query_decl(&catalog, &qf.queries[0]).unwrap(); + match checked { + CheckedQuery::Mutation(ctx) => assert_eq!(ctx.target_type, "Knows"), + _ => panic!("expected mutation typecheck result"), + } + } + + #[test] + fn test_mutation_update_edge_not_supported() { + let catalog = setup(); + let qf = parse_query( + r#" +query 
upd_knows($from: String) { + update Knows set { since: 2000 } where from = $from +} +"#, + ) + .unwrap(); + let err = typecheck_query_decl(&catalog, &qf.queries[0]).unwrap_err(); + assert!(err.to_string().contains("T16")); + } + + #[test] + fn test_now_expression_typechecks_as_datetime() { + let schema = parse_schema( + r#" +node Event { + slug: String @key + at: DateTime +} +"#, + ) + .unwrap(); + let catalog = build_catalog(&schema).unwrap(); + let qf = parse_query( + r#" +query due() { + match { + $e: Event + $e.at <= now() + } + return { now() as ts } +} +"#, + ) + .unwrap(); + + let checked = typecheck_query_decl(&catalog, &qf.queries[0]).unwrap(); + assert!(matches!(checked, CheckedQuery::Read(_))); + } + + #[test] + fn test_now_is_rejected_for_non_datetime_mutation_property() { + let schema = parse_schema( + r#" +node Event { + slug: String @key + on: Date +} +"#, + ) + .unwrap(); + let catalog = build_catalog(&schema).unwrap(); + let qf = parse_query( + r#" +query stamp() { + update Event set { on: now() } where slug = "launch" +} +"#, + ) + .unwrap(); + + let err = typecheck_query_decl(&catalog, &qf.queries[0]).unwrap_err(); + assert!(err.to_string().contains("DateTime")); + assert!(err.to_string().contains("property `on`")); + } +} diff --git a/crates/omnigraph-compiler/src/query_input.rs b/crates/omnigraph-compiler/src/query_input.rs new file mode 100644 index 0000000..e2bab52 --- /dev/null +++ b/crates/omnigraph-compiler/src/query_input.rs @@ -0,0 +1,892 @@ +use std::error::Error; +use std::fmt; + +use serde_json::Value; + +use crate::error::NanoError; +use crate::ir::ParamMap; +use crate::json_output::{JS_MAX_SAFE_INTEGER_U64, is_js_safe_integer_i64}; +use crate::query::ast::{Literal, Param, QueryDecl}; +use crate::query::parser::parse_query; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum JsonParamMode { + Standard, + JavaScript, +} + +#[derive(Debug)] +pub enum RunInputError { + Core(NanoError), + Message(String), +} + +impl RunInputError { 
+ fn message(message: impl Into) -> Self { + Self::Message(message.into()) + } +} + +impl fmt::Display for RunInputError { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::Core(err) => err.fmt(f), + Self::Message(message) => f.write_str(message), + } + } +} + +impl Error for RunInputError { + fn source(&self) -> Option<&(dyn Error + 'static)> { + match self { + Self::Core(err) => Some(err), + Self::Message(_) => None, + } + } +} + +impl From for RunInputError { + fn from(value: NanoError) -> Self { + Self::Core(value) + } +} + +pub type RunInputResult = std::result::Result; + +pub trait ToParam { + fn to_param(self) -> crate::error::Result; +} + +impl ToParam for Literal { + fn to_param(self) -> crate::error::Result { + Ok(self) + } +} + +impl ToParam for &Literal { + fn to_param(self) -> crate::error::Result { + Ok(self.clone()) + } +} + +impl ToParam for String { + fn to_param(self) -> crate::error::Result { + Ok(Literal::String(self)) + } +} + +impl ToParam for &String { + fn to_param(self) -> crate::error::Result { + Ok(Literal::String(self.clone())) + } +} + +impl ToParam for &str { + fn to_param(self) -> crate::error::Result { + Ok(Literal::String(self.to_string())) + } +} + +impl ToParam for bool { + fn to_param(self) -> crate::error::Result { + Ok(Literal::Bool(self)) + } +} + +impl ToParam for i8 { + fn to_param(self) -> crate::error::Result { + Ok(Literal::Integer(i64::from(self))) + } +} + +impl ToParam for i16 { + fn to_param(self) -> crate::error::Result { + Ok(Literal::Integer(i64::from(self))) + } +} + +impl ToParam for i32 { + fn to_param(self) -> crate::error::Result { + Ok(Literal::Integer(i64::from(self))) + } +} + +impl ToParam for i64 { + fn to_param(self) -> crate::error::Result { + Ok(Literal::Integer(self)) + } +} + +impl ToParam for isize { + fn to_param(self) -> crate::error::Result { + let value = i64::try_from(self).map_err(|_| { + NanoError::Execution(format!( + "param value {} exceeds current engine 
range for numeric literals (max {})", + self, + i64::MAX + )) + })?; + Ok(Literal::Integer(value)) + } +} + +impl ToParam for u8 { + fn to_param(self) -> crate::error::Result { + Ok(Literal::Integer(i64::from(self))) + } +} + +impl ToParam for u16 { + fn to_param(self) -> crate::error::Result { + Ok(Literal::Integer(i64::from(self))) + } +} + +impl ToParam for u32 { + fn to_param(self) -> crate::error::Result { + Ok(Literal::Integer(i64::from(self))) + } +} + +impl ToParam for u64 { + fn to_param(self) -> crate::error::Result { + let value = i64::try_from(self).map_err(|_| { + NanoError::Execution(format!( + "param value {} exceeds current engine range for numeric literals (max {})", + self, + i64::MAX + )) + })?; + Ok(Literal::Integer(value)) + } +} + +impl ToParam for usize { + fn to_param(self) -> crate::error::Result { + let value = i64::try_from(self).map_err(|_| { + NanoError::Execution(format!( + "param value {} exceeds current engine range for numeric literals (max {})", + self, + i64::MAX + )) + })?; + Ok(Literal::Integer(value)) + } +} + +impl ToParam for f32 { + fn to_param(self) -> crate::error::Result { + if !self.is_finite() { + return Err(NanoError::Execution(format!( + "invalid float parameter {}", + self + ))); + } + Ok(Literal::Float(f64::from(self))) + } +} + +impl ToParam for f64 { + fn to_param(self) -> crate::error::Result { + if !self.is_finite() { + return Err(NanoError::Execution(format!( + "invalid float parameter {}", + self + ))); + } + Ok(Literal::Float(self)) + } +} + +impl ToParam for Vec +where + T: ToParam, +{ + fn to_param(self) -> crate::error::Result { + let mut out = Vec::with_capacity(self.len()); + for value in self { + out.push(value.to_param()?); + } + Ok(Literal::List(out)) + } +} + +impl ToParam for &[T] +where + T: Clone + ToParam, +{ + fn to_param(self) -> crate::error::Result { + let mut out = Vec::with_capacity(self.len()); + for value in self { + out.push(value.clone().to_param()?); + } + Ok(Literal::List(out)) + } +} 
+ +impl ToParam for [T; N] +where + T: ToParam, +{ + fn to_param(self) -> crate::error::Result { + let mut out = Vec::with_capacity(N); + for value in self { + out.push(value.to_param()?); + } + Ok(Literal::List(out)) + } +} + +#[macro_export] +macro_rules! params { + () => { + ::std::result::Result::Ok($crate::ParamMap::new()) + }; + ($($key:expr => $value:expr),+ $(,)?) => {{ + (|| -> $crate::error::Result<$crate::ParamMap> { + let mut map = $crate::ParamMap::new(); + $( + map.insert(::std::convert::Into::::into($key), $crate::ToParam::to_param($value)?); + )+ + Ok(map) + })() + }}; +} + +pub fn find_named_query(query_source: &str, query_name: &str) -> RunInputResult { + let queries = parse_query(query_source)?; + queries + .queries + .into_iter() + .find(|query| query.name == query_name) + .ok_or_else(|| RunInputError::message(format!("query '{}' not found", query_name))) +} + +pub fn json_params_to_param_map( + params: Option<&Value>, + query_params: &[Param], + mode: JsonParamMode, +) -> RunInputResult { + let mut map = ParamMap::new(); + let object = match params { + Some(Value::Object(object)) => object, + Some(Value::Null) | None => return Ok(map), + Some(other) => { + let message = match mode { + JsonParamMode::Standard => "params must be a JSON object".to_string(), + JsonParamMode::JavaScript => { + format!("params must be an object, got {}", json_type_name(other)) + } + }; + return Err(RunInputError::message(message)); + } + }; + + for (key, value) in object { + let decl = query_params.iter().find(|param| param.name == *key); + let literal = if let Some(decl) = decl { + json_value_to_literal_typed(key, value, &decl.type_name, mode)? + } else { + json_value_to_literal_inferred(key, value, mode)? 
+ }; + map.insert(key.clone(), literal); + } + + Ok(map) +} + +fn json_value_to_literal_typed( + key: &str, + value: &Value, + type_name: &str, + mode: JsonParamMode, +) -> RunInputResult { + match type_name { + "String" => match value { + Value::String(value) => Ok(Literal::String(value.clone())), + other => Err(RunInputError::message(format!( + "param '{}': expected string, got {}", + key, + json_type_name(other) + ))), + }, + "I32" => match mode { + JsonParamMode::Standard => { + let value = parse_i64_param(key, value, mode)?; + let value = i32::try_from(value).map_err(|_| { + RunInputError::message(format!("param '{}': value {} exceeds I32", key, value)) + })?; + Ok(Literal::Integer(i64::from(value))) + } + JsonParamMode::JavaScript => { + let value = parse_i64_param(key, value, mode)?; + let value = i32::try_from(value).map_err(|_| { + RunInputError::message(format!( + "param '{}': value {} exceeds I32 range", + key, value + )) + })?; + Ok(Literal::Integer(i64::from(value))) + } + }, + "I64" => Ok(Literal::Integer(parse_i64_param(key, value, mode)?)), + "U32" => { + let value = parse_u64_param(key, value, mode)?; + let value = match mode { + JsonParamMode::Standard => u32::try_from(value).map_err(|_| { + RunInputError::message(format!("param '{}': value {} exceeds U32", key, value)) + })?, + JsonParamMode::JavaScript => u32::try_from(value).map_err(|_| { + RunInputError::message(format!( + "param '{}': value {} exceeds U32 range", + key, value + )) + })?, + }; + Ok(Literal::Integer(i64::from(value))) + } + "U64" => { + let value = parse_u64_param(key, value, mode)?; + let value = match mode { + JsonParamMode::Standard => i64::try_from(value).map_err(|_| { + RunInputError::message(format!( + "param '{}': value {} exceeds current engine range for U64 (max {})", + key, + value, + i64::MAX + )) + })?, + JsonParamMode::JavaScript => i64::try_from(value).map_err(|_| { + RunInputError::message(format!( + "param '{}': value {} exceeds current engine range for U64 
parameters (max {})", + key, + value, + i64::MAX + )) + })?, + }; + Ok(Literal::Integer(value)) + } + "F32" | "F64" => { + let value = value.as_f64().ok_or_else(|| match mode { + JsonParamMode::Standard => { + RunInputError::message(format!("param '{}': expected float", key)) + } + JsonParamMode::JavaScript => RunInputError::message(format!( + "param '{}': expected float, got {}", + key, + json_type_name(value) + )), + })?; + Ok(Literal::Float(value)) + } + "Bool" => { + let value = value.as_bool().ok_or_else(|| match mode { + JsonParamMode::Standard => { + RunInputError::message(format!("param '{}': expected boolean", key)) + } + JsonParamMode::JavaScript => RunInputError::message(format!( + "param '{}': expected boolean, got {}", + key, + json_type_name(value) + )), + })?; + Ok(Literal::Bool(value)) + } + "Date" => match value { + Value::String(value) => Ok(Literal::Date(value.clone())), + other => Err(match mode { + JsonParamMode::Standard => { + RunInputError::message(format!("param '{}': expected date string", key)) + } + JsonParamMode::JavaScript => RunInputError::message(format!( + "param '{}': expected date string, got {}", + key, + json_type_name(other) + )), + }), + }, + "DateTime" => match value { + Value::String(value) => Ok(Literal::DateTime(value.clone())), + other => Err(match mode { + JsonParamMode::Standard => { + RunInputError::message(format!("param '{}': expected datetime string", key)) + } + JsonParamMode::JavaScript => RunInputError::message(format!( + "param '{}': expected datetime string, got {}", + key, + json_type_name(other) + )), + }), + }, + "Blob" => match value { + Value::String(value) => Ok(Literal::String(value.clone())), + other => Err(RunInputError::message(format!( + "param '{}': expected blob URI string, got {}", + key, + json_type_name(other) + ))), + }, + other if parse_list_item_type(other).is_some() => { + let item_type = parse_list_item_type(other).unwrap(); + let items = value.as_array().ok_or_else(|| match mode { + 
JsonParamMode::Standard => { + RunInputError::message(format!("param '{}': expected array for {}", key, other)) + } + JsonParamMode::JavaScript => RunInputError::message(format!( + "param '{}': expected array for {}, got {}", + key, + other, + json_type_name(value) + )), + })?; + let mut out = Vec::with_capacity(items.len()); + for item in items { + out.push(json_value_to_literal_typed(key, item, item_type, mode)?); + } + Ok(Literal::List(out)) + } + other if other.starts_with("Vector(") => { + let expected_dim = parse_vector_dim(other).ok_or_else(|| match mode { + JsonParamMode::Standard => RunInputError::message(format!( + "param '{}': invalid vector type '{}'", + key, other + )), + JsonParamMode::JavaScript => RunInputError::message(format!( + "param '{}': invalid vector type '{}' (expected Vector(N))", + key, other + )), + })?; + let items = value.as_array().ok_or_else(|| match mode { + JsonParamMode::Standard => { + RunInputError::message(format!("param '{}': expected array for {}", key, other)) + } + JsonParamMode::JavaScript => RunInputError::message(format!( + "param '{}': expected array for {}, got {}", + key, + other, + json_type_name(value) + )), + })?; + if items.len() != expected_dim { + return Err(RunInputError::message(format!( + "param '{}': expected {} values for {}, got {}", + key, + expected_dim, + other, + items.len() + ))); + } + let mut out = Vec::with_capacity(items.len()); + for item in items { + let value = item.as_f64().ok_or_else(|| match mode { + JsonParamMode::Standard => RunInputError::message(format!( + "param '{}': vector element is not numeric", + key + )), + JsonParamMode::JavaScript => RunInputError::message(format!( + "param '{}': vector element '{}' is not numeric", + key, item + )), + })?; + out.push(Literal::Float(value)); + } + Ok(Literal::List(out)) + } + _ => match value { + Value::String(value) => Ok(Literal::String(value.clone())), + other => Err(RunInputError::message(format!( + "param '{}': expected string for type 
'{}', got {}", + key, + type_name, + json_type_name(other) + ))), + }, + } +} + +fn json_value_to_literal_inferred( + key: &str, + value: &Value, + mode: JsonParamMode, +) -> RunInputResult { + match value { + Value::String(value) => Ok(Literal::String(value.clone())), + Value::Bool(value) => Ok(Literal::Bool(*value)), + Value::Number(number) => match mode { + JsonParamMode::Standard => { + if let Some(value) = number.as_i64() { + Ok(Literal::Integer(value)) + } else if let Some(value) = number.as_f64() { + Ok(Literal::Float(value)) + } else { + Err(RunInputError::message(format!( + "param '{}': unsupported numeric value", + key + ))) + } + } + JsonParamMode::JavaScript => { + if let Some(value) = number.as_i64() { + if !is_js_safe_integer_i64(value) { + return Err(RunInputError::message(format!( + "param '{}': integer {} exceeds JS safe integer range; use a decimal string and a typed query parameter for exact values", + key, value + ))); + } + Ok(Literal::Integer(value)) + } else if let Some(value) = number.as_u64() { + if value > JS_MAX_SAFE_INTEGER_U64 { + return Err(RunInputError::message(format!( + "param '{}': integer {} exceeds JS safe integer range; use a decimal string and a typed query parameter for exact values", + key, value + ))); + } + let value = i64::try_from(value).map_err(|_| { + RunInputError::message(format!( + "param '{}': integer {} exceeds supported range (max {})", + key, + value, + i64::MAX + )) + })?; + Ok(Literal::Integer(value)) + } else if let Some(value) = number.as_f64() { + Ok(Literal::Float(value)) + } else { + Err(RunInputError::message(format!( + "param '{}': unsupported number value", + key + ))) + } + } + }, + Value::Array(values) => { + let mut out = Vec::with_capacity(values.len()); + for value in values { + out.push(json_value_to_literal_inferred(key, value, mode)?); + } + Ok(Literal::List(out)) + } + Value::Null => Err(match mode { + JsonParamMode::Standard => { + RunInputError::message(format!("param '{}': null is not 
supported", key)) + } + JsonParamMode::JavaScript => RunInputError::message(format!( + "param '{}': null values are not supported as query parameters", + key + )), + }), + Value::Object(_) => Err(match mode { + JsonParamMode::Standard => { + RunInputError::message(format!("param '{}': object is not supported", key)) + } + JsonParamMode::JavaScript => RunInputError::message(format!( + "param '{}': object values are not supported as query parameters", + key + )), + }), + } +} + +fn parse_i64_param(key: &str, value: &Value, mode: JsonParamMode) -> RunInputResult { + match mode { + JsonParamMode::Standard => match value { + Value::Number(number) => number.as_i64().ok_or_else(|| { + RunInputError::message(format!("param '{}': expected integer number", key)) + }), + Value::String(value) => value.parse::().map_err(|_| { + RunInputError::message(format!( + "param '{}': expected integer string, got '{}'", + key, value + )) + }), + _ => Err(RunInputError::message(format!( + "param '{}': expected integer", + key + ))), + }, + JsonParamMode::JavaScript => match value { + Value::Number(number) => { + let parsed = if let Some(parsed) = number.as_i64() { + parsed + } else if let Some(parsed) = number.as_f64() { + if !parsed.is_finite() || parsed.fract() != 0.0 { + return Err(RunInputError::message(format!( + "param '{}': expected integer, got number", + key + ))); + } + if parsed < i64::MIN as f64 || parsed > i64::MAX as f64 { + return Err(RunInputError::message(format!( + "param '{}': integer {} is outside i64 range", + key, parsed + ))); + } + parsed as i64 + } else { + return Err(RunInputError::message(format!( + "param '{}': expected integer, got number", + key + ))); + }; + if !is_js_safe_integer_i64(parsed) { + return Err(RunInputError::message(format!( + "param '{}': integer {} exceeds JS safe integer range; pass a decimal string for exact values", + key, parsed + ))); + } + Ok(parsed) + } + Value::String(value) => value.parse::().map_err(|_| { + 
RunInputError::message(format!( + "param '{}': expected integer string, got '{}'", + key, value + )) + }), + other => Err(RunInputError::message(format!( + "param '{}': expected integer, got {}", + key, + json_type_name(other) + ))), + }, + } +} + +fn parse_u64_param(key: &str, value: &Value, mode: JsonParamMode) -> RunInputResult { + match mode { + JsonParamMode::Standard => match value { + Value::Number(number) => number.as_u64().ok_or_else(|| { + RunInputError::message(format!("param '{}': expected unsigned integer number", key)) + }), + Value::String(value) => value.parse::().map_err(|_| { + RunInputError::message(format!( + "param '{}': expected unsigned integer string, got '{}'", + key, value + )) + }), + _ => Err(RunInputError::message(format!( + "param '{}': expected unsigned integer", + key + ))), + }, + JsonParamMode::JavaScript => match value { + Value::Number(number) => { + let parsed = if let Some(parsed) = number.as_u64() { + parsed + } else if let Some(parsed) = number.as_f64() { + if !parsed.is_finite() || parsed.fract() != 0.0 || parsed < 0.0 { + return Err(RunInputError::message(format!( + "param '{}': expected unsigned integer, got number", + key + ))); + } + if parsed > u64::MAX as f64 { + return Err(RunInputError::message(format!( + "param '{}': integer {} is outside u64 range", + key, parsed + ))); + } + parsed as u64 + } else { + return Err(RunInputError::message(format!( + "param '{}': expected unsigned integer, got number", + key + ))); + }; + if parsed > JS_MAX_SAFE_INTEGER_U64 { + return Err(RunInputError::message(format!( + "param '{}': integer {} exceeds JS safe integer range; pass a decimal string for exact values", + key, parsed + ))); + } + Ok(parsed) + } + Value::String(value) => value.parse::().map_err(|_| { + RunInputError::message(format!( + "param '{}': expected unsigned integer string, got '{}'", + key, value + )) + }), + other => Err(RunInputError::message(format!( + "param '{}': expected unsigned integer, got {}", + key, + 
json_type_name(other) + ))), + }, + } +} + +fn parse_vector_dim(type_name: &str) -> Option { + let dim = type_name + .strip_prefix("Vector(")? + .strip_suffix(')')? + .parse::() + .ok()?; + if dim == 0 { None } else { Some(dim) } +} + +fn parse_list_item_type(type_name: &str) -> Option<&str> { + Some(type_name.strip_prefix('[')?.strip_suffix(']')?.trim()) +} + +fn json_type_name(value: &Value) -> &'static str { + match value { + Value::Null => "null", + Value::Bool(_) => "boolean", + Value::Number(_) => "number", + Value::String(_) => "string", + Value::Array(_) => "array", + Value::Object(_) => "object", + } +} + +#[cfg(test)] +mod tests { + use serde_json::json; + + use super::{JsonParamMode, ToParam, find_named_query, json_params_to_param_map}; + use crate::query::ast::Literal; + + #[test] + fn js_mode_rejects_unsafe_integer_numbers() { + let query = find_named_query( + "query find($id: U64) { match { $u: User } return { $u } }", + "find", + ) + .expect("query should parse"); + + let error = json_params_to_param_map( + Some(&json!({ "id": 9_007_199_254_740_992u64 })), + &query.params, + JsonParamMode::JavaScript, + ) + .expect_err("unsafe integer should fail"); + + assert_eq!( + error.to_string(), + "param 'id': integer 9007199254740992 exceeds JS safe integer range; pass a decimal string for exact values" + ); + } + + #[test] + fn standard_mode_preserves_ffi_param_object_error() { + let error = json_params_to_param_map(Some(&json!(["nope"])), &[], JsonParamMode::Standard) + .expect_err("non-object params should fail"); + + assert_eq!(error.to_string(), "params must be a JSON object"); + } + + #[test] + fn to_param_supports_lists_and_explicit_date_literals() { + let vector = vec![1_i32, 2_i32, 3_i32].to_param().expect("vector param"); + match vector { + Literal::List(values) => { + assert!(matches!(values.first(), Some(Literal::Integer(1)))); + assert!(matches!(values.get(1), Some(Literal::Integer(2)))); + assert!(matches!(values.get(2), 
Some(Literal::Integer(3)))); + } + other => panic!("expected list param, got {:?}", other), + } + + let date = Literal::Date("2026-03-06".to_string()) + .to_param() + .expect("date param"); + assert!(matches!(date, Literal::Date(ref value) if value == "2026-03-06")); + } + + #[test] + fn to_param_rejects_unsigned_values_outside_engine_range() { + let error = u64::MAX.to_param().expect_err("oversized u64 should fail"); + + assert_eq!( + error.to_string(), + format!( + "execution error: param value {} exceeds current engine range for numeric literals (max {})", + u64::MAX, + i64::MAX + ) + ); + } + + #[test] + fn params_macro_builds_param_map() { + let params = params! { + "name" => "Alice", + "age" => 41_i32, + "scores" => [1_u8, 2_u8, 3_u8], + "published_at" => Literal::DateTime("2026-03-06T12:00:00Z".to_string()), + } + .expect("params"); + + assert!(matches!( + params.get("name"), + Some(Literal::String(value)) if value == "Alice" + )); + assert!(matches!(params.get("age"), Some(Literal::Integer(41)))); + match params.get("scores") { + Some(Literal::List(values)) => { + assert!(matches!(values.first(), Some(Literal::Integer(1)))); + assert!(matches!(values.get(1), Some(Literal::Integer(2)))); + assert!(matches!(values.get(2), Some(Literal::Integer(3)))); + } + other => panic!("expected list param, got {:?}", other), + } + assert!(matches!( + params.get("published_at"), + Some(Literal::DateTime(value)) if value == "2026-03-06T12:00:00Z" + )); + } + + #[test] + fn typed_json_params_support_list_and_datetime_types() { + let query = find_named_query( + r#" +query q($tags: [String], $days: [Date]?, $due_at: DateTime) { + match { $t: Task } + return { $t.slug } +} +"#, + "q", + ) + .expect("query"); + + let params = json_params_to_param_map( + Some(&json!({ + "tags": ["launch", "priority"], + "days": ["2026-04-01", "2026-04-02"], + "due_at": "2026-04-03T10:15:00Z" + })), + &query.params, + JsonParamMode::Standard, + ) + .expect("typed params"); + + assert!(matches!( + 
params.get("due_at"), + Some(Literal::DateTime(value)) if value == "2026-04-03T10:15:00Z" + )); + match params.get("tags") { + Some(Literal::List(values)) => { + assert!( + matches!(values.first(), Some(Literal::String(value)) if value == "launch") + ); + assert!( + matches!(values.get(1), Some(Literal::String(value)) if value == "priority") + ); + } + other => panic!("expected string list param, got {:?}", other), + } + match params.get("days") { + Some(Literal::List(values)) => { + assert!( + matches!(values.first(), Some(Literal::Date(value)) if value == "2026-04-01") + ); + assert!( + matches!(values.get(1), Some(Literal::Date(value)) if value == "2026-04-02") + ); + } + other => panic!("expected date list param, got {:?}", other), + } + } +} diff --git a/crates/omnigraph-compiler/src/result.rs b/crates/omnigraph-compiler/src/result.rs new file mode 100644 index 0000000..7de77ac --- /dev/null +++ b/crates/omnigraph-compiler/src/result.rs @@ -0,0 +1,286 @@ +use std::sync::Arc; + +use arrow_array::{RecordBatch, UInt64Array}; +use arrow_ipc::writer::StreamWriter; +use arrow_schema::{DataType, Field, Schema, SchemaRef}; +use serde::de::DeserializeOwned; + +use crate::error::{NanoError, Result}; +use crate::json_output::{record_batches_to_json_rows, record_batches_to_rust_json_rows}; + +#[derive(Debug, Clone, Copy, Default)] +pub struct MutationExecResult { + pub affected_nodes: usize, + pub affected_edges: usize, +} + +#[derive(Debug, Clone)] +pub struct QueryResult { + schema: SchemaRef, + batches: Vec, +} + +impl QueryResult { + pub fn new(schema: SchemaRef, batches: Vec) -> Self { + Self { schema, batches } + } + + pub fn schema(&self) -> &SchemaRef { + &self.schema + } + + pub fn batches(&self) -> &[RecordBatch] { + &self.batches + } + + pub fn into_batches(self) -> Vec { + self.batches + } + + pub fn num_rows(&self) -> usize { + self.batches.iter().map(RecordBatch::num_rows).sum() + } + + pub fn concat_batches(&self) -> Result { + if self.batches.is_empty() { 
+ return Ok(RecordBatch::new_empty(self.schema.clone())); + } + + arrow_select::concat::concat_batches(&self.schema, &self.batches) + .map_err(|err| NanoError::Execution(err.to_string())) + } + + pub fn to_sdk_json(&self) -> serde_json::Value { + serde_json::Value::Array(record_batches_to_json_rows(&self.batches)) + } + + pub fn to_rust_json(&self) -> serde_json::Value { + serde_json::Value::Array(record_batches_to_rust_json_rows(&self.batches)) + } + + pub fn deserialize(&self) -> Result { + serde_json::from_value(self.to_rust_json()).map_err(|err| { + NanoError::Execution(format!("failed to deserialize query result: {}", err)) + }) + } + + pub fn to_arrow_ipc(&self) -> Result> { + let mut buffer = Vec::new(); + let mut writer = StreamWriter::try_new(&mut buffer, &self.schema)?; + for batch in &self.batches { + writer.write(batch)?; + } + writer.finish()?; + drop(writer); + Ok(buffer) + } +} + +#[derive(Debug, Clone, Copy, Default)] +pub struct MutationResult { + pub affected_nodes: usize, + pub affected_edges: usize, +} + +impl MutationResult { + pub fn to_sdk_json(&self) -> serde_json::Value { + serde_json::json!({ + "affectedNodes": self.affected_nodes, + "affectedEdges": self.affected_edges, + }) + } + + pub fn to_record_batch(&self) -> Result { + let schema = Arc::new(Schema::new(vec![ + Field::new("affected_nodes", DataType::UInt64, false), + Field::new("affected_edges", DataType::UInt64, false), + ])); + Ok(RecordBatch::try_new( + schema, + vec![ + Arc::new(UInt64Array::from(vec![self.affected_nodes as u64])), + Arc::new(UInt64Array::from(vec![self.affected_edges as u64])), + ], + )?) 
+ } +} + +impl From for MutationResult { + fn from(value: MutationExecResult) -> Self { + Self { + affected_nodes: value.affected_nodes, + affected_edges: value.affected_edges, + } + } +} + +#[derive(Debug, Clone)] +pub enum RunResult { + Query(QueryResult), + Mutation(MutationResult), +} + +impl RunResult { + pub fn to_sdk_json(&self) -> serde_json::Value { + match self { + Self::Query(result) => result.to_sdk_json(), + Self::Mutation(result) => result.to_sdk_json(), + } + } + + pub fn into_record_batches(self) -> Result> { + match self { + Self::Query(result) => Ok(result.into_batches()), + Self::Mutation(result) => Ok(vec![result.to_record_batch()?]), + } + } +} + +#[cfg(test)] +mod tests { + use std::io::Cursor; + + use arrow_array::Int64Array; + use arrow_ipc::reader::StreamReader; + use serde::Deserialize; + + use super::*; + + #[test] + fn query_result_arrow_ipc_round_trips_empty_schema() { + let schema = Arc::new(Schema::new(vec![Field::new("name", DataType::Utf8, false)])); + let result = QueryResult::new(schema.clone(), vec![]); + + let encoded = result.to_arrow_ipc().expect("encode empty result"); + let reader = StreamReader::try_new(Cursor::new(encoded), None).expect("open stream"); + + assert_eq!(reader.schema().as_ref(), schema.as_ref()); + assert_eq!(reader.count(), 0); + } + + #[test] + fn query_result_arrow_ipc_round_trips_batches() { + let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::UInt64, false)])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(UInt64Array::from(vec![1_u64, 2_u64]))], + ) + .expect("batch"); + let result = QueryResult::new(schema.clone(), vec![batch]); + + let encoded = result.to_arrow_ipc().expect("encode result"); + let mut reader = StreamReader::try_new(Cursor::new(encoded), None).expect("open stream"); + let decoded = reader.next().expect("first batch").expect("decode batch"); + + assert_eq!(reader.schema().as_ref(), schema.as_ref()); + assert_eq!(decoded.num_rows(), 2); + 
assert_eq!(decoded.schema().as_ref(), schema.as_ref()); + } + + #[test] + fn query_result_num_rows_and_concat_cover_multiple_batches() { + let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::UInt64, false)])); + let batch1 = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(UInt64Array::from(vec![1_u64, 2_u64]))], + ) + .expect("batch1"); + let batch2 = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(UInt64Array::from(vec![3_u64]))], + ) + .expect("batch2"); + let result = QueryResult::new(schema.clone(), vec![batch1, batch2]); + + assert_eq!(result.num_rows(), 3); + + let concatenated = result.concat_batches().expect("concat batches"); + let ids = concatenated + .column(0) + .as_any() + .downcast_ref::() + .expect("u64 ids"); + assert_eq!(concatenated.schema().as_ref(), schema.as_ref()); + assert_eq!(ids.values(), &[1, 2, 3]); + } + + #[test] + fn query_result_concat_empty_batches_returns_empty_batch() { + let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::UInt64, false)])); + let result = QueryResult::new(schema.clone(), vec![]); + + let concatenated = result.concat_batches().expect("concat empty"); + + assert_eq!(concatenated.schema().as_ref(), schema.as_ref()); + assert_eq!(concatenated.num_rows(), 0); + } + + #[test] + fn query_result_to_rust_json_preserves_wide_integers() { + let schema = Arc::new(Schema::new(vec![ + Field::new("signed", DataType::Int64, false), + Field::new("unsigned", DataType::UInt64, false), + ])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int64Array::from(vec![i64::MIN])), + Arc::new(UInt64Array::from(vec![u64::MAX])), + ], + ) + .expect("batch"); + let result = QueryResult::new(schema, vec![batch]); + + assert_eq!( + result.to_rust_json(), + serde_json::json!([{ + "signed": i64::MIN, + "unsigned": u64::MAX, + }]) + ); + } + + #[derive(Debug, Deserialize, PartialEq)] + struct PersonRow { + id: u64, + age: i64, + } + + #[test] + fn 
query_result_deserialize_decodes_rust_rows() { + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::UInt64, false), + Field::new("age", DataType::Int64, false), + ])); + let batch1 = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(UInt64Array::from(vec![1_u64])), + Arc::new(Int64Array::from(vec![40_i64])), + ], + ) + .expect("batch1"); + let batch2 = RecordBatch::try_new( + schema, + vec![ + Arc::new(UInt64Array::from(vec![u64::MAX])), + Arc::new(Int64Array::from(vec![-5_i64])), + ], + ) + .expect("batch2"); + let result = QueryResult::new(batch1.schema(), vec![batch1, batch2]); + + let rows: Vec = result.deserialize().expect("deserialize rows"); + + assert_eq!( + rows, + vec![ + PersonRow { id: 1, age: 40 }, + PersonRow { + id: u64::MAX, + age: -5, + }, + ] + ); + } +} diff --git a/crates/omnigraph-compiler/src/schema/ast.rs b/crates/omnigraph-compiler/src/schema/ast.rs new file mode 100644 index 0000000..f8ed18a --- /dev/null +++ b/crates/omnigraph-compiler/src/schema/ast.rs @@ -0,0 +1,111 @@ +use crate::types::PropType; +use serde::{Deserialize, Serialize}; + +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct SchemaFile { + pub declarations: Vec, +} + +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub enum SchemaDecl { + Interface(InterfaceDecl), + Node(NodeDecl), + Edge(EdgeDecl), +} + +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct InterfaceDecl { + pub name: String, + pub properties: Vec, +} + +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct NodeDecl { + pub name: String, + pub annotations: Vec, + pub implements: Vec, + pub properties: Vec, + pub constraints: Vec, +} + +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct EdgeDecl { + pub name: String, + pub from_type: String, + pub to_type: String, + pub cardinality: Cardinality, + pub annotations: Vec, + pub properties: Vec, + pub constraints: Vec, +} + +#[derive(Debug, Clone, 
PartialEq, Serialize, Deserialize)] +pub struct PropDecl { + pub name: String, + pub prop_type: PropType, + pub annotations: Vec, +} + +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub struct Annotation { + pub name: String, + pub value: Option, +} + +/// A typed constraint declared in a node or edge body. +/// +/// Property-level annotations (`@key`, `@unique`, `@index`) are desugared +/// into these during parsing, so both syntactic positions produce the same +/// representation. +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub enum Constraint { + Key(Vec), + Unique(Vec), + Index(Vec), + Range { + property: String, + min: Option, + max: Option, + }, + Check { + property: String, + pattern: String, + }, +} + +/// A numeric bound used in `@range` constraints. +#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)] +pub enum ConstraintBound { + Integer(i64), + Float(f64), +} + +/// Edge cardinality: `@card(min..max)`. Default is `0..*`. +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct Cardinality { + pub min: u32, + pub max: Option, +} + +impl Default for Cardinality { + fn default() -> Self { + Self { min: 0, max: None } + } +} + +impl Cardinality { + pub fn is_default(&self) -> bool { + self.min == 0 && self.max.is_none() + } +} + +pub fn has_annotation(annotations: &[Annotation], name: &str) -> bool { + annotations.iter().any(|ann| ann.name == name) +} + +pub fn annotation_value<'a>(annotations: &'a [Annotation], name: &str) -> Option<&'a str> { + annotations + .iter() + .find(|ann| ann.name == name) + .and_then(|ann| ann.value.as_deref()) +} diff --git a/crates/omnigraph-compiler/src/schema/mod.rs b/crates/omnigraph-compiler/src/schema/mod.rs new file mode 100644 index 0000000..a310c76 --- /dev/null +++ b/crates/omnigraph-compiler/src/schema/mod.rs @@ -0,0 +1,2 @@ +pub mod ast; +pub mod parser; diff --git a/crates/omnigraph-compiler/src/schema/parser.rs b/crates/omnigraph-compiler/src/schema/parser.rs 
new file mode 100644 index 0000000..975d5a0 --- /dev/null +++ b/crates/omnigraph-compiler/src/schema/parser.rs @@ -0,0 +1,1950 @@ +use std::collections::HashMap; + +use pest::Parser; +use pest::error::InputLocation; +use pest_derive::Parser; + +use crate::error::{ + NanoError, ParseDiagnostic, Result, SourceSpan, decode_string_literal, render_span, +}; +use crate::types::{PropType, ScalarType}; + +use super::ast::*; + +#[derive(Parser)] +#[grammar = "schema/schema.pest"] +struct SchemaParser; + +pub fn parse_schema(input: &str) -> Result { + parse_schema_diagnostic(input).map_err(|e| NanoError::Parse(e.to_string())) +} + +pub fn parse_schema_diagnostic(input: &str) -> std::result::Result { + let pairs = SchemaParser::parse(Rule::schema_file, input).map_err(pest_error_to_diagnostic)?; + + let mut declarations = Vec::new(); + for pair in pairs { + if pair.as_rule() == Rule::schema_file { + for inner in pair.into_inner() { + if let Rule::schema_decl = inner.as_rule() { + declarations.push(parse_schema_decl(inner).map_err(nano_error_to_diagnostic)?); + } + } + } + } + + // Collect interfaces for resolution (clone to avoid borrow conflict) + let interfaces: Vec = declarations + .iter() + .filter_map(|d| match d { + SchemaDecl::Interface(i) => Some(i.clone()), + _ => None, + }) + .collect(); + + // Resolve implements clauses on nodes + let iface_refs: Vec<&InterfaceDecl> = interfaces.iter().collect(); + for decl in &mut declarations { + if let SchemaDecl::Node(node) = decl { + resolve_interfaces(node, &iface_refs).map_err(nano_error_to_diagnostic)?; + } + } + + let schema = SchemaFile { declarations }; + validate_schema_annotations(&schema).map_err(nano_error_to_diagnostic)?; + validate_constraints(&schema).map_err(nano_error_to_diagnostic)?; + Ok(schema) +} + +fn pest_error_to_diagnostic(err: pest::error::Error) -> ParseDiagnostic { + let span = match err.location { + InputLocation::Pos(pos) => Some(render_span(SourceSpan::new(pos, pos))), + InputLocation::Span((start, 
end)) => Some(render_span(SourceSpan::new(start, end))), + }; + ParseDiagnostic::new(err.to_string(), span) +} + +fn nano_error_to_diagnostic(err: NanoError) -> ParseDiagnostic { + ParseDiagnostic::new(err.to_string(), None) +} + +fn parse_schema_decl(pair: pest::iterators::Pair) -> Result { + let inner = pair.into_inner().next().unwrap(); + match inner.as_rule() { + Rule::interface_decl => Ok(SchemaDecl::Interface(parse_interface_decl(inner)?)), + Rule::node_decl => Ok(SchemaDecl::Node(parse_node_decl(inner)?)), + Rule::edge_decl => Ok(SchemaDecl::Edge(parse_edge_decl(inner)?)), + _ => Err(NanoError::Parse(format!( + "unexpected rule: {:?}", + inner.as_rule() + ))), + } +} + +fn parse_interface_decl(pair: pest::iterators::Pair) -> Result { + let mut inner = pair.into_inner(); + let name = inner.next().unwrap().as_str().to_string(); + + let mut properties = Vec::new(); + for item in inner { + if let Rule::prop_decl = item.as_rule() { + properties.push(parse_prop_decl(item)?); + } + } + + Ok(InterfaceDecl { name, properties }) +} + +fn parse_node_decl(pair: pest::iterators::Pair) -> Result { + let mut inner = pair.into_inner(); + let name = inner.next().unwrap().as_str().to_string(); + + let mut annotations = Vec::new(); + let mut implements = Vec::new(); + let mut properties = Vec::new(); + let mut constraints = Vec::new(); + + for item in inner { + match item.as_rule() { + Rule::annotation => { + annotations.push(parse_annotation(item)?); + } + Rule::implements_clause => { + for iface in item.into_inner() { + if iface.as_rule() == Rule::type_name { + implements.push(iface.as_str().to_string()); + } + } + } + Rule::prop_decl => { + properties.push(parse_prop_decl(item)?); + } + Rule::body_constraint => { + constraints.push(parse_body_constraint(item)?); + } + _ => {} + } + } + + // Desugar property-level @key/@unique/@index annotations into constraints + desugar_property_constraints(&properties, &mut constraints); + + Ok(NodeDecl { + name, + annotations, + 
implements, + properties, + constraints, + }) +} + +fn parse_edge_decl(pair: pest::iterators::Pair) -> Result { + let mut inner = pair.into_inner(); + let name = inner.next().unwrap().as_str().to_string(); + let from_type = inner.next().unwrap().as_str().to_string(); + let to_type = inner.next().unwrap().as_str().to_string(); + + let mut cardinality = Cardinality::default(); + let mut annotations = Vec::new(); + let mut properties = Vec::new(); + let mut constraints = Vec::new(); + + for item in inner { + match item.as_rule() { + Rule::cardinality => { + cardinality = parse_cardinality(item)?; + } + Rule::annotation => annotations.push(parse_annotation(item)?), + Rule::prop_decl => properties.push(parse_prop_decl(item)?), + Rule::body_constraint => constraints.push(parse_body_constraint(item)?), + _ => {} + } + } + + // Desugar property-level @unique/@index on edge properties + desugar_property_constraints(&properties, &mut constraints); + + Ok(EdgeDecl { + name, + from_type, + to_type, + cardinality, + annotations, + properties, + constraints, + }) +} + +fn parse_cardinality(pair: pest::iterators::Pair) -> Result { + let mut inner = pair.into_inner(); + let min_str = inner.next().unwrap().as_str(); + let min = min_str + .parse::() + .map_err(|_| NanoError::Parse(format!("invalid cardinality min: {}", min_str)))?; + let max = if let Some(max_pair) = inner.next() { + let max_str = max_pair.as_str(); + Some( + max_str + .parse::() + .map_err(|_| NanoError::Parse(format!("invalid cardinality max: {}", max_str)))?, + ) + } else { + None + }; + + if let Some(max_val) = max { + if min > max_val { + return Err(NanoError::Parse(format!( + "cardinality min ({}) exceeds max ({})", + min, max_val + ))); + } + } + + Ok(Cardinality { min, max }) +} + +fn parse_body_constraint(pair: pest::iterators::Pair) -> Result { + let mut inner = pair.into_inner(); + let name_pair = inner.next().unwrap(); + let constraint_name = name_pair.as_str(); + let args_pair = inner.next().unwrap(); + 
let args: Vec> = args_pair.into_inner().collect(); + + match constraint_name { + "key" => { + let names: Vec = args + .into_iter() + .filter(|a| a.as_rule() == Rule::ident || a.as_rule() == Rule::constraint_arg) + .map(|a| extract_ident_from_constraint_arg(a)) + .collect::>>()?; + if names.is_empty() { + return Err(NanoError::Parse( + "@key constraint requires at least one property name".to_string(), + )); + } + Ok(Constraint::Key(names)) + } + "unique" => { + let names = extract_ident_list_from_args(args)?; + if names.is_empty() { + return Err(NanoError::Parse( + "@unique constraint requires at least one property name".to_string(), + )); + } + Ok(Constraint::Unique(names)) + } + "index" => { + let names = extract_ident_list_from_args(args)?; + if names.is_empty() { + return Err(NanoError::Parse( + "@index constraint requires at least one property name".to_string(), + )); + } + Ok(Constraint::Index(names)) + } + "range" => { + // @range(prop, min..max) + if args.len() < 2 { + return Err(NanoError::Parse( + "@range requires property name and bounds: @range(prop, min..max)".to_string(), + )); + } + let property = extract_ident_from_constraint_arg(args[0].clone())?; + // The second arg should be a range_bound + let (min, max) = extract_range_bounds(&args[1])?; + Ok(Constraint::Range { property, min, max }) + } + "check" => { + // @check(prop, "regex") + if args.len() < 2 { + return Err(NanoError::Parse( + "@check requires property name and pattern: @check(prop, \"regex\")" + .to_string(), + )); + } + let property = extract_ident_from_constraint_arg(args[0].clone())?; + let pattern = extract_string_from_constraint_arg(&args[1])?; + Ok(Constraint::Check { property, pattern }) + } + other => Err(NanoError::Parse(format!("unknown constraint: @{}", other))), + } +} + +fn extract_ident_from_constraint_arg(pair: pest::iterators::Pair) -> Result { + if pair.as_rule() == Rule::ident { + return Ok(pair.as_str().to_string()); + } + // constraint_arg wraps ident or literal + if 
let Some(inner) = pair.into_inner().next() { + if inner.as_rule() == Rule::ident { + return Ok(inner.as_str().to_string()); + } + } + Err(NanoError::Parse( + "expected property name in constraint".to_string(), + )) +} + +fn extract_ident_list_from_args(args: Vec>) -> Result> { + let mut names = Vec::new(); + for arg in args { + names.push(extract_ident_from_constraint_arg(arg)?); + } + Ok(names) +} + +fn extract_string_from_constraint_arg(pair: &pest::iterators::Pair) -> Result { + // Navigate into constraint_arg -> literal -> string_lit + fn find_string(pair: &pest::iterators::Pair) -> Result> { + if pair.as_rule() == Rule::string_lit { + return decode_string_literal(pair.as_str()).map(Some); + } + for inner in pair.clone().into_inner() { + if let Some(s) = find_string(&inner)? { + return Ok(Some(s)); + } + } + Ok(None) + } + + find_string(pair)? + .ok_or_else(|| NanoError::Parse("expected string argument in constraint".to_string())) +} + +fn extract_range_bounds( + pair: &pest::iterators::Pair, +) -> Result<(Option, Option)> { + // Find the range_bound node inside the constraint_arg + let range_pair = if pair.as_rule() == Rule::range_bound { + pair.clone() + } else { + let mut found = None; + for inner in pair.clone().into_inner() { + if inner.as_rule() == Rule::range_bound { + found = Some(inner); + break; + } + } + found.ok_or_else(|| { + NanoError::Parse("expected range bounds (min..max) in @range constraint".to_string()) + })? 
+ }; + + let mut min = None; + let mut max = None; + let mut seen_bound = false; + + for child in range_pair.into_inner() { + if child.as_rule() == Rule::literal + || child.as_rule() == Rule::integer + || child.as_rule() == Rule::float_lit + || child.as_rule() == Rule::signed_integer + || child.as_rule() == Rule::signed_float + { + let bound = parse_constraint_bound(&child)?; + if !seen_bound { + min = Some(bound); + seen_bound = true; + } else { + max = Some(bound); + } + } + } + + Ok((min, max)) +} + +fn parse_constraint_bound(pair: &pest::iterators::Pair) -> Result { + let text = pair.as_str(); + + // Try as integer first + if let Ok(n) = text.parse::() { + return Ok(ConstraintBound::Integer(n)); + } + // Try as float + if let Ok(f) = text.parse::() { + return Ok(ConstraintBound::Float(f)); + } + + // Navigate into literal -> integer/float_lit + for inner in pair.clone().into_inner() { + let s = inner.as_str(); + if let Ok(n) = s.parse::() { + return Ok(ConstraintBound::Integer(n)); + } + if let Ok(f) = s.parse::() { + return Ok(ConstraintBound::Float(f)); + } + } + + Err(NanoError::Parse(format!( + "invalid constraint bound: {}", + text + ))) +} + +/// Desugar property-level @key/@unique/@index annotations into body-level constraints. +fn desugar_property_constraints(properties: &[PropDecl], constraints: &mut Vec) { + for prop in properties { + for ann in &prop.annotations { + match ann.name.as_str() { + "key" if ann.value.is_none() => { + constraints.push(Constraint::Key(vec![prop.name.clone()])); + } + "unique" if ann.value.is_none() => { + constraints.push(Constraint::Unique(vec![prop.name.clone()])); + } + "index" if ann.value.is_none() => { + constraints.push(Constraint::Index(vec![prop.name.clone()])); + } + _ => {} + } + } + } +} + +/// Resolve interface implements clauses — verify properties exist or inject them. 
+fn resolve_interfaces(node: &mut NodeDecl, interfaces: &[&InterfaceDecl]) -> Result<()> { + let interface_map: HashMap<&str, &InterfaceDecl> = + interfaces.iter().map(|i| (i.name.as_str(), *i)).collect(); + + for iface_name in &node.implements { + let iface = interface_map.get(iface_name.as_str()).ok_or_else(|| { + NanoError::Parse(format!( + "node {} implements unknown interface '{}'", + node.name, iface_name + )) + })?; + + for iface_prop in &iface.properties { + if let Some(existing) = node.properties.iter().find(|p| p.name == iface_prop.name) { + // Property exists — verify type compatibility + if existing.prop_type != iface_prop.prop_type { + return Err(NanoError::Parse(format!( + "node {} property '{}' has type {} but interface {} declares it as {}", + node.name, + iface_prop.name, + existing.prop_type.display_name(), + iface_name, + iface_prop.prop_type.display_name() + ))); + } + } else { + // Property missing — inject it from the interface + node.properties.push(iface_prop.clone()); + // Also desugar any constraint annotations from the injected property + desugar_property_constraints( + std::slice::from_ref(iface_prop), + &mut node.constraints, + ); + } + } + } + + Ok(()) +} + +fn parse_prop_decl(pair: pest::iterators::Pair) -> Result { + let mut inner = pair.into_inner(); + let name = inner.next().unwrap().as_str().to_string(); + let type_ref = inner.next().unwrap(); + let prop_type = parse_type_ref(type_ref)?; + + let mut annotations = Vec::new(); + for item in inner { + if let Rule::annotation = item.as_rule() { + annotations.push(parse_annotation(item)?); + } + } + + Ok(PropDecl { + name, + prop_type, + annotations, + }) +} + +fn parse_type_ref(pair: pest::iterators::Pair) -> Result { + let text = pair.as_str(); + let nullable = text.ends_with('?'); + + let mut inner = pair + .into_inner() + .next() + .ok_or_else(|| NanoError::Parse("type reference is missing core type".to_string()))?; + if inner.as_rule() == Rule::core_type { + inner = inner + 
.into_inner() + .next() + .ok_or_else(|| NanoError::Parse("type reference is missing core type".to_string()))?; + } + + match inner.as_rule() { + Rule::base_type => { + let scalar = ScalarType::from_str_name(inner.as_str()) + .ok_or_else(|| NanoError::Parse(format!("unknown type: {}", inner.as_str())))?; + Ok(PropType::scalar(scalar, nullable)) + } + Rule::vector_type => { + let dim_text = inner + .into_inner() + .next() + .ok_or_else(|| NanoError::Parse("Vector type missing dimension".to_string()))? + .as_str(); + let dim = dim_text + .parse::() + .map_err(|e| NanoError::Parse(format!("invalid Vector dimension: {}", e)))?; + if dim == 0 { + return Err(NanoError::Parse( + "Vector dimension must be greater than zero".to_string(), + )); + } + if dim > i32::MAX as u32 { + return Err(NanoError::Parse(format!( + "Vector dimension {} exceeds maximum supported {}", + dim, + i32::MAX + ))); + } + Ok(PropType::scalar(ScalarType::Vector(dim), nullable)) + } + Rule::list_type => { + let element = inner + .into_inner() + .next() + .ok_or_else(|| NanoError::Parse("list type missing element type".to_string()))?; + let scalar = ScalarType::from_str_name(element.as_str()).ok_or_else(|| { + NanoError::Parse(format!("unknown list element type: {}", element.as_str())) + })?; + if matches!(scalar, ScalarType::Blob) { + return Err(NanoError::Parse( + "list of Blob is not supported".to_string(), + )); + } + Ok(PropType::list_of(scalar, nullable)) + } + Rule::enum_type => { + let mut values = Vec::new(); + for value in inner.into_inner() { + if value.as_rule() == Rule::enum_value { + values.push(value.as_str().to_string()); + } + } + if values.is_empty() { + return Err(NanoError::Parse( + "enum type must include at least one value".to_string(), + )); + } + let mut dedup = values.clone(); + dedup.sort(); + dedup.dedup(); + if dedup.len() != values.len() { + return Err(NanoError::Parse( + "enum type cannot include duplicate values".to_string(), + )); + } + Ok(PropType::enum_type(values, 
nullable)) + } + other => Err(NanoError::Parse(format!( + "unexpected type rule: {:?}", + other + ))), + } +} + +fn parse_annotation(pair: pest::iterators::Pair) -> Result { + let mut inner = pair.into_inner(); + let name = inner.next().unwrap().as_str().to_string(); + let value = inner + .next() + .map(|p| decode_string_literal(p.as_str())) + .transpose()?; + + Ok(Annotation { name, value }) +} + +fn validate_string_annotation( + annotations: &[Annotation], + annotation: &str, + target: &str, +) -> Result<()> { + let mut seen = false; + for ann in annotations { + if ann.name != annotation { + continue; + } + if seen { + return Err(NanoError::Parse(format!( + "{} declares @{} multiple times", + target, annotation + ))); + } + let value = ann.value.as_deref().ok_or_else(|| { + NanoError::Parse(format!( + "@{} on {} requires a non-empty value", + annotation, target + )) + })?; + if value.trim().is_empty() { + return Err(NanoError::Parse(format!( + "@{} on {} requires a non-empty value", + annotation, target + ))); + } + seen = true; + } + Ok(()) +} + +// ─── Annotation Validation (metadata only) ─────────────────────────────────── + +fn validate_schema_annotations(schema: &SchemaFile) -> Result<()> { + for decl in &schema.declarations { + match decl { + SchemaDecl::Interface(_) => {} // Interfaces have no type-level annotations + SchemaDecl::Node(node) => { + // Reject constraint annotations on node level (must be on properties or as body constraints) + for ann in &node.annotations { + if ann.name == "key" + || ann.name == "unique" + || ann.name == "index" + || ann.name == "embed" + { + return Err(NanoError::Parse(format!( + "@{} is only supported on node properties or as body constraint (node {})", + ann.name, node.name + ))); + } + } + validate_string_annotation( + &node.annotations, + "description", + &format!("node {}", node.name), + )?; + validate_string_annotation( + &node.annotations, + "instruction", + &format!("node {}", node.name), + )?; + + // Validate 
property-level annotations + for prop in &node.properties { + validate_property_annotations(prop, &node.name, &node.properties, false)?; + } + } + SchemaDecl::Edge(edge) => { + for ann in &edge.annotations { + if ann.name == "key" + || ann.name == "unique" + || ann.name == "index" + || ann.name == "embed" + { + return Err(NanoError::Parse(format!( + "@{} is not supported on edges (edge {})", + ann.name, edge.name + ))); + } + } + validate_string_annotation( + &edge.annotations, + "description", + &format!("edge {}", edge.name), + )?; + validate_string_annotation( + &edge.annotations, + "instruction", + &format!("edge {}", edge.name), + )?; + + for prop in &edge.properties { + validate_property_annotations(prop, &edge.name, &edge.properties, true)?; + } + } + } + } + Ok(()) +} + +fn validate_property_annotations( + prop: &PropDecl, + type_name: &str, + all_properties: &[PropDecl], + is_edge: bool, +) -> Result<()> { + let is_vector = matches!(prop.prop_type.scalar, ScalarType::Vector(_)); + let is_blob = matches!(prop.prop_type.scalar, ScalarType::Blob); + + validate_string_annotation( + &prop.annotations, + "description", + &format!("property {}.{}", type_name, prop.name), + )?; + + let mut key_seen = false; + let mut unique_seen = false; + let mut index_seen = false; + let mut embed_seen = false; + + for ann in &prop.annotations { + // List/vector/blob restrictions on property-level annotations + if prop.prop_type.list + && (ann.name == "key" + || ann.name == "unique" + || ann.name == "index" + || ann.name == "embed") + { + return Err(NanoError::Parse(format!( + "@{} is not supported on list property {}.{}", + ann.name, type_name, prop.name + ))); + } + if is_vector && (ann.name == "key" || ann.name == "unique") { + return Err(NanoError::Parse(format!( + "@{} is not supported on vector property {}.{}", + ann.name, type_name, prop.name + ))); + } + if is_blob + && (ann.name == "key" + || ann.name == "unique" + || ann.name == "index" + || ann.name == "embed") + { + 
return Err(NanoError::Parse(format!( + "@{} is not supported on blob property {}.{}", + ann.name, type_name, prop.name + ))); + } + if ann.name == "instruction" { + return Err(NanoError::Parse(format!( + "@instruction is only supported on node and edge types (property {}.{})", + type_name, prop.name + ))); + } + + // Edge-specific restrictions + if is_edge && (ann.name == "key" || ann.name == "embed") { + return Err(NanoError::Parse(format!( + "@{} is not supported on edge properties (edge {}.{})", + ann.name, type_name, prop.name + ))); + } + + // Arity checks + match ann.name.as_str() { + "key" => { + if ann.value.is_some() { + return Err(NanoError::Parse(format!( + "@key on {}.{} does not accept a value", + type_name, prop.name + ))); + } + if key_seen { + return Err(NanoError::Parse(format!( + "property {}.{} declares @key multiple times", + type_name, prop.name + ))); + } + key_seen = true; + } + "unique" => { + if ann.value.is_some() { + return Err(NanoError::Parse(format!( + "@unique on {}.{} does not accept a value", + type_name, prop.name + ))); + } + if unique_seen { + return Err(NanoError::Parse(format!( + "property {}.{} declares @unique multiple times", + type_name, prop.name + ))); + } + unique_seen = true; + } + "index" => { + if ann.value.is_some() { + return Err(NanoError::Parse(format!( + "@index on {}.{} does not accept a value", + type_name, prop.name + ))); + } + if index_seen { + return Err(NanoError::Parse(format!( + "property {}.{} declares @index multiple times", + type_name, prop.name + ))); + } + index_seen = true; + } + "embed" => { + if embed_seen { + return Err(NanoError::Parse(format!( + "property {}.{} declares @embed multiple times", + type_name, prop.name + ))); + } + embed_seen = true; + + if !is_vector { + return Err(NanoError::Parse(format!( + "@embed is only supported on vector properties ({}.{})", + type_name, prop.name + ))); + } + + let source_prop = ann.value.as_deref().ok_or_else(|| { + NanoError::Parse(format!( + "@embed 
on {}.{} requires a source property name", + type_name, prop.name + )) + })?; + if source_prop.trim().is_empty() { + return Err(NanoError::Parse(format!( + "@embed on {}.{} requires a non-empty source property name", + type_name, prop.name + ))); + } + + let source_decl = all_properties + .iter() + .find(|p| p.name == source_prop) + .ok_or_else(|| { + NanoError::Parse(format!( + "@embed on {}.{} references unknown source property {}", + type_name, prop.name, source_prop + )) + })?; + if source_decl.prop_type.list || source_decl.prop_type.scalar != ScalarType::String + { + return Err(NanoError::Parse(format!( + "@embed source property {}.{} must be String", + type_name, source_prop + ))); + } + } + _ => {} + } + } + Ok(()) +} + +// ─── Constraint Validation ─────────────────────────────────────────────────── + +fn validate_constraints(schema: &SchemaFile) -> Result<()> { + for decl in &schema.declarations { + match decl { + SchemaDecl::Interface(_) => {} + SchemaDecl::Node(node) => { + validate_type_constraints(&node.constraints, &node.properties, &node.name, false)?; + } + SchemaDecl::Edge(edge) => { + validate_type_constraints(&edge.constraints, &edge.properties, &edge.name, true)?; + } + } + } + Ok(()) +} + +fn validate_type_constraints( + constraints: &[Constraint], + properties: &[PropDecl], + type_name: &str, + is_edge: bool, +) -> Result<()> { + let prop_names: HashMap<&str, &PropDecl> = + properties.iter().map(|p| (p.name.as_str(), p)).collect(); + + let mut key_count = 0usize; + + for constraint in constraints { + match constraint { + Constraint::Key(cols) => { + if is_edge { + return Err(NanoError::Parse(format!( + "@key constraint is not supported on edges (edge {})", + type_name + ))); + } + key_count += 1; + if key_count > 1 { + return Err(NanoError::Parse(format!( + "node type {} has multiple @key constraints; only one is supported", + type_name + ))); + } + for col in cols { + let prop = prop_names.get(col.as_str()).ok_or_else(|| { + 
NanoError::Parse(format!( + "@key on {} references unknown property '{}'", + type_name, col + )) + })?; + if prop.prop_type.nullable { + return Err(NanoError::Parse(format!( + "@key property {}.{} cannot be nullable", + type_name, col + ))); + } + if prop.prop_type.list { + return Err(NanoError::Parse(format!( + "@key is not supported on list property {}.{}", + type_name, col + ))); + } + if matches!(prop.prop_type.scalar, ScalarType::Vector(_)) { + return Err(NanoError::Parse(format!( + "@key is not supported on vector property {}.{}", + type_name, col + ))); + } + if matches!(prop.prop_type.scalar, ScalarType::Blob) { + return Err(NanoError::Parse(format!( + "@key is not supported on blob property {}.{}", + type_name, col + ))); + } + } + } + Constraint::Unique(cols) => { + for col in cols { + // Allow "src" and "dst" as implicit edge columns + if is_edge && (col == "src" || col == "dst") { + continue; + } + if !prop_names.contains_key(col.as_str()) { + return Err(NanoError::Parse(format!( + "@unique on {} references unknown property '{}'", + type_name, col + ))); + } + } + } + Constraint::Index(cols) => { + for col in cols { + if is_edge && (col == "src" || col == "dst") { + continue; + } + let prop = prop_names.get(col.as_str()).ok_or_else(|| { + NanoError::Parse(format!( + "@index on {} references unknown property '{}'", + type_name, col + )) + })?; + if matches!(prop.prop_type.scalar, ScalarType::Blob) { + return Err(NanoError::Parse(format!( + "@index is not supported on blob property {}.{}", + type_name, col + ))); + } + } + } + Constraint::Range { property, .. 
} => { + if is_edge { + return Err(NanoError::Parse(format!( + "@range constraint is not supported on edges (edge {})", + type_name + ))); + } + let prop = prop_names.get(property.as_str()).ok_or_else(|| { + NanoError::Parse(format!( + "@range on {} references unknown property '{}'", + type_name, property + )) + })?; + if !prop.prop_type.scalar.is_numeric() { + return Err(NanoError::Parse(format!( + "@range on {}.{} requires a numeric type, got {}", + type_name, + property, + prop.prop_type.display_name() + ))); + } + } + Constraint::Check { property, .. } => { + if is_edge { + return Err(NanoError::Parse(format!( + "@check constraint is not supported on edges (edge {})", + type_name + ))); + } + let prop = prop_names.get(property.as_str()).ok_or_else(|| { + NanoError::Parse(format!( + "@check on {} references unknown property '{}'", + type_name, property + )) + })?; + if prop.prop_type.scalar != ScalarType::String { + return Err(NanoError::Parse(format!( + "@check on {}.{} requires String type, got {}", + type_name, + property, + prop.prop_type.display_name() + ))); + } + } + } + } + + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_basic_schema() { + let input = r#" +node Person { + name: String + age: I32? +} + +node Company { + name: String +} + +edge Knows: Person -> Person { + since: Date? +} + +edge WorksAt: Person -> Company { + title: String? 
+} +"#; + let schema = parse_schema(input).unwrap(); + assert_eq!(schema.declarations.len(), 4); + + match &schema.declarations[0] { + SchemaDecl::Node(n) => { + assert_eq!(n.name, "Person"); + assert!(n.annotations.is_empty()); + assert!(n.implements.is_empty()); + assert_eq!(n.properties.len(), 2); + assert_eq!(n.properties[0].name, "name"); + assert!(!n.properties[0].prop_type.nullable); + assert_eq!(n.properties[1].name, "age"); + assert!(n.properties[1].prop_type.nullable); + } + _ => panic!("expected Node"), + } + + match &schema.declarations[2] { + SchemaDecl::Edge(e) => { + assert_eq!(e.name, "Knows"); + assert_eq!(e.from_type, "Person"); + assert_eq!(e.to_type, "Person"); + assert!(e.annotations.is_empty()); + assert_eq!(e.properties.len(), 1); + assert!(e.cardinality.is_default()); + } + _ => panic!("expected Edge"), + } + } + + #[test] + fn test_parse_interface_basic() { + let input = r#" +interface Named { + name: String +} +node Person implements Named { + age: I32? +} +"#; + let schema = parse_schema(input).unwrap(); + match &schema.declarations[0] { + SchemaDecl::Interface(i) => { + assert_eq!(i.name, "Named"); + assert_eq!(i.properties.len(), 1); + assert_eq!(i.properties[0].name, "name"); + } + _ => panic!("expected Interface"), + } + match &schema.declarations[1] { + SchemaDecl::Node(n) => { + assert_eq!(n.name, "Person"); + assert_eq!(n.implements, vec!["Named"]); + // "name" injected from interface + "age" declared locally + assert_eq!(n.properties.len(), 2); + } + _ => panic!("expected Node"), + } + } + + #[test] + fn test_parse_implements_multiple() { + let input = r#" +interface Slugged { + slug: String @key +} +interface Described { + title: String + description: String? 
+} +node Signal implements Slugged, Described { + strength: F64 +} +"#; + let schema = parse_schema(input).unwrap(); + match &schema.declarations[2] { + SchemaDecl::Node(n) => { + assert_eq!(n.name, "Signal"); + assert_eq!(n.implements, vec!["Slugged", "Described"]); + // slug + title + description + strength + assert_eq!(n.properties.len(), 4); + // @key from Slugged should be desugared into constraints + assert!( + n.constraints + .iter() + .any(|c| matches!(c, Constraint::Key(v) if v == &["slug"])) + ); + } + _ => panic!("expected Node"), + } + } + + #[test] + fn test_reject_implements_unknown_interface() { + let input = r#" +node Person implements Unknown { + name: String +} +"#; + let err = parse_schema(input).unwrap_err(); + assert!(err.to_string().contains("unknown interface")); + } + + #[test] + fn test_reject_interface_property_type_conflict() { + let input = r#" +interface Named { + name: I32 +} +node Person implements Named { + name: String +} +"#; + let err = parse_schema(input).unwrap_err(); + assert!(err.to_string().contains("type") || err.to_string().contains("interface")); + } + + #[test] + fn test_parse_annotation() { + let input = r#" +node Person { + name: String @unique + id: U64 @key + handle: String @index +} +"#; + let schema = parse_schema(input).unwrap(); + match &schema.declarations[0] { + SchemaDecl::Node(n) => { + assert_eq!(n.properties[0].annotations.len(), 1); + assert_eq!(n.properties[0].annotations[0].name, "unique"); + assert_eq!(n.properties[1].annotations[0].name, "key"); + assert_eq!(n.properties[2].annotations[0].name, "index"); + // Annotations are desugared into constraints + assert!( + n.constraints + .iter() + .any(|c| matches!(c, Constraint::Unique(_))) + ); + assert!( + n.constraints + .iter() + .any(|c| matches!(c, Constraint::Key(_))) + ); + assert!( + n.constraints + .iter() + .any(|c| matches!(c, Constraint::Index(_))) + ); + } + _ => panic!("expected Node"), + } + } + + #[test] + fn 
test_property_level_key_desugars_to_constraint() { + let input = r#" +node Person { + name: String @key +} +"#; + let schema = parse_schema(input).unwrap(); + match &schema.declarations[0] { + SchemaDecl::Node(n) => { + assert!( + n.constraints + .iter() + .any(|c| matches!(c, Constraint::Key(v) if v == &["name"])) + ); + } + _ => panic!("expected Node"), + } + } + + #[test] + fn test_parse_body_constraint_key() { + let input = r#" +node Person { + name: String + @key(name) +} +"#; + let schema = parse_schema(input).unwrap(); + match &schema.declarations[0] { + SchemaDecl::Node(n) => { + assert!( + n.constraints + .iter() + .any(|c| matches!(c, Constraint::Key(v) if v == &["name"])) + ); + } + _ => panic!("expected Node"), + } + } + + #[test] + fn test_parse_body_constraint_unique_composite() { + let input = r#" +node Person { + first: String + last: String + @unique(first, last) +} +"#; + let schema = parse_schema(input).unwrap(); + match &schema.declarations[0] { + SchemaDecl::Node(n) => { + assert!( + n.constraints + .iter() + .any(|c| matches!(c, Constraint::Unique(v) if v == &["first", "last"])) + ); + } + _ => panic!("expected Node"), + } + } + + #[test] + fn test_parse_body_constraint_index_composite() { + let input = r#" +node Event { + category: String + date: Date + @index(category, date) +} +"#; + let schema = parse_schema(input).unwrap(); + match &schema.declarations[0] { + SchemaDecl::Node(n) => { + assert!( + n.constraints + .iter() + .any(|c| matches!(c, Constraint::Index(v) if v == &["category", "date"])) + ); + } + _ => panic!("expected Node"), + } + } + + #[test] + fn test_parse_body_constraint_range() { + let input = r#" +node Person { + age: I32? + @range(age, 0..200) +} +"#; + let schema = parse_schema(input).unwrap(); + match &schema.declarations[0] { + SchemaDecl::Node(n) => { + assert!( + n.constraints.iter().any( + |c| matches!(c, Constraint::Range { property, .. 
} if property == "age") + ) + ); + } + _ => panic!("expected Node"), + } + } + + #[test] + fn test_parse_range_float_bounds() { + let input = r#" +node Measurement { + name: String @key + temperature: F64? + @range(temperature, 0.0..100.0) +} +"#; + let schema = parse_schema(input).unwrap(); + match &schema.declarations[0] { + SchemaDecl::Node(n) => { + assert!(n.constraints.iter().any(|c| matches!( + c, + Constraint::Range { property, min, max } + if property == "temperature" + && matches!(min, Some(ConstraintBound::Float(f)) if *f == 0.0) + && matches!(max, Some(ConstraintBound::Float(f)) if *f == 100.0) + ))); + } + _ => panic!("expected Node"), + } + } + + #[test] + fn test_parse_range_negative_float_bounds() { + let input = r#" +node Measurement { + name: String @key + temperature: F64? + @range(temperature, -40.0..60.0) +} +"#; + let schema = parse_schema(input).unwrap(); + match &schema.declarations[0] { + SchemaDecl::Node(n) => { + assert!(n.constraints.iter().any(|c| matches!( + c, + Constraint::Range { property, min, max } + if property == "temperature" + && matches!(min, Some(ConstraintBound::Float(f)) if *f == -40.0) + && matches!(max, Some(ConstraintBound::Float(f)) if *f == 60.0) + ))); + } + _ => panic!("expected Node"), + } + } + + #[test] + fn test_parse_range_negative_integer_bounds() { + let input = r#" +node Account { + name: String @key + balance: I64? 
+ @range(balance, -1000..1000) +} +"#; + let schema = parse_schema(input).unwrap(); + match &schema.declarations[0] { + SchemaDecl::Node(n) => { + assert!(n.constraints.iter().any(|c| matches!( + c, + Constraint::Range { property, min, max } + if property == "balance" + && matches!(min, Some(ConstraintBound::Integer(n)) if *n == -1000) + && matches!(max, Some(ConstraintBound::Integer(n)) if *n == 1000) + ))); + } + _ => panic!("expected Node"), + } + } + + #[test] + fn test_parse_body_constraint_check() { + let input = r#" +node Order { + code: String + @check(code, "[A-Z]{3}-[0-9]+") +} +"#; + let schema = parse_schema(input).unwrap(); + match &schema.declarations[0] { + SchemaDecl::Node(n) => { + assert!(n.constraints.iter().any(|c| matches!(c, Constraint::Check { property, pattern } if property == "code" && pattern == "[A-Z]{3}-[0-9]+"))); + } + _ => panic!("expected Node"), + } + } + + #[test] + fn test_reject_range_on_string() { + let input = r#" +node Person { + name: String + @range(name, 0..100) +} +"#; + let err = parse_schema(input).unwrap_err(); + assert!(err.to_string().contains("numeric")); + } + + #[test] + fn test_reject_check_on_integer() { + let input = r#" +node Person { + age: I32 + @check(age, "[0-9]+") +} +"#; + let err = parse_schema(input).unwrap_err(); + assert!(err.to_string().contains("String")); + } + + #[test] + fn test_parse_edge_cardinality() { + let input = r#" +node Person { name: String } +node Company { name: String } +edge WorksAt: Person -> Company @card(0..1) +"#; + let schema = parse_schema(input).unwrap(); + match &schema.declarations[2] { + SchemaDecl::Edge(e) => { + assert_eq!(e.cardinality.min, 0); + assert_eq!(e.cardinality.max, Some(1)); + } + _ => panic!("expected Edge"), + } + } + + #[test] + fn test_parse_edge_cardinality_unbounded() { + let input = r#" +node Person { name: String } +node Paper { title: String } +edge Authored: Person -> Paper @card(1..) 
+"#; + let schema = parse_schema(input).unwrap(); + match &schema.declarations[2] { + SchemaDecl::Edge(e) => { + assert_eq!(e.cardinality.min, 1); + assert_eq!(e.cardinality.max, None); + } + _ => panic!("expected Edge"), + } + } + + #[test] + fn test_parse_edge_default_cardinality() { + let input = r#" +node Person { name: String } +edge Knows: Person -> Person +"#; + let schema = parse_schema(input).unwrap(); + match &schema.declarations[1] { + SchemaDecl::Edge(e) => { + assert!(e.cardinality.is_default()); + } + _ => panic!("expected Edge"), + } + } + + #[test] + fn test_parse_edge_unique_src_dst() { + let input = r#" +node Person { name: String } +edge Knows: Person -> Person { + @unique(src, dst) +} +"#; + let schema = parse_schema(input).unwrap(); + match &schema.declarations[1] { + SchemaDecl::Edge(e) => { + assert!( + e.constraints + .iter() + .any(|c| matches!(c, Constraint::Unique(v) if v == &["src", "dst"])) + ); + } + _ => panic!("expected Edge"), + } + } + + #[test] + fn test_parse_edge_property_index() { + let input = r#" +node Person { name: String } +node Company { name: String } +edge WorksAt: Person -> Company { + since: Date? 
@index +} +"#; + let schema = parse_schema(input).unwrap(); + match &schema.declarations[2] { + SchemaDecl::Edge(e) => { + // @index on since is desugared to Constraint::Index + assert!( + e.constraints + .iter() + .any(|c| matches!(c, Constraint::Index(v) if v == &["since"])) + ); + } + _ => panic!("expected Edge"), + } + } + + #[test] + fn test_parse_embed_annotation_identifier_arg() { + let input = r#" +node Doc { + title: String + embedding: Vector(3) @embed(title) +} +"#; + let schema = parse_schema(input).unwrap(); + match &schema.declarations[0] { + SchemaDecl::Node(n) => { + assert_eq!(n.properties[1].annotations.len(), 1); + assert_eq!(n.properties[1].annotations[0].name, "embed"); + assert_eq!( + n.properties[1].annotations[0].value.as_deref(), + Some("title") + ); + } + _ => panic!("expected Node"), + } + } + + #[test] + fn test_parse_edge_no_body() { + let input = "edge WorksAt: Person -> Company\n"; + let schema = parse_schema(input).unwrap(); + match &schema.declarations[0] { + SchemaDecl::Edge(e) => { + assert_eq!(e.name, "WorksAt"); + assert!(e.annotations.is_empty()); + assert!(e.properties.is_empty()); + } + _ => panic!("expected Edge"), + } + } + + #[test] + fn test_parse_type_rename_annotation() { + let input = r#" +node Account @rename_from("User") { + full_name: String @rename_from("name") +} + +edge ConnectedTo: Account -> Account @rename_from("Knows") +"#; + let schema = parse_schema(input).unwrap(); + match &schema.declarations[0] { + SchemaDecl::Node(n) => { + assert_eq!(n.name, "Account"); + assert_eq!(n.annotations.len(), 1); + assert_eq!(n.annotations[0].name, "rename_from"); + assert_eq!(n.annotations[0].value.as_deref(), Some("User")); + assert_eq!(n.properties[0].annotations[0].name, "rename_from"); + assert_eq!( + n.properties[0].annotations[0].value.as_deref(), + Some("name") + ); + } + _ => panic!("expected Node"), + } + match &schema.declarations[1] { + SchemaDecl::Edge(e) => { + assert_eq!(e.name, "ConnectedTo"); + 
assert_eq!(e.annotations.len(), 1); + assert_eq!(e.annotations[0].name, "rename_from"); + assert_eq!(e.annotations[0].value.as_deref(), Some("Knows")); + } + _ => panic!("expected Edge"), + } + } + + #[test] + fn test_reject_multiple_node_keys() { + let input = r#" +node Person { + id: U64 @key + ext_id: String @key +} +"#; + let err = parse_schema(input).unwrap_err(); + assert!(err.to_string().contains("multiple @key")); + } + + #[test] + fn test_reject_unique_with_value() { + // @unique("x") is now a parse error — the grammar parses it as a body_constraint + // which expects ident args, not string literals as the sole argument + let input = r#" +node Person { + email: String @unique("x") +} +"#; + assert!(parse_schema(input).is_err()); + } + + #[test] + fn test_reject_index_with_value() { + // @index("x") is now a parse error — same reason as above + let input = r#" +node Person { + email: String @index("x") +} +"#; + assert!(parse_schema(input).is_err()); + } + + #[test] + fn test_reject_unique_on_node_annotation() { + let input = r#" +node Person @unique { + email: String +} +"#; + let err = parse_schema(input).unwrap_err(); + assert!( + err.to_string() + .contains("only supported on node properties") + ); + } + + #[test] + fn test_reject_index_on_node_annotation() { + let input = r#" +node Person @index { + email: String +} +"#; + let err = parse_schema(input).unwrap_err(); + assert!( + err.to_string() + .contains("only supported on node properties") + ); + } + + #[test] + fn test_allow_unique_on_edge_property() { + let input = r#" +node Person { name: String } +edge Knows: Person -> Person { + weight: I32 @unique +} +"#; + // Should now succeed (edge property @unique is allowed) + let schema = parse_schema(input).unwrap(); + match &schema.declarations[1] { + SchemaDecl::Edge(e) => { + assert!( + e.constraints + .iter() + .any(|c| matches!(c, Constraint::Unique(v) if v == &["weight"])) + ); + } + _ => panic!("expected Edge"), + } + } + + #[test] + fn 
test_allow_index_on_edge_property() { + let input = r#" +node Person { name: String } +edge Knows: Person -> Person { + weight: I32 @index +} +"#; + // Should now succeed (edge property @index is allowed) + let schema = parse_schema(input).unwrap(); + match &schema.declarations[1] { + SchemaDecl::Edge(e) => { + assert!( + e.constraints + .iter() + .any(|c| matches!(c, Constraint::Index(v) if v == &["weight"])) + ); + } + _ => panic!("expected Edge"), + } + } + + #[test] + fn test_reject_embed_without_source_property() { + let input = r#" +node Doc { + title: String + embedding: Vector(3) @embed +} +"#; + let err = parse_schema(input).unwrap_err(); + assert!(err.to_string().contains("requires a source property name")); + } + + #[test] + fn test_reject_embed_on_non_vector_property() { + let input = r#" +node Doc { + title: String @embed(title) +} +"#; + let err = parse_schema(input).unwrap_err(); + assert!( + err.to_string() + .contains("only supported on vector properties") + ); + } + + #[test] + fn test_reject_embed_unknown_source_property() { + let input = r#" +node Doc { + title: String + embedding: Vector(3) @embed(body) +} +"#; + let err = parse_schema(input).unwrap_err(); + assert!( + err.to_string() + .contains("references unknown source property") + ); + } + + #[test] + fn test_reject_embed_source_not_string() { + let input = r#" +node Doc { + body: I32 + embedding: Vector(3) @embed(body) +} +"#; + let err = parse_schema(input).unwrap_err(); + assert!(err.to_string().contains("must be String")); + } + + #[test] + fn test_reject_embed_on_edge_property() { + let input = r#" +node Doc { title: String } +edge Linked: Doc -> Doc { + embedding: Vector(3) @embed(title) +} +"#; + let err = parse_schema(input).unwrap_err(); + assert!(err.to_string().contains("edge properties")); + } + + #[test] + fn test_parse_enum_and_list_types() { + let input = r#" +node Ticket { + status: enum(open, closed, blocked) + tags: [String] +} +"#; + let schema = 
parse_schema(input).unwrap(); + match &schema.declarations[0] { + SchemaDecl::Node(n) => { + let status = &n.properties[0].prop_type; + assert!(status.is_enum()); + assert!(!status.list); + assert_eq!( + status.enum_values.as_ref().unwrap(), + &vec![ + "blocked".to_string(), + "closed".to_string(), + "open".to_string() + ] + ); + + let tags = &n.properties[1].prop_type; + assert!(tags.list); + assert!(!tags.is_enum()); + assert_eq!(tags.scalar, ScalarType::String); + } + _ => panic!("expected Node"), + } + } + + #[test] + fn test_reject_duplicate_enum_values() { + let input = r#" +node Ticket { + status: enum(open, closed, open) +} +"#; + let err = parse_schema(input).unwrap_err(); + assert!(err.to_string().contains("duplicate values")); + } + + #[test] + fn test_parse_description_and_instruction_annotations() { + let input = r#" +node Task @description("Tracked work item") @instruction("Prefer querying by slug") { + slug: String @key @description("Stable external identifier") +} +edge DependsOn: Task -> Task @description("Hard dependency") @instruction("Use only for blockers") +"#; + let schema = parse_schema(input).unwrap(); + match &schema.declarations[0] { + SchemaDecl::Node(node) => { + assert_eq!( + node.annotations + .iter() + .find(|ann| ann.name == "description") + .and_then(|ann| ann.value.as_deref()), + Some("Tracked work item") + ); + assert_eq!( + node.annotations + .iter() + .find(|ann| ann.name == "instruction") + .and_then(|ann| ann.value.as_deref()), + Some("Prefer querying by slug") + ); + assert_eq!( + node.properties[0] + .annotations + .iter() + .find(|ann| ann.name == "description") + .and_then(|ann| ann.value.as_deref()), + Some("Stable external identifier") + ); + } + _ => panic!("expected node"), + } + match &schema.declarations[1] { + SchemaDecl::Edge(edge) => { + assert_eq!( + edge.annotations + .iter() + .find(|ann| ann.name == "description") + .and_then(|ann| ann.value.as_deref()), + Some("Hard dependency") + ); + assert_eq!( + 
edge.annotations + .iter() + .find(|ann| ann.name == "instruction") + .and_then(|ann| ann.value.as_deref()), + Some("Use only for blockers") + ); + } + _ => panic!("expected edge"), + } + } + + #[test] + fn test_parse_annotation_decodes_escapes() { + let input = r#" +node Task @description("Tracked\n\"work\"\\item") { + slug: String @key @description("Stable\tidentifier") +} +"#; + let schema = parse_schema(input).unwrap(); + match &schema.declarations[0] { + SchemaDecl::Node(node) => { + assert_eq!( + node.annotations[0].value.as_deref(), + Some("Tracked\n\"work\"\\item") + ); + assert_eq!( + node.properties[0].annotations[1].value.as_deref(), + Some("Stable\tidentifier") + ); + } + _ => panic!("expected node"), + } + } + + #[test] + fn test_parse_annotation_rejects_unknown_escape() { + let input = r#" +node Task @description("Tracked\q") { + slug: String @key +} +"#; + let err = parse_schema(input).unwrap_err(); + assert!(err.to_string().contains("unsupported escape sequence")); + } + + #[test] + fn test_reject_duplicate_description_annotations() { + let input = r#" +node Task @description("a") @description("b") { + slug: String @key +} +"#; + let err = parse_schema(input).unwrap_err(); + assert!( + err.to_string() + .contains("declares @description multiple times") + ); + } + + #[test] + fn test_reject_instruction_on_property() { + let input = r#" +node Task { + slug: String @instruction("bad") +} +"#; + let err = parse_schema(input).unwrap_err(); + assert!( + err.to_string() + .contains("@instruction is only supported on node and edge types") + ); + } + + #[test] + fn test_reject_key_on_list_property() { + let input = r#" +node Ticket { + tags: [String] @key +} +"#; + let err = parse_schema(input).unwrap_err(); + assert!(err.to_string().contains("list property")); + } + + #[test] + fn test_parse_vector_type() { + let input = r#" +node Doc { + embedding: Vector(3) +} +"#; + let schema = parse_schema(input).unwrap(); + match &schema.declarations[0] { + 
SchemaDecl::Node(n) => match n.properties[0].prop_type.scalar { + ScalarType::Vector(dim) => assert_eq!(dim, 3), + other => panic!("expected vector type, got {:?}", other), + }, + _ => panic!("expected node"), + } + } + + #[test] + fn test_reject_zero_vector_dimension() { + let input = r#" +node Doc { + embedding: Vector(0) +} +"#; + let err = parse_schema(input).unwrap_err(); + assert!(err.to_string().contains("Vector dimension")); + } + + #[test] + fn test_reject_vector_dimension_larger_than_arrow_bound() { + let input = r#" +node Doc { + embedding: Vector(2147483648) +} +"#; + let err = parse_schema(input).unwrap_err(); + assert!(err.to_string().contains("exceeds maximum supported")); + } + + #[test] + fn test_parse_error() { + let input = "node { }"; // missing type name + assert!(parse_schema(input).is_err()); + } + + #[test] + fn test_parse_error_diagnostic_has_span() { + let input = "node { }"; + let err = parse_schema_diagnostic(input).unwrap_err(); + assert!(err.span.is_some()); + } +} diff --git a/crates/omnigraph-compiler/src/schema/schema.pest b/crates/omnigraph-compiler/src/schema/schema.pest new file mode 100644 index 0000000..395c516 --- /dev/null +++ b/crates/omnigraph-compiler/src/schema/schema.pest @@ -0,0 +1,60 @@ +// Omnigraph Schema Grammar (.pg files) + +WHITESPACE = _{ " " | "\t" | "\r" | "\n" } +COMMENT = _{ LINE_COMMENT | BLOCK_COMMENT } +LINE_COMMENT = _{ "//" ~ (!"\n" ~ ANY)* } +BLOCK_COMMENT = _{ "/*" ~ (!"*/" ~ ANY)* ~ "*/" } + +schema_file = { SOI ~ schema_decl* ~ EOI } + +schema_decl = { interface_decl | node_decl | edge_decl } + +// interface Named { name: String @key } +interface_decl = { "interface" ~ type_name ~ "{" ~ prop_decl* ~ "}" } + +// node Person implements Named, Described { ... } +node_decl = { "node" ~ type_name ~ annotation* ~ implements_clause? ~ "{" ~ (prop_decl | body_constraint)* ~ "}" } +implements_clause = { "implements" ~ type_name ~ ("," ~ type_name)* } + +// edge Knows: Person -> Person @card(0..1) { ... 
} +// edge Knows: Person -> Person +edge_decl = { "edge" ~ type_name ~ ":" ~ type_name ~ "->" ~ type_name ~ cardinality? ~ annotation* ~ ("{" ~ (prop_decl | body_constraint)* ~ "}")? } + +// @card(0..1), @card(1..), @card(0..) +cardinality = { "@card" ~ "(" ~ integer ~ ".." ~ integer? ~ ")" } + +prop_decl = { ident ~ ":" ~ type_ref ~ annotation* } + +// Body-level constraints: @key(name), @unique(a, b), @index(a, b), @range(age, 0..200), @check(code, "regex") +body_constraint = { "@" ~ constraint_name ~ "(" ~ constraint_args ~ ")" } +constraint_name = { "key" | "unique" | "index" | "range" | "check" } +constraint_args = { constraint_arg ~ ("," ~ constraint_arg)* } +constraint_arg = { range_bound | literal | ident } +range_bound = { (signed_float | signed_integer) ~ ".." ~ (signed_float | signed_integer)? | ".." ~ (signed_float | signed_integer) } + +type_ref = { core_type ~ "?"? } +core_type = { list_type | enum_type | vector_type | base_type } +list_type = { "[" ~ base_type ~ "]" } +enum_type = { "enum" ~ "(" ~ enum_value ~ ("," ~ enum_value)* ~ ")" } +vector_type = { "Vector" ~ "(" ~ integer ~ ")" } +enum_value = @{ (ASCII_ALPHANUMERIC | "_" | "-")+ } + +base_type = { "String" | "Blob" | "Bool" | "I32" | "I64" | "U32" | "U64" | "F32" | "F64" | "DateTime" | "Date" } + +// Annotation rule excludes constraint keywords followed by "(" — those are body_constraints +annotation = { "@" ~ !(constraint_name ~ "(") ~ ident ~ ("(" ~ annotation_arg ~ ")")? } +annotation_arg = { literal | ident } + +literal = { string_lit | float_lit | integer | bool_lit } + +string_lit = @{ "\"" ~ string_char* ~ "\"" } +string_char = @{ !("\"" | "\\") ~ ANY | "\\" ~ ANY } +float_lit = @{ ASCII_DIGIT+ ~ "." ~ ASCII_DIGIT+ } +integer = @{ ASCII_DIGIT+ } + +signed_float = @{ "-"? ~ ASCII_DIGIT+ ~ "." ~ ASCII_DIGIT+ } +signed_integer = @{ "-"? 
~ ASCII_DIGIT+ } +bool_lit = { "true" | "false" } + +type_name = @{ ASCII_ALPHA_UPPER ~ (ASCII_ALPHANUMERIC | "_")* } +ident = @{ (ASCII_ALPHA_LOWER | "_") ~ (ASCII_ALPHANUMERIC | "_")* } diff --git a/crates/omnigraph-compiler/src/types.rs b/crates/omnigraph-compiler/src/types.rs new file mode 100644 index 0000000..5140acc --- /dev/null +++ b/crates/omnigraph-compiler/src/types.rs @@ -0,0 +1,227 @@ +use arrow_schema::DataType; +use serde::{Deserialize, Serialize}; + +const MAX_VECTOR_DIM: u32 = i32::MAX as u32; + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub enum ScalarType { + String, + Bool, + I32, + I64, + U32, + U64, + F32, + F64, + Date, + DateTime, + Vector(u32), + Blob, +} + +impl ScalarType { + pub fn from_str_name(s: &str) -> Option { + if let Some(inner) = s.strip_prefix("Vector(").and_then(|t| t.strip_suffix(')')) { + let dim = inner.parse::().ok()?; + if dim == 0 || dim > MAX_VECTOR_DIM { + return None; + } + return Some(Self::Vector(dim)); + } + + match s { + "String" => Some(Self::String), + "Bool" => Some(Self::Bool), + "I32" => Some(Self::I32), + "I64" => Some(Self::I64), + "U32" => Some(Self::U32), + "U64" => Some(Self::U64), + "F32" => Some(Self::F32), + "F64" => Some(Self::F64), + "Date" => Some(Self::Date), + "DateTime" => Some(Self::DateTime), + "Blob" => Some(Self::Blob), + _ => None, + } + } + + pub fn to_arrow(&self) -> DataType { + match self { + Self::String => DataType::Utf8, + Self::Bool => DataType::Boolean, + Self::I32 => DataType::Int32, + Self::I64 => DataType::Int64, + Self::U32 => DataType::UInt32, + Self::U64 => DataType::UInt64, + Self::F32 => DataType::Float32, + Self::F64 => DataType::Float64, + Self::Date => DataType::Date32, + Self::DateTime => DataType::Date64, + Self::Blob => DataType::LargeBinary, + Self::Vector(dim) => { + let dim = i32::try_from(*dim) + .expect("vector dimension exceeds Arrow FixedSizeList i32 bound"); + DataType::FixedSizeList( + 
std::sync::Arc::new(arrow_schema::Field::new("item", DataType::Float32, true)), + dim, + ) + } + } + } + + pub fn is_numeric(&self) -> bool { + matches!( + self, + Self::I32 | Self::I64 | Self::U32 | Self::U64 | Self::F32 | Self::F64 + ) + } +} + +impl std::fmt::Display for ScalarType { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let s = match self { + Self::String => "String", + Self::Bool => "Bool", + Self::I32 => "I32", + Self::I64 => "I64", + Self::U32 => "U32", + Self::U64 => "U64", + Self::F32 => "F32", + Self::F64 => "F64", + Self::Date => "Date", + Self::DateTime => "DateTime", + Self::Blob => "Blob", + Self::Vector(dim) => return write!(f, "Vector({})", dim), + }; + write!(f, "{}", s) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub struct PropType { + pub scalar: ScalarType, + pub nullable: bool, + pub list: bool, + pub enum_values: Option>, +} + +impl PropType { + pub fn from_param_type_name(s: &str, nullable: bool) -> Option { + if let Some(inner) = s + .strip_prefix('[') + .and_then(|value| value.strip_suffix(']')) + { + let scalar = ScalarType::from_str_name(inner)?; + return Some(Self::list_of(scalar, nullable)); + } + + let scalar = ScalarType::from_str_name(s)?; + Some(Self::scalar(scalar, nullable)) + } + + pub fn scalar(scalar: ScalarType, nullable: bool) -> Self { + Self { + scalar, + nullable, + list: false, + enum_values: None, + } + } + + pub fn list_of(scalar: ScalarType, nullable: bool) -> Self { + Self { + scalar, + nullable, + list: true, + enum_values: None, + } + } + + pub fn enum_type(mut values: Vec, nullable: bool) -> Self { + values.sort(); + values.dedup(); + Self { + scalar: ScalarType::String, + nullable, + list: false, + enum_values: Some(values), + } + } + + pub fn is_enum(&self) -> bool { + self.enum_values.is_some() + } + + pub fn to_arrow(&self) -> DataType { + let scalar_dt = self.scalar.to_arrow(); + if self.list { + 
DataType::List(std::sync::Arc::new(arrow_schema::Field::new( + "item", scalar_dt, true, + ))) + } else { + scalar_dt + } + } + + pub fn display_name(&self) -> String { + let base = if let Some(values) = &self.enum_values { + format!("enum({})", values.join(", ")) + } else { + self.scalar.to_string() + }; + let wrapped = if self.list { + format!("[{}]", base) + } else { + base + }; + if self.nullable { + format!("{}?", wrapped) + } else { + wrapped + } + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum Direction { + Out, + In, +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow_schema::{DataType, Field}; + use std::sync::Arc; + + #[test] + fn vector_to_arrow_uses_nullable_float32_child() { + let dt = ScalarType::Vector(4).to_arrow(); + assert_eq!( + dt, + DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Float32, true)), 4) + ); + } + + #[test] + fn scalar_type_from_str_name_rejects_vector_dimensions_outside_arrow_bounds() { + let too_large = format!("Vector({})", (i32::MAX as u64) + 1); + assert!(ScalarType::from_str_name(&too_large).is_none()); + assert_eq!( + ScalarType::from_str_name("Vector(2147483647)"), + Some(ScalarType::Vector(2147483647)) + ); + } + + #[test] + fn prop_type_from_param_type_name_supports_lists_and_nullable_scalars() { + assert_eq!( + PropType::from_param_type_name("[DateTime]", false), + Some(PropType::list_of(ScalarType::DateTime, false)) + ); + assert_eq!( + PropType::from_param_type_name("DateTime", true), + Some(PropType::scalar(ScalarType::DateTime, true)) + ); + } +} diff --git a/crates/omnigraph-server/Cargo.toml b/crates/omnigraph-server/Cargo.toml new file mode 100644 index 0000000..7d789b6 --- /dev/null +++ b/crates/omnigraph-server/Cargo.toml @@ -0,0 +1,30 @@ +[package] +name = "omnigraph-server" +version = "0.4.0" +edition = "2024" +description = "HTTP server for the Omnigraph graph database." 
+license = "MIT" + +[[bin]] +name = "omnigraph-server" +path = "src/main.rs" + +[dependencies] +omnigraph = { path = "../omnigraph", version = "0.4.0" } +omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.4.0" } +axum = { workspace = true } +clap = { workspace = true } +color-eyre = { workspace = true } +serde = { workspace = true } +serde_json = { workspace = true } +tokio = { workspace = true } +serde_yaml = { workspace = true } +tracing = { workspace = true } +tracing-subscriber = { workspace = true } +tower-http = { workspace = true } +cedar-policy = { workspace = true } + +[dev-dependencies] +tempfile = { workspace = true } +tower = { workspace = true } +serial_test = "3" diff --git a/crates/omnigraph-server/src/api.rs b/crates/omnigraph-server/src/api.rs new file mode 100644 index 0000000..9411c60 --- /dev/null +++ b/crates/omnigraph-server/src/api.rs @@ -0,0 +1,395 @@ +use omnigraph::db::{GraphCommit, MergeOutcome, ReadTarget, RunRecord, Snapshot}; +use omnigraph::error::{MergeConflict, MergeConflictKind}; +use omnigraph::loader::{IngestResult, LoadMode}; +use omnigraph_compiler::result::QueryResult; +use serde::{Deserialize, Serialize}; +use serde_json::Value; + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SnapshotTableOutput { + pub table_key: String, + pub table_path: String, + pub table_version: u64, + pub table_branch: Option, + pub row_count: u64, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct SnapshotOutput { + pub branch: String, + pub manifest_version: u64, + pub tables: Vec, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct RunOutput { + pub run_id: String, + pub target_branch: String, + pub run_branch: String, + pub base_snapshot_id: String, + pub base_manifest_version: u64, + pub operation_hash: Option, + pub actor_id: Option, + pub status: String, + pub published_snapshot_id: Option, + pub created_at: i64, + pub updated_at: i64, +} + +#[derive(Debug, Clone, Serialize, 
Deserialize)] +pub struct RunListOutput { + pub runs: Vec, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct BranchCreateRequest { + pub from: Option, + pub name: String, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct BranchCreateOutput { + pub uri: String, + pub from: String, + pub name: String, + pub actor_id: Option, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct BranchListOutput { + pub branches: Vec, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct BranchDeleteOutput { + pub uri: String, + pub name: String, + pub actor_id: Option, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct BranchMergeRequest { + pub source: String, + pub target: Option, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum BranchMergeOutcome { + AlreadyUpToDate, + FastForward, + Merged, +} + +impl From for BranchMergeOutcome { + fn from(value: MergeOutcome) -> Self { + match value { + MergeOutcome::AlreadyUpToDate => Self::AlreadyUpToDate, + MergeOutcome::FastForward => Self::FastForward, + MergeOutcome::Merged => Self::Merged, + } + } +} + +impl BranchMergeOutcome { + pub fn as_str(self) -> &'static str { + match self { + Self::AlreadyUpToDate => "already_up_to_date", + Self::FastForward => "fast_forward", + Self::Merged => "merged", + } + } +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct BranchMergeOutput { + pub source: String, + pub target: String, + pub outcome: BranchMergeOutcome, + pub actor_id: Option, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum MergeConflictKindOutput { + DivergentInsert, + DivergentUpdate, + DeleteVsUpdate, + OrphanEdge, + UniqueViolation, + CardinalityViolation, + ValueConstraintViolation, +} + +impl MergeConflictKindOutput { + pub fn as_str(self) -> &'static str { + match self { + Self::DivergentInsert => 
"divergent_insert", + Self::DivergentUpdate => "divergent_update", + Self::DeleteVsUpdate => "delete_vs_update", + Self::OrphanEdge => "orphan_edge", + Self::UniqueViolation => "unique_violation", + Self::CardinalityViolation => "cardinality_violation", + Self::ValueConstraintViolation => "value_constraint_violation", + } + } +} + +impl From for MergeConflictKindOutput { + fn from(value: MergeConflictKind) -> Self { + match value { + MergeConflictKind::DivergentInsert => Self::DivergentInsert, + MergeConflictKind::DivergentUpdate => Self::DivergentUpdate, + MergeConflictKind::DeleteVsUpdate => Self::DeleteVsUpdate, + MergeConflictKind::OrphanEdge => Self::OrphanEdge, + MergeConflictKind::UniqueViolation => Self::UniqueViolation, + MergeConflictKind::CardinalityViolation => Self::CardinalityViolation, + MergeConflictKind::ValueConstraintViolation => Self::ValueConstraintViolation, + } + } +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct MergeConflictOutput { + pub table_key: String, + pub row_id: Option, + pub kind: MergeConflictKindOutput, + pub message: String, +} + +impl From<&MergeConflict> for MergeConflictOutput { + fn from(value: &MergeConflict) -> Self { + Self { + table_key: value.table_key.clone(), + row_id: value.row_id.clone(), + kind: value.kind.into(), + message: value.message.clone(), + } + } +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ReadTargetOutput { + pub branch: Option, + pub snapshot: Option, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ReadOutput { + pub query_name: String, + pub target: ReadTargetOutput, + pub row_count: usize, + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub columns: Vec, + pub rows: Value, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ChangeOutput { + pub branch: String, + pub query_name: String, + pub affected_nodes: usize, + pub affected_edges: usize, + pub actor_id: Option, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] 
+pub struct IngestTableOutput { + pub table_key: String, + pub rows_loaded: usize, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct IngestOutput { + pub uri: String, + pub branch: String, + pub base_branch: String, + pub branch_created: bool, + pub mode: LoadMode, + pub tables: Vec, + pub actor_id: Option, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct CommitOutput { + pub graph_commit_id: String, + pub manifest_branch: Option, + pub manifest_version: u64, + pub parent_commit_id: Option, + pub merged_parent_commit_id: Option, + pub actor_id: Option, + pub created_at: i64, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct CommitListOutput { + pub commits: Vec, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ReadRequest { + pub query_source: String, + pub query_name: Option, + pub params: Option, + pub branch: Option, + pub snapshot: Option, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ChangeRequest { + pub query_source: String, + pub query_name: Option, + pub params: Option, + pub branch: Option, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct IngestRequest { + pub branch: Option, + pub from: Option, + pub mode: Option, + pub data: String, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ExportRequest { + pub branch: Option, + #[serde(default)] + pub type_names: Vec, + #[serde(default)] + pub table_keys: Vec, +} + +#[derive(Debug, Clone, Deserialize)] +pub struct SnapshotQuery { + pub branch: Option, +} + +#[derive(Debug, Clone, Deserialize)] +pub struct CommitListQuery { + pub branch: Option, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct HealthOutput { + pub status: String, + pub version: String, + #[serde(skip_serializing_if = "Option::is_none")] + pub source_version: Option, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum ErrorCode { + 
Unauthorized, + Forbidden, + BadRequest, + NotFound, + Conflict, + Internal, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ErrorOutput { + pub error: String, + #[serde(skip_serializing_if = "Option::is_none")] + pub code: Option, + #[serde(default, skip_serializing_if = "Vec::is_empty")] + pub merge_conflicts: Vec, +} + +pub fn snapshot_payload(branch: &str, snapshot: &Snapshot) -> SnapshotOutput { + let mut entries: Vec<_> = snapshot.entries().cloned().collect(); + entries.sort_by(|a, b| a.table_key.cmp(&b.table_key)); + let tables = entries + .iter() + .map(|entry| SnapshotTableOutput { + table_key: entry.table_key.clone(), + table_path: entry.table_path.clone(), + table_version: entry.table_version, + table_branch: entry.table_branch.clone(), + row_count: entry.row_count, + }) + .collect::>(); + SnapshotOutput { + branch: branch.to_string(), + manifest_version: snapshot.version(), + tables, + } +} + +pub fn run_output(run: &RunRecord) -> RunOutput { + RunOutput { + run_id: run.run_id.as_str().to_string(), + target_branch: run.target_branch.clone(), + run_branch: run.run_branch.clone(), + base_snapshot_id: run.base_snapshot_id.as_str().to_string(), + base_manifest_version: run.base_manifest_version, + operation_hash: run.operation_hash.clone(), + actor_id: run.actor_id.clone(), + status: run.status.as_str().to_string(), + published_snapshot_id: run.published_snapshot_id.clone(), + created_at: run.created_at, + updated_at: run.updated_at, + } +} + +pub fn commit_output(commit: &GraphCommit) -> CommitOutput { + CommitOutput { + graph_commit_id: commit.graph_commit_id.clone(), + manifest_branch: commit.manifest_branch.clone(), + manifest_version: commit.manifest_version, + parent_commit_id: commit.parent_commit_id.clone(), + merged_parent_commit_id: commit.merged_parent_commit_id.clone(), + actor_id: commit.actor_id.clone(), + created_at: commit.created_at, + } +} + +pub fn read_output(query_name: String, target: &ReadTarget, result: QueryResult) 
-> ReadOutput { + let columns = result + .schema() + .fields() + .iter() + .map(|field| field.name().clone()) + .collect(); + ReadOutput { + query_name, + target: read_target_output(target), + row_count: result.num_rows(), + columns, + rows: result.to_rust_json(), + } +} + +pub fn ingest_output(uri: &str, result: &IngestResult, actor_id: Option) -> IngestOutput { + IngestOutput { + uri: uri.to_string(), + branch: result.branch.clone(), + base_branch: result.base_branch.clone(), + branch_created: result.branch_created, + mode: result.mode, + tables: result + .tables + .iter() + .map(|table| IngestTableOutput { + table_key: table.table_key.clone(), + rows_loaded: table.rows_loaded, + }) + .collect(), + actor_id, + } +} + +pub fn read_target_output(target: &ReadTarget) -> ReadTargetOutput { + match target { + ReadTarget::Branch(branch) => ReadTargetOutput { + branch: Some(branch.clone()), + snapshot: None, + }, + ReadTarget::Snapshot(snapshot) => ReadTargetOutput { + branch: None, + snapshot: Some(snapshot.as_str().to_string()), + }, + } +} diff --git a/crates/omnigraph-server/src/config.rs b/crates/omnigraph-server/src/config.rs new file mode 100644 index 0000000..69f8e95 --- /dev/null +++ b/crates/omnigraph-server/src/config.rs @@ -0,0 +1,479 @@ +use std::collections::BTreeMap; +use std::env; +use std::fs; +use std::path::{Path, PathBuf}; + +use clap::ValueEnum; +use color_eyre::eyre::{Result, bail}; +use serde::{Deserialize, Serialize}; +pub const DEFAULT_CONFIG_FILE: &str = "omnigraph.yaml"; + +#[derive(Debug, Clone, Default, Serialize, Deserialize)] +pub struct ProjectConfig { + pub name: Option, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct TargetConfig { + pub uri: String, + pub bearer_token_env: Option, +} + +#[derive(Debug, Clone, Copy, Default, Eq, PartialEq, Serialize, Deserialize, ValueEnum)] +#[serde(rename_all = "snake_case")] +pub enum ReadOutputFormat { + #[default] + Table, + Kv, + Csv, + Jsonl, + Json, +} + +#[derive(Debug, Clone, 
Copy, Default, Eq, PartialEq, Serialize, Deserialize, ValueEnum)] +#[serde(rename_all = "snake_case")] +pub enum TableCellLayout { + #[default] + Truncate, + Wrap, +} + +#[derive(Debug, Clone, Default, Serialize, Deserialize)] +pub struct CliDefaults { + pub target: Option, + pub branch: Option, + pub output_format: Option, + pub table_max_column_width: Option, + pub table_cell_layout: Option, +} + +#[derive(Debug, Clone, Default, Serialize, Deserialize)] +pub struct ServerDefaults { + pub target: Option, + pub bind: Option, +} + +#[derive(Debug, Clone, Default, Serialize, Deserialize)] +pub struct AuthDefaults { + pub env_file: Option, +} + +#[derive(Debug, Clone, Default, Serialize, Deserialize)] +pub struct QueryDefaults { + #[serde(default)] + pub roots: Vec, +} + +#[derive(Debug, Clone, Default, Serialize, Deserialize)] +pub struct PolicySettings { + pub file: Option, +} + +#[derive(Debug, Clone, Copy, Eq, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum AliasCommand { + Read, + Change, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct AliasConfig { + pub command: AliasCommand, + pub query: String, + pub name: Option, + #[serde(default)] + pub args: Vec, + pub target: Option, + pub branch: Option, + pub format: Option, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct OmnigraphConfig { + #[serde(default)] + pub project: ProjectConfig, + #[serde(default)] + pub targets: BTreeMap, + #[serde(default)] + pub server: ServerDefaults, + #[serde(default)] + pub auth: AuthDefaults, + #[serde(default)] + pub cli: CliDefaults, + #[serde(default)] + pub query: QueryDefaults, + #[serde(default)] + pub aliases: BTreeMap, + #[serde(default)] + pub policy: PolicySettings, + #[serde(skip)] + base_dir: PathBuf, +} + +impl Default for OmnigraphConfig { + fn default() -> Self { + Self { + project: ProjectConfig::default(), + targets: BTreeMap::new(), + server: ServerDefaults::default(), + auth: 
AuthDefaults::default(), + cli: CliDefaults::default(), + query: QueryDefaults::default(), + aliases: BTreeMap::new(), + policy: PolicySettings::default(), + base_dir: PathBuf::new(), + } + } +} + +impl OmnigraphConfig { + pub fn base_dir(&self) -> &Path { + &self.base_dir + } + + pub fn cli_branch(&self) -> &str { + self.cli.branch.as_deref().unwrap_or("main") + } + + pub fn cli_output_format(&self) -> ReadOutputFormat { + self.cli.output_format.unwrap_or_default() + } + + pub fn table_max_column_width(&self) -> usize { + self.cli.table_max_column_width.unwrap_or(80) + } + + pub fn table_cell_layout(&self) -> TableCellLayout { + self.cli.table_cell_layout.unwrap_or_default() + } + + pub fn cli_target_name(&self) -> Option<&str> { + self.cli.target.as_deref() + } + + pub fn server_target_name(&self) -> Option<&str> { + self.server.target.as_deref() + } + + pub fn server_bind(&self) -> &str { + self.server.bind.as_deref().unwrap_or("127.0.0.1:8080") + } + + pub fn resolve_target_name<'a>( + &self, + explicit_uri: Option<&str>, + explicit_target: Option<&'a str>, + default_target: Option<&'a str>, + ) -> Option<&'a str> { + explicit_target.or_else(|| { + if explicit_uri.is_some() { + None + } else { + default_target + } + }) + } + + pub fn target_bearer_token_env( + &self, + explicit_uri: Option<&str>, + explicit_target: Option<&str>, + default_target: Option<&str>, + ) -> Option<&str> { + let target_name = + self.resolve_target_name(explicit_uri, explicit_target, default_target)?; + self.targets + .get(target_name) + .and_then(|target| target.bearer_token_env.as_deref()) + } + + pub fn resolve_auth_env_file(&self) -> Option { + let path = self.auth.env_file.as_deref()?; + let path = Path::new(path); + Some(if path.is_absolute() { + path.to_path_buf() + } else { + self.base_dir.join(path) + }) + } + + pub fn resolve_policy_file(&self) -> Option { + let path = self.policy.file.as_deref()?; + let path = Path::new(path); + Some(if path.is_absolute() { + 
path.to_path_buf() + } else { + self.base_dir.join(path) + }) + } + + pub fn resolve_policy_tests_file(&self) -> Option { + let policy_file = self.resolve_policy_file()?; + Some(policy_file.with_file_name("policy.tests.yaml")) + } + + pub fn alias(&self, name: &str) -> Result<&AliasConfig> { + self.aliases + .get(name) + .ok_or_else(|| color_eyre::eyre::eyre!("alias '{}' not found", name)) + } + + pub fn resolve_target_uri( + &self, + explicit_uri: Option, + explicit_target: Option<&str>, + default_target: Option<&str>, + ) -> Result { + if let Some(uri) = explicit_uri { + return Ok(uri); + } + + let target_name = explicit_target.or(default_target).ok_or_else(|| { + color_eyre::eyre::eyre!("URI must be provided via , --target, or config") + })?; + let target = self.targets.get(target_name).ok_or_else(|| { + color_eyre::eyre::eyre!( + "target '{}' not found in {}", + target_name, + DEFAULT_CONFIG_FILE + ) + })?; + Ok(self.resolve_config_uri(&target.uri)) + } + + pub fn resolve_query_path(&self, query: &Path) -> Result { + if query.is_absolute() { + return Ok(query.to_path_buf()); + } + + let direct = self.base_dir.join(query); + if direct.exists() { + return Ok(direct); + } + + for root in &self.query.roots { + let candidate = self.base_dir.join(root).join(query); + if candidate.exists() { + return Ok(candidate); + } + } + + bail!("query file '{}' not found", query.display()); + } + + fn resolve_config_uri(&self, value: &str) -> String { + if value.contains("://") { + return value.to_string(); + } + + let path = Path::new(value); + if path.is_absolute() { + value.to_string() + } else { + self.base_dir.join(path).to_string_lossy().to_string() + } + } +} + +pub fn default_config_path() -> PathBuf { + PathBuf::from(DEFAULT_CONFIG_FILE) +} + +pub fn load_config(config_path: Option<&PathBuf>) -> Result { + load_config_in(&env::current_dir()?, config_path) +} + +fn load_config_in(cwd: &Path, config_path: Option<&PathBuf>) -> Result { + let explicit_path = 
config_path.cloned(); + let config_path = explicit_path.or_else(|| { + let default_path = cwd.join(DEFAULT_CONFIG_FILE); + default_path.exists().then_some(default_path) + }); + + let mut config = if let Some(path) = &config_path { + serde_yaml::from_str::(&fs::read_to_string(path)?)? + } else { + OmnigraphConfig::default() + }; + + config.base_dir = if let Some(path) = config_path { + absolute_base_dir(cwd, &path)? + } else { + cwd.to_path_buf() + }; + + Ok(config) +} + +fn absolute_base_dir(cwd: &Path, path: &Path) -> Result { + let path = if path.is_absolute() { + path.to_path_buf() + } else { + cwd.join(path) + }; + Ok(path + .parent() + .map(Path::to_path_buf) + .unwrap_or_else(|| cwd.to_path_buf())) +} + +#[cfg(test)] +mod tests { + use std::fs; + use std::path::{Path, PathBuf}; + + use tempfile::tempdir; + + use super::{ReadOutputFormat, TableCellLayout, load_config_in}; + + #[test] + fn load_config_reads_yaml_defaults_from_current_dir() { + let temp = tempdir().unwrap(); + fs::write( + temp.path().join("omnigraph.yaml"), + r#" +targets: + local: + uri: ./demo.omni + bearer_token_env: DEMO_TOKEN +auth: + env_file: .env.omni +cli: + target: local + branch: main + output_format: kv + table_max_column_width: 40 + table_cell_layout: wrap +policy: {} +"#, + ) + .unwrap(); + + let config = load_config_in(temp.path(), None).unwrap(); + assert_eq!(config.cli_target_name(), Some("local")); + assert_eq!(config.cli_branch(), "main"); + assert_eq!(config.cli_output_format(), ReadOutputFormat::Kv); + assert_eq!(config.table_max_column_width(), 40); + assert_eq!(config.table_cell_layout(), TableCellLayout::Wrap); + assert_eq!( + config.target_bearer_token_env(None, None, config.cli_target_name()), + Some("DEMO_TOKEN") + ); + assert_eq!( + config.resolve_auth_env_file().unwrap(), + temp.path().join(".env.omni") + ); + assert_eq!( + PathBuf::from( + config + .resolve_target_uri(None, None, config.cli_target_name()) + .unwrap() + ), + temp.path().join("./demo.omni") + ); + } 
+ + #[test] + fn load_config_does_not_walk_parent_directories() { + let temp = tempdir().unwrap(); + let child = temp.path().join("child"); + fs::create_dir_all(&child).unwrap(); + fs::write( + temp.path().join("omnigraph.yaml"), + "targets:\n local:\n uri: ./demo.omni\n", + ) + .unwrap(); + + let config = load_config_in(&child, None).unwrap(); + assert!(config.targets.is_empty()); + } + + #[test] + fn resolve_query_path_searches_config_roots() { + let temp = tempdir().unwrap(); + fs::create_dir_all(temp.path().join("queries")).unwrap(); + fs::write( + temp.path().join("omnigraph.yaml"), + "query:\n roots:\n - queries\npolicy: {}\n", + ) + .unwrap(); + fs::write( + temp.path().join("queries").join("test.gq"), + "query q { return {} }", + ) + .unwrap(); + + let config = load_config_in(temp.path(), None).unwrap(); + let resolved = config.resolve_query_path(Path::new("test.gq")).unwrap(); + assert_eq!(resolved, temp.path().join("queries").join("test.gq")); + } + + #[test] + fn resolve_query_path_prefers_config_base_dir_over_ambient_cwd() { + let workspace = tempdir().unwrap(); + let config_dir = workspace.path().join("config"); + let ambient_dir = workspace.path().join("ambient"); + fs::create_dir_all(&config_dir).unwrap(); + fs::create_dir_all(&ambient_dir).unwrap(); + fs::write(config_dir.join("omnigraph.yaml"), "policy: {}\n").unwrap(); + fs::write(config_dir.join("local.gq"), "query local { return {} }").unwrap(); + fs::write(ambient_dir.join("local.gq"), "query ambient { return {} }").unwrap(); + + let config = + load_config_in(&ambient_dir, Some(&config_dir.join("omnigraph.yaml"))).unwrap(); + let resolved = config.resolve_query_path(Path::new("local.gq")).unwrap(); + + assert_eq!(resolved, config_dir.join("local.gq")); + } + + #[test] + fn policy_block_accepts_non_empty_mapping() { + let temp = tempdir().unwrap(); + fs::write( + temp.path().join("omnigraph.yaml"), + "policy:\n file: ./policy.yaml\n", + ) + .unwrap(); + + let config = load_config_in(temp.path(), 
None).unwrap(); + assert_eq!( + config.resolve_policy_file().unwrap(), + temp.path().join("policy.yaml") + ); + } + + #[test] + fn scoped_auth_env_ignores_default_target_when_uri_is_explicit() { + let temp = tempdir().unwrap(); + fs::write( + temp.path().join("omnigraph.yaml"), + r#" +targets: + demo: + uri: https://example.com + bearer_token_env: DEMO_TOKEN +cli: + target: demo +"#, + ) + .unwrap(); + + let config = load_config_in(temp.path(), None).unwrap(); + assert_eq!( + config.target_bearer_token_env( + Some("https://override.example.com"), + None, + config.cli_target_name() + ), + None + ); + assert_eq!( + config.target_bearer_token_env( + Some("https://override.example.com"), + Some("demo"), + config.cli_target_name() + ), + Some("DEMO_TOKEN") + ); + } +} diff --git a/crates/omnigraph-server/src/lib.rs b/crates/omnigraph-server/src/lib.rs new file mode 100644 index 0000000..17dca60 --- /dev/null +++ b/crates/omnigraph-server/src/lib.rs @@ -0,0 +1,1257 @@ +pub mod api; +pub mod config; +pub mod policy; + +use std::collections::{HashMap, HashSet}; +use std::fs; +use std::path::PathBuf; +use std::sync::Arc; + +use api::{ + BranchCreateOutput, BranchCreateRequest, BranchDeleteOutput, BranchListOutput, + BranchMergeOutput, BranchMergeRequest, ChangeOutput, ChangeRequest, CommitListOutput, + CommitListQuery, ErrorCode, ErrorOutput, ExportRequest, HealthOutput, IngestOutput, + IngestRequest, ReadOutput, ReadRequest, RunListOutput, SnapshotQuery, ingest_output, + snapshot_payload, +}; +use axum::extract::DefaultBodyLimit; +use axum::extract::{Extension, Path, Query, Request, State}; +use axum::http::StatusCode; +use axum::http::header::{AUTHORIZATION, CONTENT_TYPE}; +use axum::middleware::{self, Next}; +use axum::response::{IntoResponse, Response}; +use axum::routing::{delete, get, post}; +use axum::{Json, Router}; +use color_eyre::eyre::{Result, WrapErr, bail}; +pub use config::{ + AliasCommand, AliasConfig, CliDefaults, DEFAULT_CONFIG_FILE, OmnigraphConfig, 
PolicySettings, + ProjectConfig, QueryDefaults, ReadOutputFormat, ServerDefaults, TableCellLayout, TargetConfig, + load_config, +}; +use omnigraph::db::{Omnigraph, ReadTarget, RunId}; +use omnigraph::error::{ManifestErrorKind, OmniError}; +use omnigraph_compiler::json_params_to_param_map; +use omnigraph_compiler::query::parser::parse_query; +use omnigraph_compiler::{JsonParamMode, ParamMap}; +pub use policy::{ + PolicyAction, PolicyCompiler, PolicyConfig, PolicyDecision, PolicyEngine, PolicyExpectation, + PolicyRequest, PolicyTestConfig, +}; +use serde_json::Value; +use tokio::net::TcpListener; +use tokio::sync::RwLock; +use tower_http::trace::TraceLayer; +use tracing::{error, info}; +use tracing_subscriber::EnvFilter; + +const DEFAULT_REQUEST_BODY_LIMIT_BYTES: usize = 1_048_576; +const INGEST_REQUEST_BODY_LIMIT_BYTES: usize = 32 * 1024 * 1024; +const SERVER_VERSION: &str = env!("CARGO_PKG_VERSION"); +const SERVER_SOURCE_VERSION: Option<&str> = option_env!("OMNIGRAPH_SOURCE_VERSION"); + +#[derive(Debug, Clone)] +pub struct ServerConfig { + pub uri: String, + pub bind: String, + pub policy_file: Option, +} + +#[derive(Clone)] +pub struct AppState { + uri: String, + db: Arc>, + bearer_tokens: Arc, Arc>>, + policy_engine: Option>, +} + +#[derive(Debug, Clone)] +struct AuthenticatedActor(Arc); + +impl AuthenticatedActor { + fn as_str(&self) -> &str { + &self.0 + } +} + +#[derive(Debug)] +pub struct ApiError { + status: StatusCode, + code: ErrorCode, + message: String, + merge_conflicts: Vec, +} + +impl AppState { + pub fn new(uri: String, db: Omnigraph) -> Self { + Self::new_with_bearer_tokens(uri, db, Vec::new()) + } + + pub fn new_with_bearer_token(uri: String, db: Omnigraph, bearer_token: Option) -> Self { + let bearer_tokens = normalize_bearer_token(bearer_token) + .into_iter() + .map(|token| ("default".to_string(), token)) + .collect(); + Self::new_with_bearer_tokens(uri, db, bearer_tokens) + } + + pub fn new_with_bearer_tokens( + uri: String, + db: Omnigraph, + 
bearer_tokens: Vec<(String, String)>, + ) -> Self { + Self::new_with_bearer_tokens_and_policy(uri, db, bearer_tokens, None) + } + + pub fn new_with_bearer_tokens_and_policy( + uri: String, + db: Omnigraph, + bearer_tokens: Vec<(String, String)>, + policy_engine: Option, + ) -> Self { + let bearer_tokens = bearer_tokens + .into_iter() + .map(|(actor, token)| (Arc::::from(token), Arc::::from(actor))) + .collect(); + Self { + uri, + db: Arc::new(RwLock::new(db)), + bearer_tokens: Arc::new(bearer_tokens), + policy_engine: policy_engine.map(Arc::new), + } + } + + pub async fn open(uri: impl Into) -> Result { + Self::open_with_bearer_token(uri, None).await + } + + pub async fn open_with_bearer_token( + uri: impl Into, + bearer_token: Option, + ) -> Result { + let bearer_tokens = normalize_bearer_token(bearer_token) + .into_iter() + .map(|token| ("default".to_string(), token)) + .collect(); + Self::open_with_bearer_tokens(uri, bearer_tokens).await + } + + pub async fn open_with_bearer_tokens( + uri: impl Into, + bearer_tokens: Vec<(String, String)>, + ) -> Result { + let uri = uri.into(); + let db = Omnigraph::open(&uri).await?; + Ok(Self::new_with_bearer_tokens(uri, db, bearer_tokens)) + } + + pub async fn open_with_bearer_tokens_and_policy( + uri: impl Into, + bearer_tokens: Vec<(String, String)>, + policy_file: Option<&PathBuf>, + ) -> Result { + let uri = uri.into(); + let db = Omnigraph::open(&uri).await?; + let policy_engine = match policy_file { + Some(path) => Some(PolicyEngine::load(path, &uri)?), + None => None, + }; + if policy_engine.is_some() && bearer_tokens.is_empty() { + bail!("policy requires at least one configured bearer token actor"); + } + Ok(Self::new_with_bearer_tokens_and_policy( + uri, + db, + bearer_tokens, + policy_engine, + )) + } + + pub fn uri(&self) -> &str { + &self.uri + } + + fn requires_bearer_auth(&self) -> bool { + !self.bearer_tokens.is_empty() || self.policy_engine.is_some() + } + + fn authenticate_bearer_token(&self, provided_token: 
&str) -> Option> { + self.bearer_tokens.get(provided_token).cloned() + } + + fn policy_engine(&self) -> Option<&PolicyEngine> { + self.policy_engine.as_deref() + } +} + +impl ApiError { + pub fn unauthorized(message: impl Into) -> Self { + Self { + status: StatusCode::UNAUTHORIZED, + code: ErrorCode::Unauthorized, + message: message.into(), + merge_conflicts: Vec::new(), + } + } + + pub fn forbidden(message: impl Into) -> Self { + Self { + status: StatusCode::FORBIDDEN, + code: ErrorCode::Forbidden, + message: message.into(), + merge_conflicts: Vec::new(), + } + } + + pub fn bad_request(message: impl Into) -> Self { + Self { + status: StatusCode::BAD_REQUEST, + code: ErrorCode::BadRequest, + message: message.into(), + merge_conflicts: Vec::new(), + } + } + + pub fn not_found(message: impl Into) -> Self { + Self { + status: StatusCode::NOT_FOUND, + code: ErrorCode::NotFound, + message: message.into(), + merge_conflicts: Vec::new(), + } + } + + pub fn conflict(message: impl Into) -> Self { + Self { + status: StatusCode::CONFLICT, + code: ErrorCode::Conflict, + message: message.into(), + merge_conflicts: Vec::new(), + } + } + + pub fn internal(message: impl Into) -> Self { + Self { + status: StatusCode::INTERNAL_SERVER_ERROR, + code: ErrorCode::Internal, + message: message.into(), + merge_conflicts: Vec::new(), + } + } + + fn merge_conflict(conflicts: Vec) -> Self { + Self { + status: StatusCode::CONFLICT, + code: ErrorCode::Conflict, + message: summarize_merge_conflicts(&conflicts), + merge_conflicts: conflicts, + } + } + + fn from_omni(err: OmniError) -> Self { + match err { + OmniError::Compiler(err) => Self::bad_request(err.to_string()), + OmniError::DataFusion(message) => Self::bad_request(format!("query: {message}")), + OmniError::Manifest(err) => match err.kind { + ManifestErrorKind::BadRequest => Self::bad_request(err.message), + ManifestErrorKind::NotFound => Self::not_found(err.message), + ManifestErrorKind::Conflict => Self::conflict(err.message), + 
ManifestErrorKind::Internal => Self::internal(err.message), + }, + OmniError::MergeConflicts(conflicts) => Self::merge_conflict( + conflicts + .iter() + .map(api::MergeConflictOutput::from) + .collect(), + ), + OmniError::Lance(message) => Self::internal(format!("storage: {message}")), + OmniError::Io(err) => Self::internal(format!("io: {err}")), + } + } +} + +fn summarize_merge_conflicts(conflicts: &[api::MergeConflictOutput]) -> String { + if conflicts.is_empty() { + return "merge conflicts".to_string(); + } + + let preview: Vec = conflicts + .iter() + .take(3) + .map(|conflict| match conflict.row_id.as_deref() { + Some(row_id) => format!( + "{}:{} ({})", + conflict.table_key, + row_id, + conflict.kind.as_str() + ), + None => format!("{} ({})", conflict.table_key, conflict.kind.as_str()), + }) + .collect(); + + let suffix = if conflicts.len() > preview.len() { + format!("; and {} more", conflicts.len() - preview.len()) + } else { + String::new() + }; + + format!("merge conflicts: {}{}", preview.join("; "), suffix) +} + +impl IntoResponse for ApiError { + fn into_response(self) -> Response { + ( + self.status, + Json(ErrorOutput { + error: self.message, + code: Some(self.code), + merge_conflicts: self.merge_conflicts, + }), + ) + .into_response() + } +} + +pub fn init_tracing() { + let filter = EnvFilter::try_from_default_env().unwrap_or_else(|_| EnvFilter::new("info")); + let _ = tracing_subscriber::fmt().with_env_filter(filter).try_init(); +} + +pub fn load_server_settings( + config_path: Option<&PathBuf>, + cli_uri: Option, + cli_target: Option, + cli_bind: Option, +) -> Result { + let config = load_config(config_path)?; + let uri = + config.resolve_target_uri(cli_uri, cli_target.as_deref(), config.server_target_name())?; + let bind = cli_bind.unwrap_or_else(|| config.server_bind().to_string()); + let policy_file = config.resolve_policy_file(); + + Ok(ServerConfig { + uri, + bind, + policy_file, + }) +} + +pub fn build_app(state: AppState) -> Router { + let 
protected = Router::new() + .route("/snapshot", get(server_snapshot)) + .route("/export", post(server_export)) + .route("/read", post(server_read)) + .route("/change", post(server_change)) + .route( + "/ingest", + post(server_ingest).layer(DefaultBodyLimit::max(INGEST_REQUEST_BODY_LIMIT_BYTES)), + ) + .route( + "/branches", + get(server_branch_list).post(server_branch_create), + ) + .route("/branches/{branch}", delete(server_branch_delete)) + .route("/branches/merge", post(server_branch_merge)) + .route("/runs", get(server_run_list)) + .route("/runs/{run_id}", get(server_run_show)) + .route("/runs/{run_id}/publish", post(server_run_publish)) + .route("/runs/{run_id}/abort", post(server_run_abort)) + .route("/commits", get(server_commit_list)) + .route("/commits/{commit_id}", get(server_commit_show)) + .route_layer(middleware::from_fn_with_state( + state.clone(), + require_bearer_auth, + )); + + Router::new() + .route("/healthz", get(server_health)) + .merge(protected) + .layer(DefaultBodyLimit::max(DEFAULT_REQUEST_BODY_LIMIT_BYTES)) + .layer(TraceLayer::new_for_http()) + .with_state(state) +} + +pub async fn serve(config: ServerConfig) -> Result<()> { + let state = AppState::open_with_bearer_tokens_and_policy( + config.uri.clone(), + server_bearer_tokens_from_env()?, + config.policy_file.as_ref(), + ) + .await?; + let listener = TcpListener::bind(&config.bind).await?; + info!(uri = %config.uri, bind = %config.bind, "serving omnigraph"); + axum::serve(listener, build_app(state)) + .with_graceful_shutdown(shutdown_signal()) + .await?; + Ok(()) +} + +async fn shutdown_signal() { + if let Err(err) = tokio::signal::ctrl_c().await { + error!(error = %err, "failed to install ctrl-c handler"); + return; + } + info!("shutdown signal received"); +} + +async fn server_health() -> Json { + Json(HealthOutput { + status: "ok".to_string(), + version: SERVER_VERSION.to_string(), + source_version: SERVER_SOURCE_VERSION.map(str::to_string), + }) +} + +async fn require_bearer_auth( + 
State(state): State, + mut request: Request, + next: Next, +) -> std::result::Result { + if !state.requires_bearer_auth() { + return Ok(next.run(request).await); + } + + let Some(header) = request + .headers() + .get(AUTHORIZATION) + .and_then(|value| value.to_str().ok()) + else { + return Err(ApiError::unauthorized("missing bearer token")); + }; + + let Some(provided_token) = header.strip_prefix("Bearer ") else { + return Err(ApiError::unauthorized("missing bearer token")); + }; + + let Some(actor) = state.authenticate_bearer_token(provided_token) else { + return Err(ApiError::unauthorized("invalid bearer token")); + }; + request.extensions_mut().insert(AuthenticatedActor(actor)); + + Ok(next.run(request).await) +} + +fn log_policy_decision(actor_id: &str, request: &PolicyRequest, decision: &PolicyDecision) { + info!( + actor_id = actor_id, + action = %request.action, + branch = request.branch.as_deref().unwrap_or(""), + target_branch = request.target_branch.as_deref().unwrap_or(""), + allowed = decision.allowed, + matched_rule_id = decision.matched_rule_id.as_deref().unwrap_or(""), + "policy decision" + ); +} + +fn authorize_request( + state: &AppState, + actor: Option<&AuthenticatedActor>, + request: PolicyRequest, +) -> std::result::Result<(), ApiError> { + let Some(engine) = state.policy_engine() else { + return Ok(()); + }; + let Some(actor) = actor else { + return Err(ApiError::unauthorized("missing bearer token")); + }; + let decision = engine + .authorize(&request) + .map_err(|err| ApiError::internal(format!("policy: {err}")))?; + log_policy_decision(actor.as_str(), &request, &decision); + if decision.allowed { + Ok(()) + } else { + Err(ApiError::forbidden(decision.message)) + } +} + +async fn server_snapshot( + State(state): State, + actor: Option>, + Query(query): Query, +) -> std::result::Result, ApiError> { + let branch = query.branch.unwrap_or_else(|| "main".to_string()); + authorize_request( + &state, + actor.as_ref().map(|Extension(actor)| actor), + 
PolicyRequest { + actor_id: actor + .as_ref() + .map(|Extension(actor)| actor.as_str().to_string()) + .unwrap_or_default(), + action: PolicyAction::Read, + branch: Some(branch.clone()), + target_branch: None, + }, + )?; + let snapshot = { + let db = Arc::clone(&state.db).read_owned().await; + db.snapshot_of(ReadTarget::branch(branch.as_str())) + .await + .map_err(ApiError::from_omni)? + }; + Ok(Json(snapshot_payload(&branch, &snapshot))) +} + +async fn server_read( + State(state): State, + actor: Option>, + Json(request): Json, +) -> std::result::Result, ApiError> { + if request.branch.is_some() && request.snapshot.is_some() { + return Err(ApiError::bad_request( + "read request may specify branch or snapshot, not both", + )); + } + + let target = read_target_from_request(request.branch, request.snapshot); + let policy_branch = match &target { + ReadTarget::Branch(branch) => Some(branch.clone()), + ReadTarget::Snapshot(_) if state.policy_engine().is_some() && actor.is_some() => { + let db = Arc::clone(&state.db).read_owned().await; + db.resolved_branch_of(target.clone()) + .await + .map(|branch| branch.or_else(|| Some("main".to_string()))) + .map_err(ApiError::from_omni)? 
+ } + ReadTarget::Snapshot(_) => None, + }; + authorize_request( + &state, + actor.as_ref().map(|Extension(actor)| actor), + PolicyRequest { + actor_id: actor + .as_ref() + .map(|Extension(actor)| actor.as_str().to_string()) + .unwrap_or_default(), + action: PolicyAction::Read, + branch: policy_branch, + target_branch: None, + }, + )?; + let (selected_name, query_params) = + select_named_query(&request.query_source, request.query_name.as_deref()) + .map_err(|err| ApiError::bad_request(err.to_string()))?; + let params = query_params_from_json(&query_params, request.params.as_ref()) + .map_err(|err| ApiError::bad_request(err.to_string()))?; + + let result = { + let db = Arc::clone(&state.db).read_owned().await; + db.query( + target.clone(), + &request.query_source, + &selected_name, + ¶ms, + ) + .await + .map_err(ApiError::from_omni)? + }; + Ok(Json(api::read_output(selected_name, &target, result))) +} + +async fn server_export( + State(state): State, + actor: Option>, + Json(request): Json, +) -> std::result::Result { + let branch = request.branch.unwrap_or_else(|| "main".to_string()); + authorize_request( + &state, + actor.as_ref().map(|Extension(actor)| actor), + PolicyRequest { + actor_id: actor + .as_ref() + .map(|Extension(actor)| actor.as_str().to_string()) + .unwrap_or_default(), + action: PolicyAction::Export, + branch: Some(branch.clone()), + target_branch: None, + }, + )?; + let payload = { + let db = Arc::clone(&state.db).read_owned().await; + db.export_jsonl(&branch, &request.type_names, &request.table_keys) + .await + .map_err(ApiError::from_omni)? 
+ }; + Ok(( + StatusCode::OK, + [(CONTENT_TYPE, "application/x-ndjson; charset=utf-8")], + payload, + ) + .into_response()) +} + +async fn server_change( + State(state): State, + actor: Option>, + Json(request): Json, +) -> std::result::Result, ApiError> { + let branch = request.branch.unwrap_or_else(|| "main".to_string()); + let actor_id = actor.as_ref().map(|Extension(actor)| actor.as_str()); + authorize_request( + &state, + actor.as_ref().map(|Extension(actor)| actor), + PolicyRequest { + actor_id: actor_id.map(str::to_string).unwrap_or_default(), + action: PolicyAction::Change, + branch: Some(branch.clone()), + target_branch: None, + }, + )?; + let (selected_name, query_params) = + select_named_query(&request.query_source, request.query_name.as_deref()) + .map_err(|err| ApiError::bad_request(err.to_string()))?; + let params = query_params_from_json(&query_params, request.params.as_ref()) + .map_err(|err| ApiError::bad_request(err.to_string()))?; + + let result = { + let mut db = Arc::clone(&state.db).write_owned().await; + db.mutate_as( + &branch, + &request.query_source, + &selected_name, + ¶ms, + actor_id, + ) + .await + .map_err(ApiError::from_omni)? + }; + Ok(Json(ChangeOutput { + branch, + query_name: selected_name, + affected_nodes: result.affected_nodes, + affected_edges: result.affected_edges, + actor_id: actor_id.map(str::to_string), + })) +} + +async fn server_ingest( + State(state): State, + actor: Option>, + Json(request): Json, +) -> std::result::Result, ApiError> { + let branch = request.branch.unwrap_or_else(|| "main".to_string()); + let from = request.from.unwrap_or_else(|| "main".to_string()); + let mode = request.mode.unwrap_or(omnigraph::loader::LoadMode::Merge); + let actor_id = actor.as_ref().map(|Extension(actor)| actor.as_str()); + + let branch_exists = { + let db = Arc::clone(&state.db).read_owned().await; + db.branch_list() + .await + .map_err(ApiError::from_omni)? 
+ .into_iter() + .any(|name| name == branch) + }; + + if !branch_exists { + authorize_request( + &state, + actor.as_ref().map(|Extension(actor)| actor), + PolicyRequest { + actor_id: actor_id.map(str::to_string).unwrap_or_default(), + action: PolicyAction::BranchCreate, + branch: Some(from.clone()), + target_branch: Some(branch.clone()), + }, + )?; + } + authorize_request( + &state, + actor.as_ref().map(|Extension(actor)| actor), + PolicyRequest { + actor_id: actor_id.map(str::to_string).unwrap_or_default(), + action: PolicyAction::Change, + branch: Some(branch.clone()), + target_branch: None, + }, + )?; + + let result = { + let mut db = Arc::clone(&state.db).write_owned().await; + db.ingest_as(&branch, Some(&from), &request.data, mode, actor_id) + .await + .map_err(ApiError::from_omni)? + }; + + Ok(Json(ingest_output( + state.uri(), + &result, + actor_id.map(str::to_string), + ))) +} + +async fn server_branch_list( + State(state): State, + actor: Option>, +) -> std::result::Result, ApiError> { + authorize_request( + &state, + actor.as_ref().map(|Extension(actor)| actor), + PolicyRequest { + actor_id: actor + .as_ref() + .map(|Extension(actor)| actor.as_str().to_string()) + .unwrap_or_default(), + action: PolicyAction::Read, + branch: None, + target_branch: None, + }, + )?; + let mut branches = { + let db = Arc::clone(&state.db).read_owned().await; + db.branch_list().await.map_err(ApiError::from_omni)? 
+ }; + branches.sort(); + Ok(Json(BranchListOutput { branches })) +} + +async fn server_branch_create( + State(state): State, + actor: Option>, + Json(request): Json, +) -> std::result::Result, ApiError> { + let from = request.from.unwrap_or_else(|| "main".to_string()); + authorize_request( + &state, + actor.as_ref().map(|Extension(actor)| actor), + PolicyRequest { + actor_id: actor + .as_ref() + .map(|Extension(actor)| actor.as_str().to_string()) + .unwrap_or_default(), + action: PolicyAction::BranchCreate, + branch: Some(from.clone()), + target_branch: Some(request.name.clone()), + }, + )?; + { + let mut db = Arc::clone(&state.db).write_owned().await; + db.branch_create_from(ReadTarget::branch(&from), &request.name) + .await + .map_err(ApiError::from_omni)?; + } + Ok(Json(BranchCreateOutput { + uri: state.uri().to_string(), + from, + name: request.name, + actor_id: actor.map(|Extension(actor)| actor.as_str().to_string()), + })) +} + +async fn server_branch_delete( + State(state): State, + actor: Option>, + Path(branch): Path, +) -> std::result::Result, ApiError> { + let actor_id = actor.as_ref().map(|Extension(actor)| actor.as_str()); + authorize_request( + &state, + actor.as_ref().map(|Extension(actor)| actor), + PolicyRequest { + actor_id: actor_id.map(str::to_string).unwrap_or_default(), + action: PolicyAction::BranchDelete, + branch: None, + target_branch: Some(branch.clone()), + }, + )?; + { + let mut db = Arc::clone(&state.db).write_owned().await; + db.branch_delete(&branch) + .await + .map_err(ApiError::from_omni)?; + } + Ok(Json(BranchDeleteOutput { + uri: state.uri().to_string(), + name: branch, + actor_id: actor_id.map(str::to_string), + })) +} + +async fn server_branch_merge( + State(state): State, + actor: Option>, + Json(request): Json, +) -> std::result::Result, ApiError> { + let target = request.target.unwrap_or_else(|| "main".to_string()); + let actor_id = actor.as_ref().map(|Extension(actor)| actor.as_str()); + authorize_request( + &state, + 
actor.as_ref().map(|Extension(actor)| actor), + PolicyRequest { + actor_id: actor_id.map(str::to_string).unwrap_or_default(), + action: PolicyAction::BranchMerge, + branch: Some(request.source.clone()), + target_branch: Some(target.clone()), + }, + )?; + let outcome = { + let mut db = Arc::clone(&state.db).write_owned().await; + db.branch_merge_as(&request.source, &target, actor_id) + .await + .map_err(ApiError::from_omni)? + }; + Ok(Json(BranchMergeOutput { + source: request.source, + target, + outcome: outcome.into(), + actor_id: actor_id.map(str::to_string), + })) +} + +async fn server_run_list( + State(state): State, + actor: Option>, +) -> std::result::Result, ApiError> { + authorize_request( + &state, + actor.as_ref().map(|Extension(actor)| actor), + PolicyRequest { + actor_id: actor + .as_ref() + .map(|Extension(actor)| actor.as_str().to_string()) + .unwrap_or_default(), + action: PolicyAction::Read, + branch: None, + target_branch: None, + }, + )?; + let runs = { + let db = Arc::clone(&state.db).read_owned().await; + db.list_runs().await.map_err(ApiError::from_omni)? + }; + Ok(Json(RunListOutput { + runs: runs.iter().map(api::run_output).collect(), + })) +} + +async fn server_run_show( + State(state): State, + actor: Option>, + Path(run_id): Path, +) -> std::result::Result, ApiError> { + authorize_request( + &state, + actor.as_ref().map(|Extension(actor)| actor), + PolicyRequest { + actor_id: actor + .as_ref() + .map(|Extension(actor)| actor.as_str().to_string()) + .unwrap_or_default(), + action: PolicyAction::Read, + branch: None, + target_branch: None, + }, + )?; + let run = { + let db = Arc::clone(&state.db).read_owned().await; + db.get_run(&RunId::new(run_id)) + .await + .map_err(ApiError::from_omni)? 
+ }; + Ok(Json(api::run_output(&run))) +} + +async fn server_run_publish( + State(state): State, + actor: Option>, + Path(run_id): Path, +) -> std::result::Result, ApiError> { + let run_id = RunId::new(run_id); + let actor_id = actor.as_ref().map(|Extension(actor)| actor.as_str()); + let target_branch = { + let db = Arc::clone(&state.db).read_owned().await; + db.get_run(&run_id) + .await + .map_err(ApiError::from_omni)? + .target_branch + }; + authorize_request( + &state, + actor.as_ref().map(|Extension(actor)| actor), + PolicyRequest { + actor_id: actor_id.map(str::to_string).unwrap_or_default(), + action: PolicyAction::RunPublish, + branch: None, + target_branch: Some(target_branch), + }, + )?; + let run = { + let mut db = Arc::clone(&state.db).write_owned().await; + db.publish_run_as(&run_id, actor_id) + .await + .map_err(ApiError::from_omni)?; + db.get_run(&run_id).await.map_err(ApiError::from_omni)? + }; + Ok(Json(api::run_output(&run))) +} + +async fn server_run_abort( + State(state): State, + actor: Option>, + Path(run_id): Path, +) -> std::result::Result, ApiError> { + let run_id = RunId::new(run_id); + let target_branch = { + let db = Arc::clone(&state.db).read_owned().await; + db.get_run(&run_id) + .await + .map_err(ApiError::from_omni)? + .target_branch + }; + authorize_request( + &state, + actor.as_ref().map(|Extension(actor)| actor), + PolicyRequest { + actor_id: actor + .as_ref() + .map(|Extension(actor)| actor.as_str().to_string()) + .unwrap_or_default(), + action: PolicyAction::RunAbort, + branch: None, + target_branch: Some(target_branch), + }, + )?; + let run = { + let mut db = Arc::clone(&state.db).write_owned().await; + db.abort_run(&run_id).await.map_err(ApiError::from_omni)? 
+ }; + Ok(Json(api::run_output(&run))) +} + +async fn server_commit_list( + State(state): State, + actor: Option>, + Query(query): Query, +) -> std::result::Result, ApiError> { + authorize_request( + &state, + actor.as_ref().map(|Extension(actor)| actor), + PolicyRequest { + actor_id: actor + .as_ref() + .map(|Extension(actor)| actor.as_str().to_string()) + .unwrap_or_default(), + action: PolicyAction::Read, + branch: query.branch.clone(), + target_branch: None, + }, + )?; + let commits = { + let db = Arc::clone(&state.db).read_owned().await; + db.list_commits(query.branch.as_deref()) + .await + .map_err(ApiError::from_omni)? + }; + Ok(Json(CommitListOutput { + commits: commits.iter().map(api::commit_output).collect(), + })) +} + +async fn server_commit_show( + State(state): State, + actor: Option>, + Path(commit_id): Path, +) -> std::result::Result, ApiError> { + authorize_request( + &state, + actor.as_ref().map(|Extension(actor)| actor), + PolicyRequest { + actor_id: actor + .as_ref() + .map(|Extension(actor)| actor.as_str().to_string()) + .unwrap_or_default(), + action: PolicyAction::Read, + branch: None, + target_branch: None, + }, + )?; + let commit = { + let db = Arc::clone(&state.db).read_owned().await; + db.get_commit(&commit_id) + .await + .map_err(ApiError::from_omni)? + }; + Ok(Json(api::commit_output(&commit))) +} + +fn read_target_from_request(branch: Option, snapshot: Option) -> ReadTarget { + if let Some(snapshot) = snapshot { + ReadTarget::snapshot(omnigraph::db::SnapshotId::new(snapshot)) + } else { + ReadTarget::branch(branch.unwrap_or_else(|| "main".to_string())) + } +} + +fn select_named_query( + query_source: &str, + requested_name: Option<&str>, +) -> Result<(String, Vec)> { + let parsed = parse_query(query_source)?; + let query = if let Some(name) = requested_name { + parsed + .queries + .into_iter() + .find(|query| query.name == name) + .ok_or_else(|| color_eyre::eyre::eyre!("query '{}' not found", name))? 
+ } else if parsed.queries.len() == 1 { + parsed.queries.into_iter().next().unwrap() + } else { + bail!("query file contains multiple queries; pass --name"); + }; + + Ok((query.name, query.params)) +} + +fn query_params_from_json( + query_params: &[omnigraph_compiler::query::ast::Param], + params_json: Option<&Value>, +) -> Result { + json_params_to_param_map(params_json, query_params, JsonParamMode::Standard) + .map_err(|err| color_eyre::eyre::eyre!(err.to_string())) +} + +fn normalize_bearer_token(value: Option) -> Option { + value + .map(|value| value.trim().to_string()) + .filter(|value| !value.is_empty()) +} + +fn normalize_bearer_actor(value: String) -> Result { + let value = value.trim().to_string(); + if value.is_empty() { + bail!("bearer token actor names must not be blank"); + } + Ok(value) +} + +fn parse_bearer_tokens_json(value: &str) -> Result> { + let entries: HashMap = serde_json::from_str(value) + .wrap_err("OMNIGRAPH_SERVER_BEARER_TOKENS_JSON must be a JSON object of actor->token")?; + Ok(entries.into_iter().collect()) +} + +fn read_bearer_tokens_file(path: &str) -> Result> { + let contents = fs::read_to_string(path) + .wrap_err_with(|| format!("failed to read bearer tokens file at {path}"))?; + parse_bearer_tokens_json(&contents) + .wrap_err_with(|| format!("failed to parse bearer tokens file at {path}")) +} + +fn validate_bearer_tokens(entries: Vec<(String, String)>) -> Result> { + let mut seen_actors = HashSet::new(); + let mut seen_tokens = HashSet::new(); + let mut normalized = Vec::with_capacity(entries.len()); + + for (actor, token) in entries { + let actor = normalize_bearer_actor(actor)?; + let Some(token) = normalize_bearer_token(Some(token)) else { + bail!("bearer token for actor '{actor}' must not be blank"); + }; + if !seen_actors.insert(actor.clone()) { + bail!("duplicate bearer token actor '{actor}'"); + } + if !seen_tokens.insert(token.clone()) { + bail!("duplicate bearer token value configured"); + } + normalized.push((actor, 
token)); + } + + normalized.sort_by(|(left, _), (right, _)| left.cmp(right)); + Ok(normalized) +} + +fn server_bearer_tokens_from_env() -> Result> { + let mut entries = Vec::new(); + + if let Some(token) = normalize_bearer_token(std::env::var("OMNIGRAPH_SERVER_BEARER_TOKEN").ok()) + { + entries.push(("default".to_string(), token)); + } + + if let Some(path) = + normalize_bearer_token(std::env::var("OMNIGRAPH_SERVER_BEARER_TOKENS_FILE").ok()) + { + entries.extend(read_bearer_tokens_file(&path)?); + } else if let Some(json) = + normalize_bearer_token(std::env::var("OMNIGRAPH_SERVER_BEARER_TOKENS_JSON").ok()) + { + entries.extend(parse_bearer_tokens_json(&json)?); + } + + validate_bearer_tokens(entries) +} + +#[cfg(test)] +mod tests { + use super::{ + load_server_settings, normalize_bearer_token, parse_bearer_tokens_json, + server_bearer_tokens_from_env, + }; + use std::env; + use std::fs; + use tempfile::tempdir; + + #[test] + fn server_settings_load_from_yaml_config() { + let temp = tempdir().unwrap(); + let config = temp.path().join("omnigraph.yaml"); + fs::write( + &config, + r#" +targets: + local: + uri: /tmp/demo.omni +server: + target: local + bind: 0.0.0.0:9090 +"#, + ) + .unwrap(); + + let settings = load_server_settings(Some(&config), None, None, None).unwrap(); + assert_eq!(settings.uri, "/tmp/demo.omni"); + assert_eq!(settings.bind, "0.0.0.0:9090"); + } + + #[test] + fn server_settings_cli_flags_override_yaml_config() { + let temp = tempdir().unwrap(); + let config = temp.path().join("omnigraph.yaml"); + fs::write( + &config, + r#" +targets: + local: + uri: /tmp/demo.omni +server: + target: local + bind: 127.0.0.1:8080 +"#, + ) + .unwrap(); + + let settings = load_server_settings( + Some(&config), + Some("/tmp/override.omni".to_string()), + None, + Some("0.0.0.0:9999".to_string()), + ) + .unwrap(); + assert_eq!(settings.uri, "/tmp/override.omni"); + assert_eq!(settings.bind, "0.0.0.0:9999"); + } + + #[test] + fn server_settings_can_resolve_named_target() { 
+ let temp = tempdir().unwrap(); + let config = temp.path().join("omnigraph.yaml"); + fs::write( + &config, + r#" +targets: + local: + uri: ./demo.omni + dev: + uri: http://127.0.0.1:8080 +server: + target: local + bind: 127.0.0.1:8080 +"#, + ) + .unwrap(); + + let settings = + load_server_settings(Some(&config), None, Some("dev".to_string()), None).unwrap(); + assert_eq!(settings.uri, "http://127.0.0.1:8080"); + } + + #[test] + fn server_settings_require_uri_from_cli_or_config() { + let error = load_server_settings(None, None, None, None).unwrap_err(); + assert!(error.to_string().contains("URI must be provided")); + } + + #[test] + fn normalize_bearer_token_trims_and_filters_blank_values() { + assert_eq!(normalize_bearer_token(None), None); + assert_eq!(normalize_bearer_token(Some(" ".to_string())), None); + assert_eq!( + normalize_bearer_token(Some(" demo-token ".to_string())).as_deref(), + Some("demo-token") + ); + } + + struct EnvGuard { + saved: Vec<(&'static str, Option)>, + } + + impl EnvGuard { + fn set(vars: &[(&'static str, Option<&str>)]) -> Self { + let saved = vars + .iter() + .map(|(name, _)| (*name, env::var(name).ok())) + .collect::>(); + for (name, value) in vars { + unsafe { + match value { + Some(value) => env::set_var(name, value), + None => env::remove_var(name), + } + } + } + Self { saved } + } + } + + impl Drop for EnvGuard { + fn drop(&mut self) { + for (name, value) in self.saved.drain(..) 
{ + unsafe { + match value { + Some(value) => env::set_var(name, value), + None => env::remove_var(name), + } + } + } + } + } + + #[test] + fn parse_bearer_tokens_json_reads_actor_token_map() { + let tokens = parse_bearer_tokens_json(r#"{"alice":" token-a ","bob":"token-b"}"#).unwrap(); + assert_eq!(tokens.len(), 2); + assert!(tokens.contains(&("alice".to_string(), " token-a ".to_string()))); + assert!(tokens.contains(&("bob".to_string(), "token-b".to_string()))); + } + + #[test] + fn server_bearer_tokens_from_env_reads_legacy_token_and_token_file() { + let temp = tempdir().unwrap(); + let tokens_path = temp.path().join("tokens.json"); + fs::write( + &tokens_path, + r#"{"team-01":"token-one","team-02":"token-two"}"#, + ) + .unwrap(); + + let _guard = EnvGuard::set(&[ + ("OMNIGRAPH_SERVER_BEARER_TOKEN", Some(" legacy-token ")), + ( + "OMNIGRAPH_SERVER_BEARER_TOKENS_FILE", + Some(tokens_path.to_str().unwrap()), + ), + ("OMNIGRAPH_SERVER_BEARER_TOKENS_JSON", None), + ]); + + let tokens = server_bearer_tokens_from_env().unwrap(); + assert_eq!( + tokens, + vec![ + ("default".to_string(), "legacy-token".to_string()), + ("team-01".to_string(), "token-one".to_string()), + ("team-02".to_string(), "token-two".to_string()), + ] + ); + } +} diff --git a/crates/omnigraph-server/src/main.rs b/crates/omnigraph-server/src/main.rs new file mode 100644 index 0000000..0b43105 --- /dev/null +++ b/crates/omnigraph-server/src/main.rs @@ -0,0 +1,30 @@ +use std::path::PathBuf; + +use clap::Parser; +use color_eyre::eyre::Result; +use omnigraph_server::{ServerConfig, init_tracing, load_server_settings, serve}; + +#[derive(Debug, Parser)] +#[command(name = "omnigraph-server")] +#[command(about = "HTTP server for the Omnigraph graph database")] +struct Cli { + /// Repo URI + uri: Option, + #[arg(long)] + target: Option, + #[arg(long)] + config: Option, + #[arg(long)] + bind: Option, +} + +#[tokio::main] +async fn main() -> Result<()> { + color_eyre::install()?; + init_tracing(); + + let cli = 
Cli::parse(); + let settings: ServerConfig = + load_server_settings(cli.config.as_ref(), cli.uri, cli.target, cli.bind)?; + serve(settings).await +} diff --git a/crates/omnigraph-server/src/policy.rs b/crates/omnigraph-server/src/policy.rs new file mode 100644 index 0000000..21b6ea6 --- /dev/null +++ b/crates/omnigraph-server/src/policy.rs @@ -0,0 +1,812 @@ +use std::collections::{BTreeMap, BTreeSet, HashMap, HashSet}; +use std::fmt; +use std::fs; +use std::path::Path; +use std::str::FromStr; + +use cedar_policy::{ + Authorizer, Context, Decision, Entities, Entity, EntityId, EntityTypeName, EntityUid, Policy, + PolicyId, PolicySet, Request, Schema, ValidationMode, Validator, +}; +use clap::ValueEnum; +use color_eyre::eyre::{Result, bail, eyre}; +use serde::{Deserialize, Serialize}; +use serde_json::json; + +#[derive(Debug, Clone, Copy, Eq, PartialEq, Hash, Serialize, Deserialize, ValueEnum)] +#[serde(rename_all = "snake_case")] +pub enum PolicyAction { + Read, + Export, + Change, + BranchCreate, + BranchDelete, + BranchMerge, + RunPublish, + RunAbort, + Admin, +} + +impl PolicyAction { + pub fn as_str(self) -> &'static str { + match self { + Self::Read => "read", + Self::Export => "export", + Self::Change => "change", + Self::BranchCreate => "branch_create", + Self::BranchDelete => "branch_delete", + Self::BranchMerge => "branch_merge", + Self::RunPublish => "run_publish", + Self::RunAbort => "run_abort", + Self::Admin => "admin", + } + } + + fn uses_branch_scope(self) -> bool { + matches!(self, Self::Read | Self::Export | Self::Change) + } + + fn uses_target_branch_scope(self) -> bool { + matches!( + self, + Self::BranchCreate + | Self::BranchDelete + | Self::BranchMerge + | Self::RunPublish + | Self::RunAbort + ) + } +} + +impl fmt::Display for PolicyAction { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.write_str(self.as_str()) + } +} + +impl FromStr for PolicyAction { + type Err = color_eyre::eyre::Error; + + fn from_str(value: &str) -> 
Result { + match value.trim() { + "read" => Ok(Self::Read), + "export" => Ok(Self::Export), + "change" => Ok(Self::Change), + "branch_create" => Ok(Self::BranchCreate), + "branch_delete" => Ok(Self::BranchDelete), + "branch_merge" => Ok(Self::BranchMerge), + "run_publish" => Ok(Self::RunPublish), + "run_abort" => Ok(Self::RunAbort), + "admin" => Ok(Self::Admin), + other => bail!("unknown policy action '{other}'"), + } + } +} + +#[derive(Debug, Clone, Copy, Eq, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum PolicyBranchScope { + Any, + Protected, + Unprotected, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct PolicyActorSelector { + pub group: String, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct PolicyAllowRule { + pub actors: PolicyActorSelector, + pub actions: Vec, + pub branch_scope: Option, + pub target_branch_scope: Option, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct PolicyRule { + pub id: String, + pub allow: PolicyAllowRule, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct PolicyConfig { + pub version: u32, + #[serde(default)] + pub groups: BTreeMap>, + #[serde(default)] + pub protected_branches: Vec, + #[serde(default)] + pub rules: Vec, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct PolicyTestConfig { + pub version: u32, + #[serde(default)] + pub cases: Vec, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct PolicyTestCase { + pub id: String, + pub actor: String, + pub action: PolicyAction, + pub branch: Option, + pub target_branch: Option, + pub expect: PolicyExpectation, +} + +#[derive(Debug, Clone, Copy, Eq, PartialEq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum PolicyExpectation { + Allow, + Deny, +} + +#[derive(Debug, Clone)] +pub struct PolicyRequest { + pub actor_id: String, + pub action: PolicyAction, + pub branch: Option, + pub target_branch: Option, +} + +#[derive(Debug, 
Clone)] +pub struct PolicyDecision { + pub allowed: bool, + pub matched_rule_id: Option, + pub message: String, +} + +pub struct PolicyCompiler; + +#[derive(Clone)] +pub struct PolicyEngine { + repo_id: String, + protected_branches: BTreeSet, + known_actors: BTreeSet, + schema: Schema, + entities: Entities, + policies: PolicySet, + policy_to_rule: HashMap, +} + +impl PolicyConfig { + pub fn load(path: &Path) -> Result { + let config: Self = serde_yaml::from_str(&fs::read_to_string(path)?)?; + config.validate()?; + Ok(config) + } + + pub fn validate(&self) -> Result<()> { + if self.version != 1 { + bail!("policy version must be 1"); + } + + for (group, members) in &self.groups { + if group.trim().is_empty() { + bail!("policy group names must not be blank"); + } + if members.is_empty() { + bail!("policy group '{group}' must not be empty"); + } + for actor in members { + if actor.trim().is_empty() { + bail!("policy group '{group}' contains a blank actor id"); + } + } + } + + for branch in &self.protected_branches { + if branch.trim().is_empty() { + bail!("protected branch names must not be blank"); + } + } + + let mut seen_rule_ids = HashSet::new(); + for rule in &self.rules { + if rule.id.trim().is_empty() { + bail!("policy rule ids must not be blank"); + } + if !seen_rule_ids.insert(rule.id.clone()) { + bail!("duplicate policy rule id '{}'", rule.id); + } + if rule.allow.actors.group.trim().is_empty() { + bail!("policy rule '{}' must reference a non-blank group", rule.id); + } + if !self.groups.contains_key(rule.allow.actors.group.as_str()) { + bail!( + "policy rule '{}' references unknown group '{}'", + rule.id, + rule.allow.actors.group + ); + } + if rule.allow.actions.is_empty() { + bail!("policy rule '{}' must include at least one action", rule.id); + } + if rule.allow.branch_scope.is_some() && rule.allow.target_branch_scope.is_some() { + bail!( + "policy rule '{}' may specify branch_scope or target_branch_scope, not both", + rule.id + ); + } + if let Some(_) = 
rule.allow.branch_scope { + for action in &rule.allow.actions { + if !action.uses_branch_scope() { + bail!( + "policy rule '{}' uses branch_scope with unsupported action '{}'", + rule.id, + action + ); + } + } + } + if let Some(_) = rule.allow.target_branch_scope { + for action in &rule.allow.actions { + if !action.uses_target_branch_scope() { + bail!( + "policy rule '{}' uses target_branch_scope with unsupported action '{}'", + rule.id, + action + ); + } + } + } + } + + Ok(()) + } +} + +impl PolicyTestConfig { + pub fn load(path: &Path) -> Result { + let config: Self = serde_yaml::from_str(&fs::read_to_string(path)?)?; + if config.version != 1 { + bail!("policy test version must be 1"); + } + let mut seen = HashSet::new(); + for case in &config.cases { + if case.id.trim().is_empty() { + bail!("policy test case ids must not be blank"); + } + if !seen.insert(case.id.clone()) { + bail!("duplicate policy test case id '{}'", case.id); + } + if case.actor.trim().is_empty() { + bail!("policy test case '{}' must not use a blank actor", case.id); + } + } + Ok(config) + } +} + +impl PolicyCompiler { + pub fn compile(config: &PolicyConfig, repo_id: &str) -> Result { + config.validate()?; + let (schema, schema_warnings) = Schema::from_cedarschema_str(policy_schema_source())?; + let schema_warnings = schema_warnings + .map(|warning| warning.to_string()) + .collect::>(); + if !schema_warnings.is_empty() { + bail!("policy schema warnings:\n{}", schema_warnings.join("\n")); + } + let entities = compile_entities(config, repo_id, &schema)?; + let (policies, policy_to_rule) = compile_policies(config, repo_id)?; + let validator = Validator::new(schema.clone()); + let validation = validator.validate(&policies, ValidationMode::Strict); + let errors = validation + .validation_errors() + .map(|err| err.to_string()) + .collect::>(); + if !errors.is_empty() { + bail!("policy validation failed:\n{}", errors.join("\n")); + } + + let known_actors = config + .groups + .values() + 
.flat_map(|members| members.iter().cloned()) + .collect(); + Ok(PolicyEngine { + repo_id: repo_id.to_string(), + protected_branches: config.protected_branches.iter().cloned().collect(), + known_actors, + schema, + entities, + policies, + policy_to_rule, + }) + } +} + +impl PolicyEngine { + pub fn load(path: &Path, repo_id: &str) -> Result { + let config = PolicyConfig::load(path)?; + PolicyCompiler::compile(&config, repo_id) + } + + pub fn authorize(&self, request: &PolicyRequest) -> Result { + if !self.known_actors.contains(request.actor_id.as_str()) { + return Ok(self.deny( + request, + None, + format!( + "policy denied action '{}' for unknown actor '{}'", + request.action, request.actor_id + ), + )); + } + + let principal = entity_uid("Actor", &request.actor_id)?; + let action = entity_uid("Action", request.action.as_str())?; + let resource = entity_uid("Repo", &self.repo_id)?; + let context_value = json!({ + "has_branch": request.branch.is_some(), + "branch": request.branch.clone().unwrap_or_default(), + "has_target_branch": request.target_branch.is_some(), + "target_branch": request.target_branch.clone().unwrap_or_default(), + "branch_is_protected": request.branch.as_ref().is_some_and(|branch| self.protected_branches.contains(branch)), + "target_branch_is_protected": request.target_branch.as_ref().is_some_and(|branch| self.protected_branches.contains(branch)), + }); + let context = Context::from_json_value(context_value, Some((&self.schema, &action)))?; + let cedar_request = Request::new(principal, action, resource, context, Some(&self.schema))?; + let response = + Authorizer::new().is_authorized(&cedar_request, &self.policies, &self.entities); + let errors = response + .diagnostics() + .errors() + .map(|err| err.to_string()) + .collect::>(); + if !errors.is_empty() { + bail!("policy evaluation failed:\n{}", errors.join("\n")); + } + + let matched_rule_id = response + .diagnostics() + .reason() + .filter_map(|policy_id| { + let key: &str = policy_id.as_ref(); 
+ self.policy_to_rule.get(key).cloned() + }) + .min(); + + Ok(match response.decision() { + Decision::Allow => PolicyDecision { + allowed: true, + matched_rule_id: matched_rule_id.clone(), + message: format!( + "policy allowed action '{}' for actor '{}'", + request.action, request.actor_id + ), + }, + Decision::Deny => { + let message = format!( + "policy denied action '{}'{}{} for actor '{}'", + request.action, + request + .branch + .as_deref() + .map(|branch| format!(" on branch '{}'", branch)) + .unwrap_or_default(), + request + .target_branch + .as_deref() + .map(|branch| format!(" targeting branch '{}'", branch)) + .unwrap_or_default(), + request.actor_id + ); + self.deny(request, matched_rule_id, message) + } + }) + } + + pub fn validate_request(&self, request: &PolicyRequest) -> Result<()> { + let _ = self.authorize(request)?; + Ok(()) + } + + pub fn run_tests(&self, tests: &PolicyTestConfig) -> Result<()> { + if tests.version != 1 { + bail!("policy test version must be 1"); + } + let mut failures = Vec::new(); + for case in &tests.cases { + let decision = self.authorize(&PolicyRequest { + actor_id: case.actor.clone(), + action: case.action, + branch: case.branch.clone(), + target_branch: case.target_branch.clone(), + })?; + let expected_allowed = matches!(case.expect, PolicyExpectation::Allow); + if decision.allowed != expected_allowed { + failures.push(format!( + "{}: expected {:?} but got {}", + case.id, + case.expect, + if decision.allowed { "allow" } else { "deny" } + )); + } + } + if failures.is_empty() { + Ok(()) + } else { + bail!("policy tests failed:\n{}", failures.join("\n")) + } + } + + pub fn known_actor_count(&self) -> usize { + self.known_actors.len() + } + + fn deny( + &self, + _request: &PolicyRequest, + matched_rule_id: Option, + message: String, + ) -> PolicyDecision { + PolicyDecision { + allowed: false, + matched_rule_id, + message, + } + } +} + +fn compile_entities(config: &PolicyConfig, repo_id: &str, schema: &Schema) -> Result { + let 
mut group_entities = Vec::new(); + for group in config.groups.keys() { + group_entities.push(Entity::new( + entity_uid("Group", group)?, + HashMap::new(), + HashSet::::new(), + )?); + } + + let mut actor_groups: BTreeMap> = BTreeMap::new(); + for (group, members) in &config.groups { + for actor in members { + actor_groups + .entry(actor.clone()) + .or_default() + .insert(group.clone()); + } + } + + let mut actor_entities = Vec::new(); + for (actor, groups) in actor_groups { + let parents = groups + .iter() + .map(|group| entity_uid("Group", group)) + .collect::>>()?; + actor_entities.push(Entity::new( + entity_uid("Actor", &actor)?, + HashMap::new(), + parents, + )?); + } + + let repo_entity = Entity::new( + entity_uid("Repo", repo_id)?, + HashMap::new(), + HashSet::::new(), + )?; + + let mut entities = Vec::new(); + entities.extend(group_entities); + entities.extend(actor_entities); + entities.push(repo_entity); + Ok(Entities::from_entities(entities, Some(schema))?) +} + +fn compile_policies( + config: &PolicyConfig, + repo_id: &str, +) -> Result<(PolicySet, HashMap)> { + let mut policies = Vec::new(); + let mut policy_to_rule = HashMap::new(); + + for rule in &config.rules { + for action in &rule.allow.actions { + let policy_id = PolicyId::new(format!("{}:{}", rule.id, action.as_str())); + let source = compile_policy_source(rule, action, repo_id); + let policy = Policy::parse(Some(policy_id.clone()), source.as_str())?; + policy_to_rule.insert(policy_id.to_string(), rule.id.clone()); + policies.push(policy); + } + } + + Ok((PolicySet::from_policies(policies)?, policy_to_rule)) +} + +fn compile_policy_source(rule: &PolicyRule, action: &PolicyAction, repo_id: &str) -> String { + let mut conditions = Vec::new(); + if let Some(scope) = rule.allow.branch_scope { + conditions.push(branch_scope_condition(scope)); + } + if let Some(scope) = rule.allow.target_branch_scope { + conditions.push(target_branch_scope_condition(scope)); + } + + let when = if conditions.is_empty() 
{ + String::new() + } else { + format!("\nwhen {{ {} }}", conditions.join(" && ")) + }; + + format!( + r#"permit ( + principal in Omnigraph::Group::{group}, + action == Omnigraph::Action::{action}, + resource == Omnigraph::Repo::{repo} +){when};"#, + group = cedar_literal(&rule.allow.actors.group), + action = cedar_literal(action.as_str()), + repo = cedar_literal(repo_id), + when = when, + ) +} + +fn branch_scope_condition(scope: PolicyBranchScope) -> String { + match scope { + PolicyBranchScope::Any => "true".to_string(), + PolicyBranchScope::Protected => { + "context.has_branch && context.branch_is_protected".to_string() + } + PolicyBranchScope::Unprotected => { + "context.has_branch && context.branch_is_protected == false".to_string() + } + } +} + +fn target_branch_scope_condition(scope: PolicyBranchScope) -> String { + match scope { + PolicyBranchScope::Any => "true".to_string(), + PolicyBranchScope::Protected => { + "context.has_target_branch && context.target_branch_is_protected".to_string() + } + PolicyBranchScope::Unprotected => { + "context.has_target_branch && context.target_branch_is_protected == false".to_string() + } + } +} + +fn policy_schema_source() -> &'static str { + r#" +namespace Omnigraph { + type RequestContext = { + has_branch: Bool, + branch: String, + has_target_branch: Bool, + target_branch: String, + branch_is_protected: Bool, + target_branch_is_protected: Bool, + }; + + entity Actor in [Group]; + entity Group; + entity Repo; + + action "read" appliesTo { principal: Actor, resource: Repo, context: RequestContext }; + action "export" appliesTo { principal: Actor, resource: Repo, context: RequestContext }; + action "change" appliesTo { principal: Actor, resource: Repo, context: RequestContext }; + action "branch_create" appliesTo { principal: Actor, resource: Repo, context: RequestContext }; + action "branch_delete" appliesTo { principal: Actor, resource: Repo, context: RequestContext }; + action "branch_merge" appliesTo { principal: Actor, 
resource: Repo, context: RequestContext }; + action "run_publish" appliesTo { principal: Actor, resource: Repo, context: RequestContext }; + action "run_abort" appliesTo { principal: Actor, resource: Repo, context: RequestContext }; + action "admin" appliesTo { principal: Actor, resource: Repo, context: RequestContext }; +} +"# +} + +fn entity_uid(entity_type: &str, id: &str) -> Result { + let typename = EntityTypeName::from_str(&format!("Omnigraph::{entity_type}"))?; + let entity_id = EntityId::from_str(id).map_err(|err| eyre!(err.to_string()))?; + Ok(EntityUid::from_type_name_and_id(typename, entity_id)) +} + +fn cedar_literal(value: &str) -> String { + serde_json::to_string(value).expect("string literal should serialize") +} + +impl PolicyRequest { + pub fn actor_id(&self) -> &str { + &self.actor_id + } + + pub fn action(&self) -> PolicyAction { + self.action + } + + pub fn branch(&self) -> Option<&str> { + self.branch.as_deref() + } + + pub fn target_branch(&self) -> Option<&str> { + self.target_branch.as_deref() + } +} + +#[cfg(test)] +mod tests { + use super::{ + PolicyAction, PolicyCompiler, PolicyConfig, PolicyExpectation, PolicyRequest, + PolicyTestCase, PolicyTestConfig, + }; + + #[test] + fn rejects_duplicate_rule_ids() { + let policy: PolicyConfig = serde_yaml::from_str( + r#" +version: 1 +groups: + team: [act-andrew] +rules: + - id: same + allow: + actors: { group: team } + actions: [read] + branch_scope: any + - id: same + allow: + actors: { group: team } + actions: [export] + branch_scope: any +"#, + ) + .unwrap(); + + let err = policy.validate().unwrap_err(); + assert!(err.to_string().contains("duplicate policy rule id")); + } + + #[test] + fn rejects_unknown_group_references() { + let policy: PolicyConfig = serde_yaml::from_str( + r#" +version: 1 +groups: + team: [act-andrew] +rules: + - id: bad + allow: + actors: { group: admins } + actions: [read] + branch_scope: any +"#, + ) + .unwrap(); + + let err = policy.validate().unwrap_err(); + 
assert!(err.to_string().contains("references unknown group")); + } + + #[test] + fn rejects_invalid_scope_action_combinations() { + let policy: PolicyConfig = serde_yaml::from_str( + r#" +version: 1 +groups: + team: [act-andrew] +rules: + - id: bad + allow: + actors: { group: team } + actions: [branch_merge] + branch_scope: protected +"#, + ) + .unwrap(); + + let err = policy.validate().unwrap_err(); + assert!(err.to_string().contains("unsupported action")); + } + + #[test] + fn compiles_and_authorizes_branch_and_target_rules() { + let policy: PolicyConfig = serde_yaml::from_str( + r#" +version: 1 +groups: + team: [act-andrew, act-bruno] + admins: [act-andrew] +protected_branches: [main] +rules: + - id: team-read + allow: + actors: { group: team } + actions: [read, export] + branch_scope: any + - id: team-write + allow: + actors: { group: team } + actions: [change] + branch_scope: unprotected + - id: admins-promote + allow: + actors: { group: admins } + actions: [branch_delete, branch_merge, run_publish] + target_branch_scope: protected +"#, + ) + .unwrap(); + + let engine = PolicyCompiler::compile(&policy, "repo").unwrap(); + let allow = engine + .authorize(&PolicyRequest { + actor_id: "act-bruno".to_string(), + action: PolicyAction::Change, + branch: Some("feature".to_string()), + target_branch: None, + }) + .unwrap(); + assert!(allow.allowed); + assert_eq!(allow.matched_rule_id.as_deref(), Some("team-write")); + + let deny = engine + .authorize(&PolicyRequest { + actor_id: "act-bruno".to_string(), + action: PolicyAction::BranchDelete, + branch: None, + target_branch: Some("main".to_string()), + }) + .unwrap(); + assert!(!deny.allowed); + + let admin = engine + .authorize(&PolicyRequest { + actor_id: "act-andrew".to_string(), + action: PolicyAction::BranchDelete, + branch: None, + target_branch: Some("main".to_string()), + }) + .unwrap(); + assert!(admin.allowed); + assert_eq!(admin.matched_rule_id.as_deref(), Some("admins-promote")); + } + + #[test] + fn 
policy_tests_enforce_expected_outcomes() { + let policy: PolicyConfig = serde_yaml::from_str( + r#" +version: 1 +groups: + team: [act-andrew] +protected_branches: [main] +rules: + - id: team-read + allow: + actors: { group: team } + actions: [read] + branch_scope: any +"#, + ) + .unwrap(); + let engine = PolicyCompiler::compile(&policy, "repo").unwrap(); + let tests = PolicyTestConfig { + version: 1, + cases: vec![ + PolicyTestCase { + id: "allow-read".to_string(), + actor: "act-andrew".to_string(), + action: PolicyAction::Read, + branch: Some("main".to_string()), + target_branch: None, + expect: PolicyExpectation::Allow, + }, + PolicyTestCase { + id: "deny-change".to_string(), + actor: "act-andrew".to_string(), + action: PolicyAction::Change, + branch: Some("main".to_string()), + target_branch: None, + expect: PolicyExpectation::Deny, + }, + ], + }; + + engine.run_tests(&tests).unwrap(); + } +} diff --git a/crates/omnigraph-server/tests/server.rs b/crates/omnigraph-server/tests/server.rs new file mode 100644 index 0000000..69fa6c8 --- /dev/null +++ b/crates/omnigraph-server/tests/server.rs @@ -0,0 +1,1773 @@ +use std::env; +use std::fs; +use std::path::{Path, PathBuf}; + +use axum::Router; +use axum::body::{Body, to_bytes}; +use axum::http::{Method, Request, StatusCode}; +use omnigraph::db::{Omnigraph, ReadTarget}; +use omnigraph::loader::{LoadMode, load_jsonl}; +use omnigraph_server::api::{ + BranchCreateRequest, BranchMergeRequest, ChangeRequest, ErrorOutput, ExportRequest, + IngestRequest, ReadRequest, +}; +use omnigraph_server::{AppState, build_app}; +use serde_json::{Value, json}; +use serial_test::serial; +use tower::ServiceExt; + +const MUTATION_QUERIES: &str = r#" +query insert_person($name: String, $age: I32) { + insert Person { name: $name, age: $age } +} + +query set_age($name: String, $age: I32) { + update Person set { age: $age } where name = $name +} +"#; + +const POLICY_YAML: &str = r#" +version: 1 +groups: + team: [act-andrew, act-bruno, 
act-ragnor] + admins: [act-ragnor] +protected_branches: [main] +rules: + - id: team-read + allow: + actors: { group: team } + actions: [read] + branch_scope: any + - id: admins-export + allow: + actors: { group: admins } + actions: [export] + branch_scope: any + - id: team-write-unprotected + allow: + actors: { group: team } + actions: [change] + branch_scope: unprotected + - id: admins-merge + allow: + actors: { group: admins } + actions: [branch_delete, branch_merge] + target_branch_scope: protected + - id: admins-publish + allow: + actors: { group: admins } + actions: [run_publish] + target_branch_scope: protected +"#; + +const POLICY_PROTECTED_READ_YAML: &str = r#" +version: 1 +groups: + team: [act-bruno] +protected_branches: [main] +rules: + - id: protected-read + allow: + actors: { group: team } + actions: [read] + branch_scope: protected +"#; + +const INGEST_CREATE_ONLY_POLICY_YAML: &str = r#" +version: 1 +groups: + team: [act-bruno] +protected_branches: [main] +rules: + - id: team-branch-create + allow: + actors: { group: team } + actions: [branch_create] + target_branch_scope: unprotected +"#; + +fn fixture(name: &str) -> PathBuf { + PathBuf::from(env!("CARGO_MANIFEST_DIR")) + .join("../omnigraph/tests/fixtures") + .join(name) +} + +async fn init_loaded_repo() -> tempfile::TempDir { + init_repo_with_schema_and_data( + &fs::read_to_string(fixture("test.pg")).unwrap(), + &fs::read_to_string(fixture("test.jsonl")).unwrap(), + ) + .await +} + +async fn init_repo_with_schema_and_data(schema: &str, data: &str) -> tempfile::TempDir { + let temp = tempfile::tempdir().unwrap(); + let repo = repo_path(temp.path()); + fs::create_dir_all(&repo).unwrap(); + Omnigraph::init(repo.to_str().unwrap(), schema) + .await + .unwrap(); + let mut db = Omnigraph::open(repo.to_str().unwrap()).await.unwrap(); + load_jsonl(&mut db, data, LoadMode::Overwrite) + .await + .unwrap(); + temp +} + +fn repo_path(root: &Path) -> PathBuf { + root.join("server.omni") +} + +fn 
drifted_test_schema() -> String { + fs::read_to_string(fixture("test.pg")) + .unwrap() + .replace("age: I32?", "age: I64?") +} + +async fn manifest_dataset_version(repo: &Path) -> u64 { + Omnigraph::open(repo.to_string_lossy().as_ref()) + .await + .unwrap() + .snapshot_of(ReadTarget::branch("main")) + .await + .unwrap() + .version() +} + +fn s3_test_repo_uri(suite: &str) -> Option { + let bucket = env::var("OMNIGRAPH_S3_TEST_BUCKET").ok()?; + let prefix = env::var("OMNIGRAPH_S3_TEST_PREFIX") + .ok() + .filter(|value| !value.trim().is_empty()) + .unwrap_or_else(|| "omnigraph-itests".to_string()); + let unique = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .ok()? + .as_nanos(); + Some(format!("s3://{}/{}/{}/{}", bucket, prefix, suite, unique)) +} + +async fn app_for_loaded_repo() -> (tempfile::TempDir, Router) { + let temp = init_loaded_repo().await; + let repo = repo_path(temp.path()); + let state = AppState::open(repo.to_string_lossy().to_string()) + .await + .unwrap(); + (temp, build_app(state)) +} + +async fn app_for_loaded_repo_with_auth(token: &str) -> (tempfile::TempDir, Router) { + let temp = init_loaded_repo().await; + let repo = repo_path(temp.path()); + let db = Omnigraph::open(repo.to_str().unwrap()).await.unwrap(); + let state = AppState::new_with_bearer_token( + repo.to_string_lossy().to_string(), + db, + Some(token.to_string()), + ); + (temp, build_app(state)) +} + +async fn app_for_loaded_repo_with_auth_tokens( + tokens: &[(&str, &str)], +) -> (tempfile::TempDir, Router) { + let temp = init_loaded_repo().await; + let repo = repo_path(temp.path()); + let db = Omnigraph::open(repo.to_str().unwrap()).await.unwrap(); + let state = AppState::new_with_bearer_tokens( + repo.to_string_lossy().to_string(), + db, + tokens + .iter() + .map(|(actor, token)| ((*actor).to_string(), (*token).to_string())) + .collect(), + ); + (temp, build_app(state)) +} + +async fn app_for_loaded_repo_with_auth_tokens_and_policy( + tokens: &[(&str, &str)], 
+ policy: &str, +) -> (tempfile::TempDir, Router) { + let temp = init_loaded_repo().await; + let repo = repo_path(temp.path()); + let policy_path = temp.path().join("policy.yaml"); + fs::write(&policy_path, policy).unwrap(); + let state = AppState::open_with_bearer_tokens_and_policy( + repo.to_string_lossy().to_string(), + tokens + .iter() + .map(|(actor, token)| ((*actor).to_string(), (*token).to_string())) + .collect(), + Some(&policy_path), + ) + .await + .unwrap(); + (temp, build_app(state)) +} + +async fn json_response(app: &Router, request: Request) -> (StatusCode, Value) { + let response = app.clone().oneshot(request).await.unwrap(); + let status = response.status(); + let body = to_bytes(response.into_body(), usize::MAX).await.unwrap(); + let value = serde_json::from_slice(&body).unwrap(); + (status, value) +} + +struct EnvGuard { + saved: Vec<(&'static str, Option)>, +} + +impl EnvGuard { + fn set(vars: &[(&'static str, Option<&str>)]) -> Self { + let saved = vars + .iter() + .map(|(name, _)| (*name, env::var(name).ok())) + .collect::>(); + for (name, value) in vars { + unsafe { + match value { + Some(value) => env::set_var(name, value), + None => env::remove_var(name), + } + } + } + Self { saved } + } +} + +impl Drop for EnvGuard { + fn drop(&mut self) { + for (name, value) in self.saved.drain(..) 
{ + unsafe { + match value { + Some(value) => env::set_var(name, value), + None => env::remove_var(name), + } + } + } + } +} + +fn format_vector(values: &[f32]) -> String { + values + .iter() + .map(|value| format!("{:.8}", value)) + .collect::>() + .join(", ") +} + +fn normalize_vector(mut values: Vec) -> Vec { + let norm = values + .iter() + .map(|value| (*value as f64) * (*value as f64)) + .sum::() + .sqrt() as f32; + if norm > f32::EPSILON { + for value in &mut values { + *value /= norm; + } + } + values +} + +fn fnv1a64(bytes: &[u8]) -> u64 { + let mut hash = 14695981039346656037u64; + for byte in bytes { + hash ^= *byte as u64; + hash = hash.wrapping_mul(1099511628211u64); + } + hash +} + +fn xorshift64(mut x: u64) -> u64 { + x ^= x << 13; + x ^= x >> 7; + x ^= x << 17; + x +} + +fn mock_embedding(input: &str, dim: usize) -> Vec { + let mut seed = fnv1a64(input.as_bytes()); + let mut out = Vec::with_capacity(dim); + for _ in 0..dim { + seed = xorshift64(seed); + let ratio = (seed as f64 / u64::MAX as f64) as f32; + out.push((ratio * 2.0) - 1.0); + } + normalize_vector(out) +} + +#[tokio::test(flavor = "multi_thread")] +async fn healthz_succeeds_after_startup() { + let (_temp, app) = app_for_loaded_repo().await; + let (status, body) = json_response( + &app, + Request::builder() + .uri("/healthz") + .method(Method::GET) + .body(Body::empty()) + .unwrap(), + ) + .await; + + assert_eq!(status, StatusCode::OK); + assert_eq!(body["status"], "ok"); + assert_eq!(body["version"], env!("CARGO_PKG_VERSION")); + match option_env!("OMNIGRAPH_SOURCE_VERSION") { + Some(source_version) => assert_eq!(body["source_version"], source_version), + None => assert!(body.get("source_version").is_none()), + } +} + +#[tokio::test(flavor = "multi_thread")] +async fn schema_drift_returns_conflict_for_snapshot_read_and_change() { + let (temp, app) = app_for_loaded_repo().await; + let repo = repo_path(temp.path()); + fs::write(repo.join("_schema.pg"), drifted_test_schema()).unwrap(); + + 
let (snapshot_status, snapshot_body) = json_response( + &app, + Request::builder() + .uri("/snapshot?branch=main") + .method(Method::GET) + .body(Body::empty()) + .unwrap(), + ) + .await; + let snapshot_error: ErrorOutput = serde_json::from_value(snapshot_body).unwrap(); + assert_eq!(snapshot_status, StatusCode::CONFLICT); + assert_eq!( + snapshot_error.code, + Some(omnigraph_server::api::ErrorCode::Conflict) + ); + assert!( + snapshot_error + .error + .contains("schema evolution is locked down in phase 1") + ); + + let read = ReadRequest { + query_source: fs::read_to_string(fixture("test.gq")).unwrap(), + query_name: Some("get_person".to_string()), + params: Some(json!({ "name": "Alice" })), + branch: Some("main".to_string()), + snapshot: None, + }; + let (read_status, read_body) = json_response( + &app, + Request::builder() + .uri("/read") + .method(Method::POST) + .header("content-type", "application/json") + .body(Body::from(serde_json::to_vec(&read).unwrap())) + .unwrap(), + ) + .await; + let read_error: ErrorOutput = serde_json::from_value(read_body).unwrap(); + assert_eq!(read_status, StatusCode::CONFLICT); + assert_eq!( + read_error.code, + Some(omnigraph_server::api::ErrorCode::Conflict) + ); + assert!( + read_error + .error + .contains("schema evolution is locked down in phase 1") + ); + + let change = ChangeRequest { + query_source: MUTATION_QUERIES.to_string(), + query_name: Some("insert_person".to_string()), + params: Some(json!({ "name": "Mina", "age": 28 })), + branch: Some("main".to_string()), + }; + let (change_status, change_body) = json_response( + &app, + Request::builder() + .uri("/change") + .method(Method::POST) + .header("content-type", "application/json") + .body(Body::from(serde_json::to_vec(&change).unwrap())) + .unwrap(), + ) + .await; + let change_error: ErrorOutput = serde_json::from_value(change_body).unwrap(); + assert_eq!(change_status, StatusCode::CONFLICT); + assert_eq!( + change_error.code, + 
Some(omnigraph_server::api::ErrorCode::Conflict) + ); + assert!( + change_error + .error + .contains("schema evolution is locked down in phase 1") + ); +} + +#[tokio::test(flavor = "multi_thread")] +async fn protected_routes_require_bearer_token() { + let (_temp, app) = app_for_loaded_repo_with_auth("demo-token").await; + let (status, body) = json_response( + &app, + Request::builder() + .uri("/runs") + .method(Method::GET) + .body(Body::empty()) + .unwrap(), + ) + .await; + + let error: ErrorOutput = serde_json::from_value(body).unwrap(); + assert_eq!(status, StatusCode::UNAUTHORIZED); + assert_eq!( + error.code, + Some(omnigraph_server::api::ErrorCode::Unauthorized) + ); +} + +#[tokio::test(flavor = "multi_thread")] +async fn protected_routes_accept_valid_bearer_token_while_healthz_stays_open() { + let (_temp, app) = app_for_loaded_repo_with_auth("demo-token").await; + + let health = app + .clone() + .oneshot( + Request::builder() + .uri("/healthz") + .method(Method::GET) + .body(Body::empty()) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!(health.status(), StatusCode::OK); + + let (status, body) = json_response( + &app, + Request::builder() + .uri("/runs") + .method(Method::GET) + .header("authorization", "Bearer demo-token") + .body(Body::empty()) + .unwrap(), + ) + .await; + + assert_eq!(status, StatusCode::OK); + assert!(body["runs"].is_array()); +} + +#[tokio::test(flavor = "multi_thread")] +async fn export_route_returns_jsonl_for_branch_snapshot() { + let token = "demo-token"; + let temp = init_loaded_repo().await; + let repo = repo_path(temp.path()); + let mut db = Omnigraph::open(repo.to_str().unwrap()).await.unwrap(); + db.branch_create_from(ReadTarget::branch("main"), "feature") + .await + .unwrap(); + db.load( + "feature", + r#"{"type":"Person","data":{"name":"Eve","age":29}}"#, + LoadMode::Append, + ) + .await + .unwrap(); + let expected = db + .export_jsonl("feature", &["Person".to_string()], &[]) + .await + .unwrap(); + drop(db); + + let state 
= AppState::new_with_bearer_token( + repo.to_string_lossy().to_string(), + Omnigraph::open(repo.to_str().unwrap()).await.unwrap(), + Some(token.to_string()), + ); + let app = build_app(state); + + let response = app + .clone() + .oneshot( + Request::builder() + .uri("/export") + .method(Method::POST) + .header("content-type", "application/json") + .header("authorization", format!("Bearer {}", token)) + .body(Body::from( + serde_json::to_vec(&ExportRequest { + branch: Some("feature".to_string()), + type_names: vec!["Person".to_string()], + table_keys: Vec::new(), + }) + .unwrap(), + )) + .unwrap(), + ) + .await + .unwrap(); + + assert_eq!(response.status(), StatusCode::OK); + assert_eq!( + response.headers().get("content-type").unwrap(), + "application/x-ndjson; charset=utf-8" + ); + let body = to_bytes(response.into_body(), usize::MAX).await.unwrap(); + let text = String::from_utf8(body.to_vec()).unwrap(); + assert_eq!(text, expected); +} + +#[tokio::test(flavor = "multi_thread")] +async fn protected_routes_accept_any_configured_team_bearer_token() { + let (_temp, app) = + app_for_loaded_repo_with_auth_tokens(&[("team-01", "token-one"), ("team-02", "token-two")]) + .await; + + let (status, body) = json_response( + &app, + Request::builder() + .uri("/runs") + .method(Method::GET) + .header("authorization", "Bearer token-two") + .body(Body::empty()) + .unwrap(), + ) + .await; + + assert_eq!(status, StatusCode::OK); + assert!(body["runs"].is_array()); +} + +#[tokio::test(flavor = "multi_thread")] +async fn policy_allows_read_but_distinguishes_401_from_403() { + let (_temp, app) = app_for_loaded_repo_with_auth_tokens_and_policy( + &[("act-bruno", "team-token"), ("act-ragnor", "admin-token")], + POLICY_YAML, + ) + .await; + + let (missing_status, missing_body) = json_response( + &app, + Request::builder() + .uri("/snapshot?branch=main") + .method(Method::GET) + .body(Body::empty()) + .unwrap(), + ) + .await; + let missing_error: ErrorOutput = 
serde_json::from_value(missing_body).unwrap(); + assert_eq!(missing_status, StatusCode::UNAUTHORIZED); + assert_eq!( + missing_error.code, + Some(omnigraph_server::api::ErrorCode::Unauthorized) + ); + + let (snapshot_status, snapshot_body) = json_response( + &app, + Request::builder() + .uri("/snapshot?branch=main") + .method(Method::GET) + .header("authorization", "Bearer team-token") + .body(Body::empty()) + .unwrap(), + ) + .await; + assert_eq!(snapshot_status, StatusCode::OK); + assert_eq!(snapshot_body["branch"], "main"); + + let export_request = ExportRequest { + branch: Some("main".to_string()), + type_names: Vec::new(), + table_keys: Vec::new(), + }; + let (forbidden_status, forbidden_body) = json_response( + &app, + Request::builder() + .uri("/export") + .method(Method::POST) + .header("authorization", "Bearer team-token") + .header("content-type", "application/json") + .body(Body::from(serde_json::to_vec(&export_request).unwrap())) + .unwrap(), + ) + .await; + let forbidden_error: ErrorOutput = serde_json::from_value(forbidden_body).unwrap(); + assert_eq!(forbidden_status, StatusCode::FORBIDDEN); + assert_eq!( + forbidden_error.code, + Some(omnigraph_server::api::ErrorCode::Forbidden) + ); + + let response = app + .clone() + .oneshot( + Request::builder() + .uri("/export") + .method(Method::POST) + .header("authorization", "Bearer admin-token") + .header("content-type", "application/json") + .body(Body::from(serde_json::to_vec(&export_request).unwrap())) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!(response.status(), StatusCode::OK); +} + +#[tokio::test(flavor = "multi_thread")] +async fn policy_uses_resolved_branch_for_snapshot_reads() { + let temp = init_loaded_repo().await; + let repo = repo_path(temp.path()); + let snapshot_id = { + let db = Omnigraph::open(repo.to_str().unwrap()).await.unwrap(); + db.resolve_snapshot("main").await.unwrap().to_string() + }; + let policy_path = temp.path().join("policy.yaml"); + fs::write(&policy_path, 
POLICY_PROTECTED_READ_YAML).unwrap(); + let state = AppState::open_with_bearer_tokens_and_policy( + repo.to_string_lossy().to_string(), + vec![("act-bruno".to_string(), "team-token".to_string())], + Some(&policy_path), + ) + .await + .unwrap(); + let app = build_app(state); + + let read = ReadRequest { + query_source: fs::read_to_string(fixture("test.gq")).unwrap(), + query_name: Some("get_person".to_string()), + params: Some(json!({ "name": "Alice" })), + branch: None, + snapshot: Some(snapshot_id), + }; + let (status, body) = json_response( + &app, + Request::builder() + .uri("/read") + .method(Method::POST) + .header("authorization", "Bearer team-token") + .header("content-type", "application/json") + .body(Body::from(serde_json::to_vec(&read).unwrap())) + .unwrap(), + ) + .await; + + assert_eq!(status, StatusCode::OK); + assert_eq!(body["target"]["branch"], Value::Null); + assert_eq!( + body["target"]["snapshot"].as_str(), + read.snapshot.as_deref() + ); + assert_eq!(body["row_count"], 1); +} + +#[tokio::test(flavor = "multi_thread")] +async fn snapshot_route_returns_manifest_dataset_version() { + let (temp, app) = app_for_loaded_repo().await; + let repo = repo_path(temp.path()); + let expected_manifest_version = manifest_dataset_version(&repo).await; + + let (snapshot_status, snapshot_body) = json_response( + &app, + Request::builder() + .uri("/snapshot?branch=main") + .method(Method::GET) + .body(Body::empty()) + .unwrap(), + ) + .await; + + assert_eq!(snapshot_status, StatusCode::OK); + assert_eq!(snapshot_body["branch"], "main"); + assert_eq!( + snapshot_body["manifest_version"].as_u64().unwrap(), + expected_manifest_version + ); + assert!(snapshot_body["tables"].is_array()); +} + +#[tokio::test(flavor = "multi_thread")] +async fn policy_blocks_change_on_protected_main_but_allows_unprotected_branch() { + let temp = init_loaded_repo().await; + let repo = repo_path(temp.path()); + let mut db = Omnigraph::open(repo.to_str().unwrap()).await.unwrap(); + 
db.branch_create_from(ReadTarget::branch("main"), "feature") + .await + .unwrap(); + drop(db); + + let policy_path = temp.path().join("policy.yaml"); + fs::write(&policy_path, POLICY_YAML).unwrap(); + let state = AppState::open_with_bearer_tokens_and_policy( + repo.to_string_lossy().to_string(), + vec![("act-bruno".to_string(), "team-token".to_string())], + Some(&policy_path), + ) + .await + .unwrap(); + let app = build_app(state); + + let main_change = ChangeRequest { + query_source: MUTATION_QUERIES.to_string(), + query_name: Some("insert_person".to_string()), + params: Some(json!({ "name": "Mina", "age": 28 })), + branch: Some("main".to_string()), + }; + let (main_status, main_body) = json_response( + &app, + Request::builder() + .uri("/change") + .method(Method::POST) + .header("authorization", "Bearer team-token") + .header("content-type", "application/json") + .body(Body::from(serde_json::to_vec(&main_change).unwrap())) + .unwrap(), + ) + .await; + let main_error: ErrorOutput = serde_json::from_value(main_body).unwrap(); + assert_eq!(main_status, StatusCode::FORBIDDEN); + assert_eq!( + main_error.code, + Some(omnigraph_server::api::ErrorCode::Forbidden) + ); + + let feature_change = ChangeRequest { + query_source: MUTATION_QUERIES.to_string(), + query_name: Some("insert_person".to_string()), + params: Some(json!({ "name": "Mina", "age": 28 })), + branch: Some("feature".to_string()), + }; + let (feature_status, feature_body) = json_response( + &app, + Request::builder() + .uri("/change") + .method(Method::POST) + .header("authorization", "Bearer team-token") + .header("content-type", "application/json") + .body(Body::from(serde_json::to_vec(&feature_change).unwrap())) + .unwrap(), + ) + .await; + assert_eq!(feature_status, StatusCode::OK); + assert_eq!(feature_body["branch"], "feature"); + assert_eq!(feature_body["affected_nodes"], 1); +} + +#[tokio::test(flavor = "multi_thread")] +async fn policy_blocks_non_admin_merge_to_main_and_allows_admin() { + let temp 
= init_loaded_repo().await; + let repo = repo_path(temp.path()); + let mut db = Omnigraph::open(repo.to_str().unwrap()).await.unwrap(); + db.branch_create_from(ReadTarget::branch("main"), "feature") + .await + .unwrap(); + db.load( + "feature", + r#"{"type":"Person","data":{"name":"Zoe","age":33}}"#, + LoadMode::Append, + ) + .await + .unwrap(); + drop(db); + + let policy_path = temp.path().join("policy.yaml"); + fs::write(&policy_path, POLICY_YAML).unwrap(); + let state = AppState::open_with_bearer_tokens_and_policy( + repo.to_string_lossy().to_string(), + vec![ + ("act-bruno".to_string(), "team-token".to_string()), + ("act-ragnor".to_string(), "admin-token".to_string()), + ], + Some(&policy_path), + ) + .await + .unwrap(); + let app = build_app(state); + + let merge = BranchMergeRequest { + source: "feature".to_string(), + target: Some("main".to_string()), + }; + let (deny_status, deny_body) = json_response( + &app, + Request::builder() + .uri("/branches/merge") + .method(Method::POST) + .header("authorization", "Bearer team-token") + .header("content-type", "application/json") + .body(Body::from(serde_json::to_vec(&merge).unwrap())) + .unwrap(), + ) + .await; + let deny_error: ErrorOutput = serde_json::from_value(deny_body).unwrap(); + assert_eq!(deny_status, StatusCode::FORBIDDEN); + assert_eq!( + deny_error.code, + Some(omnigraph_server::api::ErrorCode::Forbidden) + ); + + let (allow_status, allow_body) = json_response( + &app, + Request::builder() + .uri("/branches/merge") + .method(Method::POST) + .header("authorization", "Bearer admin-token") + .header("content-type", "application/json") + .body(Body::from(serde_json::to_vec(&merge).unwrap())) + .unwrap(), + ) + .await; + assert_eq!(allow_status, StatusCode::OK); + assert_eq!(allow_body["actor_id"], "act-ragnor"); +} + +#[tokio::test(flavor = "multi_thread")] +async fn policy_blocks_non_admin_run_publish_to_main() { + let temp = init_loaded_repo().await; + let repo = repo_path(temp.path()); + let run_id = { 
+ let mut db = Omnigraph::open(repo.to_str().unwrap()).await.unwrap(); + db.begin_run("main", Some("policy-publish")) + .await + .unwrap() + .run_id + .as_str() + .to_string() + }; + + let policy_path = temp.path().join("policy.yaml"); + fs::write(&policy_path, POLICY_YAML).unwrap(); + let state = AppState::open_with_bearer_tokens_and_policy( + repo.to_string_lossy().to_string(), + vec![ + ("act-bruno".to_string(), "team-token".to_string()), + ("act-ragnor".to_string(), "admin-token".to_string()), + ], + Some(&policy_path), + ) + .await + .unwrap(); + let app = build_app(state); + + let (deny_status, deny_body) = json_response( + &app, + Request::builder() + .uri(format!("/runs/{run_id}/publish")) + .method(Method::POST) + .header("authorization", "Bearer team-token") + .body(Body::empty()) + .unwrap(), + ) + .await; + let deny_error: ErrorOutput = serde_json::from_value(deny_body).unwrap(); + assert_eq!(deny_status, StatusCode::FORBIDDEN); + assert_eq!( + deny_error.code, + Some(omnigraph_server::api::ErrorCode::Forbidden) + ); + + let (allow_status, allow_body) = json_response( + &app, + Request::builder() + .uri(format!("/runs/{run_id}/publish")) + .method(Method::POST) + .header("authorization", "Bearer admin-token") + .body(Body::empty()) + .unwrap(), + ) + .await; + assert_eq!(allow_status, StatusCode::OK); + assert_eq!(allow_body["target_branch"], "main"); +} + +#[tokio::test(flavor = "multi_thread")] +async fn authenticated_change_stamps_actor_on_runs_and_commits() { + let (_temp, app) = app_for_loaded_repo_with_auth_tokens(&[("act-andrew", "token-one")]).await; + + let change = ChangeRequest { + query_source: MUTATION_QUERIES.to_string(), + query_name: Some("insert_person".to_string()), + params: Some(json!({ "name": "Mina", "age": 28 })), + branch: Some("main".to_string()), + }; + let (change_status, change_body) = json_response( + &app, + Request::builder() + .uri("/change") + .method(Method::POST) + .header("authorization", "Bearer token-one") + 
.header("content-type", "application/json") + .body(Body::from(serde_json::to_vec(&change).unwrap())) + .unwrap(), + ) + .await; + assert_eq!(change_status, StatusCode::OK); + assert_eq!(change_body["actor_id"], "act-andrew"); + + let (runs_status, runs_body) = json_response( + &app, + Request::builder() + .uri("/runs") + .method(Method::GET) + .header("authorization", "Bearer token-one") + .body(Body::empty()) + .unwrap(), + ) + .await; + assert_eq!(runs_status, StatusCode::OK); + let run = runs_body["runs"] + .as_array() + .unwrap() + .iter() + .find(|run| run["operation_hash"] == "mutation:insert_person:branch=main") + .expect("mutation run should be present"); + assert_eq!(run["actor_id"], "act-andrew"); + assert_eq!(run["status"], "published"); + + let (commits_status, commits_body) = json_response( + &app, + Request::builder() + .uri("/commits?branch=main") + .method(Method::GET) + .header("authorization", "Bearer token-one") + .body(Body::empty()) + .unwrap(), + ) + .await; + assert_eq!(commits_status, StatusCode::OK); + let head = commits_body["commits"] + .as_array() + .unwrap() + .last() + .expect("head commit should exist"); + assert_eq!(head["actor_id"], "act-andrew"); +} + +#[tokio::test(flavor = "multi_thread")] +async fn ingest_creates_branch_returns_metadata_and_stamps_actor() { + let (temp, app) = app_for_loaded_repo_with_auth_tokens(&[("act-andrew", "token-one")]).await; + let repo = repo_path(temp.path()); + let ingest = IngestRequest { + branch: Some("feature-ingest".to_string()), + from: Some("main".to_string()), + mode: Some(LoadMode::Merge), + data: r#"{"type":"Person","data":{"name":"Zoe","age":33}} +{"type":"Person","data":{"name":"Bob","age":26}}"# + .to_string(), + }; + + let (status, body) = json_response( + &app, + Request::builder() + .uri("/ingest") + .method(Method::POST) + .header("authorization", "Bearer token-one") + .header("content-type", "application/json") + .body(Body::from(serde_json::to_vec(&ingest).unwrap())) + .unwrap(), 
+ ) + .await; + assert_eq!(status, StatusCode::OK); + assert_eq!(body["branch"], "feature-ingest"); + assert_eq!(body["base_branch"], "main"); + assert_eq!(body["branch_created"], true); + assert_eq!(body["mode"], "merge"); + assert_eq!(body["actor_id"], "act-andrew"); + assert_eq!(body["tables"][0]["table_key"], "node:Person"); + assert_eq!(body["tables"][0]["rows_loaded"], 2); + + let db = Omnigraph::open(repo.to_str().unwrap()).await.unwrap(); + let snapshot = db + .snapshot_of(ReadTarget::branch("feature-ingest")) + .await + .unwrap(); + let person_ds = snapshot.open("node:Person").await.unwrap(); + assert_eq!(person_ds.count_rows(None).await.unwrap(), 5); + let head = db + .list_commits(Some("feature-ingest")) + .await + .unwrap() + .into_iter() + .last() + .unwrap(); + assert_eq!(head.actor_id.as_deref(), Some("act-andrew")); +} + +#[tokio::test(flavor = "multi_thread")] +async fn ingest_existing_branch_skips_branch_create_policy_check() { + let temp = init_loaded_repo().await; + let repo = repo_path(temp.path()); + { + let mut db = Omnigraph::open(repo.to_str().unwrap()).await.unwrap(); + db.branch_create_from(ReadTarget::branch("main"), "feature") + .await + .unwrap(); + } + let policy_path = temp.path().join("policy.yaml"); + fs::write(&policy_path, POLICY_YAML).unwrap(); + let state = AppState::open_with_bearer_tokens_and_policy( + repo.to_string_lossy().to_string(), + vec![("act-bruno".to_string(), "team-token".to_string())], + Some(&policy_path), + ) + .await + .unwrap(); + let app = build_app(state); + let ingest = IngestRequest { + branch: Some("feature".to_string()), + from: Some("other-base".to_string()), + mode: Some(LoadMode::Merge), + data: r#"{"type":"Person","data":{"name":"Zoe","age":33}}"#.to_string(), + }; + + let (status, body) = json_response( + &app, + Request::builder() + .uri("/ingest") + .method(Method::POST) + .header("authorization", "Bearer team-token") + .header("content-type", "application/json") + 
.body(Body::from(serde_json::to_vec(&ingest).unwrap())) + .unwrap(), + ) + .await; + assert_eq!(status, StatusCode::OK); + assert_eq!(body["branch"], "feature"); + assert_eq!(body["branch_created"], false); + assert_eq!(body["base_branch"], "other-base"); +} + +#[tokio::test(flavor = "multi_thread")] +async fn ingest_denies_missing_branch_without_branch_create_permission() { + let (_temp, app) = app_for_loaded_repo_with_auth_tokens_and_policy( + &[("act-bruno", "team-token")], + POLICY_YAML, + ) + .await; + let ingest = IngestRequest { + branch: Some("feature".to_string()), + from: Some("main".to_string()), + mode: Some(LoadMode::Merge), + data: r#"{"type":"Person","data":{"name":"Zoe","age":33}}"#.to_string(), + }; + + let (status, body) = json_response( + &app, + Request::builder() + .uri("/ingest") + .method(Method::POST) + .header("authorization", "Bearer team-token") + .header("content-type", "application/json") + .body(Body::from(serde_json::to_vec(&ingest).unwrap())) + .unwrap(), + ) + .await; + let error: ErrorOutput = serde_json::from_value(body).unwrap(); + assert_eq!(status, StatusCode::FORBIDDEN); + assert_eq!( + error.code, + Some(omnigraph_server::api::ErrorCode::Forbidden) + ); +} + +#[tokio::test(flavor = "multi_thread")] +async fn ingest_denies_when_actor_lacks_change_permission() { + let (_temp, app) = app_for_loaded_repo_with_auth_tokens_and_policy( + &[("act-bruno", "team-token")], + INGEST_CREATE_ONLY_POLICY_YAML, + ) + .await; + let ingest = IngestRequest { + branch: Some("feature".to_string()), + from: Some("main".to_string()), + mode: Some(LoadMode::Merge), + data: r#"{"type":"Person","data":{"name":"Zoe","age":33}}"#.to_string(), + }; + + let (status, body) = json_response( + &app, + Request::builder() + .uri("/ingest") + .method(Method::POST) + .header("authorization", "Bearer team-token") + .header("content-type", "application/json") + .body(Body::from(serde_json::to_vec(&ingest).unwrap())) + .unwrap(), + ) + .await; + let error: 
ErrorOutput = serde_json::from_value(body).unwrap(); + assert_eq!(status, StatusCode::FORBIDDEN); + assert_eq!( + error.code, + Some(omnigraph_server::api::ErrorCode::Forbidden) + ); +} + +#[tokio::test(flavor = "multi_thread")] +async fn ingest_rejects_payloads_over_32_mib() { + let (_temp, app) = app_for_loaded_repo().await; + let oversize = IngestRequest { + branch: Some("feature".to_string()), + from: Some("main".to_string()), + mode: Some(LoadMode::Merge), + data: "x".repeat(33 * 1024 * 1024), + }; + + let response = app + .clone() + .oneshot( + Request::builder() + .uri("/ingest") + .method(Method::POST) + .header("content-type", "application/json") + .body(Body::from(serde_json::to_vec(&oversize).unwrap())) + .unwrap(), + ) + .await + .unwrap(); + assert_eq!(response.status(), StatusCode::PAYLOAD_TOO_LARGE); +} + +#[tokio::test(flavor = "multi_thread")] +async fn authenticated_branch_merge_stamps_merge_actor_on_head_commit() { + let (_temp, app) = app_for_loaded_repo_with_auth_tokens(&[ + ("act-andrew", "token-one"), + ("act-ragnor", "token-two"), + ]) + .await; + + let create = BranchCreateRequest { + from: Some("main".to_string()), + name: "feature".to_string(), + }; + let (create_status, _) = json_response( + &app, + Request::builder() + .uri("/branches") + .method(Method::POST) + .header("authorization", "Bearer token-one") + .header("content-type", "application/json") + .body(Body::from(serde_json::to_vec(&create).unwrap())) + .unwrap(), + ) + .await; + assert_eq!(create_status, StatusCode::OK); + + let change = ChangeRequest { + query_source: MUTATION_QUERIES.to_string(), + query_name: Some("insert_person".to_string()), + params: Some(json!({ "name": "Zoe", "age": 33 })), + branch: Some("feature".to_string()), + }; + let (change_status, _) = json_response( + &app, + Request::builder() + .uri("/change") + .method(Method::POST) + .header("authorization", "Bearer token-one") + .header("content-type", "application/json") + 
.body(Body::from(serde_json::to_vec(&change).unwrap())) + .unwrap(), + ) + .await; + assert_eq!(change_status, StatusCode::OK); + + let merge = BranchMergeRequest { + source: "feature".to_string(), + target: Some("main".to_string()), + }; + let (merge_status, merge_body) = json_response( + &app, + Request::builder() + .uri("/branches/merge") + .method(Method::POST) + .header("authorization", "Bearer token-two") + .header("content-type", "application/json") + .body(Body::from(serde_json::to_vec(&merge).unwrap())) + .unwrap(), + ) + .await; + assert_eq!(merge_status, StatusCode::OK); + assert_eq!(merge_body["actor_id"], "act-ragnor"); + + let (commit_status, commit_body) = json_response( + &app, + Request::builder() + .uri("/commits?branch=main") + .method(Method::GET) + .header("authorization", "Bearer token-two") + .body(Body::empty()) + .unwrap(), + ) + .await; + assert_eq!(commit_status, StatusCode::OK); + let head = commit_body["commits"] + .as_array() + .unwrap() + .last() + .expect("head commit should exist"); + assert_eq!(head["actor_id"], "act-ragnor"); +} + +#[tokio::test(flavor = "multi_thread")] +async fn branch_merge_conflict_response_includes_structured_conflicts() { + let temp = init_loaded_repo().await; + let repo = repo_path(temp.path()); + let mut db = Omnigraph::open(repo.to_str().unwrap()).await.unwrap(); + db.branch_create_from(ReadTarget::branch("main"), "feature") + .await + .unwrap(); + db.mutate( + "main", + MUTATION_QUERIES, + "set_age", + &omnigraph_compiler::json_params_to_param_map( + Some(&json!({"name": "Alice", "age": 31 })), + &omnigraph_compiler::find_named_query(MUTATION_QUERIES, "set_age") + .unwrap() + .params, + omnigraph_compiler::JsonParamMode::Standard, + ) + .unwrap(), + ) + .await + .unwrap(); + db.mutate( + "feature", + MUTATION_QUERIES, + "set_age", + &omnigraph_compiler::json_params_to_param_map( + Some(&json!({"name": "Alice", "age": 32 })), + &omnigraph_compiler::find_named_query(MUTATION_QUERIES, "set_age") + .unwrap() 
+ .params, + omnigraph_compiler::JsonParamMode::Standard, + ) + .unwrap(), + ) + .await + .unwrap(); + drop(db); + + let state = AppState::open(repo.to_string_lossy().to_string()) + .await + .unwrap(); + let app = build_app(state); + let merge = BranchMergeRequest { + source: "feature".to_string(), + target: Some("main".to_string()), + }; + let (status, body) = json_response( + &app, + Request::builder() + .uri("/branches/merge") + .method(Method::POST) + .header("content-type", "application/json") + .body(Body::from(serde_json::to_vec(&merge).unwrap())) + .unwrap(), + ) + .await; + + let error: ErrorOutput = serde_json::from_value(body).unwrap(); + assert_eq!(status, StatusCode::CONFLICT); + assert_eq!(error.code, Some(omnigraph_server::api::ErrorCode::Conflict)); + assert!(error.error.contains("merge conflict")); + assert!(error.merge_conflicts.iter().any(|conflict| { + conflict.table_key == "node:Person" + && conflict.row_id.as_deref() == Some("Alice") + && conflict.kind == omnigraph_server::api::MergeConflictKindOutput::DivergentUpdate + })); +} + +#[tokio::test(flavor = "multi_thread")] +async fn repeated_read_after_change_sees_updated_state_from_same_app() { + let (_temp, app) = app_for_loaded_repo().await; + + let change = ChangeRequest { + query_source: MUTATION_QUERIES.to_string(), + query_name: Some("insert_person".to_string()), + params: Some(json!({ "name": "Mina", "age": 28 })), + branch: Some("main".to_string()), + }; + let (change_status, change_body) = json_response( + &app, + Request::builder() + .uri("/change") + .method(Method::POST) + .header("content-type", "application/json") + .body(Body::from(serde_json::to_vec(&change).unwrap())) + .unwrap(), + ) + .await; + assert_eq!(change_status, StatusCode::OK); + assert_eq!(change_body["affected_nodes"], 1); + + let read = ReadRequest { + query_source: fs::read_to_string(fixture("test.gq")).unwrap(), + query_name: Some("get_person".to_string()), + params: Some(json!({ "name": "Mina" })), + branch: 
Some("main".to_string()), + snapshot: None, + }; + let (read_status, read_body) = json_response( + &app, + Request::builder() + .uri("/read") + .method(Method::POST) + .header("content-type", "application/json") + .body(Body::from(serde_json::to_vec(&read).unwrap())) + .unwrap(), + ) + .await; + assert_eq!(read_status, StatusCode::OK); + assert_eq!(read_body["row_count"], 1); + assert_eq!(read_body["rows"][0]["p.name"], "Mina"); +} + +#[tokio::test(flavor = "multi_thread")] +async fn remote_branch_list_create_merge_flow_works() { + let (_temp, app) = app_for_loaded_repo().await; + + let (list_status, list_body) = json_response( + &app, + Request::builder() + .uri("/branches") + .method(Method::GET) + .body(Body::empty()) + .unwrap(), + ) + .await; + assert_eq!(list_status, StatusCode::OK); + assert_eq!(list_body["branches"], json!(["main"])); + + let create = BranchCreateRequest { + from: Some("main".to_string()), + name: "feature".to_string(), + }; + let (create_status, create_body) = json_response( + &app, + Request::builder() + .uri("/branches") + .method(Method::POST) + .header("content-type", "application/json") + .body(Body::from(serde_json::to_vec(&create).unwrap())) + .unwrap(), + ) + .await; + assert_eq!(create_status, StatusCode::OK); + assert_eq!(create_body["from"], "main"); + assert_eq!(create_body["name"], "feature"); + + let (list_status, list_body) = json_response( + &app, + Request::builder() + .uri("/branches") + .method(Method::GET) + .body(Body::empty()) + .unwrap(), + ) + .await; + assert_eq!(list_status, StatusCode::OK); + assert_eq!(list_body["branches"], json!(["feature", "main"])); + + let change = ChangeRequest { + query_source: MUTATION_QUERIES.to_string(), + query_name: Some("insert_person".to_string()), + params: Some(json!({ "name": "Zoe", "age": 33 })), + branch: Some("feature".to_string()), + }; + let (change_status, change_body) = json_response( + &app, + Request::builder() + .uri("/change") + .method(Method::POST) + 
.header("content-type", "application/json") + .body(Body::from(serde_json::to_vec(&change).unwrap())) + .unwrap(), + ) + .await; + assert_eq!(change_status, StatusCode::OK); + assert_eq!(change_body["branch"], "feature"); + assert_eq!(change_body["affected_nodes"], 1); + + let read_main_before = ReadRequest { + query_source: fs::read_to_string(fixture("test.gq")).unwrap(), + query_name: Some("get_person".to_string()), + params: Some(json!({ "name": "Zoe" })), + branch: Some("main".to_string()), + snapshot: None, + }; + let (read_status, read_body) = json_response( + &app, + Request::builder() + .uri("/read") + .method(Method::POST) + .header("content-type", "application/json") + .body(Body::from(serde_json::to_vec(&read_main_before).unwrap())) + .unwrap(), + ) + .await; + assert_eq!(read_status, StatusCode::OK); + assert_eq!(read_body["row_count"], 0); + + let merge = BranchMergeRequest { + source: "feature".to_string(), + target: Some("main".to_string()), + }; + let (merge_status, merge_body) = json_response( + &app, + Request::builder() + .uri("/branches/merge") + .method(Method::POST) + .header("content-type", "application/json") + .body(Body::from(serde_json::to_vec(&merge).unwrap())) + .unwrap(), + ) + .await; + assert_eq!(merge_status, StatusCode::OK); + assert_eq!(merge_body["source"], "feature"); + assert_eq!(merge_body["target"], "main"); + assert_eq!(merge_body["outcome"], "fast_forward"); + + let read_main_after = ReadRequest { + query_source: fs::read_to_string(fixture("test.gq")).unwrap(), + query_name: Some("get_person".to_string()), + params: Some(json!({ "name": "Zoe" })), + branch: Some("main".to_string()), + snapshot: None, + }; + let (read_status, read_body) = json_response( + &app, + Request::builder() + .uri("/read") + .method(Method::POST) + .header("content-type", "application/json") + .body(Body::from(serde_json::to_vec(&read_main_after).unwrap())) + .unwrap(), + ) + .await; + assert_eq!(read_status, StatusCode::OK); + 
assert_eq!(read_body["row_count"], 1); + assert_eq!(read_body["rows"][0]["p.name"], "Zoe"); +} + +#[tokio::test(flavor = "multi_thread")] +async fn remote_branch_delete_flow_works() { + let (_temp, app) = app_for_loaded_repo().await; + + let create = BranchCreateRequest { + from: Some("main".to_string()), + name: "feature".to_string(), + }; + let (create_status, _) = json_response( + &app, + Request::builder() + .uri("/branches") + .method(Method::POST) + .header("content-type", "application/json") + .body(Body::from(serde_json::to_vec(&create).unwrap())) + .unwrap(), + ) + .await; + assert_eq!(create_status, StatusCode::OK); + + let (delete_status, delete_body) = json_response( + &app, + Request::builder() + .uri("/branches/feature") + .method(Method::DELETE) + .body(Body::empty()) + .unwrap(), + ) + .await; + assert_eq!(delete_status, StatusCode::OK); + assert_eq!(delete_body["name"], "feature"); + + let (list_status, list_body) = json_response( + &app, + Request::builder() + .uri("/branches") + .method(Method::GET) + .body(Body::empty()) + .unwrap(), + ) + .await; + assert_eq!(list_status, StatusCode::OK); + assert_eq!(list_body["branches"], json!(["main"])); +} + +#[tokio::test(flavor = "multi_thread")] +async fn branch_delete_denies_without_policy_permission() { + let (temp, app) = app_for_loaded_repo_with_auth_tokens_and_policy( + &[("act-andrew", "token-admin"), ("act-bruno", "token-team")], + POLICY_YAML, + ) + .await; + let repo = repo_path(temp.path()); + + let mut db = Omnigraph::open(repo.to_str().unwrap()).await.unwrap(); + db.branch_create_from(ReadTarget::branch("main"), "feature") + .await + .unwrap(); + drop(db); + + let (status, body) = json_response( + &app, + Request::builder() + .uri("/branches/feature") + .method(Method::DELETE) + .header("authorization", "Bearer token-team") + .body(Body::empty()) + .unwrap(), + ) + .await; + assert_eq!(status, StatusCode::FORBIDDEN); + assert!( + body["error"] + .as_str() + .unwrap() + .contains("policy 
denied action 'branch_delete'") + ); +} + +#[tokio::test(flavor = "multi_thread")] +async fn server_opens_s3_repo_directly_and_serves_snapshot_and_read() { + let Some(uri) = s3_test_repo_uri("server") else { + eprintln!("skipping s3 server test: OMNIGRAPH_S3_TEST_BUCKET is not set"); + return; + }; + + Omnigraph::init(&uri, &fs::read_to_string(fixture("test.pg")).unwrap()) + .await + .unwrap(); + let mut db = Omnigraph::open(&uri).await.unwrap(); + load_jsonl( + &mut db, + &fs::read_to_string(fixture("test.jsonl")).unwrap(), + LoadMode::Overwrite, + ) + .await + .unwrap(); + + let app = build_app( + AppState::open_with_bearer_token(uri.clone(), Some("s3-token".to_string())) + .await + .unwrap(), + ); + + let (snapshot_status, snapshot_body) = json_response( + &app, + Request::builder() + .uri("/snapshot") + .method(Method::GET) + .header("authorization", "Bearer s3-token") + .body(Body::empty()) + .unwrap(), + ) + .await; + assert_eq!(snapshot_status, StatusCode::OK); + assert!(snapshot_body["tables"].is_array()); + + let read = ReadRequest { + query_source: fs::read_to_string(fixture("test.gq")).unwrap(), + query_name: Some("get_person".to_string()), + params: Some(json!({ "name": "Alice" })), + branch: Some("main".to_string()), + snapshot: None, + }; + let (read_status, read_body) = json_response( + &app, + Request::builder() + .uri("/read") + .method(Method::POST) + .header("authorization", "Bearer s3-token") + .header("content-type", "application/json") + .body(Body::from(serde_json::to_vec(&read).unwrap())) + .unwrap(), + ) + .await; + assert_eq!(read_status, StatusCode::OK); + assert_eq!(read_body["row_count"], 1); + assert_eq!(read_body["rows"][0]["p.name"], "Alice"); +} + +#[tokio::test(flavor = "multi_thread")] +#[serial] +async fn remote_read_embeds_string_nearest_queries_with_mock_runtime() { + const EMBED_SCHEMA: &str = r#" +node Doc { + slug: String @key + title: String @index + embedding: Vector(4) @index +} +"#; + const EMBED_QUERY: &str = r#" +query 
vector_search_string($q: String) { + match { $d: Doc } + return { $d.slug, $d.title } + order { nearest($d.embedding, $q) } + limit 3 +} +"#; + + let alpha = mock_embedding("alpha", 4); + let beta = mock_embedding("beta", 4); + let gamma = mock_embedding("gamma", 4); + let data = format!( + concat!( + r#"{{"type":"Doc","data":{{"slug":"alpha-doc","title":"alpha guide","embedding":[{}]}}}}"#, + "\n", + r#"{{"type":"Doc","data":{{"slug":"beta-doc","title":"beta guide","embedding":[{}]}}}}"#, + "\n", + r#"{{"type":"Doc","data":{{"slug":"gamma-doc","title":"gamma handbook","embedding":[{}]}}}}"# + ), + format_vector(&alpha), + format_vector(&beta), + format_vector(&gamma), + ); + + let _guard = EnvGuard::set(&[ + ("OMNIGRAPH_EMBEDDINGS_MOCK", Some("1")), + ("GEMINI_API_KEY", None), + ]); + let temp = init_repo_with_schema_and_data(EMBED_SCHEMA, &data).await; + let repo = repo_path(temp.path()); + let state = AppState::open(repo.to_string_lossy().to_string()) + .await + .unwrap(); + let app = build_app(state); + + let read = ReadRequest { + query_source: EMBED_QUERY.to_string(), + query_name: Some("vector_search_string".to_string()), + params: Some(json!({ "q": "alpha" })), + branch: Some("main".to_string()), + snapshot: None, + }; + let (status, body) = json_response( + &app, + Request::builder() + .uri("/read") + .method(Method::POST) + .header("content-type", "application/json") + .body(Body::from(serde_json::to_vec(&read).unwrap())) + .unwrap(), + ) + .await; + + assert_eq!(status, StatusCode::OK); + assert_eq!(body["row_count"], 3); + assert_eq!(body["rows"][0]["d.slug"], "alpha-doc"); +} + +#[tokio::test(flavor = "multi_thread")] +async fn missing_run_returns_not_found() { + let (_temp, app) = app_for_loaded_repo().await; + let (status, body) = json_response( + &app, + Request::builder() + .uri("/runs/missing-run") + .method(Method::GET) + .body(Body::empty()) + .unwrap(), + ) + .await; + + let error: ErrorOutput = serde_json::from_value(body).unwrap(); + 
assert_eq!(status, StatusCode::NOT_FOUND); + assert_eq!(error.code, Some(omnigraph_server::api::ErrorCode::NotFound)); + assert!(error.error.contains("run 'missing-run' not found")); +} + +#[tokio::test(flavor = "multi_thread")] +async fn publish_conflict_returns_conflict_status() { + let temp = init_loaded_repo().await; + let repo = repo_path(temp.path()); + let mut db = Omnigraph::open(repo.to_str().unwrap()).await.unwrap(); + + let run_a = db + .begin_run("main", Some("server-conflict-a")) + .await + .unwrap(); + let run_b = db + .begin_run("main", Some("server-conflict-b")) + .await + .unwrap(); + db.mutate( + &run_a.run_branch, + MUTATION_QUERIES, + "set_age", + &omnigraph_compiler::json_params_to_param_map( + Some(&json!({"name": "Alice", "age": 31 })), + &omnigraph_compiler::find_named_query(MUTATION_QUERIES, "set_age") + .unwrap() + .params, + omnigraph_compiler::JsonParamMode::Standard, + ) + .unwrap(), + ) + .await + .unwrap(); + db.mutate( + &run_b.run_branch, + MUTATION_QUERIES, + "set_age", + &omnigraph_compiler::json_params_to_param_map( + Some(&json!({"name": "Alice", "age": 32 })), + &omnigraph_compiler::find_named_query(MUTATION_QUERIES, "set_age") + .unwrap() + .params, + omnigraph_compiler::JsonParamMode::Standard, + ) + .unwrap(), + ) + .await + .unwrap(); + db.publish_run(&run_a.run_id).await.unwrap(); + drop(db); + + let state = AppState::open(repo.to_string_lossy().to_string()) + .await + .unwrap(); + let app = build_app(state); + let (status, body) = json_response( + &app, + Request::builder() + .uri(format!("/runs/{}/publish", run_b.run_id.as_str())) + .method(Method::POST) + .header("content-type", "application/json") + .body(Body::from(b"{}" as &[u8])) + .unwrap(), + ) + .await; + + let error: ErrorOutput = serde_json::from_value(body).unwrap(); + assert_eq!(status, StatusCode::CONFLICT); + assert_eq!(error.code, Some(omnigraph_server::api::ErrorCode::Conflict)); + assert!(error.merge_conflicts.iter().any(|conflict| { + conflict.table_key 
== "node:Person" + && conflict.row_id.as_deref() == Some("Alice") + && conflict.kind == omnigraph_server::api::MergeConflictKindOutput::DivergentUpdate + })); +} + +#[tokio::test(flavor = "multi_thread")] +async fn oversized_request_body_returns_payload_too_large() { + let (_temp, app) = app_for_loaded_repo().await; + let oversized = "x".repeat(1_100_000); + let response = app + .clone() + .oneshot( + Request::builder() + .uri("/read") + .method(Method::POST) + .header("content-type", "application/json") + .body(Body::from(oversized)) + .unwrap(), + ) + .await + .unwrap(); + + assert_eq!(response.status(), StatusCode::PAYLOAD_TOO_LARGE); +} diff --git a/crates/omnigraph/Cargo.toml b/crates/omnigraph/Cargo.toml new file mode 100644 index 0000000..ba61c0c --- /dev/null +++ b/crates/omnigraph/Cargo.toml @@ -0,0 +1,47 @@ +[package] +name = "omnigraph" +version = "0.4.0" +edition = "2024" +description = "Lance-native graph database with git-style branching." +license = "MIT" + +[features] +default = [] +failpoints = ["dep:fail", "fail/failpoints"] + +[dependencies] +omnigraph-compiler = { path = "../omnigraph-compiler", version = "0.4.0" } +lance = { workspace = true } +lance-datafusion = { workspace = true } +lance-file = { workspace = true } +lance-index = { workspace = true } +lance-linalg = { workspace = true } +lance-namespace = { workspace = true } +lance-table = { workspace = true } +arrow-array = { workspace = true } +arrow-schema = { workspace = true } +arrow-ord = { workspace = true } +arrow-select = { workspace = true } +arrow-cast = { workspace = true } +tokio = { workspace = true } +serde = { workspace = true } +serde_json = { workspace = true } +reqwest = { workspace = true } +object_store = { workspace = true } +ulid = { workspace = true } +base64 = { workspace = true } +futures = { workspace = true } +tracing = { workspace = true } +thiserror = { workspace = true } +regex = { workspace = true } +tempfile = { workspace = true } +fail = { workspace = true, 
// ─── Types ──────────────────────────────────────────────────────────────────

/// Whether a changed entity lives in a node table or an edge table.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum EntityKind {
    Node,
    Edge,
}

/// The net operation applied to an entity between two snapshots.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ChangeOp {
    Insert,
    Update,
    Delete,
}

/// Source and destination ids of a changed edge.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Endpoints {
    pub src: String,
    pub dst: String,
}

/// One row-level change reported by a diff.
#[derive(Debug, Clone)]
pub struct EntityChange {
    /// Full table key, e.g. `node:Person` or `edge:Knows`.
    pub table_key: String,
    pub kind: EntityKind,
    /// Table key with the `node:` / `edge:` prefix removed.
    pub type_name: String,
    pub id: String,
    pub op: ChangeOp,
    pub manifest_version: u64,
    /// Present for edges only.
    pub endpoints: Option<Endpoints>,
}

/// Optional restrictions on a diff; a `None` field means "no restriction".
#[derive(Debug, Clone, Default)]
pub struct ChangeFilter {
    pub kinds: Option<Vec<EntityKind>>,
    pub type_names: Option<Vec<String>>,
    pub ops: Option<Vec<ChangeOp>>,
}

/// Per-operation counters plus the sorted list of affected type names.
#[derive(Debug, Clone, Default)]
pub struct ChangeStats {
    pub inserts: usize,
    pub updates: usize,
    pub deletes: usize,
    pub types_affected: Vec<String>,
}

/// Result of diffing two snapshots.
#[derive(Debug, Clone)]
pub struct ChangeSet {
    pub from_version: u64,
    pub to_version: u64,
    pub branch: Option<String>,
    pub changes: Vec<EntityChange>,
    pub stats: ChangeStats,
}

// ─── Filter helpers ─────────────────────────────────────────────────────────

/// Split a table key into its entity kind and bare type name.
///
/// `node:X` yields `(Node, "X")` and `edge:X` yields `(Edge, "X")`. A key
/// with neither prefix is treated as a node table and returned unchanged.
fn parse_table_key(table_key: &str) -> (EntityKind, &str) {
    if let Some(name) = table_key.strip_prefix("node:") {
        (EntityKind::Node, name)
    } else if let Some(name) = table_key.strip_prefix("edge:") {
        (EntityKind::Edge, name)
    } else {
        (EntityKind::Node, table_key)
    }
}

impl ChangeFilter {
    /// Returns `true` when the table identified by `table_key` passes both
    /// the `kinds` and the `type_names` restriction (each only if set).
    fn matches_table(&self, table_key: &str) -> bool {
        let (kind, type_name) = parse_table_key(table_key);

        let kind_ok = self
            .kinds
            .as_ref()
            .map_or(true, |kinds| kinds.contains(&kind));
        let name_ok = self
            .type_names
            .as_ref()
            .map_or(true, |names| names.iter().any(|n| n == type_name));

        kind_ok && name_ok
    }

    /// Returns `true` when changes with operation `op` should be reported.
    fn wants_op(&self, op: ChangeOp) -> bool {
        self.ops.as_ref().map_or(true, |ops| ops.contains(&op))
    }
}
Row-level diff +pub async fn diff_snapshots( + root_uri: &str, + from: &Snapshot, + to: &Snapshot, + filter: &ChangeFilter, + branch: Option, +) -> Result { + let table_store = TableStore::new(root_uri); + let mut all_keys: HashSet = HashSet::new(); + for entry in from.entries() { + all_keys.insert(entry.table_key.clone()); + } + for entry in to.entries() { + all_keys.insert(entry.table_key.clone()); + } + + let mut changes = Vec::new(); + + for table_key in &all_keys { + if !filter.matches_table(table_key) { + continue; + } + + let from_entry = from.entry(table_key); + let to_entry = to.entry(table_key); + + // Skip if both snapshots have identical state for this table + if same_state(from_entry, to_entry) { + continue; + } + + let (kind, type_name) = parse_table_key(table_key); + let is_edge = kind == EntityKind::Edge; + + let table_changes = if from_entry.is_none() { + // Table added — all rows are inserts + diff_table_added(&table_store, to, table_key, is_edge, filter).await? + } else if to_entry.is_none() { + // Table removed — all rows are deletes + diff_table_removed(&table_store, from, table_key, is_edge, filter).await? + } else if same_lineage(from_entry, to_entry) { + // Fast path: version-column diff + diff_table_same_lineage( + &table_store, + from_entry.unwrap(), + to_entry.unwrap(), + is_edge, + filter, + ) + .await? + } else { + // Cross-branch path: streaming ID-based diff + diff_table_cross_branch(&table_store, from, to, table_key, is_edge, filter).await? 
+ }; + + for mut c in table_changes { + c.table_key = table_key.clone(); + c.kind = kind; + c.type_name = type_name.to_string(); + if c.manifest_version == 0 { + c.manifest_version = to.version(); + } + changes.push(c); + } + } + + let stats = compute_stats(&changes); + Ok(ChangeSet { + from_version: from.version(), + to_version: to.version(), + branch, + changes, + stats, + }) +} + +fn same_state(a: Option<&SubTableEntry>, b: Option<&SubTableEntry>) -> bool { + match (a, b) { + (None, None) => true, + (Some(a), Some(b)) => { + a.table_version == b.table_version && a.table_branch == b.table_branch + } + _ => false, + } +} + +fn same_lineage(from: Option<&SubTableEntry>, to: Option<&SubTableEntry>) -> bool { + match (from, to) { + (Some(f), Some(t)) => f.table_branch == t.table_branch, + _ => false, + } +} + +fn compute_stats(changes: &[EntityChange]) -> ChangeStats { + let mut stats = ChangeStats::default(); + let mut types = HashSet::new(); + for c in changes { + match c.op { + ChangeOp::Insert => stats.inserts += 1, + ChangeOp::Update => stats.updates += 1, + ChangeOp::Delete => stats.deletes += 1, + } + types.insert(c.type_name.clone()); + } + stats.types_affected = types.into_iter().collect(); + stats.types_affected.sort(); + stats +} + +// ─── Fast path: version-column diff ───────────────────────────────────────── + +async fn diff_table_same_lineage( + table_store: &TableStore, + from_entry: &SubTableEntry, + to_entry: &SubTableEntry, + is_edge: bool, + filter: &ChangeFilter, +) -> Result> { + let vf = from_entry.table_version; + let vt = to_entry.table_version; + let to_ds = table_store.open_at_entry(to_entry).await?; + + let cols: Vec<&str> = if is_edge { + vec!["id", "src", "dst", "_row_last_updated_at_version"] + } else { + vec!["id", "_row_last_updated_at_version"] + }; + + let wants_inserts = filter.wants_op(ChangeOp::Insert); + let wants_updates = filter.wants_op(ChangeOp::Update); + let wants_deletes = filter.wants_op(ChangeOp::Delete); + + let mut 
changes = Vec::new(); + + // Inserts + Updates: use _row_last_updated_at_version to find all rows + // touched since Vf, then classify by checking whether the ID existed at Vf. + // + // Why not _row_created_at_version for inserts: Lance's merge_insert stamps + // new rows with _row_created_at_version = dataset_creation_version (v1), + // not the merge_insert commit version. This makes _row_created_at_version + // unreliable for detecting inserts from merge_insert writes. Using + // _row_last_updated_at_version catches all touched rows regardless of + // write mode, and ID-set membership distinguishes inserts from updates. + if wants_inserts || wants_updates { + let filter_sql = format!( + "_row_last_updated_at_version > {} AND _row_last_updated_at_version <= {}", + vf, vt + ); + let changed_rows = scan_with_filter(table_store, &to_ds, &cols, &filter_sql).await?; + + if !changed_rows.is_empty() { + // Build the set of IDs that existed at the from version + let from_ds = table_store.open_at_entry(from_entry).await?; + let from_ids: HashSet = scan_id_set(table_store, &from_ds, &["id"]) + .await? 
+ .into_iter() + .map(|r| r.id) + .collect(); + + for row in changed_rows { + if from_ids.contains(&row.id) { + if wants_updates { + changes.push(entity_change_from_row(&row, ChangeOp::Update, is_edge)); + } + } else if wants_inserts { + changes.push(entity_change_from_row(&row, ChangeOp::Insert, is_edge)); + } + } + } + } + + // Deletes: ID set-difference + if wants_deletes { + let from_ds = table_store.open_at_entry(from_entry).await?; + let deleted = deleted_ids_by_set_diff(table_store, &from_ds, &to_ds, is_edge).await?; + changes.extend(deleted); + } + + Ok(changes) +} + +// ─── Cross-branch path: streaming ID-based diff ──────────────────────────── + +async fn diff_table_cross_branch( + table_store: &TableStore, + from_snap: &Snapshot, + to_snap: &Snapshot, + table_key: &str, + is_edge: bool, + filter: &ChangeFilter, +) -> Result> { + let from_ds = table_store + .open_snapshot_table(from_snap, table_key) + .await?; + let to_ds = table_store.open_snapshot_table(to_snap, table_key).await?; + + let from_rows = scan_all_rows_ordered(table_store, &from_ds, is_edge).await?; + let to_rows = scan_all_rows_ordered(table_store, &to_ds, is_edge).await?; + + let mut changes = Vec::new(); + let mut fi = 0; + let mut ti = 0; + + while fi < from_rows.len() || ti < to_rows.len() { + let from_id = from_rows.get(fi).map(|r| r.id.as_str()); + let to_id = to_rows.get(ti).map(|r| r.id.as_str()); + + match (from_id, to_id) { + (Some(fid), Some(tid)) if fid < tid => { + // ID only in from → Delete + if filter.wants_op(ChangeOp::Delete) { + changes.push(entity_change_from_row( + &from_rows[fi], + ChangeOp::Delete, + is_edge, + )); + } + fi += 1; + } + (Some(fid), Some(tid)) if fid > tid => { + // ID only in to → Insert + if filter.wants_op(ChangeOp::Insert) { + changes.push(entity_change_from_row( + &to_rows[ti], + ChangeOp::Insert, + is_edge, + )); + } + ti += 1; + } + (Some(_), Some(_)) => { + // Same ID — check signature + if from_rows[fi].signature != to_rows[ti].signature + && 
filter.wants_op(ChangeOp::Update) + { + changes.push(entity_change_from_row( + &to_rows[ti], + ChangeOp::Update, + is_edge, + )); + } + fi += 1; + ti += 1; + } + (Some(_), None) => { + if filter.wants_op(ChangeOp::Delete) { + changes.push(entity_change_from_row( + &from_rows[fi], + ChangeOp::Delete, + is_edge, + )); + } + fi += 1; + } + (None, Some(_)) => { + if filter.wants_op(ChangeOp::Insert) { + changes.push(entity_change_from_row( + &to_rows[ti], + ChangeOp::Insert, + is_edge, + )); + } + ti += 1; + } + (None, None) => break, + } + } + + Ok(changes) +} + +// ─── Table added/removed ──────────────────────────────────────────────────── + +async fn diff_table_added( + table_store: &TableStore, + to_snap: &Snapshot, + table_key: &str, + is_edge: bool, + filter: &ChangeFilter, +) -> Result> { + if !filter.wants_op(ChangeOp::Insert) { + return Ok(Vec::new()); + } + let ds = table_store.open_snapshot_table(to_snap, table_key).await?; + let rows = scan_all_rows_ordered(table_store, &ds, is_edge).await?; + Ok(rows + .into_iter() + .map(|r| entity_change_from_row(&r, ChangeOp::Insert, is_edge)) + .collect()) +} + +async fn diff_table_removed( + table_store: &TableStore, + from_snap: &Snapshot, + table_key: &str, + is_edge: bool, + filter: &ChangeFilter, +) -> Result> { + if !filter.wants_op(ChangeOp::Delete) { + return Ok(Vec::new()); + } + let ds = table_store + .open_snapshot_table(from_snap, table_key) + .await?; + let rows = scan_all_rows_ordered(table_store, &ds, is_edge).await?; + Ok(rows + .into_iter() + .map(|r| entity_change_from_row(&r, ChangeOp::Delete, is_edge)) + .collect()) +} + +// ─── Helpers ──────────────────────────────────────────────────────────────── + +/// Scan with a SQL filter, projecting specific columns. 
+async fn scan_with_filter( + table_store: &TableStore, + ds: &lance::Dataset, + cols: &[&str], + filter_sql: &str, +) -> Result> { + let batches = table_store + .scan(ds, Some(cols), Some(filter_sql), None) + .await?; + Ok(extract_rows(&batches)) +} + +/// Scan all rows ordered by id, projecting id (+ src/dst for edges) + all columns for signature. +async fn scan_all_rows_ordered( + table_store: &TableStore, + ds: &lance::Dataset, + is_edge: bool, +) -> Result> { + let batches = table_store + .scan( + ds, + None, + None, + Some(vec![ColumnOrdering::asc_nulls_last("id".to_string())]), + ) + .await?; + Ok(extract_rows_with_signature(&batches, is_edge)) +} + +/// Compute deleted IDs: scan id at from and to, set-difference. +async fn deleted_ids_by_set_diff( + table_store: &TableStore, + from_ds: &lance::Dataset, + to_ds: &lance::Dataset, + is_edge: bool, +) -> Result> { + let cols: Vec<&str> = if is_edge { + vec!["id", "src", "dst"] + } else { + vec!["id"] + }; + + let from_rows = scan_id_set(table_store, from_ds, &cols).await?; + let to_ids: HashSet = scan_id_set(table_store, to_ds, &["id"]) + .await? 
+ .into_iter() + .map(|r| r.id) + .collect(); + + Ok(from_rows + .into_iter() + .filter(|r| !to_ids.contains(&r.id)) + .map(|r| entity_change_from_row(&r, ChangeOp::Delete, is_edge)) + .collect()) +} + +async fn scan_id_set( + table_store: &TableStore, + ds: &lance::Dataset, + cols: &[&str], +) -> Result> { + let batches = table_store.scan(ds, Some(cols), None, None).await?; + Ok(extract_rows(&batches)) +} + +// ─── Row extraction ───────────────────────────────────────────────────────── + +#[derive(Debug, Clone)] +struct ScannedRow { + id: String, + src: Option, + dst: Option, + signature: String, + change_version: Option, +} + +fn extract_rows(batches: &[RecordBatch]) -> Vec { + let mut rows = Vec::new(); + for batch in batches { + let ids = batch + .column_by_name("id") + .and_then(|c| c.as_any().downcast_ref::()); + let Some(ids) = ids else { continue }; + let srcs = batch + .column_by_name("src") + .and_then(|c| c.as_any().downcast_ref::()); + let dsts = batch + .column_by_name("dst") + .and_then(|c| c.as_any().downcast_ref::()); + for i in 0..ids.len() { + rows.push(ScannedRow { + id: ids.value(i).to_string(), + src: srcs.map(|a| a.value(i).to_string()), + dst: dsts.map(|a| a.value(i).to_string()), + signature: String::new(), + change_version: batch + .column_by_name("_row_last_updated_at_version") + .and_then(|c| c.as_any().downcast_ref::()) + .map(|versions| versions.value(i)), + }); + } + } + rows +} + +fn extract_rows_with_signature(batches: &[RecordBatch], is_edge: bool) -> Vec { + let mut rows = Vec::new(); + for batch in batches { + let ids = batch + .column_by_name("id") + .and_then(|c| c.as_any().downcast_ref::()); + let Some(ids) = ids else { continue }; + let srcs = if is_edge { + batch + .column_by_name("src") + .and_then(|c| c.as_any().downcast_ref::()) + } else { + None + }; + let dsts = if is_edge { + batch + .column_by_name("dst") + .and_then(|c| c.as_any().downcast_ref::()) + } else { + None + }; + for i in 0..ids.len() { + let mut values = 
Vec::with_capacity(batch.num_columns()); + for (field, col) in batch.schema().fields().iter().zip(batch.columns()) { + if field.name().starts_with("_row_") { + continue; + } + if let Ok(v) = array_value_to_string(col.as_ref(), i) { + values.push(v); + } + } + rows.push(ScannedRow { + id: ids.value(i).to_string(), + src: srcs.map(|a| a.value(i).to_string()), + dst: dsts.map(|a| a.value(i).to_string()), + signature: values.join("\x1f"), + change_version: batch + .column_by_name("_row_last_updated_at_version") + .and_then(|c| c.as_any().downcast_ref::()) + .map(|versions| versions.value(i)), + }); + } + } + rows +} + +fn entity_change_from_row(row: &ScannedRow, op: ChangeOp, is_edge: bool) -> EntityChange { + EntityChange { + table_key: String::new(), + kind: if is_edge { + EntityKind::Edge + } else { + EntityKind::Node + }, + type_name: String::new(), + id: row.id.clone(), + op, + manifest_version: row.change_version.unwrap_or(0), + endpoints: if is_edge { + Some(Endpoints { + src: row.src.clone().unwrap_or_default(), + dst: row.dst.clone().unwrap_or_default(), + }) + } else { + None + }, + } +} diff --git a/crates/omnigraph/src/db/commit_graph.rs b/crates/omnigraph/src/db/commit_graph.rs new file mode 100644 index 0000000..565bd69 --- /dev/null +++ b/crates/omnigraph/src/db/commit_graph.rs @@ -0,0 +1,692 @@ +use std::collections::{HashMap, VecDeque}; +use std::sync::Arc; +use std::time::{SystemTime, UNIX_EPOCH}; + +use arrow_array::{ + Array, RecordBatch, RecordBatchIterator, StringArray, TimestampMicrosecondArray, UInt64Array, +}; +use arrow_schema::{DataType, Field, Schema, SchemaRef, TimeUnit}; +use futures::TryStreamExt; +use lance::Dataset; +use lance::dataset::{WriteMode, WriteParams}; +use lance_file::version::LanceFileVersion; + +use crate::error::{OmniError, Result}; + +const GRAPH_COMMITS_DIR: &str = "_graph_commits.lance"; +const GRAPH_COMMIT_ACTORS_DIR: &str = "_graph_commit_actors.lance"; + +#[derive(Debug, Clone)] +pub struct GraphCommit { + pub 
graph_commit_id: String, + pub manifest_branch: Option, + pub manifest_version: u64, + pub parent_commit_id: Option, + pub merged_parent_commit_id: Option, + pub actor_id: Option, + pub created_at: i64, +} + +pub struct CommitGraph { + root_uri: String, + dataset: Dataset, + actor_dataset: Option, + active_branch: Option, + actor_by_commit_id: HashMap, + commit_by_id: HashMap, + head_commit: Option, +} + +impl CommitGraph { + pub async fn init(root_uri: &str, manifest_version: u64) -> Result { + let root = root_uri.trim_end_matches('/'); + let uri = graph_commits_uri(root); + let genesis = GraphCommit { + graph_commit_id: ulid::Ulid::new().to_string(), + manifest_branch: None, + manifest_version, + parent_commit_id: None, + merged_parent_commit_id: None, + actor_id: None, + created_at: now_micros()?, + }; + + let batch = commits_to_batch(&[genesis.clone()])?; + let reader = RecordBatchIterator::new(vec![Ok(batch)], commit_graph_schema()); + let params = WriteParams { + mode: WriteMode::Create, + enable_stable_row_ids: true, + data_storage_version: Some(LanceFileVersion::V2_2), + ..Default::default() + }; + let dataset = Dataset::write(reader, &uri as &str, Some(params)) + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + let actor_dataset = create_commit_actor_dataset(root).await?; + + Ok(Self { + root_uri: root.to_string(), + dataset, + actor_dataset: Some(actor_dataset), + active_branch: None, + actor_by_commit_id: HashMap::new(), + commit_by_id: HashMap::from([(genesis.graph_commit_id.clone(), genesis.clone())]), + head_commit: Some(genesis), + }) + } + + pub async fn open(root_uri: &str) -> Result { + let root = root_uri.trim_end_matches('/'); + let dataset = Dataset::open(&graph_commits_uri(root)) + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + let actor_dataset = Dataset::open(&graph_commit_actors_uri(root)).await.ok(); + let actor_by_commit_id = match &actor_dataset { + Some(dataset) => load_commit_actor_cache(dataset).await?, + None => 
HashMap::new(), + }; + let (commit_by_id, head_commit) = load_commit_cache(&dataset, &actor_by_commit_id).await?; + Ok(Self { + root_uri: root.to_string(), + dataset, + actor_dataset, + active_branch: None, + actor_by_commit_id, + commit_by_id, + head_commit, + }) + } + + pub async fn open_at_branch(root_uri: &str, branch: &str) -> Result { + let root = root_uri.trim_end_matches('/'); + let dataset = Dataset::open(&graph_commits_uri(root)) + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + let dataset = dataset + .checkout_branch(branch) + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + let actor_dataset = Dataset::open(&graph_commit_actors_uri(root)).await.ok(); + let actor_by_commit_id = match &actor_dataset { + Some(dataset) => load_commit_actor_cache(dataset).await?, + None => HashMap::new(), + }; + let (commit_by_id, head_commit) = load_commit_cache(&dataset, &actor_by_commit_id).await?; + Ok(Self { + root_uri: root.to_string(), + dataset, + actor_dataset, + active_branch: Some(branch.to_string()), + actor_by_commit_id, + commit_by_id, + head_commit, + }) + } + + pub async fn refresh(&mut self) -> Result<()> { + let root = self.root_uri.clone(); + self.dataset = Dataset::open(&graph_commits_uri(&root)) + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + if let Some(branch) = &self.active_branch { + self.dataset = self + .dataset + .checkout_branch(branch) + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + } + self.actor_dataset = Dataset::open(&graph_commit_actors_uri(&root)).await.ok(); + self.actor_by_commit_id = match &self.actor_dataset { + Some(dataset) => load_commit_actor_cache(dataset).await?, + None => HashMap::new(), + }; + let (commit_by_id, head_commit) = + load_commit_cache(&self.dataset, &self.actor_by_commit_id).await?; + self.commit_by_id = commit_by_id; + self.head_commit = head_commit; + Ok(()) + } + + pub fn version(&self) -> u64 { + self.dataset.version().version + } + + pub async fn 
create_branch(&mut self, name: &str) -> Result<()> { + let mut ds = self.dataset.clone(); + ds.create_branch(name, self.version(), None) + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + Ok(()) + } + + pub async fn delete_branch(&mut self, name: &str) -> Result<()> { + let mut ds = Dataset::open(&graph_commits_uri(&self.root_uri)) + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + ds.delete_branch(name) + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + self.refresh().await + } + + pub async fn append_commit( + &mut self, + manifest_branch: Option<&str>, + manifest_version: u64, + actor_id: Option<&str>, + ) -> Result { + let parent_commit_id = self.head_commit_id().await?; + self.append_commit_with_parents( + manifest_branch, + manifest_version, + parent_commit_id.as_deref(), + None, + actor_id, + ) + .await + } + + pub async fn append_merge_commit( + &mut self, + manifest_branch: Option<&str>, + manifest_version: u64, + parent_commit_id: &str, + merged_parent_commit_id: &str, + actor_id: Option<&str>, + ) -> Result { + self.append_commit_with_parents( + manifest_branch, + manifest_version, + Some(parent_commit_id), + Some(merged_parent_commit_id), + actor_id, + ) + .await + } + + async fn append_commit_with_parents( + &mut self, + manifest_branch: Option<&str>, + manifest_version: u64, + parent_commit_id: Option<&str>, + merged_parent_commit_id: Option<&str>, + actor_id: Option<&str>, + ) -> Result { + let graph_commit_id = ulid::Ulid::new().to_string(); + let commit = GraphCommit { + graph_commit_id: graph_commit_id.clone(), + manifest_branch: manifest_branch.map(|s| s.to_string()), + manifest_version, + parent_commit_id: parent_commit_id.map(|s| s.to_string()), + merged_parent_commit_id: merged_parent_commit_id.map(|s| s.to_string()), + actor_id: actor_id.map(str::to_string), + created_at: now_micros()?, + }; + + let batch = commits_to_batch(&[commit.clone()])?; + let reader = RecordBatchIterator::new(vec![Ok(batch)], 
commit_graph_schema()); + let mut ds = self.dataset.clone(); + ds.append(reader, None) + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + self.dataset = ds; + if let Some(actor_id) = actor_id { + self.append_actor(&graph_commit_id, actor_id).await?; + } + self.commit_by_id + .insert(graph_commit_id.clone(), commit.clone()); + if should_replace_head(self.head_commit.as_ref(), &commit) { + self.head_commit = Some(commit); + } + + Ok(graph_commit_id) + } + + async fn append_actor(&mut self, graph_commit_id: &str, actor_id: &str) -> Result<()> { + if self + .actor_by_commit_id + .get(graph_commit_id) + .is_some_and(|existing| existing == actor_id) + { + return Ok(()); + } + + let record = CommitActorRecord { + graph_commit_id: graph_commit_id.to_string(), + actor_id: actor_id.to_string(), + created_at: now_micros()?, + }; + let batch = commit_actors_to_batch(&[record])?; + let reader = RecordBatchIterator::new(vec![Ok(batch)], commit_actor_schema()); + let mut dataset = match self.actor_dataset.take() { + Some(dataset) => dataset, + None => create_commit_actor_dataset(&self.root_uri).await?, + }; + dataset + .append(reader, None) + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + self.actor_by_commit_id + .insert(graph_commit_id.to_string(), actor_id.to_string()); + self.actor_dataset = Some(dataset); + Ok(()) + } + + pub async fn head_commit(&self) -> Result> { + Ok(self.head_commit.clone()) + } + + pub async fn head_commit_id(&self) -> Result> { + Ok(self.head_commit().await?.map(|c| c.graph_commit_id)) + } + + pub async fn load_commits(&self) -> Result> { + let mut commits = self.commit_by_id.values().cloned().collect::>(); + commits.sort_by(|a, b| { + a.manifest_version + .cmp(&b.manifest_version) + .then_with(|| a.created_at.cmp(&b.created_at)) + .then_with(|| a.graph_commit_id.cmp(&b.graph_commit_id)) + }); + Ok(commits) + } + + pub fn get_commit(&self, commit_id: &str) -> Option { + self.commit_by_id.get(commit_id).cloned() + } + + pub async 
fn merge_base( + root_uri: &str, + source_branch: Option<&str>, + target_branch: Option<&str>, + ) -> Result> { + let source = open_for_branch(root_uri, source_branch).await?; + let target = open_for_branch(root_uri, target_branch).await?; + + let source_head = match source.head_commit().await? { + Some(commit) => commit, + None => return Ok(None), + }; + let target_head = match target.head_commit().await? { + Some(commit) => commit, + None => return Ok(None), + }; + + let mut commits = HashMap::new(); + for commit in source.load_commits().await? { + commits.insert(commit.graph_commit_id.clone(), commit); + } + for commit in target.load_commits().await? { + commits.insert(commit.graph_commit_id.clone(), commit); + } + + let source_distances = ancestor_distances(&source_head.graph_commit_id, &commits); + let target_distances = ancestor_distances(&target_head.graph_commit_id, &commits); + + let best = source_distances + .iter() + .filter_map(|(id, source_distance)| { + target_distances.get(id).and_then(|target_distance| { + commits.get(id).map(|commit| { + ( + ( + *source_distance + *target_distance, + u64::MAX - commit.manifest_version, + ), + commit.clone(), + ) + }) + }) + }) + .min_by_key(|(score, _)| *score) + .map(|(_, commit)| commit); + + Ok(best) + } +} + +fn graph_commits_uri(root_uri: &str) -> String { + format!("{}/{}", root_uri.trim_end_matches('/'), GRAPH_COMMITS_DIR) +} + +fn graph_commit_actors_uri(root_uri: &str) -> String { + format!( + "{}/{}", + root_uri.trim_end_matches('/'), + GRAPH_COMMIT_ACTORS_DIR + ) +} + +fn commit_graph_schema() -> SchemaRef { + Arc::new(Schema::new(vec![ + Field::new("graph_commit_id", DataType::Utf8, false), + Field::new("manifest_branch", DataType::Utf8, true), + Field::new("manifest_version", DataType::UInt64, false), + Field::new("parent_commit_id", DataType::Utf8, true), + Field::new("merged_parent_commit_id", DataType::Utf8, true), + Field::new( + "created_at", + DataType::Timestamp(TimeUnit::Microsecond, None), + 
false, + ), + ])) +} + +fn commit_actor_schema() -> SchemaRef { + Arc::new(Schema::new(vec![ + Field::new("graph_commit_id", DataType::Utf8, false), + Field::new("actor_id", DataType::Utf8, false), + Field::new( + "created_at", + DataType::Timestamp(TimeUnit::Microsecond, None), + false, + ), + ])) +} + +#[derive(Debug, Clone)] +struct CommitActorRecord { + graph_commit_id: String, + actor_id: String, + created_at: i64, +} + +async fn create_commit_actor_dataset(root_uri: &str) -> Result { + let uri = graph_commit_actors_uri(root_uri); + let batch = RecordBatch::new_empty(commit_actor_schema()); + let reader = RecordBatchIterator::new(vec![Ok(batch)], commit_actor_schema()); + let params = WriteParams { + mode: WriteMode::Create, + enable_stable_row_ids: true, + data_storage_version: Some(LanceFileVersion::V2_2), + ..Default::default() + }; + match Dataset::write(reader, &uri as &str, Some(params)).await { + Ok(dataset) => Ok(dataset), + Err(err) if err.to_string().contains("Dataset already exists") => Dataset::open(&uri) + .await + .map_err(|open_err| OmniError::Lance(open_err.to_string())), + Err(err) => Err(OmniError::Lance(err.to_string())), + } +} + +fn commits_to_batch(commits: &[GraphCommit]) -> Result { + let ids: Vec<&str> = commits.iter().map(|c| c.graph_commit_id.as_str()).collect(); + let branches: Vec> = commits + .iter() + .map(|c| c.manifest_branch.as_deref()) + .collect(); + let versions: Vec = commits.iter().map(|c| c.manifest_version).collect(); + let parents: Vec> = commits + .iter() + .map(|c| c.parent_commit_id.as_deref()) + .collect(); + let merged_parents: Vec> = commits + .iter() + .map(|c| c.merged_parent_commit_id.as_deref()) + .collect(); + let created_at: Vec = commits.iter().map(|c| c.created_at).collect(); + + RecordBatch::try_new( + commit_graph_schema(), + vec![ + Arc::new(StringArray::from(ids)), + Arc::new(StringArray::from(branches)), + Arc::new(UInt64Array::from(versions)), + Arc::new(StringArray::from(parents)), + 
Arc::new(StringArray::from(merged_parents)), + Arc::new(TimestampMicrosecondArray::from(created_at)), + ], + ) + .map_err(|e| OmniError::Lance(e.to_string())) +} + +async fn load_commit_cache( + dataset: &Dataset, + actor_by_commit_id: &HashMap, +) -> Result<(HashMap, Option)> { + let batches: Vec = dataset + .scan() + .try_into_stream() + .await + .map_err(|e| OmniError::Lance(e.to_string()))? + .try_collect() + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + + let mut commits = load_commits_from_batches(&batches)?; + for commit in &mut commits { + commit.actor_id = actor_by_commit_id + .get(commit.graph_commit_id.as_str()) + .cloned(); + } + let mut commit_by_id = HashMap::with_capacity(commits.len()); + let mut head_commit = None; + for commit in commits { + if should_replace_head(head_commit.as_ref(), &commit) { + head_commit = Some(commit.clone()); + } + commit_by_id.insert(commit.graph_commit_id.clone(), commit); + } + Ok((commit_by_id, head_commit)) +} + +async fn load_commit_actor_cache(dataset: &Dataset) -> Result> { + let batches: Vec = dataset + .scan() + .try_into_stream() + .await + .map_err(|e| OmniError::Lance(e.to_string()))? 
+ .try_collect() + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + + let mut actors = HashMap::new(); + for batch in batches { + let commit_ids = string_column(&batch, "graph_commit_id", "commit actor registry")?; + let actor_ids = string_column(&batch, "actor_id", "commit actor registry")?; + for row in 0..batch.num_rows() { + actors.insert( + commit_ids.value(row).to_string(), + actor_ids.value(row).to_string(), + ); + } + } + Ok(actors) +} + +fn load_commits_from_batches(batches: &[RecordBatch]) -> Result> { + let mut commits = Vec::new(); + for batch in batches { + let ids = string_column(batch, "graph_commit_id", "commit graph")?; + let branches = string_column(batch, "manifest_branch", "commit graph")?; + let versions = u64_column(batch, "manifest_version", "commit graph")?; + let parents = string_column(batch, "parent_commit_id", "commit graph")?; + let merged_parents = string_column(batch, "merged_parent_commit_id", "commit graph")?; + let created = timestamp_micros_column(batch, "created_at", "commit graph")?; + + for row in 0..batch.num_rows() { + commits.push(GraphCommit { + graph_commit_id: ids.value(row).to_string(), + manifest_branch: if branches.is_null(row) { + None + } else { + Some(branches.value(row).to_string()) + }, + manifest_version: versions.value(row), + parent_commit_id: if parents.is_null(row) { + None + } else { + Some(parents.value(row).to_string()) + }, + merged_parent_commit_id: if merged_parents.is_null(row) { + None + } else { + Some(merged_parents.value(row).to_string()) + }, + actor_id: None, + created_at: created.value(row), + }); + } + } + Ok(commits) +} + +fn commit_actors_to_batch(records: &[CommitActorRecord]) -> Result { + let commit_ids: Vec<&str> = records + .iter() + .map(|record| record.graph_commit_id.as_str()) + .collect(); + let actor_ids: Vec<&str> = records + .iter() + .map(|record| record.actor_id.as_str()) + .collect(); + let created_at: Vec = records.iter().map(|record| record.created_at).collect(); + 
+ RecordBatch::try_new( + commit_actor_schema(), + vec![ + Arc::new(StringArray::from(commit_ids)), + Arc::new(StringArray::from(actor_ids)), + Arc::new(TimestampMicrosecondArray::from(created_at)), + ], + ) + .map_err(|e| OmniError::Lance(e.to_string())) +} + +fn should_replace_head(current: Option<&GraphCommit>, candidate: &GraphCommit) -> bool { + current.is_none_or(|existing| { + candidate + .manifest_version + .cmp(&existing.manifest_version) + .then_with(|| candidate.created_at.cmp(&existing.created_at)) + .then_with(|| candidate.graph_commit_id.cmp(&existing.graph_commit_id)) + .is_gt() + }) +} + +fn string_column<'a>(batch: &'a RecordBatch, name: &str, context: &str) -> Result<&'a StringArray> { + batch + .column_by_name(name) + .ok_or_else(|| { + OmniError::manifest_internal(format!("{context} batch missing '{name}' column")) + })? + .as_any() + .downcast_ref::() + .ok_or_else(|| { + OmniError::manifest_internal(format!("{context} column '{name}' is not Utf8")) + }) +} + +fn u64_column<'a>(batch: &'a RecordBatch, name: &str, context: &str) -> Result<&'a UInt64Array> { + batch + .column_by_name(name) + .ok_or_else(|| { + OmniError::manifest_internal(format!("{context} batch missing '{name}' column")) + })? + .as_any() + .downcast_ref::() + .ok_or_else(|| { + OmniError::manifest_internal(format!("{context} column '{name}' is not UInt64")) + }) +} + +fn timestamp_micros_column<'a>( + batch: &'a RecordBatch, + name: &str, + context: &str, +) -> Result<&'a TimestampMicrosecondArray> { + batch + .column_by_name(name) + .ok_or_else(|| { + OmniError::manifest_internal(format!("{context} batch missing '{name}' column")) + })? 
+ .as_any() + .downcast_ref::() + .ok_or_else(|| { + OmniError::manifest_internal(format!( + "{context} column '{name}' is not Timestamp(Microsecond)" + )) + }) +} + +fn ancestor_distances( + start_id: &str, + commits: &HashMap, +) -> HashMap { + let mut distances = HashMap::new(); + let mut queue = VecDeque::from([(start_id.to_string(), 0u64)]); + + while let Some((id, distance)) = queue.pop_front() { + if let Some(existing) = distances.get(&id) { + if *existing <= distance { + continue; + } + } + distances.insert(id.clone(), distance); + + if let Some(commit) = commits.get(&id) { + if let Some(parent) = &commit.parent_commit_id { + queue.push_back((parent.clone(), distance + 1)); + } + if let Some(parent) = &commit.merged_parent_commit_id { + queue.push_back((parent.clone(), distance + 1)); + } + } + } + + distances +} + +async fn open_for_branch(root_uri: &str, branch: Option<&str>) -> Result { + match branch { + Some(branch) if branch != "main" => CommitGraph::open_at_branch(root_uri, branch).await, + _ => CommitGraph::open(root_uri).await, + } +} + +fn now_micros() -> Result { + let duration = SystemTime::now() + .duration_since(UNIX_EPOCH) + .map_err(|e| OmniError::manifest(format!("system clock before UNIX_EPOCH: {}", e)))?; + Ok(duration.as_micros() as i64) +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use arrow_schema::{DataType, Field, Schema}; + + use super::*; + + #[test] + fn load_commits_from_batches_returns_error_for_bad_schema() { + let batch = RecordBatch::try_new( + Arc::new(Schema::new(vec![ + Field::new("graph_commit_id", DataType::UInt64, false), + Field::new("manifest_branch", DataType::Utf8, true), + Field::new("manifest_version", DataType::UInt64, false), + Field::new("parent_commit_id", DataType::Utf8, true), + Field::new("merged_parent_commit_id", DataType::Utf8, true), + Field::new( + "created_at", + DataType::Timestamp(TimeUnit::Microsecond, None), + false, + ), + ])), + vec![ + Arc::new(UInt64Array::from(vec![1_u64])), + 
Arc::new(StringArray::from(vec![None::<&str>])), + Arc::new(UInt64Array::from(vec![1_u64])), + Arc::new(StringArray::from(vec![None::<&str>])), + Arc::new(StringArray::from(vec![None::<&str>])), + Arc::new(TimestampMicrosecondArray::from(vec![1_i64])), + ], + ) + .unwrap(); + + let err = load_commits_from_batches(&[batch]).unwrap_err(); + assert!(err.to_string().contains("graph_commit_id")); + } +} diff --git a/crates/omnigraph/src/db/graph_coordinator.rs b/crates/omnigraph/src/db/graph_coordinator.rs new file mode 100644 index 0000000..4de6d5d --- /dev/null +++ b/crates/omnigraph/src/db/graph_coordinator.rs @@ -0,0 +1,562 @@ +use std::fmt; +use std::sync::Arc; + +use omnigraph_compiler::catalog::Catalog; + +use crate::error::{OmniError, Result}; +use crate::failpoints; +use crate::storage::{StorageAdapter, join_uri, normalize_root_uri}; + +use super::commit_graph::{CommitGraph, GraphCommit}; +use super::manifest::{ManifestCoordinator, Snapshot, SubTableUpdate}; +use super::run_registry::{RunId, RunRecord, RunRegistry, graph_runs_uri, is_internal_run_branch}; + +const GRAPH_COMMITS_DIR: &str = "_graph_commits.lance"; + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct SnapshotId(String); + +impl SnapshotId { + pub fn new(id: impl Into) -> Self { + Self(id.into()) + } + + pub fn as_str(&self) -> &str { + &self.0 + } + + pub(crate) fn synthetic(branch: Option<&str>, version: u64) -> Self { + match branch { + Some(branch) => Self(format!("manifest:{}:v{}", branch, version)), + None => Self(format!("manifest:main:v{}", version)), + } + } +} + +impl fmt::Display for SnapshotId { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + self.0.fmt(f) + } +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub enum ReadTarget { + Branch(String), + Snapshot(SnapshotId), +} + +impl ReadTarget { + pub fn branch(name: impl Into) -> Self { + Self::Branch(name.into()) + } + + pub fn snapshot(id: impl Into) -> Self { + Self::Snapshot(id.into()) + } +} + +impl From<&str> 
for ReadTarget { + fn from(value: &str) -> Self { + Self::branch(value) + } +} + +impl From for ReadTarget { + fn from(value: String) -> Self { + Self::Branch(value) + } +} + +impl From for ReadTarget { + fn from(value: SnapshotId) -> Self { + Self::Snapshot(value) + } +} + +#[derive(Debug, Clone)] +pub struct ResolvedTarget { + pub requested: ReadTarget, + pub branch: Option, + pub snapshot_id: SnapshotId, + pub snapshot: Snapshot, +} + +#[derive(Debug, Clone)] +pub(crate) struct PublishedSnapshot { + pub manifest_version: u64, + pub _snapshot_id: SnapshotId, +} + +pub struct GraphCoordinator { + root_uri: String, + storage: Arc, + manifest: ManifestCoordinator, + commit_graph: Option, + run_registry: Option, + bound_branch: Option, +} + +impl GraphCoordinator { + pub async fn init( + root_uri: &str, + catalog: &Catalog, + storage: Arc, + ) -> Result { + let root = normalize_root_uri(root_uri)?; + let manifest = ManifestCoordinator::init(&root, catalog).await?; + let commit_graph = Some(CommitGraph::init(&root, manifest.version()).await?); + Ok(Self { + root_uri: root, + storage, + manifest, + commit_graph, + run_registry: None, + bound_branch: None, + }) + } + + pub async fn open(root_uri: &str, storage: Arc) -> Result { + let root = normalize_root_uri(root_uri)?; + let manifest = ManifestCoordinator::open(&root).await?; + let commit_graph = if storage.exists(&graph_commits_uri(&root)).await? { + Some(CommitGraph::open(&root).await?) + } else { + None + }; + let run_registry = if storage.exists(&graph_runs_uri(&root)).await? { + Some(RunRegistry::open(&root).await?) 
+ } else { + None + }; + Ok(Self { + root_uri: root, + storage, + manifest, + commit_graph, + run_registry, + bound_branch: None, + }) + } + + pub async fn open_branch( + root_uri: &str, + branch: &str, + storage: Arc, + ) -> Result { + let branch = normalize_branch_name(branch)?; + let Some(branch_name) = branch else { + return Self::open(root_uri, storage).await; + }; + + let root = normalize_root_uri(root_uri)?; + let manifest = ManifestCoordinator::open_at_branch(&root, &branch_name).await?; + let commit_graph = if storage.exists(&graph_commits_uri(&root)).await? { + Some(CommitGraph::open_at_branch(&root, &branch_name).await?) + } else { + None + }; + let run_registry = if storage.exists(&graph_runs_uri(&root)).await? { + Some(RunRegistry::open(&root).await?) + } else { + None + }; + + Ok(Self { + root_uri: root, + storage, + manifest, + commit_graph, + run_registry, + bound_branch: Some(branch_name), + }) + } + + pub fn root_uri(&self) -> &str { + &self.root_uri + } + + pub fn version(&self) -> u64 { + self.manifest.version() + } + + pub fn snapshot(&self) -> Snapshot { + self.manifest.snapshot() + } + + pub fn current_branch(&self) -> Option<&str> { + self.bound_branch.as_deref() + } + + pub async fn refresh(&mut self) -> Result<()> { + self.manifest.refresh().await?; + if let Some(commit_graph) = &mut self.commit_graph { + commit_graph.refresh().await?; + } + if let Some(run_registry) = &mut self.run_registry { + let root_uri = self.root_uri.clone(); + run_registry.refresh(&root_uri).await?; + } + Ok(()) + } + + pub async fn branch_list(&self) -> Result> { + self.manifest.list_branches().await.map(|branches| { + branches + .into_iter() + .filter(|branch| !is_internal_run_branch(branch)) + .collect() + }) + } + + pub async fn branch_descendants(&self, name: &str) -> Result> { + self.manifest + .descendant_branches(name) + .await + .map(|branches| { + branches + .into_iter() + .filter(|branch| !is_internal_run_branch(branch)) + .collect() + }) + } + + pub 
async fn branch_create(&mut self, name: &str) -> Result<()> { + let branch = normalize_branch_name(name)? + .ok_or_else(|| OmniError::manifest("cannot create branch 'main'".to_string()))?; + self.ensure_commit_graph_initialized().await?; + self.manifest.create_branch(&branch).await?; + failpoints::maybe_fail("branch_create.after_manifest_branch_create")?; + if let Some(commit_graph) = &mut self.commit_graph { + commit_graph.create_branch(&branch).await?; + } + Ok(()) + } + + pub async fn branch_delete(&mut self, name: &str) -> Result<()> { + let branch = normalize_branch_name(name)? + .ok_or_else(|| OmniError::manifest("cannot delete branch 'main'".to_string()))?; + if self.current_branch() == Some(branch.as_str()) { + return Err(OmniError::manifest_conflict(format!( + "cannot delete currently active branch '{}'", + branch + ))); + } + + self.manifest.delete_branch(&branch).await?; + + if let Some(commit_graph) = &mut self.commit_graph { + commit_graph.delete_branch(&branch).await?; + } else if self + .storage + .exists(&graph_commits_uri(self.root_uri())) + .await? + { + let mut commit_graph = CommitGraph::open(self.root_uri()).await?; + commit_graph.delete_branch(&branch).await?; + } + + Ok(()) + } + + pub async fn snapshot_at_version(&self, version: u64) -> Result { + ManifestCoordinator::snapshot_at(self.root_uri(), self.current_branch(), version).await + } + + pub async fn resolve_snapshot_id(&self, branch: &str) -> Result { + let normalized = normalize_branch_name(branch)?; + let other = match normalized.as_deref() { + Some(branch) => { + GraphCoordinator::open_branch(self.root_uri(), branch, Arc::clone(&self.storage)) + .await? + } + None => GraphCoordinator::open(self.root_uri(), Arc::clone(&self.storage)).await?, + }; + + Ok(other + .head_commit_id() + .await? 
+ .unwrap_or_else(|| SnapshotId::synthetic(other.current_branch(), other.version()))) + } + + pub async fn resolve_target(&self, target: &ReadTarget) -> Result { + match target { + ReadTarget::Branch(branch) => { + let normalized = normalize_branch_name(branch)?; + let other = match normalized.as_deref() { + Some(branch) => { + GraphCoordinator::open_branch( + self.root_uri(), + branch, + Arc::clone(&self.storage), + ) + .await? + } + None => { + GraphCoordinator::open(self.root_uri(), Arc::clone(&self.storage)).await? + } + }; + let snapshot_id = other.head_commit_id().await?.unwrap_or_else(|| { + SnapshotId::synthetic(other.current_branch(), other.version()) + }); + Ok(ResolvedTarget { + requested: target.clone(), + branch: other.bound_branch.clone(), + snapshot_id, + snapshot: other.snapshot(), + }) + } + ReadTarget::Snapshot(snapshot_id) => { + let commit = self.resolve_commit(snapshot_id).await?; + let snapshot = ManifestCoordinator::snapshot_at( + self.root_uri(), + commit.manifest_branch.as_deref(), + commit.manifest_version, + ) + .await?; + Ok(ResolvedTarget { + requested: target.clone(), + branch: commit.manifest_branch.clone(), + snapshot_id: snapshot_id.clone(), + snapshot, + }) + } + } + } + + pub async fn resolve_commit(&self, snapshot_id: &SnapshotId) -> Result { + if let Some(commit_graph) = &self.commit_graph { + if let Some(commit) = commit_graph.get_commit(snapshot_id.as_str()) { + return Ok(commit); + } + } + + for branch in self.manifest.list_branches().await? { + let normalized = normalize_branch_name(&branch)?; + let Some(commit_graph) = self + .open_commit_graph_for_branch(normalized.as_deref()) + .await? 
+ else { + break; + }; + if let Some(commit) = commit_graph.get_commit(snapshot_id.as_str()) { + return Ok(commit); + } + } + + Err(OmniError::manifest_not_found(format!( + "commit '{}' not found", + snapshot_id + ))) + } + + pub(crate) async fn head_commit_id(&self) -> Result> { + match &self.commit_graph { + Some(commit_graph) => commit_graph + .head_commit_id() + .await + .map(|id| id.map(SnapshotId::new)), + None => Ok(None), + } + } + + pub(crate) async fn ensure_commit_graph_initialized(&mut self) -> Result<()> { + if self.commit_graph.is_some() { + return Ok(()); + } + if !self + .storage + .exists(&graph_commits_uri(self.root_uri())) + .await? + { + let _ = CommitGraph::init(self.root_uri(), self.manifest.version()).await?; + } + self.commit_graph = match self.current_branch() { + Some(branch) => Some(CommitGraph::open_at_branch(self.root_uri(), branch).await?), + None => Some(CommitGraph::open(self.root_uri()).await?), + }; + Ok(()) + } + + pub(crate) async fn ensure_run_registry_initialized(&mut self) -> Result<()> { + if self.run_registry.is_some() { + return Ok(()); + } + if !self + .storage + .exists(&graph_runs_uri(self.root_uri())) + .await? 
+ { + let _ = RunRegistry::init(self.root_uri()).await?; + } + self.run_registry = Some(RunRegistry::open(self.root_uri()).await?); + Ok(()) + } + + pub(crate) async fn commit_updates_with_actor( + &mut self, + updates: &[SubTableUpdate], + actor_id: Option<&str>, + ) -> Result { + let manifest_version = self.commit_manifest_updates(updates).await?; + let snapshot_id = self.record_graph_commit(manifest_version, actor_id).await?; + Ok(PublishedSnapshot { + manifest_version, + _snapshot_id: snapshot_id, + }) + } + + pub(crate) async fn commit_manifest_updates( + &mut self, + updates: &[SubTableUpdate], + ) -> Result { + let manifest_version = self.manifest.commit(updates).await?; + failpoints::maybe_fail("graph_publish.after_manifest_commit")?; + Ok(manifest_version) + } + + pub(crate) async fn record_graph_commit( + &mut self, + manifest_version: u64, + actor_id: Option<&str>, + ) -> Result { + self.ensure_commit_graph_initialized().await?; + let current_branch = self.current_branch().map(str::to_string); + let Some(commit_graph) = &mut self.commit_graph else { + return Ok(SnapshotId::synthetic( + current_branch.as_deref(), + manifest_version, + )); + }; + failpoints::maybe_fail("graph_publish.before_commit_append")?; + let graph_commit_id = commit_graph + .append_commit(current_branch.as_deref(), manifest_version, actor_id) + .await?; + Ok(SnapshotId::new(graph_commit_id)) + } + + pub(crate) async fn record_merge_commit( + &mut self, + manifest_version: u64, + parent_commit_id: &str, + merged_parent_commit_id: &str, + actor_id: Option<&str>, + ) -> Result { + self.ensure_commit_graph_initialized().await?; + let current_branch = self.current_branch().map(str::to_string); + let commit_graph = self.commit_graph.as_mut().ok_or_else(|| { + OmniError::manifest("branch merge requires _graph_commits.lance".to_string()) + })?; + failpoints::maybe_fail("graph_publish.before_commit_append")?; + let graph_commit_id = commit_graph + .append_merge_commit( + 
current_branch.as_deref(), + manifest_version, + parent_commit_id, + merged_parent_commit_id, + actor_id, + ) + .await?; + Ok(SnapshotId::new(graph_commit_id)) + } + + async fn open_commit_graph_for_branch( + &self, + branch: Option<&str>, + ) -> Result> { + if !self + .storage + .exists(&graph_commits_uri(self.root_uri())) + .await? + { + return Ok(None); + } + let graph = match branch { + Some(branch) => CommitGraph::open_at_branch(self.root_uri(), branch).await?, + None => CommitGraph::open(self.root_uri()).await?, + }; + Ok(Some(graph)) + } + + pub(crate) async fn append_run_record(&mut self, record: &RunRecord) -> Result<()> { + self.ensure_run_registry_initialized().await?; + let Some(run_registry) = &mut self.run_registry else { + return Err(OmniError::manifest( + "run registry not initialized".to_string(), + )); + }; + run_registry.append_record(record).await + } + + pub(crate) async fn get_run(&self, run_id: &RunId) -> Result { + if let Some(run_registry) = &self.run_registry { + if let Some(run) = run_registry.get_run(run_id).await? { + return Ok(run); + } + } + if !self + .storage + .exists(&graph_runs_uri(self.root_uri())) + .await? + { + return Err(OmniError::manifest_not_found(format!( + "run '{}' not found", + run_id + ))); + } + let run_registry = RunRegistry::open(self.root_uri()).await?; + run_registry + .get_run(run_id) + .await? + .ok_or_else(|| OmniError::manifest_not_found(format!("run '{}' not found", run_id))) + } + + pub(crate) async fn list_runs(&self) -> Result> { + if let Some(run_registry) = &self.run_registry { + return run_registry.list_runs().await; + } + if !self + .storage + .exists(&graph_runs_uri(self.root_uri())) + .await? 
+ { + return Ok(Vec::new()); + } + let run_registry = RunRegistry::open(self.root_uri()).await?; + run_registry.list_runs().await + } + + pub(crate) async fn list_commits(&self) -> Result> { + if let Some(commit_graph) = &self.commit_graph { + return commit_graph.load_commits().await; + } + if !self + .storage + .exists(&graph_commits_uri(self.root_uri())) + .await? + { + return Ok(Vec::new()); + } + let commit_graph = match self.current_branch() { + Some(branch) => CommitGraph::open_at_branch(self.root_uri(), branch).await?, + None => CommitGraph::open(self.root_uri()).await?, + }; + commit_graph.load_commits().await + } +} + +fn graph_commits_uri(root_uri: &str) -> String { + join_uri(root_uri, GRAPH_COMMITS_DIR) +} + +fn normalize_branch_name(branch: &str) -> Result> { + let branch = branch.trim(); + if branch.is_empty() { + return Err(OmniError::manifest( + "branch name cannot be empty".to_string(), + )); + } + if branch == "main" { + return Ok(None); + } + Ok(Some(branch.to_string())) +} diff --git a/crates/omnigraph/src/db/manifest.rs b/crates/omnigraph/src/db/manifest.rs new file mode 100644 index 0000000..7d7dd45 --- /dev/null +++ b/crates/omnigraph/src/db/manifest.rs @@ -0,0 +1,339 @@ +use std::collections::{HashMap, HashSet}; +use std::sync::Arc; + +use crate::error::{OmniError, Result}; +use lance::Dataset; +use lance_namespace::models::CreateTableVersionRequest; +use omnigraph_compiler::catalog::Catalog; + +#[path = "manifest/layout.rs"] +mod layout; +#[path = "manifest/metadata.rs"] +mod metadata; +#[path = "manifest/namespace.rs"] +mod namespace; +#[path = "manifest/publisher.rs"] +mod publisher; +#[path = "manifest/repo.rs"] +mod repo; +#[path = "manifest/state.rs"] +mod state; + +use layout::{manifest_uri, open_manifest_dataset}; +pub(crate) use metadata::TableVersionMetadata; +#[cfg(test)] +use metadata::{OMNIGRAPH_ROW_COUNT_KEY, table_version_metadata_for_state}; +use namespace::open_table_at_version_from_manifest; +pub(crate) use 
namespace::open_table_head_for_write; +#[cfg(test)] +use namespace::{branch_manifest_namespace, staged_table_namespace}; +use publisher::{GraphNamespacePublisher, ManifestBatchPublisher}; +use repo::{init_manifest_repo, open_manifest_repo, snapshot_state_at}; +pub use state::SubTableEntry; +#[cfg(test)] +use state::string_column; +use state::{ManifestState, read_manifest_state}; + +const OBJECT_TYPE_TABLE: &str = "table"; +const OBJECT_TYPE_TABLE_VERSION: &str = "table_version"; +const TABLE_VERSION_MANAGEMENT_KEY: &str = "table_version_management"; + +/// Immutable point-in-time view of the database. +/// +/// Cheap to create (no storage I/O). All reads within a query go through one +/// Snapshot to guarantee cross-type consistency. +#[derive(Debug, Clone)] +pub struct Snapshot { + root_uri: String, + version: u64, + entries: HashMap, +} + +impl Snapshot { + /// Open a sub-table dataset at its pinned version. + pub async fn open(&self, table_key: &str) -> Result { + let entry = self + .entries + .get(table_key) + .ok_or_else(|| OmniError::manifest(format!("no manifest entry for {}", table_key)))?; + entry.open(&self.root_uri).await + } + + /// Manifest version this snapshot was taken from. + pub fn version(&self) -> u64 { + self.version + } + + /// Look up a sub-table entry by key. 
+ pub fn entry(&self, table_key: &str) -> Option<&SubTableEntry> { + self.entries.get(table_key) + } + + pub fn entries(&self) -> impl Iterator { + self.entries.values() + } +} + +impl SubTableUpdate { + pub(crate) fn to_create_table_version_request(&self) -> CreateTableVersionRequest { + self.version_metadata.to_create_table_version_request( + &self.table_key, + self.table_version, + self.row_count, + self.table_branch.as_deref(), + ) + } +} + +impl SubTableEntry { + pub(crate) async fn open(&self, root_uri: &str) -> Result { + open_table_at_version_from_manifest( + root_uri, + &self.table_key, + self.table_branch.as_deref(), + self.table_version, + ) + .await + } +} + +/// An update to apply to the manifest via `commit`. +#[derive(Debug, Clone)] +pub struct SubTableUpdate { + pub table_key: String, + pub table_version: u64, + pub table_branch: Option, + pub row_count: u64, + pub(crate) version_metadata: TableVersionMetadata, +} + +/// Coordinates cross-dataset state through the namespace `__manifest` table. +/// +/// Table rows register stable metadata such as location. Append-only +/// `table_version` rows are the graph publish boundary and reconstruct the +/// current graph snapshot by selecting the latest visible version row per +/// sub-table. 
+pub struct ManifestCoordinator { + root_uri: String, + dataset: Dataset, + known_state: ManifestState, + active_branch: Option, + publisher: Arc, +} + +impl ManifestCoordinator { + fn default_batch_publisher( + root_uri: &str, + active_branch: Option<&str>, + ) -> Arc { + Arc::new(GraphNamespacePublisher::new(root_uri, active_branch)) + } + + fn from_parts( + root_uri: &str, + dataset: Dataset, + known_state: ManifestState, + active_branch: Option, + publisher: Arc, + ) -> Self { + Self { + root_uri: root_uri.trim_end_matches('/').to_string(), + dataset, + known_state, + active_branch, + publisher, + } + } + + fn from_parts_with_default_publisher( + root_uri: &str, + dataset: Dataset, + known_state: ManifestState, + active_branch: Option, + ) -> Self { + let publisher = Self::default_batch_publisher(root_uri, active_branch.as_deref()); + Self::from_parts(root_uri, dataset, known_state, active_branch, publisher) + } + + fn snapshot_from_state(root_uri: &str, state: ManifestState) -> Snapshot { + Snapshot { + root_uri: root_uri.trim_end_matches('/').to_string(), + version: state.version, + entries: state + .entries + .into_iter() + .map(|entry| (entry.table_key.clone(), entry)) + .collect(), + } + } + + #[cfg(test)] + fn with_batch_publisher(mut self, publisher: Arc) -> Self { + self.publisher = publisher; + self + } + + /// Create a new repo at `root_uri` from a catalog. + /// + /// Creates per-type Lance datasets and the namespace `__manifest` table. + pub async fn init(root_uri: &str, catalog: &Catalog) -> Result { + let root = root_uri.trim_end_matches('/'); + let (dataset, known_state) = init_manifest_repo(root, catalog).await?; + + Ok(Self::from_parts_with_default_publisher( + root, + dataset, + known_state, + None, + )) + } + + /// Open an existing repo's manifest. 
+ pub async fn open(root_uri: &str) -> Result { + let root = root_uri.trim_end_matches('/'); + let (dataset, known_state) = open_manifest_repo(root, None).await?; + Ok(Self::from_parts_with_default_publisher( + root, + dataset, + known_state, + None, + )) + } + + /// Open an existing repo's manifest at a specific branch. + pub async fn open_at_branch(root_uri: &str, branch: &str) -> Result { + if branch == "main" { + return Self::open(root_uri).await; + } + + let root = root_uri.trim_end_matches('/'); + let (dataset, known_state) = open_manifest_repo(root, Some(branch)).await?; + Ok(Self::from_parts_with_default_publisher( + root, + dataset, + known_state, + Some(branch.to_string()), + )) + } + + pub async fn snapshot_at( + root_uri: &str, + branch: Option<&str>, + version: u64, + ) -> Result { + let root = root_uri.trim_end_matches('/'); + Ok(Self::snapshot_from_state( + root, + snapshot_state_at(root, branch, version).await?, + )) + } + + /// Return a Snapshot from the known manifest state. No storage I/O. + pub fn snapshot(&self) -> Snapshot { + Self::snapshot_from_state(&self.root_uri, self.known_state.clone()) + } + + /// Re-read manifest from storage to see other writers' commits. + pub async fn refresh(&mut self) -> Result<()> { + self.dataset = open_manifest_dataset(&self.root_uri, self.active_branch.as_deref()).await?; + self.known_state = read_manifest_state(&self.dataset).await?; + Ok(()) + } + + /// Commit updated sub-table versions to the manifest. + /// + /// Atomically inserts one immutable `table_version` row per updated table. + /// The merge-insert commit on `__manifest` is the graph-level publish point. + pub async fn commit(&mut self, updates: &[SubTableUpdate]) -> Result { + if updates.is_empty() { + return Ok(self.version()); + } + + self.dataset = self.publisher.publish(updates).await?; + + self.known_state = read_manifest_state(&self.dataset).await?; + Ok(self.version()) + } + + /// Current manifest version. 
+ pub fn version(&self) -> u64 { + self.dataset.version().version + } + + pub fn active_branch(&self) -> Option<&str> { + self.active_branch.as_deref() + } + + pub async fn create_branch(&mut self, name: &str) -> Result<()> { + let mut ds = self.dataset.clone(); + ds.create_branch(name, self.version(), None) + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + Ok(()) + } + + pub async fn delete_branch(&mut self, name: &str) -> Result<()> { + let uri = manifest_uri(&self.root_uri); + let mut ds = Dataset::open(&uri) + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + ds.delete_branch(name) + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + self.dataset = open_manifest_dataset(&self.root_uri, self.active_branch.as_deref()).await?; + self.known_state = read_manifest_state(&self.dataset).await?; + Ok(()) + } + + pub async fn list_branches(&self) -> Result> { + let branches = self + .dataset + .list_branches() + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + let mut names: Vec = branches.into_keys().filter(|name| name != "main").collect(); + names.sort(); + let mut all = vec!["main".to_string()]; + all.extend(names); + Ok(all) + } + + pub async fn descendant_branches(&self, name: &str) -> Result> { + let branches = self + .dataset + .list_branches() + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + let mut frontier = vec![name.to_string()]; + let mut descendants = Vec::new(); + let mut seen = HashSet::new(); + + while let Some(parent) = frontier.pop() { + let mut children = branches + .iter() + .filter_map(|(branch, contents)| { + (contents.parent_branch.as_deref() == Some(parent.as_str())) + .then_some(branch.clone()) + }) + .collect::>(); + children.sort(); + for child in children { + if seen.insert(child.clone()) { + frontier.push(child.clone()); + descendants.push(child); + } + } + } + + Ok(descendants) + } + + /// Root URI of the repo. 
+ pub fn root_uri(&self) -> &str { + &self.root_uri + } +} + +#[cfg(test)] +#[path = "manifest/tests.rs"] +mod tests; diff --git a/crates/omnigraph/src/db/manifest/layout.rs b/crates/omnigraph/src/db/manifest/layout.rs new file mode 100644 index 0000000..9a4fca3 --- /dev/null +++ b/crates/omnigraph/src/db/manifest/layout.rs @@ -0,0 +1,74 @@ +use lance::Dataset; +use lance_namespace::Error as LanceNamespaceError; + +use crate::error::{OmniError, Result}; +use crate::storage::{StorageKind, join_uri, storage_kind_for_uri}; + +const MANIFEST_DIR: &str = "__manifest"; + +pub(super) fn type_name_hash(name: &str) -> String { + let mut h: u64 = 0xcbf29ce484222325; + for byte in name.as_bytes() { + h ^= *byte as u64; + h = h.wrapping_mul(0x100000001b3); + } + format!("{:016x}", h) +} + +pub(super) fn manifest_uri(root: &str) -> String { + format!("{}/{}", root.trim_end_matches('/'), MANIFEST_DIR) +} + +pub(super) async fn open_manifest_dataset(root_uri: &str, branch: Option<&str>) -> Result { + let dataset = Dataset::open(&manifest_uri(root_uri.trim_end_matches('/'))) + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + match branch { + Some(branch) if branch != "main" => dataset + .checkout_branch(branch) + .await + .map_err(|e| OmniError::Lance(e.to_string())), + _ => Ok(dataset), + } +} + +fn format_table_version(version: u64) -> String { + format!("{version:020}") +} + +pub(super) fn version_object_id(table_key: &str, version: u64) -> String { + format!("{}${}", table_key, format_table_version(version)) +} + +pub(super) fn table_id_to_key(request_id: Option<&Vec>) -> lance_namespace::Result { + match request_id { + Some(request_id) if request_id.len() == 1 && !request_id[0].is_empty() => { + Ok(request_id[0].clone()) + } + Some(request_id) => Err(LanceNamespaceError::invalid_input(format!( + "expected single table id component, got {:?}", + request_id + ))), + None => Err(LanceNamespaceError::invalid_input("table id is required")), + } +} + +pub(super) fn 
table_uri_for_path(root_uri: &str, table_path: &str, branch: Option<&str>) -> String { + let mut dataset_location = join_uri(root_uri, table_path); + if let Some(branch) = branch.filter(|branch| *branch != "main") { + dataset_location = join_uri(&dataset_location, "tree"); + for segment in branch.split('/') { + dataset_location = join_uri(&dataset_location, segment); + } + } + match storage_kind_for_uri(root_uri) { + StorageKind::Local => url::Url::from_file_path(&dataset_location) + .map(|uri| uri.to_string()) + .unwrap_or(dataset_location), + StorageKind::S3 => dataset_location, + } +} + +pub(super) fn namespace_internal_error(message: impl Into) -> LanceNamespaceError { + LanceNamespaceError::namespace_source(Box::new(std::io::Error::other(message.into()))) +} diff --git a/crates/omnigraph/src/db/manifest/metadata.rs b/crates/omnigraph/src/db/manifest/metadata.rs new file mode 100644 index 0000000..0bf14b6 --- /dev/null +++ b/crates/omnigraph/src/db/manifest/metadata.rs @@ -0,0 +1,244 @@ +use std::collections::HashMap; + +use lance::Dataset; +use lance_namespace::Error as LanceNamespaceError; +use lance_namespace::models::{CreateTableVersionRequest, TableVersion}; +use serde::{Deserialize, Serialize}; + +use crate::error::{OmniError, Result}; +use crate::storage::{StorageKind, join_uri, storage_kind_for_uri}; + +use super::layout::table_id_to_key; + +pub(super) const OMNIGRAPH_ROW_COUNT_KEY: &str = "omnigraph.row_count"; +const OMNIGRAPH_TABLE_BRANCH_KEY: &str = "omnigraph.table_branch"; + +pub(super) fn namespace_version_metadata( + row_count: u64, + table_branch: Option<&str>, +) -> HashMap { + let mut metadata = + HashMap::from([(OMNIGRAPH_ROW_COUNT_KEY.to_string(), row_count.to_string())]); + if let Some(table_branch) = table_branch { + metadata.insert( + OMNIGRAPH_TABLE_BRANCH_KEY.to_string(), + table_branch.to_string(), + ); + } + metadata +} + +pub(super) fn parse_namespace_version_request( + request: &CreateTableVersionRequest, +) -> 
lance_namespace::Result<(String, u64, u64, Option, TableVersionMetadata)> { + let table_key = table_id_to_key(request.id.as_ref())?; + let version = u64::try_from(request.version) + .map_err(|_| LanceNamespaceError::invalid_input("table version must be non-negative"))?; + let metadata = request.metadata.as_ref().ok_or_else(|| { + LanceNamespaceError::invalid_input("version metadata is required for Omnigraph rows") + })?; + let row_count = metadata + .get(OMNIGRAPH_ROW_COUNT_KEY) + .ok_or_else(|| { + LanceNamespaceError::invalid_input("missing omnigraph.row_count in metadata") + })? + .parse::() + .map_err(|e| { + LanceNamespaceError::invalid_input(format!("invalid omnigraph.row_count value: {}", e)) + })?; + let table_branch = metadata.get(OMNIGRAPH_TABLE_BRANCH_KEY).cloned(); + let version_metadata = TableVersionMetadata { + manifest_path: request.manifest_path.clone(), + manifest_size: request.manifest_size.map(|size| size as u64), + e_tag: request.e_tag.clone(), + naming_scheme: request.naming_scheme.clone(), + }; + + Ok(( + table_key, + version, + row_count, + table_branch, + version_metadata, + )) +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub(crate) struct TableVersionMetadata { + manifest_path: String, + manifest_size: Option, + e_tag: Option, + naming_scheme: Option, +} + +impl TableVersionMetadata { + pub(crate) fn from_dataset( + root_uri: &str, + table_path: &str, + dataset: &Dataset, + ) -> Result { + Ok(Self { + manifest_path: full_manifest_object_store_path( + root_uri, + table_path, + &dataset.manifest_location().path.to_string(), + )?, + manifest_size: dataset.manifest_location().size, + e_tag: dataset.manifest_location().e_tag.clone(), + naming_scheme: Some(format!("{:?}", dataset.manifest_location().naming_scheme)), + }) + } + + pub(super) fn from_json_str(value: &str) -> Result { + serde_json::from_str(value).map_err(|e| { + OmniError::manifest_internal(format!("failed to decode manifest metadata: {e}")) + }) + } + + 
pub(super) fn to_json_string(&self) -> Result { + serde_json::to_string(self).map_err(|e| { + OmniError::manifest_internal(format!("failed to encode manifest metadata: {e}")) + }) + } + + #[cfg(test)] + pub(crate) fn manifest_path(&self) -> &str { + &self.manifest_path + } + + #[cfg(test)] + pub(crate) fn manifest_size(&self) -> Option { + self.manifest_size + } + + #[cfg(test)] + pub(crate) fn e_tag(&self) -> Option<&str> { + self.e_tag.as_deref() + } + + #[cfg(test)] + pub(crate) fn naming_scheme(&self) -> Option<&str> { + self.naming_scheme.as_deref() + } + + pub(crate) fn to_create_table_version_request( + &self, + table_key: &str, + table_version: u64, + row_count: u64, + table_branch: Option<&str>, + ) -> CreateTableVersionRequest { + let mut request = + CreateTableVersionRequest::new(table_version as i64, self.manifest_path.clone()); + request.id = Some(vec![table_key.to_string()]); + request.manifest_size = self.manifest_size.map(|size| size as i64); + request.e_tag = self.e_tag.clone(); + request.naming_scheme = self.naming_scheme.clone(); + request.metadata = Some(namespace_version_metadata(row_count, table_branch)); + request + } + + pub(super) fn to_namespace_version(&self, version: u64) -> TableVersion { + self.to_namespace_version_with_details(version, None, None) + } + + pub(super) fn to_namespace_version_with_details( + &self, + version: u64, + timestamp_millis: Option, + metadata: Option>, + ) -> TableVersion { + let mut metadata = metadata.unwrap_or_default(); + if let Some(naming_scheme) = &self.naming_scheme { + metadata.insert("naming_scheme".to_string(), naming_scheme.clone()); + } + + TableVersion { + version: version as i64, + manifest_path: self.manifest_path.clone(), + manifest_size: self.manifest_size.map(|size| size as i64), + e_tag: self.e_tag.clone(), + timestamp_millis, + metadata: (!metadata.is_empty()).then_some(metadata), + } + } +} + +fn object_store_path_from_uri(uri: &str) -> Result { + match storage_kind_for_uri(uri) { + 
StorageKind::Local => { + if uri.strip_prefix("file://").is_some() { + let path = url::Url::parse(uri) + .map_err(|e| { + OmniError::manifest_internal(format!("invalid file uri '{}': {}", uri, e)) + })? + .to_file_path() + .map_err(|_| { + OmniError::manifest_internal(format!("invalid file uri '{}'", uri)) + })?; + Ok(path.to_string_lossy().to_string()) + } else { + Ok(uri.to_string()) + } + } + StorageKind::S3 => { + let url = url::Url::parse(uri).map_err(|e| { + OmniError::manifest_internal(format!("invalid s3 uri '{}': {}", uri, e)) + })?; + Ok(url.path().trim_start_matches('/').to_string()) + } + } +} + +fn full_manifest_object_store_path( + root_uri: &str, + table_path: &str, + manifest_path: &str, +) -> Result { + if manifest_path.contains("://") { + return object_store_path_from_uri(manifest_path); + } + + if manifest_path.contains(table_path) { + return Ok(manifest_path.to_string()); + } + + let dataset_uri = join_uri(root_uri, table_path); + let dataset_path = object_store_path_from_uri(&dataset_uri)?; + let manifest_path = manifest_path.trim_start_matches('/'); + + if manifest_path.is_empty() { + return Ok(dataset_path); + } + + Ok(format!( + "{}/{}", + dataset_path.trim_end_matches('/'), + manifest_path + )) +} + +#[cfg(test)] +pub(super) async fn table_version_metadata_for_state( + root_uri: &str, + table_path: &str, + branch: Option<&str>, + version: u64, +) -> Result { + let full_path = format!("{}/{}", root_uri.trim_end_matches('/'), table_path); + let ds = Dataset::open(&full_path) + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + let ds = match branch { + Some(branch) => ds + .checkout_branch(branch) + .await + .map_err(|e| OmniError::Lance(e.to_string()))?, + None => ds, + }; + let ds = ds + .checkout_version(version) + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + TableVersionMetadata::from_dataset(root_uri, table_path, &ds) +} diff --git a/crates/omnigraph/src/db/manifest/namespace.rs 
b/crates/omnigraph/src/db/manifest/namespace.rs new file mode 100644 index 0000000..724b3e5 --- /dev/null +++ b/crates/omnigraph/src/db/manifest/namespace.rs @@ -0,0 +1,549 @@ +use std::sync::Arc; + +use async_trait::async_trait; +use lance::Dataset; +use lance::dataset::builder::DatasetBuilder; +use lance_namespace::models::{ + CreateTableVersionRequest, CreateTableVersionResponse, DescribeTableRequest, + DescribeTableResponse, DescribeTableVersionRequest, DescribeTableVersionResponse, + ListTableVersionsRequest, ListTableVersionsResponse, TableExistsRequest, TableVersion, +}; +use lance_namespace::{Error as LanceNamespaceError, LanceNamespace, NamespaceError}; +use lance_table::io::commit::ManifestNamingScheme; +use object_store::{Error as ObjectStoreError, ObjectStore as _, PutMode, PutOptions, path::Path}; + +use crate::error::{OmniError, Result}; + +use super::layout::{ + namespace_internal_error, open_manifest_dataset, table_id_to_key, table_uri_for_path, +}; +use super::metadata::{ + TableVersionMetadata, namespace_version_metadata, parse_namespace_version_request, +}; +use super::publisher::GraphNamespacePublisher; +use super::state::{ManifestState, SubTableEntry, read_manifest_entries, read_manifest_state}; + +#[derive(Debug, Clone)] +struct BranchManifestNamespace { + root_uri: String, + branch: Option, +} + +impl BranchManifestNamespace { + fn new(root_uri: &str, branch: Option<&str>) -> Self { + Self { + root_uri: root_uri.trim_end_matches('/').to_string(), + branch: branch + .filter(|branch| *branch != "main") + .map(ToOwned::to_owned), + } + } + + async fn dataset(&self) -> Result { + open_manifest_dataset(&self.root_uri, self.branch.as_deref()).await + } + + async fn state(&self) -> Result { + let dataset = self.dataset().await?; + read_manifest_state(&dataset).await + } + + async fn version_entries(&self) -> Result> { + let dataset = self.dataset().await?; + read_manifest_entries(&dataset).await + } +} + +#[derive(Debug, Clone)] +struct 
StagedTableNamespace { + root_uri: String, + table_id: Vec, + table_path: String, + branch: Option, +} + +impl StagedTableNamespace { + fn new(root_uri: &str, table_key: &str, table_path: &str, branch: Option<&str>) -> Self { + Self { + root_uri: root_uri.trim_end_matches('/').to_string(), + table_id: vec![table_key.to_string()], + table_path: table_path.to_string(), + branch: branch + .filter(|branch| *branch != "main") + .map(ToOwned::to_owned), + } + } + + fn table_key(&self) -> &str { + &self.table_id[0] + } + + fn table_uri(&self) -> String { + table_uri_for_path(&self.root_uri, &self.table_path, self.branch.as_deref()) + } + + fn ensure_request_table( + &self, + request_id: Option<&Vec>, + ) -> lance_namespace::Result<()> { + match request_id { + Some(request_id) if request_id == &self.table_id => Ok(()), + Some(request_id) => Err(LanceNamespaceError::namespace_source(Box::new( + NamespaceError::TableNotFound { + message: format!("table {:?} not found", request_id), + }, + ))), + None => Err(LanceNamespaceError::invalid_input("table id is required")), + } + } + + async fn open_head(&self) -> Result { + Dataset::open(&self.table_uri()) + .await + .map_err(|e| OmniError::Lance(e.to_string())) + } + + async fn open_version(&self, version: u64) -> Result { + let ds = self.open_head().await?; + if ds.version().version == version { + Ok(ds) + } else { + ds.checkout_version(version) + .await + .map_err(|e| OmniError::Lance(e.to_string())) + } + } + + fn to_table_version( + &self, + dataset: &Dataset, + version: &lance::dataset::Version, + ) -> Result { + let metadata = + TableVersionMetadata::from_dataset(&self.root_uri, &self.table_path, dataset)?; + Ok(metadata.to_namespace_version_with_details( + version.version, + Some(version.timestamp.timestamp_millis()), + Some( + version + .metadata + .iter() + .map(|(k, v)| (k.clone(), v.clone())) + .collect(), + ), + )) + } +} + +pub(crate) fn branch_manifest_namespace( + root_uri: &str, + branch: Option<&str>, +) -> Arc { 
+ Arc::new(BranchManifestNamespace::new(root_uri, branch)) +} + +pub(crate) fn staged_table_namespace( + root_uri: &str, + table_key: &str, + table_path: &str, + branch: Option<&str>, +) -> Arc { + Arc::new(StagedTableNamespace::new( + root_uri, table_key, table_path, branch, + )) +} + +async fn load_table_from_namespace( + namespace: Arc, + table_key: &str, + branch: Option<&str>, + version: Option, +) -> Result { + let builder = DatasetBuilder::from_namespace(namespace, vec![table_key.to_string()]) + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + let builder = match (branch, version) { + (Some(branch), version) => builder.with_branch(branch, version), + (None, Some(version)) => builder.with_version(version), + (None, None) => builder, + }; + builder + .load() + .await + .map_err(|e| OmniError::Lance(e.to_string())) +} + +pub(crate) async fn open_table_at_version_from_manifest( + root_uri: &str, + table_key: &str, + branch: Option<&str>, + version: u64, +) -> Result { + load_table_from_namespace( + branch_manifest_namespace(root_uri, branch), + table_key, + branch, + Some(version), + ) + .await +} + +#[async_trait] +impl LanceNamespace for BranchManifestNamespace { + fn namespace_id(&self) -> String { + "__manifest".to_string() + } + + async fn describe_table( + &self, + request: DescribeTableRequest, + ) -> lance_namespace::Result { + let table_key = table_id_to_key(request.id.as_ref())?; + let state = self + .state() + .await + .map_err(|e| LanceNamespaceError::namespace_source(Box::new(e)))?; + let entry = state + .entries + .into_iter() + .find(|entry| entry.table_key == table_key); + let entry = entry.ok_or_else(|| { + LanceNamespaceError::namespace_source(Box::new(NamespaceError::TableNotFound { + message: format!("table {} not found", table_key), + })) + })?; + let table_uri = table_uri_for_path( + &self.root_uri, + &entry.table_path, + entry.table_branch.as_deref(), + ); + + Ok(DescribeTableResponse { + table: Some(entry.table_key.clone()), + 
namespace: Some(Vec::new()), + version: Some(entry.table_version as i64), + location: Some(table_uri.clone()), + table_uri: request.with_table_uri.unwrap_or(false).then_some(table_uri), + schema: None, + storage_options: None, + stats: None, + metadata: None, + properties: None, + managed_versioning: Some(true), + }) + } + + async fn table_exists(&self, request: TableExistsRequest) -> lance_namespace::Result<()> { + let table_key = table_id_to_key(request.id.as_ref())?; + let state = self + .state() + .await + .map_err(|e| LanceNamespaceError::namespace_source(Box::new(e)))?; + if state + .entries + .iter() + .any(|entry| entry.table_key == table_key) + { + Ok(()) + } else { + Err(LanceNamespaceError::namespace_source(Box::new( + NamespaceError::TableNotFound { + message: format!("table {} not found", table_key), + }, + ))) + } + } + + async fn list_table_versions( + &self, + request: ListTableVersionsRequest, + ) -> lance_namespace::Result { + let table_key = table_id_to_key(request.id.as_ref())?; + let mut versions: Vec = self + .version_entries() + .await + .map_err(|e| LanceNamespaceError::namespace_source(Box::new(e)))? 
+ .into_iter() + .filter(|entry| entry.table_key == table_key) + .map(|entry| { + entry + .version_metadata + .to_namespace_version(entry.table_version) + }) + .collect(); + + if request.descending.unwrap_or(false) { + versions.sort_by(|a, b| b.version.cmp(&a.version)); + } else { + versions.sort_by(|a, b| a.version.cmp(&b.version)); + } + if let Some(limit) = request.limit { + versions.truncate(limit as usize); + } + + Ok(ListTableVersionsResponse { + versions, + page_token: None, + }) + } + + async fn describe_table_version( + &self, + request: DescribeTableVersionRequest, + ) -> lance_namespace::Result { + let table_key = table_id_to_key(request.id.as_ref())?; + let version = request + .version + .ok_or_else(|| LanceNamespaceError::invalid_input("table version is required"))?; + let version = u64::try_from(version).map_err(|_| { + LanceNamespaceError::invalid_input("table version must be non-negative") + })?; + let entry = self + .version_entries() + .await + .map_err(|e| LanceNamespaceError::namespace_source(Box::new(e)))? 
+ .into_iter() + .find(|entry| entry.table_key == table_key && entry.table_version == version) + .ok_or_else(|| { + LanceNamespaceError::namespace_source(Box::new( + NamespaceError::TableVersionNotFound { + message: format!("table version {} not found for {}", version, table_key), + }, + )) + })?; + + Ok(DescribeTableVersionResponse::new( + entry + .version_metadata + .to_namespace_version(entry.table_version), + )) + } + + async fn create_table_version( + &self, + request: CreateTableVersionRequest, + ) -> lance_namespace::Result { + let (table_key, table_version, row_count, table_branch, version_metadata) = + parse_namespace_version_request(&request)?; + GraphNamespacePublisher::new(&self.root_uri, self.branch.as_deref()) + .publish_requests(std::slice::from_ref(&request)) + .await + .map_err(|e| LanceNamespaceError::namespace_source(Box::new(e)))?; + let mut response = CreateTableVersionResponse::new(); + response.version = Some(Box::new( + version_metadata.to_namespace_version_with_details( + table_version, + None, + Some(namespace_version_metadata( + row_count, + table_branch.as_deref(), + )), + ), + )); + let _ = table_key; + Ok(response) + } +} + +#[async_trait] +impl LanceNamespace for StagedTableNamespace { + fn namespace_id(&self) -> String { + "__manifest".to_string() + } + + async fn describe_table( + &self, + request: DescribeTableRequest, + ) -> lance_namespace::Result { + self.ensure_request_table(request.id.as_ref())?; + let ds = self + .open_head() + .await + .map_err(|e| namespace_internal_error(e.to_string()))?; + let table_uri = self.table_uri(); + Ok(DescribeTableResponse { + table: Some(self.table_key().to_string()), + namespace: Some(Vec::new()), + version: Some(ds.version().version as i64), + location: Some(table_uri.clone()), + table_uri: request.with_table_uri.unwrap_or(false).then_some(table_uri), + schema: None, + storage_options: None, + stats: None, + metadata: None, + properties: None, + managed_versioning: Some(true), + }) + } + + 
async fn table_exists(&self, request: TableExistsRequest) -> lance_namespace::Result<()> { + self.ensure_request_table(request.id.as_ref())?; + self.open_head() + .await + .map(|_| ()) + .map_err(|e| LanceNamespaceError::namespace_source(Box::new(e))) + } + + async fn list_table_versions( + &self, + request: ListTableVersionsRequest, + ) -> lance_namespace::Result { + self.ensure_request_table(request.id.as_ref())?; + if request.limit == Some(0) { + return Ok(ListTableVersionsResponse { + versions: Vec::new(), + page_token: None, + }); + } + let head = self + .open_head() + .await + .map_err(|e| namespace_internal_error(e.to_string()))?; + let dataset_versions = head + .versions() + .await + .map_err(|e| namespace_internal_error(e.to_string()))?; + let mut versions = Vec::with_capacity(dataset_versions.len()); + for version in dataset_versions { + let dataset = if version.version == head.version().version { + head.clone() + } else { + head.checkout_version(version.version) + .await + .map_err(|e| namespace_internal_error(e.to_string()))? 
+ }; + versions.push( + self.to_table_version(&dataset, &version) + .map_err(|e| namespace_internal_error(e.to_string()))?, + ); + } + if request.descending.unwrap_or(false) { + versions.sort_by(|a, b| b.version.cmp(&a.version)); + } else { + versions.sort_by(|a, b| a.version.cmp(&b.version)); + } + if let Some(limit) = request.limit { + versions.truncate(limit as usize); + } + Ok(ListTableVersionsResponse { + versions, + page_token: None, + }) + } + + async fn describe_table_version( + &self, + request: DescribeTableVersionRequest, + ) -> lance_namespace::Result { + self.ensure_request_table(request.id.as_ref())?; + let version = request + .version + .ok_or_else(|| LanceNamespaceError::invalid_input("table version is required"))?; + let version = u64::try_from(version).map_err(|_| { + LanceNamespaceError::invalid_input("table version must be non-negative") + })?; + let ds = self + .open_version(version) + .await + .map_err(|e| namespace_internal_error(e.to_string()))?; + let version_info = self + .to_table_version( + &ds, + &lance::dataset::Version { + version: ds.version().version, + timestamp: ds.version().timestamp, + metadata: ds.version().metadata, + }, + ) + .map_err(|e| namespace_internal_error(e.to_string()))?; + Ok(DescribeTableVersionResponse::new(version_info)) + } + + async fn create_table_version( + &self, + request: CreateTableVersionRequest, + ) -> lance_namespace::Result { + self.ensure_request_table(request.id.as_ref())?; + let version = u64::try_from(request.version).map_err(|_| { + LanceNamespaceError::invalid_input("table version must be non-negative") + })?; + let naming_scheme = match request.naming_scheme.as_deref() { + Some("V1") => ManifestNamingScheme::V1, + _ => ManifestNamingScheme::V2, + }; + let (object_store, base_path, _) = DatasetBuilder::from_uri(&self.table_uri()) + .build_object_store() + .await + .map_err(|e| namespace_internal_error(e.to_string()))?; + let staging_path = Path::from(request.manifest_path.clone()); + let 
manifest_data = object_store + .inner + .get(&staging_path) + .await + .map_err(|e| namespace_internal_error(e.to_string()))? + .bytes() + .await + .map_err(|e| namespace_internal_error(e.to_string()))?; + let final_path = naming_scheme.manifest_path(&base_path, version); + object_store + .inner + .put_opts( + &final_path, + manifest_data.into(), + PutOptions { + mode: PutMode::Create, + ..Default::default() + }, + ) + .await + .map_err(|e| match e { + ObjectStoreError::AlreadyExists { .. } | ObjectStoreError::Precondition { .. } => { + LanceNamespaceError::namespace_source(Box::new( + NamespaceError::ConcurrentModification { + message: format!( + "table version {} already exists for {}", + version, + self.table_key() + ), + }, + )) + } + other => namespace_internal_error(other.to_string()), + })?; + let meta = object_store + .inner + .head(&final_path) + .await + .map_err(|e| namespace_internal_error(e.to_string()))?; + match object_store.inner.delete(&staging_path).await { + Ok(_) | Err(ObjectStoreError::NotFound { .. }) => {} + Err(e) => return Err(namespace_internal_error(e.to_string())), + } + + let mut response = CreateTableVersionResponse::new(); + response.version = Some(Box::new(TableVersion { + version: version as i64, + manifest_path: final_path.to_string(), + manifest_size: Some(meta.size as i64), + e_tag: meta.e_tag, + timestamp_millis: None, + metadata: request.metadata, + })); + Ok(response) + } +} + +pub(crate) async fn open_table_head_for_write( + root_uri: &str, + table_key: &str, + table_path: &str, + branch: Option<&str>, +) -> Result { + load_table_from_namespace( + staged_table_namespace(root_uri, table_key, table_path, branch), + table_key, + branch, + None, + ) + .await +} diff --git a/crates/omnigraph/src/db/manifest/publisher.rs b/crates/omnigraph/src/db/manifest/publisher.rs new file mode 100644 index 0000000..efdbd1d --- /dev/null +++ b/crates/omnigraph/src/db/manifest/publisher.rs @@ -0,0 +1,236 @@ +//! 
Graph-level batch publish over the namespace `__manifest` table. +//! +//! Lance now owns most of the table/version control plane for Omnigraph: +//! table storage, table-local versioning, namespace lookup, and native table +//! history. This module exists for the remaining graph-specific gap: +//! Omnigraph needs one atomic publish point across multiple tables and the +//! current Rust namespace surface does not expose a branch-aware +//! `BatchCreateTableVersions` path for `DirectoryNamespace`. +//! +//! Until Lance exposes that operation directly, this publisher owns only: +//! - validating batch publish invariants against the current `__manifest` state +//! - atomically inserting immutable `table_version` rows into `__manifest` +//! - returning the refreshed manifest dataset that defines the visible graph +//! +//! This module should disappear once Lance Rust can do branch-aware batch table +//! version publication against a managed namespace manifest. + +use async_trait::async_trait; +use std::collections::HashMap; +use std::sync::Arc; + +use arrow_array::RecordBatchIterator; +use lance::Dataset; +use lance::dataset::{MergeInsertBuilder, WhenMatched, WhenNotMatched}; +use lance_namespace::NamespaceError; +use lance_namespace::models::CreateTableVersionRequest; + +use crate::error::{OmniError, Result}; + +use super::layout::{open_manifest_dataset, version_object_id}; +use super::metadata::parse_namespace_version_request; +use super::state::{ + manifest_rows_batch, manifest_schema, read_manifest_entries, read_manifest_state, +}; +use super::{OBJECT_TYPE_TABLE_VERSION, SubTableEntry, SubTableUpdate}; + +#[async_trait] +pub(super) trait ManifestBatchPublisher: Send + Sync { + async fn publish(&self, updates: &[SubTableUpdate]) -> Result; +} + +pub(super) struct GraphNamespacePublisher { + root_uri: String, + branch: Option, +} + +#[derive(Debug)] +struct PendingVersionRow { + object_id: String, + metadata: Option, + table_key: String, + table_version: Option, + 
table_branch: Option, + row_count: Option, +} + +impl GraphNamespacePublisher { + pub(super) fn new(root_uri: &str, branch: Option<&str>) -> Self { + Self { + root_uri: root_uri.trim_end_matches('/').to_string(), + branch: branch + .filter(|branch| *branch != "main") + .map(ToOwned::to_owned), + } + } + + async fn dataset(&self) -> Result { + open_manifest_dataset(&self.root_uri, self.branch.as_deref()).await + } + + async fn load_publish_state( + &self, + ) -> Result<( + Dataset, + HashMap, + HashMap<(String, u64), SubTableEntry>, + )> { + let dataset = self.dataset().await?; + let current = read_manifest_state(&dataset).await?; + let existing_entries = read_manifest_entries(&dataset).await?; + let known_tables = current + .entries + .iter() + .map(|entry| (entry.table_key.clone(), ())) + .collect(); + let existing_versions = existing_entries + .iter() + .map(|entry| { + ( + (entry.table_key.clone(), entry.table_version), + entry.clone(), + ) + }) + .collect(); + Ok((dataset, known_tables, existing_versions)) + } + + fn build_pending_rows( + requests: &[CreateTableVersionRequest], + known_tables: &HashMap, + existing_versions: &HashMap<(String, u64), SubTableEntry>, + ) -> Result> { + let mut request_versions = HashMap::<(String, u64), ()>::new(); + let mut rows = Vec::with_capacity(requests.len()); + + for request in requests { + let (table_key, table_version, row_count, table_branch, version_metadata) = + parse_namespace_version_request(request) + .map_err(|e| OmniError::Lance(e.to_string()))?; + if !known_tables.contains_key(table_key.as_str()) { + return Err(OmniError::Lance( + NamespaceError::TableNotFound { + message: format!("table {} not found", table_key), + } + .to_string(), + )); + } + if request_versions + .insert((table_key.clone(), table_version), ()) + .is_some() + { + return Err(OmniError::Lance( + NamespaceError::ConcurrentModification { + message: format!( + "table version {} already exists for {}", + table_version, table_key + ), + } + 
.to_string(), + )); + } + if let Some(existing) = existing_versions.get(&(table_key.clone(), table_version)) { + let is_owner_branch_handoff = + existing.row_count == row_count && existing.table_branch != table_branch; + if !is_owner_branch_handoff { + return Err(OmniError::Lance( + NamespaceError::ConcurrentModification { + message: format!( + "table version {} already exists for {}", + table_version, table_key + ), + } + .to_string(), + )); + } + } + + rows.push(PendingVersionRow { + object_id: version_object_id(&table_key, table_version), + metadata: Some(version_metadata.to_json_string()?), + table_key, + table_version: Some(table_version), + table_branch, + row_count: Some(row_count), + }); + } + + Ok(rows) + } + + fn pending_rows_to_batch(rows: Vec) -> Result { + let mut object_ids = Vec::with_capacity(rows.len()); + let mut object_types = Vec::with_capacity(rows.len()); + let mut locations: Vec> = Vec::with_capacity(rows.len()); + let mut metadata = Vec::with_capacity(rows.len()); + let mut table_keys = Vec::with_capacity(rows.len()); + let mut table_versions: Vec> = Vec::with_capacity(rows.len()); + let mut table_branches = Vec::with_capacity(rows.len()); + let mut row_counts: Vec> = Vec::with_capacity(rows.len()); + + for row in rows { + object_ids.push(row.object_id); + object_types.push(OBJECT_TYPE_TABLE_VERSION.to_string()); + locations.push(None); + metadata.push(row.metadata); + table_keys.push(row.table_key); + table_versions.push(row.table_version); + table_branches.push(row.table_branch); + row_counts.push(row.row_count); + } + + manifest_rows_batch( + object_ids, + object_types, + locations, + metadata, + table_keys, + table_versions, + table_branches, + row_counts, + ) + } + + async fn merge_rows(&self, dataset: Dataset, rows: Vec) -> Result { + let batch = Self::pending_rows_to_batch(rows)?; + let reader = RecordBatchIterator::new(vec![Ok(batch)], manifest_schema()); + let dataset = Arc::new(dataset); + let mut merge_builder = 
MergeInsertBuilder::try_new(dataset, vec!["object_id".to_string()]) + .map_err(|e| OmniError::Lance(e.to_string()))?; + merge_builder.when_matched(WhenMatched::UpdateAll); + merge_builder.when_not_matched(WhenNotMatched::InsertAll); + merge_builder.conflict_retries(5); + merge_builder.use_index(false); + let (new_dataset, _stats) = merge_builder + .try_build() + .map_err(|e| OmniError::Lance(e.to_string()))? + .execute_reader(Box::new(reader)) + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + Ok(Arc::try_unwrap(new_dataset).unwrap_or_else(|arc| (*arc).clone())) + } + + pub(super) async fn publish_requests( + &self, + requests: &[CreateTableVersionRequest], + ) -> Result { + if requests.is_empty() { + return self.dataset().await; + } + + let (dataset, known_tables, existing_versions) = self.load_publish_state().await?; + let rows = Self::build_pending_rows(requests, &known_tables, &existing_versions)?; + self.merge_rows(dataset, rows).await + } +} + +#[async_trait] +impl ManifestBatchPublisher for GraphNamespacePublisher { + async fn publish(&self, updates: &[SubTableUpdate]) -> Result { + let requests: Vec = updates + .iter() + .map(SubTableUpdate::to_create_table_version_request) + .collect(); + self.publish_requests(&requests).await + } +} diff --git a/crates/omnigraph/src/db/manifest/repo.rs b/crates/omnigraph/src/db/manifest/repo.rs new file mode 100644 index 0000000..1133be2 --- /dev/null +++ b/crates/omnigraph/src/db/manifest/repo.rs @@ -0,0 +1,133 @@ +use std::collections::HashMap; + +use arrow_array::{RecordBatch, RecordBatchIterator}; +use arrow_schema::SchemaRef; +use lance::Dataset; +use lance::dataset::{WriteMode, WriteParams}; +use lance_file::version::LanceFileVersion; +use omnigraph_compiler::catalog::Catalog; + +use crate::error::{OmniError, Result}; + +use super::TABLE_VERSION_MANAGEMENT_KEY; +use super::layout::{manifest_uri, open_manifest_dataset, type_name_hash}; +use super::metadata::TableVersionMetadata; +use super::state::{ + 
ManifestState, SubTableEntry, entries_to_batch, manifest_schema, read_manifest_state, +}; + +pub(super) async fn init_manifest_repo( + root_uri: &str, + catalog: &Catalog, +) -> Result<(Dataset, ManifestState)> { + let root = root_uri.trim_end_matches('/'); + let (entries, version_metadata) = build_initial_entries(root, catalog).await?; + + let manifest_batch = entries_to_batch(&entries, &version_metadata)?; + let schema = manifest_schema(); + let reader = RecordBatchIterator::new(vec![Ok(manifest_batch)], schema); + let params = WriteParams { + mode: WriteMode::Create, + enable_stable_row_ids: true, + data_storage_version: Some(LanceFileVersion::V2_2), + ..Default::default() + }; + let manifest_path = manifest_uri(root); + let mut dataset = Dataset::write(reader, &manifest_path, Some(params)) + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + dataset + .update_config([(TABLE_VERSION_MANAGEMENT_KEY, Some("true"))]) + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + + let known_state = read_manifest_state(&dataset).await?; + Ok((dataset, known_state)) +} + +pub(super) async fn open_manifest_repo( + root_uri: &str, + branch: Option<&str>, +) -> Result<(Dataset, ManifestState)> { + let dataset = open_manifest_dataset(root_uri.trim_end_matches('/'), branch).await?; + let known_state = read_manifest_state(&dataset).await?; + Ok((dataset, known_state)) +} + +pub(super) async fn snapshot_state_at( + root_uri: &str, + branch: Option<&str>, + version: u64, +) -> Result { + let dataset = open_manifest_dataset(root_uri.trim_end_matches('/'), branch).await?; + let dataset = dataset + .checkout_version(version) + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + read_manifest_state(&dataset).await +} + +async fn build_initial_entries( + root_uri: &str, + catalog: &Catalog, +) -> Result<(Vec, HashMap)> { + let mut entries = Vec::new(); + let mut version_metadata = HashMap::new(); + + for (name, node_type) in &catalog.node_types { + let hash = 
type_name_hash(name); + let table_path = format!("nodes/{}", hash); + let full_path = format!("{}/{}", root_uri, table_path); + + let ds = create_empty_dataset(&full_path, &node_type.arrow_schema).await?; + let table_key = format!("node:{}", name); + let metadata = TableVersionMetadata::from_dataset(root_uri, &table_path, &ds)?; + + entries.push(SubTableEntry { + table_key: table_key.clone(), + table_path: table_path.clone(), + table_version: ds.version().version, + table_branch: None, + row_count: 0, + version_metadata: metadata.clone(), + }); + version_metadata.insert(table_key, metadata.to_json_string()?); + } + + for (name, edge_type) in &catalog.edge_types { + let hash = type_name_hash(name); + let table_path = format!("edges/{}", hash); + let full_path = format!("{}/{}", root_uri, table_path); + + let ds = create_empty_dataset(&full_path, &edge_type.arrow_schema).await?; + let table_key = format!("edge:{}", name); + let metadata = TableVersionMetadata::from_dataset(root_uri, &table_path, &ds)?; + + entries.push(SubTableEntry { + table_key: table_key.clone(), + table_path: table_path.clone(), + table_version: ds.version().version, + table_branch: None, + row_count: 0, + version_metadata: metadata.clone(), + }); + version_metadata.insert(table_key, metadata.to_json_string()?); + } + + Ok((entries, version_metadata)) +} + +async fn create_empty_dataset(uri: &str, schema: &SchemaRef) -> Result { + let batch = RecordBatch::new_empty(schema.clone()); + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema.clone()); + let params = WriteParams { + mode: WriteMode::Create, + enable_stable_row_ids: true, + data_storage_version: Some(LanceFileVersion::V2_2), + allow_external_blob_outside_bases: true, + ..Default::default() + }; + Dataset::write(reader, uri, Some(params)) + .await + .map_err(|e| OmniError::Lance(e.to_string())) +} diff --git a/crates/omnigraph/src/db/manifest/state.rs b/crates/omnigraph/src/db/manifest/state.rs new file mode 100644 index 
0000000..418615b --- /dev/null +++ b/crates/omnigraph/src/db/manifest/state.rs @@ -0,0 +1,274 @@ +use std::collections::HashMap; +use std::sync::Arc; + +use arrow_array::{Array, RecordBatch, StringArray, UInt64Array, new_null_array}; +use arrow_schema::{DataType, Field, Schema, SchemaRef}; +use futures::TryStreamExt; +use lance::Dataset; + +use crate::error::{OmniError, Result}; + +use super::layout::version_object_id; +use super::metadata::TableVersionMetadata; +use super::{OBJECT_TYPE_TABLE, OBJECT_TYPE_TABLE_VERSION}; + +#[derive(Debug, Clone)] +pub struct SubTableEntry { + pub table_key: String, + pub table_path: String, + pub table_version: u64, + pub table_branch: Option, + pub row_count: u64, + pub(crate) version_metadata: TableVersionMetadata, +} + +#[derive(Debug, Clone)] +pub(super) struct ManifestState { + pub(super) version: u64, + pub(super) entries: Vec, +} + +pub(super) fn manifest_schema() -> SchemaRef { + Arc::new(Schema::new(vec![ + Field::new("object_id", DataType::Utf8, false), + Field::new("object_type", DataType::Utf8, false), + Field::new("location", DataType::Utf8, true), + Field::new("metadata", DataType::Utf8, true), + Field::new( + "base_objects", + DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))), + true, + ), + Field::new("table_key", DataType::Utf8, false), + Field::new("table_version", DataType::UInt64, true), + Field::new("table_branch", DataType::Utf8, true), + Field::new("row_count", DataType::UInt64, true), + ])) +} + +pub(super) async fn read_manifest_state(dataset: &Dataset) -> Result { + let version = dataset.version().version; + let entries = read_manifest_entries(dataset).await?; + let mut latest_versions = HashMap::::new(); + + for entry in entries { + match latest_versions.get(&entry.table_key) { + Some(existing) if existing.table_version >= entry.table_version => {} + _ => { + latest_versions.insert(entry.table_key.clone(), entry); + } + } + } + + let mut entries: Vec = 
latest_versions.into_values().collect(); + entries.sort_by(|a, b| a.table_key.cmp(&b.table_key)); + + Ok(ManifestState { version, entries }) +} + +pub(super) async fn read_manifest_entries(dataset: &Dataset) -> Result> { + let batches: Vec = dataset + .scan() + .try_into_stream() + .await + .map_err(|e| OmniError::Lance(e.to_string()))? + .try_collect() + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + + let mut table_locations = HashMap::new(); + let mut version_entries = Vec::new(); + + for batch in &batches { + let object_types = string_column(batch, "object_type")?; + let locations = string_column(batch, "location")?; + let metadata = string_column(batch, "metadata")?; + let table_keys = string_column(batch, "table_key")?; + let versions = u64_column(batch, "table_version")?; + let branches = string_column(batch, "table_branch")?; + let row_counts = u64_column(batch, "row_count")?; + + for row in 0..batch.num_rows() { + let table_key = table_keys.value(row).to_string(); + match object_types.value(row) { + OBJECT_TYPE_TABLE => { + if locations.is_null(row) { + return Err(OmniError::manifest_internal(format!( + "manifest table row missing location for {}", + table_key + ))); + } + table_locations.insert(table_key, locations.value(row).to_string()); + } + OBJECT_TYPE_TABLE_VERSION => { + let table_version = required_u64(versions, row, "table_version")?; + let row_count = required_u64(row_counts, row, "row_count")?; + if metadata.is_null(row) { + return Err(OmniError::manifest_internal(format!( + "manifest table_version row missing metadata for {}", + table_key + ))); + } + let table_branch = if branches.is_null(row) { + None + } else { + Some(branches.value(row).to_string()) + }; + version_entries.push(SubTableEntry { + table_key: table_key.clone(), + table_path: String::new(), + table_version, + table_branch, + row_count, + version_metadata: TableVersionMetadata::from_json_str(metadata.value(row))?, + }); + } + _ => {} + } + } + } + + let mut entries = 
version_entries + .into_iter() + .map(|mut entry| { + entry.table_path = table_locations + .get(&entry.table_key) + .cloned() + .ok_or_else(|| { + OmniError::manifest_internal(format!( + "manifest missing table row for {}", + entry.table_key + )) + })?; + Ok(entry) + }) + .collect::>>()?; + entries.sort_by(|a, b| { + a.table_key + .cmp(&b.table_key) + .then(a.table_version.cmp(&b.table_version)) + }); + + Ok(entries) +} + +pub(super) fn entries_to_batch( + entries: &[SubTableEntry], + version_metadata: &HashMap, +) -> Result { + let mut object_ids = Vec::with_capacity(entries.len() * 2); + let mut object_types = Vec::with_capacity(entries.len() * 2); + let mut locations = Vec::with_capacity(entries.len() * 2); + let mut metadata = Vec::with_capacity(entries.len() * 2); + let mut table_keys = Vec::with_capacity(entries.len() * 2); + let mut table_versions = Vec::with_capacity(entries.len() * 2); + let mut table_branches = Vec::with_capacity(entries.len() * 2); + let mut row_counts = Vec::with_capacity(entries.len() * 2); + + for entry in entries { + object_ids.push(entry.table_key.clone()); + object_types.push(OBJECT_TYPE_TABLE.to_string()); + locations.push(Some(entry.table_path.clone())); + metadata.push(None); + table_keys.push(entry.table_key.clone()); + table_versions.push(None); + table_branches.push(None); + row_counts.push(None); + + object_ids.push(version_object_id(&entry.table_key, entry.table_version)); + object_types.push(OBJECT_TYPE_TABLE_VERSION.to_string()); + locations.push(None); + metadata.push(Some( + version_metadata + .get(&entry.table_key) + .cloned() + .ok_or_else(|| { + OmniError::manifest_internal(format!( + "missing initial version metadata for {}", + entry.table_key + )) + })?, + )); + table_keys.push(entry.table_key.clone()); + table_versions.push(Some(entry.table_version)); + table_branches.push(entry.table_branch.clone()); + row_counts.push(Some(entry.row_count)); + } + + manifest_rows_batch( + object_ids, + object_types, + locations, 
+ metadata, + table_keys, + table_versions, + table_branches, + row_counts, + ) +} + +pub(super) fn manifest_rows_batch( + object_ids: Vec, + object_types: Vec, + locations: Vec>, + metadata: Vec>, + table_keys: Vec, + table_versions: Vec>, + table_branches: Vec>, + row_counts: Vec>, +) -> Result { + let len = object_ids.len(); + RecordBatch::try_new( + manifest_schema(), + vec![ + Arc::new(StringArray::from(object_ids)), + Arc::new(StringArray::from(object_types)), + Arc::new(StringArray::from(locations)), + Arc::new(StringArray::from(metadata)), + new_null_array( + &DataType::List(Arc::new(Field::new("item", DataType::Utf8, true))), + len, + ), + Arc::new(StringArray::from(table_keys)), + Arc::new(UInt64Array::from(table_versions)), + Arc::new(StringArray::from(table_branches)), + Arc::new(UInt64Array::from(row_counts)), + ], + ) + .map_err(|e| OmniError::Lance(e.to_string())) +} + +pub(super) fn string_column<'a>(batch: &'a RecordBatch, name: &str) -> Result<&'a StringArray> { + batch + .column_by_name(name) + .ok_or_else(|| { + OmniError::manifest_internal(format!("manifest batch missing '{name}' column")) + })? + .as_any() + .downcast_ref::() + .ok_or_else(|| { + OmniError::manifest_internal(format!("manifest column '{name}' is not Utf8")) + }) +} + +fn u64_column<'a>(batch: &'a RecordBatch, name: &str) -> Result<&'a UInt64Array> { + batch + .column_by_name(name) + .ok_or_else(|| { + OmniError::manifest_internal(format!("manifest batch missing '{name}' column")) + })? 
+ .as_any() + .downcast_ref::() + .ok_or_else(|| { + OmniError::manifest_internal(format!("manifest column '{name}' is not UInt64")) + }) +} + +fn required_u64(column: &UInt64Array, row: usize, name: &str) -> Result { + if column.is_null(row) { + return Err(OmniError::manifest_internal(format!( + "manifest column '{name}' is null at row {row}" + ))); + } + Ok(column.value(row)) +} diff --git a/crates/omnigraph/src/db/manifest/tests.rs b/crates/omnigraph/src/db/manifest/tests.rs new file mode 100644 index 0000000..c7eee82 --- /dev/null +++ b/crates/omnigraph/src/db/manifest/tests.rs @@ -0,0 +1,1064 @@ +use std::sync::Arc; + +use arrow_array::{Int32Array, RecordBatch, RecordBatchIterator, StringArray, UInt64Array}; +use arrow_schema::{DataType, Field, Schema}; +use async_trait::async_trait; +use lance::dataset::builder::DatasetBuilder; +use lance_namespace::LanceNamespace; +use lance_namespace::models::{ + DescribeTableRequest, DescribeTableVersionRequest, ListTableVersionsRequest, +}; +use lance_namespace_impls::DirectoryNamespaceBuilder; +use tokio::sync::Mutex; + +use super::publisher::ManifestBatchPublisher; +use super::*; +use omnigraph_compiler::catalog::build_catalog; +use omnigraph_compiler::schema::parser::parse_schema; + +fn test_schema_source() -> &'static str { + r#" +node Person { + name: String + age: I32? +} +node Company { + name: String +} +edge Knows: Person -> Person { + since: Date? +} +edge WorksAt: Person -> Company { + title: String? 
+} +"# +} + +fn build_test_catalog() -> Catalog { + let schema = parse_schema(test_schema_source()).unwrap(); + build_catalog(&schema).unwrap() +} + +#[tokio::test] +async fn test_init_creates_manifest_and_sub_tables() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let catalog = build_test_catalog(); + + let mc = ManifestCoordinator::init(uri, &catalog).await.unwrap(); + let snap = mc.snapshot(); + + assert!(snap.entry("node:Person").is_some()); + assert!(snap.entry("node:Company").is_some()); + assert!(snap.entry("edge:Knows").is_some()); + assert!(snap.entry("edge:WorksAt").is_some()); + + for key in &["node:Person", "node:Company", "edge:Knows", "edge:WorksAt"] { + let entry = snap.entry(key).unwrap(); + assert_eq!(entry.table_version, 1); + assert_eq!(entry.row_count, 0); + assert!(entry.table_branch.is_none()); + } +} + +#[tokio::test] +async fn test_open_reads_existing_manifest() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let catalog = build_test_catalog(); + + ManifestCoordinator::init(uri, &catalog).await.unwrap(); + + let mc = ManifestCoordinator::open(uri).await.unwrap(); + let snap = mc.snapshot(); + assert!(snap.entry("node:Person").is_some()); + assert!(snap.entry("edge:Knows").is_some()); +} + +#[tokio::test] +async fn test_commit_advances_version() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let catalog = build_test_catalog(); + + let mut mc = ManifestCoordinator::init(uri, &catalog).await.unwrap(); + let v1 = mc.version(); + let snap = mc.snapshot(); + let person_entry = snap.entry("node:Person").unwrap(); + let mut person_ds = Dataset::open(&format!("{}/{}", uri, person_entry.table_path)) + .await + .unwrap(); + let person_schema = Arc::new(person_ds.schema().into()); + let person_batch = RecordBatch::try_new( + Arc::clone(&person_schema), + vec![ + Arc::new(StringArray::from(vec!["person-1"])), + 
Arc::new(StringArray::from(vec!["Alice"])), + Arc::new(Int32Array::from(vec![Some(30)])), + ], + ) + .unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(person_batch)], person_schema); + person_ds.append(reader, None).await.unwrap(); + let person_version = person_ds.version().version; + + let new_version = mc + .commit(&[SubTableUpdate { + table_key: "node:Person".to_string(), + table_version: person_version, + table_branch: None, + row_count: 1, + version_metadata: table_version_metadata_for_state( + uri, + &person_entry.table_path, + None, + person_version, + ) + .await + .unwrap(), + }]) + .await + .unwrap(); + + assert!(new_version > v1); + + let snap = mc.snapshot(); + let person = snap.entry("node:Person").unwrap(); + assert_eq!(person.table_version, person_version); + assert_eq!(person.row_count, 1); + + let company = snap.entry("node:Company").unwrap(); + assert_eq!(company.table_version, 1); + assert_eq!(company.row_count, 0); +} + +#[tokio::test] +async fn test_snapshot_open_sub_table() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let catalog = build_test_catalog(); + + let mc = ManifestCoordinator::init(uri, &catalog).await.unwrap(); + let snap = mc.snapshot(); + let person_ds = snap.open("node:Person").await.unwrap(); + + assert_eq!(person_ds.schema().fields.len(), 3); + assert_eq!(person_ds.count_rows(None).await.unwrap(), 0); +} + +#[tokio::test] +async fn test_version_is_manifest_version() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let catalog = build_test_catalog(); + + let mc = ManifestCoordinator::init(uri, &catalog).await.unwrap(); + let snap = mc.snapshot(); + assert_eq!(mc.version(), snap.version()); +} + +#[tokio::test] +async fn test_list_branches_only_returns_main_once() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let catalog = build_test_catalog(); + + let mc = ManifestCoordinator::init(uri, 
&catalog).await.unwrap(); + let branches = mc.list_branches().await.unwrap(); + assert_eq!( + branches + .iter() + .filter(|branch| branch.as_str() == "main") + .count(), + 1 + ); +} + +#[tokio::test] +async fn test_branch_namespace_lists_and_describes_versions() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let catalog = build_test_catalog(); + + let mut mc = ManifestCoordinator::init(uri, &catalog).await.unwrap(); + let snap = mc.snapshot(); + let person_entry = snap.entry("node:Person").unwrap().clone(); + let mut person_ds = Dataset::open(&format!("{}/{}", uri, person_entry.table_path)) + .await + .unwrap(); + let person_schema = Arc::new(person_ds.schema().into()); + let person_batch = RecordBatch::try_new( + Arc::clone(&person_schema), + vec![ + Arc::new(StringArray::from(vec!["person-1"])), + Arc::new(StringArray::from(vec!["Alice"])), + Arc::new(Int32Array::from(vec![Some(30)])), + ], + ) + .unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(person_batch)], person_schema); + person_ds.append(reader, None).await.unwrap(); + let person_version = person_ds.version().version; + let version_metadata = + table_version_metadata_for_state(uri, &person_entry.table_path, None, person_version) + .await + .unwrap(); + + let namespace = branch_manifest_namespace(uri, None); + let request = + version_metadata.to_create_table_version_request("node:Person", person_version, 1, None); + namespace.create_table_version(request).await.unwrap(); + mc.refresh().await.unwrap(); + + let versions = namespace + .list_table_versions(ListTableVersionsRequest { + id: Some(vec!["node:Person".to_string()]), + descending: Some(true), + ..Default::default() + }) + .await + .unwrap(); + assert_eq!(versions.versions.len(), 2); + assert_eq!(versions.versions[0].version as u64, person_version); + assert_eq!(versions.versions[1].version, 1); + + let described = namespace + .describe_table_version(DescribeTableVersionRequest { + id: 
Some(vec!["node:Person".to_string()]), + version: Some(person_version as i64), + ..Default::default() + }) + .await + .unwrap(); + assert_eq!(described.version.version as u64, person_version); + assert_eq!( + mc.snapshot().entry("node:Person").unwrap().table_version, + person_version + ); + assert_eq!(mc.snapshot().entry("node:Person").unwrap().row_count, 1); +} + +#[tokio::test] +async fn test_directory_namespace_direct_publish_cannot_replace_native_omnigraph_write_path() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let catalog = build_test_catalog(); + + let mut mc = ManifestCoordinator::init(uri, &catalog).await.unwrap(); + let snap = mc.snapshot(); + let person_entry = snap.entry("node:Person").unwrap().clone(); + let mut person_ds = Dataset::open(&format!("{}/{}", uri, person_entry.table_path)) + .await + .unwrap(); + let person_schema = Arc::new(person_ds.schema().into()); + let person_batch = RecordBatch::try_new( + Arc::clone(&person_schema), + vec![ + Arc::new(StringArray::from(vec!["person-1"])), + Arc::new(StringArray::from(vec!["Alice"])), + Arc::new(Int32Array::from(vec![Some(30)])), + ], + ) + .unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(person_batch)], person_schema); + person_ds.append(reader, None).await.unwrap(); + let person_version = person_ds.version().version; + let version_metadata = + table_version_metadata_for_state(uri, &person_entry.table_path, None, person_version) + .await + .unwrap(); + + let namespace = DirectoryNamespaceBuilder::new(uri) + .manifest_enabled(true) + .dir_listing_enabled(false) + .table_version_tracking_enabled(true) + .table_version_storage_enabled(true) + .inline_optimization_enabled(false) + .build() + .await + .unwrap(); + + let versions = namespace + .list_table_versions(ListTableVersionsRequest { + id: Some(vec!["node:Person".to_string()]), + descending: Some(true), + ..Default::default() + }) + .await + .unwrap(); + assert_eq!( + 
versions.versions[0].version as u64, + person_entry.table_version + ); + + let err = namespace + .describe_table_version(DescribeTableVersionRequest { + id: Some(vec!["node:Person".to_string()]), + version: Some(person_version as i64), + ..Default::default() + }) + .await + .unwrap_err(); + assert!(err.to_string().contains("not found")); + + let err = namespace + .create_table_version(version_metadata.to_create_table_version_request( + "node:Person", + person_version, + 1, + None, + )) + .await + .unwrap_err(); + assert!(err.to_string().contains("already exists")); + + mc.refresh().await.unwrap(); + assert_eq!( + mc.snapshot().entry("node:Person").unwrap().table_version, + person_entry.table_version + ); + assert_eq!(mc.snapshot().entry("node:Person").unwrap().row_count, 0); +} + +#[tokio::test] +async fn test_snapshot_at_reads_branch_pinned_historical_state() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let catalog = build_test_catalog(); + + let mut mc = ManifestCoordinator::init(uri, &catalog).await.unwrap(); + let main_manifest_version = mc.version(); + mc.create_branch("feature").await.unwrap(); + + let snap = mc.snapshot(); + let person_entry = snap.entry("node:Person").unwrap().clone(); + let mut person_ds = Dataset::open(&format!("{}/{}", uri, person_entry.table_path)) + .await + .unwrap(); + person_ds + .create_branch("feature", person_entry.table_version, None) + .await + .unwrap(); + let mut feature_ds = person_ds.checkout_branch("feature").await.unwrap(); + let person_schema = Arc::new(feature_ds.schema().into()); + let person_batch = RecordBatch::try_new( + Arc::clone(&person_schema), + vec![ + Arc::new(StringArray::from(vec!["person-1"])), + Arc::new(StringArray::from(vec!["Alice"])), + Arc::new(Int32Array::from(vec![Some(30)])), + ], + ) + .unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(person_batch)], person_schema); + feature_ds.append(reader, None).await.unwrap(); + let feature_version = 
feature_ds.version().version; + let feature_metadata = table_version_metadata_for_state( + uri, + &person_entry.table_path, + Some("feature"), + feature_version, + ) + .await + .unwrap(); + + let namespace = branch_manifest_namespace(uri, Some("feature")); + let request = feature_metadata.to_create_table_version_request( + "node:Person", + feature_version, + 1, + Some("feature"), + ); + namespace.create_table_version(request).await.unwrap(); + + let feature_mc = ManifestCoordinator::open_at_branch(uri, "feature") + .await + .unwrap(); + let feature_snapshot = + ManifestCoordinator::snapshot_at(uri, Some("feature"), feature_mc.version()) + .await + .unwrap(); + let feature_entry = feature_snapshot.entry("node:Person").unwrap(); + assert_eq!(feature_entry.table_version, feature_version); + assert_eq!(feature_entry.table_branch.as_deref(), Some("feature")); + assert_eq!( + feature_snapshot + .open("node:Person") + .await + .unwrap() + .count_rows(None) + .await + .unwrap(), + 1 + ); + + let main_snapshot = ManifestCoordinator::snapshot_at(uri, None, main_manifest_version) + .await + .unwrap(); + let main_entry = main_snapshot.entry("node:Person").unwrap(); + assert_eq!(main_entry.table_version, person_entry.table_version); + assert_eq!(main_entry.table_branch, None); + assert_eq!( + main_snapshot + .open("node:Person") + .await + .unwrap() + .count_rows(None) + .await + .unwrap(), + 0 + ); +} + +#[tokio::test] +async fn test_branch_manifest_namespace_uses_entry_owner_branch_for_latest_table_reads() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let catalog = build_test_catalog(); + + let mut mc = ManifestCoordinator::init(uri, &catalog).await.unwrap(); + mc.create_branch("feature").await.unwrap(); + + let snap = mc.snapshot(); + let person_entry = snap.entry("node:Person").unwrap().clone(); + let company_entry = snap.entry("node:Company").unwrap().clone(); + + let mut person_ds = Dataset::open(&format!("{}/{}", uri, 
person_entry.table_path)) + .await + .unwrap(); + person_ds + .create_branch("feature", person_entry.table_version, None) + .await + .unwrap(); + let mut feature_person_ds = person_ds.checkout_branch("feature").await.unwrap(); + let person_schema = Arc::new(feature_person_ds.schema().into()); + let person_batch = RecordBatch::try_new( + Arc::clone(&person_schema), + vec![ + Arc::new(StringArray::from(vec!["person-1"])), + Arc::new(StringArray::from(vec!["Alice"])), + Arc::new(Int32Array::from(vec![Some(30)])), + ], + ) + .unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(person_batch)], person_schema); + feature_person_ds.append(reader, None).await.unwrap(); + let feature_person_version = feature_person_ds.version().version; + let feature_person_metadata = table_version_metadata_for_state( + uri, + &person_entry.table_path, + Some("feature"), + feature_person_version, + ) + .await + .unwrap(); + + branch_manifest_namespace(uri, Some("feature")) + .create_table_version(feature_person_metadata.to_create_table_version_request( + "node:Person", + feature_person_version, + 1, + Some("feature"), + )) + .await + .unwrap(); + + let feature_namespace = branch_manifest_namespace(uri, Some("feature")); + + let inherited_company = feature_namespace + .describe_table(DescribeTableRequest { + id: Some(vec!["node:Company".to_string()]), + with_table_uri: Some(true), + ..Default::default() + }) + .await + .unwrap(); + let inherited_company_uri = inherited_company.table_uri.as_deref().unwrap(); + assert!( + !inherited_company_uri.contains("/tree/feature"), + "inherited table should resolve to its owning branch, got {inherited_company_uri}" + ); + + let branch_owned_person = feature_namespace + .describe_table(DescribeTableRequest { + id: Some(vec!["node:Person".to_string()]), + with_table_uri: Some(true), + ..Default::default() + }) + .await + .unwrap(); + let branch_owned_person_uri = branch_owned_person.table_uri.as_deref().unwrap(); + assert!( + 
branch_owned_person_uri.contains("/tree/feature"), + "branch-owned table should resolve to feature branch, got {branch_owned_person_uri}" + ); + + let inherited_company_ds = DatasetBuilder::from_namespace( + Arc::clone(&feature_namespace), + vec!["node:Company".to_string()], + ) + .await + .unwrap() + .with_branch("feature", None) + .load() + .await + .unwrap(); + assert_eq!(inherited_company_ds.count_rows(None).await.unwrap(), 0); + + let branch_owned_person_ds = DatasetBuilder::from_namespace( + Arc::clone(&feature_namespace), + vec!["node:Person".to_string()], + ) + .await + .unwrap() + .with_branch("feature", None) + .load() + .await + .unwrap(); + assert_eq!(branch_owned_person_ds.count_rows(None).await.unwrap(), 1); + assert_eq!( + company_entry.table_branch, None, + "sanity check: company table stays inherited on feature" + ); +} + +#[tokio::test] +async fn test_refresh_observes_external_publish_without_mutating_existing_snapshot() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let catalog = build_test_catalog(); + + let mut reader = ManifestCoordinator::init(uri, &catalog).await.unwrap(); + let frozen_snapshot = reader.snapshot(); + let person_entry = frozen_snapshot.entry("node:Person").unwrap().clone(); + let manifest_version = reader.version(); + + let mut person_ds = Dataset::open(&format!("{}/{}", uri, person_entry.table_path)) + .await + .unwrap(); + let person_schema = Arc::new(person_ds.schema().into()); + let person_batch = RecordBatch::try_new( + Arc::clone(&person_schema), + vec![ + Arc::new(StringArray::from(vec!["person-1"])), + Arc::new(StringArray::from(vec!["Alice"])), + Arc::new(Int32Array::from(vec![Some(30)])), + ], + ) + .unwrap(); + let reader_batch = RecordBatchIterator::new(vec![Ok(person_batch)], person_schema); + person_ds.append(reader_batch, None).await.unwrap(); + let person_version = person_ds.version().version; + let version_metadata = + table_version_metadata_for_state(uri, 
&person_entry.table_path, None, person_version) + .await + .unwrap(); + + branch_manifest_namespace(uri, None) + .create_table_version(version_metadata.to_create_table_version_request( + "node:Person", + person_version, + 1, + None, + )) + .await + .unwrap(); + + assert_eq!(reader.version(), manifest_version); + assert_eq!( + frozen_snapshot.entry("node:Person").unwrap().table_version, + person_entry.table_version + ); + assert_eq!( + frozen_snapshot + .open("node:Person") + .await + .unwrap() + .count_rows(None) + .await + .unwrap(), + 0 + ); + + reader.refresh().await.unwrap(); + assert!(reader.version() > manifest_version); + assert_eq!( + reader + .snapshot() + .entry("node:Person") + .unwrap() + .table_version, + person_version + ); + assert_eq!(reader.snapshot().entry("node:Person").unwrap().row_count, 1); +} + +#[tokio::test] +async fn test_batch_create_table_versions_is_atomic_on_conflict() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let catalog = build_test_catalog(); + + let mc = ManifestCoordinator::init(uri, &catalog).await.unwrap(); + let manifest_version = mc.version(); + let snap = mc.snapshot(); + let person_entry = snap.entry("node:Person").unwrap().clone(); + let company_entry = snap.entry("node:Company").unwrap().clone(); + + let mut person_ds = Dataset::open(&format!("{}/{}", uri, person_entry.table_path)) + .await + .unwrap(); + let person_schema = Arc::new(person_ds.schema().into()); + let person_batch = RecordBatch::try_new( + Arc::clone(&person_schema), + vec![ + Arc::new(StringArray::from(vec!["person-1"])), + Arc::new(StringArray::from(vec!["Alice"])), + Arc::new(Int32Array::from(vec![Some(30)])), + ], + ) + .unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(person_batch)], person_schema); + person_ds.append(reader, None).await.unwrap(); + let person_version = person_ds.version().version; + + let person_version_metadata = + table_version_metadata_for_state(uri, &person_entry.table_path, 
None, person_version) + .await + .unwrap(); + let company_version_metadata = table_version_metadata_for_state( + uri, + &company_entry.table_path, + None, + company_entry.table_version, + ) + .await + .unwrap(); + + let person_request = person_version_metadata.to_create_table_version_request( + "node:Person", + person_version, + 1, + None, + ); + + let conflicting_company_request = company_version_metadata.to_create_table_version_request( + "node:Company", + company_entry.table_version, + 0, + None, + ); + + let err = GraphNamespacePublisher::new(uri, None) + .publish_requests(&[person_request, conflicting_company_request]) + .await + .unwrap_err(); + assert!(err.to_string().contains("already exists")); + + let reopened = ManifestCoordinator::open(uri).await.unwrap(); + assert_eq!(reopened.version(), manifest_version); + assert_eq!( + reopened + .snapshot() + .entry("node:Person") + .unwrap() + .table_version, + person_entry.table_version + ); + assert_eq!( + reopened.snapshot().entry("node:Person").unwrap().row_count, + 0 + ); +} + +#[tokio::test] +async fn test_batch_create_table_versions_rejects_duplicate_requests_without_advancing_manifest() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let catalog = build_test_catalog(); + + let mc = ManifestCoordinator::init(uri, &catalog).await.unwrap(); + let manifest_version = mc.version(); + let snap = mc.snapshot(); + let person_entry = snap.entry("node:Person").unwrap().clone(); + + let mut person_ds = Dataset::open(&format!("{}/{}", uri, person_entry.table_path)) + .await + .unwrap(); + let person_schema = Arc::new(person_ds.schema().into()); + let person_batch = RecordBatch::try_new( + Arc::clone(&person_schema), + vec![ + Arc::new(StringArray::from(vec!["person-1"])), + Arc::new(StringArray::from(vec!["Alice"])), + Arc::new(Int32Array::from(vec![Some(30)])), + ], + ) + .unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(person_batch)], person_schema); + 
person_ds.append(reader, None).await.unwrap(); + let person_version = person_ds.version().version; + let version_metadata = + table_version_metadata_for_state(uri, &person_entry.table_path, None, person_version) + .await + .unwrap(); + let request = + version_metadata.to_create_table_version_request("node:Person", person_version, 1, None); + + let err = GraphNamespacePublisher::new(uri, None) + .publish_requests(&[request.clone(), request]) + .await + .unwrap_err(); + assert!(err.to_string().contains("already exists")); + + let reopened = ManifestCoordinator::open(uri).await.unwrap(); + assert_eq!(reopened.version(), manifest_version); + assert_eq!( + reopened + .snapshot() + .entry("node:Person") + .unwrap() + .table_version, + person_entry.table_version + ); + assert_eq!( + reopened.snapshot().entry("node:Person").unwrap().row_count, + 0 + ); +} + +#[tokio::test] +async fn test_batch_create_table_versions_allows_owner_branch_handoff_at_same_version() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let catalog = build_test_catalog(); + + let mut main_mc = ManifestCoordinator::init(uri, &catalog).await.unwrap(); + main_mc.create_branch("feature").await.unwrap(); + + let snap = main_mc.snapshot(); + let person_entry = snap.entry("node:Person").unwrap().clone(); + let mut person_ds = Dataset::open(&format!("{}/{}", uri, person_entry.table_path)) + .await + .unwrap(); + person_ds + .create_branch("feature", person_entry.table_version, None) + .await + .unwrap(); + let mut feature_ds = person_ds.checkout_branch("feature").await.unwrap(); + let person_schema = Arc::new(feature_ds.schema().into()); + let person_batch = RecordBatch::try_new( + Arc::clone(&person_schema), + vec![ + Arc::new(StringArray::from(vec!["person-1"])), + Arc::new(StringArray::from(vec!["Alice"])), + Arc::new(Int32Array::from(vec![Some(30)])), + ], + ) + .unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(person_batch)], person_schema); + 
feature_ds.append(reader, None).await.unwrap(); + let feature_version = feature_ds.version().version; + let feature_metadata = table_version_metadata_for_state( + uri, + &person_entry.table_path, + Some("feature"), + feature_version, + ) + .await + .unwrap(); + + branch_manifest_namespace(uri, Some("feature")) + .create_table_version(feature_metadata.to_create_table_version_request( + "node:Person", + feature_version, + 1, + Some("feature"), + )) + .await + .unwrap(); + + let mut feature_mc = ManifestCoordinator::open_at_branch(uri, "feature") + .await + .unwrap(); + feature_mc.create_branch("experiment").await.unwrap(); + feature_ds + .create_branch("experiment", feature_version, None) + .await + .unwrap(); + let experiment_metadata = table_version_metadata_for_state( + uri, + &person_entry.table_path, + Some("experiment"), + feature_version, + ) + .await + .unwrap(); + + GraphNamespacePublisher::new(uri, Some("experiment")) + .publish_requests(&[experiment_metadata.to_create_table_version_request( + "node:Person", + feature_version, + 1, + Some("experiment"), + )]) + .await + .unwrap(); + + let experiment_mc = ManifestCoordinator::open_at_branch(uri, "experiment") + .await + .unwrap(); + let experiment_snapshot = experiment_mc.snapshot(); + let experiment_entry = experiment_snapshot.entry("node:Person").unwrap(); + assert_eq!(experiment_entry.table_version, feature_version); + assert_eq!(experiment_entry.table_branch.as_deref(), Some("experiment")); +} + +#[tokio::test] +async fn test_staged_namespace_lists_native_table_versions_before_publish() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let catalog = build_test_catalog(); + + let mc = ManifestCoordinator::init(uri, &catalog).await.unwrap(); + let snap = mc.snapshot(); + let person_entry = snap.entry("node:Person").unwrap().clone(); + + let mut person_ds = Dataset::open(&format!("{}/{}", uri, person_entry.table_path)) + .await + .unwrap(); + let person_schema = 
Arc::new(person_ds.schema().into()); + let person_batch = RecordBatch::try_new( + Arc::clone(&person_schema), + vec![ + Arc::new(StringArray::from(vec!["person-1"])), + Arc::new(StringArray::from(vec!["Alice"])), + Arc::new(Int32Array::from(vec![Some(30)])), + ], + ) + .unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(person_batch)], person_schema); + person_ds.append(reader, None).await.unwrap(); + let person_version = person_ds.version().version; + + let namespace = staged_table_namespace(uri, "node:Person", &person_entry.table_path, None); + let listed = namespace + .list_table_versions(ListTableVersionsRequest { + id: Some(vec!["node:Person".to_string()]), + descending: Some(false), + ..Default::default() + }) + .await + .unwrap(); + let listed_versions: Vec = listed + .versions + .into_iter() + .map(|version| version.version as u64) + .collect(); + assert_eq!(listed_versions, vec![1, person_version]); + + let described = namespace + .describe_table_version(DescribeTableVersionRequest { + id: Some(vec!["node:Person".to_string()]), + version: Some(person_version as i64), + ..Default::default() + }) + .await + .unwrap(); + assert_eq!(described.version.version as u64, person_version); +} + +#[derive(Clone)] +struct RecordingPublisher { + inner: Arc, + requests: Arc>>, +} + +impl RecordingPublisher { + fn new(root_uri: &str, branch: Option<&str>) -> Self { + Self { + inner: Arc::new(GraphNamespacePublisher::new(root_uri, branch)), + requests: Arc::new(Mutex::new(Vec::new())), + } + } + + async fn recorded_requests(&self) -> Vec { + self.requests.lock().await.clone() + } +} + +#[async_trait] +impl ManifestBatchPublisher for RecordingPublisher { + async fn publish(&self, updates: &[SubTableUpdate]) -> Result { + let requests: Vec = updates + .iter() + .map(SubTableUpdate::to_create_table_version_request) + .collect(); + self.requests.lock().await.extend_from_slice(&requests); + self.inner.publish_requests(&requests).await + } +} + +struct FailingPublisher; + 
+#[async_trait] +impl ManifestBatchPublisher for FailingPublisher { + async fn publish(&self, _updates: &[SubTableUpdate]) -> Result { + Err(OmniError::manifest( + "injected batch publisher failure".to_string(), + )) + } +} + +#[tokio::test] +async fn test_commit_routes_through_injected_batch_publisher() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let catalog = build_test_catalog(); + + let mut mc = ManifestCoordinator::init(uri, &catalog).await.unwrap(); + let snap = mc.snapshot(); + let person_entry = snap.entry("node:Person").unwrap().clone(); + let mut person_ds = Dataset::open(&format!("{}/{}", uri, person_entry.table_path)) + .await + .unwrap(); + let person_schema = Arc::new(person_ds.schema().into()); + let person_batch = RecordBatch::try_new( + Arc::clone(&person_schema), + vec![ + Arc::new(StringArray::from(vec!["person-1"])), + Arc::new(StringArray::from(vec!["Alice"])), + Arc::new(Int32Array::from(vec![Some(30)])), + ], + ) + .unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(person_batch)], person_schema); + person_ds.append(reader, None).await.unwrap(); + let person_version = person_ds.version().version; + let version_metadata = + table_version_metadata_for_state(uri, &person_entry.table_path, None, person_version) + .await + .unwrap(); + + let recording = RecordingPublisher::new(uri, None); + mc = mc.with_batch_publisher(Arc::new(recording.clone())); + + mc.commit(&[SubTableUpdate { + table_key: "node:Person".to_string(), + table_version: person_version, + table_branch: None, + row_count: 1, + version_metadata: version_metadata.clone(), + }]) + .await + .unwrap(); + + let recorded = recording.recorded_requests().await; + assert_eq!(recorded.len(), 1); + let request = &recorded[0]; + assert_eq!( + request.id.as_ref().unwrap(), + &vec!["node:Person".to_string()] + ); + assert_eq!(request.version as u64, person_version); + assert_eq!(request.manifest_path, version_metadata.manifest_path()); + 
assert_eq!( + request.manifest_size, + version_metadata.manifest_size().map(|size| size as i64) + ); + assert_eq!(request.e_tag.as_deref(), version_metadata.e_tag()); + assert_eq!( + request.naming_scheme.as_deref(), + version_metadata.naming_scheme() + ); + assert_eq!( + request + .metadata + .as_ref() + .and_then(|metadata| metadata.get(OMNIGRAPH_ROW_COUNT_KEY)) + .map(String::as_str), + Some("1") + ); + assert_eq!( + mc.snapshot().entry("node:Person").unwrap().table_version, + person_version + ); +} + +#[tokio::test] +async fn test_commit_failure_from_injected_batch_publisher_preserves_visible_state() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let catalog = build_test_catalog(); + + let mut mc = ManifestCoordinator::init(uri, &catalog).await.unwrap(); + let manifest_version = mc.version(); + let snap = mc.snapshot(); + let person_entry = snap.entry("node:Person").unwrap().clone(); + let mut person_ds = Dataset::open(&format!("{}/{}", uri, person_entry.table_path)) + .await + .unwrap(); + let person_schema = Arc::new(person_ds.schema().into()); + let person_batch = RecordBatch::try_new( + Arc::clone(&person_schema), + vec![ + Arc::new(StringArray::from(vec!["person-1"])), + Arc::new(StringArray::from(vec!["Alice"])), + Arc::new(Int32Array::from(vec![Some(30)])), + ], + ) + .unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(person_batch)], person_schema); + person_ds.append(reader, None).await.unwrap(); + let person_version = person_ds.version().version; + let version_metadata = + table_version_metadata_for_state(uri, &person_entry.table_path, None, person_version) + .await + .unwrap(); + + mc = mc.with_batch_publisher(Arc::new(FailingPublisher)); + let err = mc + .commit(&[SubTableUpdate { + table_key: "node:Person".to_string(), + table_version: person_version, + table_branch: None, + row_count: 1, + version_metadata, + }]) + .await + .unwrap_err(); + assert!(err.to_string().contains("injected batch publisher 
/// `string_column` is asked for a column named `table_key` that is stored as
/// `UInt64`; the call must return an error whose message names the column.
#[test]
fn manifest_column_helpers_return_error_for_bad_schema() {
    let mistyped_field = Field::new("table_key", DataType::UInt64, false);
    let schema = Arc::new(Schema::new(vec![mistyped_field]));
    let batch = RecordBatch::try_new(
        schema,
        vec![Arc::new(UInt64Array::from(vec![1_u64]))],
    )
    .unwrap();

    let message = string_column(&batch, "table_key").unwrap_err().to_string();
    assert!(message.contains("table_key"));
}
/// Outcome kind reported for a merge.
///
/// NOTE(review): the merge implementation is outside this view, so the
/// per-variant notes below are inferred from the variant names only —
/// confirm them against the merge code before relying on them.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum MergeOutcome {
    /// Presumably: there was nothing to merge into the target.
    AlreadyUpToDate,
    /// Presumably: the target was advanced without creating a merge commit.
    FastForward,
    /// Presumably: the two histories had diverged and were actually merged.
    Merged,
}
+ /// + /// Creates `_schema.pg`, per-type Lance datasets, and `__manifest`. + pub async fn init(uri: &str, schema_source: &str) -> Result { + Self::init_with_storage(uri, schema_source, storage_for_uri(uri)?).await + } + + pub(crate) async fn init_with_storage( + uri: &str, + schema_source: &str, + storage: Arc, + ) -> Result { + let root = normalize_root_uri(uri)?; + let schema_ir = read_schema_ir_from_source(schema_source)?; + let mut catalog = build_catalog_from_ir(&schema_ir)?; + fixup_blob_schemas(&mut catalog); + + // Write _schema.pg + let schema_path = join_uri(&root, SCHEMA_SOURCE_FILENAME); + storage.write_text(&schema_path, schema_source).await?; + write_schema_contract(&root, storage.as_ref(), &schema_ir).await?; + + // Create manifest + per-type datasets + let coordinator = GraphCoordinator::init(&root, &catalog, Arc::clone(&storage)).await?; + + Ok(Self { + root_uri: root.clone(), + storage, + coordinator, + table_store: TableStore::new(&root), + runtime_cache: RuntimeCache::default(), + catalog, + schema_source: schema_source.to_string(), + audit_actor_id: None, + }) + } + + /// Open an existing repo. + /// + /// Reads `_schema.pg`, parses it, builds the catalog, and opens `__manifest`. 
+ pub async fn open(uri: &str) -> Result { + Self::open_with_storage(uri, storage_for_uri(uri)?).await + } + + pub(crate) async fn open_with_storage( + uri: &str, + storage: Arc, + ) -> Result { + let root = normalize_root_uri(uri)?; + // Read _schema.pg + let schema_path = join_uri(&root, SCHEMA_SOURCE_FILENAME); + let schema_source = storage.read_text(&schema_path).await?; + let current_source_ir = read_schema_ir_from_source(&schema_source)?; + let coordinator = GraphCoordinator::open(&root, Arc::clone(&storage)).await?; + let branches = coordinator.branch_list().await?; + let (accepted_ir, _) = load_or_bootstrap_schema_contract( + &root, + Arc::clone(&storage), + &branches, + ¤t_source_ir, + ) + .await?; + let mut catalog = build_catalog_from_ir(&accepted_ir)?; + fixup_blob_schemas(&mut catalog); + + Ok(Self { + root_uri: root.clone(), + storage, + coordinator, + table_store: TableStore::new(&root), + runtime_cache: RuntimeCache::default(), + catalog, + schema_source, + audit_actor_id: None, + }) + } + + pub fn catalog(&self) -> &Catalog { + &self.catalog + } + + pub fn schema_source(&self) -> &str { + &self.schema_source + } + + pub fn uri(&self) -> &str { + &self.root_uri + } + + pub(crate) async fn ensure_schema_state_valid(&self) -> Result<()> { + validate_schema_contract(self.uri(), Arc::clone(&self.storage)).await + } + + pub async fn plan_schema(&self, desired_schema_source: &str) -> Result { + self.ensure_schema_state_valid().await?; + let accepted_ir = read_accepted_schema_ir(self.uri(), Arc::clone(&self.storage)).await?; + let desired_ir = read_schema_ir_from_source(desired_schema_source)?; + plan_schema_migration(&accepted_ir, &desired_ir) + .map_err(|err| OmniError::manifest(err.to_string())) + } + + pub(crate) fn table_store(&self) -> &TableStore { + &self.table_store + } + + pub(crate) async fn open_coordinator_for_branch( + &self, + branch: Option<&str>, + ) -> Result { + match branch { + Some(branch) => { + 
GraphCoordinator::open_branch(self.uri(), branch, Arc::clone(&self.storage)).await + } + None => GraphCoordinator::open(self.uri(), Arc::clone(&self.storage)).await, + } + } + + pub(crate) async fn swap_coordinator_for_branch( + &mut self, + branch: Option<&str>, + ) -> Result { + let next = self.open_coordinator_for_branch(branch).await?; + Ok(std::mem::replace(&mut self.coordinator, next)) + } + + pub(crate) fn restore_coordinator(&mut self, coordinator: GraphCoordinator) { + self.coordinator = coordinator; + } + + pub(crate) async fn resolved_branch_target( + &self, + branch: Option<&str>, + ) -> Result { + self.ensure_schema_state_valid().await?; + let requested = ReadTarget::Branch(branch.unwrap_or("main").to_string()); + let normalized = normalize_branch_name(branch.unwrap_or("main"))?; + if normalized.as_deref() == self.coordinator.current_branch() { + let snapshot_id = self.coordinator.head_commit_id().await?.unwrap_or_else(|| { + SnapshotId::synthetic( + self.coordinator.current_branch(), + self.coordinator.version(), + ) + }); + return Ok(ResolvedTarget { + requested, + branch: self.coordinator.current_branch().map(str::to_string), + snapshot_id, + snapshot: self.coordinator.snapshot(), + }); + } + self.coordinator.resolve_target(&requested).await + } + + pub(crate) async fn snapshot_for_branch(&self, branch: Option<&str>) -> Result { + self.resolved_branch_target(branch) + .await + .map(|resolved| resolved.snapshot) + } + + pub(crate) fn version(&self) -> u64 { + self.coordinator.version() + } + + /// Return an immutable Snapshot from the known manifest state. No storage I/O. 
+ pub(crate) fn snapshot(&self) -> Snapshot { + self.coordinator.snapshot() + } + + pub async fn snapshot_of(&self, target: impl Into) -> Result { + self.resolved_target(target) + .await + .map(|resolved| resolved.snapshot) + } + + pub async fn version_of(&self, target: impl Into) -> Result { + self.snapshot_of(target) + .await + .map(|snapshot| snapshot.version()) + } + + pub async fn resolved_branch_of( + &self, + target: impl Into, + ) -> Result> { + self.resolved_target(target) + .await + .map(|resolved| resolved.branch) + } + + /// Synchronize this handle's write base to the latest head of the named branch. + pub async fn sync_branch(&mut self, branch: &str) -> Result<()> { + self.ensure_schema_state_valid().await?; + let branch = normalize_branch_name(branch)?; + self.coordinator = self.open_coordinator_for_branch(branch.as_deref()).await?; + self.runtime_cache.invalidate_all().await; + Ok(()) + } + + /// Re-read the handle-local coordinator state from storage. + pub(crate) async fn refresh(&mut self) -> Result<()> { + self.coordinator.refresh().await?; + self.runtime_cache.invalidate_all().await; + Ok(()) + } + + pub async fn resolve_snapshot(&self, branch: &str) -> Result { + self.ensure_schema_state_valid().await?; + self.coordinator.resolve_snapshot_id(branch).await + } + + pub(crate) async fn resolved_target( + &self, + target: impl Into, + ) -> Result { + self.ensure_schema_state_valid().await?; + self.coordinator.resolve_target(&target.into()).await + } + + // ─── Change detection ──────────────────────────────────────────────── + + pub async fn diff_between( + &self, + from: impl Into, + to: impl Into, + filter: &crate::changes::ChangeFilter, + ) -> Result { + let from_resolved = self.resolved_target(from).await?; + let to_resolved = self.resolved_target(to).await?; + crate::changes::diff_snapshots( + self.uri(), + &from_resolved.snapshot, + &to_resolved.snapshot, + filter, + to_resolved.branch.clone().or(from_resolved.branch.clone()), + ) + .await + 
    /// Diff two graph commits. Resolves each commit to `(manifest_branch, manifest_version)`
    /// and creates branch-aware snapshots. Supports cross-branch comparison.
    ///
    /// Both ids are first looked up with `resolve_commit`; each resolved
    /// commit's `graph_commit_id` is then resolved again as a
    /// `ReadTarget::Snapshot` to obtain the snapshot that is diffed. Errors
    /// from any of the four lookups are propagated with `?`.
    ///
    /// NOTE(review): the generic argument of the return type is missing in
    /// this copy of the file (`Result` with no payload type); restore it from
    /// the return type of `crate::changes::diff_snapshots`.
    pub async fn diff_commits(
        &self,
        from_commit_id: &str,
        to_commit_id: &str,
        filter: &crate::changes::ChangeFilter,
    ) -> Result {
        let from_commit = self
            .coordinator
            .resolve_commit(&SnapshotId::new(from_commit_id))
            .await?;
        let to_commit = self
            .coordinator
            .resolve_commit(&SnapshotId::new(to_commit_id))
            .await?;
        let from_snap = self
            .coordinator
            .resolve_target(&ReadTarget::Snapshot(SnapshotId::new(
                from_commit.graph_commit_id.clone(),
            )))
            .await?;
        let to_snap = self
            .coordinator
            .resolve_target(&ReadTarget::Snapshot(SnapshotId::new(
                to_commit.graph_commit_id.clone(),
            )))
            .await?;
        crate::changes::diff_snapshots(
            self.uri(),
            &from_snap.snapshot,
            &to_snap.snapshot,
            filter,
            // Prefer the branch of the `to` side; fall back to the `from` side.
            to_snap.branch.clone().or(from_snap.branch.clone()),
        )
        .await
    }
+ pub async fn snapshot_at_version(&self, version: u64) -> Result { + self.ensure_schema_state_valid().await?; + self.coordinator.snapshot_at_version(version).await + } + + pub async fn export_jsonl( + &self, + branch: &str, + type_names: &[String], + table_keys: &[String], + ) -> Result { + self.ensure_schema_state_valid().await?; + let snapshot = self.snapshot_of(ReadTarget::branch(branch)).await?; + self.export_snapshot_jsonl(&snapshot, type_names, table_keys) + .await + } + + async fn entity_from_snapshot( + &self, + snapshot: &Snapshot, + table_key: &str, + id: &str, + ) -> Result> { + if snapshot.entry(table_key).is_none() { + return Ok(None); + } + + let ds = self + .table_store + .open_snapshot_table(snapshot, table_key) + .await?; + let filter_sql = format!("id = '{}'", id.replace('\'', "''")); + let batches = self + .table_store + .scan(&ds, None, Some(&filter_sql), None) + .await?; + let Some(batch) = batches.iter().find(|batch| batch.num_rows() > 0) else { + return Ok(None); + }; + Ok(Some(record_batch_row_to_json(batch, 0)?)) + } + + async fn export_snapshot_jsonl( + &self, + snapshot: &Snapshot, + type_names: &[String], + table_keys: &[String], + ) -> Result { + let selected_tables = self.export_table_keys(snapshot, type_names, table_keys)?; + let mut out = String::new(); + for table_key in selected_tables { + for row in self.export_table_rows(snapshot, &table_key).await? 
{ + out.push_str(&serde_json::to_string(&row).map_err(|err| { + OmniError::manifest(format!( + "failed to serialize export row for '{}': {}", + table_key, err + )) + })?); + out.push('\n'); + } + } + Ok(out) + } + + fn export_table_keys( + &self, + snapshot: &Snapshot, + type_names: &[String], + table_keys: &[String], + ) -> Result> { + let available = snapshot + .entries() + .map(|entry| entry.table_key.clone()) + .collect::>(); + let mut selected = BTreeSet::new(); + + for table_key in table_keys { + if !available.contains(table_key) { + return Err(OmniError::manifest(format!( + "unknown export table '{}'", + table_key + ))); + } + selected.insert(table_key.clone()); + } + + for type_name in type_names { + let mut matched = false; + let node_key = format!("node:{}", type_name); + if available.contains(&node_key) { + selected.insert(node_key); + matched = true; + } + let edge_key = format!("edge:{}", type_name); + if available.contains(&edge_key) { + selected.insert(edge_key); + matched = true; + } + if !matched { + return Err(OmniError::manifest(format!( + "unknown export type '{}'", + type_name + ))); + } + } + + if selected.is_empty() { + return Ok(available.into_iter().collect()); + } + + Ok(selected.into_iter().collect()) + } + + async fn export_table_rows( + &self, + snapshot: &Snapshot, + table_key: &str, + ) -> Result> { + let ds = self + .table_store + .open_snapshot_table(snapshot, table_key) + .await?; + let ordering = Some(vec![ColumnOrdering::asc_nulls_last("id".to_string())]); + let blob_properties = blob_properties_for_table_key(self.catalog(), table_key)?; + + if blob_properties.is_empty() { + let batch = concat_or_empty_batches( + schema_for_table_key(self.catalog(), table_key)?, + self.table_store.scan(&ds, None, None, ordering).await?, + )?; + return self.export_rows_from_batch(table_key, &batch, None).await; + } + + let batches = self + .table_store + .scan_with(&ds, None, None, ordering, true, |_| Ok(())) + .await?; + if batches.is_empty() { + 
    /// Get or build the graph index for the current snapshot.
    ///
    /// Validates the schema contract, resolves the coordinator's current
    /// branch (using `"main"` when `current_branch()` is `None`) through
    /// `resolve_target`, and asks the runtime cache for the index of that
    /// resolved target and this handle's catalog.
    ///
    /// NOTE(review): the generic arguments of the return type are missing in
    /// this copy of the file (`Result>`); restore them from the return type
    /// of `RuntimeCache::graph_index`.
    pub async fn graph_index(&self) -> Result> {
        self.ensure_schema_state_valid().await?;
        let resolved = self
            .coordinator
            .resolve_target(&ReadTarget::Branch(
                self.coordinator
                    .current_branch()
                    .unwrap_or("main")
                    .to_string(),
            ))
            .await?;
        self.runtime_cache
            .graph_index(&resolved, &self.catalog)
            .await
    }
+ /// + /// On named branches, indexing preserves lazy branching: + /// unbranched subtables keep inheriting `main`, while subtables inherited + /// from an ancestor branch are first forked into the active branch before + /// their index metadata is updated. + pub async fn ensure_indices(&mut self) -> Result<()> { + let current_branch = self.coordinator.current_branch().map(str::to_string); + self.ensure_indices_for_branch(current_branch.as_deref()) + .await + } + + pub async fn ensure_indices_on(&mut self, branch: &str) -> Result<()> { + let branch = normalize_branch_name(branch)?; + self.ensure_indices_for_branch(branch.as_deref()).await + } + + pub(crate) async fn ensure_indices_for_branch(&mut self, branch: Option<&str>) -> Result<()> { + self.ensure_schema_state_valid().await?; + let resolved = self.resolved_branch_target(branch).await?; + let snapshot = resolved.snapshot; + let mut updates = Vec::new(); + let active_branch = resolved.branch; + + for type_name in self.catalog.node_types.keys() { + let table_key = format!("node:{}", type_name); + let Some(entry) = snapshot.entry(&table_key) else { + continue; + }; + let full_path = format!("{}/{}", self.root_uri, entry.table_path); + let (mut ds, resolved_branch) = match active_branch.as_deref() { + Some(active_branch) => match entry.table_branch.as_deref() { + None => continue, + _ => { + self.open_owned_dataset_for_branch_write( + &table_key, + &full_path, + entry.table_branch.as_deref(), + entry.table_version, + active_branch, + ) + .await? 
+ } + }, + None => ( + self.table_store + .open_dataset_head_for_write(&table_key, &full_path, None) + .await?, + None, + ), + }; + let row_count = self.table_store.count_rows(&ds, None).await.unwrap_or(0); + if row_count > 0 { + self.build_indices_on_dataset(&table_key, &mut ds).await?; + } + + let state = self.table_store.table_state(&full_path, &ds).await?; + if state.version != entry.table_version + || resolved_branch.as_deref() != entry.table_branch.as_deref() + { + updates.push(crate::db::SubTableUpdate { + table_key, + table_version: state.version, + table_branch: resolved_branch, + row_count: state.row_count, + version_metadata: state.version_metadata, + }); + } + } + + for edge_name in self.catalog.edge_types.keys() { + let table_key = format!("edge:{}", edge_name); + let Some(entry) = snapshot.entry(&table_key) else { + continue; + }; + let full_path = format!("{}/{}", self.root_uri, entry.table_path); + let (mut ds, resolved_branch) = match active_branch.as_deref() { + Some(active_branch) => match entry.table_branch.as_deref() { + None => continue, + _ => { + self.open_owned_dataset_for_branch_write( + &table_key, + &full_path, + entry.table_branch.as_deref(), + entry.table_version, + active_branch, + ) + .await? 
+ } + }, + None => ( + self.table_store + .open_dataset_head_for_write(&table_key, &full_path, None) + .await?, + None, + ), + }; + let row_count = self.table_store.count_rows(&ds, None).await.unwrap_or(0); + if row_count > 0 { + self.build_indices_on_dataset(&table_key, &mut ds).await?; + } + + let state = self.table_store.table_state(&full_path, &ds).await?; + if state.version != entry.table_version + || resolved_branch.as_deref() != entry.table_branch.as_deref() + { + updates.push(crate::db::SubTableUpdate { + table_key, + table_version: state.version, + table_branch: resolved_branch, + row_count: state.row_count, + version_metadata: state.version_metadata, + }); + } + } + + if !updates.is_empty() { + self.commit_prepared_updates_on_branch(branch, &updates) + .await?; + } + + Ok(()) + } + + /// Read a blob from a node by its string ID and property name. + /// + /// Returns a `BlobFile` handle with async `read()`, `seek()`, `tell()`, + /// and metadata accessors (`size()`, `kind()`, `uri()`). + /// + /// ```ignore + /// let blob = db.read_blob("Document", "readme", "content").await?; + /// let bytes = blob.read().await?; + /// ``` + pub async fn read_blob(&self, type_name: &str, id: &str, property: &str) -> Result { + self.ensure_schema_state_valid().await?; + let node_type = self + .catalog + .node_types + .get(type_name) + .ok_or_else(|| OmniError::manifest(format!("unknown node type '{}'", type_name)))?; + if !node_type.blob_properties.contains(property) { + return Err(OmniError::manifest(format!( + "property '{}' on type '{}' is not a Blob", + property, type_name + ))); + } + + let snapshot = self.snapshot(); + let table_key = format!("node:{}", type_name); + let ds = snapshot.open(&table_key).await?; + + let filter_sql = format!("id = '{}'", id.replace('\'', "''")); + let row_id = self + .table_store + .first_row_id_for_filter(&ds, &filter_sql) + .await? 
+ .ok_or_else(|| { + OmniError::manifest(format!("no {} with id '{}' found", type_name, id)) + })?; + + // Use take_blobs to get the BlobFile handle + let ds = Arc::new(ds); + let mut blobs = ds + .take_blobs(&[row_id], property) + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + + blobs.pop().ok_or_else(|| { + OmniError::manifest(format!( + "blob '{}' on {} '{}' returned no data", + property, type_name, id + )) + }) + } + + pub(crate) fn active_branch(&self) -> Option<&str> { + self.coordinator.current_branch() + } + + async fn ensure_branch_delete_safe(&self, branch: &str, branches: &[String]) -> Result<()> { + let descendants = self.coordinator.branch_descendants(branch).await?; + if let Some(descendant) = descendants.first() { + return Err(OmniError::manifest_conflict(format!( + "cannot delete branch '{}' because descendant branch '{}' still depends on it", + branch, descendant + ))); + } + + for run in self.list_runs().await? { + if run.target_branch == branch + && matches!(run.status, RunStatus::Running | RunStatus::Failed) + { + return Err(OmniError::manifest_conflict(format!( + "cannot delete branch '{}' while run '{}' targeting it is {}", + branch, + run.run_id, + run.status.as_str() + ))); + } + } + + for other_branch in branches + .iter() + .filter(|candidate| candidate.as_str() != branch) + { + let snapshot = self + .snapshot_of(ReadTarget::branch(other_branch.as_str())) + .await?; + if snapshot + .entries() + .any(|entry| entry.table_branch.as_deref() == Some(branch)) + { + return Err(OmniError::manifest_conflict(format!( + "cannot delete branch '{}' because branch '{}' still depends on it", + branch, other_branch + ))); + } + } + + Ok(()) + } + + async fn cleanup_deleted_branch_tables( + &self, + branch: &str, + owned_tables: &[(String, String)], + ) -> Result<()> { + let mut seen_paths = HashSet::new(); + let mut cleanup_targets = owned_tables + .iter() + .filter(|(_, table_path)| seen_paths.insert(table_path.clone())) + .cloned() + 
.collect::>(); + cleanup_targets.sort_by(|left, right| left.0.cmp(&right.0)); + + for (table_key, table_path) in cleanup_targets { + let dataset_uri = self.table_store.dataset_uri(&table_path); + if let Err(err) = self.table_store.delete_branch(&dataset_uri, branch).await { + return Err(OmniError::manifest_internal(format!( + "branch '{}' was deleted but cleanup failed for {}: {}", + branch, table_key, err + ))); + } + } + + Ok(()) + } + + async fn delete_branch_storage_only(&mut self, branch: &str) -> Result<()> { + if self.coordinator.current_branch() == Some(branch) { + return Err(OmniError::manifest_conflict(format!( + "cannot delete currently active branch '{}'", + branch + ))); + } + + let branch_snapshot = self.snapshot_of(ReadTarget::branch(branch)).await?; + let owned_tables = branch_snapshot + .entries() + .filter(|entry| entry.table_branch.as_deref() == Some(branch)) + .map(|entry| (entry.table_key.clone(), entry.table_path.clone())) + .collect::>(); + + self.coordinator.branch_delete(branch).await?; + self.cleanup_deleted_branch_tables(branch, &owned_tables) + .await + } + + async fn cleanup_terminal_run_branches_for_target(&mut self, branch: &str) -> Result<()> { + let terminal_run_branches = self + .list_runs() + .await? 
+ .into_iter() + .filter(|run| { + run.target_branch == branch + && matches!(run.status, RunStatus::Published | RunStatus::Aborted) + }) + .map(|run| run.run_branch) + .collect::>(); + + for run_branch in terminal_run_branches { + match self.delete_branch_storage_only(&run_branch).await { + Ok(()) => {} + Err(OmniError::Manifest(err)) if err.kind == ManifestErrorKind::NotFound => {} + Err(err) => return Err(err), + } + } + + Ok(()) + } + + pub(crate) fn normalize_branch_name(branch: &str) -> Result> { + normalize_branch_name(branch) + } + + pub(crate) async fn head_commit_id_for_branch( + &self, + branch: Option<&str>, + ) -> Result> { + let mut coordinator = self.open_coordinator_for_branch(branch).await?; + coordinator.ensure_commit_graph_initialized().await?; + coordinator + .head_commit_id() + .await + .map(|id| id.map(|snapshot_id| snapshot_id.as_str().to_string())) + } + + pub async fn branch_create(&mut self, name: &str) -> Result<()> { + self.ensure_schema_state_valid().await?; + ensure_public_branch_ref(name, "branch_create")?; + self.coordinator.branch_create(name).await + } + + pub(crate) fn current_audit_actor(&self) -> Option<&str> { + self.audit_actor_id.as_deref() + } + + pub async fn branch_create_from( + &mut self, + from: impl Into, + name: &str, + ) -> Result<()> { + self.branch_create_from_impl(from, name, false).await + } + + async fn branch_create_from_impl( + &mut self, + from: impl Into, + name: &str, + allow_internal_refs: bool, + ) -> Result<()> { + let target = from.into(); + let ReadTarget::Branch(branch_name) = target else { + return Err(OmniError::manifest( + "branch creation from pinned snapshots is not supported yet".to_string(), + )); + }; + if !allow_internal_refs { + ensure_public_branch_ref(&branch_name, "branch_create_from")?; + ensure_public_branch_ref(name, "branch_create_from")?; + } + let branch = normalize_branch_name(&branch_name)?; + let previous = self.swap_coordinator_for_branch(branch.as_deref()).await?; + let result = 
self.coordinator.branch_create(name).await; + self.restore_coordinator(previous); + result + } + + pub async fn branch_list(&self) -> Result> { + self.ensure_schema_state_valid().await?; + self.coordinator.branch_list().await + } + + pub async fn branch_delete(&mut self, name: &str) -> Result<()> { + self.ensure_schema_state_valid().await?; + ensure_public_branch_ref(name, "branch_delete")?; + self.refresh().await?; + let branch = normalize_branch_name(name)? + .ok_or_else(|| OmniError::manifest("cannot delete branch 'main'".to_string()))?; + let branches = self.coordinator.branch_list().await?; + if !branches.iter().any(|candidate| candidate == &branch) { + return Err(OmniError::manifest_not_found(format!( + "branch '{}' not found", + branch + ))); + } + + self.ensure_branch_delete_safe(&branch, &branches).await?; + self.cleanup_terminal_run_branches_for_target(&branch) + .await?; + self.delete_branch_storage_only(&branch).await + } + + pub(crate) async fn latest_branch_snapshot_id(&self, branch: &str) -> Result { + let normalized = normalize_branch_name(branch)?; + let fresh = self + .open_coordinator_for_branch(normalized.as_deref()) + .await?; + fresh.resolve_snapshot_id(branch).await + } + + pub async fn begin_run( + &mut self, + target_branch: &str, + operation_hash: Option<&str>, + ) -> Result { + self.begin_run_as(target_branch, operation_hash, None).await + } + + pub async fn begin_run_as( + &mut self, + target_branch: &str, + operation_hash: Option<&str>, + actor_id: Option<&str>, + ) -> Result { + self.ensure_schema_state_valid().await?; + ensure_public_branch_ref(target_branch, "begin_run")?; + let target_branch = + normalize_branch_name(target_branch)?.unwrap_or_else(|| "main".to_string()); + let fresh = self + .open_coordinator_for_branch(Self::normalize_branch_name(&target_branch)?.as_deref()) + .await?; + let base_snapshot_id = fresh.resolve_snapshot_id(&target_branch).await?; + let base_manifest_version = fresh.version(); + let record = 
RunRecord::new( + target_branch.clone(), + base_snapshot_id.as_str(), + base_manifest_version, + operation_hash.map(str::to_string), + actor_id + .map(str::to_string) + .or_else(|| self.current_audit_actor().map(str::to_string)), + )?; + + self.branch_create_from_impl( + ReadTarget::branch(target_branch.clone()), + &record.run_branch, + true, + ) + .await?; + self.coordinator.append_run_record(&record).await?; + Ok(record) + } + + pub async fn get_run(&self, run_id: &RunId) -> Result { + self.ensure_schema_state_valid().await?; + self.coordinator.get_run(run_id).await + } + + pub async fn list_runs(&self) -> Result> { + self.ensure_schema_state_valid().await?; + self.coordinator.list_runs().await + } + + pub async fn get_commit(&self, commit_id: &str) -> Result { + self.ensure_schema_state_valid().await?; + self.coordinator + .resolve_commit(&SnapshotId::new(commit_id)) + .await + } + + pub async fn list_commits(&self, branch: Option<&str>) -> Result> { + self.ensure_schema_state_valid().await?; + let branch = match branch { + Some(branch) => normalize_branch_name(branch)?, + None => None, + }; + let coordinator = self.open_coordinator_for_branch(branch.as_deref()).await?; + coordinator.list_commits().await + } + + pub async fn abort_run(&mut self, run_id: &RunId) -> Result { + self.ensure_schema_state_valid().await?; + let run = self.get_run(run_id).await?; + match run.status { + RunStatus::Running | RunStatus::Failed => { + let updated = run.with_status(RunStatus::Aborted, None)?; + self.coordinator.append_run_record(&updated).await?; + Ok(updated) + } + RunStatus::Published => Err(OmniError::manifest_conflict(format!( + "run '{}' is already published", + run_id + ))), + RunStatus::Aborted => Err(OmniError::manifest_conflict(format!( + "run '{}' is already aborted", + run_id + ))), + } + } + + pub async fn fail_run(&mut self, run_id: &RunId) -> Result { + self.ensure_schema_state_valid().await?; + let run = self.get_run(run_id).await?; + match run.status { + 
RunStatus::Running => { + let updated = run.with_status(RunStatus::Failed, None)?; + self.coordinator.append_run_record(&updated).await?; + Ok(updated) + } + RunStatus::Failed => Ok(run), + RunStatus::Published => Err(OmniError::manifest_conflict(format!( + "run '{}' is already published", + run_id + ))), + RunStatus::Aborted => Err(OmniError::manifest_conflict(format!( + "run '{}' is already aborted", + run_id + ))), + } + } + + pub async fn publish_run(&mut self, run_id: &RunId) -> Result { + self.publish_run_as(run_id, None).await + } + + pub async fn publish_run_as( + &mut self, + run_id: &RunId, + actor_id: Option<&str>, + ) -> Result { + self.ensure_schema_state_valid().await?; + let run = self.get_run(run_id).await?; + match run.status { + RunStatus::Running => {} + RunStatus::Published => { + return run + .published_snapshot_id + .clone() + .map(SnapshotId::new) + .ok_or_else(|| { + OmniError::manifest(format!( + "run '{}' is published but missing published snapshot id", + run_id + )) + }); + } + RunStatus::Failed | RunStatus::Aborted => { + return Err(OmniError::manifest_conflict(format!( + "run '{}' is not publishable from status '{}'", + run_id, + run.status.as_str() + ))); + } + } + + let publish_actor = actor_id + .map(str::to_string) + .or_else(|| run.actor_id.clone()); + let current_target_snapshot_id = self.resolve_snapshot(&run.target_branch).await?; + let previous_actor = self.audit_actor_id.clone(); + self.audit_actor_id = publish_actor.clone(); + let publish_result = if current_target_snapshot_id.as_str() == run.base_snapshot_id { + let run_for_promotion = run.clone(); + self.sync_branch(&run_for_promotion.target_branch).await?; + self.promote_run_snapshot_to_target(&run_for_promotion) + .await + } else { + let run_branch = run.run_branch.clone(); + let target_branch = run.target_branch.clone(); + self.branch_merge_internal(&run_branch, &target_branch) + .await?; + self.reify_internal_run_refs(&target_branch, &run_branch) + .await + }; + 
self.audit_actor_id = previous_actor; + publish_result?; + let published_snapshot_id = self.resolve_snapshot(&run.target_branch).await?; + let updated = run.with_status( + RunStatus::Published, + Some(published_snapshot_id.as_str().to_string()), + )?; + self.coordinator.append_run_record(&updated).await?; + Ok(published_snapshot_id) + } + + async fn promote_run_snapshot_to_target(&mut self, run: &RunRecord) -> Result<()> { + let target_snapshot = self + .snapshot_of(ReadTarget::branch(run.target_branch.as_str())) + .await?; + let run_snapshot = self + .snapshot_of(ReadTarget::branch(run.run_branch.as_str())) + .await?; + let mut table_keys = std::collections::BTreeSet::new(); + for entry in target_snapshot.entries() { + table_keys.insert(entry.table_key.clone()); + } + for entry in run_snapshot.entries() { + table_keys.insert(entry.table_key.clone()); + } + + let mut updates = Vec::new(); + let mut changed_edge_tables = false; + let target_branch = normalize_branch_name(&run.target_branch)?; + + for table_key in table_keys { + let target_entry = target_snapshot.entry(&table_key); + let run_entry = run_snapshot.entry(&table_key); + if same_manifest_state(target_entry, run_entry) { + continue; + } + let Some(_run_entry) = run_entry else { + return Err(OmniError::manifest(format!( + "run '{}' removed table '{}' which publish_run does not support", + run.run_id, table_key + ))); + }; + + let source_ds = run_snapshot.open(&table_key).await?; + let batch = self.batch_for_table_rewrite(&source_ds, &table_key).await?; + + let (mut target_ds, full_path, table_branch) = self + .open_for_mutation_on_branch(target_branch.as_deref(), &table_key) + .await?; + let state = self + .table_store() + .overwrite_batch(&full_path, &mut target_ds, batch) + .await?; + updates.push(crate::db::SubTableUpdate { + table_key: table_key.clone(), + table_version: state.version, + table_branch, + row_count: state.row_count, + version_metadata: state.version_metadata, + }); + if 
table_key.starts_with("edge:") { + changed_edge_tables = true; + } + } + + if !updates.is_empty() { + self.commit_updates_on_branch(target_branch.as_deref(), &updates) + .await?; + if changed_edge_tables { + self.invalidate_graph_index().await; + } + } + + Ok(()) + } + + async fn reify_internal_run_refs( + &mut self, + target_branch: &str, + run_branch: &str, + ) -> Result<()> { + let target_snapshot = self.snapshot_of(ReadTarget::branch(target_branch)).await?; + let mut updates = Vec::new(); + let mut changed_edge_tables = false; + let target_branch = normalize_branch_name(target_branch)?; + + for entry in target_snapshot.entries() { + if entry.table_branch.as_deref() != Some(run_branch) { + continue; + } + + let source_ds = target_snapshot.open(&entry.table_key).await?; + let batch = self + .batch_for_table_rewrite(&source_ds, &entry.table_key) + .await?; + + let (mut target_ds, full_path, table_branch) = self + .open_for_mutation_on_branch(target_branch.as_deref(), &entry.table_key) + .await?; + let state = self + .table_store() + .overwrite_batch(&full_path, &mut target_ds, batch) + .await?; + updates.push(crate::db::SubTableUpdate { + table_key: entry.table_key.clone(), + table_version: state.version, + table_branch, + row_count: state.row_count, + version_metadata: state.version_metadata, + }); + if entry.table_key.starts_with("edge:") { + changed_edge_tables = true; + } + } + + if !updates.is_empty() { + self.commit_updates_on_branch(target_branch.as_deref(), &updates) + .await?; + if changed_edge_tables { + self.invalidate_graph_index().await; + } + } + + Ok(()) + } + + /// Open a sub-table for mutation with version-drift guard. + /// + /// Checks that the dataset's current version matches the snapshot-pinned + /// version. If another writer has advanced the version, returns an error + /// prompting the caller to refresh and retry (optimistic concurrency). 
+    pub(crate) async fn open_for_mutation(
+        &self,
+        table_key: &str,
+    ) -> Result<(Dataset, String, Option<String>)> {
+        let current_branch = self.coordinator.current_branch().map(str::to_string);
+        self.open_for_mutation_on_branch(current_branch.as_deref(), table_key)
+            .await
+    }
+
+    /// Open the sub-table `table_key` for mutation on `branch`.
+    ///
+    /// Returns the writable dataset, its full path (`root_uri` joined with the
+    /// manifest entry's table path) and the table branch that a subsequent
+    /// manifest update should record.
+    ///
+    /// * When the resolved target has no branch, the dataset head is opened,
+    ///   its version is checked against the manifest entry, and the returned
+    ///   table branch is `None`.
+    /// * Otherwise the dataset is obtained through
+    ///   `open_owned_dataset_for_branch_write`, which supplies the table
+    ///   branch.
+    ///
+    /// Fails with a manifest error when the snapshot has no entry for
+    /// `table_key`.
+    pub(crate) async fn open_for_mutation_on_branch(
+        &self,
+        branch: Option<&str>,
+        table_key: &str,
+    ) -> Result<(Dataset, String, Option<String>)> {
+        let resolved = self.resolved_branch_target(branch).await?;
+        let entry = resolved
+            .snapshot
+            .entry(table_key)
+            .ok_or_else(|| OmniError::manifest(format!("no manifest entry for {}", table_key)))?;
+        let full_path = format!("{}/{}", self.root_uri, entry.table_path);
+        match resolved.branch.as_deref() {
+            None => {
+                let ds = self
+                    .table_store
+                    .open_dataset_head_for_write(table_key, &full_path, None)
+                    .await?;
+                // Version-drift guard: the head must still be at the version
+                // pinned by the manifest entry.
+                self.table_store
+                    .ensure_expected_version(&ds, table_key, entry.table_version)?;
+                Ok((ds, full_path, None))
+            }
+            Some(active_branch) => {
+                let (ds, table_branch) = self
+                    .open_owned_dataset_for_branch_write(
+                        table_key,
+                        &full_path,
+                        entry.table_branch.as_deref(),
+                        entry.table_version,
+                        active_branch,
+                    )
+                    .await?;
+                Ok((ds, full_path, table_branch))
+            }
+        }
+    }
+
+    /// Open the dataset that should receive a branch-local metadata or data
+    /// write, forking it from the manifest-pinned source state when the active
+    /// branch does not yet own the subtable.
+ pub(crate) async fn open_owned_dataset_for_branch_write( + &self, + table_key: &str, + full_path: &str, + entry_branch: Option<&str>, + entry_version: u64, + active_branch: &str, + ) -> Result<(Dataset, Option)> { + match entry_branch { + Some(branch) if branch == active_branch => { + let ds = self + .table_store + .open_dataset_head_for_write(table_key, full_path, Some(active_branch)) + .await?; + self.table_store + .ensure_expected_version(&ds, table_key, entry_version)?; + Ok((ds, Some(active_branch.to_string()))) + } + source_branch => { + self.fork_dataset_from_entry_state( + table_key, + full_path, + source_branch, + entry_version, + active_branch, + ) + .await?; + let ds = self + .table_store + .open_dataset_head_for_write(table_key, full_path, Some(active_branch)) + .await?; + self.table_store + .ensure_expected_version(&ds, table_key, entry_version)?; + Ok((ds, Some(active_branch.to_string()))) + } + } + } + + pub(crate) async fn fork_dataset_from_entry_state( + &self, + table_key: &str, + full_path: &str, + source_branch: Option<&str>, + source_version: u64, + active_branch: &str, + ) -> Result { + let ds = self + .table_store + .fork_branch_from_state( + full_path, + source_branch, + table_key, + source_version, + active_branch, + ) + .await?; + Ok(ds) + } + + pub(crate) async fn reopen_for_mutation( + &self, + table_key: &str, + full_path: &str, + table_branch: Option<&str>, + expected_version: u64, + ) -> Result { + self.table_store + .reopen_for_mutation(full_path, table_branch, table_key, expected_version) + .await + } + + pub(crate) async fn open_dataset_at_state( + &self, + table_path: &str, + table_branch: Option<&str>, + table_version: u64, + ) -> Result { + self.table_store + .open_dataset_at_state(table_path, table_branch, table_version) + .await + } + + pub(crate) async fn build_indices_on_dataset( + &self, + table_key: &str, + ds: &mut Dataset, + ) -> Result<()> { + if let Some(type_name) = table_key.strip_prefix("node:") { + if 
!self.table_store.has_btree_index(ds, "id").await? { + self.table_store + .create_btree_index(ds, &["id"]) + .await + .map_err(|e| { + OmniError::Lance(format!("create BTree index on {}(id): {}", table_key, e)) + })?; + } + + if let Some(node_type) = self.catalog.node_types.get(type_name) { + for index_cols in &node_type.indices { + if index_cols.len() != 1 { + continue; + } + let prop_name = &index_cols[0]; + if let Some(prop_type) = node_type.properties.get(prop_name) { + if matches!(prop_type.scalar, ScalarType::String) && !prop_type.list { + if !self.table_store.has_fts_index(ds, prop_name).await? { + self.table_store + .create_inverted_index(ds, prop_name.as_str()) + .await + .map_err(|e| { + OmniError::Lance(format!( + "create Inverted index on {}({}): {}", + table_key, prop_name, e + )) + })?; + } + } else if matches!(prop_type.scalar, ScalarType::Vector(_)) + && !prop_type.list + { + if !self.table_store.has_vector_index(ds, prop_name).await? { + self.table_store + .create_vector_index(ds, prop_name.as_str()) + .await + .map_err(|e| { + OmniError::Lance(format!( + "create Vector index on {}({}): {}", + table_key, prop_name, e + )) + })?; + } + } + } + } + } + return Ok(()); + } + + if table_key.starts_with("edge:") { + if !self.table_store.has_btree_index(ds, "id").await? { + self.table_store + .create_btree_index(ds, &["id"]) + .await + .map_err(|e| { + OmniError::Lance(format!("create BTree index on {}(id): {}", table_key, e)) + })?; + } + if !self.table_store.has_btree_index(ds, "src").await? { + self.table_store + .create_btree_index(ds, &["src"]) + .await + .map_err(|e| { + OmniError::Lance(format!("create BTree index on {}(src): {}", table_key, e)) + })?; + } + if !self.table_store.has_btree_index(ds, "dst").await? 
{ + self.table_store + .create_btree_index(ds, &["dst"]) + .await + .map_err(|e| { + OmniError::Lance(format!("create BTree index on {}(dst): {}", table_key, e)) + })?; + } + return Ok(()); + } + + Err(OmniError::manifest(format!( + "invalid table key '{}'", + table_key + ))) + } + + async fn prepare_updates_for_commit( + &self, + branch: Option<&str>, + updates: &[crate::db::SubTableUpdate], + ) -> Result> { + if updates.is_empty() { + return Ok(Vec::new()); + } + + let snapshot = self.snapshot_for_branch(branch).await?; + let mut prepared = Vec::with_capacity(updates.len()); + + for update in updates { + let Some(entry) = snapshot.entry(&update.table_key) else { + return Err(OmniError::manifest(format!( + "no manifest entry for {}", + update.table_key + ))); + }; + + let mut prepared_update = update.clone(); + if prepared_update.row_count > 0 { + let full_path = format!("{}/{}", self.root_uri, entry.table_path); + let mut ds = self + .reopen_for_mutation( + &prepared_update.table_key, + &full_path, + prepared_update.table_branch.as_deref(), + prepared_update.table_version, + ) + .await?; + self.build_indices_on_dataset(&prepared_update.table_key, &mut ds) + .await?; + let state = self.table_store.table_state(&full_path, &ds).await?; + prepared_update.table_version = state.version; + prepared_update.row_count = state.row_count; + prepared_update.version_metadata = state.version_metadata; + } + + prepared.push(prepared_update); + } + + Ok(prepared) + } + + async fn commit_prepared_updates( + &mut self, + updates: &[crate::db::SubTableUpdate], + ) -> Result { + let actor_id = self.current_audit_actor().map(str::to_string); + let PublishedSnapshot { + manifest_version, + _snapshot_id: _, + } = self + .coordinator + .commit_updates_with_actor(updates, actor_id.as_deref()) + .await?; + Ok(manifest_version) + } + + async fn commit_prepared_updates_on_branch( + &mut self, + branch: Option<&str>, + updates: &[crate::db::SubTableUpdate], + ) -> Result { + let 
current_branch = self.coordinator.current_branch().map(str::to_string); + let requested_branch = branch.map(str::to_string); + if requested_branch == current_branch { + return self.commit_prepared_updates(updates).await; + } + + let mut coordinator = match requested_branch.as_deref() { + Some(branch) => { + GraphCoordinator::open_branch(self.uri(), branch, Arc::clone(&self.storage)).await? + } + None => GraphCoordinator::open(self.uri(), Arc::clone(&self.storage)).await?, + }; + let actor_id = self.current_audit_actor().map(str::to_string); + let PublishedSnapshot { + manifest_version, + _snapshot_id: _, + } = coordinator + .commit_updates_with_actor(updates, actor_id.as_deref()) + .await?; + Ok(manifest_version) + } + + pub(crate) async fn commit_updates( + &mut self, + updates: &[crate::db::SubTableUpdate], + ) -> Result { + let current_branch = self.coordinator.current_branch().map(str::to_string); + let prepared = self + .prepare_updates_for_commit(current_branch.as_deref(), updates) + .await?; + self.commit_prepared_updates(&prepared).await + } + + pub(crate) async fn commit_manifest_updates( + &mut self, + updates: &[crate::db::SubTableUpdate], + ) -> Result { + self.coordinator.commit_manifest_updates(updates).await + } + + pub(crate) async fn record_merge_commit( + &mut self, + manifest_version: u64, + parent_commit_id: &str, + merged_parent_commit_id: &str, + ) -> Result { + let actor_id = self.current_audit_actor().map(str::to_string); + self.coordinator + .record_merge_commit( + manifest_version, + parent_commit_id, + merged_parent_commit_id, + actor_id.as_deref(), + ) + .await + .map(|snapshot_id| snapshot_id.as_str().to_string()) + } + + pub(crate) async fn commit_updates_on_branch( + &mut self, + branch: Option<&str>, + updates: &[crate::db::SubTableUpdate], + ) -> Result { + let prepared = self.prepare_updates_for_commit(branch, updates).await?; + self.commit_prepared_updates_on_branch(branch, &prepared) + .await + } + + pub(crate) async fn 
ensure_commit_graph_initialized(&mut self) -> Result<()> { + self.coordinator.ensure_commit_graph_initialized().await + } + + /// Invalidate the cached graph index. Called after edge mutations. + pub(crate) async fn invalidate_graph_index(&self) { + self.runtime_cache.invalidate_all().await; + } + + async fn batch_for_table_rewrite( + &self, + source_ds: &Dataset, + table_key: &str, + ) -> Result { + let target_schema = schema_for_table_key(self.catalog(), table_key)?; + let blob_properties = blob_properties_for_table_key(self.catalog(), table_key)?; + if blob_properties.is_empty() { + let batches = self.table_store().scan_batches(source_ds).await?; + return concat_or_empty_batches(target_schema, batches); + } + + let batches = self + .table_store() + .scan_with(source_ds, None, None, None, true, |_| Ok(())) + .await?; + let batch = concat_or_empty_batches(target_schema.clone(), batches)?; + if batch.num_rows() == 0 { + return Ok(batch); + } + + let row_ids = batch + .column_by_name("_rowid") + .and_then(|col| col.as_any().downcast_ref::()) + .ok_or_else(|| { + OmniError::Lance(format!( + "expected _rowid column when rewriting '{}'", + table_key + )) + })?; + let row_ids: Vec = row_ids.values().iter().copied().collect(); + + let mut columns = Vec::with_capacity(target_schema.fields().len()); + for field in target_schema.fields() { + if blob_properties.contains(field.name()) { + let descriptions = batch + .column_by_name(field.name()) + .and_then(|col| col.as_any().downcast_ref::()) + .ok_or_else(|| { + OmniError::Lance(format!( + "expected blob descriptions for '{}.{}'", + table_key, + field.name() + )) + })?; + columns.push( + self.rebuild_blob_column(source_ds, field.name(), descriptions, &row_ids) + .await?, + ); + } else { + columns.push(batch.column_by_name(field.name()).cloned().ok_or_else(|| { + OmniError::Lance(format!( + "missing column '{}.{}' in rewrite batch", + table_key, + field.name() + )) + })?); + } + } + + RecordBatch::try_new(target_schema, 
columns).map_err(|e| OmniError::Lance(e.to_string())) + } + + async fn rebuild_blob_column( + &self, + source_ds: &Dataset, + column_name: &str, + descriptions: &StructArray, + row_ids: &[u64], + ) -> Result> { + let mut builder = BlobArrayBuilder::new(row_ids.len()); + let mut non_null_row_ids = Vec::new(); + let mut row_has_blob = Vec::with_capacity(row_ids.len()); + + for row in 0..row_ids.len() { + let is_null = blob_description_is_null(descriptions, row)?; + row_has_blob.push(!is_null); + if !is_null { + non_null_row_ids.push(row_ids[row]); + } + } + + let blob_files = if non_null_row_ids.is_empty() { + Vec::new() + } else { + Arc::new(source_ds.clone()) + .take_blobs(&non_null_row_ids, column_name) + .await + .map_err(|e| OmniError::Lance(e.to_string()))? + }; + + let mut files = blob_files.into_iter(); + for has_blob in row_has_blob { + if !has_blob { + builder + .push_null() + .map_err(|e| OmniError::Lance(e.to_string()))?; + continue; + } + + let blob = files.next().ok_or_else(|| { + OmniError::Lance(format!( + "blob rewrite for '{}' lost alignment with source rows", + column_name + )) + })?; + if let Some(uri) = blob.uri() { + builder + .push_uri(uri) + .map_err(|e| OmniError::Lance(e.to_string()))?; + } else { + builder + .push_bytes( + blob.read() + .await + .map_err(|e| OmniError::Lance(e.to_string()))?, + ) + .map_err(|e| OmniError::Lance(e.to_string()))?; + } + } + + if files.next().is_some() { + return Err(OmniError::Lance(format!( + "blob rewrite for '{}' produced extra source blobs", + column_name + ))); + } + + builder + .finish() + .map_err(|e| OmniError::Lance(e.to_string())) + } + + async fn export_blob_values( + &self, + source_ds: &Dataset, + batch: &RecordBatch, + row_ids: &[u64], + blob_properties: &std::collections::HashSet, + ) -> Result>>> { + let mut values = HashMap::with_capacity(blob_properties.len()); + for property in blob_properties { + let descriptions = batch + .column_by_name(property) + .and_then(|col| 
col.as_any().downcast_ref::()) + .ok_or_else(|| { + OmniError::Lance(format!( + "expected blob descriptions for export column '{}'", + property + )) + })?; + values.insert( + property.clone(), + export_blob_column_values(source_ds, property, descriptions, row_ids).await?, + ); + } + Ok(values) + } + + async fn export_rows_from_batch( + &self, + table_key: &str, + batch: &RecordBatch, + blob_values: Option<&HashMap>>>, + ) -> Result> { + if let Some(type_name) = table_key.strip_prefix("node:") { + let node_type = + self.catalog.node_types.get(type_name).ok_or_else(|| { + OmniError::manifest(format!("unknown node type '{}'", type_name)) + })?; + let mut rows = Vec::with_capacity(batch.num_rows()); + for row in 0..batch.num_rows() { + let mut data = serde_json::Map::new(); + data.insert( + "id".to_string(), + json_value_from_named_column(batch, "id", row)?, + ); + for field in node_type.arrow_schema.fields().iter().skip(1) { + data.insert( + field.name().clone(), + export_value_for_field( + batch, + field.name(), + row, + blob_values.and_then(|values| values.get(field.name())), + )?, + ); + } + rows.push(serde_json::json!({ + "type": type_name, + "data": serde_json::Value::Object(data), + })); + } + return Ok(rows); + } + + if let Some(edge_name) = table_key.strip_prefix("edge:") { + let edge_type = + self.catalog.edge_types.get(edge_name).ok_or_else(|| { + OmniError::manifest(format!("unknown edge type '{}'", edge_name)) + })?; + let mut rows = Vec::with_capacity(batch.num_rows()); + for row in 0..batch.num_rows() { + let from = named_string_value(batch, "src", row)?; + let to = named_string_value(batch, "dst", row)?; + let mut data = serde_json::Map::new(); + data.insert( + "id".to_string(), + json_value_from_named_column(batch, "id", row)?, + ); + for field in edge_type.arrow_schema.fields().iter().skip(3) { + data.insert( + field.name().clone(), + export_value_for_field( + batch, + field.name(), + row, + blob_values.and_then(|values| values.get(field.name())), + 
)?, + ); + } + rows.push(serde_json::json!({ + "edge": edge_name, + "from": from, + "to": to, + "data": serde_json::Value::Object(data), + })); + } + return Ok(rows); + } + + Err(OmniError::manifest(format!( + "invalid export table key '{}'", + table_key + ))) + } +} + +async fn export_blob_column_values( + source_ds: &Dataset, + column_name: &str, + descriptions: &StructArray, + row_ids: &[u64], +) -> Result>> { + let mut non_null_row_ids = Vec::new(); + let mut non_null_positions = Vec::new(); + let mut values = vec![None; row_ids.len()]; + + for (row, row_id) in row_ids.iter().enumerate() { + if blob_description_is_null(descriptions, row)? { + continue; + } + non_null_row_ids.push(*row_id); + non_null_positions.push(row); + } + + if non_null_row_ids.is_empty() { + return Ok(values); + } + + // Sort row IDs before calling take_blobs — Lance 4's unsorted path has + // a bug that duplicates the _rowaddr column in the returned batch. + let mut perm: Vec = (0..non_null_row_ids.len()).collect(); + perm.sort_by_key(|&i| non_null_row_ids[i]); + let sorted_ids: Vec = perm.iter().map(|&i| non_null_row_ids[i]).collect(); + + let sorted_blobs = Arc::new(source_ds.clone()) + .take_blobs(&sorted_ids, column_name) + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + + if sorted_blobs.len() != non_null_positions.len() { + return Err(OmniError::Lance(format!( + "blob export for '{}' lost alignment with selected rows", + column_name + ))); + } + + // Restore original order via inverse permutation. Build an index that + // maps each original position to the sorted position so we can iterate + // non_null_positions in order and pick the right blob. 
+    let mut inverse_perm = vec![0usize; perm.len()];
+    for (sorted_pos, &orig_pos) in perm.iter().enumerate() {
+        inverse_perm[orig_pos] = sorted_pos;
+    }
+
+    for (idx, position) in non_null_positions.into_iter().enumerate() {
+        let blob = &sorted_blobs[inverse_perm[idx]];
+        let value = if let Some(uri) = blob.uri() {
+            uri.to_string()
+        } else {
+            let bytes = blob
+                .read()
+                .await
+                .map_err(|e| OmniError::Lance(e.to_string()))?;
+            format!(
+                "base64:{}",
+                base64::Engine::encode(&base64::engine::general_purpose::STANDARD, bytes)
+            )
+        };
+        values[position] = Some(value);
+    }
+
+    Ok(values)
+}
+
+/// Resolves the JSON export value of `field_name` at `row`.
+///
+/// When `blob_values` is supplied the field is treated as a blob column: the
+/// pre-exported string for `row` is returned, or JSON null when that entry is
+/// missing or `None`. Without `blob_values` the value is read from `batch`.
+fn export_value_for_field(
+    batch: &RecordBatch,
+    field_name: &str,
+    row: usize,
+    blob_values: Option<&Vec<Option<String>>>,
+) -> Result<serde_json::Value> {
+    let Some(exported) = blob_values else {
+        return json_value_from_named_column(batch, field_name, row);
+    };
+    Ok(match exported.get(row) {
+        Some(Some(text)) => serde_json::Value::String(text.clone()),
+        _ => serde_json::Value::Null,
+    })
+}
+
+/// Converts the value of column `field_name` at `row` to JSON.
+///
+/// Fails with `OmniError::Lance` when `batch` has no column of that name.
+fn json_value_from_named_column(
+    batch: &RecordBatch,
+    field_name: &str,
+    row: usize,
+) -> Result<serde_json::Value> {
+    match batch.column_by_name(field_name) {
+        Some(column) => json_value_from_array(column.as_ref(), row),
+        None => Err(OmniError::Lance(format!(
+            "missing column '{}' in export batch",
+            field_name
+        ))),
+    }
+}
+
+/// Reads a required string from column `field_name` at `row`.
+///
+/// Fails with `OmniError::Lance` when the column is missing, is not a Utf8
+/// string array, or holds null at `row`.
+fn named_string_value(batch: &RecordBatch, field_name: &str, row: usize) -> Result<String> {
+    let Some(column) = batch.column_by_name(field_name) else {
+        return Err(OmniError::Lance(format!(
+            "missing column '{}' in export batch",
+            field_name
+        )));
+    };
+    let Some(strings) = column.as_any().downcast_ref::<StringArray>() else {
+        return Err(OmniError::Lance(format!(
+            "expected Utf8 column '{}'",
+            field_name
+        )));
+    };
+    if strings.is_null(row) {
+        return Err(OmniError::Lance(format!(
+            "unexpected null in export column '{}'",
+            field_name
+        )));
+    }
+    Ok(strings.value(row).to_string())
+}
+
+/// Normalizes a branch name: surrounding whitespace is trimmed, `"main"` maps
+/// to `None`, and any other non-empty name maps to `Some(name)`.
+///
+/// An empty (or whitespace-only) name is rejected with a manifest error.
+pub(crate) fn normalize_branch_name(branch: &str) -> Result<Option<String>> {
+    match branch.trim() {
+        "" => Err(OmniError::manifest(
+            "branch name cannot be empty".to_string(),
+        )),
+        "main" => Ok(None),
+        other => Ok(Some(other.to_string())),
+    }
+}
+
+/// Rejects `branch` when it is an internal run ref; `operation` only labels
+/// the error message.
+fn ensure_public_branch_ref(branch: &str, operation: &str) -> Result<()> {
+    if !is_internal_run_branch(branch) {
+        return Ok(());
+    }
+    Err(OmniError::manifest(format!(
+        "{} does not allow internal run ref '{}'",
+        operation, branch
+    )))
+}
+
+/// Returns true when both entries are absent, or both are present and agree
+/// on table path, version, branch and row count.
+fn same_manifest_state(
+    left: Option<&crate::db::SubTableEntry>,
+    right: Option<&crate::db::SubTableEntry>,
+) -> bool {
+    let (Some(left), Some(right)) = (left, right) else {
+        // At least one side is missing: equal only if both are.
+        return left.is_none() && right.is_none();
+    };
+    left.table_path == right.table_path
+        && left.table_version == right.table_version
+        && left.table_branch == right.table_branch
+        && left.row_count == right.row_count
+}
+
+/// Collapses `batches` into one batch.
+///
+/// No batches yields an empty batch with `schema`; a single batch is returned
+/// as-is; several batches are concatenated using the schema of the first one.
+fn concat_or_empty_batches(
+    schema: Arc<Schema>,
+    mut batches: Vec<RecordBatch>,
+) -> Result<RecordBatch> {
+    match batches.len() {
+        0 => Ok(RecordBatch::new_empty(schema)),
+        // Move the only batch out without an `unwrap()`.
+        1 => Ok(batches.swap_remove(0)),
+        _ => {
+            let batch_schema = batches[0].schema();
+            arrow_select::concat::concat_batches(&batch_schema, &batches)
+                .map_err(|e| OmniError::Lance(e.to_string()))
+        }
+    }
+}
+
+/// Looks up the blob property names of the type addressed by `table_key`
+/// (`"node:<type>"` or `"edge:<type>"`).
+///
+/// Unknown types and keys with any other prefix yield a manifest error.
+fn blob_properties_for_table_key<'a>(
+    catalog: &'a Catalog,
+    table_key: &str,
+) -> Result<&'a std::collections::HashSet<String>> {
+    if let Some(type_name) = table_key.strip_prefix("node:") {
+        return match catalog.node_types.get(type_name) {
+            Some(node_type) => Ok(&node_type.blob_properties),
+            None => Err(OmniError::manifest(format!(
+                "unknown node type '{}'",
+                type_name
+            ))),
+        };
+    }
+    if let Some(type_name) = table_key.strip_prefix("edge:") {
+        return match catalog.edge_types.get(type_name) {
+            Some(edge_type) => Ok(&edge_type.blob_properties),
+            None => Err(OmniError::manifest(format!(
+                "unknown edge type '{}'",
+                type_name
+            ))),
+        };
+    }
+    Err(OmniError::manifest(format!(
+        "invalid table key '{}'",
+        table_key
+    )))
+}
+
+fn blob_description_is_null(descriptions: &StructArray, row: usize) -> Result<bool> {
+    if descriptions.is_null(row) {
+        return
Ok(true); + } + + let kind = descriptions + .column_by_name("kind") + .and_then(|col| col.as_any().downcast_ref::()) + .and_then(|arr| (!arr.is_null(row)).then(|| arr.value(row) as u8)) + .or_else(|| { + descriptions + .column_by_name("kind") + .and_then(|col| col.as_any().downcast_ref::()) + .and_then(|arr| (!arr.is_null(row)).then(|| arr.value(row))) + }); + let position = descriptions + .column_by_name("position") + .and_then(|col| col.as_any().downcast_ref::()) + .and_then(|arr| (!arr.is_null(row)).then(|| arr.value(row))); + let size = descriptions + .column_by_name("size") + .and_then(|col| col.as_any().downcast_ref::()) + .and_then(|arr| (!arr.is_null(row)).then(|| arr.value(row))); + let blob_uri = descriptions + .column_by_name("blob_uri") + .and_then(|col| col.as_any().downcast_ref::()) + .and_then(|arr| (!arr.is_null(row)).then(|| arr.value(row))); + + let Some(kind) = kind else { + return Ok(true); + }; + let kind = BlobKind::try_from(kind).map_err(|e| OmniError::Lance(e.to_string()))?; + if kind != BlobKind::Inline { + return Ok(false); + } + + Ok(position.unwrap_or(0) == 0 && size.unwrap_or(0) == 0 && blob_uri.unwrap_or("").is_empty()) +} + +/// Replace placeholder `LargeBinary` fields with Lance blob v2 fields. +/// +/// The compiler crate has no Lance dependency, so `ScalarType::Blob` maps to +/// `DataType::LargeBinary` as a placeholder. This function replaces those +/// fields with the real blob v2 struct type via `lance::blob::blob_field()`. 
+fn fixup_blob_schemas(catalog: &mut Catalog) {
+    // Types without blob properties keep their schema untouched.
+    for node_type in catalog.node_types.values_mut() {
+        if !node_type.blob_properties.is_empty() {
+            node_type.arrow_schema =
+                schema_with_blob_fields(&node_type.arrow_schema, &node_type.blob_properties);
+        }
+    }
+    for edge_type in catalog.edge_types.values_mut() {
+        if !edge_type.blob_properties.is_empty() {
+            edge_type.arrow_schema =
+                schema_with_blob_fields(&edge_type.arrow_schema, &edge_type.blob_properties);
+        }
+    }
+}
+
+/// Returns a copy of `schema` in which every field named in
+/// `blob_properties` is replaced by `blob_field(name, nullable)`; all other
+/// fields are cloned unchanged and field order is preserved.
+fn schema_with_blob_fields(
+    schema: &Schema,
+    blob_properties: &std::collections::HashSet<String>,
+) -> Arc<Schema> {
+    let fields: Vec<Field> = schema
+        .fields()
+        .iter()
+        .map(|f| {
+            if blob_properties.contains(f.name()) {
+                blob_field(f.name(), f.is_nullable())
+            } else {
+                f.as_ref().clone()
+            }
+        })
+        .collect();
+    // NOTE(review): `Schema::new` builds the schema from the fields only, as
+    // the original code did; schema-level metadata is not carried over.
+    Arc::new(Schema::new(fields))
+}
+
+/// Parses `schema_source` and builds its schema IR. Failures from
+/// `build_schema_ir` are reported as manifest errors.
+// NOTE(review): the type argument of this `Result` was lost when the patch
+// text was extracted and cannot be recovered from the visible code; restore
+// it from the original source.
+fn read_schema_ir_from_source(schema_source: &str) -> Result {
+    let schema_ast = parse_schema(schema_source)?;
+    build_schema_ir(&schema_ast).map_err(|err| OmniError::manifest(err.to_string()))
+}
+
+/// Returns the Arrow schema of the type addressed by `table_key`
+/// (`"node:<type>"` or `"edge:<type>"`).
+///
+/// Unknown types and keys with any other prefix yield a manifest error.
+fn schema_for_table_key(catalog: &Catalog, table_key: &str) -> Result<Arc<Schema>> {
+    if let Some(type_name) = table_key.strip_prefix("node:") {
+        let node_type: &NodeType = match catalog.node_types.get(type_name) {
+            Some(found) => found,
+            None => {
+                return Err(OmniError::manifest(format!(
+                    "unknown node type '{}'",
+                    type_name
+                )));
+            }
+        };
+        return Ok(node_type.arrow_schema.clone());
+    }
+    if let Some(type_name) = table_key.strip_prefix("edge:") {
+        let edge_type: &EdgeType = match catalog.edge_types.get(type_name) {
+            Some(found) => found,
+            None => {
+                return Err(OmniError::manifest(format!(
+                    "unknown edge type '{}'",
+                    type_name
+                )));
+            }
+        };
+        return Ok(edge_type.arrow_schema.clone());
+    }
+    Err(OmniError::manifest(format!(
+        "invalid table key '{}'",
+        table_key
+    )))
+}
+
+fn record_batch_row_to_json(batch: &RecordBatch, row: usize) -> Result<serde_json::Value> {
+    let mut obj = serde_json::Map::new();
+    for (i, field) in batch.schema().fields().iter().enumerate() {
+        obj.insert(
+            field.name().clone(),
json_value_from_array(batch.column(i).as_ref(), row)?, + ); + } + Ok(serde_json::Value::Object(obj)) +} + +fn json_value_from_array(array: &dyn Array, row: usize) -> Result { + if array.is_null(row) { + return Ok(serde_json::Value::Null); + } + + match array.data_type() { + DataType::Utf8 => Ok(serde_json::Value::String( + array + .as_any() + .downcast_ref::() + .ok_or_else(|| OmniError::Lance("expected StringArray".to_string()))? + .value(row) + .to_string(), + )), + DataType::LargeUtf8 => Ok(serde_json::Value::String( + array + .as_any() + .downcast_ref::() + .ok_or_else(|| OmniError::Lance("expected LargeStringArray".to_string()))? + .value(row) + .to_string(), + )), + DataType::Boolean => Ok(serde_json::Value::Bool( + array + .as_any() + .downcast_ref::() + .ok_or_else(|| OmniError::Lance("expected BooleanArray".to_string()))? + .value(row), + )), + DataType::Int32 => Ok(serde_json::Value::Number(serde_json::Number::from( + array + .as_any() + .downcast_ref::() + .ok_or_else(|| OmniError::Lance("expected Int32Array".to_string()))? + .value(row), + ))), + DataType::Int64 => Ok(serde_json::Value::Number(serde_json::Number::from( + array + .as_any() + .downcast_ref::() + .ok_or_else(|| OmniError::Lance("expected Int64Array".to_string()))? + .value(row), + ))), + DataType::UInt32 => Ok(serde_json::Value::Number(serde_json::Number::from( + array + .as_any() + .downcast_ref::() + .ok_or_else(|| OmniError::Lance("expected UInt32Array".to_string()))? + .value(row), + ))), + DataType::UInt64 => Ok(serde_json::Value::Number(serde_json::Number::from( + array + .as_any() + .downcast_ref::() + .ok_or_else(|| OmniError::Lance("expected UInt64Array".to_string()))? + .value(row), + ))), + DataType::Float32 => { + let value = array + .as_any() + .downcast_ref::() + .ok_or_else(|| OmniError::Lance("expected Float32Array".to_string()))? 
+ .value(row) as f64; + Ok(serde_json::Value::Number( + serde_json::Number::from_f64(value).ok_or_else(|| { + OmniError::Lance(format!("cannot encode f32 value '{}' as JSON", value)) + })?, + )) + } + DataType::Float64 => { + let value = array + .as_any() + .downcast_ref::() + .ok_or_else(|| OmniError::Lance("expected Float64Array".to_string()))? + .value(row); + Ok(serde_json::Value::Number( + serde_json::Number::from_f64(value).ok_or_else(|| { + OmniError::Lance(format!("cannot encode f64 value '{}' as JSON", value)) + })?, + )) + } + DataType::Date32 => Ok(serde_json::Value::Number(serde_json::Number::from( + array + .as_any() + .downcast_ref::() + .ok_or_else(|| OmniError::Lance("expected Date32Array".to_string()))? + .value(row), + ))), + DataType::Binary => Ok(serde_json::Value::String(base64::Engine::encode( + &base64::engine::general_purpose::STANDARD, + array + .as_any() + .downcast_ref::() + .ok_or_else(|| OmniError::Lance("expected BinaryArray".to_string()))? + .value(row), + ))), + DataType::LargeBinary => Ok(serde_json::Value::String(base64::Engine::encode( + &base64::engine::general_purpose::STANDARD, + array + .as_any() + .downcast_ref::() + .ok_or_else(|| OmniError::Lance("expected LargeBinaryArray".to_string()))? 
+ .value(row), + ))), + DataType::List(_) => { + let list = array + .as_any() + .downcast_ref::() + .ok_or_else(|| OmniError::Lance("expected ListArray".to_string()))?; + let values = list.value(row); + let mut out = Vec::with_capacity(values.len()); + for idx in 0..values.len() { + out.push(json_value_from_array(values.as_ref(), idx)?); + } + Ok(serde_json::Value::Array(out)) + } + DataType::LargeList(_) => { + let list = array + .as_any() + .downcast_ref::() + .ok_or_else(|| OmniError::Lance("expected LargeListArray".to_string()))?; + let values = list.value(row); + let mut out = Vec::with_capacity(values.len()); + for idx in 0..values.len() { + out.push(json_value_from_array(values.as_ref(), idx)?); + } + Ok(serde_json::Value::Array(out)) + } + DataType::FixedSizeList(_, _) => { + let list = array + .as_any() + .downcast_ref::() + .ok_or_else(|| OmniError::Lance("expected FixedSizeListArray".to_string()))?; + let values = list.value(row); + let mut out = Vec::with_capacity(values.len()); + for idx in 0..values.len() { + out.push(json_value_from_array(values.as_ref(), idx)?); + } + Ok(serde_json::Value::Array(out)) + } + DataType::Struct(fields) => { + let struct_array = array + .as_any() + .downcast_ref::() + .ok_or_else(|| OmniError::Lance("expected StructArray".to_string()))?; + let mut obj = serde_json::Map::new(); + for (field_idx, field) in fields.iter().enumerate() { + obj.insert( + field.name().clone(), + json_value_from_array(struct_array.column(field_idx).as_ref(), row)?, + ); + } + Ok(serde_json::Value::Object(obj)) + } + _ => { + let value = arrow_cast::display::array_value_to_string(array, row) + .map_err(|e| OmniError::Lance(e.to_string()))?; + Ok(serde_json::Value::String(value)) + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + use async_trait::async_trait; + use omnigraph_compiler::{SchemaMigrationStep, SchemaTypeKind}; + use std::fs; + use std::sync::Mutex; + + use crate::storage::{LocalStorageAdapter, StorageAdapter, join_uri}; + + 
const TEST_SCHEMA: &str = r#" +node Person { + name: String @key + age: I32? +} +node Company { + name: String @key +} +edge Knows: Person -> Person { + since: Date? +} +edge WorksAt: Person -> Company +"#; + + #[derive(Debug, Default)] + struct RecordingStorageAdapter { + inner: LocalStorageAdapter, + reads: Mutex>, + writes: Mutex>, + exists_checks: Mutex>, + } + + impl RecordingStorageAdapter { + fn reads(&self) -> Vec { + self.reads.lock().unwrap().clone() + } + + fn writes(&self) -> Vec { + self.writes.lock().unwrap().clone() + } + + fn exists_checks(&self) -> Vec { + self.exists_checks.lock().unwrap().clone() + } + } + + #[async_trait] + impl StorageAdapter for RecordingStorageAdapter { + async fn read_text(&self, uri: &str) -> Result { + self.reads.lock().unwrap().push(uri.to_string()); + self.inner.read_text(uri).await + } + + async fn write_text(&self, uri: &str, contents: &str) -> Result<()> { + self.writes.lock().unwrap().push(uri.to_string()); + self.inner.write_text(uri, contents).await + } + + async fn exists(&self, uri: &str) -> Result { + self.exists_checks.lock().unwrap().push(uri.to_string()); + self.inner.exists(uri).await + } + } + + #[tokio::test] + async fn test_init_creates_repo() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + + let db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap(); + + // Schema file written + assert!(dir.path().join("_schema.pg").exists()); + assert!(dir.path().join("_schema.ir.json").exists()); + assert!(dir.path().join("__schema_state.json").exists()); + + // Manifest created with correct entries + let snap = db.snapshot(); + assert!(snap.entry("node:Person").is_some()); + assert!(snap.entry("node:Company").is_some()); + assert!(snap.entry("edge:Knows").is_some()); + assert!(snap.entry("edge:WorksAt").is_some()); + + // Catalog is correct + assert_eq!(db.catalog().node_types.len(), 2); + assert_eq!(db.catalog().edge_types.len(), 2); + assert_eq!( + 
db.catalog().node_types["Person"].key_property(), + Some("name") + ); + } + + #[tokio::test] + async fn test_open_reads_existing_repo() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + + Omnigraph::init(uri, TEST_SCHEMA).await.unwrap(); + + // Re-open + let db = Omnigraph::open(uri).await.unwrap(); + assert_eq!(db.catalog().node_types.len(), 2); + assert_eq!(db.catalog().edge_types.len(), 2); + let snap = db.snapshot(); + assert!(snap.entry("node:Person").is_some()); + assert!(snap.entry("edge:Knows").is_some()); + } + + #[tokio::test] + async fn test_init_and_open_route_graph_metadata_through_storage_adapter() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let adapter = Arc::new(RecordingStorageAdapter::default()); + + Omnigraph::init_with_storage(uri, TEST_SCHEMA, adapter.clone()) + .await + .unwrap(); + assert!(adapter.writes().contains(&join_uri(uri, "_schema.pg"))); + assert!(adapter.writes().contains(&join_uri(uri, "_schema.ir.json"))); + assert!( + adapter + .writes() + .contains(&join_uri(uri, "__schema_state.json")) + ); + + Omnigraph::open_with_storage(uri, adapter.clone()) + .await + .unwrap(); + assert!(adapter.reads().contains(&join_uri(uri, "_schema.pg"))); + assert!(adapter.reads().contains(&join_uri(uri, "_schema.ir.json"))); + assert!( + adapter + .reads() + .contains(&join_uri(uri, "__schema_state.json")) + ); + assert!( + adapter + .exists_checks() + .contains(&join_uri(uri, "_schema.ir.json")) + ); + assert!( + adapter + .exists_checks() + .contains(&join_uri(uri, "__schema_state.json")) + ); + assert!( + adapter + .exists_checks() + .contains(&join_uri(uri, "_graph_commits.lance")) + ); + } + + #[tokio::test] + async fn test_open_bootstraps_legacy_schema_state_for_main_only_repo() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + Omnigraph::init(uri, TEST_SCHEMA).await.unwrap(); + + 
fs::remove_file(dir.path().join("_schema.ir.json")).unwrap(); + fs::remove_file(dir.path().join("__schema_state.json")).unwrap(); + + let db = Omnigraph::open(uri).await.unwrap(); + assert_eq!(db.catalog().node_types.len(), 2); + assert!(dir.path().join("_schema.ir.json").exists()); + assert!(dir.path().join("__schema_state.json").exists()); + } + + #[tokio::test] + async fn test_open_rejects_legacy_repo_with_public_branch() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap(); + db.branch_create("feature").await.unwrap(); + + fs::remove_file(dir.path().join("_schema.ir.json")).unwrap(); + fs::remove_file(dir.path().join("__schema_state.json")).unwrap(); + + let err = match Omnigraph::open(uri).await { + Ok(_) => panic!("expected legacy repo with public branch to fail schema bootstrap"), + Err(err) => err, + }; + let message = err.to_string(); + assert!(message.contains("public branches block schema evolution entirely")); + } + + #[tokio::test] + async fn test_long_lived_handle_rejects_schema_source_drift() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap(); + + let drifted = TEST_SCHEMA.replace("age: I32?", "age: I64?"); + fs::write(dir.path().join("_schema.pg"), drifted).unwrap(); + + let err = match db.snapshot_of(ReadTarget::branch("main")).await { + Ok(_) => panic!("expected schema source drift to be rejected"), + Err(err) => err, + }; + assert!( + err.to_string() + .contains("current _schema.pg no longer matches the accepted compiled schema") + ); + } + + #[tokio::test] + async fn test_long_lived_handle_rejects_schema_ir_drift() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap(); + + fs::write(dir.path().join("_schema.ir.json"), "{not valid json").unwrap(); + + let err = match 
db.snapshot_of(ReadTarget::branch("main")).await { + Ok(_) => panic!("expected schema IR drift to be rejected"), + Err(err) => err, + }; + assert!( + err.to_string() + .contains("accepted compiled schema contract in _schema.ir.json is invalid") + ); + } + + #[tokio::test] + async fn test_long_lived_handle_rejects_ir_and_source_updates_without_state_update() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap(); + + let drifted = TEST_SCHEMA.replace("age: I32?", "age: I64?"); + let drifted_ir = read_schema_ir_from_source(&drifted).unwrap(); + let drifted_ir_json = omnigraph_compiler::schema_ir_pretty_json(&drifted_ir).unwrap(); + fs::write(dir.path().join("_schema.pg"), drifted).unwrap(); + fs::write(dir.path().join("_schema.ir.json"), drifted_ir_json).unwrap(); + + let err = match db.snapshot_of(ReadTarget::branch("main")).await { + Ok(_) => panic!("expected schema state mismatch to be rejected"), + Err(err) => err, + }; + assert!( + err.to_string() + .contains("accepted compiled schema does not match the recorded schema state") + ); + } + + #[tokio::test] + async fn test_comment_only_schema_edit_keeps_schema_state_valid() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap(); + + let commented = format!("// comment-only drift\n{}", TEST_SCHEMA); + fs::write(dir.path().join("_schema.pg"), commented).unwrap(); + + let snapshot = db.snapshot_of(ReadTarget::branch("main")).await.unwrap(); + assert!(snapshot.entry("node:Person").is_some()); + } + + #[tokio::test] + async fn test_plan_schema_reports_supported_additive_change() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap(); + + let desired = TEST_SCHEMA.replace( + " age: I32?\n}", + " age: I32?\n nickname: String?\n}", + ); + + let plan = 
db.plan_schema(&desired).await.unwrap(); + assert!(plan.supported); + assert!(plan.steps.iter().any(|step| matches!( + step, + SchemaMigrationStep::AddProperty { + type_kind: SchemaTypeKind::Node, + type_name, + property_name, + .. + } if type_name == "Person" && property_name == "nickname" + ))); + } + + #[tokio::test] + async fn test_plan_schema_rejects_when_schema_contract_has_drifted() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap(); + + let drifted = TEST_SCHEMA.replace("age: I32?", "age: I64?"); + fs::write(dir.path().join("_schema.pg"), drifted).unwrap(); + + let err = db.plan_schema(TEST_SCHEMA).await.unwrap_err(); + assert!( + err.to_string() + .contains("current _schema.pg no longer matches the accepted compiled schema") + ); + } + + #[tokio::test] + async fn test_open_nonexistent_fails() { + let result = Omnigraph::open("/tmp/nonexistent_omnigraph_test_xyz").await; + assert!(result.is_err()); + } + + #[tokio::test] + async fn test_snapshot_version_is_pinned() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + + let mut db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap(); + + // Take snapshot before any writes + let snap1 = db.snapshot(); + let v1 = snap1.version(); + + // Load data — advances manifest version + crate::loader::load_jsonl( + &mut db, + r#"{"type": "Person", "data": {"name": "Alice", "age": 30}}"#, + crate::loader::LoadMode::Overwrite, + ) + .await + .unwrap(); + + // Snapshot from handle sees new version + let snap2 = db.snapshot(); + assert!(snap2.version() > v1); + + // But the old snapshot is still pinned + assert_eq!(snap1.version(), v1); + } +} diff --git a/crates/omnigraph/src/db/run_registry.rs b/crates/omnigraph/src/db/run_registry.rs new file mode 100644 index 0000000..70658dc --- /dev/null +++ b/crates/omnigraph/src/db/run_registry.rs @@ -0,0 +1,622 @@ +use std::collections::HashMap; +use 
std::fmt; +use std::sync::Arc; +use std::time::{SystemTime, UNIX_EPOCH}; + +use arrow_array::{ + Array, RecordBatch, RecordBatchIterator, StringArray, TimestampMicrosecondArray, UInt64Array, +}; +use arrow_schema::{DataType, Field, Schema, SchemaRef, TimeUnit}; +use futures::TryStreamExt; +use lance::Dataset; +use lance::dataset::{WriteMode, WriteParams}; +use lance_file::version::LanceFileVersion; + +use crate::error::{OmniError, Result}; + +const GRAPH_RUNS_DIR: &str = "_graph_runs.lance"; +const GRAPH_RUN_ACTORS_DIR: &str = "_graph_run_actors.lance"; +pub(crate) const INTERNAL_RUN_BRANCH_PREFIX: &str = "__run__"; + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct RunId(String); + +impl RunId { + pub fn new(id: impl Into) -> Self { + Self(id.into()) + } + + pub fn as_str(&self) -> &str { + &self.0 + } +} + +impl fmt::Display for RunId { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + self.0.fmt(f) + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum RunStatus { + Running, + Published, + Failed, + Aborted, +} + +impl RunStatus { + pub fn as_str(self) -> &'static str { + match self { + RunStatus::Running => "running", + RunStatus::Published => "published", + RunStatus::Failed => "failed", + RunStatus::Aborted => "aborted", + } + } + + fn parse(value: &str) -> Result { + match value { + "running" => Ok(Self::Running), + "published" => Ok(Self::Published), + "failed" => Ok(Self::Failed), + "aborted" => Ok(Self::Aborted), + other => Err(OmniError::manifest(format!( + "invalid run status '{}'", + other + ))), + } + } +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct RunRecord { + pub run_id: RunId, + pub target_branch: String, + pub run_branch: String, + pub base_snapshot_id: String, + pub base_manifest_version: u64, + pub operation_hash: Option, + pub actor_id: Option, + pub status: RunStatus, + pub published_snapshot_id: Option, + pub created_at: i64, + pub updated_at: i64, +} + +impl RunRecord { + pub fn new( + 
target_branch: impl Into, + base_snapshot_id: impl Into, + base_manifest_version: u64, + operation_hash: Option, + actor_id: Option, + ) -> Result { + let now = now_micros()?; + let run_id = RunId::new(ulid::Ulid::new().to_string()); + Ok(Self { + run_branch: internal_run_branch_name(&run_id), + run_id, + target_branch: target_branch.into(), + base_snapshot_id: base_snapshot_id.into(), + base_manifest_version, + operation_hash, + actor_id, + status: RunStatus::Running, + published_snapshot_id: None, + created_at: now, + updated_at: now, + }) + } + + pub fn with_status( + &self, + status: RunStatus, + published_snapshot_id: Option, + ) -> Result { + Ok(Self { + run_id: self.run_id.clone(), + target_branch: self.target_branch.clone(), + run_branch: self.run_branch.clone(), + base_snapshot_id: self.base_snapshot_id.clone(), + base_manifest_version: self.base_manifest_version, + operation_hash: self.operation_hash.clone(), + actor_id: self.actor_id.clone(), + status, + published_snapshot_id, + created_at: self.created_at, + updated_at: now_micros()?, + }) + } +} + +pub struct RunRegistry { + dataset: Dataset, + actor_dataset: Option, + latest_by_id: HashMap, + actor_by_run_id: HashMap, + root_uri: String, +} + +impl RunRegistry { + pub async fn init(root_uri: &str) -> Result { + let uri = graph_runs_uri(root_uri); + let batch = RecordBatch::new_empty(run_registry_schema()); + let reader = RecordBatchIterator::new(vec![Ok(batch)], run_registry_schema()); + let params = WriteParams { + mode: WriteMode::Create, + enable_stable_row_ids: true, + data_storage_version: Some(LanceFileVersion::V2_2), + ..Default::default() + }; + let dataset = Dataset::write(reader, &uri as &str, Some(params)) + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + let actor_dataset = create_run_actor_dataset(root_uri).await?; + Ok(Self { + dataset, + actor_dataset: Some(actor_dataset), + latest_by_id: HashMap::new(), + actor_by_run_id: HashMap::new(), + root_uri: root_uri.to_string(), + 
})
+    }
+
+    /// Opens the run registry stored under `root_uri` and loads its caches.
+    ///
+    /// The runs dataset must open successfully. The actor dataset is
+    /// optional: when it cannot be opened the registry starts without one and
+    /// with an empty actor cache.
+    pub async fn open(root_uri: &str) -> Result<Self> {
+        let dataset = Dataset::open(&graph_runs_uri(root_uri))
+            .await
+            .map_err(|e| OmniError::Lance(e.to_string()))?;
+        // NOTE(review): `.ok()` treats every open failure, not only a missing
+        // dataset, as "no actor dataset yet".
+        let actor_dataset = Dataset::open(&graph_run_actors_uri(root_uri)).await.ok();
+        let actor_by_run_id = match actor_dataset.as_ref() {
+            Some(actors) => load_run_actor_cache(actors).await?,
+            None => HashMap::new(),
+        };
+        let latest_by_id = load_run_cache(&dataset, &actor_by_run_id).await?;
+        Ok(Self {
+            dataset,
+            actor_dataset,
+            latest_by_id,
+            actor_by_run_id,
+            root_uri: root_uri.to_string(),
+        })
+    }
+
+    /// Reloads both datasets and both caches from `root_uri`.
+    ///
+    /// On error the registry is left exactly as it was before the call.
+    pub async fn refresh(&mut self, root_uri: &str) -> Result<()> {
+        // Build the complete new state before touching `self`: assigning the
+        // fields one at a time would leave the registry half-reloaded if a
+        // later step failed. `open` fills every field of the struct.
+        *self = Self::open(root_uri).await?;
+        Ok(())
+    }
+
+    /// Appends `record` to the runs dataset and folds it into the in-memory
+    /// cache. When the record names an actor, the actor is recorded as well;
+    /// when it does not, a previously cached actor for the run is reused.
+    pub async fn append_record(&mut self, record: &RunRecord) -> Result<()> {
+        let batch = runs_to_batch(&[record.clone()])?;
+        let reader = RecordBatchIterator::new(vec![Ok(batch)], run_registry_schema());
+        // Append through a clone so `self.dataset` only advances once the
+        // write has succeeded.
+        let mut appended = self.dataset.clone();
+        appended
+            .append(reader, None)
+            .await
+            .map_err(|e| OmniError::Lance(e.to_string()))?;
+        self.dataset = appended;
+
+        if let Some(actor_id) = &record.actor_id {
+            self.append_actor(record.run_id.as_str(), actor_id).await?;
+        }
+
+        let mut cached = record.clone();
+        if cached.actor_id.is_none() {
+            cached.actor_id = self.actor_by_run_id.get(cached.run_id.as_str()).cloned();
+        }
+        merge_latest_run(&mut self.latest_by_id, cached);
+        Ok(())
+    }
+
+    /// Returns the cached latest record for `run_id`, if any.
+    pub async fn get_run(&self, run_id: &RunId) -> Result<Option<RunRecord>> {
+        Ok(self.latest_by_id.get(run_id.as_str()).cloned())
+    }
+
+    /// Lists the cached runs; delegates to `load_runs`.
+    pub async fn list_runs(&self) -> Result<Vec<RunRecord>> {
+        self.load_runs().await
+    }
+
+
pub async fn load_runs(&self) -> Result<Vec<RunRecord>> {
+        // Snapshot of the cached latest records, ordered by creation time and
+        // then by run id.
+        let mut runs: Vec<RunRecord> = self.latest_by_id.values().cloned().collect();
+        runs.sort_by(|a, b| {
+            (a.created_at, a.run_id.as_str()).cmp(&(b.created_at, b.run_id.as_str()))
+        });
+        Ok(runs)
+    }
+
+    /// Records that `actor_id` owns `run_id`, in the actor dataset and in the
+    /// in-memory cache.
+    ///
+    /// Does nothing when the cache already maps the run to the same actor.
+    /// The actor dataset is created on first use.
+    async fn append_actor(&mut self, run_id: &str, actor_id: &str) -> Result<()> {
+        let already_recorded = self
+            .actor_by_run_id
+            .get(run_id)
+            .is_some_and(|existing| existing == actor_id);
+        if already_recorded {
+            return Ok(());
+        }
+
+        let record = RunActorRecord {
+            run_id: run_id.to_string(),
+            actor_id: actor_id.to_string(),
+            created_at: now_micros()?,
+        };
+        let batch = run_actors_to_batch(&[record])?;
+        let reader = RecordBatchIterator::new(vec![Ok(batch)], run_actor_schema());
+        let mut dataset = match self.actor_dataset.take() {
+            Some(dataset) => dataset,
+            None => create_run_actor_dataset(&self.root_uri).await?,
+        };
+        let appended = dataset
+            .append(reader, None)
+            .await
+            .map_err(|e| OmniError::Lance(e.to_string()));
+        // Put the handle back before propagating an append error. Otherwise a
+        // failed append would leave `actor_dataset` empty and the next call
+        // would try to create a dataset that already exists.
+        self.actor_dataset = Some(dataset);
+        appended?;
+
+        self.actor_by_run_id
+            .insert(run_id.to_string(), actor_id.to_string());
+        Ok(())
+    }
+}
+
+/// Returns true when `name`, ignoring leading `/` characters, starts with the
+/// internal run-branch prefix.
+pub(crate) fn is_internal_run_branch(name: &str) -> bool {
+    let trimmed = name.trim_start_matches('/');
+    trimmed.starts_with(INTERNAL_RUN_BRANCH_PREFIX)
+}
+
+/// Builds the internal branch name for `run_id`: the prefix followed by the id.
+pub(crate) fn internal_run_branch_name(run_id: &RunId) -> String {
+    format!("{}{}", INTERNAL_RUN_BRANCH_PREFIX, run_id.as_str())
+}
+
+/// URI of the runs dataset under `root_uri` (trailing `/` on the root ignored).
+pub(crate) fn graph_runs_uri(root_uri: &str) -> String {
+    let root = root_uri.trim_end_matches('/');
+    format!("{}/{}", root, GRAPH_RUNS_DIR)
+}
+
+/// URI of the run-actors dataset under `root_uri` (trailing `/` on the root
+/// ignored).
+fn graph_run_actors_uri(root_uri: &str) -> String {
+    let root = root_uri.trim_end_matches('/');
+    format!("{}/{}", root, GRAPH_RUN_ACTORS_DIR)
+}
+
+fn run_registry_schema() -> SchemaRef {
+    Arc::new(Schema::new(vec![
+        Field::new("run_id", DataType::Utf8, false),
+        Field::new("target_branch", DataType::Utf8, false),
+        Field::new("run_branch", DataType::Utf8, false),
+        Field::new("base_snapshot_id", DataType::Utf8, false),
+        Field::new("base_manifest_version", DataType::UInt64, false),
+        Field::new("operation_hash",
DataType::Utf8, true), + Field::new("status", DataType::Utf8, false), + Field::new("published_snapshot_id", DataType::Utf8, true), + Field::new( + "created_at", + DataType::Timestamp(TimeUnit::Microsecond, None), + false, + ), + Field::new( + "updated_at", + DataType::Timestamp(TimeUnit::Microsecond, None), + false, + ), + ])) +} + +fn run_actor_schema() -> SchemaRef { + Arc::new(Schema::new(vec![ + Field::new("run_id", DataType::Utf8, false), + Field::new("actor_id", DataType::Utf8, false), + Field::new( + "created_at", + DataType::Timestamp(TimeUnit::Microsecond, None), + false, + ), + ])) +} + +async fn create_run_actor_dataset(root_uri: &str) -> Result { + let batch = RecordBatch::new_empty(run_actor_schema()); + let reader = RecordBatchIterator::new(vec![Ok(batch)], run_actor_schema()); + let params = WriteParams { + mode: WriteMode::Create, + enable_stable_row_ids: true, + data_storage_version: Some(LanceFileVersion::V2_2), + ..Default::default() + }; + Dataset::write( + reader, + &graph_run_actors_uri(root_uri) as &str, + Some(params), + ) + .await + .map_err(|e| OmniError::Lance(e.to_string())) +} + +async fn load_run_cache( + dataset: &Dataset, + actor_by_run_id: &HashMap, +) -> Result> { + let batches: Vec = dataset + .scan() + .try_into_stream() + .await + .map_err(|e| OmniError::Lance(e.to_string()))? + .try_collect() + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + + let mut latest_by_id = HashMap::new(); + for mut record in load_runs_from_batches(&batches)? { + record.actor_id = actor_by_run_id.get(record.run_id.as_str()).cloned(); + merge_latest_run(&mut latest_by_id, record); + } + Ok(latest_by_id) +} + +async fn load_run_actor_cache(dataset: &Dataset) -> Result> { + let batches: Vec = dataset + .scan() + .try_into_stream() + .await + .map_err(|e| OmniError::Lance(e.to_string()))? 
+ .try_collect() + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + + let mut actors = HashMap::new(); + for batch in batches { + let run_ids = string_column(&batch, "run_id", "run actor registry")?; + let actor_ids = string_column(&batch, "actor_id", "run actor registry")?; + for row in 0..batch.num_rows() { + actors.insert( + run_ids.value(row).to_string(), + actor_ids.value(row).to_string(), + ); + } + } + Ok(actors) +} + +fn load_runs_from_batches(batches: &[RecordBatch]) -> Result> { + let mut runs = Vec::new(); + for batch in batches { + let run_ids = string_column(batch, "run_id", "run registry")?; + let target_branches = string_column(batch, "target_branch", "run registry")?; + let run_branches = string_column(batch, "run_branch", "run registry")?; + let base_snapshot_ids = string_column(batch, "base_snapshot_id", "run registry")?; + let base_manifest_versions = u64_column(batch, "base_manifest_version", "run registry")?; + let operation_hashes = string_column(batch, "operation_hash", "run registry")?; + let statuses = string_column(batch, "status", "run registry")?; + let published_snapshot_ids = string_column(batch, "published_snapshot_id", "run registry")?; + let created_ats = timestamp_micros_column(batch, "created_at", "run registry")?; + let updated_ats = timestamp_micros_column(batch, "updated_at", "run registry")?; + + for row in 0..batch.num_rows() { + runs.push(RunRecord { + run_id: RunId::new(run_ids.value(row)), + target_branch: target_branches.value(row).to_string(), + run_branch: run_branches.value(row).to_string(), + base_snapshot_id: base_snapshot_ids.value(row).to_string(), + base_manifest_version: base_manifest_versions.value(row), + operation_hash: if operation_hashes.is_null(row) { + None + } else { + Some(operation_hashes.value(row).to_string()) + }, + actor_id: None, + status: RunStatus::parse(statuses.value(row))?, + published_snapshot_id: if published_snapshot_ids.is_null(row) { + None + } else { + 
Some(published_snapshot_ids.value(row).to_string()) + }, + created_at: created_ats.value(row), + updated_at: updated_ats.value(row), + }); + } + } + Ok(runs) +} + +fn merge_latest_run(latest_by_id: &mut HashMap, record: RunRecord) { + match latest_by_id.get(record.run_id.as_str()) { + Some(existing) + if existing.updated_at > record.updated_at + || (existing.updated_at == record.updated_at + && existing.created_at >= record.created_at) => {} + _ => { + latest_by_id.insert(record.run_id.as_str().to_string(), record); + } + } +} + +fn string_column<'a>(batch: &'a RecordBatch, name: &str, context: &str) -> Result<&'a StringArray> { + batch + .column_by_name(name) + .ok_or_else(|| { + OmniError::manifest_internal(format!("{context} batch missing '{name}' column")) + })? + .as_any() + .downcast_ref::() + .ok_or_else(|| { + OmniError::manifest_internal(format!("{context} column '{name}' is not Utf8")) + }) +} + +fn u64_column<'a>(batch: &'a RecordBatch, name: &str, context: &str) -> Result<&'a UInt64Array> { + batch + .column_by_name(name) + .ok_or_else(|| { + OmniError::manifest_internal(format!("{context} batch missing '{name}' column")) + })? + .as_any() + .downcast_ref::() + .ok_or_else(|| { + OmniError::manifest_internal(format!("{context} column '{name}' is not UInt64")) + }) +} + +fn timestamp_micros_column<'a>( + batch: &'a RecordBatch, + name: &str, + context: &str, +) -> Result<&'a TimestampMicrosecondArray> { + batch + .column_by_name(name) + .ok_or_else(|| { + OmniError::manifest_internal(format!("{context} batch missing '{name}' column")) + })? 
+ .as_any() + .downcast_ref::() + .ok_or_else(|| { + OmniError::manifest_internal(format!( + "{context} column '{name}' is not Timestamp(Microsecond)" + )) + }) +} + +fn runs_to_batch(records: &[RunRecord]) -> Result { + let run_ids: Vec<&str> = records + .iter() + .map(|record| record.run_id.as_str()) + .collect(); + let target_branches: Vec<&str> = records + .iter() + .map(|record| record.target_branch.as_str()) + .collect(); + let run_branches: Vec<&str> = records + .iter() + .map(|record| record.run_branch.as_str()) + .collect(); + let base_snapshot_ids: Vec<&str> = records + .iter() + .map(|record| record.base_snapshot_id.as_str()) + .collect(); + let base_manifest_versions: Vec = records + .iter() + .map(|record| record.base_manifest_version) + .collect(); + let operation_hashes: Vec> = records + .iter() + .map(|record| record.operation_hash.as_deref()) + .collect(); + let statuses: Vec<&str> = records + .iter() + .map(|record| record.status.as_str()) + .collect(); + let published_snapshot_ids: Vec> = records + .iter() + .map(|record| record.published_snapshot_id.as_deref()) + .collect(); + let created_ats: Vec = records.iter().map(|record| record.created_at).collect(); + let updated_ats: Vec = records.iter().map(|record| record.updated_at).collect(); + + RecordBatch::try_new( + run_registry_schema(), + vec![ + Arc::new(StringArray::from(run_ids)), + Arc::new(StringArray::from(target_branches)), + Arc::new(StringArray::from(run_branches)), + Arc::new(StringArray::from(base_snapshot_ids)), + Arc::new(UInt64Array::from(base_manifest_versions)), + Arc::new(StringArray::from(operation_hashes)), + Arc::new(StringArray::from(statuses)), + Arc::new(StringArray::from(published_snapshot_ids)), + Arc::new(TimestampMicrosecondArray::from(created_ats)), + Arc::new(TimestampMicrosecondArray::from(updated_ats)), + ], + ) + .map_err(|e| OmniError::Lance(e.to_string())) +} + +#[derive(Debug, Clone, PartialEq, Eq)] +struct RunActorRecord { + run_id: String, + actor_id: 
String, + created_at: i64, +} + +fn run_actors_to_batch(records: &[RunActorRecord]) -> Result { + let run_ids: Vec<&str> = records + .iter() + .map(|record| record.run_id.as_str()) + .collect(); + let actor_ids: Vec<&str> = records + .iter() + .map(|record| record.actor_id.as_str()) + .collect(); + let created_ats: Vec = records.iter().map(|record| record.created_at).collect(); + + RecordBatch::try_new( + run_actor_schema(), + vec![ + Arc::new(StringArray::from(run_ids)), + Arc::new(StringArray::from(actor_ids)), + Arc::new(TimestampMicrosecondArray::from(created_ats)), + ], + ) + .map_err(|e| OmniError::Lance(e.to_string())) +} + +fn now_micros() -> Result { + let duration = SystemTime::now() + .duration_since(UNIX_EPOCH) + .map_err(|e| OmniError::manifest(format!("system clock error: {}", e)))?; + Ok(duration.as_micros() as i64) +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use arrow_schema::{DataType, Field, Schema}; + + use super::*; + + #[test] + fn load_runs_from_batches_returns_error_for_bad_schema() { + let batch = RecordBatch::try_new( + Arc::new(Schema::new(vec![ + Field::new("run_id", DataType::UInt64, false), + Field::new("target_branch", DataType::Utf8, false), + Field::new("run_branch", DataType::Utf8, false), + Field::new("base_snapshot_id", DataType::Utf8, false), + Field::new("base_manifest_version", DataType::UInt64, false), + Field::new("operation_hash", DataType::Utf8, true), + Field::new("status", DataType::Utf8, false), + Field::new("published_snapshot_id", DataType::Utf8, true), + Field::new( + "created_at", + DataType::Timestamp(TimeUnit::Microsecond, None), + false, + ), + Field::new( + "updated_at", + DataType::Timestamp(TimeUnit::Microsecond, None), + false, + ), + ])), + vec![ + Arc::new(UInt64Array::from(vec![1_u64])), + Arc::new(StringArray::from(vec!["main"])), + Arc::new(StringArray::from(vec!["__run__1"])), + Arc::new(StringArray::from(vec!["snap-1"])), + Arc::new(UInt64Array::from(vec![1_u64])), + 
Arc::new(StringArray::from(vec![None::<&str>])), + Arc::new(StringArray::from(vec!["running"])), + Arc::new(StringArray::from(vec![None::<&str>])), + Arc::new(TimestampMicrosecondArray::from(vec![1_i64])), + Arc::new(TimestampMicrosecondArray::from(vec![1_i64])), + ], + ) + .unwrap(); + + let err = load_runs_from_batches(&[batch]).unwrap_err(); + assert!(err.to_string().contains("run_id")); + } +} diff --git a/crates/omnigraph/src/db/schema_state.rs b/crates/omnigraph/src/db/schema_state.rs new file mode 100644 index 0000000..c62f72e --- /dev/null +++ b/crates/omnigraph/src/db/schema_state.rs @@ -0,0 +1,236 @@ +use std::sync::Arc; + +use omnigraph_compiler::schema::parser::parse_schema; +use omnigraph_compiler::{SchemaIR, build_schema_ir, schema_ir_hash, schema_ir_pretty_json}; +use serde::{Deserialize, Serialize}; + +use crate::error::{OmniError, Result}; +use crate::storage::{StorageAdapter, join_uri}; + +pub(crate) const SCHEMA_SOURCE_FILENAME: &str = "_schema.pg"; +pub(crate) const SCHEMA_IR_FILENAME: &str = "_schema.ir.json"; +pub(crate) const SCHEMA_STATE_FILENAME: &str = "__schema_state.json"; + +const SCHEMA_STATE_FORMAT_VERSION: u32 = 1; +const SCHEMA_IDENTITY_VERSION: u32 = 1; + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub(crate) struct SchemaState { + pub(crate) format_version: u32, + pub(crate) schema_ir_hash: String, + pub(crate) schema_identity_version: u32, +} + +impl SchemaState { + pub(crate) fn new(schema_ir_hash: String) -> Self { + Self { + format_version: SCHEMA_STATE_FORMAT_VERSION, + schema_ir_hash, + schema_identity_version: SCHEMA_IDENTITY_VERSION, + } + } +} + +pub(crate) async fn load_or_bootstrap_schema_contract( + root_uri: &str, + storage: Arc, + public_branches: &[String], + current_source_ir: &SchemaIR, +) -> Result<(SchemaIR, SchemaState)> { + match read_schema_contract(root_uri, storage.as_ref()).await? 
{ + SchemaContractRead::Present { ir, state } => { + validate_persisted_schema_contract(&ir, &state)?; + validate_current_source_matches(&state, current_source_ir)?; + Ok((ir, state)) + } + SchemaContractRead::MissingAll => { + let public_non_main = public_branches + .iter() + .filter(|branch| branch.as_str() != "main") + .cloned() + .collect::>(); + if !public_non_main.is_empty() { + return Err(schema_lock_conflict(format!( + "repo is missing persisted schema state and has public branches ({}); public branches block schema evolution entirely", + public_non_main.join(", ") + ))); + } + let state = + write_schema_contract(root_uri, storage.as_ref(), current_source_ir).await?; + Ok((current_source_ir.clone(), state)) + } + SchemaContractRead::PartialMissing => Err(schema_lock_conflict( + "repo schema state is incomplete (_schema.ir.json and __schema_state.json must either both exist or both be absent)", + )), + } +} + +pub(crate) async fn validate_schema_contract( + root_uri: &str, + storage: Arc, +) -> Result<()> { + let current_source_ir = read_current_source_ir(root_uri, storage.as_ref()).await?; + let (persisted_ir, state) = match read_schema_contract(root_uri, storage.as_ref()).await? 
{ + SchemaContractRead::Present { ir, state } => (ir, state), + SchemaContractRead::MissingAll | SchemaContractRead::PartialMissing => { + return Err(schema_lock_conflict( + "repo is missing persisted schema state; manual coordination is required before schema changes are allowed", + )); + } + }; + + validate_persisted_schema_contract(&persisted_ir, &state)?; + validate_current_source_matches(&state, ¤t_source_ir) +} + +pub(crate) async fn write_schema_contract( + root_uri: &str, + storage: &dyn StorageAdapter, + schema_ir: &SchemaIR, +) -> Result { + let ir_json = schema_ir_pretty_json(schema_ir) + .map_err(|err| OmniError::manifest_internal(err.to_string()))?; + let state = SchemaState::new( + schema_ir_hash(schema_ir).map_err(|err| OmniError::manifest_internal(err.to_string()))?, + ); + let state_json = serde_json::to_string_pretty(&state).map_err(|err| { + OmniError::manifest_internal(format!("serialize schema state error: {}", err)) + })?; + + storage + .write_text(&schema_ir_uri(root_uri), &ir_json) + .await?; + storage + .write_text(&schema_state_uri(root_uri), &state_json) + .await?; + Ok(state) +} + +pub(crate) async fn read_current_source_ir( + root_uri: &str, + storage: &dyn StorageAdapter, +) -> Result { + let source = storage.read_text(&schema_source_uri(root_uri)).await?; + compile_schema_source(&source) +} + +pub(crate) async fn read_accepted_schema_ir( + root_uri: &str, + storage: Arc, +) -> Result { + match read_schema_contract(root_uri, storage.as_ref()).await? 
{ + SchemaContractRead::Present { ir, state } => { + validate_persisted_schema_contract(&ir, &state)?; + Ok(ir) + } + SchemaContractRead::MissingAll | SchemaContractRead::PartialMissing => { + Err(schema_lock_conflict( + "repo is missing persisted schema state; manual coordination is required before schema changes are allowed", + )) + } + } +} + +pub(crate) fn schema_source_uri(root_uri: &str) -> String { + join_uri(root_uri, SCHEMA_SOURCE_FILENAME) +} + +pub(crate) fn schema_ir_uri(root_uri: &str) -> String { + join_uri(root_uri, SCHEMA_IR_FILENAME) +} + +pub(crate) fn schema_state_uri(root_uri: &str) -> String { + join_uri(root_uri, SCHEMA_STATE_FILENAME) +} + +enum SchemaContractRead { + Present { ir: SchemaIR, state: SchemaState }, + MissingAll, + PartialMissing, +} + +async fn read_schema_contract( + root_uri: &str, + storage: &dyn StorageAdapter, +) -> Result { + let ir_uri = schema_ir_uri(root_uri); + let state_uri = schema_state_uri(root_uri); + let ir_exists = storage.exists(&ir_uri).await?; + let state_exists = storage.exists(&state_uri).await?; + + match (ir_exists, state_exists) { + (false, false) => Ok(SchemaContractRead::MissingAll), + (true, true) => { + let ir_json = storage.read_text(&ir_uri).await?; + let state_json = storage.read_text(&state_uri).await?; + let ir = serde_json::from_str::(&ir_json).map_err(|err| { + schema_lock_conflict(format!( + "accepted compiled schema contract in {} is invalid: {}", + SCHEMA_IR_FILENAME, err + )) + })?; + let state = serde_json::from_str::(&state_json).map_err(|err| { + schema_lock_conflict(format!( + "repo schema state in {} is invalid: {}", + SCHEMA_STATE_FILENAME, err + )) + })?; + Ok(SchemaContractRead::Present { ir, state }) + } + _ => Ok(SchemaContractRead::PartialMissing), + } +} + +fn validate_persisted_schema_contract(ir: &SchemaIR, state: &SchemaState) -> Result<()> { + if state.format_version != SCHEMA_STATE_FORMAT_VERSION { + return Err(schema_lock_conflict(format!( + "repo schema state format {} 
is unsupported", + state.format_version + ))); + } + + let actual_hash = schema_ir_hash(ir).map_err(|err| schema_lock_conflict(err.to_string()))?; + if actual_hash != state.schema_ir_hash { + return Err(schema_lock_conflict( + "accepted compiled schema does not match the recorded schema state", + )); + } + + Ok(()) +} + +fn validate_current_source_matches( + state: &SchemaState, + current_source_ir: &SchemaIR, +) -> Result<()> { + let current_hash = + schema_ir_hash(current_source_ir).map_err(|err| schema_lock_conflict(err.to_string()))?; + if current_hash != state.schema_ir_hash { + return Err(schema_lock_conflict( + "current _schema.pg no longer matches the accepted compiled schema", + )); + } + Ok(()) +} + +fn compile_schema_source(source: &str) -> Result { + let schema = parse_schema(source).map_err(|err| { + schema_lock_conflict(format!( + "current _schema.pg is not a valid accepted schema definition: {}", + err + )) + })?; + build_schema_ir(&schema).map_err(|err| { + schema_lock_conflict(format!( + "current _schema.pg could not be compiled into the accepted schema contract: {}", + err + )) + }) +} + +fn schema_lock_conflict(detail: impl Into) -> OmniError { + OmniError::manifest_conflict(format!( + "schema evolution is locked down in phase 1: {}; manual coordination is required", + detail.into() + )) +} diff --git a/crates/omnigraph/src/embedding.rs b/crates/omnigraph/src/embedding.rs new file mode 100644 index 0000000..cfd4071 --- /dev/null +++ b/crates/omnigraph/src/embedding.rs @@ -0,0 +1,489 @@ +use std::future::Future; +use std::time::Duration; + +use reqwest::Client; +use serde::Deserialize; +use serde_json::{Value, json}; +use tokio::time::sleep; + +use crate::error::{OmniError, Result}; + +const GEMINI_EMBED_MODEL: &str = "gemini-embedding-2-preview"; +const DEFAULT_GEMINI_BASE_URL: &str = "https://generativelanguage.googleapis.com/v1beta"; +const DEFAULT_TIMEOUT_MS: u64 = 30_000; +const DEFAULT_RETRY_ATTEMPTS: usize = 4; +const 
DEFAULT_RETRY_BACKOFF_MS: u64 = 200; +const QUERY_TASK_TYPE: &str = "RETRIEVAL_QUERY"; +const DOCUMENT_TASK_TYPE: &str = "RETRIEVAL_DOCUMENT"; + +#[derive(Clone, Debug)] +enum EmbeddingTransport { + Mock, + Gemini { + api_key: String, + base_url: String, + http: Client, + }, +} + +#[derive(Clone, Debug)] +pub struct EmbeddingClient { + retry_attempts: usize, + retry_backoff_ms: u64, + transport: EmbeddingTransport, +} + +struct EmbedCallError { + message: String, + retryable: bool, +} + +#[derive(Debug, Deserialize)] +struct GeminiEmbedResponse { + embedding: GeminiContentEmbedding, +} + +#[derive(Debug, Deserialize)] +struct GeminiContentEmbedding { + values: Vec, +} + +#[derive(Debug, Deserialize)] +struct GoogleErrorEnvelope { + error: GoogleErrorBody, +} + +#[derive(Debug, Deserialize)] +struct GoogleErrorBody { + message: String, +} + +impl EmbeddingClient { + pub fn from_env() -> Result { + let retry_attempts = + parse_env_usize("OMNIGRAPH_EMBED_RETRY_ATTEMPTS", DEFAULT_RETRY_ATTEMPTS); + let retry_backoff_ms = + parse_env_u64("OMNIGRAPH_EMBED_RETRY_BACKOFF_MS", DEFAULT_RETRY_BACKOFF_MS); + + if env_flag("OMNIGRAPH_EMBEDDINGS_MOCK") { + return Ok(Self { + retry_attempts, + retry_backoff_ms, + transport: EmbeddingTransport::Mock, + }); + } + + let api_key = std::env::var("GEMINI_API_KEY") + .ok() + .map(|v| v.trim().to_string()) + .filter(|v| !v.is_empty()) + .ok_or_else(|| { + OmniError::manifest_internal( + "GEMINI_API_KEY is required when nearest() needs a string embedding", + ) + })?; + let base_url = std::env::var("OMNIGRAPH_GEMINI_BASE_URL") + .ok() + .map(|v| v.trim_end_matches('/').to_string()) + .filter(|v| !v.is_empty()) + .unwrap_or_else(|| DEFAULT_GEMINI_BASE_URL.to_string()); + let timeout_ms = parse_env_u64("OMNIGRAPH_EMBED_TIMEOUT_MS", DEFAULT_TIMEOUT_MS); + let http = Client::builder() + .timeout(Duration::from_millis(timeout_ms)) + .build() + .map_err(|e| { + OmniError::manifest_internal(format!("failed to initialize HTTP client: {}", e)) + 
})?; + + Ok(Self { + retry_attempts, + retry_backoff_ms, + transport: EmbeddingTransport::Gemini { + api_key, + base_url, + http, + }, + }) + } + + #[cfg(test)] + fn mock_for_tests() -> Self { + Self { + retry_attempts: DEFAULT_RETRY_ATTEMPTS, + retry_backoff_ms: DEFAULT_RETRY_BACKOFF_MS, + transport: EmbeddingTransport::Mock, + } + } + + pub async fn embed_query_text(&self, input: &str, expected_dim: usize) -> Result> { + self.embed_text(input, expected_dim, QUERY_TASK_TYPE).await + } + + pub async fn embed_document_text(&self, input: &str, expected_dim: usize) -> Result> { + self.embed_text(input, expected_dim, DOCUMENT_TASK_TYPE) + .await + } + + async fn embed_text( + &self, + input: &str, + expected_dim: usize, + task_type: &'static str, + ) -> Result> { + if expected_dim == 0 { + return Err(OmniError::manifest_internal( + "embedding dimension must be greater than zero", + )); + } + + match &self.transport { + EmbeddingTransport::Mock => Ok(mock_embedding(input, expected_dim)), + EmbeddingTransport::Gemini { .. 
} => { + self.with_retry(|| self.embed_text_gemini_once(input, expected_dim, task_type)) + .await + } + } + } + + async fn with_retry(&self, mut operation: F) -> Result + where + F: FnMut() -> Fut, + Fut: Future>, + { + let max_attempt = self.retry_attempts.max(1); + let mut attempt = 0usize; + loop { + attempt += 1; + match operation().await { + Ok(value) => return Ok(value), + Err(err) => { + if !err.retryable || attempt >= max_attempt { + return Err(OmniError::manifest_internal(err.message)); + } + let shift = (attempt - 1).min(10) as u32; + let delay = self.retry_backoff_ms.saturating_mul(1u64 << shift); + sleep(Duration::from_millis(delay)).await; + } + } + } + } + + async fn embed_text_gemini_once( + &self, + input: &str, + expected_dim: usize, + task_type: &'static str, + ) -> std::result::Result, EmbedCallError> { + let (api_key, base_url, http) = match &self.transport { + EmbeddingTransport::Gemini { + api_key, + base_url, + http, + } => (api_key, base_url, http), + EmbeddingTransport::Mock => unreachable!("mock transport should not call Gemini"), + }; + + let response = http + .post(gemini_endpoint(base_url)) + .header("x-goog-api-key", api_key) + .json(&build_gemini_request(input, expected_dim, task_type)) + .send() + .await; + let response = match response { + Ok(response) => response, + Err(err) => { + let retryable = err.is_timeout() || err.is_connect() || err.is_request(); + return Err(EmbedCallError { + message: format!("embedding request failed: {}", err), + retryable, + }); + } + }; + + let status = response.status(); + let body = match response.text().await { + Ok(body) => body, + Err(err) => { + return Err(EmbedCallError { + message: format!( + "embedding response read failed (status {}): {}", + status, err + ), + retryable: status.is_server_error() || status.as_u16() == 429, + }); + } + }; + + if !status.is_success() { + let message = parse_google_error_message(&body).unwrap_or(body); + return Err(EmbedCallError { + message: format!( + 
"embedding request failed with status {}: {}", + status, message + ), + retryable: status.is_server_error() || status.as_u16() == 429, + }); + } + + let parsed: GeminiEmbedResponse = + serde_json::from_str(&body).map_err(|err| EmbedCallError { + message: format!("embedding response decode failed: {}", err), + retryable: false, + })?; + + validate_and_normalize_embedding(parsed.embedding.values, expected_dim).map_err(|message| { + EmbedCallError { + message, + retryable: false, + } + }) + } +} + +fn gemini_endpoint(base_url: &str) -> String { + format!( + "{}/models/{}:embedContent", + base_url.trim_end_matches('/'), + GEMINI_EMBED_MODEL + ) +} + +fn build_gemini_request(input: &str, expected_dim: usize, task_type: &'static str) -> Value { + json!({ + "model": format!("models/{}", GEMINI_EMBED_MODEL), + "content": { + "parts": [ + { + "text": input + } + ] + }, + "taskType": task_type, + "outputDimensionality": expected_dim, + }) +} + +fn validate_and_normalize_embedding( + values: Vec, + expected_dim: usize, +) -> std::result::Result, String> { + if values.len() != expected_dim { + return Err(format!( + "embedding dimension mismatch: expected {}, got {}", + expected_dim, + values.len() + )); + } + Ok(normalize_vector(values)) +} + +fn normalize_vector(mut values: Vec) -> Vec { + let norm = values + .iter() + .map(|v| (*v as f64) * (*v as f64)) + .sum::() + .sqrt() as f32; + if norm > f32::EPSILON { + for value in &mut values { + *value /= norm; + } + } + values +} + +fn parse_google_error_message(body: &str) -> Option { + serde_json::from_str::(body) + .ok() + .map(|e| e.error.message) + .filter(|msg| !msg.trim().is_empty()) +} + +fn parse_env_usize(name: &str, default: usize) -> usize { + std::env::var(name) + .ok() + .and_then(|v| v.parse::().ok()) + .filter(|v| *v > 0) + .unwrap_or(default) +} + +fn parse_env_u64(name: &str, default: u64) -> u64 { + std::env::var(name) + .ok() + .and_then(|v| v.parse::().ok()) + .filter(|v| *v > 0) + .unwrap_or(default) +} + +fn 
env_flag(name: &str) -> bool { + std::env::var(name) + .ok() + .map(|v| { + let s = v.trim().to_ascii_lowercase(); + s == "1" || s == "true" || s == "yes" || s == "on" + }) + .unwrap_or(false) +} + +fn mock_embedding(input: &str, dim: usize) -> Vec { + let mut seed = fnv1a64(input.as_bytes()); + let mut out = Vec::with_capacity(dim); + for _ in 0..dim { + seed = xorshift64(seed); + let ratio = (seed as f64 / u64::MAX as f64) as f32; + out.push((ratio * 2.0) - 1.0); + } + normalize_vector(out) +} + +fn fnv1a64(bytes: &[u8]) -> u64 { + let mut hash = 14695981039346656037u64; + for byte in bytes { + hash ^= *byte as u64; + hash = hash.wrapping_mul(1099511628211u64); + } + hash +} + +fn xorshift64(mut x: u64) -> u64 { + x ^= x << 13; + x ^= x >> 7; + x ^= x << 17; + x +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + use std::sync::atomic::{AtomicUsize, Ordering}; + + use serial_test::serial; + + use super::*; + + struct EnvGuard { + saved: Vec<(&'static str, Option)>, + } + + impl EnvGuard { + fn set(vars: &[(&'static str, Option<&str>)]) -> Self { + let saved = vars + .iter() + .map(|(name, _)| (*name, std::env::var(name).ok())) + .collect::>(); + for (name, value) in vars { + unsafe { + match value { + Some(value) => std::env::set_var(name, value), + None => std::env::remove_var(name), + } + } + } + Self { saved } + } + } + + impl Drop for EnvGuard { + fn drop(&mut self) { + for (name, value) in self.saved.drain(..) 
{ + unsafe { + match value { + Some(value) => std::env::set_var(name, value), + None => std::env::remove_var(name), + } + } + } + } + } + + #[tokio::test] + async fn mock_embeddings_are_deterministic() { + let client = EmbeddingClient::mock_for_tests(); + let a = client.embed_query_text("alpha", 8).await.unwrap(); + let b = client.embed_query_text("alpha", 8).await.unwrap(); + let c = client.embed_query_text("beta", 8).await.unwrap(); + assert_eq!(a, b); + assert_ne!(a, c); + assert_eq!(a.len(), 8); + } + + #[test] + fn gemini_request_uses_preview_model_retrieval_query_and_dimension() { + let request = build_gemini_request("alpha", 4, QUERY_TASK_TYPE); + assert_eq!(request["model"], "models/gemini-embedding-2-preview"); + assert_eq!(request["taskType"], QUERY_TASK_TYPE); + assert_eq!(request["outputDimensionality"], 4); + assert_eq!(request["content"]["parts"][0]["text"], "alpha"); + } + + #[test] + fn gemini_document_request_uses_retrieval_document_task_type() { + let request = build_gemini_request("alpha", 4, DOCUMENT_TASK_TYPE); + assert_eq!(request["taskType"], DOCUMENT_TASK_TYPE); + } + + #[test] + fn validate_and_normalize_embedding_enforces_dimension() { + let normalized = validate_and_normalize_embedding(vec![3.0, 4.0], 2).unwrap(); + assert!((normalized[0] - 0.6).abs() < 1e-6); + assert!((normalized[1] - 0.8).abs() < 1e-6); + + let err = validate_and_normalize_embedding(vec![1.0, 2.0], 3).unwrap_err(); + assert!(err.contains("expected 3, got 2")); + } + + #[tokio::test] + async fn with_retry_retries_retryable_failures() { + let client = EmbeddingClient::mock_for_tests(); + let attempts = Arc::new(AtomicUsize::new(0)); + let attempts_for_call = Arc::clone(&attempts); + + let value = client + .with_retry(|| { + let attempts_for_call = Arc::clone(&attempts_for_call); + async move { + let attempt = attempts_for_call.fetch_add(1, Ordering::SeqCst); + if attempt == 0 { + Err(EmbedCallError { + message: "retry me".to_string(), + retryable: true, + }) + } else { + 
Ok("ok") + } + } + }) + .await + .unwrap(); + + assert_eq!(value, "ok"); + assert_eq!(attempts.load(Ordering::SeqCst), 2); + } + + #[tokio::test] + async fn with_retry_stops_on_non_retryable_failures() { + let client = EmbeddingClient::mock_for_tests(); + let err = client + .with_retry(|| async { + Err::<(), _>(EmbedCallError { + message: "do not retry".to_string(), + retryable: false, + }) + }) + .await + .unwrap_err(); + + assert!(err.to_string().contains("do not retry")); + } + + #[test] + #[serial] + fn from_env_requires_gemini_api_key_when_not_mocking() { + let _guard = EnvGuard::set(&[ + ("OMNIGRAPH_EMBEDDINGS_MOCK", None), + ("GEMINI_API_KEY", None), + ]); + + let err = EmbeddingClient::from_env().unwrap_err(); + assert!(err.to_string().contains("GEMINI_API_KEY")); + } +} diff --git a/crates/omnigraph/src/error.rs b/crates/omnigraph/src/error.rs new file mode 100644 index 0000000..fe65ccb --- /dev/null +++ b/crates/omnigraph/src/error.rs @@ -0,0 +1,80 @@ +use thiserror::Error; + +pub type Result = std::result::Result; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum ManifestErrorKind { + BadRequest, + NotFound, + Conflict, + Internal, +} + +#[derive(Debug, Clone, Error)] +#[error("{message}")] +pub struct ManifestError { + pub kind: ManifestErrorKind, + pub message: String, +} + +impl ManifestError { + pub fn new(kind: ManifestErrorKind, message: impl Into) -> Self { + Self { + kind, + message: message.into(), + } + } +} + +#[derive(Debug, Clone)] +pub struct MergeConflict { + pub table_key: String, + pub row_id: Option, + pub kind: MergeConflictKind, + pub message: String, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum MergeConflictKind { + DivergentInsert, + DivergentUpdate, + DeleteVsUpdate, + OrphanEdge, + UniqueViolation, + CardinalityViolation, + ValueConstraintViolation, +} + +#[derive(Debug, Error)] +pub enum OmniError { + #[error("{0}")] + Compiler(#[from] omnigraph_compiler::error::NanoError), + #[error("storage: {0}")] + 
Lance(String), + #[error("query: {0}")] + DataFusion(String), + #[error("io: {0}")] + Io(#[from] std::io::Error), + #[error("{0}")] + Manifest(ManifestError), + #[error("merge conflicts: {0:?}")] + MergeConflicts(Vec), +} + +impl OmniError { + pub fn manifest(message: impl Into) -> Self { + Self::Manifest(ManifestError::new(ManifestErrorKind::BadRequest, message)) + } + + pub fn manifest_not_found(message: impl Into) -> Self { + Self::Manifest(ManifestError::new(ManifestErrorKind::NotFound, message)) + } + + pub fn manifest_conflict(message: impl Into) -> Self { + Self::Manifest(ManifestError::new(ManifestErrorKind::Conflict, message)) + } + + pub fn manifest_internal(message: impl Into) -> Self { + Self::Manifest(ManifestError::new(ManifestErrorKind::Internal, message)) + } +} diff --git a/crates/omnigraph/src/exec/mod.rs b/crates/omnigraph/src/exec/mod.rs new file mode 100644 index 0000000..47dd51f --- /dev/null +++ b/crates/omnigraph/src/exec/mod.rs @@ -0,0 +1,4011 @@ +use std::collections::{HashMap, HashSet}; +use std::env; +use std::path::PathBuf; +use std::sync::Arc; + +use arrow_array::{ + Array, ArrayRef, BooleanArray, Date32Array, Date64Array, Float32Array, Float64Array, + Int32Array, Int64Array, ListArray, RecordBatch, StringArray, UInt32Array, UInt64Array, + builder::{ + BooleanBuilder, Date32Builder, Date64Builder, FixedSizeListBuilder, Float32Builder, + Float64Builder, Int32Builder, Int64Builder, ListBuilder, StringBuilder, UInt32Builder, + UInt64Builder, + }, +}; +use arrow_cast::display::array_value_to_string; +use arrow_schema::{DataType, Field, Schema, SchemaRef}; +use futures::TryStreamExt; +use lance::Dataset; +use lance::blob::BlobArrayBuilder; +use lance::dataset::scanner::{ColumnOrdering, DatasetRecordBatchStream}; +use omnigraph_compiler::catalog::Catalog; +use omnigraph_compiler::ir::{ + IRAssignment, IRExpr, IRFilter, IRMutationPredicate, IROp, IROrdering, IRProjection, + MutationOpIR, ParamMap, QueryIR, +}; +use 
omnigraph_compiler::lower_mutation_query; +use omnigraph_compiler::lower_query; +use omnigraph_compiler::query::ast::{CompOp, Literal, NOW_PARAM_NAME}; +use omnigraph_compiler::query::typecheck::{CheckedQuery, typecheck_query, typecheck_query_decl}; +use omnigraph_compiler::result::{MutationResult, QueryResult}; +use omnigraph_compiler::types::Direction; +use omnigraph_compiler::types::ScalarType; +use time::OffsetDateTime; +use time::format_description::well_known::Rfc3339; + +use crate::db::commit_graph::CommitGraph; +use crate::db::manifest::ManifestCoordinator; +use crate::db::{MergeOutcome, Omnigraph, is_internal_run_branch}; +use crate::db::{ReadTarget, Snapshot}; +use crate::embedding::EmbeddingClient; +use crate::error::{MergeConflict, MergeConflictKind, OmniError, Result}; +use crate::graph_index::GraphIndex; +use tempfile::{Builder as TempDirBuilder, TempDir}; + +impl Omnigraph { + /// Run a named query against an explicit branch or snapshot target. + pub async fn query( + &self, + target: impl Into, + query_source: &str, + query_name: &str, + params: &ParamMap, + ) -> Result { + self.ensure_schema_state_valid().await?; + let resolved = self.resolved_target(target).await?; + + let query_decl = omnigraph_compiler::find_named_query(query_source, query_name) + .map_err(|e| OmniError::manifest(e.to_string()))?; + let type_ctx = typecheck_query(self.catalog(), &query_decl)?; + let ir = lower_query(self.catalog(), &query_decl, &type_ctx)?; + + let needs_graph = ir + .pipeline + .iter() + .any(|op| matches!(op, IROp::Expand { .. } | IROp::AntiJoin { .. })); + let graph_index = if needs_graph { + Some(self.graph_index_for_resolved(&resolved).await?) + } else { + None + }; + + execute_query( + &ir, + params, + &resolved.snapshot, + graph_index.as_deref(), + self.catalog(), + ) + .await + } + + /// Run a named query against the graph as it existed at a prior manifest version. 
+ /// + /// Compiles the query normally, builds a temporary (non-cached) graph index + /// if traversal is needed, and executes against the historical snapshot. + pub async fn run_query_at( + &self, + version: u64, + query_source: &str, + query_name: &str, + params: &ParamMap, + ) -> Result { + self.ensure_schema_state_valid().await?; + let snapshot = self.snapshot_at_version(version).await?; + + let query_decl = omnigraph_compiler::find_named_query(query_source, query_name) + .map_err(|e| OmniError::manifest(e.to_string()))?; + let type_ctx = typecheck_query(self.catalog(), &query_decl)?; + let ir = lower_query(self.catalog(), &query_decl, &type_ctx)?; + + let needs_graph = ir + .pipeline + .iter() + .any(|op| matches!(op, IROp::Expand { .. } | IROp::AntiJoin { .. })); + let graph_index = if needs_graph { + let edge_types = self + .catalog() + .edge_types + .iter() + .map(|(name, et)| (name.clone(), (et.from_type.clone(), et.to_type.clone()))) + .collect(); + Some(Arc::new(GraphIndex::build(&snapshot, &edge_types).await?)) + } else { + None + }; + + execute_query( + &ir, + params, + &snapshot, + graph_index.as_deref(), + self.catalog(), + ) + .await + } +} + +const MERGE_STAGE_BATCH_ROWS: usize = 8192; +const MERGE_STAGE_DIR_ENV: &str = "OMNIGRAPH_MERGE_STAGING_DIR"; + +#[derive(Debug)] +enum CandidateTableState { + AdoptSourceState, + RewriteMerged(StagedMergeResult), +} + +#[derive(Debug)] +struct StagedTable { + _dir: TempDir, + dataset: Dataset, +} + +#[derive(Debug)] +struct StagedMergeResult { + full_staged: StagedTable, + delta_staged: Option, + deleted_ids: Vec, +} + +#[derive(Debug, Clone)] +struct CursorRow { + id: String, + signature: String, + batch: RecordBatch, + row_index: usize, +} + +struct OrderedTableCursor { + stream: Option>>, + current_batch: Option, + current_row: usize, + peeked: Option, +} + +impl OrderedTableCursor { + async fn from_snapshot(snapshot: &Snapshot, table_key: &str) -> Result { + let dataset = match snapshot.entry(table_key) 
{ + Some(_) => Some(snapshot.open(table_key).await?), + None => None, + }; + Self::from_dataset(dataset).await + } + + async fn from_dataset(dataset: Option) -> Result { + let stream = if let Some(ds) = dataset { + Some(Box::pin( + crate::table_store::TableStore::scan_stream( + &ds, + None, + None, + Some(vec![ColumnOrdering::asc_nulls_last("id".to_string())]), + false, + ) + .await?, + )) + } else { + None + }; + + Ok(Self { + stream, + current_batch: None, + current_row: 0, + peeked: None, + }) + } + + async fn peek_cloned(&mut self) -> Result> { + if self.peeked.is_none() { + self.peeked = self.next_row().await?; + } + Ok(self.peeked.clone()) + } + + async fn pop(&mut self) -> Result> { + if self.peeked.is_some() { + return Ok(self.peeked.take()); + } + self.next_row().await + } + + async fn next_row(&mut self) -> Result> { + loop { + if let Some(batch) = &self.current_batch { + if self.current_row < batch.num_rows() { + let row_index = self.current_row; + self.current_row += 1; + return Ok(Some(CursorRow { + id: row_id_at(batch, row_index)?, + signature: row_signature(batch, row_index)?, + batch: batch.clone(), + row_index, + })); + } + } + + let Some(stream) = self.stream.as_mut() else { + return Ok(None); + }; + match stream.try_next().await { + Ok(Some(batch)) => { + self.current_batch = Some(batch); + self.current_row = 0; + } + Ok(None) => { + self.stream = None; + self.current_batch = None; + return Ok(None); + } + Err(err) => return Err(OmniError::Lance(err.to_string())), + } + } + } +} + +struct StagedTableWriter { + schema: SchemaRef, + dataset_uri: String, + dir: TempDir, + dataset: Option, + buffered_rows: usize, + row_count: u64, + batches: Vec, +} + +impl StagedTableWriter { + fn new(table_key: &str, schema: SchemaRef) -> Result { + let dir = merge_stage_tempdir(table_key)?; + let dataset_uri = dir.path().join("table.lance").to_string_lossy().to_string(); + Ok(Self { + schema, + dataset_uri, + dir, + dataset: None, + buffered_rows: 0, + row_count: 
0, + batches: Vec::new(), + }) + } + + async fn push_row(&mut self, row: &CursorRow) -> Result<()> { + self.row_count += 1; + self.buffered_rows += 1; + self.batches.push(row.batch.slice(row.row_index, 1)); + if self.buffered_rows >= MERGE_STAGE_BATCH_ROWS { + self.flush().await?; + } + Ok(()) + } + + async fn finish(mut self) -> Result { + self.flush().await?; + if self.dataset.is_none() { + self.dataset = Some( + crate::table_store::TableStore::create_empty_dataset( + &self.dataset_uri, + &self.schema, + ) + .await?, + ); + } + Ok(StagedTable { + _dir: self.dir, + dataset: self.dataset.unwrap(), + }) + } + + async fn flush(&mut self) -> Result<()> { + if self.batches.is_empty() { + return Ok(()); + } + + let batch = if self.batches.len() == 1 { + self.batches.pop().unwrap() + } else { + let batches = std::mem::take(&mut self.batches); + arrow_select::concat::concat_batches(&self.schema, &batches) + .map_err(|e| OmniError::Lance(e.to_string()))? + }; + self.buffered_rows = 0; + + let ds = crate::table_store::TableStore::append_or_create_batch( + &self.dataset_uri, + self.dataset.take(), + batch, + ) + .await?; + self.dataset = Some(ds); + Ok(()) + } +} + +fn merge_stage_tempdir(table_key: &str) -> Result { + if let Ok(root) = env::var(MERGE_STAGE_DIR_ENV) { + return TempDirBuilder::new() + .prefix(&format!( + "omnigraph-merge-{}-", + sanitize_table_key(table_key) + )) + .tempdir_in(PathBuf::from(root)) + .map_err(OmniError::from); + } + TempDirBuilder::new() + .prefix(&format!( + "omnigraph-merge-{}-", + sanitize_table_key(table_key) + )) + .tempdir() + .map_err(OmniError::from) +} + +fn sanitize_table_key(table_key: &str) -> String { + table_key + .chars() + .map(|ch| match ch { + ':' | '/' | '\\' => '-', + other => other, + }) + .collect() +} + +/// Computes the delta between base and source for an adopted-source merge. +/// Returns the changed/new rows (for merge_insert) and deleted IDs (for delete). 
+async fn compute_source_delta( + table_key: &str, + catalog: &Catalog, + base_snapshot: &Snapshot, + source_snapshot: &Snapshot, +) -> Result> { + let schema = schema_for_table_key(catalog, table_key)?; + let mut full_writer = + StagedTableWriter::new(&format!("{}_adopt_full", table_key), schema.clone())?; + let mut delta_writer = StagedTableWriter::new(&format!("{}_adopt_delta", table_key), schema)?; + let mut deleted_ids: Vec = Vec::new(); + let mut base = OrderedTableCursor::from_snapshot(base_snapshot, table_key).await?; + let mut source = OrderedTableCursor::from_snapshot(source_snapshot, table_key).await?; + + let mut needs_update = false; + + loop { + let base_row = base.peek_cloned().await?; + let source_row = source.peek_cloned().await?; + + let next_id = [base_row.as_ref(), source_row.as_ref()] + .into_iter() + .flatten() + .map(|row| row.id.clone()) + .min(); + let Some(next_id) = next_id else { break }; + + let base_row = if base_row.as_ref().map(|r| r.id.as_str()) == Some(next_id.as_str()) { + base.pop().await? + } else { + None + }; + let source_row = if source_row.as_ref().map(|r| r.id.as_str()) == Some(next_id.as_str()) { + source.pop().await? 
+ } else { + None + }; + + let base_sig = base_row.as_ref().map(|r| r.signature.as_str()); + let source_sig = source_row.as_ref().map(|r| r.signature.as_str()); + + match (&base_row, &source_row) { + (Some(_), None) => { + // Deleted on source + deleted_ids.push(next_id); + needs_update = true; + } + (None, Some(src)) => { + // New on source + full_writer.push_row(src).await?; + delta_writer.push_row(src).await?; + needs_update = true; + } + (Some(_), Some(src)) if source_sig != base_sig => { + // Changed on source + full_writer.push_row(src).await?; + delta_writer.push_row(src).await?; + needs_update = true; + } + (Some(base), Some(_)) => { + // Unchanged — write to full (for validation), skip delta + full_writer.push_row(base).await?; + } + (None, None) => unreachable!(), + } + } + + if !needs_update { + return Ok(None); + } + + let delta_staged = if delta_writer.row_count > 0 { + Some(delta_writer.finish().await?) + } else { + None + }; + + Ok(Some(StagedMergeResult { + full_staged: full_writer.finish().await?, + delta_staged, + deleted_ids, + })) +} + +fn min_cursor_id( + base_row: &Option, + source_row: &Option, + target_row: &Option, +) -> Option { + [base_row.as_ref(), source_row.as_ref(), target_row.as_ref()] + .into_iter() + .flatten() + .map(|row| row.id.clone()) + .min() +} + +async fn stage_streaming_table_merge( + table_key: &str, + catalog: &Catalog, + base_snapshot: &Snapshot, + source_snapshot: &Snapshot, + target_snapshot: &Snapshot, + conflicts: &mut Vec, +) -> Result> { + let schema = schema_for_table_key(catalog, table_key)?; + let mut full_writer = StagedTableWriter::new(&format!("{}_full", table_key), schema.clone())?; + let mut delta_writer = StagedTableWriter::new(&format!("{}_delta", table_key), schema)?; + let mut deleted_ids: Vec = Vec::new(); + let mut base = OrderedTableCursor::from_snapshot(base_snapshot, table_key).await?; + let mut source = OrderedTableCursor::from_snapshot(source_snapshot, table_key).await?; + let mut target = 
OrderedTableCursor::from_snapshot(target_snapshot, table_key).await?; + + let prior_conflict_count = conflicts.len(); + let mut needs_update = false; + + loop { + let base_row = base.peek_cloned().await?; + let source_row = source.peek_cloned().await?; + let target_row = target.peek_cloned().await?; + let Some(next_id) = min_cursor_id(&base_row, &source_row, &target_row) else { + break; + }; + + let base_row = if base_row.as_ref().map(|row| row.id.as_str()) == Some(next_id.as_str()) { + base.pop().await? + } else { + None + }; + let source_row = if source_row.as_ref().map(|row| row.id.as_str()) == Some(next_id.as_str()) + { + source.pop().await? + } else { + None + }; + let target_row = if target_row.as_ref().map(|row| row.id.as_str()) == Some(next_id.as_str()) + { + target.pop().await? + } else { + None + }; + + let base_sig = base_row.as_ref().map(|row| row.signature.as_str()); + let source_sig = source_row.as_ref().map(|row| row.signature.as_str()); + let target_sig = target_row.as_ref().map(|row| row.signature.as_str()); + + let source_changed = source_sig != base_sig; + let target_changed = target_sig != base_sig; + + let selection = if !source_changed { + target_row.as_ref() + } else if !target_changed { + source_row.as_ref() + } else if source_sig == target_sig { + target_row.as_ref() + } else { + conflicts.push(classify_merge_conflict( + table_key, &next_id, base_sig, source_sig, target_sig, + )); + None + }; + + if conflicts.len() > prior_conflict_count { + continue; + } + + // Row existed in target but not in merge result → delete + if selection.is_none() && target_row.is_some() { + deleted_ids.push(next_id.clone()); + needs_update = true; + continue; + } + + if let Some(selection) = selection { + // Always write to full (for validation) + full_writer.push_row(selection).await?; + // Only write changed rows to delta (for publish) + if selection.signature.as_str() != target_sig.unwrap_or("") { + delta_writer.push_row(selection).await?; + needs_update = 
true; + } + } + } + + if conflicts.len() > prior_conflict_count { + return Ok(None); + } + if !needs_update { + return Ok(None); + } + + let delta_staged = if delta_writer.row_count > 0 { + Some(delta_writer.finish().await?) + } else { + None + }; + + Ok(Some(StagedMergeResult { + full_staged: full_writer.finish().await?, + delta_staged, + deleted_ids, + })) +} + +fn schema_for_table_key(catalog: &Catalog, table_key: &str) -> Result { + if let Some(name) = table_key.strip_prefix("node:") { + return catalog + .node_types + .get(name) + .map(|t| t.arrow_schema.clone()) + .ok_or_else(|| OmniError::manifest(format!("unknown node type '{}'", name))); + } + if let Some(name) = table_key.strip_prefix("edge:") { + return catalog + .edge_types + .get(name) + .map(|t| t.arrow_schema.clone()) + .ok_or_else(|| OmniError::manifest(format!("unknown edge type '{}'", name))); + } + Err(OmniError::manifest(format!( + "invalid table key '{}'", + table_key + ))) +} + +fn same_manifest_state( + left: Option<&crate::db::SubTableEntry>, + right: Option<&crate::db::SubTableEntry>, +) -> bool { + match (left, right) { + (Some(left), Some(right)) => { + left.table_version == right.table_version && left.table_branch == right.table_branch + } + (None, None) => true, + _ => false, + } +} + +fn classify_merge_conflict( + table_key: &str, + row_id: &str, + base_sig: Option<&str>, + source_sig: Option<&str>, + target_sig: Option<&str>, +) -> MergeConflict { + let (kind, message) = match (base_sig, source_sig, target_sig) { + (None, Some(_), Some(_)) => ( + MergeConflictKind::DivergentInsert, + format!("divergent insert for id '{}'", row_id), + ), + (Some(_), None, Some(_)) | (Some(_), Some(_), None) => ( + MergeConflictKind::DeleteVsUpdate, + format!("delete/update conflict for id '{}'", row_id), + ), + _ => ( + MergeConflictKind::DivergentUpdate, + format!("divergent update for id '{}'", row_id), + ), + }; + MergeConflict { + table_key: table_key.to_string(), + row_id: Some(row_id.to_string()), 
+ kind, + message, + } +} + +fn row_signature(batch: &RecordBatch, row: usize) -> Result { + let mut values = Vec::with_capacity(batch.num_columns()); + for column in batch.columns() { + values.push( + array_value_to_string(column.as_ref(), row) + .map_err(|e| OmniError::Lance(e.to_string()))?, + ); + } + Ok(values.join("\u{1f}")) +} + +async fn validate_merge_candidates( + db: &Omnigraph, + source_snapshot: &Snapshot, + target_snapshot: &Snapshot, + candidates: &HashMap, +) -> Result<()> { + let mut conflicts = Vec::new(); + let mut node_ids: HashMap> = HashMap::new(); + + for (type_name, node_type) in &db.catalog().node_types { + let table_key = format!("node:{}", type_name); + let mut values = HashSet::new(); + let mut unique_seen = vec![HashMap::new(); node_type.unique_constraints.len()]; + + if let Some(ds) = + candidate_dataset(source_snapshot, target_snapshot, candidates, &table_key).await? + { + let mut stream = + crate::table_store::TableStore::scan_stream(&ds, None, None, None, false).await?; + while let Some(batch) = stream + .try_next() + .await + .map_err(|e| OmniError::Lance(e.to_string()))? + { + if let Err(err) = crate::loader::validate_value_constraints(&batch, node_type) { + conflicts.push(MergeConflict { + table_key: table_key.clone(), + row_id: None, + kind: MergeConflictKind::ValueConstraintViolation, + message: err.to_string(), + }); + } + update_unique_constraints( + &table_key, + &batch, + &node_type.unique_constraints, + &mut unique_seen, + &mut conflicts, + )?; + let ids = batch + .column_by_name("id") + .ok_or_else(|| { + OmniError::manifest(format!("table {} missing id column", table_key)) + })? 
+ .as_any() + .downcast_ref::() + .ok_or_else(|| { + OmniError::manifest(format!("table {} id column is not Utf8", table_key)) + })?; + for row in 0..ids.len() { + values.insert(ids.value(row).to_string()); + } + } + } + node_ids.insert(type_name.clone(), values); + } + + for (edge_name, edge_type) in &db.catalog().edge_types { + let table_key = format!("edge:{}", edge_name); + let mut unique_seen = vec![HashMap::new(); edge_type.unique_constraints.len()]; + let mut src_counts = HashMap::new(); + + if let Some(ds) = + candidate_dataset(source_snapshot, target_snapshot, candidates, &table_key).await? + { + let mut stream = + crate::table_store::TableStore::scan_stream(&ds, None, None, None, false).await?; + while let Some(batch) = stream + .try_next() + .await + .map_err(|e| OmniError::Lance(e.to_string()))? + { + update_unique_constraints( + &table_key, + &batch, + &edge_type.unique_constraints, + &mut unique_seen, + &mut conflicts, + )?; + accumulate_edge_cardinality(&batch, &mut src_counts, &table_key)?; + conflicts.extend(validate_orphan_edges_batch( + &table_key, edge_type, &batch, &node_ids, + )?); + } + } + + conflicts.extend(finalize_edge_cardinality_conflicts( + &table_key, + edge_name, + edge_type.cardinality.min, + edge_type.cardinality.max, + src_counts, + )); + } + + if conflicts.is_empty() { + Ok(()) + } else { + Err(OmniError::MergeConflicts(conflicts)) + } +} + +async fn candidate_dataset( + source_snapshot: &Snapshot, + target_snapshot: &Snapshot, + candidates: &HashMap, + table_key: &str, +) -> Result> { + if let Some(candidate) = candidates.get(table_key) { + return match candidate { + CandidateTableState::AdoptSourceState => match source_snapshot.entry(table_key) { + Some(_) => Ok(Some(source_snapshot.open(table_key).await?)), + None => Ok(None), + }, + CandidateTableState::RewriteMerged(staged) => { + Ok(Some(staged.full_staged.dataset.clone())) + } + }; + } + match target_snapshot.entry(table_key) { + Some(_) => 
Ok(Some(target_snapshot.open(table_key).await?)), + None => Ok(None), + } +} + +fn update_unique_constraints( + table_key: &str, + batch: &RecordBatch, + constraints: &[Vec], + seen: &mut [HashMap], + conflicts: &mut Vec, +) -> Result<()> { + for (constraint_idx, columns) in constraints.iter().enumerate() { + let seen = &mut seen[constraint_idx]; + for row in 0..batch.num_rows() { + let mut parts = Vec::with_capacity(columns.len()); + let mut any_null = false; + for column_name in columns { + let column = batch.column_by_name(column_name).ok_or_else(|| { + OmniError::manifest(format!( + "table {} missing unique column '{}'", + table_key, column_name + )) + })?; + if column.is_null(row) { + any_null = true; + break; + } + parts.push( + array_value_to_string(column.as_ref(), row) + .map_err(|e| OmniError::Lance(e.to_string()))?, + ); + } + if any_null { + continue; + } + let value = parts.join("|"); + let row_id = row_id_at(batch, row)?; + if let Some(first_row_id) = seen.insert(value.clone(), row_id.clone()) { + conflicts.push(MergeConflict { + table_key: table_key.to_string(), + row_id: Some(row_id.clone()), + kind: MergeConflictKind::UniqueViolation, + message: format!( + "unique constraint {:?} violated by '{}' and '{}'", + columns, first_row_id, row_id + ), + }); + } + } + } + Ok(()) +} + +fn accumulate_edge_cardinality( + batch: &RecordBatch, + counts: &mut HashMap, + table_key: &str, +) -> Result<()> { + let srcs = batch + .column_by_name("src") + .ok_or_else(|| OmniError::manifest(format!("table {} missing src column", table_key)))? 
+ .as_any() + .downcast_ref::() + .ok_or_else(|| { + OmniError::manifest(format!("table {} src column is not Utf8", table_key)) + })?; + for row in 0..srcs.len() { + *counts.entry(srcs.value(row).to_string()).or_insert(0_u32) += 1; + } + Ok(()) +} + +fn finalize_edge_cardinality_conflicts( + table_key: &str, + edge_name: &str, + min: u32, + max: Option, + counts: HashMap, +) -> Vec { + let mut conflicts = Vec::new(); + for (src, count) in counts { + if let Some(max) = max { + if count > max { + conflicts.push(MergeConflict { + table_key: table_key.to_string(), + row_id: None, + kind: MergeConflictKind::CardinalityViolation, + message: format!( + "@card violation on edge {}: source '{}' has {} edges (max {})", + edge_name, src, count, max + ), + }); + } + } + if count < min { + conflicts.push(MergeConflict { + table_key: table_key.to_string(), + row_id: None, + kind: MergeConflictKind::CardinalityViolation, + message: format!( + "@card violation on edge {}: source '{}' has {} edges (min {})", + edge_name, src, count, min + ), + }); + } + } + conflicts +} + +fn validate_orphan_edges_batch( + table_key: &str, + edge_type: &omnigraph_compiler::catalog::EdgeType, + batch: &RecordBatch, + node_ids: &HashMap>, +) -> Result> { + let srcs = batch + .column_by_name("src") + .ok_or_else(|| OmniError::manifest(format!("table {} missing src column", table_key)))? + .as_any() + .downcast_ref::() + .ok_or_else(|| { + OmniError::manifest(format!("table {} src column is not Utf8", table_key)) + })?; + let dsts = batch + .column_by_name("dst") + .ok_or_else(|| OmniError::manifest(format!("table {} missing dst column", table_key)))? 
+ .as_any() + .downcast_ref::() + .ok_or_else(|| { + OmniError::manifest(format!("table {} dst column is not Utf8", table_key)) + })?; + + let from_ids = node_ids.get(&edge_type.from_type).ok_or_else(|| { + OmniError::manifest(format!( + "missing candidate node ids for {}", + edge_type.from_type + )) + })?; + let to_ids = node_ids.get(&edge_type.to_type).ok_or_else(|| { + OmniError::manifest(format!( + "missing candidate node ids for {}", + edge_type.to_type + )) + })?; + + let mut conflicts = Vec::new(); + for row in 0..batch.num_rows() { + let row_id = row_id_at(batch, row)?; + let src = srcs.value(row); + let dst = dsts.value(row); + if !from_ids.contains(src) { + conflicts.push(MergeConflict { + table_key: table_key.to_string(), + row_id: Some(row_id.clone()), + kind: MergeConflictKind::OrphanEdge, + message: format!("src '{}' not found in {}", src, edge_type.from_type), + }); + } + if !to_ids.contains(dst) { + conflicts.push(MergeConflict { + table_key: table_key.to_string(), + row_id: Some(row_id), + kind: MergeConflictKind::OrphanEdge, + message: format!("dst '{}' not found in {}", dst, edge_type.to_type), + }); + } + } + Ok(conflicts) +} + +fn row_id_at(batch: &RecordBatch, row: usize) -> Result { + let ids = batch + .column_by_name("id") + .ok_or_else(|| OmniError::manifest("batch missing id column".to_string()))? 
+ .as_any() + .downcast_ref::() + .ok_or_else(|| OmniError::manifest("id column is not Utf8".to_string()))?; + Ok(ids.value(row).to_string()) +} + +async fn publish_adopted_source_state( + target_db: &Omnigraph, + catalog: &Catalog, + base_snapshot: &Snapshot, + source_snapshot: &Snapshot, + target_snapshot: &Snapshot, + table_key: &str, +) -> Result { + let source_entry = source_snapshot + .entry(table_key) + .ok_or_else(|| OmniError::manifest(format!("missing source entry for {}", table_key)))?; + let target_entry = target_snapshot.entry(table_key); + + match ( + target_db.active_branch(), + source_entry.table_branch.as_deref(), + ) { + // Both on main — pointer switch is safe (same lineage, version columns valid) + (None, None) => Ok(crate::db::SubTableUpdate { + table_key: table_key.to_string(), + table_version: source_entry.table_version, + table_branch: None, + row_count: source_entry.row_count, + version_metadata: source_entry.version_metadata.clone(), + }), + // Source on main, target on branch — pointer switch to main version + // (target reads from main, same lineage) + (Some(_target_branch), None) => Ok(crate::db::SubTableUpdate { + table_key: table_key.to_string(), + table_version: source_entry.table_version, + table_branch: None, + row_count: source_entry.row_count, + version_metadata: source_entry.version_metadata.clone(), + }), + // Source on branch, target on main — apply delta to preserve version metadata + (None, Some(_source_branch)) => { + let delta = + compute_source_delta(table_key, catalog, base_snapshot, source_snapshot).await?; + match delta { + Some(staged) => publish_rewritten_merge_table(target_db, table_key, &staged).await, + None => Ok(crate::db::SubTableUpdate { + table_key: table_key.to_string(), + table_version: target_entry + .map(|e| e.table_version) + .unwrap_or(source_entry.table_version), + table_branch: None, + row_count: source_entry.row_count, + version_metadata: target_entry + .map(|entry| entry.version_metadata.clone()) + 
.unwrap_or_else(|| source_entry.version_metadata.clone()), + }), + } + } + // Both on branches + (Some(target_branch), Some(source_branch)) => { + if target_entry.and_then(|entry| entry.table_branch.as_deref()) == Some(target_branch) { + // Target already owns this table — apply delta onto its lineage + let delta = + compute_source_delta(table_key, catalog, base_snapshot, source_snapshot) + .await?; + match delta { + Some(staged) => { + publish_rewritten_merge_table(target_db, table_key, &staged).await + } + None => Ok(crate::db::SubTableUpdate { + table_key: table_key.to_string(), + table_version: target_entry.unwrap().table_version, + table_branch: Some(target_branch.to_string()), + row_count: source_entry.row_count, + version_metadata: target_entry.unwrap().version_metadata.clone(), + }), + } + } else { + // Target doesn't own this table yet — fork from source state. + // This creates the target branch on the sub-table dataset. + let full_path = format!("{}/{}", target_db.uri(), source_entry.table_path); + let ds = target_db + .fork_dataset_from_entry_state( + table_key, + &full_path, + Some(source_branch), + source_entry.table_version, + target_branch, + ) + .await?; + let state = target_db.table_store().table_state(&full_path, &ds).await?; + Ok(crate::db::SubTableUpdate { + table_key: table_key.to_string(), + table_version: state.version, + table_branch: Some(target_branch.to_string()), + row_count: state.row_count, + version_metadata: state.version_metadata, + }) + } + } + } +} + +async fn publish_rewritten_merge_table( + target_db: &Omnigraph, + table_key: &str, + staged: &StagedMergeResult, +) -> Result { + let (ds, full_path, table_branch) = target_db.open_for_mutation(table_key).await?; + let mut current_ds = ds; + + // Phase 1: merge_insert changed/new rows (preserves _row_created_at_version for + // existing rows, bumps _row_last_updated_at_version only for actually-changed rows) + if let Some(delta) = &staged.delta_staged { + let batches: Vec = 
target_db + .table_store() + .scan_batches(&delta.dataset) + .await? + .into_iter() + .filter(|batch| batch.num_rows() > 0) + .collect(); + if !batches.is_empty() { + let state = target_db + .table_store() + .merge_insert_batches( + &full_path, + current_ds, + batches, + vec!["id".to_string()], + lance::dataset::WhenMatched::UpdateAll, + lance::dataset::WhenNotMatched::InsertAll, + ) + .await?; + current_ds = target_db + .reopen_for_mutation( + table_key, + &full_path, + table_branch.as_deref(), + state.version, + ) + .await?; + } + } + + // Phase 2: delete removed rows via deletion vectors + if !staged.deleted_ids.is_empty() { + let escaped: Vec = staged + .deleted_ids + .iter() + .map(|id| format!("'{}'", id.replace('\'', "''"))) + .collect(); + let filter = format!("id IN ({})", escaped.join(", ")); + target_db + .table_store() + .delete_where(&full_path, &mut current_ds, &filter) + .await?; + } + + // Phase 3: rebuild indices + let row_count = target_db + .table_store() + .table_state(&full_path, ¤t_ds) + .await? + .row_count; + if row_count > 0 { + target_db + .build_indices_on_dataset(table_key, &mut current_ds) + .await?; + } + let final_state = target_db + .table_store() + .table_state(&full_path, ¤t_ds) + .await?; + + Ok(crate::db::SubTableUpdate { + table_key: table_key.to_string(), + table_version: final_state.version, + table_branch, + row_count: final_state.row_count, + version_metadata: final_state.version_metadata, + }) +} + +// ─── Search mode ───────────────────────────────────────────────────────────── + +/// Describes how the query's ordering changes the scan mode. +#[derive(Debug, Default)] +struct SearchMode { + /// Vector ANN search: (variable, property, query_vector, k). + nearest: Option<(String, String, Vec, usize)>, + /// BM25 full-text search: (variable, property, query_text). + bm25: Option<(String, String, String)>, + /// RRF fusion: (primary, secondary, k_constant, limit). 
+ rrf: Option, +} + +#[derive(Debug)] +struct RrfMode { + primary: Box, + secondary: Box, + k: u32, + limit: usize, +} + +/// Extract search ordering mode from the IR. +async fn extract_search_mode( + ir: &QueryIR, + params: &ParamMap, + catalog: &Catalog, +) -> Result { + if ir.order_by.is_empty() { + return Ok(SearchMode::default()); + } + let ordering = &ir.order_by[0]; + match &ordering.expr { + IRExpr::Nearest { + variable, + property, + query, + } => { + let vec = + resolve_nearest_query_vec(ir, catalog, variable, property, query, params).await?; + let k = ir.limit.ok_or_else(|| { + OmniError::manifest("nearest() ordering requires a limit clause".to_string()) + })? as usize; + Ok(SearchMode { + nearest: Some((variable.clone(), property.clone(), vec, k)), + ..Default::default() + }) + } + IRExpr::Bm25 { field, query } => { + let var = match field.as_ref() { + IRExpr::PropAccess { variable, .. } => variable.clone(), + _ => { + return Err(OmniError::manifest( + "bm25 field must be a property access".to_string(), + )); + } + }; + let prop = extract_property(field).ok_or_else(|| { + OmniError::manifest("bm25 field must be a property access".to_string()) + })?; + let text = resolve_to_string(query, params).ok_or_else(|| { + OmniError::manifest("bm25 query must resolve to a string".to_string()) + })?; + Ok(SearchMode { + bm25: Some((var, prop, text)), + ..Default::default() + }) + } + IRExpr::Rrf { + primary, + secondary, + k, + } => { + let limit = ir.limit.ok_or_else(|| { + OmniError::manifest("rrf() ordering requires a limit clause".to_string()) + })? 
as usize; + let k_val = k + .as_ref() + .and_then(|e| resolve_to_int(e, params)) + .unwrap_or(60) as u32; + + let primary_mode = + extract_sub_search_mode(ir, primary, params, catalog, ir.limit).await?; + let secondary_mode = + extract_sub_search_mode(ir, secondary, params, catalog, ir.limit).await?; + + Ok(SearchMode { + rrf: Some(RrfMode { + primary: Box::new(primary_mode), + secondary: Box::new(secondary_mode), + k: k_val, + limit, + }), + ..Default::default() + }) + } + _ => Ok(SearchMode::default()), + } +} + +/// Extract a sub-search mode from a nested RRF expression (nearest or bm25). +async fn extract_sub_search_mode( + ir: &QueryIR, + expr: &IRExpr, + params: &ParamMap, + catalog: &Catalog, + limit: Option, +) -> Result { + match expr { + IRExpr::Nearest { + variable, + property, + query, + } => { + let vec = + resolve_nearest_query_vec(ir, catalog, variable, property, query, params).await?; + let k = limit.unwrap_or(100) as usize; + Ok(SearchMode { + nearest: Some((variable.clone(), property.clone(), vec, k)), + ..Default::default() + }) + } + IRExpr::Bm25 { field, query } => { + let var = match field.as_ref() { + IRExpr::PropAccess { variable, .. } => variable.clone(), + _ => { + return Err(OmniError::manifest( + "bm25 field must be a property access".to_string(), + )); + } + }; + let prop = extract_property(field).ok_or_else(|| { + OmniError::manifest("bm25 field must be a property access".to_string()) + })?; + let text = resolve_to_string(query, params).ok_or_else(|| { + OmniError::manifest("bm25 query must resolve to a string".to_string()) + })?; + Ok(SearchMode { + bm25: Some((var, prop, text)), + ..Default::default() + }) + } + _ => Ok(SearchMode::default()), + } +} + +/// Resolve an expression to a nearest() query vector. 
+async fn resolve_nearest_query_vec( + ir: &QueryIR, + catalog: &Catalog, + variable: &str, + property: &str, + expr: &IRExpr, + params: &ParamMap, +) -> Result> { + let lit = resolve_literal_or_param(expr, params)?; + match lit { + Literal::List(_) => literal_to_f32_vec(&lit), + Literal::String(text) => { + let expected_dim = nearest_property_dimension(ir, catalog, variable, property)?; + EmbeddingClient::from_env()? + .embed_query_text(&text, expected_dim) + .await + } + _ => Err(OmniError::manifest( + "nearest query must be a string or list of floats".to_string(), + )), + } +} + +fn resolve_literal_or_param(expr: &IRExpr, params: &ParamMap) -> Result { + Ok(match expr { + IRExpr::Literal(lit) => lit.clone(), + IRExpr::Param(name) => params + .get(name) + .cloned() + .ok_or_else(|| OmniError::manifest(format!("parameter '{}' not provided", name)))?, + _ => { + return Err(OmniError::manifest( + "nearest query must be a literal or parameter".to_string(), + )); + } + }) +} + +/// Resolve a literal vector expression to a Vec. 
+fn literal_to_f32_vec(lit: &Literal) -> Result> { + match lit { + Literal::List(items) => items + .iter() + .map(|item| match item { + Literal::Float(f) => Ok(*f as f32), + Literal::Integer(n) => Ok(*n as f32), + _ => Err(OmniError::manifest( + "vector elements must be numeric".to_string(), + )), + }) + .collect(), + _ => Err(OmniError::manifest( + "nearest query must be a list of floats".to_string(), + )), + } +} + +fn nearest_property_dimension( + ir: &QueryIR, + catalog: &Catalog, + variable: &str, + property: &str, +) -> Result { + let type_name = resolve_binding_type_name(&ir.pipeline, variable).ok_or_else(|| { + OmniError::manifest_internal(format!( + "nearest() variable '${}' is not bound to a node type in the lowered pipeline", + variable + )) + })?; + let node_type = catalog.node_types.get(type_name).ok_or_else(|| { + OmniError::manifest_internal(format!( + "nearest() binding '${}' resolved unknown node type '{}'", + variable, type_name + )) + })?; + let prop = node_type.properties.get(property).ok_or_else(|| { + OmniError::manifest_internal(format!( + "nearest() property '{}.{}' is missing from the catalog", + type_name, property + )) + })?; + match prop.scalar { + ScalarType::Vector(dim) if !prop.list => Ok(dim as usize), + _ => Err(OmniError::manifest_internal(format!( + "nearest() property '{}.{}' is not a scalar vector", + type_name, property + ))), + } +} + +fn resolve_binding_type_name<'a>(pipeline: &'a [IROp], variable: &str) -> Option<&'a str> { + for op in pipeline { + match op { + IROp::NodeScan { + variable: bound_var, + type_name, + .. + } if bound_var == variable => return Some(type_name.as_str()), + IROp::Expand { + dst_var, dst_type, .. + } if dst_var == variable => return Some(dst_type.as_str()), + IROp::AntiJoin { inner, .. } => { + if let Some(type_name) = resolve_binding_type_name(inner, variable) { + return Some(type_name); + } + } + _ => {} + } + } + None +} + +/// Execute a lowered QueryIR. Pure function — no state, no caches. 
+pub async fn execute_query( + ir: &QueryIR, + params: &ParamMap, + snapshot: &Snapshot, + graph_index: Option<&GraphIndex>, + catalog: &Catalog, +) -> Result { + let search_mode = extract_search_mode(ir, params, catalog).await?; + + // RRF requires forked execution + if let Some(ref rrf) = search_mode.rrf { + return execute_rrf_query(ir, params, snapshot, graph_index, catalog, rrf).await; + } + + let mut bindings: HashMap = HashMap::new(); + + execute_pipeline( + &ir.pipeline, + params, + snapshot, + graph_index, + catalog, + &mut bindings, + &search_mode, + ) + .await?; + + // Project return expressions + let mut result_batch = project_return(&bindings, &ir.return_exprs, params)?; + + // Apply ordering (skip if search mode already ordered the results) + if !ir.order_by.is_empty() && !is_search_ordered(&search_mode) { + result_batch = apply_ordering(result_batch, &ir.order_by, &bindings, params)?; + } + + // Apply limit + if let Some(limit) = ir.limit { + let len = result_batch.num_rows().min(limit as usize); + result_batch = result_batch.slice(0, len); + } + + Ok(QueryResult::new(result_batch.schema(), vec![result_batch])) +} + +/// Check if the search mode already returns results in the correct order. +fn is_search_ordered(search_mode: &SearchMode) -> bool { + search_mode.nearest.is_some() || search_mode.bm25.is_some() +} + +/// Execute a query with RRF (Reciprocal Rank Fusion) ordering. 
+async fn execute_rrf_query( + ir: &QueryIR, + params: &ParamMap, + snapshot: &Snapshot, + graph_index: Option<&GraphIndex>, + catalog: &Catalog, + rrf: &RrfMode, +) -> Result { + // Execute primary search + let mut primary_bindings: HashMap = HashMap::new(); + execute_pipeline( + &ir.pipeline, + params, + snapshot, + graph_index, + catalog, + &mut primary_bindings, + &rrf.primary, + ) + .await?; + + // Execute secondary search + let mut secondary_bindings: HashMap = HashMap::new(); + execute_pipeline( + &ir.pipeline, + params, + snapshot, + graph_index, + catalog, + &mut secondary_bindings, + &rrf.secondary, + ) + .await?; + + // For RRF, we need to find the main binding variable + // (the one that both searches operate on) + let primary_var = rrf + .primary + .nearest + .as_ref() + .map(|(v, ..)| v.as_str()) + .or_else(|| rrf.primary.bm25.as_ref().map(|(v, ..)| v.as_str())) + .ok_or_else(|| OmniError::manifest("rrf primary must be nearest or bm25".to_string()))?; + + let primary_batch = primary_bindings.get(primary_var).ok_or_else(|| { + OmniError::manifest(format!( + "rrf primary variable '{}' not in bindings", + primary_var + )) + })?; + let secondary_batch = secondary_bindings.get(primary_var).ok_or_else(|| { + OmniError::manifest(format!( + "rrf secondary variable '{}' not in bindings", + primary_var + )) + })?; + + // Build ID → rank maps + let primary_ids = extract_id_column(primary_batch)?; + let secondary_ids = extract_id_column(secondary_batch)?; + + let mut primary_rank: HashMap = HashMap::new(); + for (i, id) in primary_ids.iter().enumerate() { + primary_rank.entry(id.clone()).or_insert(i); + } + let mut secondary_rank: HashMap = HashMap::new(); + for (i, id) in secondary_ids.iter().enumerate() { + secondary_rank.entry(id.clone()).or_insert(i); + } + + // Collect all unique IDs + let mut all_ids: Vec = primary_ids.clone(); + for id in &secondary_ids { + if !primary_rank.contains_key(id) { + all_ids.push(id.clone()); + } + } + + // Compute RRF scores + 
let k = rrf.k as f64; + let mut scored: Vec<(String, f64)> = all_ids + .iter() + .map(|id| { + let p = primary_rank + .get(id) + .map(|&r| 1.0 / (k + r as f64 + 1.0)) + .unwrap_or(0.0); + let s = secondary_rank + .get(id) + .map(|&r| 1.0 / (k + r as f64 + 1.0)) + .unwrap_or(0.0); + (id.clone(), p + s) + }) + .collect(); + scored.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal)); + scored.truncate(rrf.limit); + + // Collect winning IDs in order — look up rows from primary or secondary batch + let winning_ids: Vec = scored.iter().map(|(id, _)| id.clone()).collect(); + + // Build a combined row source: merge primary and secondary by id + let mut id_to_batch_row: HashMap = HashMap::new(); + for (i, id) in primary_ids.iter().enumerate() { + id_to_batch_row + .entry(id.clone()) + .or_insert((primary_batch, i)); + } + for (i, id) in secondary_ids.iter().enumerate() { + id_to_batch_row + .entry(id.clone()) + .or_insert((secondary_batch, i)); + } + + // Reconstruct a combined batch for the binding in winning order + let fused_batch = build_fused_batch(&winning_ids, &id_to_batch_row, primary_batch.schema())?; + + // Replace the binding and project + let mut fused_bindings = primary_bindings; + fused_bindings.insert(primary_var.to_string(), fused_batch); + + let result_batch = project_return(&fused_bindings, &ir.return_exprs, params)?; + + // Already ordered by RRF score + already limited + Ok(QueryResult::new(result_batch.schema(), vec![result_batch])) +} + +fn extract_id_column(batch: &RecordBatch) -> Result> { + let col = batch + .column_by_name("id") + .ok_or_else(|| OmniError::manifest("batch missing 'id' column for RRF".to_string()))?; + let ids = col + .as_any() + .downcast_ref::() + .ok_or_else(|| OmniError::manifest("'id' column is not Utf8".to_string()))?; + Ok((0..ids.len()).map(|i| ids.value(i).to_string()).collect()) +} + +fn build_fused_batch( + ordered_ids: &[String], + id_to_batch_row: &HashMap, + schema: SchemaRef, +) -> Result { + if 
ordered_ids.is_empty() { + return Ok(RecordBatch::new_empty(schema)); + } + + // Gather indices from source batches, collecting rows in the right order + let mut row_slices: Vec = Vec::with_capacity(ordered_ids.len()); + for id in ordered_ids { + if let Some(&(batch, row_idx)) = id_to_batch_row.get(id) { + row_slices.push(batch.slice(row_idx, 1)); + } + } + + if row_slices.is_empty() { + return Ok(RecordBatch::new_empty(schema)); + } + + let schema = row_slices[0].schema(); + arrow_select::concat::concat_batches(&schema, &row_slices) + .map_err(|e| OmniError::Lance(e.to_string())) +} + +/// Check if a filter is a text search filter that needs Lance SQL pushdown. +fn is_search_filter(filter: &IRFilter) -> bool { + matches!( + &filter.left, + IRExpr::Search { .. } | IRExpr::Fuzzy { .. } | IRExpr::MatchText { .. } + ) +} + +/// Extract the variable name from a search filter's field expression. +fn search_filter_variable(filter: &IRFilter) -> Option<&str> { + let field = match &filter.left { + IRExpr::Search { field, .. } => field, + IRExpr::Fuzzy { field, .. } => field, + IRExpr::MatchText { field, .. } => field, + _ => return None, + }; + match field.as_ref() { + IRExpr::PropAccess { variable, .. 
} => Some(variable.as_str()), + _ => None, + } +} + +fn execute_pipeline<'a>( + pipeline: &'a [IROp], + params: &'a ParamMap, + snapshot: &'a Snapshot, + graph_index: Option<&'a GraphIndex>, + catalog: &'a Catalog, + bindings: &'a mut HashMap, + search_mode: &'a SearchMode, +) -> std::pin::Pin> + Send + 'a>> { + Box::pin(async move { + // Pre-pass: collect search filters that need to be hoisted to NodeScan + let mut hoisted_search_filters: HashMap> = HashMap::new(); + let mut hoisted_indices: HashSet = HashSet::new(); + for (i, op) in pipeline.iter().enumerate() { + if let IROp::Filter(filter) = op { + if is_search_filter(filter) { + if let Some(var) = search_filter_variable(filter) { + hoisted_search_filters + .entry(var.to_string()) + .or_default() + .push(filter.clone()); + hoisted_indices.insert(i); + } + } + } + } + + for (i, op) in pipeline.iter().enumerate() { + // Skip hoisted search filters + if hoisted_indices.contains(&i) { + continue; + } + match op { + IROp::NodeScan { + variable, + type_name, + filters, + } => { + // Merge inline filters with hoisted search filters + let mut all_filters: Vec = filters.clone(); + if let Some(extra) = hoisted_search_filters.get(variable) { + all_filters.extend(extra.iter().cloned()); + } + let batch = execute_node_scan( + type_name, + variable, + &all_filters, + params, + snapshot, + catalog, + search_mode, + ) + .await?; + bindings.insert(variable.clone(), batch); + } + IROp::Filter(filter) => { + apply_filter(bindings, filter, params)?; + } + IROp::Expand { + src_var, + dst_var, + edge_type, + direction, + dst_type, + min_hops, + max_hops, + } => { + let gi = graph_index.ok_or_else(|| { + OmniError::manifest("graph index required for traversal".to_string()) + })?; + let batch = execute_expand( + bindings, gi, snapshot, catalog, src_var, dst_var, edge_type, *direction, + dst_type, *min_hops, *max_hops, + ) + .await?; + bindings.insert(dst_var.clone(), batch); + } + IROp::AntiJoin { outer_var, inner } => { + let gi = 
graph_index; + execute_anti_join(bindings, inner, params, snapshot, gi, catalog, outer_var) + .await?; + } + } + } + Ok(()) + }) +} + +/// Execute a graph traversal (Expand). +async fn execute_expand( + bindings: &HashMap, + graph_index: &GraphIndex, + snapshot: &Snapshot, + catalog: &Catalog, + src_var: &str, + _dst_var: &str, + edge_type: &str, + direction: Direction, + dst_type: &str, + min_hops: u32, + max_hops: Option, +) -> Result { + let src_batch = bindings.get(src_var).ok_or_else(|| { + OmniError::manifest(format!("expand references unbound variable '{}'", src_var)) + })?; + + let src_ids = src_batch + .column_by_name("id") + .ok_or_else(|| OmniError::manifest("source batch missing 'id' column".to_string()))? + .as_any() + .downcast_ref::() + .ok_or_else(|| OmniError::manifest("source 'id' column is not Utf8".to_string()))?; + + // Determine which type index to use for source and destination + let edge_def = catalog + .edge_types + .get(edge_type) + .ok_or_else(|| OmniError::manifest(format!("unknown edge type '{}'", edge_type)))?; + + let (src_type_name, dst_type_name) = match direction { + Direction::Out => (&edge_def.from_type, &edge_def.to_type), + Direction::In => (&edge_def.to_type, &edge_def.from_type), + }; + + let src_type_idx = graph_index + .type_index(src_type_name) + .ok_or_else(|| OmniError::manifest(format!("no type index for '{}'", src_type_name)))?; + let dst_type_idx = graph_index + .type_index(dst_type_name) + .ok_or_else(|| OmniError::manifest(format!("no type index for '{}'", dst_type_name)))?; + + let adj = match direction { + Direction::Out => graph_index.csr(edge_type), + Direction::In => graph_index.csc(edge_type), + } + .ok_or_else(|| OmniError::manifest(format!("no adjacency index for edge '{}'", edge_type)))?; + + let max = max_hops.unwrap_or(min_hops.max(1)); + + let same_type = src_type_name == dst_type_name; + + // BFS to collect reachable destination dense IDs + let mut result_dst_ids: Vec = Vec::new(); + for i in 
0..src_ids.len() { + let src_id = src_ids.value(i); + let Some(src_dense) = src_type_idx.to_dense(src_id) else { + continue; + }; + + // BFS with hop tracking + let mut frontier: Vec = vec![src_dense]; + let mut visited: HashSet = HashSet::new(); + let mut seen_dst_ids: HashSet = HashSet::new(); + // Only track visited in the destination namespace for same-type edges + // (to avoid revisiting the source). For cross-type edges, dense indices + // are in different namespaces so collision is impossible. + if same_type { + visited.insert(src_dense); + } + + for hop in 1..=max { + let mut next_frontier = Vec::new(); + for &node in &frontier { + for &neighbor in adj.neighbors(node) { + if !same_type || visited.insert(neighbor) { + next_frontier.push(neighbor); + if hop >= min_hops { + if let Some(dst_id) = dst_type_idx.to_id(neighbor) { + let dst_id = dst_id.to_string(); + if seen_dst_ids.insert(dst_id.clone()) { + result_dst_ids.push(dst_id); + } + } + } + } + } + } + frontier = next_frontier; + if frontier.is_empty() { + break; + } + } + } + + // Hydrate destination nodes from the snapshot + hydrate_nodes(snapshot, catalog, dst_type, &result_dst_ids).await +} + +/// Load full node rows for a set of IDs from a snapshot. 
+async fn hydrate_nodes( + snapshot: &Snapshot, + catalog: &Catalog, + type_name: &str, + ids: &[String], +) -> Result { + let node_type = catalog + .node_types + .get(type_name) + .ok_or_else(|| OmniError::manifest(format!("unknown node type '{}'", type_name)))?; + + if ids.is_empty() { + return Ok(RecordBatch::new_empty(node_type.arrow_schema.clone())); + } + + let table_key = format!("node:{}", type_name); + let ds = snapshot.open(&table_key).await?; + + // Build filter: id IN ('a', 'b', 'c') + let escaped: Vec = ids + .iter() + .map(|id| format!("'{}'", id.replace('\'', "''"))) + .collect(); + let filter_sql = format!("id IN ({})", escaped.join(", ")); + let has_blobs = !node_type.blob_properties.is_empty(); + let non_blob_cols: Vec<&str> = node_type + .arrow_schema + .fields() + .iter() + .filter(|f| !node_type.blob_properties.contains(f.name())) + .map(|f| f.name().as_str()) + .collect(); + let projection = has_blobs.then_some(non_blob_cols.as_slice()); + let batches = crate::table_store::TableStore::scan_stream( + &ds, + projection, + Some(&filter_sql), + None, + false, + ) + .await? + .try_collect::>() + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + + let scan_result = if batches.is_empty() { + return Ok(RecordBatch::new_empty(node_type.arrow_schema.clone())); + } else if batches.len() == 1 { + batches.into_iter().next().unwrap() + } else { + let schema = batches[0].schema(); + arrow_select::concat::concat_batches(&schema, &batches) + .map_err(|e| OmniError::Lance(e.to_string()))? + }; + + if has_blobs { + return add_null_blob_columns(&scan_result, node_type); + } + Ok(scan_result) +} + +/// Try bulk anti-join via CSR existence check. Returns Some if the inner +/// pipeline is a single Expand from outer_var (the common negation pattern). 
+fn try_bulk_anti_join( + outer_batch: &RecordBatch, + inner_pipeline: &[IROp], + graph_index: Option<&GraphIndex>, + catalog: &Catalog, + outer_var: &str, +) -> Option> { + if inner_pipeline.len() != 1 { + return None; + } + let IROp::Expand { + src_var, + edge_type, + direction, + .. + } = &inner_pipeline[0] + else { + return None; + }; + if src_var != outer_var { + return None; + } + let gi = graph_index?; + let edge_def = catalog.edge_types.get(edge_type.as_str())?; + + let src_type_name = match direction { + Direction::Out => &edge_def.from_type, + Direction::In => &edge_def.to_type, + }; + let adj = match direction { + Direction::Out => gi.csr(edge_type), + Direction::In => gi.csc(edge_type), + }?; + let type_idx = gi.type_index(src_type_name)?; + + let outer_ids = outer_batch + .column_by_name("id")? + .as_any() + .downcast_ref::()?; + + let keep_mask: Vec = (0..outer_ids.len()) + .map(|i| { + let id = outer_ids.value(i); + match type_idx.to_dense(id) { + Some(dense) => !adj.has_neighbors(dense), + None => true, // not in graph index = no edges = keep + } + }) + .collect(); + + let mask = BooleanArray::from(keep_mask); + Some( + arrow_select::filter::filter_record_batch(outer_batch, &mask) + .map_err(|e| OmniError::Lance(e.to_string())), + ) +} + +/// Execute an AntiJoin: remove rows from outer_var where the inner pipeline finds matches. 
+async fn execute_anti_join( + bindings: &mut HashMap, + inner_pipeline: &[IROp], + params: &ParamMap, + snapshot: &Snapshot, + graph_index: Option<&GraphIndex>, + catalog: &Catalog, + outer_var: &str, +) -> Result<()> { + let outer_batch = bindings.get(outer_var).ok_or_else(|| { + OmniError::manifest(format!( + "anti-join references unbound variable '{}'", + outer_var + )) + })?; + + // Fast path: bulk CSR existence check (O(N), zero Lance I/O) + if let Some(result) = + try_bulk_anti_join(outer_batch, inner_pipeline, graph_index, catalog, outer_var) + { + bindings.insert(outer_var.to_string(), result?); + return Ok(()); + } + + // Slow path: per-row inner pipeline execution + let outer_ids = outer_batch + .column_by_name("id") + .ok_or_else(|| OmniError::manifest("outer batch missing 'id' column".to_string()))? + .as_any() + .downcast_ref::() + .ok_or_else(|| OmniError::manifest("outer 'id' column is not Utf8".to_string()))?; + + let mut keep_mask = vec![true; outer_batch.num_rows()]; + + for i in 0..outer_ids.len() { + let single_row = outer_batch.slice(i, 1); + let mut inner_bindings: HashMap = HashMap::new(); + inner_bindings.insert(outer_var.to_string(), single_row); + + let no_search = SearchMode::default(); + execute_pipeline( + inner_pipeline, + params, + snapshot, + graph_index, + catalog, + &mut inner_bindings, + &no_search, + ) + .await?; + + let has_match = inner_bindings + .iter() + .filter(|(k, _)| *k != outer_var) + .any(|(_, batch)| batch.num_rows() > 0); + + if has_match { + keep_mask[i] = false; + } + } + + let mask = BooleanArray::from(keep_mask); + let filtered = arrow_select::filter::filter_record_batch(outer_batch, &mask) + .map_err(|e| OmniError::Lance(e.to_string()))?; + + bindings.insert(outer_var.to_string(), filtered); + Ok(()) +} + +/// Scan a node type's Lance dataset with optional filter pushdown and search modes. 
+async fn execute_node_scan( + type_name: &str, + variable: &str, + filters: &[IRFilter], + params: &ParamMap, + snapshot: &Snapshot, + catalog: &Catalog, + search_mode: &SearchMode, +) -> Result { + let table_key = format!("node:{}", type_name); + let ds = snapshot.open(&table_key).await?; + + // Build Lance SQL filter string from non-search IR filters + let filter_sql = build_lance_filter(filters, params); + + // Blob columns must be excluded from scan when a filter is present + // (Lance bug: BlobsDescriptions + filter triggers a projection assertion). + // We exclude blob columns and add metadata post-scan via take_blobs_by_indices. + let node_type = &catalog.node_types[type_name]; + let has_blobs = !node_type.blob_properties.is_empty(); + let non_blob_cols: Vec<&str> = node_type + .arrow_schema + .fields() + .iter() + .filter(|f| !node_type.blob_properties.contains(f.name())) + .map(|f| f.name().as_str()) + .collect(); + let projection = has_blobs.then_some(non_blob_cols.as_slice()); + let batches = crate::table_store::TableStore::scan_stream_with( + &ds, + projection, + filter_sql.as_deref(), + None, + false, + |scanner| { + // Apply FTS queries from hoisted search filters (search/fuzzy/match_text in match clause) + for filter in filters { + if is_search_filter(filter) { + if let Some(fts_query) = build_fts_query(&filter.left, params) { + scanner.full_text_search(fts_query).map_err(|e| { + OmniError::Lance(format!("full_text_search filter: {}", e)) + })?; + } + } + } + + // Apply nearest vector search if this variable is the target + if let Some((ref var, ref prop, ref vec, k)) = search_mode.nearest { + if var == variable { + let query_arr = Float32Array::from(vec.clone()); + scanner + .nearest(prop, &query_arr, k) + .map_err(|e| OmniError::Lance(format!("nearest: {}", e)))?; + } + } + + // Apply BM25 full-text search if this variable is the target + if let Some((ref var, ref prop, ref text)) = search_mode.bm25 { + if var == variable { + let fts_query = 
lance_index::scalar::FullTextSearchQuery::new(text.clone()) + .with_column(prop.clone()) + .map_err(|e| OmniError::Lance(format!("fts with_column: {}", e)))?; + scanner + .full_text_search(fts_query) + .map_err(|e| OmniError::Lance(format!("full_text_search: {}", e)))?; + } + } + Ok(()) + }, + ) + .await? + .try_collect::>() + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + + let scan_result = if batches.is_empty() { + RecordBatch::new_empty(batches.first().map(|b| b.schema()).unwrap_or_else(|| { + // Build a non-blob schema for empty result + let fields: Vec<_> = node_type + .arrow_schema + .fields() + .iter() + .filter(|f| !node_type.blob_properties.contains(f.name())) + .map(|f| f.as_ref().clone()) + .collect(); + Arc::new(Schema::new(fields)) + })) + } else if batches.len() == 1 { + batches.into_iter().next().unwrap() + } else { + let schema = batches[0].schema(); + arrow_select::concat::concat_batches(&schema, &batches) + .map_err(|e| OmniError::Lance(e.to_string()))? + }; + + // Add null placeholder columns for excluded blob properties + if has_blobs { + return add_null_blob_columns(&scan_result, node_type); + } + Ok(scan_result) +} + +/// Add null Utf8 columns for blob properties excluded from a scan. +/// Uses column_by_name (not positional) so it's order-independent. 
+fn add_null_blob_columns( + batch: &RecordBatch, + node_type: &omnigraph_compiler::catalog::NodeType, +) -> Result { + let num_rows = batch.num_rows(); + let mut fields = Vec::with_capacity(node_type.arrow_schema.fields().len()); + let mut columns: Vec = Vec::with_capacity(node_type.arrow_schema.fields().len()); + + for field in node_type.arrow_schema.fields() { + if node_type.blob_properties.contains(field.name()) { + fields.push(Field::new(field.name(), DataType::Utf8, true)); + columns.push(Arc::new(StringArray::from(vec![None::<&str>; num_rows]))); + } else if let Some(col) = batch.column_by_name(field.name()) { + let batch_schema = batch.schema(); + let batch_field = batch_schema + .field_with_name(field.name()) + .map_err(|e| OmniError::Lance(e.to_string()))?; + fields.push(batch_field.clone()); + columns.push(col.clone()); + } + } + + RecordBatch::try_new(Arc::new(Schema::new(fields)), columns) + .map_err(|e| OmniError::Lance(e.to_string())) +} + +/// Convert IR filters to a Lance SQL filter string. +fn build_lance_filter(filters: &[IRFilter], params: &ParamMap) -> Option { + if filters.is_empty() { + return None; + } + + let parts: Vec = filters + .iter() + .filter_map(|f| ir_filter_to_sql(f, params)) + .collect(); + + if parts.is_empty() { + return None; + } + + Some(parts.join(" AND ")) +} + +fn ir_filter_to_sql(filter: &IRFilter, params: &ParamMap) -> Option { + // Search predicates (search/fuzzy/match_text = true) are NOT converted to SQL. + // They are handled via scanner.full_text_search() in execute_node_scan. 
+ if is_search_filter(filter) { + return None; + } + + let left = ir_expr_to_sql(&filter.left, params)?; + let right = ir_expr_to_sql(&filter.right, params)?; + let op = match filter.op { + CompOp::Eq => "=", + CompOp::Ne => "!=", + CompOp::Gt => ">", + CompOp::Lt => "<", + CompOp::Ge => ">=", + CompOp::Le => "<=", + CompOp::Contains => return None, // Can't pushdown list contains + }; + Some(format!("{} {} {}", left, op, right)) +} + +/// Build a FullTextSearchQuery from a search IR expression. +fn build_fts_query( + expr: &IRExpr, + params: &ParamMap, +) -> Option { + match expr { + IRExpr::Search { field, query } => { + let prop = extract_property(field)?; + let q = resolve_to_string(query, params)?; + lance_index::scalar::FullTextSearchQuery::new(q) + .with_column(prop) + .ok() + } + IRExpr::Fuzzy { + field, + query, + max_edits, + } => { + let prop = extract_property(field)?; + let q = resolve_to_string(query, params)?; + let edits = max_edits + .as_ref() + .and_then(|e| resolve_to_int(e, params)) + .unwrap_or(2) as u32; + lance_index::scalar::FullTextSearchQuery::new_fuzzy(q, Some(edits)) + .with_column(prop) + .ok() + } + IRExpr::MatchText { field, query } => { + // Use regular text search (phrase search not available in Lance 3.0 Rust API) + let prop = extract_property(field)?; + let q = resolve_to_string(query, params)?; + lance_index::scalar::FullTextSearchQuery::new(q) + .with_column(prop) + .ok() + } + _ => None, + } +} + +/// Extract the property name from a PropAccess expression. +fn extract_property(expr: &IRExpr) -> Option { + match expr { + IRExpr::PropAccess { property, .. } => Some(property.clone()), + _ => None, + } +} + +/// Resolve an expression to a string value (literal or param). +fn resolve_to_string(expr: &IRExpr, params: &ParamMap) -> Option { + match expr { + IRExpr::Literal(Literal::String(s)) => Some(s.clone()), + IRExpr::Param(name) => match params.get(name)? 
{ + Literal::String(s) => Some(s.clone()), + _ => None, + }, + _ => None, + } +} + +/// Resolve an expression to an integer value (literal or param). +fn resolve_to_int(expr: &IRExpr, params: &ParamMap) -> Option { + match expr { + IRExpr::Literal(Literal::Integer(n)) => Some(*n), + IRExpr::Param(name) => match params.get(name)? { + Literal::Integer(n) => Some(*n), + _ => None, + }, + _ => None, + } +} + +fn ir_expr_to_sql(expr: &IRExpr, params: &ParamMap) -> Option { + match expr { + IRExpr::PropAccess { property, .. } => Some(property.clone()), + IRExpr::Literal(lit) => Some(literal_to_sql(lit)), + IRExpr::Param(name) => params.get(name).map(literal_to_sql), + _ => None, + } +} + +fn literal_to_sql(lit: &Literal) -> String { + match lit { + Literal::String(s) => format!("'{}'", s.replace('\'', "''")), + Literal::Integer(n) => n.to_string(), + Literal::Float(f) => f.to_string(), + Literal::Bool(b) => b.to_string(), + Literal::Date(s) => format!("'{}'", s.replace('\'', "''")), + Literal::DateTime(s) => format!("'{}'", s.replace('\'', "''")), + Literal::List(_) => "NULL".to_string(), // Not supported in SQL pushdown + } +} + +/// Apply an IR filter to the bindings (post-scan filtering). +fn apply_filter( + bindings: &mut HashMap, + filter: &IRFilter, + params: &ParamMap, +) -> Result<()> { + // Find which binding this filter applies to + let var_name = match &filter.left { + IRExpr::PropAccess { variable, .. } => variable.clone(), + _ => return Ok(()), // Can't determine variable + }; + + let batch = bindings.get(&var_name).ok_or_else(|| { + OmniError::manifest(format!("filter references unbound variable '{}'", var_name)) + })?; + + let mask = evaluate_filter(batch, filter, params)?; + let filtered = arrow_select::filter::filter_record_batch(batch, &mask) + .map_err(|e| OmniError::Lance(e.to_string()))?; + + bindings.insert(var_name, filtered); + Ok(()) +} + +/// Evaluate a filter predicate against a batch, producing a boolean mask. 
+fn evaluate_filter( + batch: &RecordBatch, + filter: &IRFilter, + params: &ParamMap, +) -> Result { + let left = evaluate_expr(batch, &filter.left, params)?; + let right = evaluate_expr(batch, &filter.right, params)?; + + if filter.op == CompOp::Contains { + return evaluate_contains_filter(&left, &right); + } + + // Cast right to match left's type if needed (e.g. Int64 literal vs Int32 column) + let right = if left.data_type() != right.data_type() { + arrow_cast::cast::cast(&right, left.data_type()) + .map_err(|e| OmniError::Lance(e.to_string()))? + } else { + right + }; + + use arrow_ord::cmp; + let result = match filter.op { + CompOp::Eq => cmp::eq(&left, &right), + CompOp::Ne => cmp::neq(&left, &right), + CompOp::Gt => cmp::gt(&left, &right), + CompOp::Lt => cmp::lt(&left, &right), + CompOp::Ge => cmp::gt_eq(&left, &right), + CompOp::Le => cmp::lt_eq(&left, &right), + CompOp::Contains => unreachable!("handled above"), + } + .map_err(|e| OmniError::Lance(e.to_string()))?; + + Ok(result) +} + +/// Evaluate an IR expression against a batch, producing an array. +fn evaluate_expr(batch: &RecordBatch, expr: &IRExpr, params: &ParamMap) -> Result { + match expr { + IRExpr::PropAccess { property, .. } => { + batch.column_by_name(property).cloned().ok_or_else(|| { + OmniError::manifest(format!("column '{}' not found in batch", property)) + }) + } + IRExpr::Literal(lit) => literal_to_array(lit, batch.num_rows()), + IRExpr::Param(name) => { + let lit = params + .get(name) + .ok_or_else(|| OmniError::manifest(format!("parameter '{}' not provided", name)))?; + literal_to_array(lit, batch.num_rows()) + } + _ => Err(OmniError::manifest(format!( + "unsupported expression in filter: {:?}", + expr + ))), + } +} + +/// Create a constant array from a literal value. 
+fn literal_to_array(lit: &Literal, num_rows: usize) -> Result {
+    // Each arm picks one fixed Arrow type per literal kind; no target type is
+    // consulted here.
+    Ok(match lit {
+        Literal::String(s) => Arc::new(StringArray::from(vec![s.as_str(); num_rows])) as ArrayRef,
+        Literal::Integer(n) => {
+            // Integer literals are always materialized as Int64. `evaluate_filter`
+            // casts its right operand to the left operand's type when they differ.
+            Arc::new(Int64Array::from(vec![*n; num_rows])) as ArrayRef
+        }
+        Literal::Float(f) => Arc::new(Float64Array::from(vec![*f; num_rows])) as ArrayRef,
+        Literal::Bool(b) => Arc::new(BooleanArray::from(vec![*b; num_rows])) as ArrayRef,
+        Literal::Date(s) => {
+            let days = crate::loader::parse_date32_literal(s)?;
+            Arc::new(Date32Array::from(vec![days; num_rows])) as ArrayRef
+        }
+        Literal::DateTime(s) => {
+            let ms = crate::loader::parse_date64_literal(s)?;
+            Arc::new(Date64Array::from(vec![ms; num_rows])) as ArrayRef
+        }
+        Literal::List(items) => literal_list_to_array(items, num_rows)?,
+    })
+}
+
+/// Evaluate `left contains right` row by row.
+///
+/// `left` must have the Arrow `List` data type; `right` is cast to the list's
+/// element type when the two differ. The result has one non-null boolean per
+/// row: rows where the list or the probe value is null yield `false`.
+fn evaluate_contains_filter(left: &ArrayRef, right: &ArrayRef) -> Result {
+    let DataType::List(field) = left.data_type() else {
+        return Err(OmniError::manifest(
+            "contains requires a list property on the left".to_string(),
+        ));
+    };
+    let right = if right.data_type() != field.data_type() {
+        arrow_cast::cast::cast(right, field.data_type())
+            .map_err(|e| OmniError::Lance(e.to_string()))?
+    } else {
+        Arc::clone(right)
+    };
+    let list = left
+        .as_any()
+        .downcast_ref::()
+        .ok_or_else(|| OmniError::manifest("contains requires an Arrow ListArray"))?;
+
+    let mut values = Vec::with_capacity(list.len());
+    for row in 0..list.len() {
+        if list.is_null(row) || right.is_null(row) {
+            values.push(Some(false));
+            continue;
+        }
+        // Linear scan of this row's elements, stopping at the first match.
+        let items = list.value(row);
+        let mut found = false;
+        for idx in 0..items.len() {
+            if array_value_eq(items.as_ref(), idx, right.as_ref(), row)?
+            {
+                found = true;
+                break;
+            }
+        }
+        values.push(Some(found));
+    }
+    Ok(BooleanArray::from(values))
+}
+
+/// Compare one slot of `left` with one slot of `right`.
+///
+/// A null on either side compares unequal. Otherwise the two values are equal
+/// when their `array_value_to_string` renderings are identical.
+fn array_value_eq(
+    left: &dyn Array,
+    left_index: usize,
+    right: &dyn Array,
+    right_index: usize,
+) -> Result {
+    if left.is_null(left_index) || right.is_null(right_index) {
+        return Ok(false);
+    }
+    let left_value =
+        array_value_to_string(left, left_index).map_err(|e| OmniError::Lance(e.to_string()))?;
+    let right_value =
+        array_value_to_string(right, right_index).map_err(|e| OmniError::Lance(e.to_string()))?;
+    Ok(left_value == right_value)
+}
+
+/// Build a list array with `num_rows` identical rows from a list literal.
+///
+/// An empty literal produces `num_rows` empty lists built on a `StringBuilder`.
+/// Otherwise the element type is decided by `list_scalar_type`, and any element
+/// whose literal kind does not match the selected arm is appended as null.
+fn literal_list_to_array(items: &[Literal], num_rows: usize) -> Result {
+    if items.is_empty() {
+        let mut builder = ListBuilder::new(StringBuilder::new());
+        for _ in 0..num_rows {
+            builder.append(true);
+        }
+        return Ok(Arc::new(builder.finish()));
+    }
+
+    let scalar_type = list_scalar_type(items)?;
+    match scalar_type {
+        ScalarType::String => {
+            let mut builder = ListBuilder::with_capacity(StringBuilder::new(), num_rows)
+                .with_field(Arc::new(Field::new("item", DataType::Utf8, true)));
+            for _ in 0..num_rows {
+                for item in items {
+                    match item {
+                        Literal::String(value) => builder.values().append_value(value),
+                        _ => builder.values().append_null(),
+                    }
+                }
+                builder.append(true);
+            }
+            Ok(Arc::new(builder.finish()))
+        }
+        ScalarType::Bool => {
+            let mut builder = ListBuilder::with_capacity(BooleanBuilder::new(), num_rows)
+                .with_field(Arc::new(Field::new("item", DataType::Boolean, true)));
+            for _ in 0..num_rows {
+                for item in items {
+                    match item {
+                        Literal::Bool(value) => builder.values().append_value(*value),
+                        _ => builder.values().append_null(),
+                    }
+                }
+                builder.append(true);
+            }
+            Ok(Arc::new(builder.finish()))
+        }
+        // NOTE(review): `literal_scalar_type` only ever returns String, I64, F64,
+        // Bool, Date or DateTime, so the I32 arm (and the U32/U64/F32 alternatives
+        // below) look unreachable from this function — confirm. The `as i32`
+        // here would truncate if the arm were ever reached.
+        ScalarType::I32 => {
+            let mut builder = ListBuilder::with_capacity(Int32Builder::new(), num_rows)
+                .with_field(Arc::new(Field::new("item", DataType::Int32, true)));
+            for _ in 0..num_rows {
+                for item in items {
+                    match item {
+                        Literal::Integer(value) => builder.values().append_value(*value as i32),
+                        _ => builder.values().append_null(),
+                    }
+                }
+                builder.append(true);
+            }
+            Ok(Arc::new(builder.finish()))
+        }
+        ScalarType::I64 | ScalarType::U32 | ScalarType::U64 => {
+            let mut builder = ListBuilder::with_capacity(Int64Builder::new(), num_rows)
+                .with_field(Arc::new(Field::new("item", DataType::Int64, true)));
+            for _ in 0..num_rows {
+                for item in items {
+                    match item {
+                        Literal::Integer(value) => builder.values().append_value(*value),
+                        _ => builder.values().append_null(),
+                    }
+                }
+                builder.append(true);
+            }
+            Ok(Arc::new(builder.finish()))
+        }
+        ScalarType::F32 | ScalarType::F64 => {
+            let mut builder = ListBuilder::with_capacity(Float64Builder::new(), num_rows)
+                .with_field(Arc::new(Field::new("item", DataType::Float64, true)));
+            for _ in 0..num_rows {
+                for item in items {
+                    match item {
+                        Literal::Integer(value) => builder.values().append_value(*value as f64),
+                        Literal::Float(value) => builder.values().append_value(*value),
+                        _ => builder.values().append_null(),
+                    }
+                }
+                builder.append(true);
+            }
+            Ok(Arc::new(builder.finish()))
+        }
+        ScalarType::Date => {
+            let mut builder = ListBuilder::with_capacity(Date32Builder::new(), num_rows)
+                .with_field(Arc::new(Field::new("item", DataType::Date32, true)));
+            for _ in 0..num_rows {
+                for item in items {
+                    match item {
+                        Literal::Date(value) => builder
+                            .values()
+                            .append_value(crate::loader::parse_date32_literal(value)?),
+                        _ => builder.values().append_null(),
+                    }
+                }
+                builder.append(true);
+            }
+            Ok(Arc::new(builder.finish()))
+        }
+        ScalarType::DateTime => {
+            let mut builder = ListBuilder::with_capacity(Date64Builder::new(), num_rows)
+                .with_field(Arc::new(Field::new("item", DataType::Date64, true)));
+            for _ in 0..num_rows {
+                for item in items {
+                    match item {
+                        Literal::DateTime(value) => builder
+                            .values()
+                            .append_value(crate::loader::parse_date64_literal(value)?),
+                        _ => builder.values().append_null(),
+                    }
+                }
+                builder.append(true);
+            }
+            Ok(Arc::new(builder.finish()))
+        }
+        ScalarType::Vector(_) | ScalarType::Blob => Err(OmniError::manifest(
+            "unsupported list literal element type".to_string(),
+        )),
+    }
+}
+
+/// Determine the single scalar type shared by every element of a list literal.
+///
+/// Errors on an empty slice, on a nested list element, and when any element's
+/// scalar type differs from the first element's (so mixed Integer/Float lists
+/// are rejected).
+fn list_scalar_type(items: &[Literal]) -> Result {
+    let first = items
+        .first()
+        .ok_or_else(|| OmniError::manifest("empty list literal"))?;
+    let expected = literal_scalar_type(first)?;
+    for item in items.iter().skip(1) {
+        let item_type = literal_scalar_type(item)?;
+        if item_type != expected {
+            return Err(OmniError::manifest(
+                "list literal elements must share a compatible scalar type".to_string(),
+            ));
+        }
+    }
+    Ok(expected)
+}
+
+/// Map a scalar literal to its `ScalarType`; list literals are rejected because
+/// nested lists are not supported.
+fn literal_scalar_type(lit: &Literal) -> Result {
+    match lit {
+        Literal::String(_) => Ok(ScalarType::String),
+        Literal::Integer(_) => Ok(ScalarType::I64),
+        Literal::Float(_) => Ok(ScalarType::F64),
+        Literal::Bool(_) => Ok(ScalarType::Bool),
+        Literal::Date(_) => Ok(ScalarType::Date),
+        Literal::DateTime(_) => Ok(ScalarType::DateTime),
+        Literal::List(_) => Err(OmniError::manifest(
+            "nested list literals are not supported".to_string(),
+        )),
+    }
+}
+
+/// Project return expressions into a result batch.
+///
+/// Each projection becomes one output column, named by its alias when present
+/// and otherwise by the name `evaluate_projection` reports. An empty projection
+/// list is an error.
+fn project_return(
+    bindings: &HashMap,
+    projections: &[IRProjection],
+    params: &ParamMap,
+) -> Result {
+    if projections.is_empty() {
+        return Err(OmniError::manifest(
+            "query has no return projections".to_string(),
+        ));
+    }
+
+    let mut fields = Vec::with_capacity(projections.len());
+    let mut columns: Vec = Vec::with_capacity(projections.len());
+
+    for proj in projections {
+        let (name, col) = evaluate_projection(bindings, &proj.expr, params)?;
+        let field_name = proj.alias.as_deref().unwrap_or(&name);
+        // Nullability is derived from the data: a field is marked nullable only
+        // when this particular column actually contains nulls.
+        fields.push(Field::new(
+            field_name,
+            col.data_type().clone(),
+            col.null_count() > 0,
+        ));
+        columns.push(col);
+    }
+
+    let schema = Arc::new(Schema::new(fields));
+    RecordBatch::try_new(schema, columns).map_err(|e| OmniError::Lance(e.to_string()))
+}
+
+/// Evaluate a single projection expression.
+fn evaluate_projection(
+    bindings: &HashMap,
+    expr: &IRExpr,
+    params: &ParamMap,
+) -> Result<(String, ArrayRef)> {
+    // Returns the default column name together with the column data:
+    // "<variable>.<property>" for property access, "literal" for literals and
+    // the parameter name for parameters.
+    match expr {
+        IRExpr::PropAccess { variable, property } => {
+            let batch = bindings.get(variable).ok_or_else(|| {
+                OmniError::manifest(format!(
+                    "projection references unbound variable '{}'",
+                    variable
+                ))
+            })?;
+            let col = batch.column_by_name(property).ok_or_else(|| {
+                OmniError::manifest(format!(
+                    "column '{}' not found in binding '{}'",
+                    property, variable
+                ))
+            })?;
+            Ok((format!("{}.{}", variable, property), col.clone()))
+        }
+        IRExpr::Literal(lit) => {
+            // Get row count from first binding
+            // NOTE(review): "first" is whichever entry the map iterator yields
+            // first; this presumably relies on all bindings having the same row
+            // count — confirm. With no bindings the literal column is empty.
+            let num_rows = bindings.values().next().map(|b| b.num_rows()).unwrap_or(0);
+            let arr = literal_to_array(lit, num_rows)?;
+            Ok(("literal".to_string(), arr))
+        }
+        IRExpr::Param(name) => {
+            let lit = params
+                .get(name)
+                .ok_or_else(|| OmniError::manifest(format!("parameter '{}' not provided", name)))?;
+            let num_rows = bindings.values().next().map(|b| b.num_rows()).unwrap_or(0);
+            let arr = literal_to_array(lit, num_rows)?;
+            Ok((name.clone(), arr))
+        }
+        _ => Err(OmniError::manifest(format!(
+            "unsupported projection expression: {:?}",
+            expr
+        ))),
+    }
+}
+
+/// Apply ordering to a batch.
+///
+/// Sort keys are resolved either from the bound variable's batch (`PropAccess`)
+/// or from the already-projected `batch` by column name (`AliasRef`); any other
+/// expression is an error. Rows of `batch` are then reordered by a lexicographic
+/// sort over all keys. Nulls sort first for ascending keys and last for
+/// descending keys. `_params` is currently unused.
+fn apply_ordering(
+    batch: RecordBatch,
+    orderings: &[IROrdering],
+    bindings: &HashMap,
+    _params: &ParamMap,
+) -> Result {
+    use arrow_ord::sort::{SortColumn, lexsort_to_indices};
+
+    let mut sort_columns = Vec::with_capacity(orderings.len());
+
+    for ordering in orderings {
+        let col = match &ordering.expr {
+            IRExpr::PropAccess { variable, property } => {
+                let binding = bindings.get(variable).ok_or_else(|| {
+                    OmniError::manifest(format!(
+                        "ordering references unbound variable '{}'",
+                        variable
+                    ))
+                })?;
+                binding
+                    .column_by_name(property)
+                    .ok_or_else(|| {
+                        OmniError::manifest(format!("column '{}' not found for ordering", property))
+                    })?
+                    .clone()
+            }
+            IRExpr::AliasRef(alias) => {
+                // Look up in the projected batch by column name
+                batch
+                    .column_by_name(alias)
+                    .ok_or_else(|| {
+                        OmniError::manifest(format!("alias '{}' not found for ordering", alias))
+                    })?
+                    .clone()
+            }
+            _ => {
+                return Err(OmniError::manifest(
+                    "unsupported ordering expression".to_string(),
+                ));
+            }
+        };
+
+        sort_columns.push(SortColumn {
+            values: col,
+            options: Some(arrow_schema::SortOptions {
+                descending: ordering.descending,
+                nulls_first: !ordering.descending,
+            }),
+        });
+    }
+
+    // Compute the row permutation once, then apply it to every output column.
+    let indices =
+        lexsort_to_indices(&sort_columns, None).map_err(|e| OmniError::Lance(e.to_string()))?;
+
+    let columns: Vec = batch
+        .columns()
+        .iter()
+        .map(|col| arrow_select::take::take(col.as_ref(), &indices, None))
+        .collect::, _>>()
+        .map_err(|e| OmniError::Lance(e.to_string()))?;
+
+    RecordBatch::try_new(batch.schema(), columns).map_err(|e| OmniError::Lance(e.to_string()))
+}
+
+// ─── Mutation helpers ────────────────────────────────────────────────────────
+
+/// Resolve an IRExpr to a concrete Literal value at runtime.
+///
+/// Only literals and named parameters are accepted; a parameter that is missing
+/// from `params`, or any other expression kind, is a manifest error.
+fn resolve_expr_value(expr: &IRExpr, params: &ParamMap) -> Result {
+    match expr {
+        IRExpr::Literal(lit) => Ok(lit.clone()),
+        IRExpr::Param(name) => params
+            .get(name)
+            .cloned()
+            .ok_or_else(|| OmniError::manifest(format!("parameter '{}' not provided", name))),
+        other => Err(OmniError::manifest(format!(
+            "unsupported expression in mutation: {:?}",
+            other
+        ))),
+    }
+}
+
+/// Create a single-element or N-element array from a Literal, matching the target DataType.
+fn literal_to_typed_array(
+    lit: &Literal,
+    data_type: &DataType,
+    num_rows: usize,
+) -> Result {
+    // Builds `num_rows` copies of `lit` as an array of exactly `data_type`.
+    // Only the (literal kind, data type) pairs listed below are accepted; every
+    // other combination is a manifest error. Integer literals are range-checked
+    // against narrower or unsigned targets instead of being wrapped, matching
+    // the checks `typed_list_literal_to_array` applies to list elements.
+    Ok(match (lit, data_type) {
+        (Literal::String(s), DataType::Utf8) => {
+            Arc::new(StringArray::from(vec![s.as_str(); num_rows])) as ArrayRef
+        }
+        (Literal::Integer(n), DataType::Int32) => {
+            let value = i32::try_from(*n).map_err(|_| {
+                OmniError::manifest(format!("value {} exceeds Int32 range", n))
+            })?;
+            Arc::new(Int32Array::from(vec![value; num_rows]))
+        }
+        (Literal::Integer(n), DataType::Int64) => Arc::new(Int64Array::from(vec![*n; num_rows])),
+        (Literal::Integer(n), DataType::UInt32) => {
+            let value = u32::try_from(*n).map_err(|_| {
+                OmniError::manifest(format!("value {} exceeds UInt32 range", n))
+            })?;
+            Arc::new(UInt32Array::from(vec![value; num_rows]))
+        }
+        (Literal::Integer(n), DataType::UInt64) => {
+            let value = u64::try_from(*n).map_err(|_| {
+                OmniError::manifest(format!("value {} exceeds UInt64 range", n))
+            })?;
+            Arc::new(UInt64Array::from(vec![value; num_rows]))
+        }
+        (Literal::Float(f), DataType::Float32) => {
+            Arc::new(Float32Array::from(vec![*f as f32; num_rows]))
+        }
+        (Literal::Float(f), DataType::Float64) => Arc::new(Float64Array::from(vec![*f; num_rows])),
+        (Literal::Bool(b), DataType::Boolean) => Arc::new(BooleanArray::from(vec![*b; num_rows])),
+        (Literal::Date(s), DataType::Date32) => {
+            let days = crate::loader::parse_date32_literal(s)?;
+            Arc::new(Date32Array::from(vec![days; num_rows]))
+        }
+        (Literal::DateTime(s), DataType::Date64) => Arc::new(Date64Array::from(vec![
+            crate::loader::parse_date64_literal(s)?;
+            num_rows
+        ])),
+        (Literal::List(items), DataType::List(field)) => {
+            typed_list_literal_to_array(items, field.data_type(), num_rows)?
+        }
+        // Fixed-size Float32 lists: the literal must supply exactly `dim`
+        // numeric elements.
+        (Literal::List(items), DataType::FixedSizeList(field, dim))
+            if field.data_type() == &DataType::Float32 =>
+        {
+            if items.len() != *dim as usize {
+                return Err(OmniError::manifest(format!(
+                    "vector property expects {} dimensions, got {}",
+                    dim,
+                    items.len()
+                )));
+            }
+            let mut builder = FixedSizeListBuilder::with_capacity(
+                Float32Builder::with_capacity(num_rows * (*dim as usize)),
+                *dim,
+                num_rows,
+            )
+            .with_field(field.clone());
+            for _ in 0..num_rows {
+                for item in items {
+                    match item {
+                        Literal::Integer(value) => builder.values().append_value(*value as f32),
+                        Literal::Float(value) => builder.values().append_value(*value as f32),
+                        _ => {
+                            return Err(OmniError::manifest(
+                                "vector elements must be numeric".to_string(),
+                            ));
+                        }
+                    }
+                }
+                builder.append(true);
+            }
+            Arc::new(builder.finish())
+        }
+        _ => {
+            return Err(OmniError::manifest(format!(
+                "cannot convert {:?} to {:?}",
+                lit, data_type
+            )));
+        }
+    })
+}
+
+/// Build a list array with `num_rows` identical rows whose elements are
+/// converted to `item_type`.
+///
+/// Integer elements targeting Int32, UInt32 or UInt64 are range-checked and
+/// produce a manifest error when out of range. An element whose literal kind
+/// does not match the target item type is appended as null. Item types other
+/// than the ones listed are rejected.
+fn typed_list_literal_to_array(
+    items: &[Literal],
+    item_type: &DataType,
+    num_rows: usize,
+) -> Result {
+    match item_type {
+        DataType::Utf8 => {
+            let mut builder = ListBuilder::new(StringBuilder::new());
+            for _ in 0..num_rows {
+                for item in items {
+                    match item {
+                        Literal::String(value) => builder.values().append_value(value),
+                        _ => builder.values().append_null(),
+                    }
+                }
+                builder.append(true);
+            }
+            Ok(Arc::new(builder.finish()))
+        }
+        DataType::Boolean => {
+            let mut builder = ListBuilder::new(BooleanBuilder::new());
+            for _ in 0..num_rows {
+                for item in items {
+                    match item {
+                        Literal::Bool(value) => builder.values().append_value(*value),
+                        _ => builder.values().append_null(),
+                    }
+                }
+                builder.append(true);
+            }
+            Ok(Arc::new(builder.finish()))
+        }
+        DataType::Int32 => {
+            let mut builder = ListBuilder::new(Int32Builder::new());
+            for _ in 0..num_rows {
+                for item in items {
+                    match item {
+                        Literal::Integer(value) => {
+                            let value = i32::try_from(*value).map_err(|_| {
+                                OmniError::manifest(format!(
+                                    "list value {} exceeds Int32 range",
+                                    value
+                                ))
+                            })?;
+                            builder.values().append_value(value);
+                        }
+                        _ => builder.values().append_null(),
+                    }
+                }
+                builder.append(true);
+            }
+            Ok(Arc::new(builder.finish()))
+        }
+        DataType::Int64 => {
+            let mut builder = ListBuilder::new(Int64Builder::new());
+            for _ in 0..num_rows {
+                for item in items {
+                    match item {
+                        Literal::Integer(value) => builder.values().append_value(*value),
+                        _ => builder.values().append_null(),
+                    }
+                }
+                builder.append(true);
+            }
+            Ok(Arc::new(builder.finish()))
+        }
+        DataType::UInt32 => {
+            let mut builder = ListBuilder::new(UInt32Builder::new());
+            for _ in 0..num_rows {
+                for item in items {
+                    match item {
+                        Literal::Integer(value) => {
+                            let value = u32::try_from(*value).map_err(|_| {
+                                OmniError::manifest(format!(
+                                    "list value {} exceeds UInt32 range",
+                                    value
+                                ))
+                            })?;
+                            builder.values().append_value(value);
+                        }
+                        _ => builder.values().append_null(),
+                    }
+                }
+                builder.append(true);
+            }
+            Ok(Arc::new(builder.finish()))
+        }
+        DataType::UInt64 => {
+            let mut builder = ListBuilder::new(UInt64Builder::new());
+            for _ in 0..num_rows {
+                for item in items {
+                    match item {
+                        Literal::Integer(value) => {
+                            let value = u64::try_from(*value).map_err(|_| {
+                                OmniError::manifest(format!(
+                                    "list value {} exceeds UInt64 range",
+                                    value
+                                ))
+                            })?;
+                            builder.values().append_value(value);
+                        }
+                        _ => builder.values().append_null(),
+                    }
+                }
+                builder.append(true);
+            }
+            Ok(Arc::new(builder.finish()))
+        }
+        DataType::Float32 => {
+            let mut builder = ListBuilder::new(Float32Builder::new());
+            for _ in 0..num_rows {
+                for item in items {
+                    match item {
+                        Literal::Integer(value) => builder.values().append_value(*value as f32),
+                        Literal::Float(value) => builder.values().append_value(*value as f32),
+                        _ => builder.values().append_null(),
+                    }
+                }
+                builder.append(true);
+            }
+            Ok(Arc::new(builder.finish()))
+        }
+        DataType::Float64 => {
+            let mut builder = ListBuilder::new(Float64Builder::new());
+            for _ in 0..num_rows {
+                for item in items {
+                    match item {
+                        Literal::Integer(value) => builder.values().append_value(*value as f64),
+                        Literal::Float(value) => builder.values().append_value(*value),
+                        _ => builder.values().append_null(),
+                    }
+                }
+                builder.append(true);
+            }
+            Ok(Arc::new(builder.finish()))
+        }
+        DataType::Date32 => {
+            let mut builder = ListBuilder::new(Date32Builder::new());
+            for _ in 0..num_rows {
+                for item in items {
+                    match item {
+                        Literal::Date(value) => builder
+                            .values()
+                            .append_value(crate::loader::parse_date32_literal(value)?),
+                        _ => builder.values().append_null(),
+                    }
+                }
+                builder.append(true);
+            }
+            Ok(Arc::new(builder.finish()))
+        }
+        DataType::Date64 => {
+            let mut builder = ListBuilder::new(Date64Builder::new());
+            for _ in 0..num_rows {
+                for item in items {
+                    match item {
+                        Literal::DateTime(value) => builder
+                            .values()
+                            .append_value(crate::loader::parse_date64_literal(value)?),
+                        _ => builder.values().append_null(),
+                    }
+                }
+                builder.append(true);
+            }
+            Ok(Arc::new(builder.finish()))
+        }
+        other => Err(OmniError::manifest(format!(
+            "cannot convert list literal to {:?}",
+            other
+        ))),
+    }
+}
+
+/// Build a single-element blob array from a URI or base64 value string.
+fn build_blob_array_from_value(value: &str) -> Result {
+    let mut builder = BlobArrayBuilder::new(1);
+    crate::loader::append_blob_value(&mut builder, value)?;
+    builder
+        .finish()
+        .map_err(|e| OmniError::Lance(e.to_string()))
+}
+
+/// Build a null blob array with one element.
+fn build_null_blob_array() -> Result {
+    let mut builder = BlobArrayBuilder::new(1);
+    builder
+        .push_null()
+        .map_err(|e| OmniError::Lance(e.to_string()))?;
+    builder
+        .finish()
+        .map_err(|e| OmniError::Lance(e.to_string()))
+}
+
+/// Build a single-row RecordBatch from resolved assignments.
+fn build_insert_batch(
+    schema: &SchemaRef,
+    id: &str,
+    assignments: &HashMap,
+    blob_properties: &HashSet,
+) -> Result {
+    // Produces exactly one row with one column per schema field, in schema
+    // order:
+    // - "id" is filled from `id`;
+    // - blob properties are built from a string assignment, or null if the
+    //   field is nullable and nothing was assigned;
+    // - "src"/"dst" are taken from the "from"/"to" assignments;
+    // - every other field comes from the assignment of the same name, or is
+    //   null when nullable and unassigned.
+    // A missing value for a non-nullable field is a manifest error.
+    let mut columns: Vec = Vec::with_capacity(schema.fields().len());
+
+    for field in schema.fields() {
+        if field.name() == "id" {
+            columns.push(Arc::new(StringArray::from(vec![id])));
+        } else if blob_properties.contains(field.name()) {
+            match assignments.get(field.name()) {
+                Some(Literal::String(uri)) => columns.push(build_blob_array_from_value(uri)?),
+                // A value was supplied but it is not a string: reject it
+                // instead of silently storing null (or misreporting it as
+                // "missing" for a required field).
+                Some(other) => {
+                    return Err(OmniError::manifest(format!(
+                        "blob property '{}' must be a string URI or base64 value, got {:?}",
+                        field.name(),
+                        other
+                    )));
+                }
+                None if field.is_nullable() => columns.push(build_null_blob_array()?),
+                None => {
+                    return Err(OmniError::manifest(format!(
+                        "missing required blob property '{}'",
+                        field.name()
+                    )));
+                }
+            }
+        } else if field.name() == "src" {
+            let lit = assignments.get("from").ok_or_else(|| {
+                OmniError::manifest("missing required edge endpoint 'from'".to_string())
+            })?;
+            columns.push(literal_to_typed_array(lit, field.data_type(), 1)?);
+        } else if field.name() == "dst" {
+            let lit = assignments.get("to").ok_or_else(|| {
+                OmniError::manifest("missing required edge endpoint 'to'".to_string())
+            })?;
+            columns.push(literal_to_typed_array(lit, field.data_type(), 1)?);
+        } else if let Some(lit) = assignments.get(field.name()) {
+            columns.push(literal_to_typed_array(lit, field.data_type(), 1)?);
+        } else if field.is_nullable() {
+            columns.push(arrow_array::new_null_array(field.data_type(), 1));
+        } else {
+            return Err(OmniError::manifest(format!(
+                "missing required property '{}'",
+                field.name()
+            )));
+        }
+    }
+
+    RecordBatch::try_new(schema.clone(), columns).map_err(|e| OmniError::Lance(e.to_string()))
+}
+
+/// Check that an edge insert names two existing endpoint nodes.
+///
+/// The edge type is looked up in the catalog by `edge_name`. Both the "from"
+/// and "to" assignments must be present and must be string ids; each id is then
+/// checked for existence in the node table of the edge type's `from_type` /
+/// `to_type` respectively.
+async fn validate_edge_insert_endpoints(
+    db: &Omnigraph,
+    edge_name: &str,
+    assignments: &HashMap,
+) -> Result<()> {
+    let edge_type = db
+        .catalog()
+        .edge_types
+        .get(edge_name)
+        .ok_or_else(|| OmniError::manifest(format!("unknown edge type '{}'", edge_name)))?;
+    let from = match assignments.get("from") {
+        Some(Literal::String(value)) => value.as_str(),
+        Some(other) => {
+            return Err(OmniError::manifest(format!(
+                "edge {} from endpoint must be a string id, got {}",
+                edge_name,
+                literal_to_sql(other)
+            )));
+        }
+        None => {
+            return Err(OmniError::manifest(format!(
+                "edge {} missing 'from' endpoint",
+                edge_name
+            )));
+        }
+    };
+    let to = match assignments.get("to") {
+        Some(Literal::String(value)) => value.as_str(),
+        Some(other) => {
+            return Err(OmniError::manifest(format!(
+                "edge {} to endpoint must be a string id, got {}",
+                edge_name,
+                literal_to_sql(other)
+            )));
+        }
+        None => {
+            return Err(OmniError::manifest(format!(
+                "edge {} missing 'to' endpoint",
+                edge_name
+            )));
+        }
+    };
+
+    ensure_node_id_exists(db, &edge_type.from_type, from, "src").await?;
+    ensure_node_id_exists(db, &edge_type.to_type, to, "dst").await?;
+    Ok(())
+}
+
+/// Succeed only if a node with the given `id` exists in the `node:<node_type>`
+/// table of the database's current snapshot.
+///
+/// `label` is used only in the error message to say which endpoint was missing.
+async fn ensure_node_id_exists(
+    db: &Omnigraph,
+    node_type: &str,
+    id: &str,
+    label: &str,
+) -> Result<()> {
+    let snapshot = db.snapshot();
+    let table_key = format!("node:{}", node_type);
+    let ds = snapshot.open(&table_key).await?;
+    // Single quotes in the id are doubled so the id stays inside the SQL string
+    // literal of the filter.
+    let filter = format!("id = '{}'", id.replace('\'', "''"));
+    let exists = ds
+        .count_rows(Some(filter))
+        .await
+        .map_err(|e| OmniError::Lance(e.to_string()))?
+        > 0;
+    if exists {
+        Ok(())
+    } else {
+        Err(OmniError::manifest(format!(
+            "{} '{}' not found in {}",
+            label, id, node_type
+        )))
+    }
+}
+
+/// Convert an IRMutationPredicate to a Lance SQL filter string.
+fn predicate_to_sql(
+    predicate: &IRMutationPredicate,
+    params: &ParamMap,
+    is_edge: bool,
+) -> Result {
+    // For edges the query-level endpoint names "from"/"to" map onto the stored
+    // "src"/"dst" columns; every other property name is used as the column name
+    // unchanged.
+    let column = if is_edge {
+        match predicate.property.as_str() {
+            "from" => "src".to_string(),
+            "to" => "dst".to_string(),
+            other => other.to_string(),
+        }
+    } else {
+        predicate.property.clone()
+    };
+
+    let value = resolve_expr_value(&predicate.value, params)?;
+    let value_sql = literal_to_sql(&value);
+
+    let op = match predicate.op {
+        CompOp::Eq => "=",
+        CompOp::Ne => "!=",
+        CompOp::Gt => ">",
+        CompOp::Lt => "<",
+        CompOp::Ge => ">=",
+        CompOp::Le => "<=",
+        CompOp::Contains => {
+            return Err(OmniError::manifest(
+                "contains predicate not supported in mutations".to_string(),
+            ));
+        }
+    };
+
+    // The result has the shape "<column> <op> <value>".
+    Ok(format!("{} {} {}", column, op, value_sql))
+}
+
+/// Replace specific columns in a RecordBatch with new literal values.
+/// Blob columns are excluded from the scan result, so assigned blob values are
+/// synthesized from the full table schema and included inline in the update
+/// batch. Unassigned blob columns are omitted so merge_insert leaves them
+/// untouched.
+fn apply_assignments(
+    full_schema: &SchemaRef,
+    batch: &RecordBatch,
+    assignments: &HashMap,
+    blob_properties: &HashSet,
+) -> Result {
+    let mut columns: Vec = Vec::with_capacity(full_schema.fields().len());
+    let mut out_fields: Vec = Vec::with_capacity(full_schema.fields().len());
+
+    for field in full_schema.fields().iter() {
+        if blob_properties.contains(field.name()) {
+            // Blob columns aren't in the scan result. If this blob has an
+            // assignment, build the blob array inline so the single
+            // merge_insert covers both scalar and blob updates. Unassigned
+            // blob columns are omitted — merge_insert only touches columns
+            // present in the batch.
+            // NOTE(review): a blob assignment that is not a string literal also
+            // falls through to the "skip" path below, so it is ignored without
+            // an error — confirm this is intended.
+            if let Some(Literal::String(uri)) = assignments.get(field.name()) {
+                let mut builder = BlobArrayBuilder::new(batch.num_rows());
+                for _ in 0..batch.num_rows() {
+                    crate::loader::append_blob_value(&mut builder, uri)?;
+                }
+                let blob_field = lance::blob::blob_field(field.name(), true);
+                out_fields.push(blob_field);
+                columns.push(
+                    builder
+                        .finish()
+                        .map_err(|e| OmniError::Lance(e.to_string()))?,
+                );
+            }
+            // else: no assignment for this blob column — skip it
+        } else if let Some(lit) = assignments.get(field.name()) {
+            // Assigned non-blob column: every row gets the same literal value.
+            out_fields.push(field.as_ref().clone());
+            columns.push(literal_to_typed_array(
+                lit,
+                field.data_type(),
+                batch.num_rows(),
+            )?);
+        } else {
+            // Unassigned non-blob column: carried over from the scanned batch.
+            let col = batch.column_by_name(field.name()).ok_or_else(|| {
+                OmniError::Lance(format!(
+                    "column '{}' not found in scan result",
+                    field.name()
+                ))
+            })?;
+            out_fields.push(field.as_ref().clone());
+            columns.push(col.clone());
+        }
+    }
+
+    RecordBatch::try_new(Arc::new(Schema::new(out_fields)), columns)
+        .map_err(|e| OmniError::Lance(e.to_string()))
+}
+
+// ─── Mutation execution ──────────────────────────────────────────────────────
+
+impl Omnigraph {
+    /// Run the named mutation query on `branch` without an audit actor.
+    /// Equivalent to `mutate_as` with `actor_id = None`.
+    pub async fn mutate(
+        &mut self,
+        branch: &str,
+        query_source: &str,
+        query_name: &str,
+        params: &ParamMap,
+    ) -> Result {
+        self.mutate_as(branch, query_source, query_name, params, None)
+            .await
+    }
+
+    /// Run the named mutation query on `branch` with `actor_id` set as the
+    /// audit actor for the duration of the call. The previous audit actor is
+    /// restored afterwards whether the mutation succeeded or failed.
+    pub async fn mutate_as(
+        &mut self,
+        branch: &str,
+        query_source: &str,
+        query_name: &str,
+        params: &ParamMap,
+        actor_id: Option<&str>,
+    ) -> Result {
+        let previous_actor = self.audit_actor_id.clone();
+        self.audit_actor_id = actor_id.map(str::to_string);
+        let result = self
+            .mutate_with_current_actor(branch, query_source, query_name, params)
+            .await;
+        self.audit_actor_id = previous_actor;
+        result
+    }
+
+    /// Execute a mutation using whatever audit actor is currently set.
+    ///
+    /// When the requested branch is an internal run branch the mutation is
+    /// executed on it directly. Otherwise the mutation is staged on a run
+    /// started against the target branch and published only if the target
+    /// branch's latest snapshot id is unchanged since before the run began;
+    /// on execution failure, head movement, or publish failure the run is
+    /// marked failed (errors from `fail_run` itself are ignored).
+    async fn mutate_with_current_actor(
+        &mut self,
+        branch: &str,
+        query_source: &str,
+        query_name: &str,
+        params: &ParamMap,
+    ) -> Result {
+        self.ensure_schema_state_valid().await?;
+        let requested = Self::normalize_branch_name(branch)?;
+        let resolved_params = enrich_mutation_params(params)?;
+        let operation = format!(
+            "mutation:{}:branch={}",
+            query_name,
+            requested.as_deref().unwrap_or("main")
+        );
+
+        if requested.as_deref().is_some_and(is_internal_run_branch) {
+            return self
+                .execute_named_mutation_on_branch(
+                    requested.as_deref(),
+                    query_source,
+                    query_name,
+                    &resolved_params,
+                )
+                .await;
+        }
+
+        let target_branch = requested.clone().unwrap_or_else(|| "main".to_string());
+        let target_head_before = self.latest_branch_snapshot_id(&target_branch).await?;
+        let run = self
+            .begin_run(&target_branch, Some(operation.as_str()))
+            .await?;
+
+        let staged_result = match self
+            .execute_named_mutation_on_branch(
+                Some(run.run_branch.as_str()),
+                query_source,
+                query_name,
+                &resolved_params,
+            )
+            .await
+        {
+            Ok(result) => result,
+            Err(err) => {
+                let _ = self.fail_run(&run.run_id).await;
+                return Err(err);
+            }
+        };
+
+        // NOTE(review): if this lookup itself fails, `?` returns before
+        // `fail_run` is called, unlike the other error paths here — confirm
+        // whether the run should be failed in that case too.
+        let target_head_now = self.latest_branch_snapshot_id(&target_branch).await?;
+        if target_head_now.as_str() != target_head_before.as_str() {
+            let _ = self.fail_run(&run.run_id).await;
+            return Err(OmniError::manifest_conflict(format!(
+                "target branch '{}' advanced during transactional mutation; retry",
+                target_branch
+            )));
+        }
+
+        if let Err(err) = self.publish_run(&run.run_id).await {
+            let _ = self.fail_run(&run.run_id).await;
+            return Err(err);
+        }
+
+        Ok(staged_result)
+    }
+
+    /// Execute the named mutation against `branch`.
+    ///
+    /// If the normalized branch equals the currently active branch the mutation
+    /// runs in place. Otherwise the coordinator is swapped to the requested
+    /// branch for the duration of the call and restored afterwards, regardless
+    /// of the mutation's outcome.
+    async fn execute_named_mutation_on_branch(
+        &mut self,
+        branch: Option<&str>,
+        query_source: &str,
+        query_name: &str,
+        params: &ParamMap,
+    ) -> Result {
+        let requested = match branch {
+            Some(branch) => Self::normalize_branch_name(branch)?,
+            None => None,
+        };
+        let current = self.active_branch().map(str::to_string);
+        if requested == current {
+            return self
+                .execute_named_mutation(query_source, query_name, params)
+                .await;
+        }
+
+        let previous = self
+            .swap_coordinator_for_branch(requested.as_deref())
+            .await?;
+        let result = self
+            .execute_named_mutation(query_source, query_name, params)
+            .await;
+        self.restore_coordinator(previous);
+        result
+    }
+
+    /// Look up `query_name` in `query_source`, typecheck it against the
+    /// catalog, lower it to mutation IR and dispatch to the matching
+    /// insert/update/delete executor. A query that typechecks as a read query
+    /// is rejected.
+    async fn execute_named_mutation(
+        &mut self,
+        query_source: &str,
+        query_name: &str,
+        params: &ParamMap,
+    ) -> Result {
+        let query_decl = omnigraph_compiler::find_named_query(query_source, query_name)
+            .map_err(|e| OmniError::manifest(e.to_string()))?;
+
+        let checked = typecheck_query_decl(self.catalog(), &query_decl)?;
+        match checked {
+            CheckedQuery::Mutation(_) => {}
+            CheckedQuery::Read(_) => {
+                return Err(OmniError::manifest(
+                    "mutation execution called on a read query; use query instead".to_string(),
+                ));
+            }
+        }
+
+        let ir = lower_mutation_query(&query_decl)?;
+
+        match &ir.op {
+            MutationOpIR::Insert {
+                type_name,
+                assignments,
+            } => self.execute_insert(type_name, assignments, params).await,
+            MutationOpIR::Update {
+                type_name,
+                assignments,
+                predicate,
+            } => {
+                self.execute_update(type_name, assignments, predicate, params)
+                    .await
+            }
+            MutationOpIR::Delete {
+                type_name,
+                predicate,
+            } => self.execute_delete(type_name, predicate, params).await,
+        }
+    }
+
+    /// Merge `source` into `target` without an audit actor.
+    /// Equivalent to `branch_merge_as` with `actor_id = None`.
+    pub async fn branch_merge(&mut self, source: &str, target: &str) -> Result {
+        self.branch_merge_as(source, target, None).await
+    }
+
+    /// Merge `source` into `target` with `actor_id` set as the audit actor for
+    /// the duration of the call; internal run refs are not allowed. The
+    /// previous audit actor is restored afterwards.
+    pub async fn branch_merge_as(
+        &mut self,
+        source: &str,
+        target: &str,
+        actor_id: Option<&str>,
+    ) -> Result {
+        let previous_actor = self.audit_actor_id.clone();
+        self.audit_actor_id = actor_id.map(str::to_string);
+        let result = self.branch_merge_impl(source, target, false).await;
+        self.audit_actor_id = previous_actor;
+        result
+    }
+
+    /// Crate-internal merge entry point that permits internal run refs as
+    /// source or target. The audit actor is left as currently set.
+    pub(crate) async fn branch_merge_internal(
+        &mut self,
+        source: &str,
+        target: &str,
+    ) -> Result {
+        self.branch_merge_impl(source, target, true).await
+    }
+
+    /// Shared merge implementation.
+    ///
+    /// Validates the refs, resolves both head commits and their merge base,
+    /// short-circuits with `AlreadyUpToDate` when the heads are equal or the
+    /// source head is the merge base, and otherwise performs the merge with the
+    /// coordinator temporarily swapped to the target branch. If the merge
+    /// succeeded and the target is the branch that was active before the swap,
+    /// the instance is refreshed.
+    async fn branch_merge_impl(
+        &mut self,
+        source: &str,
+        target: &str,
+        allow_internal_refs: bool,
+    ) -> Result {
+        if !allow_internal_refs {
+            if is_internal_run_branch(source) || is_internal_run_branch(target) {
+                return Err(OmniError::manifest(format!(
+                    "branch_merge does not allow internal run refs ('{}' -> '{}')",
+                    source, target
+                )));
+            }
+        }
+        let source_branch = Omnigraph::normalize_branch_name(source)?;
+        let target_branch = Omnigraph::normalize_branch_name(target)?;
+        if source_branch == target_branch {
+            return Err(OmniError::manifest(
+                "branch_merge requires distinct source and target branches".to_string(),
+            ));
+        }
+
+        let source_head_commit_id = self
+            .head_commit_id_for_branch(source_branch.as_deref())
+            .await?
+            .ok_or_else(|| OmniError::manifest("source branch has no head commit".to_string()))?;
+        let target_head_commit_id = self
+            .head_commit_id_for_branch(target_branch.as_deref())
+            .await?
+            .ok_or_else(|| OmniError::manifest("target branch has no head commit".to_string()))?;
+        let base_commit = CommitGraph::merge_base(
+            self.uri(),
+            source_branch.as_deref(),
+            target_branch.as_deref(),
+        )
+        .await?
+        .ok_or_else(|| OmniError::manifest("branches have no common ancestor".to_string()))?;
+
+        if source_head_commit_id == target_head_commit_id
+            || base_commit.graph_commit_id == source_head_commit_id
+        {
+            return Ok(MergeOutcome::AlreadyUpToDate);
+        }
+        // The target has not moved since the merge base, so the outcome is
+        // reported as a fast-forward.
+        let is_fast_forward = base_commit.graph_commit_id == target_head_commit_id;
+
+        let base_snapshot = ManifestCoordinator::snapshot_at(
+            self.uri(),
+            base_commit.manifest_branch.as_deref(),
+            base_commit.manifest_version,
+        )
+        .await?;
+        let source_snapshot = self
+            .resolved_target(ReadTarget::Branch(
+                source_branch.clone().unwrap_or_else(|| "main".to_string()),
+            ))
+            .await?
+            .snapshot;
+        let previous_branch = self.active_branch().map(str::to_string);
+        let previous = self
+            .swap_coordinator_for_branch(target_branch.as_deref())
+            .await?;
+        let merge_result = self
+            .branch_merge_on_current_target(
+                &base_snapshot,
+                &source_snapshot,
+                &target_head_commit_id,
+                &source_head_commit_id,
+                is_fast_forward,
+            )
+            .await;
+        self.restore_coordinator(previous);
+
+        if merge_result.is_ok() && previous_branch == target_branch {
+            self.refresh().await?;
+        }
+
+        merge_result
+    }
+
+    /// Perform a three-way merge into the currently active (target) branch.
+    ///
+    /// Every table key present in the base, source or target snapshot is
+    /// classified: tables where source equals target, or where the source did
+    /// not change relative to base, are left alone; tables changed only on the
+    /// source adopt the source state; tables changed on both sides are staged
+    /// through `stage_streaming_table_merge`. Any collected conflicts abort the
+    /// merge. Candidates are validated, published, committed to the manifest
+    /// (when there are updates) and a merge commit is recorded. The graph index
+    /// is invalidated when any edge table was updated.
+    async fn branch_merge_on_current_target(
+        &mut self,
+        base_snapshot: &Snapshot,
+        source_snapshot: &Snapshot,
+        target_head_commit_id: &str,
+        source_head_commit_id: &str,
+        is_fast_forward: bool,
+    ) -> Result {
+        self.ensure_commit_graph_initialized().await?;
+        let target_snapshot = self.snapshot();
+
+        let mut table_keys = HashSet::new();
+        for entry in base_snapshot.entries() {
+            table_keys.insert(entry.table_key.clone());
+        }
+        for entry in source_snapshot.entries() {
+            table_keys.insert(entry.table_key.clone());
+        }
+        for entry in target_snapshot.entries() {
+            table_keys.insert(entry.table_key.clone());
+        }
+
+        // Sort the keys so tables are processed in a stable order.
+        let mut ordered_table_keys: Vec = table_keys.into_iter().collect();
+        ordered_table_keys.sort();
+
+        let mut conflicts = Vec::new();
+        let mut candidates: HashMap = HashMap::new();
+
+        for table_key in &ordered_table_keys {
+            let base_entry = base_snapshot.entry(table_key);
+            let source_entry = source_snapshot.entry(table_key);
+            let target_entry = target_snapshot.entry(table_key);
+            if same_manifest_state(source_entry, target_entry) {
+                continue;
+            }
+            if same_manifest_state(base_entry, source_entry) {
+                continue;
+            }
+            if same_manifest_state(base_entry, target_entry) {
+                candidates.insert(table_key.clone(), CandidateTableState::AdoptSourceState);
+                continue;
+            }
+
+            if let Some(staged) = stage_streaming_table_merge(
+                table_key,
+                self.catalog(),
+                base_snapshot,
+                source_snapshot,
+                &target_snapshot,
+                &mut conflicts,
+            )
+            .await?
+            {
+                candidates.insert(
+                    table_key.clone(),
+                    CandidateTableState::RewriteMerged(staged),
+                );
+            }
+        }
+
+        if !conflicts.is_empty() {
+            return Err(OmniError::MergeConflicts(conflicts));
+        }
+
+        validate_merge_candidates(self, source_snapshot, &target_snapshot, &candidates).await?;
+
+        let mut updates = Vec::new();
+        let mut changed_edge_tables = false;
+        for table_key in &ordered_table_keys {
+            let Some(candidate_state) = candidates.get(table_key) else {
+                continue;
+            };
+            let update = match candidate_state {
+                CandidateTableState::AdoptSourceState => {
+                    publish_adopted_source_state(
+                        self,
+                        self.catalog(),
+                        base_snapshot,
+                        source_snapshot,
+                        &target_snapshot,
+                        table_key,
+                    )
+                    .await?
+                }
+                CandidateTableState::RewriteMerged(staged) => {
+                    publish_rewritten_merge_table(self, table_key, staged).await?
+                }
+            };
+            if table_key.starts_with("edge:") {
+                changed_edge_tables = true;
+            }
+            updates.push(update);
+        }
+
+        // With nothing to publish the current version is reused for the merge
+        // commit record.
+        let manifest_version = if updates.is_empty() {
+            self.version()
+        } else {
+            self.commit_manifest_updates(&updates).await?
+        };
+        self.record_merge_commit(
+            manifest_version,
+            target_head_commit_id,
+            source_head_commit_id,
+        )
+        .await?;
+
+        if changed_edge_tables {
+            self.invalidate_graph_index().await;
+        }
+
+        Ok(if is_fast_forward {
+            MergeOutcome::FastForward
+        } else {
+            MergeOutcome::Merged
+        })
+    }
+
+    /// Insert one node or edge of type `type_name`.
+    ///
+    /// Assignments are resolved to literals first. For a node type the row id
+    /// is the value of the type's key property when it has one (non-string
+    /// values are rendered through `literal_to_sql` with surrounding single
+    /// quotes trimmed), otherwise a fresh ULID; keyed types are upserted and
+    /// unkeyed types appended. For an edge type the id is always a fresh ULID,
+    /// both endpoints are validated, the row is appended and the graph index is
+    /// invalidated. An unknown type name is an error.
+    async fn execute_insert(
+        &mut self,
+        type_name: &str,
+        assignments: &[IRAssignment],
+        params: &ParamMap,
+    ) -> Result {
+        let mut resolved: HashMap = HashMap::new();
+        for a in assignments {
+            resolved.insert(a.property.clone(), resolve_expr_value(&a.value, params)?);
+        }
+
+        let is_node = self.catalog().node_types.contains_key(type_name);
+        let is_edge = self.catalog().edge_types.contains_key(type_name);
+
+        if is_node {
+            let node_type = &self.catalog().node_types[type_name];
+            let schema = node_type.arrow_schema.clone();
+            let blob_props = node_type.blob_properties.clone();
+            let id = if let Some(key_prop) = node_type.key_property() {
+                match resolved.get(key_prop) {
+                    Some(Literal::String(s)) => s.clone(),
+                    Some(other) => literal_to_sql(other).trim_matches('\'').to_string(),
+                    None => {
+                        return Err(OmniError::manifest(format!(
+                            "insert missing @key property '{}'",
+                            key_prop
+                        )));
+                    }
+                }
+            } else {
+                ulid::Ulid::new().to_string()
+            };
+
+            let batch = build_insert_batch(&schema, &id, &resolved, &blob_props)?;
+            crate::loader::validate_value_constraints(&batch, node_type)?;
+            let has_key = node_type.key_property().is_some();
+            let (state, table_branch) = if has_key {
+                self.upsert_batch(type_name, true, schema, batch).await?
+            } else {
+                self.append_batch(type_name, true, schema, batch).await?
+            };
+
+            let table_key = format!("node:{}", type_name);
+            self.commit_updates(&[crate::db::SubTableUpdate {
+                table_key,
+                table_version: state.version,
+                table_branch,
+                row_count: state.row_count,
+                version_metadata: state.version_metadata,
+            }])
+            .await?;
+
+            Ok(MutationResult {
+                affected_nodes: 1,
+                affected_edges: 0,
+            })
+        } else if is_edge {
+            let edge_type = &self.catalog().edge_types[type_name];
+            let schema = edge_type.arrow_schema.clone();
+            let blob_props = edge_type.blob_properties.clone();
+            let id = ulid::Ulid::new().to_string();
+
+            let batch = build_insert_batch(&schema, &id, &resolved, &blob_props)?;
+            validate_edge_insert_endpoints(self, type_name, &resolved).await?;
+            let (state, table_branch) = self.append_batch(type_name, false, schema, batch).await?;
+
+            let table_key = format!("edge:{}", type_name);
+            self.commit_updates(&[crate::db::SubTableUpdate {
+                table_key,
+                table_version: state.version,
+                table_branch,
+                row_count: state.row_count,
+                version_metadata: state.version_metadata,
+            }])
+            .await?;
+
+            self.invalidate_graph_index().await;
+
+            Ok(MutationResult {
+                affected_nodes: 0,
+                affected_edges: 1,
+            })
+        } else {
+            Err(OmniError::manifest(format!("unknown type '{}'", type_name)))
+        }
+    }
+
+    /// Append a batch to the `node:<type_name>` or `edge:<type_name>` sub-table
+    /// (chosen by `is_node`), returning the table state produced by the append
+    /// together with the table branch reported by `open_for_mutation`.
+    /// `_schema` is currently unused.
+    async fn append_batch(
+        &self,
+        type_name: &str,
+        is_node: bool,
+        _schema: SchemaRef,
+        batch: RecordBatch,
+    ) -> Result<(crate::table_store::TableState, Option)> {
+        let table_key = if is_node {
+            format!("node:{}", type_name)
+        } else {
+            format!("edge:{}", type_name)
+        };
+        let (mut ds, full_path, table_branch) = self.open_for_mutation(&table_key).await?;
+        let state = self
+            .table_store()
+            .append_batch(&full_path, &mut ds, batch)
+            .await?;
+        Ok((state, table_branch))
+    }
+
+    /// Upsert a batch into a sub-table using merge_insert keyed by "id".
+    /// Used for @key node types to enforce uniqueness.
+ async fn upsert_batch( + &self, + type_name: &str, + is_node: bool, + _schema: SchemaRef, + batch: RecordBatch, + ) -> Result<(crate::table_store::TableState, Option)> { + let table_key = if is_node { + format!("node:{}", type_name) + } else { + format!("edge:{}", type_name) + }; + let (ds, full_path, table_branch) = self.open_for_mutation(&table_key).await?; + let state = self + .table_store() + .merge_insert_batch( + &full_path, + ds, + batch, + vec!["id".to_string()], + lance::dataset::WhenMatched::UpdateAll, + lance::dataset::WhenNotMatched::InsertAll, + ) + .await?; + Ok((state, table_branch)) + } + + async fn execute_update( + &mut self, + type_name: &str, + assignments: &[IRAssignment], + predicate: &IRMutationPredicate, + params: &ParamMap, + ) -> Result { + // Defense in depth: ensure this is a node type + if !self.catalog().node_types.contains_key(type_name) { + return Err(OmniError::manifest(format!( + "update is only supported for node types, not '{}'", + type_name + ))); + } + + // Reject updates to @key properties — identity is immutable + if let Some(key_prop) = self.catalog().node_types[type_name].key_property() { + if assignments.iter().any(|a| a.property == key_prop) { + return Err(OmniError::manifest(format!( + "cannot update @key property '{}' — delete and re-insert instead", + key_prop + ))); + } + } + + let pred_sql = predicate_to_sql(predicate, params, false)?; + let schema = self.catalog().node_types[type_name].arrow_schema.clone(); + let blob_props = self.catalog().node_types[type_name].blob_properties.clone(); + + let table_key = format!("node:{}", type_name); + let (ds, full_path, table_branch) = self.open_for_mutation(&table_key).await?; + let initial_version = ds.version().version; + + let non_blob_cols: Vec<&str> = schema + .fields() + .iter() + .filter(|f| !blob_props.contains(f.name())) + .map(|f| f.name().as_str()) + .collect(); + let batches = self + .table_store() + .scan( + &ds, + 
(!blob_props.is_empty()).then_some(non_blob_cols.as_slice()), + Some(&pred_sql), + None, + ) + .await?; + + if batches.is_empty() || batches.iter().all(|b| b.num_rows() == 0) { + return Ok(MutationResult { + affected_nodes: 0, + affected_edges: 0, + }); + } + + let matched = if batches.len() == 1 { + batches.into_iter().next().unwrap() + } else { + let s = batches[0].schema(); + arrow_select::concat::concat_batches(&s, &batches) + .map_err(|e| OmniError::Lance(e.to_string()))? + }; + + let affected_count = matched.num_rows(); + + let mut resolved: HashMap = HashMap::new(); + for a in assignments { + resolved.insert(a.property.clone(), resolve_expr_value(&a.value, params)?); + } + let updated = apply_assignments(&schema, &matched, &resolved, &blob_props)?; + crate::loader::validate_value_constraints(&updated, &self.catalog().node_types[type_name])?; + + // Re-open for merge_insert (scan consumed the dataset; + // version guard was already applied by open_for_mutation above) + let ds = self + .reopen_for_mutation( + &table_key, + &full_path, + table_branch.as_deref(), + initial_version, + ) + .await?; + let update_state = self + .table_store() + .merge_insert_batch( + &full_path, + ds, + updated, + vec!["id".to_string()], + lance::dataset::WhenMatched::UpdateAll, + lance::dataset::WhenNotMatched::DoNothing, + ) + .await?; + + self.commit_updates(&[crate::db::SubTableUpdate { + table_key, + table_version: update_state.version, + table_branch, + row_count: update_state.row_count, + version_metadata: update_state.version_metadata, + }]) + .await?; + + Ok(MutationResult { + affected_nodes: affected_count, + affected_edges: 0, + }) + } + + async fn execute_delete( + &mut self, + type_name: &str, + predicate: &IRMutationPredicate, + params: &ParamMap, + ) -> Result { + let is_node = self.catalog().node_types.contains_key(type_name); + if is_node { + self.execute_delete_node(type_name, predicate, params).await + } else { + self.execute_delete_edge(type_name, predicate, 
params).await + } + } + + async fn execute_delete_node( + &mut self, + type_name: &str, + predicate: &IRMutationPredicate, + params: &ParamMap, + ) -> Result { + let pred_sql = predicate_to_sql(predicate, params, false)?; + + let table_key = format!("node:{}", type_name); + let (ds, full_path, table_branch) = self.open_for_mutation(&table_key).await?; + let initial_version = ds.version().version; + + // Scan matching IDs for cascade + let batches = self + .table_store() + .scan(&ds, Some(&["id"]), Some(&pred_sql), None) + .await?; + + let deleted_ids: Vec = batches + .iter() + .flat_map(|batch| { + let ids = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + (0..ids.len()) + .map(|i| ids.value(i).to_string()) + .collect::>() + }) + .collect(); + + if deleted_ids.is_empty() { + return Ok(MutationResult { + affected_nodes: 0, + affected_edges: 0, + }); + } + + let affected_nodes = deleted_ids.len(); + + // Delete nodes (re-open needed because the scan consumed the dataset; + // version guard was already applied by open_for_mutation above) + let mut ds = self + .reopen_for_mutation( + &table_key, + &full_path, + table_branch.as_deref(), + initial_version, + ) + .await?; + let delete_state = self + .table_store() + .delete_where(&full_path, &mut ds, &pred_sql) + .await?; + + let mut updates = vec![crate::db::SubTableUpdate { + table_key, + table_version: delete_state.version, + table_branch: table_branch.clone(), + row_count: delete_state.row_count, + version_metadata: delete_state.version_metadata, + }]; + + let mut affected_edges = 0usize; + let escaped: Vec = deleted_ids + .iter() + .map(|id| format!("'{}'", id.replace('\'', "''"))) + .collect(); + let id_list = escaped.join(", "); + + let edge_info: Vec<(String, String, String)> = self + .catalog() + .edge_types + .iter() + .map(|(name, et)| (name.clone(), et.from_type.clone(), et.to_type.clone())) + .collect(); + + for (edge_name, from_type, to_type) in &edge_info { + let mut cascade_filters = 
Vec::new(); + if from_type == type_name { + cascade_filters.push(format!("src IN ({})", id_list)); + } + if to_type == type_name { + cascade_filters.push(format!("dst IN ({})", id_list)); + } + if cascade_filters.is_empty() { + continue; + } + + let edge_table_key = format!("edge:{}", edge_name); + let cascade_filter = cascade_filters.join(" OR "); + let (mut edge_ds, edge_full_path, edge_table_branch) = + self.open_for_mutation(&edge_table_key).await?; + + let edge_delete = self + .table_store() + .delete_where(&edge_full_path, &mut edge_ds, &cascade_filter) + .await?; + + affected_edges += edge_delete.deleted_rows; + + if edge_delete.deleted_rows > 0 { + updates.push(crate::db::SubTableUpdate { + table_key: edge_table_key, + table_version: edge_delete.version, + table_branch: edge_table_branch, + row_count: edge_delete.row_count, + version_metadata: edge_delete.version_metadata, + }); + } + } + + self.commit_updates(&updates).await?; + + if affected_edges > 0 { + self.invalidate_graph_index().await; + } + + Ok(MutationResult { + affected_nodes, + affected_edges, + }) + } + + async fn execute_delete_edge( + &mut self, + type_name: &str, + predicate: &IRMutationPredicate, + params: &ParamMap, + ) -> Result { + let pred_sql = predicate_to_sql(predicate, params, true)?; + + let table_key = format!("edge:{}", type_name); + let (mut ds, full_path, table_branch) = self.open_for_mutation(&table_key).await?; + + let delete_state = self + .table_store() + .delete_where(&full_path, &mut ds, &pred_sql) + .await?; + let affected = delete_state.deleted_rows; + + if affected > 0 { + self.commit_updates(&[crate::db::SubTableUpdate { + table_key, + table_version: delete_state.version, + table_branch, + row_count: delete_state.row_count, + version_metadata: delete_state.version_metadata, + }]) + .await?; + + self.invalidate_graph_index().await; + } + + Ok(MutationResult { + affected_nodes: 0, + affected_edges: affected, + }) + } +} + +fn enrich_mutation_params(params: &ParamMap) 
/// Failpoint hook.
///
/// Without the `failpoints` cargo feature this always returns `Ok(())`.
/// With the feature enabled it evaluates the `fail` crate failpoint called
/// `_name`; the closure passed to `fail_point!` produces a manifest error
/// whose message includes the failpoint name.
/// NOTE(review): presumably that error is returned only when the failpoint is
/// configured with a `return` action; confirm against the `fail` crate docs.
pub(crate) fn maybe_fail(_name: &str) -> Result<()> {
    #[cfg(feature = "failpoints")]
    {
        // Rebind without the underscore; the parameter is only underscored so
        // it does not warn as unused when the feature is off.
        let name = _name;
        fail::fail_point!(name, |_| {
            return Err(crate::error::OmniError::manifest(format!(
                "injected failpoint triggered: {}",
                name
            )));
        });
    }
    Ok(())
}

/// Guard that configures a failpoint on construction and removes it again
/// when dropped. Only compiled with the `failpoints` feature.
#[cfg(feature = "failpoints")]
pub struct ScopedFailPoint {
    // Name of the configured failpoint, kept so `Drop` can remove it.
    name: String,
}

#[cfg(feature = "failpoints")]
impl ScopedFailPoint {
    /// Configure failpoint `name` with `action` via `fail::cfg`.
    ///
    /// Panics with "configure failpoint" if `fail::cfg` returns an error.
    pub fn new(name: &str, action: &str) -> Self {
        fail::cfg(name, action).expect("configure failpoint");
        Self {
            name: name.to_string(),
        }
    }
}

#[cfg(feature = "failpoints")]
impl Drop for ScopedFailPoint {
    /// Remove the failpoint that `new` configured.
    fn drop(&mut self) {
        fail::remove(&self.name);
    }
}
/// CSR (Compressed Sparse Row) adjacency index.
///
/// Nodes are identified by dense `u32` indices. The neighbors of node `i`
/// are stored contiguously in `targets[offsets[i]..offsets[i + 1]]`, in the
/// order their edges appeared in the input to [`CsrIndex::build`].
#[derive(Debug, Clone)]
pub struct CsrIndex {
    /// offsets[i] .. offsets[i+1] gives the neighbor range for node i.
    /// Always has `num_nodes + 1` entries.
    offsets: Vec<u32>,
    /// Dense indices of destination nodes.
    targets: Vec<u32>,
}

impl CsrIndex {
    /// Build the index for `num_nodes` source nodes from `(src, dst)` pairs.
    ///
    /// Every `src` must be `< num_nodes`; a larger value panics on the
    /// out-of-bounds index. `dst` values are stored as-is and not checked.
    pub(crate) fn build(num_nodes: usize, edges: &[(u32, u32)]) -> Self {
        // Count outgoing edges per source
        let mut counts = vec![0u32; num_nodes];
        for &(src, _) in edges {
            counts[src as usize] += 1;
        }

        // Build offset array (prefix sum)
        let mut offsets = Vec::with_capacity(num_nodes + 1);
        let mut running = 0u32;
        offsets.push(running);
        for &count in &counts {
            running += count;
            offsets.push(running);
        }

        // Fill targets; `cursors[s]` is how many of s's slots are used so far.
        let mut targets = vec![0u32; edges.len()];
        let mut cursors = vec![0u32; num_nodes];
        for &(src, dst) in edges {
            let s = src as usize;
            let pos = offsets[s] + cursors[s];
            targets[pos as usize] = dst;
            cursors[s] += 1;
        }

        Self { offsets, targets }
    }

    /// Return the dense indices of neighbors for a given dense node index.
    ///
    /// A `node` outside the range the index was built for has no neighbors
    /// and yields an empty slice.
    pub fn neighbors(&self, node: u32) -> &[u32] {
        match self.neighbor_range(node) {
            Some(range) => &self.targets[range],
            None => &[],
        }
    }

    /// Check if a node has any outgoing edges. O(1), no allocation.
    ///
    /// Returns `false` for a `node` outside the range the index was built for.
    pub fn has_neighbors(&self, node: u32) -> bool {
        self.neighbor_range(node)
            .map_or(false, |range| range.start < range.end)
    }

    /// Range of `targets` that belongs to `node`, or `None` when `node` is
    /// not covered by `offsets`.
    fn neighbor_range(&self, node: u32) -> Option<std::ops::Range<usize>> {
        let n = node as usize;
        let start = *self.offsets.get(n)? as usize;
        let end = *self.offsets.get(n.checked_add(1)?)? as usize;
        Some(start..end)
    }
}
No node data cached — just adjacency. +#[derive(Debug, Clone)] +pub struct GraphIndex { + /// Dense index per node type (built from edge src/dst columns). + type_indices: HashMap, + /// Outgoing adjacency per edge type. + csr: HashMap, + /// Incoming adjacency per edge type. + csc: HashMap, +} + +impl GraphIndex { + /// Build a graph index by scanning edge sub-tables from a snapshot. + pub async fn build( + snapshot: &Snapshot, + edge_types: &HashMap, // edge_name → (from_type, to_type) + ) -> Result { + let mut type_indices: HashMap = HashMap::new(); + let mut csr = HashMap::new(); + let mut csc = HashMap::new(); + + // Phase 1: Scan all edges, build TypeIndices and collect edge pairs + let mut edge_pairs: HashMap> = HashMap::new(); + + for (edge_name, (from_type, to_type)) in edge_types { + let table_key = format!("edge:{}", edge_name); + if snapshot.entry(&table_key).is_none() { + continue; + } + + let ds = snapshot.open(&table_key).await?; + + let batches: Vec = ds + .scan() + .project(&["src", "dst"]) + .map_err(|e| OmniError::Lance(e.to_string()))? + .try_into_stream() + .await + .map_err(|e| OmniError::Lance(e.to_string()))? 
+ .try_collect() + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + + type_indices + .entry(from_type.clone()) + .or_insert_with(TypeIndex::new); + type_indices + .entry(to_type.clone()) + .or_insert_with(TypeIndex::new); + + let mut edges: Vec<(u32, u32)> = Vec::new(); + for batch in &batches { + let srcs = string_column(batch, "src")?; + let dsts = string_column(batch, "dst")?; + + for i in 0..batch.num_rows() { + let src_dense = type_indices + .get_mut(from_type) + .unwrap() + .get_or_insert(srcs.value(i)); + let dst_dense = type_indices + .get_mut(to_type) + .unwrap() + .get_or_insert(dsts.value(i)); + edges.push((src_dense, dst_dense)); + } + } + edge_pairs.insert(edge_name.clone(), edges); + } + + // Phase 2: Build CSR/CSC using final TypeIndex sizes + for (edge_name, (from_type, to_type)) in edge_types { + let Some(edges) = edge_pairs.get(edge_name) else { + continue; + }; + + let src_count = type_indices[from_type].len(); + let dst_count = type_indices[to_type].len(); + + csr.insert(edge_name.clone(), CsrIndex::build(src_count, edges)); + + let reversed: Vec<(u32, u32)> = edges.iter().map(|&(s, d)| (d, s)).collect(); + csc.insert(edge_name.clone(), CsrIndex::build(dst_count, &reversed)); + } + + Ok(Self { + type_indices, + csr, + csc, + }) + } + + pub fn type_index(&self, type_name: &str) -> Option<&TypeIndex> { + self.type_indices.get(type_name) + } + + pub fn csr(&self, edge_type: &str) -> Option<&CsrIndex> { + self.csr.get(edge_type) + } + + pub fn csc(&self, edge_type: &str) -> Option<&CsrIndex> { + self.csc.get(edge_type) + } + + #[cfg(test)] + pub(crate) fn empty_for_test() -> Self { + Self { + type_indices: HashMap::new(), + csr: HashMap::new(), + csc: HashMap::new(), + } + } +} + +fn string_column<'a>(batch: &'a arrow_array::RecordBatch, name: &str) -> Result<&'a StringArray> { + batch + .column_by_name(name) + .ok_or_else(|| { + OmniError::manifest_internal(format!("graph index batch missing '{name}' column")) + })? 
+ .as_any() + .downcast_ref::() + .ok_or_else(|| { + OmniError::manifest_internal(format!("graph index column '{name}' is not Utf8")) + }) +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use arrow_array::UInt64Array; + use arrow_schema::{DataType, Field, Schema}; + + use super::*; + + #[test] + fn type_index_round_trip() { + let mut idx = TypeIndex::new(); + let a = idx.get_or_insert("Alice"); + let b = idx.get_or_insert("Bob"); + let c = idx.get_or_insert("Charlie"); + + assert_eq!(idx.to_dense("Alice"), Some(a)); + assert_eq!(idx.to_dense("Bob"), Some(b)); + assert_eq!(idx.to_dense("Charlie"), Some(c)); + + assert_eq!(idx.to_id(a), Some("Alice")); + assert_eq!(idx.to_id(b), Some("Bob")); + assert_eq!(idx.to_id(c), Some("Charlie")); + assert_eq!(idx.len(), 3); + } + + #[test] + fn type_index_idempotent_insert() { + let mut idx = TypeIndex::new(); + let a1 = idx.get_or_insert("Alice"); + let a2 = idx.get_or_insert("Alice"); + assert_eq!(a1, a2); + assert_eq!(idx.len(), 1); + } + + #[test] + fn type_index_unknown_returns_none() { + let idx = TypeIndex::new(); + assert_eq!(idx.to_dense("unknown"), None); + assert_eq!(idx.to_id(999), None); + } + + #[test] + fn csr_neighbors_correct() { + // Graph: 0→1, 0→2, 1→2 + let edges = vec![(0, 1), (0, 2), (1, 2)]; + let csr = CsrIndex::build(3, &edges); + + let mut n0: Vec = csr.neighbors(0).to_vec(); + n0.sort(); + assert_eq!(n0, vec![1, 2]); + + assert_eq!(csr.neighbors(1), &[2]); + assert_eq!(csr.neighbors(2), &[] as &[u32]); + } + + #[test] + fn csr_empty_graph() { + let csr = CsrIndex::build(3, &[]); + assert_eq!(csr.neighbors(0), &[] as &[u32]); + assert_eq!(csr.neighbors(1), &[] as &[u32]); + assert_eq!(csr.neighbors(2), &[] as &[u32]); + assert!(!csr.has_neighbors(0)); + } + + #[test] + fn csr_has_neighbors() { + // 0→1, 1→2 + let csr = CsrIndex::build(3, &[(0, 1), (1, 2)]); + assert!(csr.has_neighbors(0)); + assert!(csr.has_neighbors(1)); + assert!(!csr.has_neighbors(2)); + } + + #[test] + fn 
/// `@key` / `@unique` annotations collected from the node types of a schema.
#[derive(Debug, Default)]
pub(crate) struct NodeConstraintAnnotations {
    /// Node type name → name of its `@key` property. A node type has at most
    /// one entry here.
    pub(crate) key_props: HashMap<String, String>,
    /// Node type name → names of the properties that must hold unique values.
    /// As filled in by `load_node_constraint_annotations`, the list is sorted,
    /// de-duplicated, and includes the `@key` property when there is one.
    pub(crate) unique_props: HashMap<String, Vec<String>>,
}
multiple @key properties; only one is currently supported", + node.name + ))); + } + if prop.unique { + node_unique_props.push(prop.name.clone()); + } + } + + if let Some(prop_name) = node_key_prop { + if !node_unique_props.contains(&prop_name) { + node_unique_props.push(prop_name.clone()); + } + constraints.key_props.insert(node.name.clone(), prop_name); + } + if !node_unique_props.is_empty() { + node_unique_props.sort(); + node_unique_props.dedup(); + constraints + .unique_props + .insert(node.name.clone(), node_unique_props); + } + } + + Ok(constraints) +} + +pub(crate) fn enforce_node_unique_constraints( + storage: &DatasetAccumulator, + unique_props: &HashMap>, +) -> Result<()> { + for (type_name, properties) in unique_props { + let Some(batch) = storage.get_all_nodes(type_name)? else { + continue; + }; + + for property in properties { + let prop_idx = + node_property_index(batch.schema().as_ref(), property).ok_or_else(|| { + NanoError::Storage(format!( + "node type {} missing @unique property {}", + type_name, property + )) + })?; + let arr = batch.column(prop_idx); + let mut seen: HashMap = HashMap::new(); + for row in 0..batch.num_rows() { + let Some(value) = unique_value_string(arr, row, type_name, property)? 
else { + continue; + }; + if let Some(prev_row) = seen.insert(value.clone(), row) { + return Err(NanoError::UniqueConstraint { + type_name: type_name.clone(), + property: property.clone(), + value, + first_row: prev_row, + second_row: row, + }); + } + } + } + } + Ok(()) +} + +#[cfg(test)] +pub(crate) fn collect_incoming_node_types(data_source: &str) -> Result> { + let mut node_types = HashSet::new(); + for line in data_source.lines() { + let line = line.trim(); + if line.is_empty() || line.starts_with("//") { + continue; + } + + let obj: serde_json::Value = serde_json::from_str(line) + .map_err(|e| NanoError::Storage(format!("JSON parse error: {}", e)))?; + if let Some(type_name) = obj.get("type").and_then(|v| v.as_str()) { + node_types.insert(type_name.to_string()); + } + } + Ok(node_types) +} + +pub(crate) fn build_name_seed_for_keyed_load( + storage: &DatasetAccumulator, + key_props: &HashMap, +) -> Result> { + let mut seed = HashMap::new(); + + for (type_name, key_prop) in key_props { + let Some(batch) = storage.get_all_nodes(type_name)? 
else { + continue; + }; + + let key_idx = node_property_index(batch.schema().as_ref(), key_prop).ok_or_else(|| { + NanoError::Storage(format!( + "node type {} missing @key property {}", + type_name, key_prop + )) + })?; + let key_arr = batch.column(key_idx).clone(); + let id_arr = batch + .column(0) + .as_any() + .downcast_ref::() + .ok_or_else(|| { + NanoError::Storage(format!("node type {} has non-UInt64 id column", type_name)) + })?; + + for row in 0..batch.num_rows() { + let key = key_value_string(&key_arr, row, key_prop)?; + seed.insert((type_name.clone(), key), id_arr.value(row)); + } + } + + Ok(seed) +} + +pub(crate) fn build_name_seed_for_append( + storage: &DatasetAccumulator, + key_props: &HashMap, +) -> Result> { + build_name_seed_for_keyed_load(storage, key_props) +} + +pub(crate) fn node_property_index(schema: &Schema, prop_name: &str) -> Option { + schema + .fields() + .iter() + .enumerate() + .skip(1) + .find_map(|(idx, field)| (field.name() == prop_name).then_some(idx)) +} + +pub(crate) fn node_property_field<'a>(schema: &'a Schema, prop_name: &str) -> Option<&'a Field> { + node_property_index(schema, prop_name).map(|idx| schema.field(idx)) +} + +pub(crate) fn key_value_string(array: &ArrayRef, row: usize, prop_name: &str) -> Result { + let value = scalar_value_string(array, row, "key", None, prop_name)?; + if let Some(value) = value { + return Ok(value); + } + Err(NanoError::Storage(format!( + "@key property {} cannot be null", + prop_name + ))) +} + +fn unique_value_string( + array: &ArrayRef, + row: usize, + type_name: &str, + prop_name: &str, +) -> Result> { + scalar_value_string(array, row, "unique", Some(type_name), prop_name) +} + +fn scalar_value_string( + array: &ArrayRef, + row: usize, + annotation: &str, + type_name: Option<&str>, + prop_name: &str, +) -> Result> { + if array.is_null(row) { + return Ok(None); + } + + let value = match array.data_type() { + DataType::Utf8 => array + .as_any() + .downcast_ref::() + .map(|a| 
a.value(row).to_string()), + DataType::Boolean => array + .as_any() + .downcast_ref::() + .map(|a| a.value(row).to_string()), + DataType::Int32 => array + .as_any() + .downcast_ref::() + .map(|a| a.value(row).to_string()), + DataType::Int64 => array + .as_any() + .downcast_ref::() + .map(|a| a.value(row).to_string()), + DataType::UInt32 => array + .as_any() + .downcast_ref::() + .map(|a| a.value(row).to_string()), + DataType::UInt64 => array + .as_any() + .downcast_ref::() + .map(|a| a.value(row).to_string()), + DataType::Float32 => array + .as_any() + .downcast_ref::() + .map(|a| a.value(row).to_string()), + DataType::Float64 => array + .as_any() + .downcast_ref::() + .map(|a| a.value(row).to_string()), + DataType::Date32 => array + .as_any() + .downcast_ref::() + .map(|a| a.value(row).to_string()), + DataType::Date64 => array + .as_any() + .downcast_ref::() + .map(|a| a.value(row).to_string()), + _ => None, + }; + + let value = value.ok_or_else(|| { + let target = match type_name { + Some(name) => format!("{}.{}", name, prop_name), + None => prop_name.to_string(), + }; + NanoError::Storage(format!( + "unsupported @{} data type {:?} for {}", + annotation, + array.data_type(), + target + )) + })?; + + Ok(Some(value)) +} + +#[cfg(test)] +mod tests { + use std::collections::{HashMap, HashSet}; + + use arrow_array::StringArray; + + use crate::catalog::schema_ir::{build_catalog_from_ir, build_schema_ir}; + use crate::schema::parser::parse_schema; + + use super::super::jsonl::load_jsonl_data; + use super::*; + + fn build_schema_ir_and_storage(schema_src: &str) -> (SchemaIR, DatasetAccumulator) { + let schema = parse_schema(schema_src).unwrap(); + let ir = build_schema_ir(&schema).unwrap(); + let catalog = build_catalog_from_ir(&ir).unwrap(); + (ir, DatasetAccumulator::new(catalog)) + } + + #[test] + fn load_node_constraint_annotations_collects_key_and_unique() { + let schema = r#"node Person { + name: String @key + email: String @unique + alias: String? 
@unique +}"#; + let (ir, _) = build_schema_ir_and_storage(schema); + let annotations = load_node_constraint_annotations(&ir).unwrap(); + + assert_eq!(annotations.key_props.get("Person").unwrap(), "name"); + assert_eq!( + annotations.unique_props.get("Person").unwrap(), + &vec!["alias".to_string(), "email".to_string(), "name".to_string()] + ); + } + + #[test] + fn collect_incoming_node_types_ignores_comments_and_blanks() { + let data = r#" +// comment +{"type":"Person","data":{"name":"Alice"}} + +{"edge":"Knows","from":"Alice","to":"Bob"} +{"type":"Company","data":{"name":"Acme"}} +"#; + let types = collect_incoming_node_types(data).unwrap(); + assert_eq!( + types, + HashSet::from(["Person".to_string(), "Company".to_string()]) + ); + } + + #[test] + fn enforce_node_unique_constraints_detects_duplicate_non_null() { + let schema = r#"node Person { + name: String + email: String? @unique +}"#; + let (_, mut storage) = build_schema_ir_and_storage(schema); + let key_props = HashMap::new(); + load_jsonl_data( + &mut storage, + r#"{"type":"Person","data":{"name":"Alice","email":"dupe@example.com"}} +{"type":"Person","data":{"name":"Bob","email":"dupe@example.com"}}"#, + &key_props, + ) + .unwrap(); + + let unique_props = HashMap::from([("Person".to_string(), vec!["email".to_string()])]); + let err = enforce_node_unique_constraints(&storage, &unique_props).unwrap_err(); + match err { + NanoError::UniqueConstraint { + type_name, + property, + value, + .. + } => { + assert_eq!(type_name, "Person"); + assert_eq!(property, "email"); + assert_eq!(value, "dupe@example.com"); + } + other => panic!("expected UniqueConstraint, got {other}"), + } + } + + #[test] + fn enforce_node_unique_constraints_allows_multiple_nulls() { + let schema = r#"node Person { + name: String + nick: String? 
@unique +}"#; + let (_, mut storage) = build_schema_ir_and_storage(schema); + let key_props = HashMap::new(); + load_jsonl_data( + &mut storage, + r#"{"type":"Person","data":{"name":"Alice","nick":null}} +{"type":"Person","data":{"name":"Bob","nick":null}}"#, + &key_props, + ) + .unwrap(); + + let unique_props = HashMap::from([("Person".to_string(), vec!["nick".to_string()])]); + enforce_node_unique_constraints(&storage, &unique_props).unwrap(); + } + + #[test] + fn enforce_node_unique_constraints_uses_user_property_named_id() { + let schema = r#"node Person { + id: String @unique + name: String +}"#; + let (_, mut storage) = build_schema_ir_and_storage(schema); + let key_props = HashMap::new(); + load_jsonl_data( + &mut storage, + r#"{"type":"Person","data":{"id":"user-1","name":"Alice"}} +{"type":"Person","data":{"id":"user-1","name":"Bob"}}"#, + &key_props, + ) + .unwrap(); + + let unique_props = HashMap::from([("Person".to_string(), vec!["id".to_string()])]); + let err = enforce_node_unique_constraints(&storage, &unique_props).unwrap_err(); + match err { + NanoError::UniqueConstraint { + type_name, + property, + value, + .. 
+ } => { + assert_eq!(type_name, "Person"); + assert_eq!(property, "id"); + assert_eq!(value, "user-1"); + } + other => panic!("expected UniqueConstraint, got {other}"), + } + } + + #[test] + fn build_name_seed_for_keyed_load_uses_declared_key_property() { + let schema = r#"node Person { + uid: String @key + name: String +} +node Company { + name: String +}"#; + let (_, mut storage) = build_schema_ir_and_storage(schema); + let key_props = HashMap::from([("Person".to_string(), "uid".to_string())]); + load_jsonl_data( + &mut storage, + r#"{"type":"Person","data":{"uid":"u1","name":"Alice"}} +{"type":"Company","data":{"name":"Acme"}}"#, + &key_props, + ) + .unwrap(); + + let seed = build_name_seed_for_keyed_load(&storage, &key_props).unwrap(); + + assert!(seed.contains_key(&("Person".to_string(), "u1".to_string()))); + assert!(!seed.contains_key(&("Company".to_string(), "Acme".to_string()))); + } + + #[test] + fn build_name_seed_for_keyed_load_uses_user_property_named_id() { + let schema = r#"node Person { + id: String @key + name: String +}"#; + let (_, mut storage) = build_schema_ir_and_storage(schema); + let key_props = HashMap::from([("Person".to_string(), "id".to_string())]); + load_jsonl_data( + &mut storage, + r#"{"type":"Person","data":{"id":"user-1","name":"Alice"}}"#, + &key_props, + ) + .unwrap(); + + let seed = build_name_seed_for_keyed_load(&storage, &key_props).unwrap(); + assert!(seed.contains_key(&("Person".to_string(), "user-1".to_string()))); + } + + #[test] + fn build_name_seed_for_append_keeps_all_existing_keyed_nodes() { + let schema = r#"node Person { + uid: String @key + name: String +} +node Company { + name: String +}"#; + let (_, mut storage) = build_schema_ir_and_storage(schema); + let key_props = HashMap::from([("Person".to_string(), "uid".to_string())]); + load_jsonl_data( + &mut storage, + r#"{"type":"Person","data":{"uid":"u1","name":"Alice"}} +{"type":"Company","data":{"name":"Acme"}}"#, + &key_props, + ) + .unwrap(); + + let seed = 
build_name_seed_for_append(&storage, &key_props).unwrap(); + assert!(seed.contains_key(&("Person".to_string(), "u1".to_string()))); + assert!(!seed.contains_key(&("Company".to_string(), "Acme".to_string()))); + } + + #[test] + fn key_value_string_rejects_null() { + let arr: ArrayRef = std::sync::Arc::new(StringArray::from(vec![Some("x"), None])); + assert_eq!(key_value_string(&arr, 0, "name").unwrap(), "x"); + let err = key_value_string(&arr, 1, "name").unwrap_err(); + assert!(err.to_string().contains("cannot be null")); + } +} diff --git a/crates/omnigraph/src/loader/embeddings.rs b/crates/omnigraph/src/loader/embeddings.rs new file mode 100644 index 0000000..58ecb93 --- /dev/null +++ b/crates/omnigraph/src/loader/embeddings.rs @@ -0,0 +1,1732 @@ +use std::collections::{BTreeMap, HashMap, HashSet, VecDeque}; +use std::io::{BufRead, BufWriter, Write}; +use std::path::{Path, PathBuf}; +use std::time::{Duration, SystemTime, UNIX_EPOCH}; + +use serde::{Deserialize, Serialize}; + +use crate::catalog::schema_ir::{PropDef, SchemaIR}; +use crate::embedding::EmbeddingClient; +use crate::error::{NanoError, Result}; +use crate::store::manifest::hash_string; +use crate::types::ScalarType; + +const EMBEDDING_CACHE_FILENAME: &str = "_embedding_cache.jsonl"; +const DEFAULT_EMBED_BATCH_SIZE: usize = 64; +const DEFAULT_EMBED_CHUNK_CHARS: usize = 0; +const DEFAULT_EMBED_CHUNK_OVERLAP_CHARS: usize = 128; +const DEFAULT_EMBED_CACHE_MAX_ENTRIES: usize = 50_000; +const DEFAULT_EMBED_CACHE_LOCK_STALE_SECS: usize = 60; +const EMBEDDING_CACHE_LOCK_RETRIES: usize = 200; +const EMBEDDING_CACHE_LOCK_RETRY_DELAY_MS: u64 = 10; + +#[derive(Debug, Clone)] +pub(crate) struct EmbedSpec { + pub target_prop: String, + pub source_prop: String, + pub dim: usize, +} + +#[derive(Debug, Clone)] +pub(crate) struct EmbedValueRequest { + pub source_text: String, + pub dim: usize, +} + +#[cfg_attr(not(test), allow(dead_code))] +#[derive(Debug, Clone)] +struct PendingAssignment { + line_index: usize, + 
target_prop: String, + source_text: String, + dim: usize, + content_hash: String, +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +struct CacheKey { + model: String, + dim: usize, + content_hash: String, + chunk_chars: usize, + chunk_overlap_chars: usize, +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +struct CacheRecord { + model: String, + dim: usize, + content_hash: String, + vector: Vec, + #[serde(default)] + chunk_chars: usize, + #[serde(default)] + chunk_overlap_chars: usize, +} + +enum ParsedLine { + Raw(String), + Json(serde_json::Value), +} + +struct StreamPendingLine { + line_id: usize, + line: ParsedLine, + missing_assignments: usize, +} + +#[derive(Debug, Clone)] +struct StreamPendingAssignment { + line_id: usize, + target_prop: String, + source_text: String, + dim: usize, + content_hash: String, +} + +impl StreamPendingAssignment { + fn cache_key(&self, model: &str, chunking: EmbedChunkingConfig) -> CacheKey { + CacheKey { + model: model.to_string(), + dim: self.dim, + content_hash: self.content_hash.clone(), + chunk_chars: chunking.chunk_chars, + chunk_overlap_chars: chunking.chunk_overlap_chars, + } + } +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +struct EmbedChunkingConfig { + chunk_chars: usize, + chunk_overlap_chars: usize, +} + +impl EmbedChunkingConfig { + fn from_env() -> Self { + let chunk_chars = parse_env_usize("NANOGRAPH_EMBED_CHUNK_CHARS", DEFAULT_EMBED_CHUNK_CHARS); + let overlap = parse_env_usize( + "NANOGRAPH_EMBED_CHUNK_OVERLAP_CHARS", + DEFAULT_EMBED_CHUNK_OVERLAP_CHARS, + ); + Self::new(chunk_chars, overlap) + } + + fn new(chunk_chars: usize, chunk_overlap_chars: usize) -> Self { + let chunk_overlap_chars = if chunk_chars == 0 { + 0 + } else { + chunk_overlap_chars.min(chunk_chars.saturating_sub(1)) + }; + Self { + chunk_chars, + chunk_overlap_chars, + } + } + + fn is_enabled(self) -> bool { + self.chunk_chars > 0 + } +} + +#[allow(dead_code)] +pub(crate) async fn materialize_embeddings_for_load( + db_path: &Path, + 
schema_ir: &SchemaIR, + data_source: &str, +) -> Result { + materialize_embeddings_for_load_inner(db_path, schema_ir, data_source, None).await +} + +#[cfg_attr(not(test), allow(dead_code))] +async fn materialize_embeddings_for_load_inner( + db_path: &Path, + schema_ir: &SchemaIR, + data_source: &str, + client_override: Option<&EmbeddingClient>, +) -> Result { + materialize_embeddings_for_load_inner_with_chunking( + db_path, + schema_ir, + data_source, + client_override, + EmbedChunkingConfig::from_env(), + ) + .await +} + +pub(crate) fn has_embedding_specs(schema_ir: &SchemaIR) -> bool { + schema_ir.node_types().any(|node| { + node.properties + .iter() + .any(|prop| prop.embed_source.is_some()) + }) +} + +pub(crate) async fn materialize_embeddings_for_load_to_tempfile( + db_path: &Path, + schema_ir: &SchemaIR, + reader: R, +) -> Result { + materialize_embeddings_for_load_to_tempfile_inner(db_path, schema_ir, reader, None).await +} + +pub(crate) async fn resolve_embedding_requests( + db_path: &Path, + requests: &[EmbedValueRequest], +) -> Result>> { + resolve_embedding_requests_with_chunking(db_path, requests, EmbedChunkingConfig::from_env()) + .await +} + +async fn materialize_embeddings_for_load_to_tempfile_inner( + db_path: &Path, + schema_ir: &SchemaIR, + reader: R, + client_override: Option<&EmbeddingClient>, +) -> Result { + materialize_embeddings_for_load_to_tempfile_inner_with_chunking( + db_path, + schema_ir, + reader, + client_override, + EmbedChunkingConfig::from_env(), + ) + .await +} + +async fn resolve_embedding_requests_with_chunking( + db_path: &Path, + requests: &[EmbedValueRequest], + chunking: EmbedChunkingConfig, +) -> Result>> { + if requests.is_empty() { + return Ok(Vec::new()); + } + + let cache_path = db_path.join(EMBEDDING_CACHE_FILENAME); + let mut cache = load_embedding_cache(&cache_path)?; + let client = EmbeddingClient::from_env() + .map_err(|err| NanoError::Storage(format!("embedding initialization failed: {}", err)))?; + let model = 
client.model().to_string(); + let batch_size = parse_env_usize("NANOGRAPH_EMBED_BATCH_SIZE", DEFAULT_EMBED_BATCH_SIZE); + + let mut results: Vec>> = vec![None; requests.len()]; + let mut missing_by_dim: BTreeMap> = BTreeMap::new(); + let mut missing_indices: HashMap> = HashMap::new(); + + for (idx, request) in requests.iter().enumerate() { + if request.dim == 0 { + return Err(NanoError::Storage( + "embedding dimension must be greater than zero".to_string(), + )); + } + + let key = CacheKey { + model: model.clone(), + dim: request.dim, + content_hash: hash_string(&request.source_text), + chunk_chars: chunking.chunk_chars, + chunk_overlap_chars: chunking.chunk_overlap_chars, + }; + + if let Some(vector) = cache.get(&key) { + results[idx] = Some(vector.clone()); + continue; + } + + missing_indices.entry(key.clone()).or_default().push(idx); + let entries = missing_by_dim.entry(request.dim).or_default(); + if !entries.iter().any(|(existing, _)| existing == &key) { + entries.push((key, request.source_text.clone())); + } + } + + let mut new_cache_records = Vec::new(); + for (dim, entries) in missing_by_dim { + if chunking.is_enabled() { + for (key, text) in entries { + let vector = + embed_text_with_chunking(&client, &text, dim, batch_size, chunking).await?; + if vector.len() != dim { + return Err(NanoError::Storage(format!( + "embedding dimension mismatch for {}: expected {}, got {}", + key.content_hash, + dim, + vector.len() + ))); + } + cache.insert(key.clone(), vector.clone()); + new_cache_records.push(CacheRecord { + model: key.model.clone(), + dim: key.dim, + content_hash: key.content_hash.clone(), + vector, + chunk_chars: key.chunk_chars, + chunk_overlap_chars: key.chunk_overlap_chars, + }); + } + continue; + } + + for chunk in entries.chunks(batch_size.max(1)) { + let texts: Vec = chunk.iter().map(|(_, text)| text.clone()).collect(); + let vectors = client + .embed_texts(&texts, dim) + .await + .map_err(|err| NanoError::Storage(format!("embedding request failed: 
{}", err)))?; + if vectors.len() != chunk.len() { + return Err(NanoError::Storage(format!( + "embedding response size mismatch: expected {}, got {}", + chunk.len(), + vectors.len() + ))); + } + + for ((key, _), vector) in chunk.iter().zip(vectors.into_iter()) { + if vector.len() != dim { + return Err(NanoError::Storage(format!( + "embedding dimension mismatch for {}: expected {}, got {}", + key.content_hash, + dim, + vector.len() + ))); + } + cache.insert(key.clone(), vector.clone()); + new_cache_records.push(CacheRecord { + model: key.model.clone(), + dim: key.dim, + content_hash: key.content_hash.clone(), + vector, + chunk_chars: key.chunk_chars, + chunk_overlap_chars: key.chunk_overlap_chars, + }); + } + } + } + + append_embedding_cache(&cache_path, &new_cache_records)?; + + for (key, indices) in missing_indices { + let vector = cache.get(&key).ok_or_else(|| { + NanoError::Storage(format!( + "embedding cache miss for content hash {}", + key.content_hash + )) + })?; + for idx in indices { + results[idx] = Some(vector.clone()); + } + } + + results + .into_iter() + .enumerate() + .map(|(idx, vector)| { + vector.ok_or_else(|| { + NanoError::Storage(format!( + "missing embedding result for request index {}", + idx + )) + }) + }) + .collect() +} + +async fn materialize_embeddings_for_load_to_tempfile_inner_with_chunking( + db_path: &Path, + schema_ir: &SchemaIR, + reader: R, + client_override: Option<&EmbeddingClient>, + chunking: EmbedChunkingConfig, +) -> Result { + let output_path = create_materialized_temp_file(db_path)?; + let embed_specs = collect_embed_specs(schema_ir)?; + let cache_path = db_path.join(EMBEDDING_CACHE_FILENAME); + + if embed_specs.is_empty() { + let mut writer = BufWriter::new(std::fs::File::create(&output_path)?); + copy_reader_to_writer(reader, &mut writer)?; + writer.flush()?; + return Ok(output_path); + } + + let mut cache = load_embedding_cache(&cache_path)?; + let owned_client; + let client = if let Some(client) = client_override { + 
client + } else { + owned_client = EmbeddingClient::from_env().map_err(|err| { + NanoError::Storage(format!("embedding initialization failed: {}", err)) + })?; + &owned_client + }; + let model = client.model().to_string(); + let batch_size = parse_env_usize("NANOGRAPH_EMBED_BATCH_SIZE", DEFAULT_EMBED_BATCH_SIZE); + let mut writer = BufWriter::new(std::fs::File::create(&output_path)?); + let mut pending_lines: VecDeque = VecDeque::new(); + let mut pending_by_dim: BTreeMap> = BTreeMap::new(); + let mut new_cache_records = Vec::new(); + let mut next_line_id = 0usize; + + for (line_no, line) in reader.lines().enumerate() { + let line = line?; + let trimmed = line.trim(); + if trimmed.is_empty() || trimmed.starts_with("//") { + pending_lines.push_back(StreamPendingLine { + line_id: next_line_id, + line: ParsedLine::Raw(line), + missing_assignments: 0, + }); + next_line_id += 1; + flush_ready_stream_lines(&mut writer, &mut pending_lines)?; + continue; + } + + let mut obj: serde_json::Value = serde_json::from_str(trimmed).map_err(|e| { + NanoError::Storage(format!("JSON parse error on line {}: {}", line_no + 1, e)) + })?; + let mut output_line = ParsedLine::Raw(line); + let mut missing_assignments = 0usize; + + if let Some(type_name) = obj + .get("type") + .and_then(|value| value.as_str()) + .map(|value| value.to_string()) + && let Some(specs) = embed_specs.get(type_name.as_str()) + { + let data_obj = obj + .get_mut("data") + .and_then(|value| value.as_object_mut()) + .ok_or_else(|| { + NanoError::Storage(format!( + "node {} line {} is missing object field `data`", + type_name, + line_no + 1 + )) + })?; + let mut mutated = false; + + for spec in specs { + let needs_embedding = match data_obj.get(&spec.target_prop) { + Some(value) => value.is_null(), + None => true, + }; + if !needs_embedding { + continue; + } + + let source_value = data_obj.get(&spec.source_prop).ok_or_else(|| { + NanoError::Storage(format!( + "node {} line {} missing @embed source property `{}` for 
`{}`", + type_name, + line_no + 1, + spec.source_prop, + spec.target_prop + )) + })?; + let source_text = source_value.as_str().ok_or_else(|| { + NanoError::Storage(format!( + "node {} line {} @embed source property `{}` must be String", + type_name, + line_no + 1, + spec.source_prop + )) + })?; + + let assignment = StreamPendingAssignment { + line_id: next_line_id, + target_prop: spec.target_prop.clone(), + source_text: source_text.to_string(), + dim: spec.dim, + content_hash: hash_string(source_text), + }; + let cache_key = assignment.cache_key(&model, chunking); + if let Some(vector) = cache.get(&cache_key) { + data_obj.insert( + spec.target_prop.clone(), + serde_json::to_value(vector).map_err(|e| { + NanoError::Storage(format!("serialize embedding vector failed: {}", e)) + })?, + ); + } else { + missing_assignments += 1; + pending_by_dim + .entry(spec.dim) + .or_default() + .push_back(assignment); + } + mutated = true; + } + + if mutated { + output_line = ParsedLine::Json(obj); + } + } + + pending_lines.push_back(StreamPendingLine { + line_id: next_line_id, + line: output_line, + missing_assignments, + }); + next_line_id += 1; + + let mut runtime = StreamEmbedRuntime { + cache: &mut cache, + model: &model, + client, + new_cache_records: &mut new_cache_records, + batch_size, + chunking, + }; + resolve_pending_stream_batches( + &mut pending_by_dim, + &mut pending_lines, + &mut runtime, + false, + ) + .await?; + flush_ready_stream_lines(&mut writer, &mut pending_lines)?; + } + + let mut runtime = StreamEmbedRuntime { + cache: &mut cache, + model: &model, + client, + new_cache_records: &mut new_cache_records, + batch_size, + chunking, + }; + resolve_pending_stream_batches(&mut pending_by_dim, &mut pending_lines, &mut runtime, true) + .await?; + flush_ready_stream_lines(&mut writer, &mut pending_lines)?; + writer.flush()?; + + if !pending_lines.is_empty() { + return Err(NanoError::Storage( + "embedding materialization left unresolved output rows".to_string(), + )); 
+ } + + append_embedding_cache(&cache_path, &new_cache_records)?; + Ok(output_path) +} + +#[cfg_attr(not(test), allow(dead_code))] +async fn materialize_embeddings_for_load_inner_with_chunking( + db_path: &Path, + schema_ir: &SchemaIR, + data_source: &str, + client_override: Option<&EmbeddingClient>, + chunking: EmbedChunkingConfig, +) -> Result { + let embed_specs = collect_embed_specs(schema_ir)?; + if embed_specs.is_empty() { + return Ok(data_source.to_string()); + } + + let mut lines = Vec::new(); + let mut pending = Vec::new(); + parse_input_lines(data_source, &embed_specs, &mut lines, &mut pending)?; + if pending.is_empty() { + return Ok(data_source.to_string()); + } + + let cache_path = db_path.join(EMBEDDING_CACHE_FILENAME); + let mut cache = load_embedding_cache(&cache_path)?; + + let owned_client; + let client = if let Some(client) = client_override { + client + } else { + owned_client = EmbeddingClient::from_env().map_err(|err| { + NanoError::Storage(format!("embedding initialization failed: {}", err)) + })?; + &owned_client + }; + let model = client.model().to_string(); + + let mut missing_by_dim: BTreeMap> = BTreeMap::new(); + for assignment in &pending { + let key = CacheKey { + model: model.clone(), + dim: assignment.dim, + content_hash: assignment.content_hash.clone(), + chunk_chars: chunking.chunk_chars, + chunk_overlap_chars: chunking.chunk_overlap_chars, + }; + if cache.contains_key(&key) { + continue; + } + let entries = missing_by_dim.entry(assignment.dim).or_default(); + if !entries.iter().any(|(existing, _)| existing == &key) { + entries.push((key, assignment.source_text.clone())); + } + } + + let batch_size = parse_env_usize("NANOGRAPH_EMBED_BATCH_SIZE", DEFAULT_EMBED_BATCH_SIZE); + let mut new_cache_records = Vec::new(); + for (dim, entries) in missing_by_dim { + if chunking.is_enabled() { + for (key, text) in entries { + let vector = + embed_text_with_chunking(client, &text, dim, batch_size, chunking).await?; + if vector.len() != dim { + 
return Err(NanoError::Storage(format!( + "embedding dimension mismatch for {}: expected {}, got {}", + key.content_hash, + dim, + vector.len() + ))); + } + cache.insert(key.clone(), vector.clone()); + new_cache_records.push(CacheRecord { + model: key.model.clone(), + dim: key.dim, + content_hash: key.content_hash.clone(), + vector, + chunk_chars: key.chunk_chars, + chunk_overlap_chars: key.chunk_overlap_chars, + }); + } + continue; + } + + for chunk in entries.chunks(batch_size) { + let texts: Vec = chunk.iter().map(|(_, text)| text.clone()).collect(); + let vectors = client + .embed_texts(&texts, dim) + .await + .map_err(|err| NanoError::Storage(format!("embedding request failed: {}", err)))?; + if vectors.len() != chunk.len() { + return Err(NanoError::Storage(format!( + "embedding response size mismatch: expected {}, got {}", + chunk.len(), + vectors.len() + ))); + } + for ((key, _), vector) in chunk.iter().zip(vectors.into_iter()) { + if vector.len() != dim { + return Err(NanoError::Storage(format!( + "embedding dimension mismatch for {}: expected {}, got {}", + key.content_hash, + dim, + vector.len() + ))); + } + cache.insert(key.clone(), vector.clone()); + new_cache_records.push(CacheRecord { + model: key.model.clone(), + dim: key.dim, + content_hash: key.content_hash.clone(), + vector, + chunk_chars: key.chunk_chars, + chunk_overlap_chars: key.chunk_overlap_chars, + }); + } + } + } + append_embedding_cache(&cache_path, &new_cache_records)?; + + apply_embeddings_to_lines(&mut lines, &pending, &cache, &model, chunking)?; + render_output_lines(data_source, lines) +} + +#[cfg_attr(not(test), allow(dead_code))] +fn parse_input_lines( + data_source: &str, + embed_specs: &HashMap>, + lines: &mut Vec, + pending: &mut Vec, +) -> Result<()> { + for (line_no, raw_line) in data_source.lines().enumerate() { + let trimmed = raw_line.trim(); + if trimmed.is_empty() || trimmed.starts_with("//") { + lines.push(ParsedLine::Raw(raw_line.to_string())); + continue; + } + + let 
mut obj: serde_json::Value = serde_json::from_str(trimmed).map_err(|e| { + NanoError::Storage(format!("JSON parse error on line {}: {}", line_no + 1, e)) + })?; + + if let Some(type_name) = obj + .get("type") + .and_then(|v| v.as_str()) + .map(|s| s.to_string()) + && let Some(specs) = embed_specs.get(type_name.as_str()) + { + let data_obj = obj + .get_mut("data") + .and_then(|v| v.as_object_mut()) + .ok_or_else(|| { + NanoError::Storage(format!( + "node {} line {} is missing object field `data`", + type_name, + line_no + 1 + )) + })?; + let line_index = lines.len(); + + for spec in specs { + let needs_embedding = match data_obj.get(&spec.target_prop) { + Some(value) => value.is_null(), + None => true, + }; + if !needs_embedding { + continue; + } + + let source_value = data_obj.get(&spec.source_prop).ok_or_else(|| { + NanoError::Storage(format!( + "node {} line {} missing @embed source property `{}` for `{}`", + type_name, + line_no + 1, + spec.source_prop, + spec.target_prop + )) + })?; + let source_text = source_value.as_str().ok_or_else(|| { + NanoError::Storage(format!( + "node {} line {} @embed source property `{}` must be String", + type_name, + line_no + 1, + spec.source_prop + )) + })?; + + pending.push(PendingAssignment { + line_index, + target_prop: spec.target_prop.clone(), + source_text: source_text.to_string(), + dim: spec.dim, + content_hash: hash_string(source_text), + }); + } + } + + lines.push(ParsedLine::Json(obj)); + } + Ok(()) +} + +#[cfg_attr(not(test), allow(dead_code))] +fn apply_embeddings_to_lines( + lines: &mut [ParsedLine], + pending: &[PendingAssignment], + cache: &HashMap>, + model: &str, + chunking: EmbedChunkingConfig, +) -> Result<()> { + for assignment in pending { + let key = CacheKey { + model: model.to_string(), + dim: assignment.dim, + content_hash: assignment.content_hash.clone(), + chunk_chars: chunking.chunk_chars, + chunk_overlap_chars: chunking.chunk_overlap_chars, + }; + let vector = cache.get(&key).ok_or_else(|| { + 
NanoError::Storage(format!( + "embedding cache miss for content hash {}", + assignment.content_hash + )) + })?; + let line = lines.get_mut(assignment.line_index).ok_or_else(|| { + NanoError::Storage(format!( + "embedding assignment line out of range: {}", + assignment.line_index + )) + })?; + let ParsedLine::Json(obj) = line else { + return Err(NanoError::Storage(format!( + "embedding assignment line {} is not JSON", + assignment.line_index + ))); + }; + let data_obj = obj + .get_mut("data") + .and_then(|v| v.as_object_mut()) + .ok_or_else(|| { + NanoError::Storage("node row is missing object field `data`".to_string()) + })?; + data_obj.insert( + assignment.target_prop.clone(), + serde_json::to_value(vector).map_err(|e| { + NanoError::Storage(format!("serialize embedding vector failed: {}", e)) + })?, + ); + } + Ok(()) +} + +#[cfg_attr(not(test), allow(dead_code))] +fn render_output_lines(original: &str, lines: Vec) -> Result { + let mut out = String::new(); + for (idx, line) in lines.into_iter().enumerate() { + if idx > 0 { + out.push('\n'); + } + match line { + ParsedLine::Raw(raw) => out.push_str(&raw), + ParsedLine::Json(obj) => { + out.push_str(&serde_json::to_string(&obj).map_err(|e| { + NanoError::Storage(format!("serialize JSONL row failed: {}", e)) + })?) 
+ } + } + } + if original.ends_with('\n') { + out.push('\n'); + } + Ok(out) +} + +async fn resolve_pending_stream_batches( + pending_by_dim: &mut BTreeMap>, + pending_lines: &mut VecDeque, + runtime: &mut StreamEmbedRuntime<'_>, + flush_all: bool, +) -> Result<()> { + loop { + let next_dim = pending_by_dim + .iter() + .find(|(_, queue)| { + if flush_all { + !queue.is_empty() + } else { + queue.len() >= runtime.batch_size.max(1) + } + }) + .map(|(dim, _)| *dim); + let Some(dim) = next_dim else { + break; + }; + + let queue = pending_by_dim.get_mut(&dim).ok_or_else(|| { + NanoError::Storage(format!("missing pending embedding queue for dim {}", dim)) + })?; + resolve_pending_stream_batch(queue, pending_lines, runtime).await?; + if queue.is_empty() { + pending_by_dim.remove(&dim); + } + } + + Ok(()) +} + +async fn resolve_pending_stream_batch( + queue: &mut VecDeque, + pending_lines: &mut VecDeque, + runtime: &mut StreamEmbedRuntime<'_>, +) -> Result<()> { + let batch_size = runtime.batch_size.max(1); + let mut assignments = Vec::new(); + let mut unique_entries = Vec::new(); + let mut seen_keys = HashSet::new(); + + while let Some(assignment) = queue.pop_front() { + let cache_key = assignment.cache_key(runtime.model, runtime.chunking); + if seen_keys.insert(cache_key.clone()) { + unique_entries.push((cache_key, assignment.source_text.clone())); + } + assignments.push(assignment); + if unique_entries.len() >= batch_size { + break; + } + } + + if unique_entries.is_empty() { + return Ok(()); + } + + if runtime.chunking.is_enabled() { + for (cache_key, text) in &unique_entries { + let vector = embed_text_with_chunking( + runtime.client, + text, + cache_key.dim, + batch_size, + runtime.chunking, + ) + .await?; + if vector.len() != cache_key.dim { + return Err(NanoError::Storage(format!( + "embedding dimension mismatch for {}: expected {}, got {}", + cache_key.content_hash, + cache_key.dim, + vector.len() + ))); + } + runtime.cache.insert(cache_key.clone(), vector.clone()); 
+ runtime.new_cache_records.push(CacheRecord { + model: cache_key.model.clone(), + dim: cache_key.dim, + content_hash: cache_key.content_hash.clone(), + vector, + chunk_chars: cache_key.chunk_chars, + chunk_overlap_chars: cache_key.chunk_overlap_chars, + }); + } + } else { + let texts: Vec = unique_entries + .iter() + .map(|(_, text)| text.clone()) + .collect(); + let dim = unique_entries[0].0.dim; + let vectors = runtime + .client + .embed_texts(&texts, dim) + .await + .map_err(|err| NanoError::Storage(format!("embedding request failed: {}", err)))?; + if vectors.len() != unique_entries.len() { + return Err(NanoError::Storage(format!( + "embedding response size mismatch: expected {}, got {}", + unique_entries.len(), + vectors.len() + ))); + } + + for ((cache_key, _), vector) in unique_entries.iter().zip(vectors.into_iter()) { + if vector.len() != cache_key.dim { + return Err(NanoError::Storage(format!( + "embedding dimension mismatch for {}: expected {}, got {}", + cache_key.content_hash, + cache_key.dim, + vector.len() + ))); + } + runtime.cache.insert(cache_key.clone(), vector.clone()); + runtime.new_cache_records.push(CacheRecord { + model: cache_key.model.clone(), + dim: cache_key.dim, + content_hash: cache_key.content_hash.clone(), + vector, + chunk_chars: cache_key.chunk_chars, + chunk_overlap_chars: cache_key.chunk_overlap_chars, + }); + } + } + + for assignment in &assignments { + apply_stream_assignment( + pending_lines, + assignment, + runtime.cache, + runtime.model, + runtime.chunking, + )?; + } + + Ok(()) +} + +struct StreamEmbedRuntime<'a> { + cache: &'a mut HashMap>, + model: &'a str, + client: &'a EmbeddingClient, + new_cache_records: &'a mut Vec, + batch_size: usize, + chunking: EmbedChunkingConfig, +} + +fn apply_stream_assignment( + pending_lines: &mut VecDeque, + assignment: &StreamPendingAssignment, + cache: &HashMap>, + model: &str, + chunking: EmbedChunkingConfig, +) -> Result<()> { + let cache_key = assignment.cache_key(model, chunking); + 
let vector = cache.get(&cache_key).ok_or_else(|| { + NanoError::Storage(format!( + "embedding cache miss for content hash {}", + assignment.content_hash + )) + })?; + let line = pending_lines + .iter_mut() + .find(|line| line.line_id == assignment.line_id) + .ok_or_else(|| { + NanoError::Storage(format!( + "embedding assignment line out of range: {}", + assignment.line_id + )) + })?; + let ParsedLine::Json(obj) = &mut line.line else { + return Err(NanoError::Storage(format!( + "embedding assignment line {} is not JSON", + assignment.line_id + ))); + }; + let data_obj = obj + .get_mut("data") + .and_then(|value| value.as_object_mut()) + .ok_or_else(|| NanoError::Storage("node row is missing object field `data`".to_string()))?; + data_obj.insert( + assignment.target_prop.clone(), + serde_json::to_value(vector) + .map_err(|e| NanoError::Storage(format!("serialize embedding vector failed: {}", e)))?, + ); + if line.missing_assignments == 0 { + return Err(NanoError::Storage(format!( + "embedding assignment line {} underflow", + assignment.line_id + ))); + } + line.missing_assignments -= 1; + Ok(()) +} + +fn flush_ready_stream_lines( + writer: &mut BufWriter, + pending_lines: &mut VecDeque, +) -> Result<()> { + while pending_lines + .front() + .map(|line| line.missing_assignments == 0) + .unwrap_or(false) + { + let line = pending_lines.pop_front().ok_or_else(|| { + NanoError::Storage("pending embedding output queue unexpectedly empty".to_string()) + })?; + match line.line { + ParsedLine::Raw(raw) => writer.write_all(raw.as_bytes())?, + ParsedLine::Json(obj) => serde_json::to_writer(&mut *writer, &obj) + .map_err(|e| NanoError::Storage(format!("serialize JSONL row failed: {}", e)))?, + } + writer.write_all(b"\n")?; + } + Ok(()) +} + +fn copy_reader_to_writer( + reader: R, + writer: &mut BufWriter, +) -> Result<()> { + for line in reader.lines() { + let line = line?; + writer.write_all(line.as_bytes())?; + writer.write_all(b"\n")?; + } + Ok(()) +} + +fn 
create_materialized_temp_file(db_path: &Path) -> Result { + std::fs::create_dir_all(db_path)?; + let pid = std::process::id(); + for attempt in 0..256u32 { + let now = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_nanos(); + let path = db_path.join(format!( + ".nanograph_embed_materialized_{}_{}_{}.jsonl", + pid, now, attempt + )); + match std::fs::OpenOptions::new() + .create_new(true) + .write(true) + .open(&path) + { + Ok(_) => return Ok(path), + Err(err) if err.kind() == std::io::ErrorKind::AlreadyExists => continue, + Err(err) => return Err(err.into()), + } + } + + Err(NanoError::Storage( + "failed to create temp embedding materialization file".to_string(), + )) +} + +async fn embed_text_with_chunking( + client: &EmbeddingClient, + source_text: &str, + dim: usize, + batch_size: usize, + chunking: EmbedChunkingConfig, +) -> Result> { + let chunks = split_text_into_chunks( + source_text, + chunking.chunk_chars, + chunking.chunk_overlap_chars, + ); + if chunks.len() == 1 { + return client + .embed_text(&chunks[0], dim) + .await + .map_err(|err| NanoError::Storage(format!("embedding request failed: {}", err))); + } + + let batch_size = batch_size.max(1); + let mut vectors = Vec::with_capacity(chunks.len()); + for chunk_batch in chunks.chunks(batch_size) { + let texts: Vec = chunk_batch.to_vec(); + let mut embedded = client + .embed_texts(&texts, dim) + .await + .map_err(|err| NanoError::Storage(format!("embedding request failed: {}", err)))?; + if embedded.len() != texts.len() { + return Err(NanoError::Storage(format!( + "embedding response size mismatch: expected {}, got {}", + texts.len(), + embedded.len() + ))); + } + vectors.append(&mut embedded); + } + + average_pool_embeddings(&vectors, dim) +} + +fn split_text_into_chunks(text: &str, chunk_chars: usize, overlap_chars: usize) -> Vec { + if chunk_chars == 0 { + return vec![text.to_string()]; + } + + let total_chars = text.chars().count(); + if total_chars <= chunk_chars { + 
return vec![text.to_string()]; + } + + let mut char_boundaries = Vec::with_capacity(total_chars + 1); + char_boundaries.push(0); + for (idx, _) in text.char_indices().skip(1) { + char_boundaries.push(idx); + } + char_boundaries.push(text.len()); + + let step = chunk_chars.saturating_sub(overlap_chars).max(1); + let mut out = Vec::new(); + let mut start_char = 0usize; + while start_char < total_chars { + let end_char = (start_char + chunk_chars).min(total_chars); + let start_byte = char_boundaries[start_char]; + let end_byte = char_boundaries[end_char]; + out.push(text[start_byte..end_byte].to_string()); + if end_char == total_chars { + break; + } + start_char = start_char.saturating_add(step); + } + + if out.is_empty() { + vec![text.to_string()] + } else { + out + } +} + +fn average_pool_embeddings(vectors: &[Vec], dim: usize) -> Result> { + if vectors.is_empty() { + return Err(NanoError::Storage( + "embedding aggregation received no chunk vectors".to_string(), + )); + } + + let mut accum = vec![0.0f64; dim]; + for vector in vectors { + if vector.len() != dim { + return Err(NanoError::Storage(format!( + "embedding dimension mismatch during chunk aggregation: expected {}, got {}", + dim, + vector.len() + ))); + } + for (idx, value) in vector.iter().enumerate() { + accum[idx] += *value as f64; + } + } + + let inv_len = 1.0f64 / vectors.len() as f64; + let mut pooled: Vec = accum + .into_iter() + .map(|sum| (sum * inv_len) as f32) + .collect(); + let norm = pooled + .iter() + .map(|v| (*v as f64) * (*v as f64)) + .sum::() + .sqrt() as f32; + if norm > f32::EPSILON { + for value in &mut pooled { + *value /= norm; + } + } + Ok(pooled) +} + +pub(crate) fn collect_embed_specs(schema_ir: &SchemaIR) -> Result>> { + let mut specs_by_type: HashMap> = HashMap::new(); + for node in schema_ir.node_types() { + let mut prop_by_name: HashMap<&str, &PropDef> = HashMap::new(); + for prop in &node.properties { + prop_by_name.insert(prop.name.as_str(), prop); + } + + let mut node_specs 
= Vec::new(); + for prop in &node.properties { + let Some(source_prop) = prop.embed_source.as_ref() else { + continue; + }; + + if prop.list { + return Err(NanoError::Storage(format!( + "@embed target {}.{} cannot be a list type", + node.name, prop.name + ))); + } + let dim = match ScalarType::from_str_name(&prop.scalar_type) { + Some(ScalarType::Vector(dim)) if dim > 0 => dim as usize, + _ => { + return Err(NanoError::Storage(format!( + "@embed target {}.{} must be Vector(dim)", + node.name, prop.name + ))); + } + }; + + let source_def = prop_by_name.get(source_prop.as_str()).ok_or_else(|| { + NanoError::Storage(format!( + "@embed on {}.{} references unknown source property {}", + node.name, prop.name, source_prop + )) + })?; + if source_def.list || source_def.scalar_type != "String" { + return Err(NanoError::Storage(format!( + "@embed source {}.{} must be String", + node.name, source_prop + ))); + } + + node_specs.push(EmbedSpec { + target_prop: prop.name.clone(), + source_prop: source_prop.clone(), + dim, + }); + } + + if !node_specs.is_empty() { + specs_by_type.insert(node.name.clone(), node_specs); + } + } + Ok(specs_by_type) +} + +fn load_embedding_cache(path: &Path) -> Result>> { + let records = load_embedding_cache_records(path)?; + let mut cache = HashMap::new(); + for record in records { + let key = cache_key_from_record(&record); + cache.insert(key, record.vector); + } + Ok(cache) +} + +fn append_embedding_cache(path: &Path, records: &[CacheRecord]) -> Result<()> { + let max_entries = parse_env_usize( + "NANOGRAPH_EMBED_CACHE_MAX_ENTRIES", + DEFAULT_EMBED_CACHE_MAX_ENTRIES, + ); + append_embedding_cache_with_limit(path, records, max_entries) +} + +fn append_embedding_cache_with_limit( + path: &Path, + records: &[CacheRecord], + max_entries: usize, +) -> Result<()> { + if records.is_empty() { + return Ok(()); + } + let _lock = acquire_embedding_cache_lock(path)?; + let mut merged = load_embedding_cache_records(path)?; + 
merged.extend(records.iter().cloned()); + let compacted = compact_embedding_cache_records(merged, max_entries); + write_embedding_cache_records(path, &compacted)?; + Ok(()) +} + +fn load_embedding_cache_records(path: &Path) -> Result> { + if !path.exists() { + return Ok(Vec::new()); + } + let data = std::fs::read_to_string(path)?; + parse_embedding_cache_records(path, &data) +} + +fn parse_embedding_cache_records(path: &Path, data: &str) -> Result> { + let mut records = Vec::new(); + for (line_no, line) in data.lines().enumerate() { + let trimmed = line.trim(); + if trimmed.is_empty() { + continue; + } + let record: CacheRecord = serde_json::from_str(trimmed).map_err(|e| { + NanoError::Storage(format!( + "invalid embedding cache at {} line {}: {}", + path.display(), + line_no + 1, + e + )) + })?; + if record.vector.len() != record.dim { + return Err(NanoError::Storage(format!( + "invalid embedding cache at {} line {}: vector dim {} does not match {}", + path.display(), + line_no + 1, + record.vector.len(), + record.dim + ))); + } + records.push(record); + } + Ok(records) +} + +fn write_embedding_cache_records(path: &Path, records: &[CacheRecord]) -> Result<()> { + let mut file = std::fs::OpenOptions::new() + .create(true) + .write(true) + .truncate(true) + .open(path)?; + for record in records { + let mut line = serde_json::to_vec(record).map_err(|e| { + NanoError::Storage(format!( + "failed to write embedding cache {}: {}", + path.display(), + e + )) + })?; + line.push(b'\n'); + file.write_all(&line)?; + } + file.flush()?; + Ok(()) +} + +fn compact_embedding_cache_records( + records: Vec, + max_entries: usize, +) -> Vec { + let max_entries = max_entries.max(1); + let mut seen = HashSet::new(); + let mut compacted_rev = Vec::with_capacity(records.len().min(max_entries)); + for record in records.into_iter().rev() { + if seen.insert(cache_key_from_record(&record)) { + compacted_rev.push(record); + if compacted_rev.len() == max_entries { + break; + } + } + } + 
compacted_rev.reverse(); + compacted_rev +} + +fn cache_key_from_record(record: &CacheRecord) -> CacheKey { + CacheKey { + model: record.model.clone(), + dim: record.dim, + content_hash: record.content_hash.clone(), + chunk_chars: record.chunk_chars, + chunk_overlap_chars: record.chunk_overlap_chars, + } +} + +struct EmbeddingCacheLock { + path: PathBuf, + _file: std::fs::File, +} + +impl Drop for EmbeddingCacheLock { + fn drop(&mut self) { + let _ = std::fs::remove_file(&self.path); + } +} + +fn embedding_cache_lock_path(path: &Path) -> PathBuf { + let mut lock_path = path.as_os_str().to_os_string(); + lock_path.push(".lock"); + PathBuf::from(lock_path) +} + +fn acquire_embedding_cache_lock(path: &Path) -> Result { + let stale_after_secs = parse_env_usize( + "NANOGRAPH_EMBED_CACHE_LOCK_STALE_SECS", + DEFAULT_EMBED_CACHE_LOCK_STALE_SECS, + ); + let stale_after = Duration::from_secs(stale_after_secs as u64); + acquire_embedding_cache_lock_with_stale_after(path, stale_after) +} + +fn acquire_embedding_cache_lock_with_stale_after( + path: &Path, + stale_after: Duration, +) -> Result { + let lock_path = embedding_cache_lock_path(path); + for attempt in 0..EMBEDDING_CACHE_LOCK_RETRIES { + match std::fs::OpenOptions::new() + .create_new(true) + .write(true) + .open(&lock_path) + { + Ok(file) => { + return Ok(EmbeddingCacheLock { + path: lock_path, + _file: file, + }); + } + Err(err) if err.kind() == std::io::ErrorKind::AlreadyExists => { + if lock_file_is_stale(&lock_path, stale_after) { + match std::fs::remove_file(&lock_path) { + Ok(()) => continue, + Err(remove_err) if remove_err.kind() == std::io::ErrorKind::NotFound => { + continue; + } + Err(remove_err) => { + return Err(NanoError::Storage(format!( + "failed to remove stale embedding cache lock {}: {}", + lock_path.display(), + remove_err + ))); + } + } + } + if attempt + 1 == EMBEDDING_CACHE_LOCK_RETRIES { + return Err(NanoError::Storage(format!( + "embedding cache lock timed out for {} (lock file: {})", + 
path.display(), + lock_path.display() + ))); + } + std::thread::sleep(Duration::from_millis(EMBEDDING_CACHE_LOCK_RETRY_DELAY_MS)); + } + Err(err) => { + return Err(NanoError::Storage(format!( + "failed to acquire embedding cache lock {}: {}", + lock_path.display(), + err + ))); + } + } + } + + Err(NanoError::Storage(format!( + "embedding cache lock acquisition failed for {}", + path.display() + ))) +} + +fn lock_file_is_stale(lock_path: &Path, stale_after: Duration) -> bool { + let metadata = match std::fs::metadata(lock_path) { + Ok(meta) => meta, + Err(_) => return false, + }; + let timestamp = metadata.modified().ok().or_else(|| metadata.created().ok()); + let Some(timestamp) = timestamp else { + return false; + }; + match timestamp.elapsed() { + Ok(age) => age >= stale_after, + Err(_) => false, + } +} + +fn parse_env_usize(name: &str, default: usize) -> usize { + std::env::var(name) + .ok() + .and_then(|v| v.parse::().ok()) + .filter(|v| *v > 0) + .unwrap_or(default) +} + +#[cfg(test)] +mod tests { + use std::collections::HashSet; + use std::io::Cursor; + use std::sync::{Arc, Barrier}; + + use tempfile::TempDir; + + use crate::catalog::schema_ir::build_schema_ir; + use crate::schema::parser::parse_schema; + + use super::*; + + #[tokio::test] + async fn materialize_embeddings_populates_missing_vector() { + let schema = parse_schema( + r#" +node Doc { + slug: String @key + title: String + embedding: Vector(6) @embed(title) +} +"#, + ) + .unwrap(); + let ir = build_schema_ir(&schema).unwrap(); + let data = r#"{"type":"Doc","data":{"slug":"a","title":"alpha"}} +{"type":"Doc","data":{"slug":"b","title":"beta"}} +"#; + let temp = TempDir::new().unwrap(); + let client = EmbeddingClient::mock_for_tests(); + let out = materialize_embeddings_for_load_inner(temp.path(), &ir, data, Some(&client)) + .await + .unwrap(); + assert!(out.contains("\"embedding\"")); + assert!(temp.path().join(EMBEDDING_CACHE_FILENAME).exists()); + } + + #[tokio::test] + async fn 
materialize_embeddings_is_noop_when_vectors_present() { + let schema = parse_schema( + r#" +node Doc { + slug: String @key + title: String + embedding: Vector(3) @embed(title) +} +"#, + ) + .unwrap(); + let ir = build_schema_ir(&schema).unwrap(); + let data = + r#"{"type":"Doc","data":{"slug":"a","title":"alpha","embedding":[1.0,0.0,0.0]}}"#; + let temp = TempDir::new().unwrap(); + let out = materialize_embeddings_for_load_inner( + temp.path(), + &ir, + data, + Some(&EmbeddingClient::mock_for_tests()), + ) + .await + .unwrap(); + assert_eq!(out, data); + assert!(!temp.path().join(EMBEDDING_CACHE_FILENAME).exists()); + } + + #[tokio::test] + async fn materialize_embeddings_to_tempfile_matches_string_path() { + let schema = parse_schema( + r#" +node Doc { + slug: String @key + title: String + embedding: Vector(6) @embed(title) +} +"#, + ) + .unwrap(); + let ir = build_schema_ir(&schema).unwrap(); + let data = r#"{"type":"Doc","data":{"slug":"a","title":"alpha"}} +{"type":"Doc","data":{"slug":"b","title":"beta"}} +"#; + let temp = TempDir::new().unwrap(); + let client = EmbeddingClient::mock_for_tests(); + + let string_out = + materialize_embeddings_for_load_inner(temp.path(), &ir, data, Some(&client)) + .await + .unwrap(); + let tempfile_out = materialize_embeddings_for_load_to_tempfile_inner( + temp.path(), + &ir, + Cursor::new(data.as_bytes()), + Some(&client), + ) + .await + .unwrap(); + let stream_out = std::fs::read_to_string(tempfile_out).unwrap(); + + let parse_rows = |text: &str| { + text.lines() + .filter(|line| !line.trim().is_empty()) + .map(|line| serde_json::from_str::(line).unwrap()) + .collect::>() + }; + + assert_eq!(parse_rows(&string_out), parse_rows(&stream_out)); + } + + #[test] + fn split_text_into_chunks_respects_overlap() { + let chunks = split_text_into_chunks("abcdefghij", 4, 1); + assert_eq!(chunks, vec!["abcd", "defg", "ghij"]); + } + + #[test] + fn append_embedding_cache_handles_concurrent_writers() { + let temp = TempDir::new().unwrap(); 
+ let cache_path = temp.path().join(EMBEDDING_CACHE_FILENAME); + let writer_count = 8usize; + let barrier = Arc::new(Barrier::new(writer_count)); + let mut threads = Vec::new(); + + for idx in 0..writer_count { + let path = cache_path.clone(); + let barrier = Arc::clone(&barrier); + threads.push(std::thread::spawn(move || { + let record = CacheRecord { + model: "test-model".to_string(), + dim: 3, + content_hash: format!("hash-{}", idx), + vector: vec![idx as f32, 1.0, 2.0], + chunk_chars: 0, + chunk_overlap_chars: 0, + }; + barrier.wait(); + append_embedding_cache(&path, &[record]).unwrap(); + })); + } + + for thread in threads { + thread.join().unwrap(); + } + + let file = std::fs::read_to_string(&cache_path).unwrap(); + let lines: Vec<&str> = file + .lines() + .filter(|line| !line.trim().is_empty()) + .collect(); + assert_eq!(lines.len(), writer_count); + + let mut seen = HashSet::new(); + for line in lines { + let record: CacheRecord = serde_json::from_str(line).unwrap(); + assert!(seen.insert(record.content_hash)); + } + } + + #[test] + fn append_embedding_cache_with_limit_compacts_and_deduplicates() { + let temp = TempDir::new().unwrap(); + let cache_path = temp.path().join(EMBEDDING_CACHE_FILENAME); + + let record = |hash: &str, marker: f32| CacheRecord { + model: "test-model".to_string(), + dim: 3, + content_hash: hash.to_string(), + vector: vec![marker, 1.0, 2.0], + chunk_chars: 0, + chunk_overlap_chars: 0, + }; + + append_embedding_cache_with_limit( + &cache_path, + &[record("a", 1.0), record("b", 2.0), record("c", 3.0)], + 3, + ) + .unwrap(); + append_embedding_cache_with_limit(&cache_path, &[record("d", 4.0), record("b", 20.0)], 3) + .unwrap(); + + let cache = load_embedding_cache(&cache_path).unwrap(); + assert_eq!(cache.len(), 3); + + let key_b = CacheKey { + model: "test-model".to_string(), + dim: 3, + content_hash: "b".to_string(), + chunk_chars: 0, + chunk_overlap_chars: 0, + }; + let key_c = CacheKey { + content_hash: "c".to_string(), + 
..key_b.clone() + }; + let key_d = CacheKey { + content_hash: "d".to_string(), + ..key_b.clone() + }; + + assert_eq!(cache.get(&key_b).unwrap()[0], 20.0); + assert!(cache.contains_key(&key_c)); + assert!(cache.contains_key(&key_d)); + } + + #[test] + fn acquire_embedding_cache_lock_reclaims_stale_lock_file() { + let temp = TempDir::new().unwrap(); + let cache_path = temp.path().join(EMBEDDING_CACHE_FILENAME); + let lock_path = embedding_cache_lock_path(&cache_path); + + std::fs::OpenOptions::new() + .create_new(true) + .write(true) + .open(&lock_path) + .unwrap(); + std::thread::sleep(Duration::from_secs(2)); + + let lock = + acquire_embedding_cache_lock_with_stale_after(&cache_path, Duration::from_secs(1)) + .unwrap(); + drop(lock); + + assert!(!lock_path.exists()); + } + + #[tokio::test] + async fn materialize_embeddings_chunking_pools_chunk_vectors() { + let schema = parse_schema( + r#" +node Doc { + slug: String @key + body: String + embedding: Vector(6) @embed(body) +} +"#, + ) + .unwrap(); + let ir = build_schema_ir(&schema).unwrap(); + let data = r#"{"type":"Doc","data":{"slug":"doc-1","body":"alpha beta gamma delta epsilon zeta"}}"#; + let temp = TempDir::new().unwrap(); + let client = EmbeddingClient::mock_for_tests(); + let chunking = EmbedChunkingConfig::new(12, 3); + let out = materialize_embeddings_for_load_inner_with_chunking( + temp.path(), + &ir, + data, + Some(&client), + chunking, + ) + .await + .unwrap(); + + let embedded: serde_json::Value = serde_json::from_str(&out).unwrap(); + let values = embedded["data"]["embedding"].as_array().unwrap(); + let actual: Vec = values.iter().map(|v| v.as_f64().unwrap() as f32).collect(); + + let chunk_texts = split_text_into_chunks( + "alpha beta gamma delta epsilon zeta", + chunking.chunk_chars, + chunking.chunk_overlap_chars, + ); + let chunk_vectors = client.embed_texts(&chunk_texts, 6).await.unwrap(); + let expected = average_pool_embeddings(&chunk_vectors, 6).unwrap(); + + assert_eq!(actual.len(), 
expected.len()); + for (got, want) in actual.iter().zip(expected.iter()) { + assert!((got - want).abs() < 1e-6, "got={}, want={}", got, want); + } + } + + #[test] + fn cache_key_differs_by_chunking_config() { + let key_a = CacheKey { + model: "text-embedding-3-small".to_string(), + dim: 8, + content_hash: "abc".to_string(), + chunk_chars: 0, + chunk_overlap_chars: 0, + }; + let key_b = CacheKey { + chunk_chars: 256, + chunk_overlap_chars: 64, + ..key_a.clone() + }; + assert_ne!(key_a, key_b); + } +} diff --git a/crates/omnigraph/src/loader/jsonl.rs b/crates/omnigraph/src/loader/jsonl.rs new file mode 100644 index 0000000..8eb9617 --- /dev/null +++ b/crates/omnigraph/src/loader/jsonl.rs @@ -0,0 +1,1532 @@ +use std::collections::{BTreeMap, HashMap}; +use std::fs::{File, OpenOptions}; +use std::io::{BufRead, BufReader, BufWriter, Cursor, Write}; +use std::path::{Path, PathBuf}; +use std::sync::Arc; +use std::time::{SystemTime, UNIX_EPOCH}; + +use arrow_array::builder::{ + ArrayBuilder, BooleanBuilder, Date32Builder, Date64Builder, FixedSizeListBuilder, + Float32Builder, Float64Builder, Int32Builder, Int64Builder, ListBuilder, StringBuilder, + UInt32Builder, UInt64Builder, make_builder, +}; +use arrow_array::{ + Array, BooleanArray, Date32Array, Date64Array, Float32Array, Float64Array, Int32Array, + Int64Array, RecordBatch, StringArray, UInt32Array, UInt64Array, +}; +use arrow_schema::{DataType, Field, Schema}; + +use crate::error::{NanoError, Result}; + +use super::super::graph::DatasetAccumulator; +use super::constraints::{key_value_string, node_property_field}; + +#[cfg_attr(not(test), allow(dead_code))] +/// Load JSONL-formatted data into a DatasetAccumulator. +/// Each line is either a node `{"type": "...", "data": {...}}` or edge `{"edge": "...", "from": "...", "to": "..."}`. 
+pub(crate) fn load_jsonl_data( + storage: &mut DatasetAccumulator, + data: &str, + key_props: &HashMap, +) -> Result<()> { + load_jsonl_data_with_name_seed(storage, data, key_props, None) +} + +#[cfg_attr(not(test), allow(dead_code))] +/// Load JSONL-formatted data into a DatasetAccumulator with an optional pre-populated +/// @key-value-to-id mapping for resolving edges that reference existing nodes. +pub(crate) fn load_jsonl_data_with_name_seed( + storage: &mut DatasetAccumulator, + data: &str, + key_props: &HashMap, + name_seed: Option<&HashMap<(String, String), u64>>, +) -> Result<()> { + let cursor = Cursor::new(data.as_bytes()); + load_jsonl_reader_with_name_seed(storage, cursor, key_props, name_seed) +} + +#[cfg_attr(not(test), allow(dead_code))] +pub(crate) fn load_jsonl_reader( + storage: &mut DatasetAccumulator, + reader: R, + key_props: &HashMap, +) -> Result<()> { + load_jsonl_reader_with_name_seed(storage, reader, key_props, None) +} + +#[cfg_attr(not(test), allow(dead_code))] +pub(crate) fn load_jsonl_reader_with_name_seed( + storage: &mut DatasetAccumulator, + reader: R, + key_props: &HashMap, + name_seed: Option<&HashMap<(String, String), u64>>, +) -> Result<()> { + let spool_dir = std::env::temp_dir(); + load_jsonl_reader_with_name_seed_at_path(storage, &spool_dir, reader, key_props, name_seed) +} + +pub(crate) fn load_jsonl_reader_with_name_seed_at_path( + storage: &mut DatasetAccumulator, + spool_dir: &Path, + reader: R, + key_props: &HashMap, + name_seed: Option<&HashMap<(String, String), u64>>, +) -> Result<()> { + let batch_size = parse_env_usize("NANOGRAPH_LOAD_ROW_BATCH_SIZE", 2048); + let mut spool_paths = TempSpoolPaths::default(); + let mut node_paths = HashMap::new(); + let mut node_writers = HashMap::new(); + let mut edge_paths = HashMap::new(); + let mut edge_writers = HashMap::new(); + + for (line_no, line) in reader.lines().enumerate() { + let line = line?; + let trimmed = line.trim(); + if trimmed.is_empty() || 
trimmed.starts_with("//") { + continue; + } + + let obj: serde_json::Value = serde_json::from_str(trimmed).map_err(|e| { + NanoError::Storage(format!("JSON parse error on line {}: {}", line_no + 1, e)) + })?; + + if let Some(type_name) = obj.get("type").and_then(|v| v.as_str()) { + if !storage.catalog.node_types.contains_key(type_name) { + return Err(NanoError::Storage(format!( + "unknown node type in data: {}", + type_name + ))); + } + let writer = spool_writer_for_type( + spool_dir, + "load_nodes", + type_name, + &mut node_writers, + &mut node_paths, + &mut spool_paths, + )?; + write_jsonl_line(writer, &obj)?; + } else if let Some(edge_type) = obj.get("edge").and_then(|v| v.as_str()) { + let edge_name = resolve_edge_name(storage, edge_type)?; + let writer = spool_writer_for_type( + spool_dir, + "load_edges", + &edge_name, + &mut edge_writers, + &mut edge_paths, + &mut spool_paths, + )?; + write_jsonl_line(writer, &obj)?; + } + } + + drop(node_writers); + drop(edge_writers); + + let mut key_to_id: HashMap<(String, String), u64> = name_seed.cloned().unwrap_or_default(); + + let mut node_types: Vec = node_paths.keys().cloned().collect(); + node_types.sort(); + for type_name in node_types { + let path = node_paths.get(&type_name).ok_or_else(|| { + NanoError::Storage(format!("missing node spool path for {}", type_name)) + })?; + load_spooled_nodes( + storage, + &type_name, + path, + key_props, + &mut key_to_id, + batch_size, + )?; + } + + let mut edge_names: Vec = edge_paths.keys().cloned().collect(); + edge_names.sort(); + for edge_name in edge_names { + let path = edge_paths.get(&edge_name).ok_or_else(|| { + NanoError::Storage(format!("missing edge spool path for {}", edge_name)) + })?; + load_spooled_edges(storage, &edge_name, path, key_props, &key_to_id, batch_size)?; + } + + Ok(()) +} + +#[derive(Debug)] +struct PendingNodeRow { + row_idx: usize, + data: serde_json::Map, +} + +#[derive(Debug)] +struct ResolvedEdge { + from_id: u64, + to_id: u64, + data: Option>, 
+} + +#[derive(Default)] +struct TempSpoolPaths { + paths: Vec, +} + +impl TempSpoolPaths { + fn push(&mut self, path: PathBuf) { + self.paths.push(path); + } +} + +impl Drop for TempSpoolPaths { + fn drop(&mut self) { + for path in &self.paths { + let _ = std::fs::remove_file(path); + } + } +} + +fn load_spooled_nodes( + storage: &mut DatasetAccumulator, + type_name: &str, + path: &Path, + key_props: &HashMap, + key_to_id: &mut HashMap<(String, String), u64>, + batch_size: usize, +) -> Result<()> { + let file = File::open(path)?; + let reader = BufReader::new(file); + let mut rows = Vec::with_capacity(batch_size); + let mut next_row_idx = 0usize; + + for (line_no, line) in reader.lines().enumerate() { + let line = line?; + let trimmed = line.trim(); + if trimmed.is_empty() { + continue; + } + let obj: serde_json::Value = serde_json::from_str(trimmed).map_err(|e| { + NanoError::Storage(format!( + "JSON parse error in node spool {} line {}: {}", + type_name, + line_no + 1, + e + )) + })?; + let data = obj + .get("data") + .and_then(|value| value.as_object()) + .cloned() + .ok_or_else(|| { + NanoError::Storage(format!( + "node {} is missing object field `data` in spooled load", + type_name + )) + })?; + rows.push(PendingNodeRow { + row_idx: next_row_idx, + data, + }); + next_row_idx += 1; + if rows.len() >= batch_size { + flush_node_rows(storage, type_name, &mut rows, key_props, key_to_id)?; + } + } + + if !rows.is_empty() { + flush_node_rows(storage, type_name, &mut rows, key_props, key_to_id)?; + } + + Ok(()) +} + +fn flush_node_rows( + storage: &mut DatasetAccumulator, + type_name: &str, + rows: &mut Vec, + key_props: &HashMap, + key_to_id: &mut HashMap<(String, String), u64>, +) -> Result<()> { + if rows.is_empty() { + return Ok(()); + } + + let node_type = + storage.catalog.node_types.get(type_name).ok_or_else(|| { + NanoError::Storage(format!("unknown node type in data: {}", type_name)) + })?; + let prop_fields: Vec = node_type + .arrow_schema + .fields() + 
.iter() + .skip(1) + .map(|field| field.as_ref().clone()) + .collect(); + let mut builders: Vec> = + vec![Vec::with_capacity(rows.len()); prop_fields.len()]; + + for row in rows.iter() { + for (idx, field) in prop_fields.iter().enumerate() { + let value = row + .data + .get(field.name()) + .cloned() + .unwrap_or(serde_json::Value::Null); + if value.is_null() && !field.is_nullable() { + return Err(NanoError::Storage(format!( + "node {}: required field '{}' missing on row {}", + type_name, + field.name(), + row.row_idx + ))); + } + if let Some(prop_type) = node_type.properties.get(field.name()) { + validate_json_value(type_name, field.name(), prop_type, &value)?; + } + builders[idx].push(value); + } + } + + let mut columns: Vec> = Vec::with_capacity(prop_fields.len()); + for (idx, field) in prop_fields.iter().enumerate() { + columns.push(json_values_to_array( + &builders[idx], + field.data_type(), + field.is_nullable(), + )?); + } + + let prop_schema = Arc::new(Schema::new(prop_fields.clone())); + let batch = RecordBatch::try_new(prop_schema, columns) + .map_err(|e| NanoError::Storage(format!("batch error: {}", e)))?; + + let key_rows: Option> = if let Some(key_prop) = key_props.get(type_name) { + let key_col_idx = prop_fields + .iter() + .position(|field| field.name() == key_prop) + .ok_or_else(|| { + NanoError::Storage(format!( + "node type {} missing @key property {}", + type_name, key_prop + )) + })?; + let key_arr = batch.column(key_col_idx).clone(); + let mut keys = Vec::with_capacity(batch.num_rows()); + for row in 0..batch.num_rows() { + keys.push(key_value_string(&key_arr, row, key_prop)?); + } + Some(keys) + } else { + None + }; + + let assigned_ids = storage.insert_nodes(type_name, batch)?; + if let Some(keys) = key_rows { + for (row, key) in keys.into_iter().enumerate() { + key_to_id.insert((type_name.to_string(), key), assigned_ids[row]); + } + } + + rows.clear(); + Ok(()) +} + +fn load_spooled_edges( + storage: &mut DatasetAccumulator, + edge_name: 
&str, + path: &Path, + key_props: &HashMap, + key_to_id: &HashMap<(String, String), u64>, + batch_size: usize, +) -> Result<()> { + let file = File::open(path)?; + let reader = BufReader::new(file); + let mut edges_by_pair: BTreeMap<(u64, u64), ResolvedEdge> = BTreeMap::new(); + + for (line_no, line) in reader.lines().enumerate() { + let line = line?; + let trimmed = line.trim(); + if trimmed.is_empty() { + continue; + } + let obj: serde_json::Value = serde_json::from_str(trimmed).map_err(|e| { + NanoError::Storage(format!( + "JSON parse error in edge spool {} line {}: {}", + edge_name, + line_no + 1, + e + )) + })?; + let resolved = resolve_edge_object(storage, &obj, key_props, key_to_id)?; + edges_by_pair.insert((resolved.from_id, resolved.to_id), resolved); + } + + if edges_by_pair.is_empty() { + return Ok(()); + } + + let resolved_edges: Vec<&ResolvedEdge> = edges_by_pair.values().collect(); + for chunk in resolved_edges.chunks(batch_size.max(1)) { + insert_resolved_edge_chunk(storage, edge_name, chunk)?; + } + + Ok(()) +} + +fn insert_resolved_edge_chunk( + storage: &mut DatasetAccumulator, + edge_name: &str, + edges: &[&ResolvedEdge], +) -> Result<()> { + let src_ids: Vec = edges.iter().map(|edge| edge.from_id).collect(); + let dst_ids: Vec = edges.iter().map(|edge| edge.to_id).collect(); + + let edge_seg = storage + .edge_segments + .get(edge_name) + .ok_or_else(|| NanoError::Storage(format!("no edge segment: {}", edge_name)))?; + let edge_type = + storage.catalog.edge_types.get(edge_name).ok_or_else(|| { + NanoError::Storage(format!("unknown edge type in data: {}", edge_name)) + })?; + let prop_fields: Vec = edge_seg + .schema + .fields() + .iter() + .skip(3) + .map(|field| field.as_ref().clone()) + .collect(); + + let prop_batch = if prop_fields.is_empty() { + None + } else { + let mut columns: Vec> = Vec::with_capacity(prop_fields.len()); + for field in &prop_fields { + let values: Vec = edges + .iter() + .map(|edge| { + edge.data + .as_ref() + 
.and_then(|data| data.get(field.name())) + .cloned() + .unwrap_or(serde_json::Value::Null) + }) + .collect(); + if let Some(prop_type) = edge_type.properties.get(field.name()) { + for value in &values { + validate_json_value(edge_name, field.name(), prop_type, value)?; + } + } + columns.push(json_values_to_array( + &values, + field.data_type(), + field.is_nullable(), + )?); + } + let schema = Arc::new(Schema::new(prop_fields)); + Some( + RecordBatch::try_new(schema, columns) + .map_err(|e| NanoError::Storage(format!("edge prop batch error: {}", e)))?, + ) + }; + + storage.insert_edges(edge_name, &src_ids, &dst_ids, prop_batch)?; + Ok(()) +} + +fn resolve_edge_object( + storage: &DatasetAccumulator, + edge_obj: &serde_json::Value, + key_props: &HashMap, + key_to_id: &HashMap<(String, String), u64>, +) -> Result { + let edge_type = edge_obj + .get("edge") + .and_then(|value| value.as_str()) + .ok_or_else(|| NanoError::Storage("edge missing type".to_string()))?; + let et = resolve_edge_type(storage, edge_type)?; + + let from_token = edge_obj + .get("from") + .and_then(|value| value.as_str()) + .ok_or_else(|| NanoError::Storage("edge missing from".to_string()))?; + let to_token = edge_obj + .get("to") + .and_then(|value| value.as_str()) + .ok_or_else(|| NanoError::Storage("edge missing to".to_string()))?; + + let from_type = et.from_type.clone(); + let to_type = et.to_type.clone(); + let edge_name = et.name.clone(); + + let (src_key_prop, dst_key_prop) = match (key_props.get(&from_type), key_props.get(&to_type)) { + (Some(src), Some(dst)) => (src, dst), + _ => { + return Err(NanoError::Storage(format!( + "edge '{}' requires @key on source type '{}' and destination type '{}'", + edge_name, from_type, to_type + ))); + } + }; + + let from_key_type = storage + .catalog + .node_types + .get(&from_type) + .and_then(|node_type| node_property_field(node_type.arrow_schema.as_ref(), src_key_prop)) + .map(|field| field.data_type().clone()) + .ok_or_else(|| { + 
NanoError::Storage(format!( + "missing @key field {} on source type {}", + src_key_prop, from_type + )) + })?; + let to_key_type = storage + .catalog + .node_types + .get(&to_type) + .and_then(|node_type| node_property_field(node_type.arrow_schema.as_ref(), dst_key_prop)) + .map(|field| field.data_type().clone()) + .ok_or_else(|| { + NanoError::Storage(format!( + "missing @key field {} on destination type {}", + dst_key_prop, to_type + )) + })?; + + let from_key = parse_edge_endpoint_key_token(from_token, &from_key_type).map_err(|e| { + NanoError::Storage(format!( + "invalid edge endpoint key for {}.{} from='{}': {}", + from_type, src_key_prop, from_token, e + )) + })?; + let to_key = parse_edge_endpoint_key_token(to_token, &to_key_type).map_err(|e| { + NanoError::Storage(format!( + "invalid edge endpoint key for {}.{} to='{}': {}", + to_type, dst_key_prop, to_token, e + )) + })?; + + let from_id = *key_to_id + .get(&(from_type.clone(), from_key.clone())) + .ok_or_else(|| { + NanoError::Storage(format!( + "node not found by @key: {}.{}={}", + from_type, src_key_prop, from_key + )) + })?; + let to_id = *key_to_id + .get(&(to_type.clone(), to_key.clone())) + .ok_or_else(|| { + NanoError::Storage(format!( + "node not found by @key: {}.{}={}", + to_type, dst_key_prop, to_key + )) + })?; + + Ok(ResolvedEdge { + from_id, + to_id, + data: edge_obj + .get("data") + .and_then(|value| value.as_object()) + .cloned(), + }) +} + +fn resolve_edge_name(storage: &DatasetAccumulator, edge_type: &str) -> Result { + Ok(resolve_edge_type(storage, edge_type)?.name.clone()) +} + +fn resolve_edge_type<'a>( + storage: &'a DatasetAccumulator, + edge_type: &str, +) -> Result<&'a crate::catalog::EdgeType> { + storage + .catalog + .edge_types + .get(edge_type) + .or_else(|| { + storage + .catalog + .edge_name_index + .get(edge_type) + .and_then(|name| storage.catalog.edge_types.get(name)) + }) + .ok_or_else(|| NanoError::Storage(format!("unknown edge type: {}", edge_type))) +} + +fn 
spool_writer_for_type<'a>( + spool_dir: &Path, + prefix: &str, + type_name: &str, + writers: &'a mut HashMap>, + paths: &mut HashMap, + spool_paths: &mut TempSpoolPaths, +) -> Result<&'a mut BufWriter> { + if !writers.contains_key(type_name) { + let path = create_temp_spool_file(spool_dir, prefix, type_name)?; + spool_paths.push(path.clone()); + let writer = BufWriter::new( + OpenOptions::new() + .create_new(false) + .write(true) + .open(&path)?, + ); + writers.insert(type_name.to_string(), writer); + paths.insert(type_name.to_string(), path); + } + writers + .get_mut(type_name) + .ok_or_else(|| NanoError::Storage(format!("failed to open spool writer for {}", type_name))) +} + +fn create_temp_spool_file(spool_dir: &Path, prefix: &str, type_name: &str) -> Result { + std::fs::create_dir_all(spool_dir)?; + let pid = std::process::id(); + let sanitized = type_name.replace(['/', '\\', ' '], "_"); + for attempt in 0..256u32 { + let now = SystemTime::now() + .duration_since(UNIX_EPOCH) + .unwrap_or_default() + .as_nanos(); + let path = spool_dir.join(format!( + ".nanograph_{}_{}_{}_{}_{}.jsonl", + prefix, sanitized, pid, now, attempt + )); + match OpenOptions::new().create_new(true).write(true).open(&path) { + Ok(_) => return Ok(path), + Err(err) if err.kind() == std::io::ErrorKind::AlreadyExists => continue, + Err(err) => return Err(err.into()), + } + } + + Err(NanoError::Storage(format!( + "failed to create temp spool file for {}", + type_name + ))) +} + +fn write_jsonl_line(writer: &mut BufWriter, value: &serde_json::Value) -> Result<()> { + serde_json::to_writer(&mut *writer, value) + .map_err(|e| NanoError::Storage(format!("serialize JSONL row failed: {}", e)))?; + writer.write_all(b"\n")?; + Ok(()) +} + +fn parse_env_usize(name: &str, default: usize) -> usize { + std::env::var(name) + .ok() + .and_then(|value| value.parse::().ok()) + .filter(|value| *value > 0) + .unwrap_or(default) +} + +fn validate_json_value( + type_name: &str, + field_name: &str, + prop_type: 
&crate::types::PropType, + value: &serde_json::Value, +) -> Result<()> { + if value.is_null() { + return Ok(()); + } + if prop_type.list { + let Some(items) = value.as_array() else { + return Err(type_mismatch_error( + type_name, + field_name, + &expected_type_name(prop_type), + value, + )); + }; + let item_type = crate::types::PropType { + scalar: prop_type.scalar, + nullable: true, + list: false, + enum_values: prop_type.enum_values.clone(), + }; + for item in items { + validate_json_value(type_name, field_name, &item_type, item)?; + } + return Ok(()); + } + if let Some(enum_values) = &prop_type.enum_values { + let Some(raw) = value.as_str() else { + return Err(type_mismatch_error( + type_name, + field_name, + &expected_type_name(prop_type), + value, + )); + }; + if enum_values.iter().any(|allowed| allowed == raw) { + return Ok(()); + } + return Err(NanoError::Storage(format!( + "invalid enum value '{}' for {}.{} (expected: {})", + raw, + type_name, + field_name, + enum_values.join(", ") + ))); + } + + let valid = match prop_type.scalar { + crate::types::ScalarType::String => value.is_string(), + crate::types::ScalarType::Bool => value.is_boolean(), + crate::types::ScalarType::I32 => { + value.as_i64().and_then(|n| i32::try_from(n).ok()).is_some() + } + crate::types::ScalarType::I64 => value.as_i64().is_some(), + crate::types::ScalarType::U32 => { + value.as_u64().and_then(|n| u32::try_from(n).ok()).is_some() + } + crate::types::ScalarType::U64 => value.as_u64().is_some(), + crate::types::ScalarType::F32 => value.as_f64().is_some(), + crate::types::ScalarType::F64 => value.as_f64().is_some(), + crate::types::ScalarType::Date => parse_date32_json_value(value).is_ok(), + crate::types::ScalarType::DateTime => parse_date64_json_value(value).is_ok(), + crate::types::ScalarType::Vector(dim) => match value.as_array() { + Some(items) if items.len() == dim as usize => { + items.iter().all(|item| item.as_f64().is_some()) + } + _ => false, + }, + }; + if valid { + Ok(()) + 
} else { + Err(type_mismatch_error( + type_name, + field_name, + &expected_type_name(prop_type), + value, + )) + } +} + +fn expected_type_name(prop_type: &crate::types::PropType) -> String { + let base = if let Some(enum_values) = &prop_type.enum_values { + format!("enum({})", enum_values.join(", ")) + } else { + prop_type.scalar.to_string() + }; + if prop_type.list { + format!("[{}]", base) + } else { + base + } +} + +fn type_mismatch_error( + type_name: &str, + field_name: &str, + expected: &str, + value: &serde_json::Value, +) -> NanoError { + NanoError::Storage(format!( + "type mismatch for {}.{}: expected {}, got {}", + type_name, + field_name, + expected, + describe_json_value(value) + )) +} + +fn describe_json_value(value: &serde_json::Value) -> String { + match value { + serde_json::Value::Null => "Null".to_string(), + serde_json::Value::Bool(v) => format!("Bool {}", v), + serde_json::Value::Number(v) => { + if v.is_i64() || v.is_u64() { + format!("Integer {}", v) + } else { + format!("Float {}", v) + } + } + serde_json::Value::String(v) => format!("String {:?}", v), + serde_json::Value::Array(v) => format!("Array {}", serde_json::Value::Array(v.clone())), + serde_json::Value::Object(v) => { + format!("Object {}", serde_json::Value::Object(v.clone())) + } + } +} + +/// Convert JSON values to an Arrow array based on the target DataType. 
+pub(crate) fn json_values_to_array( + values: &[serde_json::Value], + dt: &DataType, + nullable: bool, +) -> Result> { + let arr: Arc = match dt { + DataType::Utf8 => { + let arr: StringArray = values + .iter() + .map(|v| v.as_str().map(|s| s.to_string())) + .collect(); + Arc::new(arr) + } + DataType::Int32 => { + let arr: Int32Array = values + .iter() + .map(|v| v.as_i64().map(|n| n as i32)) + .collect(); + Arc::new(arr) + } + DataType::Int64 => { + let arr: Int64Array = values.iter().map(|v| v.as_i64()).collect(); + Arc::new(arr) + } + DataType::UInt64 => { + let arr: UInt64Array = values.iter().map(|v| v.as_u64()).collect(); + Arc::new(arr) + } + DataType::Float64 => { + let arr: Float64Array = values.iter().map(|v| v.as_f64()).collect(); + Arc::new(arr) + } + DataType::Boolean => { + let arr: BooleanArray = values.iter().map(|v| v.as_bool()).collect(); + Arc::new(arr) + } + DataType::Float32 => { + let arr: Float32Array = values + .iter() + .map(|v| v.as_f64().map(|n| n as f32)) + .collect(); + Arc::new(arr) + } + DataType::UInt32 => { + let arr: UInt32Array = values + .iter() + .map(|v| v.as_u64().map(|n| n as u32)) + .collect(); + Arc::new(arr) + } + DataType::Date32 => { + let mut out = Vec::with_capacity(values.len()); + for value in values { + out.push(parse_date32_json_value(value)?); + } + Arc::new(Date32Array::from(out)) + } + DataType::Date64 => { + let mut out = Vec::with_capacity(values.len()); + for value in values { + out.push(parse_date64_json_value(value)?); + } + Arc::new(Date64Array::from(out)) + } + DataType::List(field) => { + let mut builder = ListBuilder::with_capacity( + make_builder(field.data_type(), values.len()), + values.len(), + ) + .with_field(field.clone()); + for value in values { + if value.is_null() { + builder.append(false); + continue; + } + let Some(items) = value.as_array() else { + builder.append(false); + continue; + }; + for item in items { + append_json_to_builder(builder.values(), field.data_type(), item)?; + } + 
builder.append(true); + } + Arc::new(builder.finish()) + } + DataType::FixedSizeList(field, dim) => { + if *dim <= 0 { + return Err(NanoError::Storage(format!( + "invalid FixedSizeList dimension: {}", + dim + ))); + } + if field.data_type() != &DataType::Float32 { + return Err(NanoError::Storage(format!( + "unsupported FixedSizeList element type {:?}; expected Float32", + field.data_type() + ))); + } + + let list_len = *dim as usize; + let mut builder = FixedSizeListBuilder::with_capacity( + Float32Builder::with_capacity(values.len() * list_len), + *dim, + values.len(), + ) + .with_field(field.clone()); + + for value in values { + if value.is_null() { + for _ in 0..list_len { + builder.values().append_null(); + } + builder.append(false); + continue; + } + let items = value.as_array().ok_or_else(|| { + NanoError::Storage(format!( + "expected JSON array for FixedSizeList, got {}", + dim, value + )) + })?; + if items.len() != list_len { + return Err(NanoError::Storage(format!( + "FixedSizeList length mismatch: got {}", + dim, + items.len() + ))); + } + + for item in items { + let num = item.as_f64().ok_or_else(|| { + NanoError::Storage(format!( + "expected numeric vector element in FixedSizeList, got {}", + dim, item + )) + })?; + builder.values().append_value(num as f32); + } + builder.append(true); + } + Arc::new(builder.finish()) + } + _ => { + // Fallback to string + let arr: StringArray = values.iter().map(|v| Some(v.to_string())).collect(); + Arc::new(arr) + } + }; + if !nullable && arr.null_count() > 0 { + return Err(NanoError::Storage(format!( + "field has {} null value(s) from type mismatch (expected {:?})", + arr.null_count(), + dt + ))); + } + Ok(arr) +} + +fn append_json_to_builder( + builder: &mut Box, + dt: &DataType, + value: &serde_json::Value, +) -> Result<()> { + match dt { + DataType::Utf8 => { + let b = builder + .as_any_mut() + .downcast_mut::() + .ok_or_else(|| { + NanoError::Storage("list Utf8 builder downcast failed".to_string()) + })?; + if 
let Some(s) = value.as_str() { + b.append_value(s); + } else { + b.append_null(); + } + } + DataType::Boolean => { + let b = builder + .as_any_mut() + .downcast_mut::() + .ok_or_else(|| { + NanoError::Storage("list Boolean builder downcast failed".to_string()) + })?; + if let Some(v) = value.as_bool() { + b.append_value(v); + } else { + b.append_null(); + } + } + DataType::Int32 => { + let b = builder + .as_any_mut() + .downcast_mut::() + .ok_or_else(|| { + NanoError::Storage("list Int32 builder downcast failed".to_string()) + })?; + if let Some(v) = value.as_i64() { + if let Ok(n) = i32::try_from(v) { + b.append_value(n); + } else { + b.append_null(); + } + } else { + b.append_null(); + } + } + DataType::Int64 => { + let b = builder + .as_any_mut() + .downcast_mut::() + .ok_or_else(|| { + NanoError::Storage("list Int64 builder downcast failed".to_string()) + })?; + if let Some(v) = value.as_i64() { + b.append_value(v); + } else { + b.append_null(); + } + } + DataType::UInt32 => { + let b = builder + .as_any_mut() + .downcast_mut::() + .ok_or_else(|| { + NanoError::Storage("list UInt32 builder downcast failed".to_string()) + })?; + if let Some(v) = value.as_u64() { + if let Ok(n) = u32::try_from(v) { + b.append_value(n); + } else { + b.append_null(); + } + } else { + b.append_null(); + } + } + DataType::UInt64 => { + let b = builder + .as_any_mut() + .downcast_mut::() + .ok_or_else(|| { + NanoError::Storage("list UInt64 builder downcast failed".to_string()) + })?; + if let Some(v) = value.as_u64() { + b.append_value(v); + } else { + b.append_null(); + } + } + DataType::Float32 => { + let b = builder + .as_any_mut() + .downcast_mut::() + .ok_or_else(|| { + NanoError::Storage("list Float32 builder downcast failed".to_string()) + })?; + if let Some(v) = value.as_f64() { + b.append_value(v as f32); + } else { + b.append_null(); + } + } + DataType::Float64 => { + let b = builder + .as_any_mut() + .downcast_mut::() + .ok_or_else(|| { + NanoError::Storage("list Float64 
builder downcast failed".to_string()) + })?; + if let Some(v) = value.as_f64() { + b.append_value(v); + } else { + b.append_null(); + } + } + DataType::Date32 => { + let b = builder + .as_any_mut() + .downcast_mut::() + .ok_or_else(|| { + NanoError::Storage("list Date32 builder downcast failed".to_string()) + })?; + if let Some(v) = parse_date32_json_value(value)? { + b.append_value(v); + } else { + b.append_null(); + } + } + DataType::Date64 => { + let b = builder + .as_any_mut() + .downcast_mut::() + .ok_or_else(|| { + NanoError::Storage("list Date64 builder downcast failed".to_string()) + })?; + if let Some(v) = parse_date64_json_value(value)? { + b.append_value(v); + } else { + b.append_null(); + } + } + DataType::List(field) => { + let b = builder + .as_any_mut() + .downcast_mut::>>() + .ok_or_else(|| { + NanoError::Storage("nested list builder downcast failed".to_string()) + })?; + if value.is_null() { + b.append(false); + } else if let Some(items) = value.as_array() { + for item in items { + append_json_to_builder(b.values(), field.data_type(), item)?; + } + b.append(true); + } else { + b.append(false); + } + } + other => { + return Err(NanoError::Storage(format!( + "unsupported list element data type {:?}", + other + ))); + } + } + + Ok(()) +} + +fn parse_date32_json_value(value: &serde_json::Value) -> Result> { + if value.is_null() { + return Ok(None); + } + if let Some(days) = value.as_i64() { + return i32::try_from(days) + .map(Some) + .map_err(|_| NanoError::Storage(format!("Date32 value out of range: {}", days))); + } + if let Some(days) = value.as_u64() { + return i32::try_from(days) + .map(Some) + .map_err(|_| NanoError::Storage(format!("Date32 value out of range: {}", days))); + } + if let Some(s) = value.as_str() { + return Ok(Some(parse_date32_literal(s)?)); + } + Ok(None) +} + +fn parse_date64_json_value(value: &serde_json::Value) -> Result> { + if value.is_null() { + return Ok(None); + } + if let Some(ms) = value.as_i64() { + return 
Ok(Some(ms)); + } + if let Some(ms) = value.as_u64() { + return i64::try_from(ms) + .map(Some) + .map_err(|_| NanoError::Storage(format!("Date64 value out of range: {}", ms))); + } + if let Some(s) = value.as_str() { + return Ok(Some(parse_date64_literal(s)?)); + } + Ok(None) +} + +fn parse_edge_endpoint_key_token(token: &str, dt: &DataType) -> Result { + match dt { + DataType::Utf8 => Ok(token.to_string()), + DataType::Boolean => token + .parse::() + .map(|v| v.to_string()) + .map_err(|e| NanoError::Storage(format!("expected bool token: {}", e))), + DataType::Int32 => token + .parse::() + .map(|v| v.to_string()) + .map_err(|e| NanoError::Storage(format!("expected Int32 token: {}", e))), + DataType::Int64 => token + .parse::() + .map(|v| v.to_string()) + .map_err(|e| NanoError::Storage(format!("expected Int64 token: {}", e))), + DataType::UInt32 => token + .parse::() + .map(|v| v.to_string()) + .map_err(|e| NanoError::Storage(format!("expected UInt32 token: {}", e))), + DataType::UInt64 => token + .parse::() + .map(|v| v.to_string()) + .map_err(|e| NanoError::Storage(format!("expected UInt64 token: {}", e))), + DataType::Float32 => token + .parse::() + .map(|v| v.to_string()) + .map_err(|e| NanoError::Storage(format!("expected Float32 token: {}", e))), + DataType::Float64 => token + .parse::() + .map(|v| v.to_string()) + .map_err(|e| NanoError::Storage(format!("expected Float64 token: {}", e))), + DataType::Date32 => parse_date32_literal(token).map(|v| v.to_string()), + DataType::Date64 => parse_date64_literal(token).map(|v| v.to_string()), + other => Err(NanoError::Storage(format!( + "unsupported @key type for edge endpoint resolution: {:?}", + other + ))), + } +} + +pub(crate) fn parse_date32_literal(s: &str) -> Result { + let raw: Arc = Arc::new(StringArray::from(vec![Some(s)])); + let casted = arrow_cast::cast(raw.as_ref(), &DataType::Date32) + .map_err(|e| NanoError::Storage(format!("invalid Date literal '{}': {}", s, e)))?; + let out = casted + .as_any() + 
.downcast_ref::() + .ok_or_else(|| NanoError::Storage("Date32 cast produced unexpected array".to_string()))?; + if out.is_null(0) { + return Err(NanoError::Storage(format!("invalid Date literal '{}'", s))); + } + Ok(out.value(0)) +} + +pub(crate) fn parse_date64_literal(s: &str) -> Result { + let raw: Arc = Arc::new(StringArray::from(vec![Some(s)])); + let casted = arrow_cast::cast(raw.as_ref(), &DataType::Date64) + .map_err(|e| NanoError::Storage(format!("invalid DateTime literal '{}': {}", s, e)))?; + let out = casted + .as_any() + .downcast_ref::() + .ok_or_else(|| NanoError::Storage("Date64 cast produced unexpected array".to_string()))?; + if out.is_null(0) { + return Err(NanoError::Storage(format!( + "invalid DateTime literal '{}'", + s + ))); + } + Ok(out.value(0)) +} + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + use std::io::Cursor; + + use serde_json::json; + + use crate::catalog::schema_ir::{build_catalog_from_ir, build_schema_ir}; + use crate::schema::parser::parse_schema; + + use super::*; + + fn test_schema() -> &'static str { + r#"node Person { + name: String @key +} +edge Knows: Person -> Person +"# + } + + fn build_storage(schema_src: &str) -> DatasetAccumulator { + let schema = parse_schema(schema_src).unwrap(); + let ir = build_schema_ir(&schema).unwrap(); + let catalog = build_catalog_from_ir(&ir).unwrap(); + DatasetAccumulator::new(catalog) + } + + fn person_key_props() -> HashMap { + HashMap::from([("Person".to_string(), "name".to_string())]) + } + + fn person_id_by_name(storage: &DatasetAccumulator, name: &str) -> u64 { + let batch = storage.get_all_nodes("Person").unwrap().unwrap(); + let id_col = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + let name_col = batch + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + (0..batch.num_rows()) + .find(|&i| name_col.value(i) == name) + .map(|i| id_col.value(i)) + .unwrap() + } + + #[test] + fn json_values_to_array_rejects_non_nullable_mismatch() { + let 
values = vec![json!("abc"), json!(42)]; + let err = json_values_to_array(&values, &DataType::Int32, false).unwrap_err(); + assert!( + err.to_string().contains("null value"), + "unexpected error: {err}" + ); + } + + #[test] + fn json_values_to_array_accepts_iso_date_strings() { + let values = vec![json!("2026-02-14"), json!(null)]; + let arr = json_values_to_array(&values, &DataType::Date32, true).unwrap(); + let arr = arr.as_any().downcast_ref::().unwrap(); + assert!(!arr.is_null(0)); + assert!(arr.is_null(1)); + } + + #[test] + fn json_values_to_array_accepts_iso_datetime_strings() { + let values = vec![json!("2026-02-14T10:00:00Z"), json!(null)]; + let arr = json_values_to_array(&values, &DataType::Date64, true).unwrap(); + let arr = arr.as_any().downcast_ref::().unwrap(); + assert!(!arr.is_null(0)); + assert!(arr.is_null(1)); + } + + #[test] + fn json_values_to_array_builds_list_values() { + let values = vec![json!([1, 2]), json!(null), json!([3])]; + let dt = DataType::List(Arc::new(Field::new("item", DataType::Int64, true))); + let arr = json_values_to_array(&values, &dt, true).unwrap(); + let list = arr + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(list.len(), 3); + assert!(!list.is_null(0)); + assert!(list.is_null(1)); + assert!(!list.is_null(2)); + + let first = list.value(0); + let first = first.as_any().downcast_ref::().unwrap(); + assert_eq!(first.len(), 2); + assert_eq!(first.value(0), 1); + assert_eq!(first.value(1), 2); + } + + #[test] + fn json_values_to_array_builds_fixed_size_list_vectors() { + let values = vec![json!([0.1, 0.2, 0.3]), json!(null), json!([1, 2, 3])]; + let dt = DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Float32, true)), 3); + let arr = json_values_to_array(&values, &dt, true).unwrap(); + let vecs = arr + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(vecs.len(), 3); + assert!(!vecs.is_null(0)); + assert!(vecs.is_null(1)); + assert!(!vecs.is_null(2)); + } + + #[test] + fn 
json_values_to_array_rejects_fixed_size_list_length_mismatch() { + let values = vec![json!([0.1, 0.2])]; + let dt = DataType::FixedSizeList(Arc::new(Field::new("item", DataType::Float32, true)), 3); + let err = json_values_to_array(&values, &dt, true).unwrap_err(); + assert!(err.to_string().contains("length mismatch")); + } + + #[test] + fn load_jsonl_with_name_seed_resolves_edges_to_existing_nodes() { + let mut existing = build_storage(test_schema()); + load_jsonl_data( + &mut existing, + r#"{"type":"Person","data":{"name":"Alice"}}"#, + &person_key_props(), + ) + .unwrap(); + let alice_id = person_id_by_name(&existing, "Alice"); + + let data = r#"{"type":"Person","data":{"name":"Bob"}} +{"edge":"Knows","from":"Alice","to":"Bob"}"#; + + let mut no_seed = build_storage(test_schema()); + let err = load_jsonl_data(&mut no_seed, data, &person_key_props()).unwrap_err(); + assert!( + err.to_string().contains("node not found by @key"), + "unexpected error: {err}" + ); + + let mut seeded = build_storage(test_schema()); + let mut seed = HashMap::new(); + seed.insert(("Person".to_string(), "Alice".to_string()), alice_id); + load_jsonl_data_with_name_seed(&mut seeded, data, &person_key_props(), Some(&seed)) + .unwrap(); + + let bob_id = person_id_by_name(&seeded, "Bob"); + let knows = &seeded.edge_segments["Knows"]; + assert_eq!(knows.edge_ids.len(), 1); + assert_eq!(knows.src_ids[0], alice_id); + assert_eq!(knows.dst_ids[0], bob_id); + } + + #[test] + fn load_jsonl_reader_handles_forward_reference_edges() { + let mut storage = build_storage(test_schema()); + let data = r#"{"edge":"Knows","from":"Alice","to":"Bob"} +{"type":"Person","data":{"name":"Alice"}} +{"type":"Person","data":{"name":"Bob"}}"#; + + load_jsonl_reader( + &mut storage, + Cursor::new(data.as_bytes()), + &person_key_props(), + ) + .unwrap(); + + let knows = &storage.edge_segments["Knows"]; + assert_eq!(knows.edge_ids.len(), 1); + } + + #[test] + fn load_jsonl_deduplicates_duplicate_edges() { + let mut 
storage = build_storage(test_schema()); + let data = r#"{"type":"Person","data":{"name":"Alice"}} +{"type":"Person","data":{"name":"Bob"}} +{"edge":"Knows","from":"Alice","to":"Bob"} +{"edge":"Knows","from":"Alice","to":"Bob"}"#; + + load_jsonl_data(&mut storage, data, &person_key_props()).unwrap(); + let knows = &storage.edge_segments["Knows"]; + assert_eq!(knows.edge_ids.len(), 1); + } + + #[test] + fn load_jsonl_edges_require_endpoint_key_annotations() { + let schema = r#"node Event { + title: String + at: Date +} +edge Precedes: Event -> Event +"#; + let mut storage = build_storage(schema); + let data = r#"{"type":"Event","data":{"title":"Kickoff","at":"2026-02-14"}} +{"type":"Event","data":{"title":"Wrap","at":"2026-02-15"}} +{"edge":"Precedes","from":"Kickoff","to":"Wrap"}"#; + + let err = load_jsonl_data(&mut storage, data, &HashMap::new()).unwrap_err(); + assert!( + err.to_string() + .contains("requires @key on source type 'Event' and destination type 'Event'"), + "unexpected error: {err}" + ); + } + + #[test] + fn load_jsonl_edges_resolve_by_non_name_key() { + let schema = r#"node User { + uid: String @key + display_name: String +} +edge Follows: User -> User +"#; + let mut storage = build_storage(schema); + let key_props = HashMap::from([("User".to_string(), "uid".to_string())]); + let data = r#"{"type":"User","data":{"uid":"usr_01","display_name":"Alice"}} +{"type":"User","data":{"uid":"usr_02","display_name":"Bob"}} +{"edge":"Follows","from":"usr_01","to":"usr_02"}"#; + + load_jsonl_data(&mut storage, data, &key_props).unwrap(); + let follows = &storage.edge_segments["Follows"]; + assert_eq!(follows.edge_ids.len(), 1); + } + + #[test] + fn load_jsonl_edges_resolve_by_user_property_named_id() { + let schema = r#"node User { + id: String @key + display_name: String +} +edge Follows: User -> User +"#; + let mut storage = build_storage(schema); + let key_props = HashMap::from([("User".to_string(), "id".to_string())]); + let data = 
r#"{"type":"User","data":{"id":"usr_01","display_name":"Alice"}} +{"type":"User","data":{"id":"usr_02","display_name":"Bob"}} +{"edge":"Follows","from":"usr_01","to":"usr_02"}"#; + + load_jsonl_data(&mut storage, data, &key_props).unwrap(); + + let users = storage.get_all_nodes("User").unwrap().unwrap(); + let user_ids = users + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(user_ids.value(0), "usr_01"); + assert_eq!(user_ids.value(1), "usr_02"); + + let follows = &storage.edge_segments["Follows"]; + assert_eq!(follows.edge_ids.len(), 1); + } + + #[test] + fn load_jsonl_edges_parse_non_string_key_tokens() { + let schema = r#"node User { + uid: U64 @key + display_name: String +} +edge Follows: User -> User +"#; + let mut storage = build_storage(schema); + let key_props = HashMap::from([("User".to_string(), "uid".to_string())]); + let data = r#"{"type":"User","data":{"uid":1,"display_name":"Alice"}} +{"type":"User","data":{"uid":2,"display_name":"Bob"}} +{"edge":"Follows","from":"1","to":"2"}"#; + + load_jsonl_data(&mut storage, data, &key_props).unwrap(); + let follows = &storage.edge_segments["Follows"]; + assert_eq!(follows.edge_ids.len(), 1); + } + + #[test] + fn load_jsonl_rejects_invalid_node_enum_values() { + let schema = r#"node Person { + name: String @key + role: enum(admin, member, guest) +}"#; + let mut storage = build_storage(schema); + let err = load_jsonl_data( + &mut storage, + r#"{"type":"Person","data":{"name":"Bad","role":"superadmin"}}"#, + &HashMap::from([("Person".to_string(), "name".to_string())]), + ) + .unwrap_err(); + assert_eq!( + err.to_string(), + "storage error: invalid enum value 'superadmin' for Person.role (expected: admin, guest, member)" + ); + } + + #[test] + fn load_jsonl_rejects_invalid_edge_enum_values() { + let schema = r#"node Person { + name: String @key +} +edge WorksWith: Person -> Person { + role: enum(lead, contributor) +}"#; + let mut storage = build_storage(schema); + let data = 
r#"{"type":"Person","data":{"name":"Alice"}} +{"type":"Person","data":{"name":"Bob"}} +{"edge":"WorksWith","from":"Alice","to":"Bob","data":{"role":"manager"}}"#; + let err = load_jsonl_data( + &mut storage, + data, + &HashMap::from([("Person".to_string(), "name".to_string())]), + ) + .unwrap_err(); + assert_eq!( + err.to_string(), + "storage error: invalid enum value 'manager' for WorksWith.role (expected: contributor, lead)" + ); + } + + #[test] + fn load_jsonl_rejects_wrong_type_for_nullable_node_field() { + let schema = r#"node Person { + name: String @key + age: I32? +}"#; + let mut storage = build_storage(schema); + let err = load_jsonl_data( + &mut storage, + r#"{"type":"Person","data":{"name":"Bad","age":"not-a-number"}}"#, + &HashMap::from([("Person".to_string(), "name".to_string())]), + ) + .unwrap_err(); + assert_eq!( + err.to_string(), + r#"storage error: type mismatch for Person.age: expected I32, got String "not-a-number""# + ); + } +} diff --git a/crates/omnigraph/src/loader/mod.rs b/crates/omnigraph/src/loader/mod.rs new file mode 100644 index 0000000..bb7f5cc --- /dev/null +++ b/crates/omnigraph/src/loader/mod.rs @@ -0,0 +1,1631 @@ +use std::collections::{HashMap, HashSet}; + +use std::io::{BufRead, BufReader, Cursor}; +use std::sync::Arc; + +use arrow_array::{ + Array, ArrayRef, BooleanArray, Date32Array, Date64Array, Float32Array, Float64Array, + Int32Array, Int64Array, RecordBatch, StringArray, UInt32Array, UInt64Array, + builder::{ + ArrayBuilder, BooleanBuilder, Date32Builder, Date64Builder, FixedSizeListBuilder, + Float32Builder, Float64Builder, Int32Builder, Int64Builder, ListBuilder, StringBuilder, + UInt32Builder, UInt64Builder, + }, +}; +use arrow_schema::DataType; +use base64::Engine; +use lance::blob::BlobArrayBuilder; +use omnigraph_compiler::catalog::NodeType; +use serde::{Deserialize, Serialize}; +use serde_json::Value as JsonValue; + +use crate::db::Omnigraph; +use crate::error::{OmniError, Result}; + +/// Result of a load 
operation. +#[derive(Debug, Clone, Default)] +pub struct LoadResult { + pub nodes_loaded: HashMap, + pub edges_loaded: HashMap, +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct IngestTableResult { + pub table_key: String, + pub rows_loaded: usize, +} + +#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] +pub struct IngestResult { + pub branch: String, + pub base_branch: String, + pub branch_created: bool, + pub mode: LoadMode, + pub tables: Vec, +} + +/// Load mode for data ingestion. +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[serde(rename_all = "snake_case")] +pub enum LoadMode { + /// Overwrite existing data. + Overwrite, + /// Append to existing data. + Append, + /// Merge by `id` key (upsert). + Merge, +} + +/// Load JSONL data into an Omnigraph database. +pub async fn load_jsonl(db: &mut Omnigraph, data: &str, mode: LoadMode) -> Result { + let current_branch = db.active_branch().map(str::to_string); + let branch = current_branch.as_deref().unwrap_or("main"); + db.load(branch, data, mode).await +} + +/// Load JSONL data from a file path. 
+pub async fn load_jsonl_file(db: &mut Omnigraph, path: &str, mode: LoadMode) -> Result { + let current_branch = db.active_branch().map(str::to_string); + let branch = current_branch.as_deref().unwrap_or("main"); + db.load_file(branch, path, mode).await +} + +impl Omnigraph { + pub async fn ingest( + &mut self, + branch: &str, + from: Option<&str>, + data: &str, + mode: LoadMode, + ) -> Result { + self.ingest_as(branch, from, data, mode, None).await + } + + pub async fn ingest_as( + &mut self, + branch: &str, + from: Option<&str>, + data: &str, + mode: LoadMode, + actor_id: Option<&str>, + ) -> Result { + let previous_actor = self.audit_actor_id.clone(); + self.audit_actor_id = actor_id.map(str::to_string); + let result = self + .ingest_with_current_actor(branch, from, data, mode) + .await; + self.audit_actor_id = previous_actor; + result + } + + pub async fn ingest_file( + &mut self, + branch: &str, + from: Option<&str>, + path: &str, + mode: LoadMode, + ) -> Result { + self.ingest_file_as(branch, from, path, mode, None).await + } + + pub async fn ingest_file_as( + &mut self, + branch: &str, + from: Option<&str>, + path: &str, + mode: LoadMode, + actor_id: Option<&str>, + ) -> Result { + let data = std::fs::read_to_string(path).map_err(OmniError::Io)?; + self.ingest_as(branch, from, &data, mode, actor_id).await + } + + async fn ingest_with_current_actor( + &mut self, + branch: &str, + from: Option<&str>, + data: &str, + mode: LoadMode, + ) -> Result { + self.ensure_schema_state_valid().await?; + let target_branch = + Self::normalize_branch_name(branch)?.unwrap_or_else(|| "main".to_string()); + let base_branch = Self::normalize_branch_name(from.unwrap_or("main"))? + .unwrap_or_else(|| "main".to_string()); + let branch_created = !self + .branch_list() + .await? 
+ .iter() + .any(|name| name == &target_branch); + if branch_created { + self.branch_create_from(crate::db::ReadTarget::branch(&base_branch), &target_branch) + .await?; + } + + let result = self.load(&target_branch, data, mode).await?; + Ok(IngestResult { + branch: target_branch, + base_branch, + branch_created, + mode, + tables: result.to_ingest_tables(), + }) + } + + pub async fn load(&mut self, branch: &str, data: &str, mode: LoadMode) -> Result { + self.ensure_schema_state_valid().await?; + let requested = Self::normalize_branch_name(branch)?.unwrap_or_else(|| "main".to_string()); + if crate::db::is_internal_run_branch(&requested) { + return self + .load_direct_on_branch(Some(requested.as_str()), data, mode) + .await; + } + + let target_head_before = self.latest_branch_snapshot_id(&requested).await?; + let op = format!("load_jsonl:branch={}:mode={}", requested, mode.as_str()); + let run = self.begin_run(&requested, Some(op.as_str())).await?; + let staged_result = match self + .load_direct_on_branch(Some(run.run_branch.as_str()), data, mode) + .await + { + Ok(result) => result, + Err(err) => { + let _ = self.fail_run(&run.run_id).await; + return Err(err); + } + }; + + let target_head_now = self.latest_branch_snapshot_id(&requested).await?; + if target_head_now.as_str() != target_head_before.as_str() { + let _ = self.fail_run(&run.run_id).await; + return Err(OmniError::manifest_conflict(format!( + "target branch '{}' advanced during transactional load; retry", + requested + ))); + } + + if let Err(err) = self.publish_run(&run.run_id).await { + let _ = self.fail_run(&run.run_id).await; + return Err(err); + } + + Ok(staged_result) + } + + pub async fn load_file( + &mut self, + branch: &str, + path: &str, + mode: LoadMode, + ) -> Result { + let data = std::fs::read_to_string(path).map_err(|e| OmniError::Io(e))?; + self.load(branch, &data, mode).await + } + + async fn load_direct_on_branch( + &mut self, + branch: Option<&str>, + data: &str, + mode: LoadMode, + ) -> 
Result { + let reader = BufReader::new(Cursor::new(data.as_bytes())); + load_jsonl_reader(self, branch, reader, mode).await + } +} + +impl LoadMode { + pub fn as_str(self) -> &'static str { + match self { + LoadMode::Overwrite => "overwrite", + LoadMode::Append => "append", + LoadMode::Merge => "merge", + } + } +} + +impl LoadResult { + pub fn to_ingest_tables(&self) -> Vec { + let mut tables = self + .nodes_loaded + .iter() + .map(|(type_name, rows_loaded)| IngestTableResult { + table_key: format!("node:{type_name}"), + rows_loaded: *rows_loaded, + }) + .chain( + self.edges_loaded + .iter() + .map(|(edge_name, rows_loaded)| IngestTableResult { + table_key: format!("edge:{edge_name}"), + rows_loaded: *rows_loaded, + }), + ) + .collect::>(); + tables.sort_by(|a, b| a.table_key.cmp(&b.table_key)); + tables + } +} + +async fn load_jsonl_reader( + db: &mut Omnigraph, + branch: Option<&str>, + reader: R, + mode: LoadMode, +) -> Result { + let catalog = db.catalog().clone(); + + // Phase 1: Parse all lines, spool into per-type collections + let mut node_rows: HashMap> = HashMap::new(); + let mut edge_rows: HashMap> = HashMap::new(); + + for (line_num, line) in reader.lines().enumerate() { + let line = line?; + let line = line.trim(); + if line.is_empty() { + continue; + } + let value: JsonValue = serde_json::from_str(line).map_err(|e| { + OmniError::manifest(format!("invalid JSON on line {}: {}", line_num + 1, e)) + })?; + + if let Some(type_name) = value.get("type").and_then(|v| v.as_str()) { + if !catalog.node_types.contains_key(type_name) { + return Err(OmniError::manifest(format!( + "line {}: unknown node type '{}'", + line_num + 1, + type_name + ))); + } + let data = value + .get("data") + .cloned() + .unwrap_or(JsonValue::Object(serde_json::Map::new())); + node_rows + .entry(type_name.to_string()) + .or_default() + .push(data); + } else if let Some(edge_name) = value.get("edge").and_then(|v| v.as_str()) { + if catalog.lookup_edge_by_name(edge_name).is_none() { + 
return Err(OmniError::manifest(format!( + "line {}: unknown edge type '{}'", + line_num + 1, + edge_name + ))); + } + let from = value + .get("from") + .and_then(|v| v.as_str()) + .ok_or_else(|| { + OmniError::manifest(format!("line {}: edge missing 'from'", line_num + 1)) + })? + .to_string(); + let to = value + .get("to") + .and_then(|v| v.as_str()) + .ok_or_else(|| { + OmniError::manifest(format!("line {}: edge missing 'to'", line_num + 1)) + })? + .to_string(); + let data = value + .get("data") + .cloned() + .unwrap_or(JsonValue::Object(serde_json::Map::new())); + let canonical = catalog.lookup_edge_by_name(edge_name).unwrap().name.clone(); + edge_rows + .entry(canonical) + .or_default() + .push((from, to, data)); + } else { + return Err(OmniError::manifest(format!( + "line {}: expected 'type' or 'edge' field", + line_num + 1 + ))); + } + } + + // Phase 2: Build per-type RecordBatches and write to Lance + + let mut updates = Vec::new(); + let mut result = LoadResult::default(); + let snapshot = db.snapshot_for_branch(branch).await?; + + // Write nodes first (edges reference node IDs) + for (type_name, rows) in &node_rows { + let node_type = &catalog.node_types[type_name]; + let batch = build_node_batch(node_type, rows)?; + + // Validate value constraints before writing + validate_value_constraints(&batch, node_type)?; + + let loaded_count = batch.num_rows(); + + let table_key = format!("node:{}", type_name); + snapshot + .entry(&table_key) + .ok_or_else(|| OmniError::manifest(format!("no manifest entry for {}", table_key)))?; + + let (state, table_branch) = + write_batch_to_dataset(db, branch, &table_key, batch, mode).await?; + + updates.push(crate::db::SubTableUpdate { + table_key, + table_version: state.version, + table_branch, + row_count: state.row_count, + version_metadata: state.version_metadata, + }); + result.nodes_loaded.insert(type_name.clone(), loaded_count); + } + + // Phase 2b: Validate edge referential integrity — every src/dst must + // reference 
an existing node ID in the appropriate type. + for (edge_name, rows) in &edge_rows { + let edge_type = &catalog.edge_types[edge_name]; + let from_ids = collect_node_ids( + db, + branch, + &edge_type.from_type, + &node_rows, + &catalog, + &updates, + ) + .await?; + let to_ids = collect_node_ids( + db, + branch, + &edge_type.to_type, + &node_rows, + &catalog, + &updates, + ) + .await?; + + for (i, (src, dst, _)) in rows.iter().enumerate() { + if !from_ids.contains(src.as_str()) { + return Err(OmniError::manifest(format!( + "edge {} row {}: src '{}' not found in {}", + edge_name, + i + 1, + src, + edge_type.from_type + ))); + } + if !to_ids.contains(dst.as_str()) { + return Err(OmniError::manifest(format!( + "edge {} row {}: dst '{}' not found in {}", + edge_name, + i + 1, + dst, + edge_type.to_type + ))); + } + } + } + + // Write edges + for (edge_name, rows) in &edge_rows { + let edge_type = &catalog.edge_types[edge_name]; + let batch = build_edge_batch(edge_type, rows)?; + let loaded_count = batch.num_rows(); + + let table_key = format!("edge:{}", edge_name); + snapshot + .entry(&table_key) + .ok_or_else(|| OmniError::manifest(format!("no manifest entry for {}", table_key)))?; + + let (state, table_branch) = + write_batch_to_dataset(db, branch, &table_key, batch, mode).await?; + + updates.push(crate::db::SubTableUpdate { + table_key, + table_version: state.version, + table_branch, + row_count: state.row_count, + version_metadata: state.version_metadata, + }); + result.edges_loaded.insert(edge_name.clone(), loaded_count); + } + + // Phase 3: Validate edge cardinality constraints (before commit — invalid + // data must not be committed). Opens edge sub-tables at their just-written + // versions, not through the snapshot (which still pins to pre-write state). 
+ for (edge_name, _) in &edge_rows { + let table_key = format!("edge:{}", edge_name); + if let Some(update) = updates.iter().find(|u| u.table_key == table_key) { + validate_edge_cardinality( + db, + branch, + edge_name, + update.table_version, + update.table_branch.as_deref(), + ) + .await?; + } + } + + // Phase 4: Atomic manifest commit + db.commit_updates_on_branch(branch, &updates).await?; + + Ok(result) +} + +fn build_node_batch(node_type: &NodeType, rows: &[JsonValue]) -> Result { + let schema = node_type.arrow_schema.clone(); + + // Build id column: explicit id, @key value, or generated ULID. + let ids: Vec = rows + .iter() + .map(|row| { + let explicit_id = row.get("id").and_then(|v| v.as_str()).map(str::to_string); + if let Some(key_prop) = node_type.key_property() { + let key_value = row + .get(key_prop) + .and_then(|v| v.as_str()) + .map(|s| s.to_string()) + .ok_or_else(|| { + OmniError::manifest(format!( + "node {} missing @key property '{}'", + node_type.name, key_prop + )) + })?; + if let Some(explicit_id) = explicit_id { + if explicit_id != key_value { + return Err(OmniError::manifest(format!( + "node {} has explicit id '{}' that does not match @key property '{}' value '{}'", + node_type.name, explicit_id, key_prop, key_value + ))); + } + } + Ok(key_value) + } else if let Some(explicit_id) = explicit_id { + Ok(explicit_id) + } else { + Ok(generate_id()) + } + }) + .collect::>>()?; + + let mut columns: Vec = Vec::with_capacity(schema.fields().len()); + columns.push(Arc::new(StringArray::from(ids))); + + // Build property columns (skip "id" field at index 0) + for field in schema.fields().iter().skip(1) { + if node_type.blob_properties.contains(field.name()) { + let col = build_blob_column(field.name(), field.is_nullable(), rows)?; + columns.push(col); + } else { + let col = + build_column_from_json(field.name(), field.data_type(), field.is_nullable(), rows)?; + columns.push(col); + } + } + + RecordBatch::try_new(schema, columns).map_err(|e| 
OmniError::Lance(e.to_string())) +} + +fn build_edge_batch( + edge_type: &omnigraph_compiler::catalog::EdgeType, + rows: &[(String, String, JsonValue)], +) -> Result { + let schema = edge_type.arrow_schema.clone(); + + let ids: Vec = rows + .iter() + .map(|(_, _, data)| { + data.get("id") + .and_then(|v| v.as_str()) + .map(str::to_string) + .unwrap_or_else(generate_id) + }) + .collect(); + let srcs: Vec<&str> = rows.iter().map(|(from, _, _)| from.as_str()).collect(); + let dsts: Vec<&str> = rows.iter().map(|(_, to, _)| to.as_str()).collect(); + + let mut columns: Vec = Vec::with_capacity(schema.fields().len()); + columns.push(Arc::new(StringArray::from(ids))); + columns.push(Arc::new(StringArray::from(srcs))); + columns.push(Arc::new(StringArray::from(dsts))); + + // Build edge property columns (skip id, src, dst at indices 0-2) + let data_values: Vec = rows.iter().map(|(_, _, data)| data.clone()).collect(); + for field in schema.fields().iter().skip(3) { + if edge_type.blob_properties.contains(field.name()) { + let col = build_blob_column(field.name(), field.is_nullable(), &data_values)?; + columns.push(col); + } else { + let col = build_column_from_json( + field.name(), + field.data_type(), + field.is_nullable(), + &data_values, + )?; + columns.push(col); + } + } + + RecordBatch::try_new(schema, columns).map_err(|e| OmniError::Lance(e.to_string())) +} + +/// Append a blob value (URI or base64 bytes) to a BlobArrayBuilder. 
+pub(crate) fn append_blob_value(builder: &mut BlobArrayBuilder, value: &str) -> Result<()> { + if let Some(encoded) = value.strip_prefix("base64:") { + let bytes = base64::engine::general_purpose::STANDARD + .decode(encoded) + .map_err(|e| OmniError::manifest(format!("invalid base64 blob data: {}", e)))?; + builder + .push_bytes(bytes) + .map_err(|e| OmniError::Lance(e.to_string())) + } else { + // Treat as URI (file://, s3://, gs://, or any other scheme) + builder + .push_uri(value) + .map_err(|e| OmniError::Lance(e.to_string())) + } +} + +/// Build a blob column from JSON values using Lance BlobArrayBuilder. +fn build_blob_column(name: &str, nullable: bool, rows: &[JsonValue]) -> Result { + let mut builder = BlobArrayBuilder::new(rows.len()); + for row in rows { + match row.get(name) { + Some(JsonValue::String(s)) => { + append_blob_value(&mut builder, s)?; + } + Some(JsonValue::Null) | None if nullable => { + builder + .push_null() + .map_err(|e| OmniError::Lance(e.to_string()))?; + } + Some(JsonValue::Null) | None => { + return Err(OmniError::manifest(format!( + "non-nullable blob property '{}' has null values", + name + ))); + } + _ => { + return Err(OmniError::manifest(format!( + "blob property '{}' must be a URI string or base64: prefixed data", + name + ))); + } + } + } + builder + .finish() + .map_err(|e| OmniError::Lance(e.to_string())) +} + +fn build_column_from_json( + name: &str, + data_type: &DataType, + nullable: bool, + rows: &[JsonValue], +) -> Result { + let array: ArrayRef = match data_type { + DataType::Utf8 => { + let values: Vec> = rows + .iter() + .map(|row| { + row.get(name) + .and_then(|v| v.as_str()) + .map(|s| s.to_string()) + }) + .collect(); + Arc::new(StringArray::from(values)) + } + DataType::Int32 => { + let values: Vec> = rows + .iter() + .map(|row| row.get(name).and_then(|v| v.as_i64()).map(|v| v as i32)) + .collect(); + Arc::new(Int32Array::from(values)) + } + DataType::Int64 => { + let values: Vec> = rows + .iter() + .map(|row| 
row.get(name).and_then(|v| v.as_i64())) + .collect(); + Arc::new(Int64Array::from(values)) + } + DataType::UInt32 => { + let values: Vec> = rows + .iter() + .map(|row| row.get(name).and_then(|v| v.as_u64()).map(|v| v as u32)) + .collect(); + Arc::new(UInt32Array::from(values)) + } + DataType::UInt64 => { + let values: Vec> = rows + .iter() + .map(|row| row.get(name).and_then(|v| v.as_u64())) + .collect(); + Arc::new(UInt64Array::from(values)) + } + DataType::Float32 => { + let values: Vec> = rows + .iter() + .map(|row| row.get(name).and_then(|v| v.as_f64()).map(|v| v as f32)) + .collect(); + Arc::new(Float32Array::from(values)) + } + DataType::Float64 => { + let values: Vec> = rows + .iter() + .map(|row| row.get(name).and_then(|v| v.as_f64())) + .collect(); + Arc::new(Float64Array::from(values)) + } + DataType::Boolean => { + let values: Vec> = rows + .iter() + .map(|row| row.get(name).and_then(|v| v.as_bool())) + .collect(); + Arc::new(BooleanArray::from(values)) + } + DataType::Date32 => { + let mut values = Vec::with_capacity(rows.len()); + for row in rows { + values.push(parse_date32_json_value( + row.get(name).unwrap_or(&JsonValue::Null), + )?); + } + Arc::new(Date32Array::from(values)) + } + DataType::Date64 => { + let mut values = Vec::with_capacity(rows.len()); + for row in rows { + values.push(parse_date64_json_value( + row.get(name).unwrap_or(&JsonValue::Null), + )?); + } + Arc::new(Date64Array::from(values)) + } + DataType::List(field) => { + let mut builder = ListBuilder::with_capacity( + make_list_value_builder(field.data_type(), rows.len())?, + rows.len(), + ) + .with_field(field.clone()); + for row in rows { + let value = row.get(name).unwrap_or(&JsonValue::Null); + if value.is_null() { + builder.append(false); + continue; + } + let items = value.as_array().ok_or_else(|| { + OmniError::manifest(format!( + "list property '{}' expects a JSON array, got {}", + name, value + )) + })?; + for item in items { + append_json_list_item(builder.values(), 
field.data_type(), item)?; + } + builder.append(true); + } + Arc::new(builder.finish()) + } + DataType::FixedSizeList(child_field, dim) => { + // Vector type: parse JSON array of floats into FixedSizeList + let dim = *dim; + let mut builder = FixedSizeListBuilder::with_capacity( + Float32Builder::with_capacity(rows.len() * dim as usize), + dim, + rows.len(), + ) + .with_field(child_field.clone()); + for row in rows { + if let Some(arr) = row.get(name).and_then(|v| v.as_array()) { + if arr.len() != dim as usize { + return Err(OmniError::manifest(format!( + "vector property '{}' expects {} dimensions, got {}", + name, + dim, + arr.len() + ))); + } + for val in arr { + builder + .values() + .append_value(val.as_f64().unwrap_or(0.0) as f32); + } + builder.append(true); + } else if nullable { + for _ in 0..dim as usize { + builder.values().append_null(); + } + builder.append(false); + } else { + return Err(OmniError::manifest(format!( + "non-nullable vector property '{}' has null values", + name + ))); + } + } + Arc::new(builder.finish()) + } + _ => { + // Unsupported type: fill with nulls + let values: Vec> = vec![None; rows.len()]; + Arc::new(StringArray::from(values)) + } + }; + + if !nullable && array.null_count() > 0 { + return Err(OmniError::manifest(format!( + "non-nullable property '{}' has null or invalid values", + name + ))); + } + + Ok(array) +} + +fn make_list_value_builder(data_type: &DataType, capacity: usize) -> Result> { + Ok(match data_type { + DataType::Utf8 => Box::new(StringBuilder::with_capacity(capacity, capacity * 8)), + DataType::Boolean => Box::new(BooleanBuilder::with_capacity(capacity)), + DataType::Int32 => Box::new(Int32Builder::with_capacity(capacity)), + DataType::Int64 => Box::new(Int64Builder::with_capacity(capacity)), + DataType::UInt32 => Box::new(UInt32Builder::with_capacity(capacity)), + DataType::UInt64 => Box::new(UInt64Builder::with_capacity(capacity)), + DataType::Float32 => Box::new(Float32Builder::with_capacity(capacity)), + 
DataType::Float64 => Box::new(Float64Builder::with_capacity(capacity)), + DataType::Date32 => Box::new(Date32Builder::with_capacity(capacity)), + DataType::Date64 => Box::new(Date64Builder::with_capacity(capacity)), + other => { + return Err(OmniError::manifest(format!( + "unsupported list element data type {:?}", + other + ))); + } + }) +} + +fn append_json_list_item( + builder: &mut Box, + data_type: &DataType, + value: &JsonValue, +) -> Result<()> { + match data_type { + DataType::Utf8 => { + let builder = builder + .as_any_mut() + .downcast_mut::() + .ok_or_else(|| OmniError::manifest("list Utf8 builder downcast failed"))?; + if let Some(value) = value.as_str() { + builder.append_value(value); + } else { + builder.append_null(); + } + } + DataType::Boolean => { + let builder = builder + .as_any_mut() + .downcast_mut::() + .ok_or_else(|| OmniError::manifest("list Boolean builder downcast failed"))?; + if let Some(value) = value.as_bool() { + builder.append_value(value); + } else { + builder.append_null(); + } + } + DataType::Int32 => { + let builder = builder + .as_any_mut() + .downcast_mut::() + .ok_or_else(|| OmniError::manifest("list Int32 builder downcast failed"))?; + if let Some(value) = value.as_i64() { + let value = i32::try_from(value).map_err(|_| { + OmniError::manifest(format!("list value {} exceeds Int32 range", value)) + })?; + builder.append_value(value); + } else { + builder.append_null(); + } + } + DataType::Int64 => { + let builder = builder + .as_any_mut() + .downcast_mut::() + .ok_or_else(|| OmniError::manifest("list Int64 builder downcast failed"))?; + if let Some(value) = value.as_i64() { + builder.append_value(value); + } else { + builder.append_null(); + } + } + DataType::UInt32 => { + let builder = builder + .as_any_mut() + .downcast_mut::() + .ok_or_else(|| OmniError::manifest("list UInt32 builder downcast failed"))?; + if let Some(value) = value.as_u64() { + let value = u32::try_from(value).map_err(|_| { + 
OmniError::manifest(format!("list value {} exceeds UInt32 range", value)) + })?; + builder.append_value(value); + } else { + builder.append_null(); + } + } + DataType::UInt64 => { + let builder = builder + .as_any_mut() + .downcast_mut::() + .ok_or_else(|| OmniError::manifest("list UInt64 builder downcast failed"))?; + if let Some(value) = value.as_u64() { + builder.append_value(value); + } else { + builder.append_null(); + } + } + DataType::Float32 => { + let builder = builder + .as_any_mut() + .downcast_mut::() + .ok_or_else(|| OmniError::manifest("list Float32 builder downcast failed"))?; + if let Some(value) = value.as_f64() { + builder.append_value(value as f32); + } else { + builder.append_null(); + } + } + DataType::Float64 => { + let builder = builder + .as_any_mut() + .downcast_mut::() + .ok_or_else(|| OmniError::manifest("list Float64 builder downcast failed"))?; + if let Some(value) = value.as_f64() { + builder.append_value(value); + } else { + builder.append_null(); + } + } + DataType::Date32 => { + let builder = builder + .as_any_mut() + .downcast_mut::() + .ok_or_else(|| OmniError::manifest("list Date32 builder downcast failed"))?; + if let Some(value) = parse_date32_json_value(value)? { + builder.append_value(value); + } else { + builder.append_null(); + } + } + DataType::Date64 => { + let builder = builder + .as_any_mut() + .downcast_mut::() + .ok_or_else(|| OmniError::manifest("list Date64 builder downcast failed"))?; + if let Some(value) = parse_date64_json_value(value)? 
{ + builder.append_value(value); + } else { + builder.append_null(); + } + } + other => { + return Err(OmniError::manifest(format!( + "unsupported list element data type {:?}", + other + ))); + } + } + + Ok(()) +} + +fn parse_date32_json_value(value: &JsonValue) -> Result> { + if value.is_null() { + return Ok(None); + } + if let Some(days) = value.as_i64() { + let days = i32::try_from(days) + .map_err(|_| OmniError::manifest(format!("Date value out of range: {}", days)))?; + return Ok(Some(days)); + } + if let Some(days) = value.as_u64() { + let days = i32::try_from(days) + .map_err(|_| OmniError::manifest(format!("Date value out of range: {}", days)))?; + return Ok(Some(days)); + } + if let Some(value) = value.as_str() { + return Ok(Some(parse_date32_literal(value)?)); + } + Ok(None) +} + +fn parse_date64_json_value(value: &JsonValue) -> Result> { + if value.is_null() { + return Ok(None); + } + if let Some(ms) = value.as_i64() { + return Ok(Some(ms)); + } + if let Some(ms) = value.as_u64() { + let ms = i64::try_from(ms) + .map_err(|_| OmniError::manifest(format!("DateTime value out of range: {}", ms)))?; + return Ok(Some(ms)); + } + if let Some(value) = value.as_str() { + return Ok(Some(parse_date64_literal(value)?)); + } + Ok(None) +} + +/// Write a batch to a Lance dataset, returning (new_version, total_row_count). 
+async fn write_batch_to_dataset( + db: &Omnigraph, + branch: Option<&str>, + table_key: &str, + batch: RecordBatch, + mode: LoadMode, +) -> Result<(crate::table_store::TableState, Option)> { + let (mut ds, full_path, table_branch) = + db.open_for_mutation_on_branch(branch, table_key).await?; + let table_store = db.table_store(); + + match mode { + LoadMode::Overwrite => { + let state = table_store + .overwrite_batch(&full_path, &mut ds, batch) + .await?; + Ok((state, table_branch)) + } + LoadMode::Append => { + let state = table_store.append_batch(&full_path, &mut ds, batch).await?; + Ok((state, table_branch)) + } + LoadMode::Merge => { + let state = table_store + .merge_insert_batch( + &full_path, + ds, + batch, + vec!["id".to_string()], + lance::dataset::WhenMatched::UpdateAll, + lance::dataset::WhenNotMatched::InsertAll, + ) + .await?; + Ok((state, table_branch)) + } + } +} + +fn generate_id() -> String { + ulid::Ulid::new().to_string() +} + +pub(crate) fn parse_date32_literal(value: &str) -> Result { + let raw: Arc = Arc::new(StringArray::from(vec![Some(value)])); + let casted = arrow_cast::cast::cast(raw.as_ref(), &DataType::Date32) + .map_err(|e| OmniError::manifest(format!("invalid Date literal '{}': {}", value, e)))?; + let out = casted + .as_any() + .downcast_ref::() + .ok_or_else(|| OmniError::manifest("Date32 cast produced unexpected array"))?; + if out.is_null(0) { + return Err(OmniError::manifest(format!( + "invalid Date literal '{}'", + value + ))); + } + Ok(out.value(0)) +} + +pub(crate) fn parse_date64_literal(value: &str) -> Result { + let raw: Arc = Arc::new(StringArray::from(vec![Some(value)])); + let casted = arrow_cast::cast::cast(raw.as_ref(), &DataType::Date64) + .map_err(|e| OmniError::manifest(format!("invalid DateTime literal '{}': {}", value, e)))?; + let out = casted + .as_any() + .downcast_ref::() + .ok_or_else(|| OmniError::manifest("Date64 cast produced unexpected array"))?; + if out.is_null(0) { + return 
Err(OmniError::manifest(format!( + "invalid DateTime literal '{}'", + value + ))); + } + Ok(out.value(0)) +} + +// ─── Value constraint validation ───────────────────────────────────────────── + +pub(crate) fn validate_value_constraints( + batch: &RecordBatch, + node_type: &omnigraph_compiler::catalog::NodeType, +) -> Result<()> { + use arrow_array::Array; + + // Range constraints + for rc in &node_type.range_constraints { + let Some(col) = batch.column_by_name(&rc.property) else { + continue; + }; + for row in 0..batch.num_rows() { + if col.is_null(row) { + continue; + } + let value = extract_numeric_value(col, row); + if let Some(val) = value { + if val.is_nan() { + return Err(OmniError::manifest(format!( + "@range violation on {}.{}: value is NaN", + node_type.name, rc.property + ))); + } + if let Some(ref min) = rc.min { + let min_f = literal_value_to_f64(min); + if val < min_f { + return Err(OmniError::manifest(format!( + "@range violation on {}.{}: value {} < min {}", + node_type.name, rc.property, val, min_f + ))); + } + } + if let Some(ref max) = rc.max { + let max_f = literal_value_to_f64(max); + if val > max_f { + return Err(OmniError::manifest(format!( + "@range violation on {}.{}: value {} > max {}", + node_type.name, rc.property, val, max_f + ))); + } + } + } + } + } + + // Check constraints (regex) + for cc in &node_type.check_constraints { + let re = regex::Regex::new(&cc.pattern).map_err(|e| { + OmniError::manifest(format!( + "@check on {}.{} has invalid regex '{}': {}", + node_type.name, cc.property, cc.pattern, e + )) + })?; + let Some(col) = batch.column_by_name(&cc.property) else { + continue; + }; + let str_col = col.as_any().downcast_ref::(); + if let Some(str_col) = str_col { + for row in 0..str_col.len() { + if str_col.is_null(row) { + continue; + } + let val = str_col.value(row); + if !re.is_match(val) { + return Err(OmniError::manifest(format!( + "@check violation on {}.{}: value '{}' does not match pattern '{}'", + node_type.name, 
cc.property, val, cc.pattern + ))); + } + } + } + } + + Ok(()) +} + +fn extract_numeric_value(col: &ArrayRef, row: usize) -> Option { + use arrow_array::{ + Array, Float32Array, Float64Array, Int32Array, Int64Array, UInt32Array, UInt64Array, + }; + if let Some(a) = col.as_any().downcast_ref::() { + return Some(a.value(row) as f64); + } + if let Some(a) = col.as_any().downcast_ref::() { + return Some(a.value(row) as f64); + } + if let Some(a) = col.as_any().downcast_ref::() { + return Some(a.value(row) as f64); + } + if let Some(a) = col.as_any().downcast_ref::() { + return Some(a.value(row) as f64); + } + if let Some(a) = col.as_any().downcast_ref::() { + return Some(a.value(row) as f64); + } + if let Some(a) = col.as_any().downcast_ref::() { + return Some(a.value(row)); + } + None +} + +fn literal_value_to_f64(v: &omnigraph_compiler::catalog::LiteralValue) -> f64 { + use omnigraph_compiler::catalog::LiteralValue; + match v { + LiteralValue::Integer(n) => *n as f64, + LiteralValue::Float(f) => *f, + } +} + +// ─── Edge cardinality validation ───────────────────────────────────────────── + +async fn validate_edge_cardinality( + db: &crate::db::Omnigraph, + branch: Option<&str>, + edge_name: &str, + written_version: u64, + written_branch: Option<&str>, +) -> Result<()> { + use arrow_array::Array; + let catalog = db.catalog(); + let edge_type = &catalog.edge_types[edge_name]; + if edge_type.cardinality.is_default() { + return Ok(()); + } + + // Open edge sub-table at the just-written version, not the snapshot's + // (the snapshot still pins to the pre-write version). 
+ let snapshot = db.snapshot_for_branch(branch).await?; + let table_key = format!("edge:{}", edge_name); + let entry = snapshot + .entry(&table_key) + .ok_or_else(|| OmniError::manifest(format!("no manifest entry for {}", table_key)))?; + let ds = db + .open_dataset_at_state( + &entry.table_path, + written_branch.or(entry.table_branch.as_deref()), + written_version, + ) + .await?; + + // Scan src column, count per source + let batches = db + .table_store() + .scan(&ds, Some(&["src"]), None, None) + .await?; + + let mut counts: HashMap = HashMap::new(); + for batch in &batches { + let srcs = batch + .column_by_name("src") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + for i in 0..srcs.len() { + *counts.entry(srcs.value(i).to_string()).or_insert(0) += 1; + } + } + + let card = &edge_type.cardinality; + for (src, count) in &counts { + if let Some(max) = card.max { + if *count > max { + return Err(OmniError::manifest(format!( + "@card violation on edge {}: source '{}' has {} edges (max {})", + edge_name, src, count, max + ))); + } + } + if *count < card.min { + return Err(OmniError::manifest(format!( + "@card violation on edge {}: source '{}' has {} edges (min {})", + edge_name, src, count, card.min + ))); + } + } + + Ok(()) +} + +/// Collect all valid node IDs for a given type. 
Union of: +/// - IDs from the just-loaded batch (in memory, from node_rows) +/// - IDs from the sub-table at the just-written version (if it was updated) +/// - IDs from the sub-table at the snapshot-pinned version (if it was not updated) +async fn collect_node_ids( + db: &Omnigraph, + branch: Option<&str>, + type_name: &str, + node_rows: &HashMap>, + catalog: &omnigraph_compiler::catalog::Catalog, + updates: &[crate::db::SubTableUpdate], +) -> Result> { + let mut ids = HashSet::new(); + + // IDs from the in-memory batch (just loaded in this operation) + if let Some(rows) = node_rows.get(type_name) { + if let Some(node_type) = catalog.node_types.get(type_name) { + if let Some(key_prop) = node_type.key_property() { + for row in rows { + if let Some(id) = row.get(key_prop).and_then(|v| v.as_str()) { + ids.insert(id.to_string()); + } + } + } + } + } + + // IDs from the Lance sub-table + let table_key = format!("node:{}", type_name); + let snapshot = db.snapshot_for_branch(branch).await?; + let Some(entry) = snapshot.entry(&table_key) else { + return Ok(ids); + }; + // Use the just-written version if this type was updated, else snapshot version + let updated = updates + .iter() + .find(|u| u.table_key == table_key) + .map(|u| (u.table_version, u.table_branch.as_deref())); + let (version, branch) = updated.unwrap_or((entry.table_version, entry.table_branch.as_deref())); + let ds = db + .open_dataset_at_state(&entry.table_path, branch, version) + .await?; + + let batches = db + .table_store() + .scan(&ds, Some(&["id"]), None, None) + .await?; + + for batch in &batches { + let id_col = batch + .column_by_name("id") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + for i in 0..batch.num_rows() { + ids.insert(id_col.value(i).to_string()); + } + } + + Ok(ids) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::db::Omnigraph; + use arrow_array::Array; + use futures::TryStreamExt; + use std::collections::HashMap; + + const TEST_SCHEMA: &str = r#" +node Person 
{ + name: String @key + age: I32? +} +node Company { + name: String @key +} +edge Knows: Person -> Person { + since: Date? +} +edge WorksAt: Person -> Company +"#; + + const TEST_DATA: &str = r#"{"type": "Person", "data": {"name": "Alice", "age": 30}} +{"type": "Person", "data": {"name": "Bob", "age": 25}} +{"type": "Company", "data": {"name": "Acme"}} +{"edge": "Knows", "from": "Alice", "to": "Bob"} +{"edge": "WorksAt", "from": "Alice", "to": "Acme"} +"#; + + #[tokio::test] + async fn test_load_creates_data() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap(); + + let result = load_jsonl(&mut db, TEST_DATA, LoadMode::Overwrite) + .await + .unwrap(); + + assert_eq!(result.nodes_loaded["Person"], 2); + assert_eq!(result.nodes_loaded["Company"], 1); + assert_eq!(result.edges_loaded["Knows"], 1); + assert_eq!(result.edges_loaded["WorksAt"], 1); + } + + #[tokio::test] + async fn test_load_data_readable_via_lance() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap(); + load_jsonl(&mut db, TEST_DATA, LoadMode::Overwrite) + .await + .unwrap(); + + // Read back via snapshot + let snap = db.snapshot(); + let person_ds = snap.open("node:Person").await.unwrap(); + + assert_eq!(person_ds.count_rows(None).await.unwrap(), 2); + + // Verify data + let batches: Vec = person_ds + .scan() + .try_into_stream() + .await + .unwrap() + .try_collect() + .await + .unwrap(); + + let batch = &batches[0]; + let ids = batch + .column_by_name("id") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + // @key=name, so ids should be "Alice" and "Bob" + let id_values: Vec<&str> = (0..ids.len()).map(|i| ids.value(i)).collect(); + assert!(id_values.contains(&"Alice")); + assert!(id_values.contains(&"Bob")); + } + + #[tokio::test] + async fn test_load_edges_reference_node_keys() { + let dir = 
tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap(); + load_jsonl(&mut db, TEST_DATA, LoadMode::Overwrite) + .await + .unwrap(); + + let snap = db.snapshot(); + let knows_ds = snap.open("edge:Knows").await.unwrap(); + + let batches: Vec = knows_ds + .scan() + .try_into_stream() + .await + .unwrap() + .try_collect() + .await + .unwrap(); + + let batch = &batches[0]; + let srcs = batch + .column_by_name("src") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let dsts = batch + .column_by_name("dst") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + + assert_eq!(srcs.value(0), "Alice"); + assert_eq!(dsts.value(0), "Bob"); + } + + #[tokio::test] + async fn test_load_manifest_version_advances() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap(); + let v1 = db.version(); + + load_jsonl(&mut db, TEST_DATA, LoadMode::Overwrite) + .await + .unwrap(); + + assert!(db.version() > v1); + } + + #[tokio::test] + async fn test_load_append_adds_rows() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap(); + + let batch1 = r#"{"type": "Person", "data": {"name": "Alice", "age": 30}}"#; + let batch2 = r#"{"type": "Person", "data": {"name": "Bob", "age": 25}}"#; + + load_jsonl(&mut db, batch1, LoadMode::Overwrite) + .await + .unwrap(); + load_jsonl(&mut db, batch2, LoadMode::Append).await.unwrap(); + + let snap = db.snapshot(); + let person_ds = snap.open("node:Person").await.unwrap(); + assert_eq!(person_ds.count_rows(None).await.unwrap(), 2); + } + + #[tokio::test] + async fn test_load_unknown_type_rejected() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap(); + + let bad = r#"{"type": 
"FakeType", "data": {"name": "x"}}"#; + let result = load_jsonl(&mut db, bad, LoadMode::Overwrite).await; + assert!(result.is_err()); + } + + #[tokio::test] + async fn test_ingest_creates_branch_and_reports_tables() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap(); + + let result = db + .ingest("feature", Some("main"), TEST_DATA, LoadMode::Overwrite) + .await + .unwrap(); + + assert_eq!(result.branch, "feature"); + assert_eq!(result.base_branch, "main"); + assert!(result.branch_created); + assert_eq!(result.mode, LoadMode::Overwrite); + assert_eq!( + result.tables, + vec![ + IngestTableResult { + table_key: "edge:Knows".to_string(), + rows_loaded: 1 + }, + IngestTableResult { + table_key: "edge:WorksAt".to_string(), + rows_loaded: 1 + }, + IngestTableResult { + table_key: "node:Company".to_string(), + rows_loaded: 1 + }, + IngestTableResult { + table_key: "node:Person".to_string(), + rows_loaded: 2 + }, + ] + ); + assert!( + db.branch_list() + .await + .unwrap() + .contains(&"feature".to_string()) + ); + } + + #[tokio::test] + async fn test_ingest_existing_branch_ignores_from_and_merges_data() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap(); + load_jsonl(&mut db, TEST_DATA, LoadMode::Overwrite) + .await + .unwrap(); + db.branch_create_from(crate::db::ReadTarget::branch("main"), "feature") + .await + .unwrap(); + + let result = db + .ingest( + "feature", + Some("missing-base"), + r#"{"type":"Person","data":{"name":"Bob","age":26}} +{"type":"Person","data":{"name":"Eve","age":31}}"#, + LoadMode::Merge, + ) + .await + .unwrap(); + + assert_eq!(result.branch, "feature"); + assert_eq!(result.base_branch, "missing-base"); + assert!(!result.branch_created); + assert_eq!(result.mode, LoadMode::Merge); + assert_eq!( + result.tables, + vec![IngestTableResult { + table_key: 
"node:Person".to_string(), + rows_loaded: 2 + }] + ); + + let snap = db + .snapshot_of(crate::db::ReadTarget::branch("feature")) + .await + .unwrap(); + let person_ds = snap.open("node:Person").await.unwrap(); + assert_eq!(person_ds.count_rows(None).await.unwrap(), 3); + + let batches: Vec = person_ds + .scan() + .try_into_stream() + .await + .unwrap() + .try_collect() + .await + .unwrap(); + let mut ages_by_id = HashMap::new(); + for batch in &batches { + let ids = batch + .column_by_name("id") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let ages = batch + .column_by_name("age") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + for idx in 0..ids.len() { + ages_by_id.insert(ids.value(idx).to_string(), ages.value(idx)); + } + } + + assert_eq!(ages_by_id.get("Bob"), Some(&26)); + assert_eq!(ages_by_id.get("Eve"), Some(&31)); + assert_eq!(ages_by_id.get("Alice"), Some(&30)); + } + + #[tokio::test] + async fn test_ingest_as_stamps_actor_on_branch_head_commit() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap(); + + db.ingest_as( + "feature", + Some("main"), + TEST_DATA, + LoadMode::Overwrite, + Some("act-andrew"), + ) + .await + .unwrap(); + + let head = db + .list_commits(Some("feature")) + .await + .unwrap() + .into_iter() + .last() + .unwrap(); + assert_eq!(head.actor_id.as_deref(), Some("act-andrew")); + } + + #[test] + fn test_range_constraint_rejects_nan() { + use arrow_array::{Float64Array, RecordBatch, StringArray}; + use omnigraph_compiler::catalog::{LiteralValue, NodeType, RangeConstraint}; + use std::sync::Arc; + + let schema = Arc::new(arrow_schema::Schema::new(vec![ + arrow_schema::Field::new("name", arrow_schema::DataType::Utf8, false), + arrow_schema::Field::new("score", arrow_schema::DataType::Float64, true), + ])); + + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(StringArray::from(vec!["bad"])), + 
Arc::new(Float64Array::from(vec![f64::NAN])), + ], + ) + .unwrap(); + + let node_type = NodeType { + name: "Test".to_string(), + implements: vec![], + properties: Default::default(), + key: None, + unique_constraints: vec![], + indices: vec![], + range_constraints: vec![RangeConstraint { + property: "score".to_string(), + min: Some(LiteralValue::Float(0.0)), + max: Some(LiteralValue::Float(1.0)), + }], + check_constraints: vec![], + embed_sources: Default::default(), + blob_properties: Default::default(), + arrow_schema: schema, + }; + + let result = validate_value_constraints(&batch, &node_type); + assert!(result.is_err(), "expected NaN to be rejected"); + let err = result.unwrap_err().to_string(); + assert!(err.contains("NaN"), "error should mention NaN: {}", err); + } +} diff --git a/crates/omnigraph/src/runtime_cache.rs b/crates/omnigraph/src/runtime_cache.rs new file mode 100644 index 0000000..84b562a --- /dev/null +++ b/crates/omnigraph/src/runtime_cache.rs @@ -0,0 +1,159 @@ +use std::collections::{HashMap, VecDeque}; +use std::sync::Arc; + +use omnigraph_compiler::catalog::Catalog; +use tokio::sync::Mutex; + +use crate::db::ResolvedTarget; +use crate::error::Result; +use crate::graph_index::GraphIndex; + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +struct GraphIndexCacheKey { + snapshot_id: String, + edge_tables: Vec, +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +struct GraphIndexTableState { + table_key: String, + table_version: u64, + table_branch: Option, +} + +#[derive(Debug, Default)] +pub struct RuntimeCache { + graph_indices: Mutex, +} + +#[derive(Debug, Default)] +struct GraphIndexCache { + entries: HashMap>, + lru: VecDeque, +} + +impl RuntimeCache { + pub async fn invalidate_all(&self) { + let mut cache = self.graph_indices.lock().await; + cache.entries.clear(); + cache.lru.clear(); + } + + pub async fn graph_index( + &self, + resolved: &ResolvedTarget, + catalog: &Catalog, + ) -> Result> { + let key = graph_index_cache_key(resolved, 
catalog); + { + let mut cache = self.graph_indices.lock().await; + if let Some(index) = cache.entries.get(&key).cloned() { + cache.touch(key.clone()); + return Ok(index); + } + } + + let edge_types = catalog + .edge_types + .iter() + .map(|(name, et)| (name.clone(), (et.from_type.clone(), et.to_type.clone()))) + .collect(); + + let index = Arc::new(GraphIndex::build(&resolved.snapshot, &edge_types).await?); + let mut cache = self.graph_indices.lock().await; + if let Some(existing) = cache.entries.get(&key).cloned() { + cache.touch(key); + return Ok(existing); + } + cache.insert(key, Arc::clone(&index)); + Ok(index) + } +} + +impl GraphIndexCache { + fn insert(&mut self, key: GraphIndexCacheKey, value: Arc) { + self.entries.insert(key.clone(), value); + self.touch(key); + while self.entries.len() > 8 { + let Some(oldest) = self.lru.pop_front() else { + break; + }; + if self.entries.remove(&oldest).is_some() { + break; + } + } + } + + fn touch(&mut self, key: GraphIndexCacheKey) { + self.lru.retain(|existing| existing != &key); + self.lru.push_back(key); + } +} + +fn graph_index_cache_key(resolved: &ResolvedTarget, catalog: &Catalog) -> GraphIndexCacheKey { + let mut edge_tables: Vec = catalog + .edge_types + .keys() + .filter_map(|edge_name| { + let table_key = format!("edge:{}", edge_name); + resolved + .snapshot + .entry(&table_key) + .map(|entry| GraphIndexTableState { + table_key, + table_version: entry.table_version, + table_branch: entry.table_branch.clone(), + }) + }) + .collect(); + edge_tables.sort_by(|a, b| a.table_key.cmp(&b.table_key)); + + GraphIndexCacheKey { + snapshot_id: resolved.snapshot_id.as_str().to_string(), + edge_tables, + } +} + +#[cfg(test)] +mod tests { + use std::sync::Arc; + + use super::*; + + fn key(id: usize) -> GraphIndexCacheKey { + GraphIndexCacheKey { + snapshot_id: format!("snap-{id}"), + edge_tables: Vec::new(), + } + } + + fn empty_index() -> Arc { + Arc::new(GraphIndex::empty_for_test()) + } + + #[test] + fn 
graph_index_cache_evicts_oldest_entry() { + let mut cache = GraphIndexCache::default(); + for idx in 0..9 { + cache.insert(key(idx), empty_index()); + } + + assert_eq!(cache.entries.len(), 8); + assert!(!cache.entries.contains_key(&key(0))); + assert!(cache.entries.contains_key(&key(8))); + } + + #[test] + fn graph_index_cache_touch_keeps_recent_entry() { + let mut cache = GraphIndexCache::default(); + for idx in 0..8 { + cache.insert(key(idx), empty_index()); + } + + cache.touch(key(0)); + cache.insert(key(8), empty_index()); + + assert!(cache.entries.contains_key(&key(0))); + assert!(!cache.entries.contains_key(&key(1))); + } +} diff --git a/crates/omnigraph/src/storage.rs b/crates/omnigraph/src/storage.rs new file mode 100644 index 0000000..73d9441 --- /dev/null +++ b/crates/omnigraph/src/storage.rs @@ -0,0 +1,325 @@ +use std::env; +use std::fmt::Debug; +use std::path::{Path, PathBuf}; +use std::sync::Arc; + +use async_trait::async_trait; +use futures::TryStreamExt; +use object_store::aws::AmazonS3Builder; +use object_store::path::Path as ObjectPath; +use object_store::{DynObjectStore, ObjectStore, PutPayload}; +use url::Url; + +use crate::error::{OmniError, Result}; + +const FILE_SCHEME_PREFIX: &str = "file://"; +const S3_SCHEME_PREFIX: &str = "s3://"; + +#[async_trait] +pub trait StorageAdapter: Debug + Send + Sync { + async fn read_text(&self, uri: &str) -> Result; + async fn write_text(&self, uri: &str, contents: &str) -> Result<()>; + async fn exists(&self, uri: &str) -> Result; +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum StorageKind { + Local, + S3, +} + +#[derive(Debug, Default)] +pub struct LocalStorageAdapter; + +#[derive(Debug)] +pub struct S3StorageAdapter { + bucket: String, + store: Arc, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +struct S3Location { + bucket: String, + key: String, +} + +#[async_trait] +impl StorageAdapter for LocalStorageAdapter { + async fn read_text(&self, uri: &str) -> Result { + let path = 
local_path_from_uri(uri)?; + Ok(tokio::fs::read_to_string(&path).await?) + } + + async fn write_text(&self, uri: &str, contents: &str) -> Result<()> { + let path = local_path_from_uri(uri)?; + tokio::fs::write(&path, contents).await?; + Ok(()) + } + + async fn exists(&self, uri: &str) -> Result { + Ok(local_path_from_uri(uri)?.exists()) + } +} + +#[async_trait] +impl StorageAdapter for S3StorageAdapter { + async fn read_text(&self, uri: &str) -> Result { + let location = self.object_path(uri)?; + let bytes = self + .store + .get(&location) + .await + .map_err(|err| storage_backend_error("read", uri, err))? + .bytes() + .await + .map_err(|err| storage_backend_error("read", uri, err))?; + + String::from_utf8(bytes.to_vec()).map_err(|err| { + OmniError::manifest_internal(format!("storage read failed for '{}': {}", uri, err)) + }) + } + + async fn write_text(&self, uri: &str, contents: &str) -> Result<()> { + let location = self.object_path(uri)?; + self.store + .put(&location, PutPayload::from(contents.as_bytes().to_vec())) + .await + .map_err(|err| storage_backend_error("write", uri, err))?; + Ok(()) + } + + async fn exists(&self, uri: &str) -> Result { + let location = self.object_path(uri)?; + match self.store.head(&location).await { + Ok(_) => Ok(true), + Err(object_store::Error::NotFound { .. }) => { + let mut entries = self.store.list(Some(&location)); + let has_prefix_entries = entries + .try_next() + .await + .map_err(|err| storage_backend_error("exists", uri, err))? 
+ .is_some(); + Ok(has_prefix_entries) + } + Err(err) => Err(storage_backend_error("exists", uri, err)), + } + } +} + +impl S3StorageAdapter { + fn from_root_uri(root_uri: &str) -> Result { + let location = parse_s3_uri(root_uri)?; + let mut builder = AmazonS3Builder::from_env().with_bucket_name(&location.bucket); + + if let Some(endpoint) = env::var("AWS_ENDPOINT_URL_S3") + .ok() + .or_else(|| env::var("AWS_ENDPOINT_URL").ok()) + { + builder = builder.with_endpoint(&endpoint); + if endpoint.starts_with("http://") || env_var_truthy("AWS_ALLOW_HTTP") { + builder = builder.with_allow_http(true); + } + } + + if env_var_truthy("AWS_S3_FORCE_PATH_STYLE") { + builder = builder.with_virtual_hosted_style_request(false); + } + + let store = builder.build().map_err(|err| { + OmniError::manifest_internal(format!( + "failed to initialize s3 storage for '{}': {}", + root_uri, err + )) + })?; + + Ok(Self { + bucket: location.bucket, + store: Arc::new(store), + }) + } + + fn object_path(&self, uri: &str) -> Result { + let location = parse_s3_uri(uri)?; + if location.bucket != self.bucket { + return Err(OmniError::manifest_internal(format!( + "s3 storage bucket mismatch for '{}': expected '{}', found '{}'", + uri, self.bucket, location.bucket + ))); + } + if location.key.is_empty() { + return Err(OmniError::manifest_internal(format!( + "s3 storage path is empty for '{}'", + uri + ))); + } + ObjectPath::parse(&location.key).map_err(|err| { + OmniError::manifest_internal(format!("invalid s3 object path for '{}': {}", uri, err)) + }) + } +} + +pub fn storage_kind_for_uri(uri: &str) -> StorageKind { + if uri.starts_with(S3_SCHEME_PREFIX) { + StorageKind::S3 + } else { + StorageKind::Local + } +} + +pub fn storage_for_uri(uri: &str) -> Result> { + match storage_kind_for_uri(uri) { + StorageKind::Local => Ok(Arc::new(LocalStorageAdapter)), + StorageKind::S3 => Ok(Arc::new(S3StorageAdapter::from_root_uri(uri)?)), + } +} + +pub fn normalize_root_uri(uri: &str) -> Result { + match 
storage_kind_for_uri(uri) { + StorageKind::Local => { + let path = local_path_from_uri(uri)?; + Ok(normalize_local_path(&path)) + } + StorageKind::S3 => Ok(trim_trailing_slashes(uri)), + } +} + +pub fn join_uri(root_uri: &str, relative_path: &str) -> String { + let relative_path = relative_path.trim_start_matches('/'); + match storage_kind_for_uri(root_uri) { + StorageKind::S3 => { + let root = trim_trailing_slashes(root_uri); + if root.is_empty() { + relative_path.to_string() + } else { + format!("{}/{}", root, relative_path) + } + } + StorageKind::Local => { + let root = if root_uri.starts_with(FILE_SCHEME_PREFIX) { + local_path_from_file_uri(root_uri) + .map(|path| normalize_local_path(&path)) + .unwrap_or_else(|_| trim_trailing_slashes(root_uri)) + } else { + normalize_local_path(Path::new(root_uri)) + }; + let joined = Path::new(&root).join(relative_path); + normalize_local_path(&joined) + } + } +} + +fn local_path_from_uri(uri: &str) -> Result { + if uri.starts_with(FILE_SCHEME_PREFIX) { + return local_path_from_file_uri(uri); + } + Ok(PathBuf::from(uri)) +} + +fn local_path_from_file_uri(uri: &str) -> Result { + let url = Url::parse(uri).map_err(|err| { + OmniError::manifest_internal(format!("invalid file uri '{}': {}", uri, err)) + })?; + url.to_file_path() + .map_err(|_| OmniError::manifest_internal(format!("invalid file uri '{}'", uri))) +} + +fn parse_s3_uri(uri: &str) -> Result { + let url = Url::parse(uri).map_err(|err| { + OmniError::manifest_internal(format!("invalid s3 uri '{}': {}", uri, err)) + })?; + if url.scheme() != "s3" { + return Err(OmniError::manifest_internal(format!( + "unsupported s3 uri '{}'", + uri + ))); + } + let bucket = url + .host_str() + .ok_or_else(|| OmniError::manifest_internal(format!("missing s3 bucket in '{}'", uri)))?; + Ok(S3Location { + bucket: bucket.to_string(), + key: url.path().trim_start_matches('/').to_string(), + }) +} + +fn storage_backend_error(action: &str, uri: &str, err: impl std::fmt::Display) -> OmniError 
{ + OmniError::manifest_internal(format!("storage {} failed for '{}': {}", action, uri, err)) +} + +fn normalize_local_path(path: &Path) -> String { + let raw = path.as_os_str().to_string_lossy(); + if raw == "/" { + return raw.to_string(); + } + trim_trailing_slashes(&raw) +} + +fn trim_trailing_slashes(value: &str) -> String { + let trimmed = value.trim_end_matches('/'); + if trimmed.is_empty() { + value.to_string() + } else { + trimmed.to_string() + } +} + +fn env_var_truthy(key: &str) -> bool { + matches!( + env::var(key).ok().as_deref(), + Some("1" | "true" | "TRUE" | "True" | "yes" | "YES" | "on" | "ON") + ) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn storage_backend_selection_is_scheme_aware() { + assert_eq!(storage_kind_for_uri("/tmp/repo"), StorageKind::Local); + assert_eq!(storage_kind_for_uri("file:///tmp/repo"), StorageKind::Local); + assert_eq!( + storage_kind_for_uri("s3://omnigraph-preview/repo"), + StorageKind::S3 + ); + } + + #[test] + fn normalize_root_uri_preserves_local_and_s3_shapes() { + assert_eq!( + normalize_root_uri("/tmp/omnigraph/").unwrap(), + "/tmp/omnigraph" + ); + assert_eq!( + normalize_root_uri("file:///tmp/omnigraph/").unwrap(), + "/tmp/omnigraph" + ); + assert_eq!( + normalize_root_uri("s3://bucket/prefix/").unwrap(), + "s3://bucket/prefix" + ); + } + + #[test] + fn join_uri_handles_local_file_and_s3_roots() { + assert_eq!( + join_uri("/tmp/omnigraph", "_schema.pg"), + "/tmp/omnigraph/_schema.pg" + ); + assert_eq!( + join_uri("file:///tmp/omnigraph", "_schema.pg"), + "/tmp/omnigraph/_schema.pg" + ); + assert_eq!( + join_uri("s3://bucket/prefix", "_schema.pg"), + "s3://bucket/prefix/_schema.pg" + ); + } + + #[test] + fn parse_s3_uri_splits_bucket_and_key() { + let location = parse_s3_uri("s3://bucket/repo/_schema.pg").unwrap(); + assert_eq!(location.bucket, "bucket"); + assert_eq!(location.key, "repo/_schema.pg"); + } +} diff --git a/crates/omnigraph/src/table_store.rs b/crates/omnigraph/src/table_store.rs new 
file mode 100644 index 0000000..e9403f6 --- /dev/null +++ b/crates/omnigraph/src/table_store.rs @@ -0,0 +1,603 @@ +use arrow_array::{RecordBatch, UInt64Array}; +use arrow_schema::SchemaRef; +use arrow_select::concat::concat_batches; +use futures::TryStreamExt; +use lance::Dataset; +use lance::dataset::scanner::{ColumnOrdering, DatasetRecordBatchStream, Scanner}; +use lance::dataset::{MergeInsertBuilder, WhenMatched, WhenNotMatched, WriteMode, WriteParams}; +use lance::datatypes::BlobHandling; +use lance::index::scalar::IndexDetails; +use lance_file::version::LanceFileVersion; +use lance_index::scalar::{InvertedIndexParams, ScalarIndexParams}; +use lance_index::{DatasetIndexExt, IndexType, is_system_index}; +use lance_linalg::distance::MetricType; +use lance_table::format::IndexMetadata; +use std::sync::Arc; + +use crate::db::manifest::{TableVersionMetadata, open_table_head_for_write}; +use crate::db::{Snapshot, SubTableEntry}; +use crate::error::{OmniError, Result}; + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct TableState { + pub version: u64, + pub row_count: u64, + pub(crate) version_metadata: TableVersionMetadata, +} + +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct DeleteState { + pub version: u64, + pub row_count: u64, + pub deleted_rows: usize, + pub(crate) version_metadata: TableVersionMetadata, +} + +#[derive(Debug, Clone)] +pub struct TableStore { + root_uri: String, +} + +impl TableStore { + pub fn new(root_uri: &str) -> Self { + Self { + root_uri: root_uri.trim_end_matches('/').to_string(), + } + } + + pub fn root_uri(&self) -> &str { + &self.root_uri + } + + pub fn dataset_uri(&self, table_path: &str) -> String { + format!("{}/{}", self.root_uri, table_path) + } + + fn table_path_from_dataset_uri(&self, dataset_uri: &str) -> Result { + let prefix = format!("{}/", self.root_uri.trim_end_matches('/')); + let table_path = dataset_uri + .strip_prefix(&prefix) + .map(|path| path.to_string()) + .ok_or_else(|| { + 
OmniError::manifest_internal(format!( + "dataset uri '{}' is not under root '{}'", + dataset_uri, self.root_uri + )) + })?; + Ok(table_path + .split_once("/tree/") + .map(|(path, _)| path.to_string()) + .unwrap_or(table_path)) + } + + fn dataset_version_metadata( + &self, + dataset_uri: &str, + ds: &Dataset, + ) -> Result { + let table_path = self.table_path_from_dataset_uri(dataset_uri)?; + TableVersionMetadata::from_dataset(&self.root_uri, &table_path, ds) + } + + pub async fn open_snapshot_table( + &self, + snapshot: &Snapshot, + table_key: &str, + ) -> Result { + snapshot.open(table_key).await + } + + pub async fn open_at_entry(&self, entry: &SubTableEntry) -> Result { + entry.open(&self.root_uri).await + } + + pub async fn open_dataset_head( + &self, + dataset_uri: &str, + branch: Option<&str>, + ) -> Result { + let ds = Dataset::open(dataset_uri) + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + match branch { + Some(branch) if branch != "main" => ds + .checkout_branch(branch) + .await + .map_err(|e| OmniError::Lance(e.to_string())), + _ => Ok(ds), + } + } + + pub async fn open_dataset_head_for_write( + &self, + table_key: &str, + dataset_uri: &str, + branch: Option<&str>, + ) -> Result { + let table_path = self.table_path_from_dataset_uri(dataset_uri)?; + open_table_head_for_write(&self.root_uri, table_key, &table_path, branch).await + } + + pub async fn delete_branch(&self, dataset_uri: &str, branch: &str) -> Result<()> { + let mut ds = Dataset::open(dataset_uri) + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + ds.delete_branch(branch) + .await + .map_err(|e| OmniError::Lance(e.to_string())) + } + + pub async fn open_dataset_at_state( + &self, + table_path: &str, + branch: Option<&str>, + version: u64, + ) -> Result { + let ds = self + .open_dataset_head(&self.dataset_uri(table_path), branch) + .await?; + ds.checkout_version(version) + .await + .map_err(|e| OmniError::Lance(e.to_string())) + } + + pub fn ensure_expected_version( + 
&self, + ds: &Dataset, + table_key: &str, + expected_version: u64, + ) -> Result<()> { + if ds.version().version != expected_version { + return Err(OmniError::manifest_conflict(format!( + "version drift on {}: snapshot pinned v{} but dataset is at v{} — call sync_branch() and retry", + table_key, + expected_version, + ds.version().version + ))); + } + Ok(()) + } + + pub async fn reopen_for_mutation( + &self, + dataset_uri: &str, + branch: Option<&str>, + table_key: &str, + expected_version: u64, + ) -> Result { + let ds = self + .open_dataset_head_for_write(table_key, dataset_uri, branch) + .await?; + self.ensure_expected_version(&ds, table_key, expected_version)?; + Ok(ds) + } + + pub async fn fork_branch_from_state( + &self, + dataset_uri: &str, + source_branch: Option<&str>, + table_key: &str, + source_version: u64, + target_branch: &str, + ) -> Result { + let mut source_ds = self + .open_dataset_head(dataset_uri, source_branch) + .await? + .checkout_version(source_version) + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + self.ensure_expected_version(&source_ds, table_key, source_version)?; + + match source_ds + .create_branch(target_branch, source_version, None) + .await + { + Ok(_) => {} + Err(create_err) => match self + .open_dataset_head(dataset_uri, Some(target_branch)) + .await + { + Ok(ds) => { + self.ensure_expected_version(&ds, table_key, source_version)?; + return Ok(ds); + } + Err(_) => return Err(OmniError::Lance(create_err.to_string())), + }, + } + + let ds = self + .open_dataset_head(dataset_uri, Some(target_branch)) + .await?; + self.ensure_expected_version(&ds, table_key, source_version)?; + Ok(ds) + } + + pub async fn scan_batches(&self, ds: &Dataset) -> Result> { + self.scan(ds, None, None, None).await + } + + pub async fn scan_batches_for_rewrite(&self, ds: &Dataset) -> Result> { + let has_blob_columns = ds.schema().fields_pre_order().any(|field| field.is_blob()); + if !has_blob_columns { + return self.scan_batches(ds).await; + } + 
+ let mut scanner = ds.scan(); + scanner.blob_handling(BlobHandling::AllBinary); + scanner + .try_into_stream() + .await + .map_err(|e| OmniError::Lance(e.to_string()))? + .try_collect() + .await + .map_err(|e| OmniError::Lance(e.to_string())) + } + + pub async fn scan_stream( + ds: &Dataset, + projection: Option<&[&str]>, + filter: Option<&str>, + order_by: Option>, + with_row_id: bool, + ) -> Result { + Self::scan_stream_with(ds, projection, filter, order_by, with_row_id, |_| Ok(())).await + } + + pub async fn scan_stream_with( + ds: &Dataset, + projection: Option<&[&str]>, + filter: Option<&str>, + order_by: Option>, + with_row_id: bool, + configure: F, + ) -> Result + where + F: FnOnce(&mut Scanner) -> Result<()>, + { + let mut scanner = ds.scan(); + if with_row_id { + scanner.with_row_id(); + } + if let Some(columns) = projection { + scanner + .project(columns) + .map_err(|e| OmniError::Lance(e.to_string()))?; + } + if let Some(filter_sql) = filter { + scanner + .filter(filter_sql) + .map_err(|e| OmniError::Lance(e.to_string()))?; + } + if let Some(ordering) = order_by { + scanner + .order_by(Some(ordering)) + .map_err(|e| OmniError::Lance(e.to_string()))?; + } + configure(&mut scanner)?; + scanner + .try_into_stream() + .await + .map_err(|e| OmniError::Lance(e.to_string())) + } + + pub async fn scan( + &self, + ds: &Dataset, + projection: Option<&[&str]>, + filter: Option<&str>, + order_by: Option>, + ) -> Result> { + Self::scan_stream(ds, projection, filter, order_by, false) + .await? + .try_collect() + .await + .map_err(|e| OmniError::Lance(e.to_string())) + } + + pub async fn scan_with( + &self, + ds: &Dataset, + projection: Option<&[&str]>, + filter: Option<&str>, + order_by: Option>, + with_row_id: bool, + configure: F, + ) -> Result> + where + F: FnOnce(&mut Scanner) -> Result<()>, + { + Self::scan_stream_with(ds, projection, filter, order_by, with_row_id, configure) + .await? 
+ .try_collect() + .await + .map_err(|e| OmniError::Lance(e.to_string())) + } + + pub async fn count_rows(&self, ds: &Dataset, filter: Option) -> Result { + ds.count_rows(filter) + .await + .map(|count| count as usize) + .map_err(|e| OmniError::Lance(e.to_string())) + } + + pub fn dataset_version(&self, ds: &Dataset) -> u64 { + ds.version().version + } + + pub async fn table_state(&self, dataset_uri: &str, ds: &Dataset) -> Result { + Ok(TableState { + version: self.dataset_version(ds), + row_count: self.count_rows(ds, None).await? as u64, + version_metadata: self.dataset_version_metadata(dataset_uri, ds)?, + }) + } + + pub async fn append_batch( + &self, + dataset_uri: &str, + ds: &mut Dataset, + batch: RecordBatch, + ) -> Result { + if batch.num_rows() == 0 { + return self.table_state(dataset_uri, ds).await; + } + let schema = batch.schema(); + let reader = arrow_array::RecordBatchIterator::new(vec![Ok(batch)], schema); + let params = WriteParams { + mode: WriteMode::Append, + allow_external_blob_outside_bases: true, + ..Default::default() + }; + ds.append(reader, Some(params)) + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + self.table_state(dataset_uri, ds).await + } + + pub async fn append_or_create_batch( + dataset_uri: &str, + dataset: Option, + batch: RecordBatch, + ) -> Result { + let reader = arrow_array::RecordBatchIterator::new(vec![Ok(batch.clone())], batch.schema()); + match dataset { + Some(mut ds) => { + let params = WriteParams { + mode: WriteMode::Append, + allow_external_blob_outside_bases: true, + ..Default::default() + }; + ds.append(reader, Some(params)) + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + Ok(ds) + } + None => { + let params = WriteParams { + mode: WriteMode::Create, + enable_stable_row_ids: true, + data_storage_version: Some(LanceFileVersion::V2_2), + allow_external_blob_outside_bases: true, + ..Default::default() + }; + Dataset::write(reader, dataset_uri, Some(params)) + .await + .map_err(|e| 
OmniError::Lance(e.to_string())) + } + } + } + + pub async fn overwrite_batch( + &self, + dataset_uri: &str, + ds: &mut Dataset, + batch: RecordBatch, + ) -> Result { + ds.truncate_table() + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + self.append_batch(dataset_uri, ds, batch).await + } + + pub async fn merge_insert_batch( + &self, + dataset_uri: &str, + ds: Dataset, + batch: RecordBatch, + key_columns: Vec, + when_matched: WhenMatched, + when_not_matched: WhenNotMatched, + ) -> Result { + if batch.num_rows() == 0 { + return self.table_state(dataset_uri, &ds).await; + } + + // TODO(lance-upstream): MergeInsertBuilder does not accept WriteParams, + // so allow_external_blob_outside_bases cannot be set here. External URI + // blobs via merge_insert (LoadMode::Merge, mutations) are unsupported + // until Lance exposes WriteParams on MergeInsertBuilder. + let ds = Arc::new(ds); + let job = MergeInsertBuilder::try_new(ds, key_columns) + .map_err(|e| OmniError::Lance(e.to_string()))? + .when_matched(when_matched) + .when_not_matched(when_not_matched) + .try_build() + .map_err(|e| OmniError::Lance(e.to_string()))?; + + let schema = batch.schema(); + let reader = arrow_array::RecordBatchIterator::new(vec![Ok(batch)], schema); + let (new_ds, _stats) = job + .execute(lance_datafusion::utils::reader_to_stream(Box::new(reader))) + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + self.table_state(dataset_uri, &new_ds).await + } + + pub async fn merge_insert_batches( + &self, + dataset_uri: &str, + ds: Dataset, + batches: Vec, + key_columns: Vec, + when_matched: WhenMatched, + when_not_matched: WhenNotMatched, + ) -> Result { + if batches.is_empty() { + return self.table_state(dataset_uri, &ds).await; + } + let batch = if batches.len() == 1 { + batches.into_iter().next().unwrap() + } else { + let schema = batches[0].schema(); + concat_batches(&schema, &batches).map_err(|e| OmniError::Lance(e.to_string()))? 
+ }; + self.merge_insert_batch( + dataset_uri, + ds, + batch, + key_columns, + when_matched, + when_not_matched, + ) + .await + } + + pub async fn delete_where( + &self, + dataset_uri: &str, + ds: &mut Dataset, + filter: &str, + ) -> Result { + let delete_result = ds + .delete(filter) + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + Ok(DeleteState { + version: delete_result.new_dataset.version().version, + row_count: self.count_rows(&delete_result.new_dataset, None).await? as u64, + deleted_rows: delete_result.num_deleted_rows as usize, + version_metadata: self + .dataset_version_metadata(dataset_uri, &delete_result.new_dataset)?, + }) + } + + async fn user_indices_for_column( + &self, + ds: &Dataset, + column: &str, + ) -> Result> { + let field_id = ds + .schema() + .field(column) + .map(|field| field.id) + .ok_or_else(|| { + OmniError::manifest_internal(format!( + "dataset is missing expected index column '{}'", + column + )) + })?; + let indices = ds + .load_indices() + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + Ok(indices + .iter() + .filter(|index| !is_system_index(index)) + .filter(|index| index.fields.len() == 1 && index.fields[0] == field_id) + .cloned() + .collect()) + } + + pub async fn has_btree_index(&self, ds: &Dataset, column: &str) -> Result { + let indices = self.user_indices_for_column(ds, column).await?; + Ok(indices.iter().any(|index| { + index + .index_details + .as_ref() + .map(|details| details.type_url.ends_with("BTreeIndexDetails")) + .unwrap_or(false) + })) + } + + pub async fn has_fts_index(&self, ds: &Dataset, column: &str) -> Result { + let indices = self.user_indices_for_column(ds, column).await?; + Ok(indices.iter().any(|index| { + index + .index_details + .as_ref() + .map(|details| IndexDetails(details.clone()).supports_fts()) + .unwrap_or(false) + })) + } + + pub async fn has_vector_index(&self, ds: &Dataset, column: &str) -> Result { + let indices = self.user_indices_for_column(ds, column).await?; + 
Ok(indices.iter().any(|index| { + index + .index_details + .as_ref() + .map(|details| IndexDetails(details.clone()).is_vector()) + .unwrap_or(false) + })) + } + + pub async fn create_btree_index(&self, ds: &mut Dataset, columns: &[&str]) -> Result<()> { + let params = ScalarIndexParams::default(); + ds.create_index_builder(columns, IndexType::BTree, ¶ms) + .replace(true) + .await + .map(|_| ()) + .map_err(|e| OmniError::Lance(e.to_string())) + } + + pub async fn create_inverted_index(&self, ds: &mut Dataset, column: &str) -> Result<()> { + let params = InvertedIndexParams::default(); + ds.create_index_builder(&[column], IndexType::Inverted, ¶ms) + .replace(true) + .await + .map(|_| ()) + .map_err(|e| OmniError::Lance(e.to_string())) + } + + pub async fn create_vector_index(&self, ds: &mut Dataset, column: &str) -> Result<()> { + let params = lance::index::vector::VectorIndexParams::ivf_flat(1, MetricType::L2); + ds.create_index_builder(&[column], IndexType::Vector, ¶ms) + .replace(true) + .await + .map(|_| ()) + .map_err(|e| OmniError::Lance(e.to_string())) + } + + pub async fn create_empty_dataset(dataset_uri: &str, schema: &SchemaRef) -> Result { + let batch = RecordBatch::new_empty(schema.clone()); + Self::write_dataset(dataset_uri, batch).await + } + + pub async fn first_row_id_for_filter(&self, ds: &Dataset, filter: &str) -> Result> { + let batches = Self::scan_stream(ds, Some(&["id"]), Some(filter), None, true) + .await? 
+ .try_collect::>() + .await + .map_err(|e| OmniError::Lance(e.to_string()))?; + Ok(batches.iter().find_map(|batch| { + batch + .column_by_name("_rowid") + .and_then(|col| col.as_any().downcast_ref::()) + .and_then(|arr| (arr.len() > 0).then(|| arr.value(0))) + })) + } + + pub async fn write_dataset(dataset_uri: &str, batch: RecordBatch) -> Result { + let reader = arrow_array::RecordBatchIterator::new(vec![Ok(batch.clone())], batch.schema()); + let params = WriteParams { + mode: WriteMode::Create, + enable_stable_row_ids: true, + data_storage_version: Some(LanceFileVersion::V2_2), + allow_external_blob_outside_bases: true, + ..Default::default() + }; + Dataset::write(reader, dataset_uri, Some(params)) + .await + .map_err(|e| OmniError::Lance(e.to_string())) + } +} diff --git a/crates/omnigraph/tests/branching.rs b/crates/omnigraph/tests/branching.rs new file mode 100644 index 0000000..5ac5186 --- /dev/null +++ b/crates/omnigraph/tests/branching.rs @@ -0,0 +1,1481 @@ +mod helpers; + +use std::fs; + +use arrow_array::{Array, Int32Array, UInt64Array}; +use futures::TryStreamExt; +use lance_index::{DatasetIndexExt, is_system_index}; + +use omnigraph::db::commit_graph::CommitGraph; +use omnigraph::db::{MergeOutcome, Omnigraph, ReadTarget}; +use omnigraph::error::{MergeConflictKind, OmniError}; +use omnigraph::loader::{LoadMode, load_jsonl}; + +use helpers::*; + +const SEARCH_SCHEMA: &str = include_str!("fixtures/search.pg"); +const SEARCH_DATA: &str = include_str!("fixtures/search.jsonl"); +const SEARCH_QUERIES: &str = include_str!("fixtures/search.gq"); +const SEARCH_MUTATIONS: &str = r#" +query set_doc_title($slug: String, $title: String) { + update Doc set { title: $title } where slug = $slug +} +"#; + +const UNIQUE_SCHEMA: &str = r#" +node User { + name: String @key + email: String? 
+ @unique(email) +} +"#; + +const UNIQUE_DATA: &str = r#"{"type":"User","data":{"name":"Alice","email":"alice@example.com"}}"#; + +const UNIQUE_MUTATIONS: &str = r#" +query insert_user($name: String, $email: String) { + insert User { name: $name, email: $email } +} +"#; + +const CARDINALITY_SCHEMA: &str = r#" +node Person { + name: String @key +} + +node Company { + name: String @key +} + +edge WorksAt: Person -> Company @card(0..1) +"#; + +const CARDINALITY_DATA: &str = r#"{"type":"Person","data":{"name":"Alice"}} +{"type":"Company","data":{"name":"Acme"}} +{"type":"Company","data":{"name":"Beta"}}"#; + +const CARDINALITY_MUTATIONS: &str = r#" +query add_employment($person: String, $company: String) { + insert WorksAt { from: $person, to: $company } +} +"#; + +async fn init_search_db(dir: &tempfile::TempDir) -> Omnigraph { + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, SEARCH_SCHEMA).await.unwrap(); + load_jsonl(&mut db, SEARCH_DATA, LoadMode::Overwrite) + .await + .unwrap(); + db.ensure_indices().await.unwrap(); + db +} + +async fn init_db_from_schema_and_data( + dir: &tempfile::TempDir, + schema: &str, + data: &str, +) -> Omnigraph { + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, schema).await.unwrap(); + load_jsonl(&mut db, data, LoadMode::Overwrite) + .await + .unwrap(); + db +} + +#[tokio::test] +async fn branch_create_open_list_and_lazy_branching_work() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut main = init_and_load(&dir).await; + + main.branch_create("feature").await.unwrap(); + assert_eq!(main.branch_list().await.unwrap(), vec!["main", "feature"]); + + let mut feature = Omnigraph::open(uri).await.unwrap(); + assert_eq!( + count_rows_branch(&feature, "feature", "node:Person").await, + 4 + ); + let initial_feature_snap = snapshot_branch(&feature, "feature").await.unwrap(); + assert_eq!( + initial_feature_snap + .entry("node:Person") + .unwrap() 
+ .table_branch + .as_deref(), + None + ); + + mutate_branch( + &mut feature, + "feature", + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "Eve")], &[("$age", 22)]), + ) + .await + .unwrap(); + + let snap = snapshot_branch(&feature, "feature").await.unwrap(); + assert_eq!( + snap.entry("node:Person").unwrap().table_branch.as_deref(), + Some("feature") + ); + assert_eq!( + snap.entry("edge:Knows").unwrap().table_branch.as_deref(), + None + ); + + let main = Omnigraph::open(uri).await.unwrap(); + assert_eq!(count_rows(&main, "node:Person").await, 4); +} + +#[tokio::test] +async fn explicit_target_query_reads_multiple_branches_from_one_handle() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + + db.branch_create("feature").await.unwrap(); + db.mutate( + "feature", + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "Eve")], &[("$age", 22)]), + ) + .await + .unwrap(); + + let main_qr = db + .query( + ReadTarget::branch("main"), + TEST_QUERIES, + "get_person", + ¶ms(&[("$name", "Eve")]), + ) + .await + .unwrap(); + assert_eq!(main_qr.num_rows(), 0); + + let feature_qr = db + .query( + ReadTarget::branch("feature"), + TEST_QUERIES, + "get_person", + ¶ms(&[("$name", "Eve")]), + ) + .await + .unwrap(); + assert_eq!(feature_qr.num_rows(), 1); +} + +#[tokio::test] +async fn resolved_snapshot_stays_pinned_after_branch_advances() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + + let snapshot_id = db.resolve_snapshot("main").await.unwrap(); + mutate_main( + &mut db, + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "Eve")], &[("$age", 22)]), + ) + .await + .unwrap(); + + let pinned = db + .query( + ReadTarget::Snapshot(snapshot_id.clone()), + TEST_QUERIES, + "get_person", + ¶ms(&[("$name", "Eve")]), + ) + .await + .unwrap(); + assert_eq!(pinned.num_rows(), 0); + + let head = db + .query( + ReadTarget::branch("main"), + TEST_QUERIES, + "get_person", + 
¶ms(&[("$name", "Eve")]), + ) + .await + .unwrap(); + assert_eq!(head.num_rows(), 1); +} + +#[tokio::test] +async fn explicit_target_load_writes_to_named_branch() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + + db.branch_create("feature").await.unwrap(); + db.load( + "feature", + r#"{"type":"Person","data":{"name":"Eve","age":22}}"#, + LoadMode::Append, + ) + .await + .unwrap(); + + let main_qr = db + .query( + ReadTarget::branch("main"), + TEST_QUERIES, + "get_person", + ¶ms(&[("$name", "Eve")]), + ) + .await + .unwrap(); + assert_eq!(main_qr.num_rows(), 0); + + let feature_qr = db + .query( + ReadTarget::branch("feature"), + TEST_QUERIES, + "get_person", + ¶ms(&[("$name", "Eve")]), + ) + .await + .unwrap(); + assert_eq!(feature_qr.num_rows(), 1); +} + +#[tokio::test] +async fn branch_merge_updates_main_traversal() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut main = init_and_load(&dir).await; + main.branch_create("feature").await.unwrap(); + + let mut feature = Omnigraph::open(uri).await.unwrap(); + mutate_branch( + &mut feature, + "feature", + MUTATION_QUERIES, + "add_friend", + ¶ms(&[("$from", "Alice"), ("$to", "Diana")]), + ) + .await + .unwrap(); + + let feature_qr = query_branch( + &mut feature, + "feature", + TEST_QUERIES, + "friends_of", + ¶ms(&[("$name", "Alice")]), + ) + .await + .unwrap(); + assert_eq!(feature_qr.num_rows(), 3); + + let main_before = query_main( + &mut main, + TEST_QUERIES, + "friends_of", + ¶ms(&[("$name", "Alice")]), + ) + .await + .unwrap(); + assert_eq!(main_before.num_rows(), 2); + + let outcome = main.branch_merge("feature", "main").await.unwrap(); + assert_eq!(outcome, MergeOutcome::FastForward); + + let merged = query_main( + &mut main, + TEST_QUERIES, + "friends_of", + ¶ms(&[("$name", "Alice")]), + ) + .await + .unwrap(); + assert_eq!(merged.num_rows(), 3); +} + +#[tokio::test] +async fn branch_merge_applies_node_insert_to_main() { + let 
dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut main = init_and_load(&dir).await; + main.branch_create("feature").await.unwrap(); + + let mut feature = Omnigraph::open(uri).await.unwrap(); + mutate_branch( + &mut feature, + "feature", + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "Eve")], &[("$age", 22)]), + ) + .await + .unwrap(); + + let outcome = feature.branch_merge("feature", "main").await.unwrap(); + assert_eq!(outcome, MergeOutcome::FastForward); + + let mut reopened = Omnigraph::open(uri).await.unwrap(); + let qr = query_main( + &mut reopened, + TEST_QUERIES, + "get_person", + ¶ms(&[("$name", "Eve")]), + ) + .await + .unwrap(); + assert_eq!(qr.num_rows(), 1); +} + +#[tokio::test] +async fn branch_merge_records_single_latest_commit_with_two_parents() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut main = init_and_load(&dir).await; + main.branch_create("feature").await.unwrap(); + + let mut feature = Omnigraph::open(uri).await.unwrap(); + mutate_branch( + &mut feature, + "feature", + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "Eve")], &[("$age", 22)]), + ) + .await + .unwrap(); + + let source_head_before = CommitGraph::open_at_branch(uri, "feature") + .await + .unwrap() + .head_commit() + .await + .unwrap() + .unwrap(); + let target_head_before = CommitGraph::open(uri) + .await + .unwrap() + .head_commit() + .await + .unwrap() + .unwrap(); + + let outcome = main.branch_merge("feature", "main").await.unwrap(); + assert_eq!(outcome, MergeOutcome::FastForward); + + let commit_graph = CommitGraph::open(uri).await.unwrap(); + let head = commit_graph.head_commit().await.unwrap().unwrap(); + let commits = commit_graph.load_commits().await.unwrap(); + let latest_manifest_version = commits.iter().map(|c| c.manifest_version).max().unwrap(); + let latest_commits: Vec<_> = commits + .iter() + .filter(|commit| commit.manifest_version == 
latest_manifest_version) + .collect(); + + assert_eq!(latest_commits.len(), 1); + assert_eq!(head.manifest_version, latest_manifest_version); + assert_eq!( + head.parent_commit_id.as_deref(), + Some(target_head_before.graph_commit_id.as_str()) + ); + assert_eq!( + head.merged_parent_commit_id.as_deref(), + Some(source_head_before.graph_commit_id.as_str()) + ); +} + +#[tokio::test] +async fn branch_merge_records_actor_on_latest_commit() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut main = init_and_load(&dir).await; + main.branch_create("feature").await.unwrap(); + + let mut feature = Omnigraph::open(uri).await.unwrap(); + mutate_branch( + &mut feature, + "feature", + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "Eve")], &[("$age", 22)]), + ) + .await + .unwrap(); + + let outcome = main + .branch_merge_as("feature", "main", Some("act-ragnor")) + .await + .unwrap(); + assert_eq!(outcome, MergeOutcome::FastForward); + + let head = CommitGraph::open(uri) + .await + .unwrap() + .head_commit() + .await + .unwrap() + .unwrap(); + assert_eq!(head.actor_id.as_deref(), Some("act-ragnor")); +} + +#[tokio::test] +async fn already_up_to_date_branch_merge_returns_without_new_commit() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut main = init_and_load(&dir).await; + main.branch_create("feature").await.unwrap(); + + let source_head_before = CommitGraph::open_at_branch(uri, "feature") + .await + .unwrap() + .head_commit() + .await + .unwrap() + .unwrap(); + let target_head_before = CommitGraph::open(uri) + .await + .unwrap() + .head_commit() + .await + .unwrap() + .unwrap(); + assert_eq!( + source_head_before.manifest_version, + target_head_before.manifest_version + ); + + let outcome = main.branch_merge("feature", "main").await.unwrap(); + assert_eq!(outcome, MergeOutcome::AlreadyUpToDate); + + let commit_graph = CommitGraph::open(uri).await.unwrap(); + let head = 
commit_graph.head_commit().await.unwrap().unwrap(); + + assert_eq!(head.manifest_version, target_head_before.manifest_version); + assert_eq!(head.graph_commit_id, target_head_before.graph_commit_id); + assert_eq!(head.graph_commit_id, source_head_before.graph_commit_id); +} + +#[tokio::test] +async fn branch_merge_returns_merged_for_non_fast_forward_auto_merge() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut main = init_and_load(&dir).await; + main.branch_create("feature").await.unwrap(); + + let mut feature = Omnigraph::open(uri).await.unwrap(); + + mutate_main( + &mut main, + MUTATION_QUERIES, + "set_age", + &mixed_params(&[("$name", "Bob")], &[("$age", 26)]), + ) + .await + .unwrap(); + + mutate_branch( + &mut feature, + "feature", + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "Eve")], &[("$age", 22)]), + ) + .await + .unwrap(); + + let outcome = main.branch_merge("feature", "main").await.unwrap(); + assert_eq!(outcome, MergeOutcome::Merged); + + let bob = query_main( + &mut main, + TEST_QUERIES, + "get_person", + ¶ms(&[("$name", "Bob")]), + ) + .await + .unwrap() + .concat_batches() + .unwrap(); + let bob_ages = bob.column(1).as_any().downcast_ref::().unwrap(); + assert_eq!(bob_ages.value(0), 26); + + let eve = query_main( + &mut main, + TEST_QUERIES, + "get_person", + ¶ms(&[("$name", "Eve")]), + ) + .await + .unwrap(); + assert_eq!(eve.num_rows(), 1); +} + +#[tokio::test] +async fn branch_merge_allows_identical_updates_on_both_sides() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut main = init_and_load(&dir).await; + main.branch_create("feature").await.unwrap(); + + let mut feature = Omnigraph::open(uri).await.unwrap(); + + mutate_main( + &mut main, + MUTATION_QUERIES, + "set_age", + &mixed_params(&[("$name", "Alice")], &[("$age", 31)]), + ) + .await + .unwrap(); + + mutate_branch( + &mut feature, + "feature", + MUTATION_QUERIES, + "set_age", + 
&mixed_params(&[("$name", "Alice")], &[("$age", 31)]), + ) + .await + .unwrap(); + + let outcome = main.branch_merge("feature", "main").await.unwrap(); + assert_eq!(outcome, MergeOutcome::Merged); + + let alice = query_main( + &mut main, + TEST_QUERIES, + "get_person", + ¶ms(&[("$name", "Alice")]), + ) + .await + .unwrap() + .concat_batches() + .unwrap(); + let ages = alice + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(ages.value(0), 31); +} + +#[tokio::test] +async fn merged_rewritten_indexed_table_is_searchable_immediately() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut main = init_search_db(&dir).await; + main.branch_create("feature").await.unwrap(); + + let mut feature = Omnigraph::open(uri).await.unwrap(); + + mutate_main( + &mut main, + SEARCH_MUTATIONS, + "set_doc_title", + ¶ms(&[("$slug", "ml-intro"), ("$title", "Orion ML Intro")]), + ) + .await + .unwrap(); + + mutate_branch( + &mut feature, + "feature", + SEARCH_MUTATIONS, + "set_doc_title", + ¶ms(&[("$slug", "dl-basics"), ("$title", "Orion DL Basics")]), + ) + .await + .unwrap(); + + let outcome = main.branch_merge("feature", "main").await.unwrap(); + assert_eq!(outcome, MergeOutcome::Merged); + + let result = query_main( + &mut main, + SEARCH_QUERIES, + "text_search", + ¶ms(&[("$q", "Orion")]), + ) + .await + .unwrap(); + let batch = result.concat_batches().unwrap(); + let slugs = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + let values: Vec<&str> = (0..slugs.len()).map(|idx| slugs.value(idx)).collect(); + assert!(values.contains(&"ml-intro")); + assert!(values.contains(&"dl-basics")); + + let ds = snapshot_main(&main) + .await + .unwrap() + .open("node:Doc") + .await + .unwrap(); + let indices = ds.load_indices().await.unwrap(); + let user_indices: Vec<_> = indices.iter().filter(|idx| !is_system_index(idx)).collect(); + assert_eq!( + user_indices.len(), + 4, + "expected rebuilt id BTree plus key-property and 
title/body indices after rewritten merge" + ); +} + +#[tokio::test] +async fn branch_merge_reports_divergent_update_conflict() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut main = init_and_load(&dir).await; + main.branch_create("feature").await.unwrap(); + + let mut feature = Omnigraph::open(uri).await.unwrap(); + + mutate_main( + &mut main, + MUTATION_QUERIES, + "set_age", + &mixed_params(&[("$name", "Alice")], &[("$age", 31)]), + ) + .await + .unwrap(); + + mutate_branch( + &mut feature, + "feature", + MUTATION_QUERIES, + "set_age", + &mixed_params(&[("$name", "Alice")], &[("$age", 32)]), + ) + .await + .unwrap(); + + let err = feature.branch_merge("feature", "main").await.unwrap_err(); + match err { + OmniError::MergeConflicts(conflicts) => { + assert!(conflicts.iter().any(|conflict| { + conflict.table_key == "node:Person" + && conflict.row_id.as_deref() == Some("Alice") + && conflict.kind == MergeConflictKind::DivergentUpdate + })); + } + other => panic!("expected merge conflicts, got {other:?}"), + } + + let mut reopened = Omnigraph::open(uri).await.unwrap(); + let qr = query_main( + &mut reopened, + TEST_QUERIES, + "get_person", + ¶ms(&[("$name", "Alice")]), + ) + .await + .unwrap(); + let batch = qr.concat_batches().unwrap(); + let ages = batch + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(ages.value(0), 31); +} + +#[tokio::test] +async fn explicit_target_reads_see_branch_local_writes_without_refresh() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut main = init_and_load(&dir).await; + main.branch_create("feature").await.unwrap(); + + let mut writer = Omnigraph::open(uri).await.unwrap(); + let mut reader = Omnigraph::open(uri).await.unwrap(); + let mut main_reader = Omnigraph::open(uri).await.unwrap(); + + mutate_branch( + &mut writer, + "feature", + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "Eve")], &[("$age", 
22)]), + ) + .await + .unwrap(); + + let visible = query_branch( + &mut reader, + "feature", + TEST_QUERIES, + "get_person", + ¶ms(&[("$name", "Eve")]), + ) + .await + .unwrap(); + assert_eq!(visible.num_rows(), 1); + + let main_result = query_main( + &mut main_reader, + TEST_QUERIES, + "get_person", + ¶ms(&[("$name", "Eve")]), + ) + .await + .unwrap(); + assert_eq!(main_result.num_rows(), 0); +} + +#[tokio::test] +async fn branch_created_from_non_main_inherits_branch_state() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut main = init_and_load(&dir).await; + main.branch_create("feature").await.unwrap(); + + let mut feature = Omnigraph::open(uri).await.unwrap(); + mutate_branch( + &mut feature, + "feature", + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "Eve")], &[("$age", 22)]), + ) + .await + .unwrap(); + feature + .branch_create_from(ReadTarget::branch("feature"), "experiment") + .await + .unwrap(); + + assert_eq!( + feature.branch_list().await.unwrap(), + vec!["main", "experiment", "feature"] + ); + + let mut experiment = Omnigraph::open(uri).await.unwrap(); + let qr = query_branch( + &mut experiment, + "experiment", + TEST_QUERIES, + "get_person", + ¶ms(&[("$name", "Eve")]), + ) + .await + .unwrap(); + assert_eq!(qr.num_rows(), 1); + + let mut reopened_main = Omnigraph::open(uri).await.unwrap(); + let main_qr = query_main( + &mut reopened_main, + TEST_QUERIES, + "get_person", + ¶ms(&[("$name", "Eve")]), + ) + .await + .unwrap(); + assert_eq!(main_qr.num_rows(), 0); +} + +#[tokio::test] +async fn ensure_indices_on_child_branch_forks_inherited_table_ownership() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut main = init_and_load(&dir).await; + main.branch_create("feature").await.unwrap(); + + let mut feature = Omnigraph::open(uri).await.unwrap(); + mutate_branch( + &mut feature, + "feature", + MUTATION_QUERIES, + "insert_person", + 
&mixed_params(&[("$name", "Eve")], &[("$age", 22)]), + ) + .await + .unwrap(); + feature + .branch_create_from(ReadTarget::branch("feature"), "experiment") + .await + .unwrap(); + + let mut experiment = Omnigraph::open(uri).await.unwrap(); + let experiment_inherited = snapshot_branch(&experiment, "experiment").await.unwrap(); + assert_eq!( + experiment_inherited + .entry("node:Person") + .unwrap() + .table_branch + .as_deref(), + Some("feature") + ); + + experiment.ensure_indices_on("experiment").await.unwrap(); + + let experiment_snap = snapshot_branch(&experiment, "experiment").await.unwrap(); + assert_eq!( + experiment_snap + .entry("node:Person") + .unwrap() + .table_branch + .as_deref(), + Some("experiment") + ); + assert_eq!( + experiment_snap + .entry("edge:Knows") + .unwrap() + .table_branch + .as_deref(), + None + ); + + let feature_snap = snapshot_branch(&feature, "feature").await.unwrap(); + assert_eq!( + feature_snap + .entry("node:Person") + .unwrap() + .table_branch + .as_deref(), + Some("feature") + ); + assert_eq!( + count_rows_branch(&feature, "feature", "node:Person").await, + 5 + ); + assert_eq!( + count_rows_branch(&experiment, "experiment", "node:Person").await, + 5 + ); +} + +#[tokio::test] +async fn branch_edge_only_write_only_branches_edge_table() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut main = init_and_load(&dir).await; + main.branch_create("feature").await.unwrap(); + + let mut feature = Omnigraph::open(uri).await.unwrap(); + mutate_branch( + &mut feature, + "feature", + MUTATION_QUERIES, + "add_friend", + ¶ms(&[("$from", "Alice"), ("$to", "Diana")]), + ) + .await + .unwrap(); + + let snap = snapshot_branch(&feature, "feature").await.unwrap(); + assert_eq!( + snap.entry("node:Person").unwrap().table_branch.as_deref(), + None + ); + assert_eq!( + snap.entry("edge:Knows").unwrap().table_branch.as_deref(), + Some("feature") + ); + assert_eq!( + 
snap.entry("edge:WorksAt").unwrap().table_branch.as_deref(), + None + ); + + let feature_qr = query_branch( + &mut feature, + "feature", + TEST_QUERIES, + "friends_of", + ¶ms(&[("$name", "Alice")]), + ) + .await + .unwrap(); + assert_eq!(feature_qr.num_rows(), 3); + + let mut reopened_main = Omnigraph::open(uri).await.unwrap(); + let main_qr = query_main( + &mut reopened_main, + TEST_QUERIES, + "friends_of", + ¶ms(&[("$name", "Alice")]), + ) + .await + .unwrap(); + assert_eq!(main_qr.num_rows(), 2); +} + +#[tokio::test] +async fn branch_merge_into_non_main_target_works() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut main = init_and_load(&dir).await; + main.branch_create("feature").await.unwrap(); + + let mut feature = Omnigraph::open(uri).await.unwrap(); + mutate_branch( + &mut feature, + "feature", + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "Eve")], &[("$age", 22)]), + ) + .await + .unwrap(); + feature + .branch_create_from(ReadTarget::branch("feature"), "experiment") + .await + .unwrap(); + + mutate_branch( + &mut feature, + "feature", + MUTATION_QUERIES, + "set_age", + &mixed_params(&[("$name", "Bob")], &[("$age", 26)]), + ) + .await + .unwrap(); + + let outcome = main.branch_merge("feature", "experiment").await.unwrap(); + assert_eq!(outcome, MergeOutcome::FastForward); + + let mut experiment = Omnigraph::open(uri).await.unwrap(); + let bob = query_branch( + &mut experiment, + "experiment", + TEST_QUERIES, + "get_person", + ¶ms(&[("$name", "Bob")]), + ) + .await + .unwrap(); + let bob_batch = bob.concat_batches().unwrap(); + let bob_ages = bob_batch + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(bob_ages.value(0), 26); + + let eve = query_branch( + &mut experiment, + "experiment", + TEST_QUERIES, + "get_person", + ¶ms(&[("$name", "Eve")]), + ) + .await + .unwrap(); + assert_eq!(eve.num_rows(), 1); + let experiment_snap = snapshot_branch(&experiment, 
"experiment").await.unwrap(); + assert_eq!( + experiment_snap + .entry("node:Person") + .unwrap() + .table_branch + .as_deref(), + Some("experiment") + ); + + let mut reopened_main = Omnigraph::open(uri).await.unwrap(); + let main_bob = query_main( + &mut reopened_main, + TEST_QUERIES, + "get_person", + ¶ms(&[("$name", "Bob")]), + ) + .await + .unwrap(); + let main_batch = main_bob.concat_batches().unwrap(); + let main_ages = main_batch + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(main_ages.value(0), 25); +} + +#[tokio::test] +async fn branch_merge_reports_divergent_insert_conflict() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut main = init_and_load(&dir).await; + main.branch_create("feature").await.unwrap(); + + let mut feature = Omnigraph::open(uri).await.unwrap(); + + mutate_main( + &mut main, + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "Eve")], &[("$age", 21)]), + ) + .await + .unwrap(); + + mutate_branch( + &mut feature, + "feature", + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "Eve")], &[("$age", 22)]), + ) + .await + .unwrap(); + + let err = feature.branch_merge("feature", "main").await.unwrap_err(); + match err { + OmniError::MergeConflicts(conflicts) => { + assert!(conflicts.iter().any(|conflict| { + conflict.table_key == "node:Person" + && conflict.row_id.as_deref() == Some("Eve") + && conflict.kind == MergeConflictKind::DivergentInsert + })); + } + other => panic!("expected merge conflicts, got {other:?}"), + } +} + +#[tokio::test] +async fn branch_merge_reports_delete_vs_update_conflict() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut main = init_and_load(&dir).await; + main.branch_create("feature").await.unwrap(); + + let mut feature = Omnigraph::open(uri).await.unwrap(); + + mutate_main( + &mut main, + MUTATION_QUERIES, + "remove_person", + ¶ms(&[("$name", "Alice")]), + ) + .await + 
.unwrap(); + + mutate_branch( + &mut feature, + "feature", + MUTATION_QUERIES, + "set_age", + &mixed_params(&[("$name", "Alice")], &[("$age", 32)]), + ) + .await + .unwrap(); + + let err = feature.branch_merge("feature", "main").await.unwrap_err(); + match err { + OmniError::MergeConflicts(conflicts) => { + assert!(conflicts.iter().any(|conflict| { + conflict.table_key == "node:Person" + && conflict.row_id.as_deref() == Some("Alice") + && conflict.kind == MergeConflictKind::DeleteVsUpdate + })); + } + other => panic!("expected merge conflicts, got {other:?}"), + } +} + +#[tokio::test] +async fn branch_merge_reports_orphan_edge_conflict() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut main = init_and_load(&dir).await; + main.branch_create("feature").await.unwrap(); + + let mut feature = Omnigraph::open(uri).await.unwrap(); + + mutate_branch( + &mut feature, + "feature", + MUTATION_QUERIES, + "remove_person", + ¶ms(&[("$name", "Alice")]), + ) + .await + .unwrap(); + + mutate_main( + &mut main, + MUTATION_QUERIES, + "add_friend", + ¶ms(&[("$from", "Alice"), ("$to", "Diana")]), + ) + .await + .unwrap(); + + let err = feature.branch_merge("feature", "main").await.unwrap_err(); + match err { + OmniError::MergeConflicts(conflicts) => { + assert!(conflicts.iter().any(|conflict| { + conflict.table_key == "edge:Knows" && conflict.kind == MergeConflictKind::OrphanEdge + })); + } + other => panic!("expected merge conflicts, got {other:?}"), + } +} + +#[tokio::test] +async fn branch_merge_reports_unique_violation_conflict() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut main = init_db_from_schema_and_data(&dir, UNIQUE_SCHEMA, UNIQUE_DATA).await; + main.branch_create("feature").await.unwrap(); + + let mut feature = Omnigraph::open(uri).await.unwrap(); + + mutate_main( + &mut main, + UNIQUE_MUTATIONS, + "insert_user", + ¶ms(&[("$name", "Bob"), ("$email", "dup@example.com")]), + ) + 
.await + .unwrap(); + + mutate_branch( + &mut feature, + "feature", + UNIQUE_MUTATIONS, + "insert_user", + ¶ms(&[("$name", "Carol"), ("$email", "dup@example.com")]), + ) + .await + .unwrap(); + + let err = main.branch_merge("feature", "main").await.unwrap_err(); + match err { + OmniError::MergeConflicts(conflicts) => { + assert!(conflicts.iter().any(|conflict| { + conflict.table_key == "node:User" + && conflict.kind == MergeConflictKind::UniqueViolation + })); + } + other => panic!("expected merge conflicts, got {other:?}"), + } +} + +#[tokio::test] +async fn branch_merge_reports_cardinality_violation_conflict() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut main = init_db_from_schema_and_data(&dir, CARDINALITY_SCHEMA, CARDINALITY_DATA).await; + main.branch_create("feature").await.unwrap(); + + let mut feature = Omnigraph::open(uri).await.unwrap(); + + mutate_main( + &mut main, + CARDINALITY_MUTATIONS, + "add_employment", + ¶ms(&[("$person", "Alice"), ("$company", "Acme")]), + ) + .await + .unwrap(); + + mutate_branch( + &mut feature, + "feature", + CARDINALITY_MUTATIONS, + "add_employment", + ¶ms(&[("$person", "Alice"), ("$company", "Beta")]), + ) + .await + .unwrap(); + + let err = main.branch_merge("feature", "main").await.unwrap_err(); + match err { + OmniError::MergeConflicts(conflicts) => { + assert!(conflicts.iter().any(|conflict| { + conflict.table_key == "edge:WorksAt" + && conflict.kind == MergeConflictKind::CardinalityViolation + })); + } + other => panic!("expected merge conflicts, got {other:?}"), + } +} + +#[tokio::test] +async fn branch_create_bootstraps_missing_commit_graph() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let db = init_and_load(&dir).await; + drop(db); + + fs::remove_dir_all(dir.path().join("_graph_commits.lance")).unwrap(); + + let mut reopened = Omnigraph::open(uri).await.unwrap(); + reopened.branch_create("feature").await.unwrap(); + + 
assert!(dir.path().join("_graph_commits.lance").exists()); + + let feature = Omnigraph::open(uri).await.unwrap(); + assert_eq!( + count_rows_branch(&feature, "feature", "node:Person").await, + 4 + ); +} + +#[tokio::test] +async fn branch_api_rejects_reserved_main_and_same_source_target_merge() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + + let err = db.branch_create("main").await.unwrap_err(); + assert!(err.to_string().contains("cannot create branch 'main'")); + + let err = db.branch_delete("main").await.unwrap_err(); + assert!(err.to_string().contains("cannot delete branch 'main'")); + + let err = db.branch_merge("main", "main").await.unwrap_err(); + assert!(err.to_string().contains("distinct source and target")); + + db.branch_create("feature").await.unwrap(); + db.sync_branch("feature").await.unwrap(); + let err = db.branch_delete("feature").await.unwrap_err(); + assert!(err.to_string().contains("currently active branch")); +} + +#[tokio::test] +async fn branch_delete_removes_owned_table_branches_and_allows_recreate() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut main = init_and_load(&dir).await; + + main.branch_create("feature").await.unwrap(); + let mut feature = Omnigraph::open(uri).await.unwrap(); + mutate_branch( + &mut feature, + "feature", + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "Eve")], &[("$age", 22)]), + ) + .await + .unwrap(); + + main.branch_delete("feature").await.unwrap(); + assert_eq!(main.branch_list().await.unwrap(), vec!["main"]); + + main.branch_create("feature").await.unwrap(); + mutate_branch( + &mut main, + "feature", + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "Frank")], &[("$age", 41)]), + ) + .await + .unwrap(); + + assert_eq!(count_rows_branch(&main, "feature", "node:Person").await, 5); +} + +#[tokio::test] +async fn branch_delete_rejects_branches_still_referenced_by_descendants() { + let dir = 
tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut main = init_and_load(&dir).await; + + main.branch_create("feature").await.unwrap(); + let mut feature = Omnigraph::open(uri).await.unwrap(); + mutate_branch( + &mut feature, + "feature", + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "Eve")], &[("$age", 22)]), + ) + .await + .unwrap(); + feature + .branch_create_from(ReadTarget::branch("feature"), "experiment") + .await + .unwrap(); + + let err = main.branch_delete("feature").await.unwrap_err(); + assert!(err.to_string().contains("still depends on it")); +} + +// ─── Step 9b: Surgical merge publish tests ────────────────────────────────── + +#[tokio::test] +async fn merged_table_preserves_row_version_for_unchanged_rows() { + // After a non-FF merge, unchanged rows retain their original _row_created_at_version. + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut main = init_and_load(&dir).await; + main.ensure_indices().await.unwrap(); + + main.branch_create("feature").await.unwrap(); + let mut feature = Omnigraph::open(uri).await.unwrap(); + + // Main updates Bob's age → changes one row + mutate_main( + &mut main, + MUTATION_QUERIES, + "set_age", + &mixed_params(&[("$name", "Bob")], &[("$age", 26)]), + ) + .await + .unwrap(); + + // Feature inserts Eve → adds one row + mutate_branch( + &mut feature, + "feature", + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "Eve")], &[("$age", 22)]), + ) + .await + .unwrap(); + + let outcome = main.branch_merge("feature", "main").await.unwrap(); + assert_eq!(outcome, MergeOutcome::Merged); + + // After merge: scan node:Person with _row_created_at_version + let snap = snapshot_main(&main).await.unwrap(); + let ds = snap.open("node:Person").await.unwrap(); + let mut scanner = ds.scan(); + scanner.project(&["id", "_row_created_at_version"]).unwrap(); + let batches: Vec<_> = scanner + .try_into_stream() + .await + .unwrap() 
+ .try_collect() + .await + .unwrap(); + + // Collect _row_created_at_version for each person + let mut version_by_id: std::collections::HashMap = + std::collections::HashMap::new(); + for batch in &batches { + let ids = batch + .column_by_name("id") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let versions = batch + .column_by_name("_row_created_at_version") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + for i in 0..ids.len() { + version_by_id.insert(ids.value(i).to_string(), versions.value(i)); + } + } + + // The key assertion: NOT all rows have the same _row_created_at_version. + // With truncate+append, all rows would be re-stamped to the merge version. + // With surgical merge_insert, unchanged rows keep their original version. + let unique_versions: std::collections::HashSet = version_by_id.values().copied().collect(); + assert!( + unique_versions.len() > 1, + "After surgical merge, rows should have different _row_created_at_version values \ + (original rows keep old version, merged-in rows get new version). 
\ + Got only {:?} for ids {:?}", + unique_versions, + version_by_id + ); +} + +#[tokio::test] +async fn edge_tables_have_id_btree_after_ensure_indices() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + db.ensure_indices().await.unwrap(); + + let snap = snapshot_main(&db).await.unwrap(); + let ds = snap.open("edge:Knows").await.unwrap(); + let indices = ds.load_indices().await.unwrap(); + let user_indices: Vec<_> = indices.iter().filter(|idx| !is_system_index(idx)).collect(); + + // Should have BTree on id, src, dst = 3 indices + let index_names: Vec<_> = user_indices.iter().map(|idx| idx.fields.clone()).collect(); + assert!( + user_indices.len() >= 3, + "Edge table should have at least 3 indices (id, src, dst), got {:?}", + index_names + ); +} + +#[tokio::test] +async fn merge_delta_only_bumps_changed_rows() { + // After a non-FF merge, unchanged rows should NOT have _row_last_updated_at_version + // bumped. Only rows that were actually modified should get new version stamps. 
+ let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut main = init_and_load(&dir).await; + main.ensure_indices().await.unwrap(); + + main.branch_create("feature").await.unwrap(); + let mut feature = Omnigraph::open(uri).await.unwrap(); + + // Main updates Bob's age → changes one Person row + mutate_main( + &mut main, + MUTATION_QUERIES, + "set_age", + &mixed_params(&[("$name", "Bob")], &[("$age", 26)]), + ) + .await + .unwrap(); + + // Feature inserts Eve → adds one Person row (makes it non-FF) + mutate_branch( + &mut feature, + "feature", + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "Eve")], &[("$age", 22)]), + ) + .await + .unwrap(); + + let outcome = main.branch_merge("feature", "main").await.unwrap(); + assert_eq!(outcome, MergeOutcome::Merged); + + // Scan all persons with _row_last_updated_at_version + let snap = snapshot_main(&main).await.unwrap(); + let ds = snap.open("node:Person").await.unwrap(); + let mut scanner = ds.scan(); + scanner + .project(&["id", "_row_last_updated_at_version"]) + .unwrap(); + let batches: Vec<_> = scanner + .try_into_stream() + .await + .unwrap() + .try_collect() + .await + .unwrap(); + + // Collect all _row_last_updated_at_version values + let mut versions: Vec = Vec::new(); + for batch in &batches { + let v = batch + .column_by_name("_row_last_updated_at_version") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + for i in 0..v.len() { + versions.push(v.value(i)); + } + } + + // Not all rows should have the same version — unchanged rows keep old version + let unique_versions: std::collections::HashSet = versions.iter().copied().collect(); + assert!( + unique_versions.len() > 1, + "After surgical merge, rows should have different _row_last_updated_at_version values. \ + Unchanged rows should keep old version, changed rows get new version. 
\ + Got only {:?}", + unique_versions + ); +} diff --git a/crates/omnigraph/tests/changes.rs b/crates/omnigraph/tests/changes.rs new file mode 100644 index 0000000..aa5c00f --- /dev/null +++ b/crates/omnigraph/tests/changes.rs @@ -0,0 +1,677 @@ +mod helpers; + +use omnigraph::changes::{ChangeFilter, ChangeOp, EntityKind}; +use omnigraph::db::commit_graph::CommitGraph; +use omnigraph::db::{MergeOutcome, Omnigraph, ReadTarget}; + +use helpers::*; + +async fn head_commit_id(uri: &str, branch: Option<&str>) -> String { + let commit_graph = match branch { + Some(branch) => CommitGraph::open_at_branch(uri, branch).await.unwrap(), + None => CommitGraph::open(uri).await.unwrap(), + }; + commit_graph.head_commit_id().await.unwrap().unwrap() +} + +fn change_tuples(change_set: &omnigraph::changes::ChangeSet) -> Vec<(String, String, ChangeOp)> { + let mut tuples: Vec<_> = change_set + .changes + .iter() + .map(|change| (change.table_key.clone(), change.id.clone(), change.op)) + .collect(); + tuples.sort_by(|a, b| { + a.0.cmp(&b.0).then_with(|| a.1.cmp(&b.1)).then_with(|| { + let a_op = match a.2 { + ChangeOp::Insert => 0, + ChangeOp::Update => 1, + ChangeOp::Delete => 2, + }; + let b_op = match b.2 { + ChangeOp::Insert => 0, + ChangeOp::Update => 1, + ChangeOp::Delete => 2, + }; + a_op.cmp(&b_op) + }) + }); + tuples +} + +// ─── Same-branch diff tests ──────────────────────────────────────────────── + +#[tokio::test] +async fn diff_empty_when_nothing_changed() { + let dir = tempfile::tempdir().unwrap(); + let db = init_and_load(&dir).await; + let v = snapshot_id(&db, "main").await.unwrap(); + let cs = db + .diff_between( + ReadTarget::Snapshot(v.clone()), + ReadTarget::Snapshot(v), + &ChangeFilter::default(), + ) + .await + .unwrap(); + assert!(cs.changes.is_empty()); + assert_eq!(cs.stats.inserts, 0); + assert_eq!(cs.stats.updates, 0); + assert_eq!(cs.stats.deletes, 0); +} + +#[tokio::test] +async fn diff_detects_node_insert() { + let dir = tempfile::tempdir().unwrap(); + let 
mut db = init_and_load(&dir).await; + let v_before = snapshot_id(&db, "main").await.unwrap(); + + mutate_main( + &mut db, + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "Eve")], &[("$age", 22)]), + ) + .await + .unwrap(); + + let cs = diff_since_branch(&db, "main", v_before, &ChangeFilter::default()) + .await + .unwrap(); + let inserts: Vec<_> = cs + .changes + .iter() + .filter(|c| c.op == ChangeOp::Insert && c.table_key == "node:Person") + .collect(); + assert!( + !inserts.is_empty(), + "Should detect the Person insert. Got changes: {:?}", + cs.changes + .iter() + .map(|c| (&c.table_key, &c.id, c.op)) + .collect::>() + ); + assert!( + inserts.iter().any(|c| c.id == "Eve"), + "Insert should contain Eve. Got: {:?}", + inserts.iter().map(|c| &c.id).collect::>() + ); + assert_eq!(inserts[0].kind, EntityKind::Node); + assert_eq!(inserts[0].endpoints, None); +} + +#[tokio::test] +async fn diff_detects_node_update() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + let v_before = snapshot_id(&db, "main").await.unwrap(); + + mutate_main( + &mut db, + MUTATION_QUERIES, + "set_age", + &mixed_params(&[("$name", "Bob")], &[("$age", 99)]), + ) + .await + .unwrap(); + + let cs = diff_since_branch(&db, "main", v_before, &ChangeFilter::default()) + .await + .unwrap(); + let updates: Vec<_> = cs + .changes + .iter() + .filter(|c| c.op == ChangeOp::Update && c.table_key == "node:Person") + .collect(); + assert!( + !updates.is_empty(), + "Should detect the Person update. 
Got changes: {:?}", + cs.changes + .iter() + .map(|c| (&c.table_key, &c.id, c.op)) + .collect::>() + ); +} + +#[tokio::test] +async fn diff_detects_node_delete_with_cascade() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + let v_before = snapshot_id(&db, "main").await.unwrap(); + + mutate_main( + &mut db, + MUTATION_QUERIES, + "remove_person", + ¶ms(&[("$name", "Alice")]), + ) + .await + .unwrap(); + + let cs = diff_since_branch(&db, "main", v_before, &ChangeFilter::default()) + .await + .unwrap(); + + // Should have node:Person delete + let person_deletes: Vec<_> = cs + .changes + .iter() + .filter(|c| c.op == ChangeOp::Delete && c.table_key == "node:Person") + .collect(); + assert!( + !person_deletes.is_empty(), + "Should detect Person delete. Changes: {:?}", + cs.changes + .iter() + .map(|c| (&c.table_key, &c.id, c.op)) + .collect::>() + ); + + // Should also have edge:Knows cascade deletes + let edge_deletes: Vec<_> = cs + .changes + .iter() + .filter(|c| c.op == ChangeOp::Delete && c.table_key == "edge:Knows") + .collect(); + assert!( + !edge_deletes.is_empty(), + "Should detect cascaded Knows edge deletes. 
Changes: {:?}", + cs.changes + .iter() + .map(|c| (&c.table_key, &c.id, c.op)) + .collect::>() + ); + + // Cascaded edge deletes should have endpoints + for edge_del in &edge_deletes { + assert!( + edge_del.endpoints.is_some(), + "Deleted edge should have endpoint context" + ); + } +} + +#[tokio::test] +async fn diff_detects_edge_insert_with_endpoints() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + let v_before = snapshot_id(&db, "main").await.unwrap(); + + mutate_main( + &mut db, + MUTATION_QUERIES, + "add_friend", + ¶ms(&[("$from", "Bob"), ("$to", "Charlie")]), + ) + .await + .unwrap(); + + let cs = diff_since_branch(&db, "main", v_before, &ChangeFilter::default()) + .await + .unwrap(); + + let edge_inserts: Vec<_> = cs + .changes + .iter() + .filter(|c| c.op == ChangeOp::Insert && c.table_key == "edge:Knows") + .collect(); + assert!( + !edge_inserts.is_empty(), + "Should detect Knows edge insert. Changes: {:?}", + cs.changes + .iter() + .map(|c| (&c.table_key, &c.id, c.op)) + .collect::>() + ); + + let e = &edge_inserts[0]; + assert_eq!(e.kind, EntityKind::Edge); + let ep = e + .endpoints + .as_ref() + .expect("Edge insert should have endpoints"); + assert!(!ep.src.is_empty(), "src should not be empty"); + assert!(!ep.dst.is_empty(), "dst should not be empty"); +} + +// ─── Filter tests ────────────────────────────────────────────────────────── + +#[tokio::test] +async fn filter_by_type_name_skips_non_matching() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + let v_before = snapshot_id(&db, "main").await.unwrap(); + + // Insert a person (node:Person) and add a friend (edge:Knows) + mutate_main( + &mut db, + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "FilterTest")], &[("$age", 30)]), + ) + .await + .unwrap(); + + // Filter to Company only — should not see Person changes + let filter = ChangeFilter { + type_names: Some(vec!["Company".to_string()]), + 
..Default::default() + }; + let cs = diff_since_branch(&db, "main", v_before, &filter) + .await + .unwrap(); + assert!( + cs.changes.is_empty(), + "Filter to Company should skip Person changes. Got: {:?}", + cs.changes + .iter() + .map(|c| (&c.table_key, &c.id, c.op)) + .collect::>() + ); +} + +#[tokio::test] +async fn filter_by_op_skips_unwanted_operations() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + let v_before = snapshot_id(&db, "main").await.unwrap(); + + // Insert Eve, update Bob, delete Alice + mutate_main( + &mut db, + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "Eve")], &[("$age", 22)]), + ) + .await + .unwrap(); + + mutate_main( + &mut db, + MUTATION_QUERIES, + "set_age", + &mixed_params(&[("$name", "Bob")], &[("$age", 99)]), + ) + .await + .unwrap(); + + // Filter to Insert only + let filter = ChangeFilter { + ops: Some(vec![ChangeOp::Insert]), + ..Default::default() + }; + let cs = diff_since_branch(&db, "main", v_before, &filter) + .await + .unwrap(); + + // Should only have inserts, no updates or deletes + for c in &cs.changes { + assert_eq!( + c.op, + ChangeOp::Insert, + "Filter for Insert-only should not include {:?} for {} ({})", + c.op, + c.table_key, + c.id + ); + } +} + +// ─── Cross-branch diff tests ────────────────────────────────────────────── + +#[tokio::test] +async fn diff_after_merge_reports_actual_changes() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut main = init_and_load(&dir).await; + main.ensure_indices().await.unwrap(); + let v_before_branch = snapshot_id(&main, "main").await.unwrap(); + + main.branch_create("feature").await.unwrap(); + let mut feature = Omnigraph::open(uri).await.unwrap(); + + // Main updates Bob + mutate_main( + &mut main, + MUTATION_QUERIES, + "set_age", + &mixed_params(&[("$name", "Bob")], &[("$age", 26)]), + ) + .await + .unwrap(); + + // Feature inserts Eve + mutate_branch( + &mut feature, + 
"feature", + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "Eve")], &[("$age", 22)]), + ) + .await + .unwrap(); + + let outcome = main.branch_merge("feature", "main").await.unwrap(); + assert_eq!(outcome, MergeOutcome::Merged); + + // Diff from pre-branch to post-merge on main + let cs = diff_since_branch(&main, "main", v_before_branch, &ChangeFilter::default()) + .await + .unwrap(); + + // Should have: + // - Person insert (Eve) — from the merge + // - Person update (Bob) — from the main write + // Should NOT have: all original persons re-reported as inserts + let person_changes: Vec<_> = cs + .changes + .iter() + .filter(|c| c.table_key == "node:Person") + .collect(); + + let person_inserts: Vec<_> = person_changes + .iter() + .filter(|c| c.op == ChangeOp::Insert) + .collect(); + let person_updates: Vec<_> = person_changes + .iter() + .filter(|c| c.op == ChangeOp::Update) + .collect(); + + // There should be exactly 1 insert (Eve) not all persons + assert!( + person_inserts.len() <= 2, + "After surgical merge, should not re-report all persons as inserts. 
\ + Got {} inserts: {:?}", + person_inserts.len(), + person_inserts.iter().map(|c| &c.id).collect::>() + ); + + // Bob's update should be detected + assert!( + !person_updates.is_empty() || person_inserts.len() > 0, + "Should detect Bob's age update or Eve's insert" + ); +} + +#[tokio::test] +async fn diff_commits_resolves_feature_commit_from_main_handle() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut main = init_and_load(&dir).await; + main.branch_create("feature").await.unwrap(); + + let mut feature = Omnigraph::open(uri).await.unwrap(); + mutate_branch( + &mut feature, + "feature", + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "Eve")], &[("$age", 22)]), + ) + .await + .unwrap(); + + let main_head = CommitGraph::open(uri) + .await + .unwrap() + .head_commit() + .await + .unwrap() + .unwrap() + .graph_commit_id; + let feature_head = CommitGraph::open_at_branch(uri, "feature") + .await + .unwrap() + .head_commit() + .await + .unwrap() + .unwrap() + .graph_commit_id; + + let cs = main + .diff_commits(&main_head, &feature_head, &ChangeFilter::default()) + .await + .unwrap(); + assert!( + cs.changes + .iter() + .any(|change| change.op == ChangeOp::Insert && change.id == "Eve"), + "expected feature-only insert to be diffable from a main handle" + ); +} + +#[tokio::test] +async fn cross_branch_diff_honors_insert_only_filter() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut main = init_and_load(&dir).await; + main.branch_create("feature").await.unwrap(); + + let mut feature = Omnigraph::open(uri).await.unwrap(); + mutate_branch( + &mut feature, + "feature", + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "Eve")], &[("$age", 22)]), + ) + .await + .unwrap(); + + let main_head = CommitGraph::open(uri) + .await + .unwrap() + .head_commit() + .await + .unwrap() + .unwrap() + .graph_commit_id; + let feature_head = 
CommitGraph::open_at_branch(uri, "feature") + .await + .unwrap() + .head_commit() + .await + .unwrap() + .unwrap() + .graph_commit_id; + + let filter = ChangeFilter { + ops: Some(vec![ChangeOp::Insert]), + ..Default::default() + }; + let cs = main + .diff_commits(&main_head, &feature_head, &filter) + .await + .unwrap(); + assert!(!cs.changes.is_empty()); + assert!( + cs.changes + .iter() + .all(|change| change.op == ChangeOp::Insert) + ); +} + +#[tokio::test] +async fn diff_commits_resolves_commits_across_branches_from_any_handle() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut main = init_and_load(&dir).await; + let base_commit = head_commit_id(uri, None).await; + + main.branch_create("feature").await.unwrap(); + let mut feature = Omnigraph::open(uri).await.unwrap(); + mutate_branch( + &mut feature, + "feature", + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "Eve")], &[("$age", 22)]), + ) + .await + .unwrap(); + let feature_commit = head_commit_id(uri, Some("feature")).await; + + let from_main = main + .diff_commits(&base_commit, &feature_commit, &ChangeFilter::default()) + .await + .unwrap(); + let from_feature = feature + .diff_commits(&base_commit, &feature_commit, &ChangeFilter::default()) + .await + .unwrap(); + + assert_eq!(change_tuples(&from_main), change_tuples(&from_feature)); + assert!(from_main.changes.iter().any(|change| { + change.table_key == "node:Person" && change.id == "Eve" && change.op == ChangeOp::Insert + })); +} + +#[tokio::test] +async fn cross_lineage_diff_honors_delete_only_filter() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut main = init_and_load(&dir).await; + main.branch_create("feature").await.unwrap(); + let mut feature = Omnigraph::open(uri).await.unwrap(); + let before = snapshot_id(&feature, "feature").await.unwrap(); + + mutate_branch( + &mut feature, + "feature", + MUTATION_QUERIES, + "set_age", + 
&mixed_params(&[("$name", "Bob")], &[("$age", 99)]), + ) + .await + .unwrap(); + mutate_branch( + &mut feature, + "feature", + MUTATION_QUERIES, + "remove_person", + ¶ms(&[("$name", "Alice")]), + ) + .await + .unwrap(); + + let filter = ChangeFilter { + ops: Some(vec![ChangeOp::Delete]), + ..Default::default() + }; + let change_set = diff_since_branch(&feature, "feature", before, &filter) + .await + .unwrap(); + + assert!( + !change_set.changes.is_empty(), + "expected delete changes after removing Alice" + ); + assert!( + change_set + .changes + .iter() + .all(|change| change.op == ChangeOp::Delete) + ); +} + +#[tokio::test] +async fn same_branch_diff_across_first_lazy_fork_detects_update() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut main = init_and_load(&dir).await; + main.branch_create("feature").await.unwrap(); + let mut feature = Omnigraph::open(uri).await.unwrap(); + let before = snapshot_id(&feature, "feature").await.unwrap(); + + mutate_branch( + &mut feature, + "feature", + MUTATION_QUERIES, + "set_age", + &mixed_params(&[("$name", "Bob")], &[("$age", 77)]), + ) + .await + .unwrap(); + + let change_set = diff_since_branch(&feature, "feature", before, &ChangeFilter::default()) + .await + .unwrap(); + assert!(change_set.changes.iter().any(|change| { + change.table_key == "node:Person" && change.id == "Bob" && change.op == ChangeOp::Update + })); +} + +#[tokio::test] +async fn diff_commits_cross_branch_reports_property_only_updates() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut main = init_and_load(&dir).await; + let base_commit = head_commit_id(uri, None).await; + + main.branch_create("feature").await.unwrap(); + let mut feature = Omnigraph::open(uri).await.unwrap(); + mutate_branch( + &mut feature, + "feature", + MUTATION_QUERIES, + "set_age", + &mixed_params(&[("$name", "Bob")], &[("$age", 55)]), + ) + .await + .unwrap(); + let feature_commit = 
head_commit_id(uri, Some("feature")).await; + + let change_set = main + .diff_commits(&base_commit, &feature_commit, &ChangeFilter::default()) + .await + .unwrap(); + + assert!(change_set.changes.iter().any(|change| { + change.table_key == "node:Person" && change.id == "Bob" && change.op == ChangeOp::Update + })); + assert!(!change_set.changes.iter().any(|change| { + change.table_key == "node:Person" && change.id == "Bob" && change.op == ChangeOp::Insert + })); +} + +#[tokio::test] +async fn diff_commits_ignores_row_version_only_differences() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut main = init_and_load(&dir).await; + + main.branch_create("feature").await.unwrap(); + + let mut feature = Omnigraph::open(uri).await.unwrap(); + mutate_branch( + &mut feature, + "feature", + MUTATION_QUERIES, + "set_age", + &mixed_params(&[("$name", "Bob")], &[("$age", 55)]), + ) + .await + .unwrap(); + let feature_commit = head_commit_id(uri, Some("feature")).await; + + mutate_main( + &mut main, + MUTATION_QUERIES, + "set_age", + &mixed_params(&[("$name", "Bob")], &[("$age", 55)]), + ) + .await + .unwrap(); + let main_commit = head_commit_id(uri, None).await; + + let change_set = main + .diff_commits(&main_commit, &feature_commit, &ChangeFilter::default()) + .await + .unwrap(); + + assert!( + change_set.changes.is_empty(), + "identical user-visible state should not produce diff entries: {:?}", + change_set.changes + ); +} diff --git a/crates/omnigraph/tests/consistency.rs b/crates/omnigraph/tests/consistency.rs new file mode 100644 index 0000000..0a2872f --- /dev/null +++ b/crates/omnigraph/tests/consistency.rs @@ -0,0 +1,574 @@ +mod helpers; + +use arrow_array::{Array, Date32Array, Int32Array, StringArray}; +use futures::TryStreamExt; + +use omnigraph::db::Omnigraph; +use omnigraph::loader::{LoadMode, load_jsonl}; +use omnigraph_compiler::ir::ParamMap; +use omnigraph_compiler::query::ast::Literal; + +use helpers::*; + +// ─── 
Snapshot data-level isolation ────────────────────────────────────────── + +#[tokio::test] +async fn snapshot_returns_stale_data_after_write() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + + // Snapshot BEFORE mutation + let snap_before = snapshot_main(&db).await.unwrap(); + + // Insert a new person + mutate_main( + &mut db, + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "Eve")], &[("$age", 22)]), + ) + .await + .unwrap(); + + // Snapshot AFTER mutation + let snap_after = snapshot_main(&db).await.unwrap(); + + // Old snapshot should still see 4 persons + let ds_before = snap_before.open("node:Person").await.unwrap(); + assert_eq!(ds_before.count_rows(None).await.unwrap(), 4); + + // New snapshot should see 5 persons + let ds_after = snap_after.open("node:Person").await.unwrap(); + assert_eq!(ds_after.count_rows(None).await.unwrap(), 5); + + // Verify Eve is NOT in old snapshot's data + let batches_before: Vec = ds_before + .scan() + .try_into_stream() + .await + .unwrap() + .try_collect() + .await + .unwrap(); + let ids_before = collect_column_strings(&batches_before, "id"); + assert!(!ids_before.contains(&"Eve".to_string())); + + // Verify Eve IS in new snapshot's data + let batches_after: Vec = ds_after + .scan() + .try_into_stream() + .await + .unwrap() + .try_collect() + .await + .unwrap(); + let ids_after = collect_column_strings(&batches_after, "id"); + assert!(ids_after.contains(&"Eve".to_string())); +} + +// ─── LoadMode::Merge ──────────────────────────────────────────────────────── + +#[tokio::test] +async fn load_merge_upserts_existing_and_inserts_new() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap(); + + // Load Alice(30) and Bob(25) via Overwrite + let initial = r#"{"type": "Person", "data": {"name": "Alice", "age": 30}} +{"type": "Person", "data": {"name": "Bob", "age": 25}}"#; + 
load_jsonl(&mut db, initial, LoadMode::Overwrite) + .await + .unwrap(); + + assert_eq!(count_rows(&db, "node:Person").await, 2); + + // Merge: Alice updated to age=31, Charlie is new + let merge_data = r#"{"type": "Person", "data": {"name": "Alice", "age": 31}} +{"type": "Person", "data": {"name": "Charlie", "age": 35}}"#; + load_jsonl(&mut db, merge_data, LoadMode::Merge) + .await + .unwrap(); + + // Should have 3 persons total (not 4) + assert_eq!(count_rows(&db, "node:Person").await, 3); + + // Verify individual values + let batches = read_table(&db, "node:Person").await; + let batch = &batches[0]; + let ids = batch + .column_by_name("id") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let ages = batch + .column_by_name("age") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + + for i in 0..batch.num_rows() { + match ids.value(i) { + "Alice" => assert_eq!(ages.value(i), 31, "Alice should be updated to 31"), + "Bob" => assert_eq!(ages.value(i), 25, "Bob should be unchanged"), + "Charlie" => assert_eq!(ages.value(i), 35, "Charlie should be inserted"), + other => panic!("unexpected person: {}", other), + } + } +} + +#[tokio::test] +async fn cross_type_traversal_deduplicates_duplicate_edges() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let schema = r#" +node Person { name: String @key } +node Company { name: String @key } +edge WorksAt: Person -> Company +"#; + let data = r#"{"type":"Person","data":{"name":"Alice"}} +{"type":"Company","data":{"name":"Acme"}} +{"edge":"WorksAt","from":"Alice","to":"Acme"} +{"edge":"WorksAt","from":"Alice","to":"Acme"}"#; + let query = r#" +query company($name: String) { + match { + $p: Person { name: $name } + $p worksAt $c + } + return { $c.name } +} +"#; + + let mut db = Omnigraph::init(uri, schema).await.unwrap(); + load_jsonl(&mut db, data, LoadMode::Overwrite) + .await + .unwrap(); + + let result = query_main(&mut db, query, "company", ¶ms(&[("$name", "Alice")])) + 
.await + .unwrap(); + assert_eq!(result.num_rows(), 1); +} + +// ─── Multi-writer refresh ─────────────────────────────────────────────────── + +#[tokio::test] +async fn explicit_target_query_sees_other_writer_commits_without_refresh() { + let dir = tempfile::tempdir().unwrap(); + let _db = init_and_load(&dir).await; + drop(_db); + + let uri = dir.path().to_str().unwrap(); + + // Two independent handles to the same repo + let mut db1 = Omnigraph::open(uri).await.unwrap(); + let mut db2 = Omnigraph::open(uri).await.unwrap(); + + // Writer 1 inserts Eve + mutate_main( + &mut db1, + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "Eve")], &[("$age", 22)]), + ) + .await + .unwrap(); + + // Explicit-target reads resolve the latest branch head and should see Eve + let qr = query_main( + &mut db2, + TEST_QUERIES, + "get_person", + ¶ms(&[("$name", "Eve")]), + ) + .await + .unwrap(); + assert_eq!(qr.num_rows(), 1, "explicit target reads should see Eve"); +} + +#[tokio::test] +async fn explicit_target_query_rebuilds_graph_index_after_external_edge_write() { + let dir = tempfile::tempdir().unwrap(); + let _db = init_and_load(&dir).await; + drop(_db); + + let uri = dir.path().to_str().unwrap(); + let mut db1 = Omnigraph::open(uri).await.unwrap(); + let mut db2 = Omnigraph::open(uri).await.unwrap(); + + let warm = query_main( + &mut db2, + TEST_QUERIES, + "friends_of", + ¶ms(&[("$name", "Alice")]), + ) + .await + .unwrap(); + assert_eq!(warm.num_rows(), 2); + + mutate_main( + &mut db1, + MUTATION_QUERIES, + "add_friend", + ¶ms(&[("$from", "Alice"), ("$to", "Diana")]), + ) + .await + .unwrap(); + + let refreshed = query_main( + &mut db2, + TEST_QUERIES, + "friends_of", + ¶ms(&[("$name", "Alice")]), + ) + .await + .unwrap(); + assert_eq!( + refreshed.num_rows(), + 3, + "explicit target reads should rebuild topology after edge change" + ); + + let batch = refreshed.concat_batches().unwrap(); + let names = batch + .column(0) + .as_any() + .downcast_ref::() + 
.unwrap(); + let values: Vec<&str> = (0..names.len()).map(|i| names.value(i)).collect(); + assert!(values.contains(&"Bob")); + assert!(values.contains(&"Diana")); +} + +// ─── Null handling ────────────────────────────────────────────────────────── + +#[tokio::test] +async fn null_values_in_filter_and_projection() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap(); + + // Load data: Alice has age, Bob has null age, Charlie has age + let data = r#"{"type": "Person", "data": {"name": "Alice", "age": 30}} +{"type": "Person", "data": {"name": "Bob"}} +{"type": "Person", "data": {"name": "Charlie", "age": 35}}"#; + load_jsonl(&mut db, data, LoadMode::Overwrite) + .await + .unwrap(); + + // Filter: age > 30 should exclude Bob (null) and Alice (30), keep Charlie (35) + let queries = r#" +query older_than_30() { + match { + $p: Person + $p.age > 30 + } + return { $p.name, $p.age } + order { $p.age desc } +} + +query all_persons() { + match { $p: Person } + return { $p.name, $p.age } + order { $p.age desc } +} +"#; + + let result = query_main(&mut db, queries, "older_than_30", &ParamMap::new()) + .await + .unwrap(); + assert_eq!(result.num_rows(), 1); + let batch = &result.batches()[0]; + let names = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(names.value(0), "Charlie"); + + // Projection: Bob's age should be null + let all = query_main(&mut db, queries, "all_persons", &ParamMap::new()) + .await + .unwrap(); + let batch = &all.batches()[0]; + let ids = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + let ages = batch + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + + for i in 0..batch.num_rows() { + if ids.value(i) == "Bob" { + assert!(ages.is_null(i), "Bob's age should be null"); + } + } +} + +// ─── Graph index after node+edge insert ───────────────────────────────────── + +#[tokio::test] +async fn 
traversal_works_after_node_then_edge_insert() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + + // Warm up the graph index cache by running a traversal + let _ = query_main( + &mut db, + TEST_QUERIES, + "friends_of", + ¶ms(&[("$name", "Alice")]), + ) + .await + .unwrap(); + + // Insert a new node (does NOT invalidate graph index) + mutate_main( + &mut db, + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "Frank")], &[("$age", 40)]), + ) + .await + .unwrap(); + + // Insert an edge from Frank → Alice (DOES invalidate graph index) + mutate_main( + &mut db, + MUTATION_QUERIES, + "add_friend", + ¶ms(&[("$from", "Frank"), ("$to", "Alice")]), + ) + .await + .unwrap(); + + // Traversal should work: Frank → Alice + let result = query_main( + &mut db, + TEST_QUERIES, + "friends_of", + ¶ms(&[("$name", "Frank")]), + ) + .await + .unwrap(); + assert_eq!(result.num_rows(), 1); + let batch = result.concat_batches().unwrap(); + let names = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(names.value(0), "Alice"); +} + +// ─── Edge property insert ─────────────────────────────────────────────────── + +#[tokio::test] +async fn insert_edge_with_property() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + + // Knows has `since: Date?` property + let queries = r#" +query add_friend_since($from: String, $to: String, $since: Date) { + insert Knows { from: $from, to: $to, since: $since } +} +"#; + let mut p = params(&[("$from", "Diana"), ("$to", "Bob")]); + p.insert("since".to_string(), Literal::Date("2024-06-15".to_string())); + + let result = mutate_main(&mut db, queries, "add_friend_since", &p) + .await + .unwrap(); + assert_eq!(result.affected_edges, 1); + + // Verify the edge property was stored + let batches = read_table(&db, "edge:Knows").await; + let mut found = false; + for batch in &batches { + let srcs = batch + .column_by_name("src") + .unwrap() + 
.as_any() + .downcast_ref::() + .unwrap(); + let dsts = batch + .column_by_name("dst") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let since = batch + .column_by_name("since") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + for i in 0..batch.num_rows() { + if srcs.value(i) == "Diana" && dsts.value(i) == "Bob" { + assert!(!since.is_null(i), "since should not be null"); + found = true; + } + } + } + assert!(found, "should find Diana→Bob edge"); +} + +// ─── Update / delete no-match ─────────────────────────────────────────────── + +#[tokio::test] +async fn update_nonexistent_returns_zero_affected() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + + let result = mutate_main( + &mut db, + MUTATION_QUERIES, + "set_age", + &mixed_params(&[("$name", "Nobody")], &[("$age", 99)]), + ) + .await + .unwrap(); + + assert_eq!(result.affected_nodes, 0); +} + +#[tokio::test] +async fn delete_nonexistent_returns_zero_affected() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + + let result = mutate_main( + &mut db, + MUTATION_QUERIES, + "remove_person", + ¶ms(&[("$name", "Nobody")]), + ) + .await + .unwrap(); + + assert_eq!(result.affected_nodes, 0); + assert_eq!(result.affected_edges, 0); + + // All 4 persons still intact + assert_eq!(count_rows(&db, "node:Person").await, 4); +} + +// ─── Large batch load ─────────────────────────────────────────────────────── + +#[tokio::test] +async fn large_batch_load_and_query() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + + let schema = r#" +node Item { + name: String @key + value: I32 +} +"#; + let mut db = Omnigraph::init(uri, schema).await.unwrap(); + + // Generate 500 items + let mut lines = Vec::with_capacity(500); + for i in 0..500 { + lines.push(format!( + r#"{{"type": "Item", "data": {{"name": "item_{:04}", "value": {}}}}}"#, + i, i + )); + } + let data = lines.join("\n"); + load_jsonl(&mut 
db, &data, LoadMode::Overwrite) + .await + .unwrap(); + + assert_eq!(count_rows(&db, "node:Item").await, 500); + + // Query with filter — value > 490 + let queries = r#" +query high_value() { + match { + $i: Item + $i.value > 490 + } + return { $i.name, $i.value } + order { $i.value asc } +} +"#; + let result = query_main(&mut db, queries, "high_value", &ParamMap::new()) + .await + .unwrap(); + + // Items 491..499 = 9 items + assert_eq!(result.num_rows(), 9); + let batch = &result.batches()[0]; + let values = batch + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(values.value(0), 491); + assert_eq!(values.value(8), 499); +} + +// ─── Regression: public mutation on stale handle still applies to latest head ────────────── + +#[tokio::test] +async fn stale_handle_public_mutation_uses_latest_target_head() { + let dir = tempfile::tempdir().unwrap(); + let _db = init_and_load(&dir).await; + drop(_db); + + let uri = dir.path().to_str().unwrap(); + let mut db1 = Omnigraph::open(uri).await.unwrap(); + let mut db2 = Omnigraph::open(uri).await.unwrap(); + + // Writer 1 inserts — advances the Person sub-table version + mutate_main( + &mut db1, + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "Eve")], &[("$age", 22)]), + ) + .await + .unwrap(); + + // Writer 2 (stale) mutates through the public transactional path. + // It should stage from the latest target head rather than replaying a stale write. 
+ mutate_main( + &mut db2, + MUTATION_QUERIES, + "set_age", + &mixed_params(&[("$name", "Alice")], &[("$age", 99)]), + ) + .await + .unwrap(); + + let result = query_main( + &mut db2, + TEST_QUERIES, + "get_person", + ¶ms(&[("$name", "Alice")]), + ) + .await + .unwrap(); + assert_eq!(result.num_rows(), 1); + assert_eq!(result.to_rust_json()[0]["p.age"], serde_json::json!(99)); + + let eve = query_main( + &mut db2, + TEST_QUERIES, + "get_person", + ¶ms(&[("$name", "Eve")]), + ) + .await + .unwrap(); + assert_eq!(eve.num_rows(), 1, "concurrent insert should be preserved"); +} diff --git a/crates/omnigraph/tests/end_to_end.rs b/crates/omnigraph/tests/end_to_end.rs new file mode 100644 index 0000000..3a95a98 --- /dev/null +++ b/crates/omnigraph/tests/end_to_end.rs @@ -0,0 +1,1831 @@ +mod helpers; + +use arrow_array::{Array, Int32Array, RecordBatch, StringArray}; +use futures::TryStreamExt; + +use omnigraph::db::{Omnigraph, ReadTarget}; +use omnigraph::loader::{LoadMode, load_jsonl, load_jsonl_file}; +use omnigraph_compiler::ir::ParamMap; + +use helpers::*; + +// ─── Init + Load ──────────────────────────────────────────────────────────── + +#[tokio::test] +async fn init_creates_schema_file_and_manifest() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + + let db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap(); + + assert!(dir.path().join("_schema.pg").exists()); + assert!(dir.path().join("__manifest").exists()); + assert_eq!(db.catalog().node_types.len(), 2); + assert_eq!(db.catalog().edge_types.len(), 2); +} + +#[tokio::test] +async fn open_restores_full_state() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + + let original = init_and_load(&dir).await; + let v = version_main(&original).await.unwrap(); + drop(original); + + let reopened = Omnigraph::open(uri).await.unwrap(); + assert_eq!(reopened.catalog().node_types.len(), 2); + assert_eq!(reopened.catalog().edge_types.len(), 2); + // 
Version should be what we left it at + // (manifest was committed during load) + assert!(version_main(&reopened).await.unwrap() >= v); +} + +#[tokio::test] +async fn load_populates_all_types() { + let dir = tempfile::tempdir().unwrap(); + let db = init_and_load(&dir).await; + + let snap = snapshot_main(&db).await.unwrap(); + + // 4 persons + let person_ds = snap.open("node:Person").await.unwrap(); + assert_eq!(person_ds.count_rows(None).await.unwrap(), 4); + + // 2 companies + let company_ds = snap.open("node:Company").await.unwrap(); + assert_eq!(company_ds.count_rows(None).await.unwrap(), 2); + + // 3 Knows edges + let knows_ds = snap.open("edge:Knows").await.unwrap(); + assert_eq!(knows_ds.count_rows(None).await.unwrap(), 3); + + // 2 WorksAt edges + let works_at_ds = snap.open("edge:WorksAt").await.unwrap(); + assert_eq!(works_at_ds.count_rows(None).await.unwrap(), 2); +} + +// ─── Read consistency ─────────────────────────────────────────────────────── + +#[tokio::test] +async fn node_ids_are_key_values() { + let dir = tempfile::tempdir().unwrap(); + let db = init_and_load(&dir).await; + + let batches = read_table(&db, "node:Person").await; + let mut ids = collect_column_strings(&batches, "id"); + ids.sort(); + assert_eq!(ids, vec!["Alice", "Bob", "Charlie", "Diana"]); +} + +#[tokio::test] +async fn node_properties_are_correct() { + let dir = tempfile::tempdir().unwrap(); + let db = init_and_load(&dir).await; + + let batches = read_table(&db, "node:Person").await; + let batch = &batches[0]; + let ids = batch + .column_by_name("id") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let ages = batch + .column_by_name("age") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + + // Find Alice's row and check age + let alice_idx = (0..ids.len()).find(|&i| ids.value(i) == "Alice").unwrap(); + assert_eq!(ages.value(alice_idx), 30); +} + +#[tokio::test] +async fn entity_at_returns_typed_json_values() { + let dir = tempfile::tempdir().unwrap(); + let 
uri = dir.path().to_str().unwrap(); + let schema = r#" +node Flagged { + slug: String @key + active: Bool + rating: I32? +} +"#; + let data = r#"{"type":"Flagged","data":{"slug":"alpha","active":true,"rating":42}}"#; + + let mut db = Omnigraph::init(uri, schema).await.unwrap(); + load_jsonl(&mut db, data, LoadMode::Overwrite) + .await + .unwrap(); + + let entity = db + .entity_at_target(ReadTarget::branch("main"), "node:Flagged", "alpha") + .await + .unwrap() + .unwrap(); + assert_eq!(entity["id"], serde_json::json!("alpha")); + assert_eq!(entity["active"], serde_json::json!(true)); + assert_eq!(entity["rating"], serde_json::json!(42)); +} + +#[tokio::test] +async fn nullable_vectors_round_trip_as_null() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let schema = r#" +node Doc { + slug: String @key + embedding: Vector(2)? +} +"#; + let data = r#"{"type":"Doc","data":{"slug":"a"}} +{"type":"Doc","data":{"slug":"b","embedding":[1.0,2.0]}}"#; + + let mut db = Omnigraph::init(uri, schema).await.unwrap(); + load_jsonl(&mut db, data, LoadMode::Overwrite) + .await + .unwrap(); + + let missing = db + .entity_at_target(ReadTarget::branch("main"), "node:Doc", "a") + .await + .unwrap() + .unwrap(); + let present = db + .entity_at_target(ReadTarget::branch("main"), "node:Doc", "b") + .await + .unwrap() + .unwrap(); + + assert!(missing["embedding"].is_null()); + assert_eq!(present["embedding"], serde_json::json!([1.0, 2.0])); +} + +#[tokio::test] +async fn edge_src_dst_reference_node_ids() { + let dir = tempfile::tempdir().unwrap(); + let db = init_and_load(&dir).await; + + let batches = read_table(&db, "edge:Knows").await; + let batch = &batches[0]; + let srcs = batch + .column_by_name("src") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let dsts = batch + .column_by_name("dst") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + + // Collect all (src, dst) pairs + let mut edges: Vec<(&str, &str)> = 
(0..batch.num_rows()) + .map(|i| (srcs.value(i), dsts.value(i))) + .collect(); + edges.sort(); + + assert_eq!( + edges, + vec![("Alice", "Bob"), ("Alice", "Charlie"), ("Bob", "Diana")] + ); +} + +#[tokio::test] +async fn edge_ids_are_unique_strings() { + let dir = tempfile::tempdir().unwrap(); + let db = init_and_load(&dir).await; + + let batches = read_table(&db, "edge:Knows").await; + let batch = &batches[0]; + let ids = batch + .column_by_name("id") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + + let id_values: Vec<&str> = (0..ids.len()).map(|i| ids.value(i)).collect(); + // All unique + let mut deduped = id_values.clone(); + deduped.sort(); + deduped.dedup(); + assert_eq!(id_values.len(), deduped.len()); + // All non-empty + assert!(id_values.iter().all(|id| !id.is_empty())); +} + +// ─── Load modes ───────────────────────────────────────────────────────────── + +#[tokio::test] +async fn overwrite_replaces_data() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap(); + + // Load full data + load_jsonl(&mut db, TEST_DATA, LoadMode::Overwrite) + .await + .unwrap(); + + // Overwrite with just one person + let small = r#"{"type": "Person", "data": {"name": "Zara", "age": 40}}"#; + load_jsonl(&mut db, small, LoadMode::Overwrite) + .await + .unwrap(); + + let batches = read_table(&db, "node:Person").await; + let batch = &batches[0]; + assert_eq!(batch.num_rows(), 1); + let ids = batch + .column_by_name("id") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(ids.value(0), "Zara"); +} + +#[tokio::test] +async fn append_adds_rows() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap(); + + let batch1 = r#"{"type": "Person", "data": {"name": "Alice", "age": 30}}"#; + let batch2 = r#"{"type": "Person", "data": {"name": "Bob", "age": 25}}"#; + + 
load_jsonl(&mut db, batch1, LoadMode::Overwrite) + .await + .unwrap(); + load_jsonl(&mut db, batch2, LoadMode::Append).await.unwrap(); + + let snap = snapshot_main(&db).await.unwrap(); + let ds = snap.open("node:Person").await.unwrap(); + assert_eq!(ds.count_rows(None).await.unwrap(), 2); +} + +// ─── Load from fixture file ───────────────────────────────────────────────── + +#[tokio::test] +async fn load_from_file_works() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap(); + + let fixture_path = concat!(env!("CARGO_MANIFEST_DIR"), "/tests/fixtures/test.jsonl"); + load_jsonl_file(&mut db, fixture_path, LoadMode::Overwrite) + .await + .unwrap(); + + let snap = snapshot_main(&db).await.unwrap(); + let ds = snap.open("node:Person").await.unwrap(); + assert_eq!(ds.count_rows(None).await.unwrap(), 4); +} + +// ─── Signals fixture (complex @key schema) ────────────────────────────────── + +#[tokio::test] +async fn signals_fixture_loads_correctly() { + let schema = include_str!("fixtures/signals.pg"); + let data = include_str!("fixtures/signals.jsonl"); + + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, schema).await.unwrap(); + load_jsonl(&mut db, data, LoadMode::Overwrite) + .await + .unwrap(); + + let snap = snapshot_main(&db).await.unwrap(); + + // Verify some types have data + let company_ds = snap.open("node:Company").await.unwrap(); + assert!(company_ds.count_rows(None).await.unwrap() > 0); + + // Verify node IDs are @key values (slug) + let batches: Vec = company_ds + .scan() + .try_into_stream() + .await + .unwrap() + .try_collect() + .await + .unwrap(); + let ids = collect_column_strings(&batches, "id"); + // Should contain slug values like "aws", "openai", etc. 
+ assert!(ids.contains(&"aws".to_string())); + assert!(ids.contains(&"openai".to_string())); +} + +// ─── Query execution ──────────────────────────────────────────────────────── + +#[tokio::test] +async fn query_get_person_by_name() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + + let result = query_main( + &mut db, + TEST_QUERIES, + "get_person", + ¶ms(&[("$name", "Alice")]), + ) + .await + .unwrap(); + + assert_eq!(result.num_rows(), 1); + let batch = &result.batches()[0]; + let names = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(names.value(0), "Alice"); + + let ages = batch + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(ages.value(0), 30); +} + +#[tokio::test] +async fn query_get_person_not_found() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + + let result = query_main( + &mut db, + TEST_QUERIES, + "get_person", + ¶ms(&[("$name", "Nobody")]), + ) + .await + .unwrap(); + + assert_eq!(result.num_rows(), 0); +} + +#[tokio::test] +async fn query_adults_filtered_and_ordered() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + + let result = query_main(&mut db, TEST_QUERIES, "adults", &ParamMap::new()) + .await + .unwrap(); + + // Only Charlie (35) matches age > 30, ordered desc + assert_eq!(result.num_rows(), 1); + let batch = &result.batches()[0]; + let names = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(names.value(0), "Charlie"); +} + +#[tokio::test] +async fn query_top_by_age_with_limit() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + + let result = query_main(&mut db, TEST_QUERIES, "top_by_age", &ParamMap::new()) + .await + .unwrap(); + + // Top 2 by age desc: Charlie (35), Alice (30) + assert_eq!(result.num_rows(), 2); + let batch = &result.batches()[0]; + let names = batch + .column(0) + .as_any() + 
.downcast_ref::() + .unwrap(); + assert_eq!(names.value(0), "Charlie"); + assert_eq!(names.value(1), "Alice"); + + let ages = batch + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(ages.value(0), 35); + assert_eq!(ages.value(1), 30); +} + +// ─── Graph traversal ───────────────────────────────────────────────────── + +#[tokio::test] +async fn query_friends_of() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + + let result = query_main( + &mut db, + TEST_QUERIES, + "friends_of", + ¶ms(&[("$name", "Alice")]), + ) + .await + .unwrap(); + + // Alice knows Bob and Charlie + let batch = result.concat_batches().unwrap(); + let names = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + let mut friend_names: Vec<&str> = (0..names.len()).map(|i| names.value(i)).collect(); + friend_names.sort(); + assert_eq!(friend_names, vec!["Bob", "Charlie"]); +} + +#[tokio::test] +async fn query_employees_of() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + + let result = query_main( + &mut db, + TEST_QUERIES, + "employees_of", + ¶ms(&[("$company", "Acme")]), + ) + .await + .unwrap(); + + // Alice works at Acme (reverse traversal) + let batch = result.concat_batches().unwrap(); + let names = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(names.len(), 1); + assert_eq!(names.value(0), "Alice"); +} + +#[tokio::test] +async fn query_friends_of_friends() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + + let result = query_main( + &mut db, + TEST_QUERIES, + "friends_of_friends", + ¶ms(&[("$name", "Alice")]), + ) + .await + .unwrap(); + + // Alice→Bob→Diana (Alice→Charlie→nobody) + let batch = result.concat_batches().unwrap(); + let names = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + let mut fof_names: Vec<&str> = (0..names.len()).map(|i| names.value(i)).collect(); + fof_names.sort(); + 
assert_eq!(fof_names, vec!["Diana"]); +} + +#[tokio::test] +async fn query_unemployed() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + + let result = query_main(&mut db, TEST_QUERIES, "unemployed", &ParamMap::new()) + .await + .unwrap(); + + // Charlie and Diana have no WorksAt edges + let batch = result.concat_batches().unwrap(); + let names = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + let mut unemployed: Vec<&str> = (0..names.len()).map(|i| names.value(i)).collect(); + unemployed.sort(); + assert_eq!(unemployed, vec!["Charlie", "Diana"]); +} + +#[tokio::test] +async fn query_anti_join_all_have_edges() { + let schema = r#" +node Person { name: String @key } +node Company { name: String @key } +edge WorksAt: Person -> Company +"#; + let data = r#"{"type": "Person", "data": {"name": "Alice"}} +{"type": "Person", "data": {"name": "Bob"}} +{"type": "Company", "data": {"name": "Acme"}} +{"edge": "WorksAt", "from": "Alice", "to": "Acme"} +{"edge": "WorksAt", "from": "Bob", "to": "Acme"} +"#; + let queries = r#" +query unemployed() { + match { + $p: Person + not { $p worksAt $_ } + } + return { $p.name } +} +"#; + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, schema).await.unwrap(); + load_jsonl(&mut db, data, LoadMode::Overwrite) + .await + .unwrap(); + + let result = query_main(&mut db, queries, "unemployed", &ParamMap::new()) + .await + .unwrap(); + + // Everyone has a WorksAt edge → empty result + assert_eq!(result.num_rows(), 0); +} + +// ─── Mutations ─────────────────────────────────────────────────────────────── + +#[tokio::test] +async fn mutation_insert_node() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + + let result = mutate_main( + &mut db, + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "Eve")], &[("$age", 22)]), + ) + .await + .unwrap(); + + 
assert_eq!(result.affected_nodes, 1); + assert_eq!(result.affected_edges, 0); + + // Query it back + let qr = query_main( + &mut db, + TEST_QUERIES, + "get_person", + ¶ms(&[("$name", "Eve")]), + ) + .await + .unwrap(); + assert_eq!(qr.num_rows(), 1); + let batch = &qr.batches()[0]; + let names = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(names.value(0), "Eve"); +} + +#[tokio::test] +async fn mutation_insert_edge() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + + // Insert Eve + mutate_main( + &mut db, + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "Eve")], &[("$age", 22)]), + ) + .await + .unwrap(); + + // Add edge Eve → Alice + let result = mutate_main( + &mut db, + MUTATION_QUERIES, + "add_friend", + ¶ms(&[("$from", "Eve"), ("$to", "Alice")]), + ) + .await + .unwrap(); + + assert_eq!(result.affected_nodes, 0); + assert_eq!(result.affected_edges, 1); + + // Verify traversal + let qr = query_main( + &mut db, + TEST_QUERIES, + "friends_of", + ¶ms(&[("$name", "Eve")]), + ) + .await + .unwrap(); + assert_eq!(qr.num_rows(), 1); + let batch = qr.concat_batches().unwrap(); + let names = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(names.value(0), "Alice"); +} + +#[tokio::test] +async fn mutation_update_node() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + + let result = mutate_main( + &mut db, + MUTATION_QUERIES, + "set_age", + &mixed_params(&[("$name", "Alice")], &[("$age", 31)]), + ) + .await + .unwrap(); + + assert_eq!(result.affected_nodes, 1); + assert_eq!(result.affected_edges, 0); + + // Verify the update + let qr = query_main( + &mut db, + TEST_QUERIES, + "get_person", + ¶ms(&[("$name", "Alice")]), + ) + .await + .unwrap(); + assert_eq!(qr.num_rows(), 1); + let batch = &qr.batches()[0]; + let ages = batch + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(ages.value(0), 31); +} 
+ +#[tokio::test] +async fn mutation_delete_node_cascades_edges() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + + // Alice has: 2 outgoing Knows (Alice→Bob, Alice→Charlie) + 1 WorksAt (Alice→Acme) = 3 edges + let result = mutate_main( + &mut db, + MUTATION_QUERIES, + "remove_person", + ¶ms(&[("$name", "Alice")]), + ) + .await + .unwrap(); + + assert_eq!(result.affected_nodes, 1); + assert!( + result.affected_edges >= 3, + "expected at least 3 cascaded edges, got {}", + result.affected_edges + ); + + // Alice should be gone + let qr = query_main( + &mut db, + TEST_QUERIES, + "get_person", + ¶ms(&[("$name", "Alice")]), + ) + .await + .unwrap(); + assert_eq!(qr.num_rows(), 0); + + // Verify no edges reference Alice + let snap = snapshot_main(&db).await.unwrap(); + for edge_key in &["edge:Knows", "edge:WorksAt"] { + let ds = snap.open(edge_key).await.unwrap(); + let batches: Vec = ds + .scan() + .try_into_stream() + .await + .unwrap() + .try_collect() + .await + .unwrap(); + for batch in &batches { + let srcs = batch + .column_by_name("src") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let dsts = batch + .column_by_name("dst") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + for i in 0..batch.num_rows() { + assert_ne!( + srcs.value(i), + "Alice", + "found edge src=Alice in {}", + edge_key + ); + assert_ne!( + dsts.value(i), + "Alice", + "found edge dst=Alice in {}", + edge_key + ); + } + } + } +} + +#[tokio::test] +async fn mutation_delete_edge() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + + // Delete all Knows edges from Alice (Alice→Bob, Alice→Charlie) + let result = mutate_main( + &mut db, + MUTATION_QUERIES, + "remove_friendship", + ¶ms(&[("$from", "Alice")]), + ) + .await + .unwrap(); + + assert_eq!(result.affected_nodes, 0); + assert_eq!(result.affected_edges, 2); + + // Alice should still exist + let qr = query_main( + &mut db, + TEST_QUERIES, + 
"get_person", + ¶ms(&[("$name", "Alice")]), + ) + .await + .unwrap(); + assert_eq!(qr.num_rows(), 1); + + // But has no friends + let qr = query_main( + &mut db, + TEST_QUERIES, + "friends_of", + ¶ms(&[("$name", "Alice")]), + ) + .await + .unwrap(); + assert_eq!(qr.num_rows(), 0); +} + +#[tokio::test] +async fn mutation_insert_duplicate_key_upserts() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + + // Alice already exists with age=30. Insert again with age=99. + let result = mutate_main( + &mut db, + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "Alice")], &[("$age", 99)]), + ) + .await + .unwrap(); + + assert_eq!(result.affected_nodes, 1); + + // Should still be exactly 1 Alice (upsert, not duplicate) + let qr = query_main( + &mut db, + TEST_QUERIES, + "get_person", + ¶ms(&[("$name", "Alice")]), + ) + .await + .unwrap(); + assert_eq!(qr.num_rows(), 1); + + // Age should be updated to 99 + let batch = &qr.batches()[0]; + let ages = batch + .column(1) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(ages.value(0), 99); +} + +#[tokio::test] +async fn mutation_update_key_property_rejected() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + + let queries = r#" +query rename_person($old_name: String, $new_name: String) { + update Person set { name: $new_name } where name = $old_name +} +"#; + + let result = mutate_main( + &mut db, + queries, + "rename_person", + ¶ms(&[("$old_name", "Alice"), ("$new_name", "Bob")]), + ) + .await; + + assert!(result.is_err()); + let err = result.unwrap_err().to_string(); + assert!(err.contains("@key"), "error should mention @key: {}", err); +} + +// ─── Blob support ──────────────────────────────────────────────────────────── + +const BLOB_SCHEMA: &str = r#" +node Document { + title: String @key + content: Blob? 
+} +"#; + +const BLOB_QUERIES: &str = r#" +query all_docs() { + match { $d: Document } + return { $d.title, $d.content } +} + +query get_doc($title: String) { + match { $d: Document { title: $title } } + return { $d.title, $d.content } +} +"#; + +const BLOB_MUTATIONS: &str = r#" +query insert_doc($title: String, $content: Blob) { + insert Document { title: $title, content: $content } +} + +query update_doc_content($title: String, $content: Blob) { + update Document set { content: $content } where title = $title +} +"#; + +#[tokio::test] +async fn blob_schema_parses_and_init_succeeds() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let db = Omnigraph::init(uri, BLOB_SCHEMA).await.unwrap(); + + assert!( + db.catalog().node_types["Document"] + .blob_properties + .contains("content") + ); + assert_eq!(db.catalog().node_types["Document"].properties.len(), 2); +} + +#[tokio::test] +async fn blob_load_base64_inline() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, BLOB_SCHEMA).await.unwrap(); + + // "Hello World" = "SGVsbG8gV29ybGQ=" + let data = r#"{"type": "Document", "data": {"title": "readme", "content": "base64:SGVsbG8gV29ybGQ="}} +{"type": "Document", "data": {"title": "empty"}} +"#; + load_jsonl(&mut db, data, LoadMode::Overwrite) + .await + .unwrap(); + + let snap = snapshot_main(&db).await.unwrap(); + let ds = snap.open("node:Document").await.unwrap(); + assert_eq!(ds.count_rows(None).await.unwrap(), 2); +} + +#[tokio::test] +async fn blob_query_returns_metadata() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, BLOB_SCHEMA).await.unwrap(); + + let data = r#"{"type": "Document", "data": {"title": "readme", "content": "base64:SGVsbG8gV29ybGQ="}}"#; + load_jsonl(&mut db, data, LoadMode::Overwrite) + .await + .unwrap(); + + let result = query_main( + &mut db, + BLOB_QUERIES, + 
"get_doc", + ¶ms(&[("$title", "readme")]), + ) + .await + .unwrap(); + + assert_eq!(result.num_rows(), 1); + + let json = result.to_sdk_json(); + let row = json.as_array().unwrap().first().unwrap(); + assert_eq!(row["d.title"], "readme"); + // Blob columns return null in query projections — data is accessed via take_blobs API. + // (Lance bug: BlobsDescriptions + filter triggers assertion, so blobs are excluded from scan) + assert!( + row["d.content"].is_null(), + "blob column should return null in query projection" + ); +} + +#[tokio::test] +async fn blob_null_returns_null_in_query() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, BLOB_SCHEMA).await.unwrap(); + + let data = r#"{"type": "Document", "data": {"title": "empty"}}"#; + load_jsonl(&mut db, data, LoadMode::Overwrite) + .await + .unwrap(); + + let result = query_main( + &mut db, + BLOB_QUERIES, + "get_doc", + ¶ms(&[("$title", "empty")]), + ) + .await + .unwrap(); + + assert_eq!(result.num_rows(), 1); + let json = result.to_sdk_json(); + let row = json.as_array().unwrap().first().unwrap(); + assert_eq!(row["d.title"], "empty"); + // Nullable blob with no value should return null + assert!( + row["d.content"].is_null(), + "null blob should return null, got: {}", + row["d.content"] + ); +} + +#[tokio::test] +async fn blob_insert_mutation() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, BLOB_SCHEMA).await.unwrap(); + + let result = mutate_main( + &mut db, + BLOB_MUTATIONS, + "insert_doc", + ¶ms(&[("$title", "new-doc"), ("$content", "base64:AQID")]), + ) + .await + .unwrap(); + + assert_eq!(result.affected_nodes, 1); + + // Query it back + let qr = query_main( + &mut db, + BLOB_QUERIES, + "get_doc", + ¶ms(&[("$title", "new-doc")]), + ) + .await + .unwrap(); + assert_eq!(qr.num_rows(), 1); + let json = qr.to_sdk_json(); + let row = 
json.as_array().unwrap().first().unwrap(); + assert_eq!(row["d.title"], "new-doc"); + // Blob column present but null in query projection (data accessed via take_blobs) + assert!( + row.get("d.content").is_some(), + "content column should be present" + ); +} + +#[tokio::test] +async fn blob_update_mutation() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, BLOB_SCHEMA).await.unwrap(); + + // First insert a doc with blob + mutate_main( + &mut db, + BLOB_MUTATIONS, + "insert_doc", + ¶ms(&[("$title", "updatable"), ("$content", "base64:AQID")]), + ) + .await + .unwrap(); + + // Update the blob + let result = mutate_main( + &mut db, + BLOB_MUTATIONS, + "update_doc_content", + ¶ms(&[("$title", "updatable"), ("$content", "base64:BAUG")]), + ) + .await + .unwrap(); + + assert_eq!(result.affected_nodes, 1); + + let blob = db + .read_blob("Document", "updatable", "content") + .await + .unwrap(); + let bytes = blob.read().await.unwrap(); + assert_eq!(&bytes[..], &[4, 5, 6]); +} + +// ─── Blob read API ─────────────────────────────────────────────────────── + +#[tokio::test] +async fn blob_read_returns_bytes() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, BLOB_SCHEMA).await.unwrap(); + + // "Hello World" = base64 "SGVsbG8gV29ybGQ=" + let data = r#"{"type": "Document", "data": {"title": "readme", "content": "base64:SGVsbG8gV29ybGQ="}}"#; + load_jsonl(&mut db, data, LoadMode::Overwrite) + .await + .unwrap(); + + let blob = db.read_blob("Document", "readme", "content").await.unwrap(); + assert_eq!(blob.size(), 11); // "Hello World" = 11 bytes + + let bytes = blob.read().await.unwrap(); + assert_eq!(&bytes[..], b"Hello World"); +} + +#[tokio::test] +async fn blob_read_not_found_errors() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, BLOB_SCHEMA).await.unwrap(); 
+ + let data = r#"{"type": "Document", "data": {"title": "readme", "content": "base64:SGVsbG8="}}"#; + load_jsonl(&mut db, data, LoadMode::Overwrite) + .await + .unwrap(); + + // Non-existent ID + let err = db.read_blob("Document", "nonexistent", "content").await; + assert!(err.is_err()); + + // Non-blob property + let err = db.read_blob("Document", "readme", "title").await; + assert!(err.is_err()); +} + +#[tokio::test] +async fn blob_read_after_mutation_insert() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, BLOB_SCHEMA).await.unwrap(); + + // Insert via mutation (base64 for bytes [1, 2, 3]) + mutate_main( + &mut db, + BLOB_MUTATIONS, + "insert_doc", + ¶ms(&[("$title", "inserted"), ("$content", "base64:AQID")]), + ) + .await + .unwrap(); + + let blob = db + .read_blob("Document", "inserted", "content") + .await + .unwrap(); + let bytes = blob.read().await.unwrap(); + assert_eq!(&bytes[..], &[1, 2, 3]); +} + +// ─── Blob low-level: probe BlobHandling::BlobsDescriptions ─────────────── + +#[tokio::test] +async fn blob_scan_with_descriptions_on_nonempty_dataset() { + use lance::datatypes::BlobHandling; + + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, BLOB_SCHEMA).await.unwrap(); + + let data = r#"{"type": "Document", "data": {"title": "readme", "content": "base64:SGVsbG8gV29ybGQ="}}"#; + load_jsonl(&mut db, data, LoadMode::Overwrite) + .await + .unwrap(); + + // Open the dataset directly and try BlobsDescriptions + let snap = snapshot_main(&db).await.unwrap(); + let ds = snap.open("node:Document").await.unwrap(); + assert_eq!(ds.count_rows(None).await.unwrap(), 1); + + // BlobsDescriptions works without filter + let mut scanner = ds.scan(); + scanner.blob_handling(BlobHandling::BlobsDescriptions); + let stream = scanner.try_into_stream().await.unwrap(); + let batches: Vec = stream.try_collect().await.unwrap(); + 
assert_eq!(batches.len(), 1); + assert_eq!(batches[0].num_rows(), 1); + + // Blob descriptor is a struct with kind, position, size, blob_id, blob_uri + let content_col = batches[0].column_by_name("content").unwrap(); + assert!( + matches!(content_col.data_type(), arrow_schema::DataType::Struct(_)), + "blob column should be Struct, got {:?}", + content_col.data_type() + ); +} + +// ─── Constraint enforcement ────────────────────────────────────────────────── + +#[tokio::test] +async fn range_constraint_rejects_out_of_bounds() { + let schema = r#" +node Person { + name: String @key + age: I32? + @range(age, 0..200) +} +"#; + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, schema).await.unwrap(); + + // age = 300 exceeds max of 200 + let data = r#"{"type": "Person", "data": {"name": "Old", "age": 300}}"#; + let result = load_jsonl(&mut db, data, LoadMode::Overwrite).await; + assert!(result.is_err(), "expected range violation"); + let err = result.unwrap_err().to_string(); + assert!(err.contains("@range violation"), "error: {}", err); +} + +#[tokio::test] +async fn range_constraint_allows_within_bounds() { + let schema = r#" +node Person { + name: String @key + age: I32? + @range(age, 0..200) +} +"#; + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, schema).await.unwrap(); + + let data = r#"{"type": "Person", "data": {"name": "Alice", "age": 30}}"#; + load_jsonl(&mut db, data, LoadMode::Overwrite) + .await + .unwrap(); + + let snap = snapshot_main(&db).await.unwrap(); + let ds = snap.open("node:Person").await.unwrap(); + assert_eq!(ds.count_rows(None).await.unwrap(), 1); +} + +#[tokio::test] +async fn range_constraint_float_rejects_out_of_bounds() { + let schema = r#" +node Measurement { + name: String @key + temperature: F64? 
+ @range(temperature, 0.0..100.0) +} +"#; + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, schema).await.unwrap(); + + let data = r#"{"type": "Measurement", "data": {"name": "hot", "temperature": 150.5}}"#; + let result = load_jsonl(&mut db, data, LoadMode::Overwrite).await; + assert!(result.is_err(), "expected range violation for float"); + let err = result.unwrap_err().to_string(); + assert!(err.contains("@range violation"), "error: {}", err); +} + +#[tokio::test] +async fn range_constraint_float_allows_within_bounds() { + let schema = r#" +node Measurement { + name: String @key + temperature: F64? + @range(temperature, 0.0..100.0) +} +"#; + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, schema).await.unwrap(); + + let data = r#"{"type": "Measurement", "data": {"name": "warm", "temperature": 37.5}}"#; + load_jsonl(&mut db, data, LoadMode::Overwrite) + .await + .unwrap(); + + let snap = snapshot_main(&db).await.unwrap(); + let ds = snap.open("node:Measurement").await.unwrap(); + assert_eq!(ds.count_rows(None).await.unwrap(), 1); +} + +#[tokio::test] +async fn range_constraint_negative_float_bounds() { + let schema = r#" +node Measurement { + name: String @key + temperature: F64? 
+ @range(temperature, -40.0..60.0) +} +"#; + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, schema).await.unwrap(); + + // Within bounds — should succeed + let data = r#"{"type": "Measurement", "data": {"name": "cold", "temperature": -20.0}}"#; + load_jsonl(&mut db, data, LoadMode::Overwrite) + .await + .unwrap(); + + // Below minimum — should fail + let data = r#"{"type": "Measurement", "data": {"name": "arctic", "temperature": -50.0}}"#; + let result = load_jsonl(&mut db, data, LoadMode::Overwrite).await; + assert!(result.is_err(), "expected range violation for -50.0"); + let err = result.unwrap_err().to_string(); + assert!(err.contains("@range violation"), "error: {}", err); +} + +#[tokio::test] +async fn check_constraint_rejects_bad_pattern() { + let schema = r#" +node Order { + code: String @key + @check(code, "^[A-Z]{3}-[0-9]+$") +} +"#; + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, schema).await.unwrap(); + + let data = r#"{"type": "Order", "data": {"code": "invalid"}}"#; + let result = load_jsonl(&mut db, data, LoadMode::Overwrite).await; + assert!(result.is_err(), "expected check violation"); + let err = result.unwrap_err().to_string(); + assert!(err.contains("@check violation"), "error: {}", err); +} + +#[tokio::test] +async fn check_constraint_allows_matching_pattern() { + let schema = r#" +node Order { + code: String @key + @check(code, "^[A-Z]{3}-[0-9]+$") +} +"#; + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, schema).await.unwrap(); + + let data = r#"{"type": "Order", "data": {"code": "ABC-123"}}"#; + load_jsonl(&mut db, data, LoadMode::Overwrite) + .await + .unwrap(); + + let snap = snapshot_main(&db).await.unwrap(); + let ds = snap.open("node:Order").await.unwrap(); + assert_eq!(ds.count_rows(None).await.unwrap(), 1); +} + 
+#[tokio::test] +async fn mutation_insert_rejects_range_violation() { + let schema = r#" +node Person { + name: String @key + age: I32? + @range(age, 0..200) +} +"#; + let queries = r#" +query insert_person($name: String, $age: I32) { + insert Person { name: $name, age: $age } +} +"#; + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, schema).await.unwrap(); + + let result = mutate_main(&mut db, queries, "insert_person", &{ + let mut p = omnigraph_compiler::ir::ParamMap::new(); + p.insert( + "name".to_string(), + omnigraph_compiler::query::ast::Literal::String("Old".to_string()), + ); + p.insert( + "age".to_string(), + omnigraph_compiler::query::ast::Literal::Integer(300), + ); + p + }) + .await; + assert!(result.is_err(), "expected range violation"); + let err = result.unwrap_err().to_string(); + assert!(err.contains("@range violation"), "error: {}", err); +} + +#[tokio::test] +async fn mutation_update_rejects_range_violation() { + let schema = r#" +node Person { + name: String @key + age: I32? 
+ @range(age, 0..200) +} +"#; + let queries = r#" +query set_age($name: String, $age: I32) { + update Person set { age: $age } where name = $name +} +"#; + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, schema).await.unwrap(); + load_jsonl( + &mut db, + r#"{"type": "Person", "data": {"name": "Alice", "age": 30}}"#, + LoadMode::Overwrite, + ) + .await + .unwrap(); + + let result = mutate_main(&mut db, queries, "set_age", &{ + let mut p = omnigraph_compiler::ir::ParamMap::new(); + p.insert( + "name".to_string(), + omnigraph_compiler::query::ast::Literal::String("Alice".to_string()), + ); + p.insert( + "age".to_string(), + omnigraph_compiler::query::ast::Literal::Integer(300), + ); + p + }) + .await; + assert!(result.is_err(), "expected range violation"); + let err = result.unwrap_err().to_string(); + assert!(err.contains("@range violation"), "error: {}", err); +} + +#[tokio::test] +async fn mutation_insert_rejects_check_violation() { + let schema = r#" +node Order { + code: String @key + @check(code, "^[A-Z]{3}-[0-9]+$") +} +"#; + let queries = r#" +query insert_order($code: String) { + insert Order { code: $code } +} +"#; + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, schema).await.unwrap(); + + let result = mutate_main(&mut db, queries, "insert_order", &{ + let mut p = omnigraph_compiler::ir::ParamMap::new(); + p.insert( + "code".to_string(), + omnigraph_compiler::query::ast::Literal::String("invalid".to_string()), + ); + p + }) + .await; + assert!(result.is_err(), "expected check violation"); + let err = result.unwrap_err().to_string(); + assert!(err.contains("@check violation"), "error: {}", err); +} + +#[tokio::test] +async fn mutation_update_rejects_check_violation() { + let schema = r#" +node Order { + code: String @key + label: String? 
+ @check(label, "^[A-Z]+$") +} +"#; + let queries = r#" +query set_label($code: String, $label: String) { + update Order set { label: $label } where code = $code +} +"#; + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, schema).await.unwrap(); + load_jsonl( + &mut db, + r#"{"type": "Order", "data": {"code": "ABC-123", "label": "VALID"}}"#, + LoadMode::Overwrite, + ) + .await + .unwrap(); + + let result = mutate_main(&mut db, queries, "set_label", &{ + let mut p = omnigraph_compiler::ir::ParamMap::new(); + p.insert( + "code".to_string(), + omnigraph_compiler::query::ast::Literal::String("ABC-123".to_string()), + ); + p.insert( + "label".to_string(), + omnigraph_compiler::query::ast::Literal::String("invalid".to_string()), + ); + p + }) + .await; + assert!(result.is_err(), "expected check violation"); + let err = result.unwrap_err().to_string(); + assert!(err.contains("@check violation"), "error: {}", err); +} + +#[tokio::test] +async fn edge_cardinality_max_enforced() { + let schema = r#" +node Person { name: String @key } +node Company { name: String @key } +edge WorksAt: Person -> Company @card(0..1) +"#; + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, schema).await.unwrap(); + + // Alice works at two companies — violates @card(0..1) + let data = r#"{"type": "Person", "data": {"name": "Alice"}} +{"type": "Company", "data": {"name": "Acme"}} +{"type": "Company", "data": {"name": "Globex"}} +{"edge": "WorksAt", "from": "Alice", "to": "Acme"} +{"edge": "WorksAt", "from": "Alice", "to": "Globex"} +"#; + let result = load_jsonl(&mut db, data, LoadMode::Overwrite).await; + assert!(result.is_err(), "expected cardinality violation"); + let err = result.unwrap_err().to_string(); + assert!(err.contains("@card violation"), "error: {}", err); +} + +#[tokio::test] +async fn edge_cardinality_allows_within_bounds() { + let schema = r#" 
+node Person { name: String @key } +node Company { name: String @key } +edge WorksAt: Person -> Company @card(0..1) +"#; + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, schema).await.unwrap(); + + let data = r#"{"type": "Person", "data": {"name": "Alice"}} +{"type": "Company", "data": {"name": "Acme"}} +{"edge": "WorksAt", "from": "Alice", "to": "Acme"} +"#; + load_jsonl(&mut db, data, LoadMode::Overwrite) + .await + .unwrap(); + + let snap = snapshot_main(&db).await.unwrap(); + let ds = snap.open("edge:WorksAt").await.unwrap(); + assert_eq!(ds.count_rows(None).await.unwrap(), 1); +} + +// ─── Regression: apply_assignments with blob mid-schema ────────────────────── + +#[tokio::test] +async fn update_with_blob_mid_schema_does_not_panic() { + // Blob column in the MIDDLE of schema — not last. This previously caused + // a column-index mismatch in apply_assignments (batch.column(idx) used + // schema position but the batch had blob columns excluded from projection). + let schema = r#" +node Article { + slug: String @key + attachment: Blob? + summary: String? + rating: I32? 
+} +"#; + let mutations = r#" +query insert_article($slug: String, $summary: String, $rating: I32) { + insert Article { slug: $slug, summary: $summary, rating: $rating } +} +query update_summary($slug: String, $summary: String) { + update Article set { summary: $summary } where slug = $slug +} +query get_article($slug: String) { + match { $a: Article { slug: $slug } } + return { $a.slug, $a.summary, $a.rating } +} +"#; + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, schema).await.unwrap(); + + mutate_main( + &mut db, + mutations, + "insert_article", + &mixed_params( + &[("$slug", "a1"), ("$summary", "hello")], + &[("$rating", 42)], + ), + ) + .await + .unwrap(); + + // This would panic with the old batch.column(idx) code + let result = mutate_main( + &mut db, + mutations, + "update_summary", + ¶ms(&[("$slug", "a1"), ("$summary", "updated")]), + ) + .await + .unwrap(); + assert_eq!(result.affected_nodes, 1); + + // Verify the update applied correctly + let qr = query_main( + &mut db, + mutations, + "get_article", + ¶ms(&[("$slug", "a1")]), + ) + .await + .unwrap(); + assert_eq!(qr.num_rows(), 1); +} + +// ─── Regression: blob update null → non-null ───────────────────────────────── + +#[tokio::test] +async fn blob_update_null_to_non_null() { + // Regression: updating a blob column that was previously all-null panicked + // with assertion `left: 0, right: 1` in lance-table stream.rs because the + // two-phase blob update sent a blob-only batch to merge_insert on a dataset + // with zero blob fragments. + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, BLOB_SCHEMA).await.unwrap(); + + // Load a row with blob = null (no blob data in dataset) + let data = r#"{"type": "Document", "data": {"title": "kid-a"}}"#; + load_jsonl(&mut db, data, LoadMode::Overwrite) + .await + .unwrap(); + + // Update: null → non-null blob. 
Previously panicked with assertion + // `left: 0, right: 1` in lance-table stream.rs. + let result = mutate_main( + &mut db, + BLOB_MUTATIONS, + "update_doc_content", + ¶ms(&[("$title", "kid-a"), ("$content", "base64:AQID")]), + ) + .await + .unwrap(); + assert_eq!(result.affected_nodes, 1); + + let blob = db.read_blob("Document", "kid-a", "content").await.unwrap(); + let bytes = blob.read().await.unwrap(); + assert_eq!(&bytes[..], &[1, 2, 3]); +} + +// ─── Regression: blob load with external file URI ──────────────────────────── + +#[tokio::test] +async fn blob_load_external_file_uri() { + // Regression: loading blobs with external file:// URIs was rejected with + // "External blob URI '...' is outside registered external bases" because + // allow_external_blob_outside_bases was not set on data table write paths. + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + + // Create a temp file to reference + let blob_dir = tempfile::tempdir().unwrap(); + let blob_path = blob_dir.path().join("test.txt"); + std::fs::write(&blob_path, b"Hello from file").unwrap(); + let file_uri = format!("file://{}", blob_path.display()); + + let mut db = Omnigraph::init(uri, BLOB_SCHEMA).await.unwrap(); + let data = format!( + r#"{{"type": "Document", "data": {{"title": "from-file", "content": "{}"}}}}"#, + file_uri + ); + + // Load with external URI + load_jsonl(&mut db, &data, LoadMode::Overwrite) + .await + .unwrap(); + + // Verify the blob is accessible + let blob = db + .read_blob("Document", "from-file", "content") + .await + .unwrap(); + assert!(blob.uri().is_some(), "external blob should have a URI"); +} + +// ─── Regression: execute_update on edge type ───────────────────────────────── + +#[tokio::test] +async fn update_edge_type_returns_error_not_panic() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + + // The typechecker should reject this, but even if bypassed, + // execute_update must not panic with 
HashMap key-not-found. + let mutations = r#" +query update_edge($from: String) { + update Knows set { since: "2025-01-01" } where from = $from +} +"#; + let result = mutate_main( + &mut db, + mutations, + "update_edge", + ¶ms(&[("$from", "Alice")]), + ) + .await; + assert!(result.is_err(), "should return error, not panic"); +} + +// ─── Regression: Date/DateTime SQL literal escaping ────────────────────────── + +#[tokio::test] +async fn date_literal_with_quote_is_escaped() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + + // A date-like value with a single-quote must not cause SQL injection. + // This tests that literal_to_sql escapes Date/DateTime values. + let queries = r#" +query filter_date($d: String) { + match { $p: Person { name: $d } } + return { $p.name } +} +"#; + // Pass a value with a single-quote — should not error or return all rows + let result = query_main( + &mut db, + queries, + "filter_date", + ¶ms(&[("$d", "2025-01-01' OR '1'='1")]), + ) + .await + .unwrap(); + assert_eq!(result.num_rows(), 0); +} + +// ─── Regression: manifest row_count tracks total, not batch size ───────────── + +#[tokio::test] +async fn append_mode_manifest_row_count_is_total() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; // Overwrite: 4 persons + + let extra = r#"{"type": "Person", "data": {"name": "Eve", "age": 22}}"#; + load_jsonl(&mut db, extra, LoadMode::Append).await.unwrap(); + + let snap = snapshot_main(&db).await.unwrap(); + let entry = snap.entry("node:Person").unwrap(); + // Must be total rows (4 + 1 = 5), not just the appended batch size (1) + assert_eq!(entry.row_count, 5); + + // Verify actual dataset count matches manifest + let ds = snap.open("node:Person").await.unwrap(); + assert_eq!(ds.count_rows(None).await.unwrap() as u64, entry.row_count); +} + +// ─── Regression: cardinality violation must not commit manifest ─────────────── + +#[tokio::test] +async fn 
cardinality_violation_does_not_commit_manifest() { + let schema = r#" +node Person { name: String @key } +node Company { name: String @key } +edge WorksAt: Person -> Company @card(0..1) +"#; + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, schema).await.unwrap(); + + // Alice works at two companies — violates @card(0..1) (at most 1) + let data = r#" +{"type": "Person", "data": {"name": "Alice"}} +{"type": "Company", "data": {"name": "Acme"}} +{"type": "Company", "data": {"name": "Beta"}} +{"edge": "WorksAt", "from": "Alice", "to": "Acme"} +{"edge": "WorksAt", "from": "Alice", "to": "Beta"} +"#; + + let v_before = version_main(&db).await.unwrap(); + let result = load_jsonl(&mut db, data, LoadMode::Overwrite).await; + assert!(result.is_err(), "cardinality violation should be rejected"); + assert!( + result.unwrap_err().to_string().contains("@card violation"), + "error should mention @card" + ); + + // Manifest must NOT have advanced — invalid data was not committed + assert_eq!(version_main(&db).await.unwrap(), v_before); +} + +// ─── Regression: dangling edge references are rejected ─────────────────────── + +#[tokio::test] +async fn dangling_edge_dst_rejected_on_load() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap(); + + let data = r#" +{"type": "Person", "data": {"name": "Alice", "age": 30}} +{"type": "Company", "data": {"name": "Acme"}} +{"edge": "Knows", "from": "Alice", "to": "NonExistent"} +"#; + let result = load_jsonl(&mut db, data, LoadMode::Overwrite).await; + assert!(result.is_err(), "dangling edge dst should be rejected"); + let err = result.unwrap_err().to_string(); + assert!( + err.contains("not found"), + "error should mention 'not found': {}", + err + ); +} + +#[tokio::test] +async fn dangling_edge_src_rejected_on_load() { + let dir = tempfile::tempdir().unwrap(); + let uri = 
dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap(); + + let data = r#" +{"type": "Person", "data": {"name": "Alice", "age": 30}} +{"type": "Company", "data": {"name": "Acme"}} +{"edge": "WorksAt", "from": "Ghost", "to": "Acme"} +"#; + let result = load_jsonl(&mut db, data, LoadMode::Overwrite).await; + assert!(result.is_err(), "dangling edge src should be rejected"); + let err = result.unwrap_err().to_string(); + assert!( + err.contains("not found"), + "error should mention 'not found': {}", + err + ); +} + +// ─── Regression: ensure_indices is idempotent ──────────────────────────────── + +#[tokio::test] +async fn ensure_indices_does_not_error_on_repeated_call() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + let version_after_load = version_main(&db).await.unwrap(); + + // load commits now enforce required indices; repeated ensure_indices calls + // should be a no-op at the manifest level. + db.ensure_indices().await.unwrap(); + let version_after_first = version_main(&db).await.unwrap(); + db.ensure_indices().await.unwrap(); + let version_after_second = version_main(&db).await.unwrap(); + + assert_eq!(version_after_first, version_after_load); + assert_eq!(version_after_second, version_after_load); + + // Data should still be queryable after index operations + let snap = snapshot_main(&db).await.unwrap(); + let ds = snap.open("node:Person").await.unwrap(); + assert_eq!(ds.count_rows(None).await.unwrap(), 4); +} diff --git a/crates/omnigraph/tests/export.rs b/crates/omnigraph/tests/export.rs new file mode 100644 index 0000000..696ade9 --- /dev/null +++ b/crates/omnigraph/tests/export.rs @@ -0,0 +1,183 @@ +mod helpers; + +use arrow_array::{Array, StringArray}; + +use omnigraph::db::{Omnigraph, ReadTarget}; +use omnigraph::loader::{LoadMode, load_jsonl}; + +use helpers::*; + +const EXPORT_MUTATIONS: &str = r#" +query insert_person($name: String, $age: I32) { + insert Person { name: 
$name, age: $age } +} + +query add_friend($from: String, $to: String) { + insert Knows { from: $from, to: $to } +} +"#; + +const NOTE_SCHEMA: &str = r#" +node Note { + text: String +} + +edge References: Note -> Note +"#; + +const NOTE_DATA: &str = r#" +{"type":"Note","data":{"id":"note-1","text":"Alpha"}} +{"type":"Note","data":{"id":"note-2","text":"Beta"}} +{"edge":"References","from":"note-1","to":"note-2","data":{"id":"edge-1"}} +"#; + +#[tokio::test] +async fn export_jsonl_round_trips_branch_snapshot() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + db.branch_create_from(ReadTarget::branch("main"), "feature") + .await + .unwrap(); + db.mutate( + "feature", + EXPORT_MUTATIONS, + "insert_person", + &mixed_params(&[("$name", "Eve")], &[("$age", 29)]), + ) + .await + .unwrap(); + db.mutate( + "feature", + EXPORT_MUTATIONS, + "add_friend", + ¶ms(&[("$from", "Eve"), ("$to", "Alice")]), + ) + .await + .unwrap(); + + let main_jsonl = db.export_jsonl("main", &[], &[]).await.unwrap(); + let feature_jsonl = db.export_jsonl("feature", &[], &[]).await.unwrap(); + + let imported_main_dir = tempfile::tempdir().unwrap(); + let imported_feature_dir = tempfile::tempdir().unwrap(); + let mut imported_main = + Omnigraph::init(imported_main_dir.path().to_str().unwrap(), TEST_SCHEMA) + .await + .unwrap(); + let mut imported_feature = + Omnigraph::init(imported_feature_dir.path().to_str().unwrap(), TEST_SCHEMA) + .await + .unwrap(); + load_jsonl(&mut imported_main, &main_jsonl, LoadMode::Overwrite) + .await + .unwrap(); + load_jsonl(&mut imported_feature, &feature_jsonl, LoadMode::Overwrite) + .await + .unwrap(); + + assert_eq!(count_rows(&db, "node:Person").await, 4); + assert_eq!(count_rows_branch(&db, "feature", "node:Person").await, 5); + assert_eq!(count_rows(&imported_main, "node:Person").await, 4); + assert_eq!(count_rows(&imported_feature, "node:Person").await, 5); + assert_eq!(count_rows(&imported_main, "edge:Knows").await, 3); + 
assert_eq!(count_rows(&imported_feature, "edge:Knows").await, 4); +} + +#[tokio::test] +async fn export_jsonl_preserves_explicit_ids_for_non_key_graphs() { + let dir = tempfile::tempdir().unwrap(); + let mut db = Omnigraph::init(dir.path().to_str().unwrap(), NOTE_SCHEMA) + .await + .unwrap(); + load_jsonl(&mut db, NOTE_DATA, LoadMode::Overwrite) + .await + .unwrap(); + + let exported = db.export_jsonl("main", &[], &[]).await.unwrap(); + + let imported_dir = tempfile::tempdir().unwrap(); + let mut imported = Omnigraph::init(imported_dir.path().to_str().unwrap(), NOTE_SCHEMA) + .await + .unwrap(); + load_jsonl(&mut imported, &exported, LoadMode::Overwrite) + .await + .unwrap(); + + let node_batches = read_table(&imported, "node:Note").await; + let node_ids = collect_column_strings(&node_batches, "id"); + assert_eq!(node_ids, vec!["note-1".to_string(), "note-2".to_string()]); + + let edge_batches = read_table(&imported, "edge:References").await; + let edge_ids = collect_column_strings(&edge_batches, "id"); + assert_eq!(edge_ids, vec!["edge-1".to_string()]); + + let srcs = edge_batches[0] + .column_by_name("src") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let dsts = edge_batches[0] + .column_by_name("dst") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(srcs.value(0), "note-1"); + assert_eq!(dsts.value(0), "note-2"); +} + +// ─── Regression: export with blob columns ──────────────────────────────────── + +#[tokio::test] +async fn export_jsonl_with_blob_type() { + // Regression: export on types with blob columns failed with + // "Schema error: Can not append column _rowaddr on schema" because + // Lance 4's take_blobs duplicated _rowaddr on the unsorted path. + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + + const BLOB_SCHEMA: &str = r#" +node Document { + title: String @key + content: Blob? 
+} +"#; + + let mut db = Omnigraph::init(uri, BLOB_SCHEMA).await.unwrap(); + let data = concat!( + "{\"type\": \"Document\", \"data\": {\"title\": \"readme\", \"content\": \"base64:SGVsbG8=\"}}\n", + "{\"type\": \"Document\", \"data\": {\"title\": \"empty\"}}\n", + ); + load_jsonl(&mut db, data, LoadMode::Overwrite) + .await + .unwrap(); + + // Export should succeed + let exported = db.export_jsonl("main", &[], &[]).await.unwrap(); + assert!( + exported.contains("readme"), + "export should contain readme doc" + ); + + // Verify blob value is in the export + assert!( + exported.contains("base64:") || exported.contains("SGVsbG8"), + "export should contain blob data as base64" + ); + + // Round-trip: re-import and verify blob data survives + let imported_dir = tempfile::tempdir().unwrap(); + let imported_uri = imported_dir.path().to_str().unwrap(); + let mut imported = Omnigraph::init(imported_uri, BLOB_SCHEMA).await.unwrap(); + load_jsonl(&mut imported, &exported, LoadMode::Overwrite) + .await + .unwrap(); + + let blob = imported + .read_blob("Document", "readme", "content") + .await + .unwrap(); + let bytes = blob.read().await.unwrap(); + assert_eq!(&bytes[..], b"Hello"); +} diff --git a/crates/omnigraph/tests/failpoints.rs b/crates/omnigraph/tests/failpoints.rs new file mode 100644 index 0000000..c1ca555 --- /dev/null +++ b/crates/omnigraph/tests/failpoints.rs @@ -0,0 +1,47 @@ +#![cfg(feature = "failpoints")] + +mod helpers; + +use fail::FailScenario; +use omnigraph::db::Omnigraph; +use omnigraph::failpoints::ScopedFailPoint; + +use helpers::{MUTATION_QUERIES, mixed_params}; + +#[tokio::test] +async fn branch_create_failpoint_triggers() { + let _scenario = FailScenario::setup(); + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, helpers::TEST_SCHEMA).await.unwrap(); + let _failpoint = ScopedFailPoint::new("branch_create.after_manifest_branch_create", "return"); + + let err = 
db.branch_create("feature").await.unwrap_err(); + assert!( + err.to_string() + .contains("injected failpoint triggered: branch_create.after_manifest_branch_create") + ); +} + +#[tokio::test] +async fn graph_publish_failpoint_triggers_before_commit_append() { + let _scenario = FailScenario::setup(); + let dir = tempfile::tempdir().unwrap(); + let mut db = Omnigraph::init(dir.path().to_str().unwrap(), helpers::TEST_SCHEMA) + .await + .unwrap(); + let _failpoint = ScopedFailPoint::new("graph_publish.before_commit_append", "return"); + + let err = mutate_main( + &mut db, + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "Eve")], &[("$age", 22)]), + ) + .await + .unwrap_err(); + assert!( + err.to_string() + .contains("injected failpoint triggered: graph_publish.before_commit_append") + ); +} diff --git a/crates/omnigraph/tests/fixtures/context.jsonl b/crates/omnigraph/tests/fixtures/context.jsonl new file mode 100644 index 0000000..ee09a0a --- /dev/null +++ b/crates/omnigraph/tests/fixtures/context.jsonl @@ -0,0 +1,13 @@ +{"type": "Actor", "data": {"slug": "aaron", "name": "Aaron"}} +{"type": "Actor", "data": {"slug": "bruno", "name": "Bruno"}} +{"type": "Actor", "data": {"slug": "jorge", "name": "Jorge"}} +{"type": "Actor", "data": {"slug": "muneeb", "name": "Muneeb"}} +{"type": "Actor", "data": {"slug": "ragnor", "name": "Ragnor"}} +{"type": "Actor", "data": {"slug": "andrew", "name": "Andrew"}} +{"type": "Signal", "data": {"slug": "zylon-private-ai-platform", "title": "Zylon.ai positions as complete on-premise enterprise AI platform for regulated industries", "body": "Zylon.ai positions itself as a complete, fully private (100% on-premise) enterprise AI platform built explicitly for regulated industries, emphasizing air-gapped deployability, data sovereignty (no external cloud dependency), and predictable fixed-cost economics (no per-token pricing).", "category": "competitor", "strength": "strong", "observed_at": "2026-03-27", "source": 
"https://www.zylon.ai/"}} +{"type": "Decision", "data": {"slug": "create-360-ai-infra", "title": "Create 360 AI Infra offering", "body": "Build a comprehensive 360-degree AI infrastructure offering in response to competitors like Zylon positioning complete on-premise AI platforms for regulated industries.", "status": "proposed", "urgency": "high", "decided_at": "2026-03-27"}} +{"type": "Trace", "data": {"slug": "jorge-spots-zylon", "title": "Jorge spots Zylon.ai competitor signal", "body": "Jorge identified Zylon.ai as a new competitor positioning a fully private enterprise AI platform targeting regulated industries with air-gapped deployment and fixed-cost pricing.", "kind": "note", "recorded_at": "2026-03-27", "source": "https://www.zylon.ai/"}} +{"edge": "OwnedBy", "from": "create-360-ai-infra", "to": "andrew"} +{"edge": "RecordedBy", "from": "jorge-spots-zylon", "to": "jorge"} +{"edge": "Triggered", "from": "zylon-private-ai-platform", "to": "create-360-ai-infra"} +{"edge": "Supports", "from": "jorge-spots-zylon", "to": "create-360-ai-infra"} diff --git a/crates/omnigraph/tests/fixtures/context.pg b/crates/omnigraph/tests/fixtures/context.pg new file mode 100644 index 0000000..906075d --- /dev/null +++ b/crates/omnigraph/tests/fixtures/context.pg @@ -0,0 +1,78 @@ +// Context graph: decisions, the people behind them, +// the evidence trail, and market signals that inform them. + +// ── Nodes ──────────────────────────────────────────── + +node Actor { + slug: String @key + name: String + email: String? @unique +} + +node Decision { + slug: String @key + title: String @index + body: String? + status: enum(proposed, accepted, rejected, superseded) + urgency: enum(low, normal, high, critical) + decided_at: Date? +} + +node Trace { + slug: String @key + title: String @index + body: String? + kind: enum(note, discussion, experiment, review, meeting, document) + recorded_at: Date + source: String? 
+} + +node Signal { + slug: String @key + title: String @index + body: String? + category: enum(competitor, market, regulatory, technology, customer) + strength: enum(strong, moderate, weak) + observed_at: Date + source: String? +} + +node Artifact { + slug: String @key + title: String @index + kind: enum(doc, presentation, proposal, spec, report, memo) + url: String? + created_at: Date +} + +// ── Ownership / participation ──────────────────────── + +edge OwnedBy: Decision -> Actor @card(1..1) + +edge ParticipatedIn: Actor -> Decision + +edge RecordedBy: Trace -> Actor @card(1..1) + +edge AuthoredBy: Artifact -> Actor @card(1..1) + +// ── Evidence trail ─────────────────────────────────── + +edge Supports: Trace -> Decision + +edge Attached: Artifact -> Decision + +edge CitedIn: Artifact -> Trace + +// ── Signal linkage ─────────────────────────────────── + +edge Triggered: Signal -> Decision + +edge Correlates: Signal -> Signal { + @unique(src, dst) +} + +// ── Decision lineage ───────────────────────────────── + +edge Supersedes: Decision -> Decision { + @unique(src, dst) +} diff --git a/crates/omnigraph/tests/fixtures/revops_large_signal.md b/crates/omnigraph/tests/fixtures/revops_large_signal.md new file mode 100644 index 0000000..39995dc --- /dev/null +++ b/crates/omnigraph/tests/fixtures/revops_large_signal.md @@ -0,0 +1,48 @@ +# Enterprise Procurement Risk Memo + +## Situation +The buyer entered procurement for annual renewal and asked for a bundled proposal that combines platform licensing, implementation support, and SLA uplift. +Legal requested two rounds of redlines and now requires explicit language for data residency, deletion timelines, and subprocessors. +Security asked for the full questionnaire, pen-test summary, SOC evidence package, and a named escalation owner for incident response coordination. +Finance requested price hold terms through quarter close and requires a clean net amount with no conditional side letters. 
+ +## Current Friction +The commercial owner reports that each team is operating on a different timeline. +Procurement prefers a single consolidated response packet, but legal and security are still updating separate drafts. +The buyer champion is supportive but cannot route final approval until the redline and security sections are complete. +Two approvers are out next week, which introduces a calendar risk for final signoff. + +## Evidence From Recent Calls +- Buyer said the risk is not product fit, it is internal process load. +- Procurement requested one owner for all responses to avoid thread drift. +- Legal asked for an explicit breach notification interval in the MSA. +- Security flagged third-party dependency disclosure as incomplete. +- Finance asked for forecast certainty before they release PO authority. + +## Operational Notes +1. The account team should treat this as a coordination problem, not a persuasion problem. +2. Every open item needs owner, due date, and blocking dependency. +3. Replies should be centralized in one tracker to avoid inconsistent statements. +4. Escalation should happen early when legal language depends on security attestations. +5. The champion should get a concise status summary after each workday. + +## Risk Register +- **Timeline risk:** medium-high due to calendar compression and approver availability. +- **Compliance risk:** medium due to unresolved security questionnaire fields. +- **Commercial risk:** medium because procurement is requesting fixed pricing through quarter end. +- **Execution risk:** high if response ownership remains fragmented across teams. + +## Recommended Plan +Create a single response packet and assign one coordinator. +Pre-fill all known legal and security answers from existing templates. +Schedule a thirty-minute cross-functional triage with legal, security, and sales operations. +Lock a daily cutoff time for updates and send one canonical status note to stakeholders. 
+Escalate unresolved blockers to leadership forty-eight hours before target sign date. + +## Success Criteria +- Security questionnaire submitted with no unresolved critical fields. +- Redline package accepted or narrowed to non-blocking items. +- Pricing terms approved by finance for the requested window. +- Purchase order process initiated before the internal close date. +- Owner confirms all blocker tickets are either resolved or explicitly waived. + diff --git a/crates/omnigraph/tests/fixtures/search.gq b/crates/omnigraph/tests/fixtures/search.gq new file mode 100644 index 0000000..c39af82 --- /dev/null +++ b/crates/omnigraph/tests/fixtures/search.gq @@ -0,0 +1,44 @@ +query text_search($q: String) { + match { + $d: Doc + search($d.title, $q) + } + return { $d.slug, $d.title } +} + +query fuzzy_search($q: String) { + match { + $d: Doc + fuzzy($d.title, $q, 2) + } + return { $d.slug, $d.title } +} + +query phrase_search($q: String) { + match { + $d: Doc + match_text($d.body, $q) + } + return { $d.slug, $d.title } +} + +query vector_search($q: Vector(4)) { + match { $d: Doc } + return { $d.slug, $d.title } + order { nearest($d.embedding, $q) } + limit 3 +} + +query bm25_search($q: String) { + match { $d: Doc } + return { $d.slug, $d.title } + order { bm25($d.title, $q) } + limit 3 +} + +query hybrid_search($vq: Vector(4), $tq: String) { + match { $d: Doc } + return { $d.slug, $d.title } + order { rrf(nearest($d.embedding, $vq), bm25($d.title, $tq)) } + limit 3 +} diff --git a/crates/omnigraph/tests/fixtures/search.jsonl b/crates/omnigraph/tests/fixtures/search.jsonl new file mode 100644 index 0000000..5b3eb80 --- /dev/null +++ b/crates/omnigraph/tests/fixtures/search.jsonl @@ -0,0 +1,5 @@ +{"type": "Doc", "data": {"slug": "ml-intro", "title": "Introduction to Machine Learning", "body": "Machine learning is a subset of artificial intelligence that focuses on algorithms", "embedding": [0.1, 0.2, 0.3, 0.4]}} +{"type": "Doc", "data": {"slug": "dl-basics", "title": 
"Deep Learning Basics", "body": "Deep learning uses neural networks with many layers to learn representations", "embedding": [0.5, 0.6, 0.7, 0.8]}} +{"type": "Doc", "data": {"slug": "nlp-guide", "title": "Natural Language Processing Guide", "body": "NLP applies machine learning to understand and generate human language", "embedding": [0.2, 0.3, 0.4, 0.5]}} +{"type": "Doc", "data": {"slug": "cv-overview", "title": "Computer Vision Overview", "body": "Computer vision enables machines to interpret visual information from images", "embedding": [0.8, 0.7, 0.6, 0.5]}} +{"type": "Doc", "data": {"slug": "rl-intro", "title": "Reinforcement Learning Introduction", "body": "Reinforcement learning trains agents through reward and punishment signals", "embedding": [0.3, 0.4, 0.5, 0.6]}} diff --git a/crates/omnigraph/tests/fixtures/search.pg b/crates/omnigraph/tests/fixtures/search.pg new file mode 100644 index 0000000..3a2d88b --- /dev/null +++ b/crates/omnigraph/tests/fixtures/search.pg @@ -0,0 +1,6 @@ +node Doc { + slug: String @key + title: String @index + body: String @index + embedding: Vector(4) +} diff --git a/crates/omnigraph/tests/fixtures/signals.jsonl b/crates/omnigraph/tests/fixtures/signals.jsonl new file mode 100644 index 0000000..d6ba67a --- /dev/null +++ b/crates/omnigraph/tests/fixtures/signals.jsonl @@ -0,0 +1,46 @@ +{"type": "Company", "data": {"slug": "aws", "name": "AWS", "sector": "hyperscaler"}} +{"type": "Company", "data": {"slug": "cerebras", "name": "Cerebras", "sector": "chipmaker"}} +{"type": "Company", "data": {"slug": "vast", "name": "VAST Data", "sector": "startup"}} +{"type": "Company", "data": {"slug": "oracle", "name": "Oracle", "sector": "hyperscaler"}} +{"type": "Company", "data": {"slug": "benchmark", "name": "Benchmark", "sector": "investor"}} +{"type": "Company", "data": {"slug": "xai", "name": "xAI", "sector": "lab"}} +{"type": "Company", "data": {"slug": "openai", "name": "OpenAI", "sector": "lab"}} +{"type": "Company", "data": {"slug": 
"anthropic", "name": "Anthropic", "sector": "lab"}} +{"type": "Company", "data": {"slug": "nvidia", "name": "NVIDIA", "sector": "chipmaker"}} +{"type": "Tech", "data": {"slug": "cs3", "name": "CS-3", "kind": "infra", "tier": "growth"}} +{"type": "Tech", "data": {"slug": "trainium", "name": "Trainium", "kind": "infra", "tier": "growth"}} +{"type": "Tech", "data": {"slug": "grok", "name": "Grok", "kind": "model", "tier": "emerging"}} +{"type": "Tech", "data": {"slug": "vast-ai-os", "name": "VAST AI OS", "kind": "platform", "tier": "growth"}} +{"type": "Signal", "data": {"slug": "aws-cerebras-inference", "title": "AWS and Cerebras collaborate on disaggregated inference: Trainium for prefill, CS-3 for decode, 5x capacity", "source": "press-release", "strength": "strong", "observed": "2026-03-13"}} +{"type": "Signal", "data": {"slug": "vast-1b-raise", "title": "VAST Data raises $1B at $30B, unveils AI OS bundling storage, compute, and agent runtimes into one stack", "source": "funding-round", "strength": "strong", "observed": "2026-03-12"}} +{"type": "Signal", "data": {"slug": "oracle-cerebras-mention", "title": "Oracle names Cerebras alongside NVIDIA and AMD as enterprise AI chip option", "source": "earnings-call", "strength": "moderate", "observed": "2026-03-10"}} +{"type": "Signal", "data": {"slug": "cerebras-23b-round", "title": "Cerebras valued at $23B after $1B round; Benchmark raises $225M in special vehicles to double down", "source": "funding-round", "strength": "strong", "observed": "2026-02-04"}} +{"type": "Signal", "data": {"slug": "xai-field-engineers", "title": "xAI deploys engineers on-site at enterprise clients to win deals from OpenAI and Anthropic", "source": "reporting", "strength": "strong", "observed": "2026-03-20"}} +{"type": "Pattern", "data": {"slug": "rise-of-fde", "name": "Rise of FDE", "category": "expansion"}} +{"type": "Pattern", "data": {"slug": "alt-chip-breakout", "name": "Alt-chip breakout", "category": "adoption"}} +{"type": "Pattern", 
"data": {"slug": "stack-collapse", "name": "Stack collapse", "category": "convergence"}} +{"type": "Pattern", "data": {"slug": "inference-specialization", "name": "Inference specialization", "category": "convergence"}} +{"edge": "Builds", "from": "cerebras", "to": "cs3"} +{"edge": "Builds", "from": "aws", "to": "trainium"} +{"edge": "Builds", "from": "xai", "to": "grok"} +{"edge": "Builds", "from": "vast", "to": "vast-ai-os"} +{"edge": "FundedBy", "from": "cerebras", "to": "benchmark", "data": {"amount": "$225M"}} +{"edge": "PartnersWith", "from": "aws", "to": "cerebras"} +{"edge": "PartnersWith", "from": "cerebras", "to": "openai"} +{"edge": "Mentions", "from": "aws-cerebras-inference", "to": "cs3"} +{"edge": "Mentions", "from": "aws-cerebras-inference", "to": "trainium"} +{"edge": "Mentions", "from": "vast-1b-raise", "to": "vast-ai-os"} +{"edge": "Mentions", "from": "cerebras-23b-round", "to": "cs3"} +{"edge": "Mentions", "from": "xai-field-engineers", "to": "grok"} +{"edge": "Indicates", "from": "aws-cerebras-inference", "to": "alt-chip-breakout"} +{"edge": "Indicates", "from": "aws-cerebras-inference", "to": "inference-specialization"} +{"edge": "Indicates", "from": "vast-1b-raise", "to": "stack-collapse"} +{"edge": "Indicates", "from": "oracle-cerebras-mention", "to": "alt-chip-breakout"} +{"edge": "Indicates", "from": "cerebras-23b-round", "to": "alt-chip-breakout"} +{"edge": "Indicates", "from": "xai-field-engineers", "to": "rise-of-fde"} +{"edge": "Involves", "from": "rise-of-fde", "to": "grok"} +{"edge": "Involves", "from": "alt-chip-breakout", "to": "cs3"} +{"edge": "Involves", "from": "alt-chip-breakout", "to": "trainium"} +{"edge": "Involves", "from": "stack-collapse", "to": "vast-ai-os"} +{"edge": "Involves", "from": "inference-specialization", "to": "cs3"} +{"edge": "Involves", "from": "inference-specialization", "to": "trainium"} diff --git a/crates/omnigraph/tests/fixtures/signals.pg b/crates/omnigraph/tests/fixtures/signals.pg new file mode 100644 
index 0000000..65499bd --- /dev/null +++ b/crates/omnigraph/tests/fixtures/signals.pg @@ -0,0 +1,44 @@ +// Industry signals around AI models and major players. +// Branch use case: main tracks confirmed signals, branches +// model speculative or emerging interpretations. + +node Signal { + slug: String @key + title: String + source: String + strength: enum(strong, moderate, weak) + observed: Date? +} + +node Pattern { + slug: String @key + name: String + category: enum(adoption, churn, expansion, contraction, convergence) +} + +node Tech { + slug: String @key + name: String + kind: enum(model, platform, infra, framework, tool) + tier: enum(emerging, growth, mature, declining) +} + +node Company { + slug: String @key + name: String + sector: enum(lab, hyperscaler, chipmaker, investor, startup) +} + +edge Indicates: Signal -> Pattern + +edge Mentions: Signal -> Tech + +edge Involves: Pattern -> Tech + +edge Builds: Company -> Tech + +edge FundedBy: Company -> Company { + amount: String? +} + +edge PartnersWith: Company -> Company diff --git a/crates/omnigraph/tests/fixtures/test.gq b/crates/omnigraph/tests/fixtures/test.gq new file mode 100644 index 0000000..daf03ed --- /dev/null +++ b/crates/omnigraph/tests/fixtures/test.gq @@ -0,0 +1,78 @@ +// Basic: find person by name +query get_person($name: String) { + match { + $p: Person { name: $name } + } + return { $p.name, $p.age } +} + +// Filter by age +query adults() { + match { + $p: Person + $p.age > 30 + } + return { $p.name, $p.age } + order { $p.age desc } +} + +// One hop traversal +query friends_of($name: String) { + match { + $p: Person { name: $name } + $p knows $f + } + return { $f.name, $f.age } +} + +// Reverse traversal: who works at a company +query employees_of($company: String) { + match { + $c: Company { name: $company } + $p worksAt $c + } + return { $p.name } +} + +// Two hop: friends of friends +query friends_of_friends($name: String) { + match { + $p: Person { name: $name } + $p knows $mid + $mid 
knows $fof + } + return { $fof.name } +} + +// Negation: people who don't work anywhere +query unemployed() { + match { + $p: Person + not { $p worksAt $_ } + } + return { $p.name } +} + +// Aggregation: friend count +query friend_counts() { + match { + $p: Person + $p knows $f + } + return { + $p.name + count($f) as friends + } + order { friends desc } + limit 20 +} + +// Order and limit +query top_by_age() { + match { + $p: Person + } + return { $p.name, $p.age } + order { $p.age desc } + limit 2 +} diff --git a/crates/omnigraph/tests/fixtures/test.jsonl b/crates/omnigraph/tests/fixtures/test.jsonl new file mode 100644 index 0000000..7d2dafc --- /dev/null +++ b/crates/omnigraph/tests/fixtures/test.jsonl @@ -0,0 +1,11 @@ +{"type": "Person", "data": {"name": "Alice", "age": 30}} +{"type": "Person", "data": {"name": "Bob", "age": 25}} +{"type": "Person", "data": {"name": "Charlie", "age": 35}} +{"type": "Person", "data": {"name": "Diana", "age": 28}} +{"type": "Company", "data": {"name": "Acme"}} +{"type": "Company", "data": {"name": "Globex"}} +{"edge": "Knows", "from": "Alice", "to": "Bob"} +{"edge": "Knows", "from": "Alice", "to": "Charlie"} +{"edge": "Knows", "from": "Bob", "to": "Diana"} +{"edge": "WorksAt", "from": "Alice", "to": "Acme"} +{"edge": "WorksAt", "from": "Bob", "to": "Globex"} diff --git a/crates/omnigraph/tests/fixtures/test.pg b/crates/omnigraph/tests/fixtures/test.pg new file mode 100644 index 0000000..6dcf9ce --- /dev/null +++ b/crates/omnigraph/tests/fixtures/test.pg @@ -0,0 +1,14 @@ +node Person { + name: String @key + age: I32? +} + +node Company { + name: String @key +} + +edge Knows: Person -> Person { + since: Date? 
+} + +edge WorksAt: Person -> Company diff --git a/crates/omnigraph/tests/helpers/mod.rs b/crates/omnigraph/tests/helpers/mod.rs new file mode 100644 index 0000000..d70ab17 --- /dev/null +++ b/crates/omnigraph/tests/helpers/mod.rs @@ -0,0 +1,256 @@ +#![allow(dead_code)] + +use arrow_array::{Array, RecordBatch, StringArray}; +use futures::TryStreamExt; + +use omnigraph::changes::{ChangeFilter, ChangeSet}; +use omnigraph::db::{Omnigraph, ReadTarget, Snapshot, SnapshotId}; +use omnigraph::error::Result; +use omnigraph::loader::{LoadMode, load_jsonl}; +use omnigraph_compiler::ir::ParamMap; +use omnigraph_compiler::query::ast::Literal; +use omnigraph_compiler::result::{MutationResult, QueryResult}; + +pub const TEST_SCHEMA: &str = include_str!("../fixtures/test.pg"); +pub const TEST_DATA: &str = include_str!("../fixtures/test.jsonl"); +pub const TEST_QUERIES: &str = include_str!("../fixtures/test.gq"); + +pub const MUTATION_QUERIES: &str = r#" +query insert_person($name: String, $age: I32) { + insert Person { name: $name, age: $age } +} + +query add_friend($from: String, $to: String) { + insert Knows { from: $from, to: $to } +} + +query set_age($name: String, $age: I32) { + update Person set { age: $age } where name = $name +} + +query remove_person($name: String) { + delete Person where name = $name +} + +query remove_friendship($from: String) { + delete Knows where from = $from +} +"#; + +/// Init a repo and load the standard test data. +pub async fn init_and_load(dir: &tempfile::TempDir) -> Omnigraph { + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap(); + load_jsonl(&mut db, TEST_DATA, LoadMode::Overwrite) + .await + .unwrap(); + db +} + +/// Read all rows from a sub-table by table_key. 
+pub async fn read_table(db: &Omnigraph, table_key: &str) -> Vec<RecordBatch> { + let snap = snapshot_main(db).await.unwrap(); + let ds = snap.open(table_key).await.unwrap(); + ds.scan() + .try_into_stream() + .await + .unwrap() + .try_collect() + .await + .unwrap() +} + +/// Read all rows from a branch-local sub-table by table_key. +pub async fn read_table_branch(db: &Omnigraph, branch: &str, table_key: &str) -> Vec<RecordBatch> { + let snap = snapshot_branch(db, branch).await.unwrap(); + let ds = snap.open(table_key).await.unwrap(); + ds.scan() + .try_into_stream() + .await + .unwrap() + .try_collect() + .await + .unwrap() +} + +/// Count rows in a sub-table. +pub async fn count_rows(db: &Omnigraph, table_key: &str) -> usize { + let snap = snapshot_main(db).await.unwrap(); + let ds = snap.open(table_key).await.unwrap(); + ds.count_rows(None).await.unwrap() +} + +/// Count rows in a branch-local sub-table. +pub async fn count_rows_branch(db: &Omnigraph, branch: &str, table_key: &str) -> usize { + let snap = snapshot_branch(db, branch).await.unwrap(); + let ds = snap.open(table_key).await.unwrap(); + ds.count_rows(None).await.unwrap() +} + +/// Collect all string values from a named column across batches.
+pub fn collect_column_strings(batches: &[RecordBatch], col: &str) -> Vec { + let mut out = Vec::new(); + for batch in batches { + let arr = batch + .column_by_name(col) + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + for i in 0..arr.len() { + if !arr.is_null(i) { + out.push(arr.value(i).to_string()); + } + } + } + out +} + +pub async fn query_main( + db: &mut Omnigraph, + query_source: &str, + query_name: &str, + params: &ParamMap, +) -> Result { + db.query(ReadTarget::branch("main"), query_source, query_name, params) + .await +} + +pub async fn query_branch( + db: &mut Omnigraph, + branch: &str, + query_source: &str, + query_name: &str, + params: &ParamMap, +) -> Result { + db.query(ReadTarget::branch(branch), query_source, query_name, params) + .await +} + +pub async fn mutate_main( + db: &mut Omnigraph, + query_source: &str, + query_name: &str, + params: &ParamMap, +) -> Result { + db.mutate("main", query_source, query_name, params).await +} + +pub async fn mutate_branch( + db: &mut Omnigraph, + branch: &str, + query_source: &str, + query_name: &str, + params: &ParamMap, +) -> Result { + db.mutate(branch, query_source, query_name, params).await +} + +pub async fn snapshot_main(db: &Omnigraph) -> Result { + db.snapshot_of(ReadTarget::branch("main")).await +} + +pub async fn snapshot_branch(db: &Omnigraph, branch: &str) -> Result { + db.snapshot_of(ReadTarget::branch(branch)).await +} + +pub async fn version_main(db: &Omnigraph) -> Result { + db.version_of(ReadTarget::branch("main")).await +} + +pub async fn version_branch(db: &Omnigraph, branch: &str) -> Result { + db.version_of(ReadTarget::branch(branch)).await +} + +pub async fn sync_main(db: &mut Omnigraph) -> Result<()> { + db.sync_branch("main").await +} + +pub async fn sync_named_branch(db: &mut Omnigraph, branch: &str) -> Result<()> { + db.sync_branch(branch).await +} + +pub async fn snapshot_id(db: &Omnigraph, branch: &str) -> Result { + db.resolve_snapshot(branch).await +} + +pub async fn 
diff_since_branch( + db: &Omnigraph, + branch: &str, + from_snapshot: SnapshotId, + filter: &ChangeFilter, +) -> Result<ChangeSet> { + db.diff_between( + ReadTarget::Snapshot(from_snapshot), + ReadTarget::branch(branch), + filter, + ) + .await +} + +/// Build a ParamMap from string key-value pairs. +pub fn params(pairs: &[(&str, &str)]) -> ParamMap { + pairs + .iter() + .map(|(k, v)| { + let key = k.strip_prefix('$').unwrap_or(k); + (key.to_string(), Literal::String(v.to_string())) + }) + .collect() +} + +/// Build a ParamMap from integer key-value pairs. +pub fn int_params(pairs: &[(&str, i64)]) -> ParamMap { + pairs + .iter() + .map(|(k, v)| { + let key = k.strip_prefix('$').unwrap_or(k); + (key.to_string(), Literal::Integer(*v)) + }) + .collect() +} + +/// Build a ParamMap from mixed string + integer pairs. +pub fn mixed_params(str_pairs: &[(&str, &str)], int_pairs: &[(&str, i64)]) -> ParamMap { + let mut map = params(str_pairs); + for (k, v) in int_pairs { + let key = k.strip_prefix('$').unwrap_or(k); + map.insert(key.to_string(), Literal::Integer(*v)); + } + map +} + +/// Build a ParamMap with a single vector parameter. +pub fn vector_param(name: &str, values: &[f32]) -> ParamMap { + let key = name.strip_prefix('$').unwrap_or(name).to_string(); + let lit = Literal::List(values.iter().map(|v| Literal::Float(*v as f64)).collect()); + let mut map = ParamMap::new(); + map.insert(key, lit); + map +} + +/// Build a ParamMap with a vector param and a string param.
+pub fn vector_and_string_params( + vec_name: &str, + vec_values: &[f32], + str_name: &str, + str_value: &str, +) -> ParamMap { + let mut map = vector_param(vec_name, vec_values); + let key = str_name.strip_prefix('$').unwrap_or(str_name).to_string(); + map.insert(key, Literal::String(str_value.to_string())); + map +} + +pub fn s3_test_repo_uri(suite: &str) -> Option<String> { + let bucket = std::env::var("OMNIGRAPH_S3_TEST_BUCKET").ok()?; + let prefix = std::env::var("OMNIGRAPH_S3_TEST_PREFIX") + .ok() + .filter(|value| !value.trim().is_empty()) + .unwrap_or_else(|| "omnigraph-itests".to_string()); + let unique = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .ok()? + .as_nanos(); + Some(format!("s3://{}/{}/{}/{}", bucket, prefix, suite, unique)) +} diff --git a/crates/omnigraph/tests/lance_version_columns.rs b/crates/omnigraph/tests/lance_version_columns.rs new file mode 100644 index 0000000..b9367b9 --- /dev/null +++ b/crates/omnigraph/tests/lance_version_columns.rs @@ -0,0 +1,268 @@ +/// Investigation test: understand how Lance stamps `_row_created_at_version` and +/// `_row_last_updated_at_version` for different write modes (append, merge_insert new, +/// merge_insert update).
+use std::sync::Arc; + +use arrow_array::{Array, Int32Array, RecordBatch, RecordBatchIterator, StringArray, UInt64Array}; +use arrow_schema::{DataType, Field, Schema}; +use futures::TryStreamExt; +use lance::Dataset; +use lance::dataset::{WriteMode, WriteParams}; +use lance_file::version::LanceFileVersion; + +async fn create_test_dataset(uri: &str) -> Dataset { + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Utf8, false), + Field::new("value", DataType::Int32, false), + ])); + + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(StringArray::from(vec!["alice", "bob"])), + Arc::new(Int32Array::from(vec![1, 2])), + ], + ) + .unwrap(); + + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema); + let params = WriteParams { + mode: WriteMode::Create, + enable_stable_row_ids: true, + data_storage_version: Some(LanceFileVersion::V2_2), + ..Default::default() + }; + Dataset::write(reader, uri, Some(params)).await.unwrap() +} + +fn read_version_columns(batches: &[RecordBatch]) -> Vec<(String, i32, u64, u64)> { + let mut rows = Vec::new(); + for batch in batches { + let ids = batch + .column_by_name("id") + .unwrap() + .as_any() + .downcast_ref::<StringArray>() + .unwrap(); + let vals = batch + .column_by_name("value") + .unwrap() + .as_any() + .downcast_ref::<Int32Array>() + .unwrap(); + let created = batch + .column_by_name("_row_created_at_version") + .unwrap() + .as_any() + .downcast_ref::<UInt64Array>() + .unwrap(); + let updated = batch + .column_by_name("_row_last_updated_at_version") + .unwrap() + .as_any() + .downcast_ref::<UInt64Array>() + .unwrap(); + for i in 0..ids.len() { + rows.push(( + ids.value(i).to_string(), + vals.value(i), + created.value(i), + updated.value(i), + )); + } + } + rows.sort_by(|a, b| a.0.cmp(&b.0)); + rows +} + +async fn scan_with_versions(ds: &Dataset) -> Vec<(String, i32, u64, u64)> { + let mut scanner = ds.scan(); + scanner + .project(&[ + "id", + "value", + "_row_created_at_version", + "_row_last_updated_at_version", + ]) + .unwrap();
+ let batches: Vec<RecordBatch> = scanner + .try_into_stream() + .await + .unwrap() + .try_collect() + .await + .unwrap(); + read_version_columns(&batches) +} + +#[tokio::test] +async fn lance_append_stamps_created_at_version_correctly() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().join("test.lance"); + let uri_str = uri.to_str().unwrap(); + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Utf8, false), + Field::new("value", DataType::Int32, false), + ])); + + let ds = create_test_dataset(uri_str).await; + let v1 = ds.version().version; + + // Append a new row + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(StringArray::from(vec!["charlie"])), + Arc::new(Int32Array::from(vec![3])), + ], + ) + .unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema); + let mut ds = ds; + ds.append(reader, None).await.unwrap(); + let v2 = ds.version().version; + + let rows = scan_with_versions(&ds).await; + eprintln!("After append (v1={}, v2={}):", v1, v2); + for (id, val, created, updated) in &rows { + eprintln!( + " id={:<10} val={:<4} created_v={:<4} updated_v={}", + id, val, created, updated + ); + } + + // Alice and Bob: created at v1 + let alice = rows.iter().find(|r| r.0 == "alice").unwrap(); + assert_eq!(alice.2, v1, "alice created_at should be v1"); + + // Charlie: created at v2 (the append version) + let charlie = rows.iter().find(|r| r.0 == "charlie").unwrap(); + assert_eq!( + charlie.2, v2, + "charlie created_at should be v2 (append version)" + ); +} + +#[tokio::test] +async fn lance_merge_insert_new_row_stamps_created_at_version() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().join("test.lance"); + let uri_str = uri.to_str().unwrap(); + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Utf8, false), + Field::new("value", DataType::Int32, false), + ])); + + let ds = create_test_dataset(uri_str).await; + let v1 = ds.version().version; + + // merge_insert a NEW
row (eve doesn't exist) + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(StringArray::from(vec!["eve"])), + Arc::new(Int32Array::from(vec![4])), + ], + ) + .unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema); + let ds_arc = Arc::new(ds); + let job = lance::dataset::MergeInsertBuilder::try_new(ds_arc, vec!["id".to_string()]) + .unwrap() + .when_matched(lance::dataset::WhenMatched::UpdateAll) + .when_not_matched(lance::dataset::WhenNotMatched::InsertAll) + .try_build() + .unwrap(); + let (new_ds, _) = job + .execute(lance_datafusion::utils::reader_to_stream(Box::new(reader))) + .await + .unwrap(); + let v2 = new_ds.version().version; + + let rows = scan_with_versions(&new_ds).await; + eprintln!("After merge_insert NEW eve (v1={}, v2={}):", v1, v2); + for (id, val, created, updated) in &rows { + eprintln!( + " id={:<10} val={:<4} created_v={:<4} updated_v={}", + id, val, created, updated + ); + } + + let eve = rows.iter().find(|r| r.0 == "eve").unwrap(); + eprintln!("Eve: created_at_version={}, v1={}, v2={}", eve.2, v1, v2); + + // Lance behavior (as of 3.0.1): merge_insert stamps new rows with + // _row_created_at_version = dataset_creation_version (v1), NOT the + // merge_insert commit version (v2). This is why Omnigraph's change + // detection uses _row_last_updated_at_version + ID set membership + // to classify inserts vs updates, not _row_created_at_version alone. 
+ assert_eq!( + eve.2, v1, + "Lance merge_insert stamps new rows with created_at = dataset creation version, not commit version" + ); + assert_eq!( + eve.3, v2, + "Lance merge_insert stamps new rows with last_updated_at = commit version" + ); +} + +#[tokio::test] +async fn lance_merge_insert_update_preserves_created_at_version() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().join("test.lance"); + let uri_str = uri.to_str().unwrap(); + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Utf8, false), + Field::new("value", DataType::Int32, false), + ])); + + let ds = create_test_dataset(uri_str).await; + let v1 = ds.version().version; + + // merge_insert an EXISTING row (update bob's value) + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(StringArray::from(vec!["bob"])), + Arc::new(Int32Array::from(vec![99])), + ], + ) + .unwrap(); + let reader = RecordBatchIterator::new(vec![Ok(batch)], schema); + let ds_arc = Arc::new(ds); + let job = lance::dataset::MergeInsertBuilder::try_new(ds_arc, vec!["id".to_string()]) + .unwrap() + .when_matched(lance::dataset::WhenMatched::UpdateAll) + .when_not_matched(lance::dataset::WhenNotMatched::InsertAll) + .try_build() + .unwrap(); + let (new_ds, _) = job + .execute(lance_datafusion::utils::reader_to_stream(Box::new(reader))) + .await + .unwrap(); + let v2 = new_ds.version().version; + + let rows = scan_with_versions(&new_ds).await; + eprintln!("After merge_insert UPDATE bob (v1={}, v2={}):", v1, v2); + for (id, val, created, updated) in &rows { + eprintln!( + " id={:<10} val={:<4} created_v={:<4} updated_v={}", + id, val, created, updated + ); + } + + let alice = rows.iter().find(|r| r.0 == "alice").unwrap(); + let bob = rows.iter().find(|r| r.0 == "bob").unwrap(); + + // Alice: untouched, should keep original versions + assert_eq!(alice.2, v1, "alice created_at should still be v1"); + assert_eq!(alice.3, v1, "alice updated_at should still be v1"); + + // Bob: updated 
via merge_insert + // created_at should be preserved (v1), updated_at should be bumped (v2) + eprintln!( + "Bob: created_at={}, updated_at={}, v1={}, v2={}", + bob.2, bob.3, v1, v2 + ); + assert_eq!(bob.1, 99, "bob's value should be updated to 99"); +} diff --git a/crates/omnigraph/tests/point_in_time.rs b/crates/omnigraph/tests/point_in_time.rs new file mode 100644 index 0000000..d654b88 --- /dev/null +++ b/crates/omnigraph/tests/point_in_time.rs @@ -0,0 +1,736 @@ +mod helpers; + +use arrow_array::{Array, Int32Array}; +use helpers::*; +use omnigraph::db::Omnigraph; +use omnigraph_compiler::ir::ParamMap; + +// ─── Inline queries for point-in-time tests ───────────────────────────────── + +const ALL_PERSONS_QUERY: &str = r#" +query all_persons() { + match { + $p: Person + } + return { $p.name, $p.age } + order { $p.name asc } +} +"#; + +const FRIENDS_QUERY: &str = r#" +query friends_of($name: String) { + match { + $p: Person { name: $name } + $p knows $f + } + return { $f.name } + order { $f.name asc } +} +"#; + +const UNEMPLOYED_QUERY: &str = r#" +query unemployed() { + match { + $p: Person + not { $p worksAt $_ } + } + return { $p.name } + order { $p.name asc } +} +"#; + +const FILTERED_QUERY: &str = r#" +query older_than($min_age: I32) { + match { + $p: Person + $p.age > $min_age + } + return { $p.name, $p.age } + order { $p.name asc } +} +"#; + +const GET_PERSON_QUERY: &str = r#" +query get_person($name: String) { + match { + $p: Person { name: $name } + } + return { $p.name, $p.age } +} +"#; + +// ─── Morphological matrix ─────────────────────────────────────────────────── +// +// Dimensions: +// Query type: Tabular | Traversal | Negation (AntiJoin) | Filtered | Aggregation +// Mutation: Insert | Update | Delete node | Delete edge +// Branch: Main | Named branch +// Result shape: Empty→non-empty | Non-empty→empty | Count changes | Value changes +// +// Existing coverage (4 tests): +// Tabular × Insert × Main (returns_historical_data) +// Traversal × Insert × 
Main (traversal_uses_historical_graph_index) +// Tabular × Update × Main (multiple_versions_sees_correct_state) +// Error case (snapshot_at_version_fails_for_nonexistent_version) +// +// New coverage (9 tests below): +// Tabular × Delete node × Main → non-empty becomes smaller +// Traversal × Delete edge × Main → edge disappears from historical +// Negation × Insert edge × Main → anti-join result shrinks after insert +// Negation × Delete edge × Main → anti-join result grows after delete +// Filtered × Update × Main → entity enters/exits filter after age change +// Multi-hop × Insert(n+e) × Main → friends-of-friends grows after new path +// Traversal × Delete node × Main → cascade removes edges from traversal +// Tabular × Insert × Branch → branch isolation for point-in-time +// Tabular × Multi-step × Main → 4-version chain: insert, update, delete + +// ─── Original tests ──────────────────────────────────────────────────────── + +#[tokio::test] +async fn run_query_at_returns_historical_data() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + let v_before = version_main(&db).await.unwrap(); + + mutate_main( + &mut db, + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "Eve")], &[("$age", 22)]), + ) + .await + .unwrap(); + + let historical = db + .run_query_at(v_before, ALL_PERSONS_QUERY, "all_persons", &ParamMap::new()) + .await + .unwrap(); + let current = query_main(&mut db, ALL_PERSONS_QUERY, "all_persons", &ParamMap::new()) + .await + .unwrap(); + + assert_eq!(historical.num_rows(), 4, "historical should have 4 persons"); + assert_eq!(current.num_rows(), 5, "current should have 5 persons"); + + let historical_names = collect_column_strings(historical.batches(), "p.name"); + assert!(!historical_names.contains(&"Eve".to_string())); + + let current_names = collect_column_strings(current.batches(), "p.name"); + assert!(current_names.contains(&"Eve".to_string())); +} + +#[tokio::test] +async fn 
run_query_at_traversal_uses_historical_graph_index() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + let v_before = version_main(&db).await.unwrap(); + + mutate_main( + &mut db, + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "Eve")], &[("$age", 22)]), + ) + .await + .unwrap(); + mutate_main( + &mut db, + MUTATION_QUERIES, + "add_friend", + ¶ms(&[("$from", "Eve"), ("$to", "Alice")]), + ) + .await + .unwrap(); + + let historical = db + .run_query_at( + v_before, + FRIENDS_QUERY, + "friends_of", + ¶ms(&[("$name", "Alice")]), + ) + .await + .unwrap(); + let current = query_main( + &mut db, + FRIENDS_QUERY, + "friends_of", + ¶ms(&[("$name", "Eve")]), + ) + .await + .unwrap(); + + assert_eq!(historical.num_rows(), 2); + let hist_names = collect_column_strings(historical.batches(), "f.name"); + assert!(hist_names.contains(&"Bob".to_string())); + assert!(hist_names.contains(&"Charlie".to_string())); + + assert_eq!(current.num_rows(), 1); + let cur_names = collect_column_strings(current.batches(), "f.name"); + assert!(cur_names.contains(&"Alice".to_string())); +} + +#[tokio::test] +async fn snapshot_at_version_fails_for_nonexistent_version() { + let dir = tempfile::tempdir().unwrap(); + let db = init_and_load(&dir).await; + + let result = db.snapshot_at_version(99999).await; + assert!(result.is_err(), "non-existent version should return error"); +} + +#[tokio::test] +async fn run_query_at_multiple_versions_sees_correct_state() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + let v1 = version_main(&db).await.unwrap(); + + mutate_main( + &mut db, + MUTATION_QUERIES, + "set_age", + &mixed_params(&[("$name", "Alice")], &[("$age", 99)]), + ) + .await + .unwrap(); + let v2 = version_main(&db).await.unwrap(); + + mutate_main( + &mut db, + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "Frank")], &[("$age", 40)]), + ) + .await + .unwrap(); + + let at_v1 = db + 
.run_query_at(v1, ALL_PERSONS_QUERY, "all_persons", &ParamMap::new()) + .await + .unwrap(); + assert_eq!(at_v1.num_rows(), 4, "v1 should have 4 persons"); + let v1_names = collect_column_strings(at_v1.batches(), "p.name"); + assert!(!v1_names.contains(&"Frank".to_string())); + + let at_v2 = db + .run_query_at(v2, ALL_PERSONS_QUERY, "all_persons", &ParamMap::new()) + .await + .unwrap(); + assert_eq!(at_v2.num_rows(), 4, "v2 should have 4 persons"); + let v2_names = collect_column_strings(at_v2.batches(), "p.name"); + assert!(!v2_names.contains(&"Frank".to_string())); + + let current = query_main(&mut db, ALL_PERSONS_QUERY, "all_persons", &ParamMap::new()) + .await + .unwrap(); + assert_eq!(current.num_rows(), 5, "current should have 5 persons"); + let cur_names = collect_column_strings(current.batches(), "p.name"); + assert!(cur_names.contains(&"Frank".to_string())); +} + +// ─── Tabular × Delete node ───────────────────────────────────────────────── + +#[tokio::test] +async fn tabular_delete_node_invisible_at_historical_version() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + // Fixture: Alice, Bob, Charlie, Diana + let v_before = version_main(&db).await.unwrap(); + + mutate_main( + &mut db, + MUTATION_QUERIES, + "remove_person", + ¶ms(&[("$name", "Charlie")]), + ) + .await + .unwrap(); + + // Historical: Charlie still exists + let historical = db + .run_query_at(v_before, ALL_PERSONS_QUERY, "all_persons", &ParamMap::new()) + .await + .unwrap(); + let hist_names = collect_column_strings(historical.batches(), "p.name"); + assert_eq!(historical.num_rows(), 4); + assert!(hist_names.contains(&"Charlie".to_string())); + + // Current: Charlie is gone + let current = query_main(&mut db, ALL_PERSONS_QUERY, "all_persons", &ParamMap::new()) + .await + .unwrap(); + let cur_names = collect_column_strings(current.batches(), "p.name"); + assert_eq!(current.num_rows(), 3); + assert!(!cur_names.contains(&"Charlie".to_string())); +} + +// 
─── Traversal × Delete edge ─────────────────────────────────────────────── + +#[tokio::test] +async fn traversal_delete_edge_invisible_at_historical_version() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + // Fixture: Alice knows Bob, Alice knows Charlie + let v_before = version_main(&db).await.unwrap(); + + // Remove all Knows edges FROM Alice + mutate_main( + &mut db, + MUTATION_QUERIES, + "remove_friendship", + ¶ms(&[("$from", "Alice")]), + ) + .await + .unwrap(); + + // Historical traversal: Alice's friends at v_before = Bob, Charlie + let historical = db + .run_query_at( + v_before, + FRIENDS_QUERY, + "friends_of", + ¶ms(&[("$name", "Alice")]), + ) + .await + .unwrap(); + assert_eq!(historical.num_rows(), 2); + let hist_names = collect_column_strings(historical.batches(), "f.name"); + assert!(hist_names.contains(&"Bob".to_string())); + assert!(hist_names.contains(&"Charlie".to_string())); + + // Current: Alice has no friends (edges deleted) + let current = query_main( + &mut db, + FRIENDS_QUERY, + "friends_of", + ¶ms(&[("$name", "Alice")]), + ) + .await + .unwrap(); + assert_eq!( + current.num_rows(), + 0, + "Alice should have no friends after edge deletion" + ); +} + +// ─── Negation (AntiJoin) × Insert ────────────────────────────────────────── + +#[tokio::test] +async fn negation_insert_shrinks_antijoin_result() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + // Fixture: Alice worksAt Acme, Bob worksAt Globex + // Unemployed: Charlie, Diana + let v_before = version_main(&db).await.unwrap(); + + // Give Charlie a job + mutate_main( + &mut db, + r#" +query hire($from: String, $to: String) { + insert WorksAt { from: $from, to: $to } +} +"#, + "hire", + ¶ms(&[("$from", "Charlie"), ("$to", "Acme")]), + ) + .await + .unwrap(); + + // Historical: Charlie and Diana were unemployed + let historical = db + .run_query_at(v_before, UNEMPLOYED_QUERY, "unemployed", &ParamMap::new()) + 
.await + .unwrap(); + let hist_names = collect_column_strings(historical.batches(), "p.name"); + assert_eq!(historical.num_rows(), 2); + assert!(hist_names.contains(&"Charlie".to_string())); + assert!(hist_names.contains(&"Diana".to_string())); + + // Current: only Diana is unemployed + let current = query_main(&mut db, UNEMPLOYED_QUERY, "unemployed", &ParamMap::new()) + .await + .unwrap(); + let cur_names = collect_column_strings(current.batches(), "p.name"); + assert_eq!(current.num_rows(), 1); + assert!(cur_names.contains(&"Diana".to_string())); + assert!(!cur_names.contains(&"Charlie".to_string())); +} + +// ─── Negation (AntiJoin) × Delete edge ───────────────────────────────────── + +#[tokio::test] +async fn negation_delete_edge_grows_antijoin_result() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + // Fixture: Alice worksAt Acme, Bob worksAt Globex + // Unemployed at start: Charlie, Diana + let v_before = version_main(&db).await.unwrap(); + + // Fire Alice (delete WorksAt edge) + mutate_main( + &mut db, + r#" +query fire($from: String) { + delete WorksAt where from = $from +} +"#, + "fire", + ¶ms(&[("$from", "Alice")]), + ) + .await + .unwrap(); + + // Historical: 2 unemployed (Charlie, Diana) + let historical = db + .run_query_at(v_before, UNEMPLOYED_QUERY, "unemployed", &ParamMap::new()) + .await + .unwrap(); + assert_eq!(historical.num_rows(), 2); + let hist_names = collect_column_strings(historical.batches(), "p.name"); + assert!(!hist_names.contains(&"Alice".to_string())); + + // Current: 3 unemployed (Alice, Charlie, Diana) + let current = query_main(&mut db, UNEMPLOYED_QUERY, "unemployed", &ParamMap::new()) + .await + .unwrap(); + assert_eq!(current.num_rows(), 3); + let cur_names = collect_column_strings(current.batches(), "p.name"); + assert!(cur_names.contains(&"Alice".to_string())); +} + +// ─── Filtered × Update (value enters/exits filter) ───────────────────────── + +#[tokio::test] +async fn 
filtered_update_entity_crosses_filter_boundary() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + // Fixture: Alice(30), Bob(25), Charlie(35), Diana(28) + // older_than(30): Charlie(35) only + let v_before = version_main(&db).await.unwrap(); + + // Update Bob's age from 25 to 40 → enters the filter + mutate_main( + &mut db, + MUTATION_QUERIES, + "set_age", + &mixed_params(&[("$name", "Bob")], &[("$age", 40)]), + ) + .await + .unwrap(); + + // Historical: only Charlie is older than 30 + let historical = db + .run_query_at( + v_before, + FILTERED_QUERY, + "older_than", + &int_params(&[("$min_age", 30)]), + ) + .await + .unwrap(); + assert_eq!(historical.num_rows(), 1); + let hist_names = collect_column_strings(historical.batches(), "p.name"); + assert_eq!(hist_names, vec!["Charlie"]); + + // Current: Bob(40) and Charlie(35) are older than 30 + let current = query_main( + &mut db, + FILTERED_QUERY, + "older_than", + &int_params(&[("$min_age", 30)]), + ) + .await + .unwrap(); + assert_eq!(current.num_rows(), 2); + let cur_names = collect_column_strings(current.batches(), "p.name"); + assert!(cur_names.contains(&"Bob".to_string())); + assert!(cur_names.contains(&"Charlie".to_string())); +} + +// ─── Multi-hop traversal × Insert ────────────────────────────────────────── + +#[tokio::test] +async fn multi_hop_traversal_historical_version() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + // Fixture: Alice→Bob, Alice→Charlie, Bob→Diana + // friends_of_friends(Alice) = Diana (Alice→Bob→Diana) + let v_before = version_main(&db).await.unwrap(); + + // Insert Eve and edge: Charlie→Eve + // Now friends_of_friends(Alice) = Diana + Eve (Alice→Charlie→Eve) + mutate_main( + &mut db, + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "Eve")], &[("$age", 22)]), + ) + .await + .unwrap(); + mutate_main( + &mut db, + MUTATION_QUERIES, + "add_friend", + ¶ms(&[("$from", "Charlie"), ("$to", 
"Eve")]), + ) + .await + .unwrap(); + + let fof_query = r#" +query fof($name: String) { + match { + $p: Person { name: $name } + $p knows $mid + $mid knows $f + } + return { $f.name } + order { $f.name asc } +} +"#; + + // Historical: friends-of-friends of Alice = Diana only + let historical = db + .run_query_at(v_before, fof_query, "fof", ¶ms(&[("$name", "Alice")])) + .await + .unwrap(); + assert_eq!(historical.num_rows(), 1); + let hist_names = collect_column_strings(historical.batches(), "f.name"); + assert_eq!(hist_names, vec!["Diana"]); + + // Current: friends-of-friends of Alice = Diana + Eve + let current = query_main(&mut db, fof_query, "fof", ¶ms(&[("$name", "Alice")])) + .await + .unwrap(); + assert_eq!(current.num_rows(), 2); + let cur_names = collect_column_strings(current.batches(), "f.name"); + assert!(cur_names.contains(&"Diana".to_string())); + assert!(cur_names.contains(&"Eve".to_string())); +} + +// ─── Traversal × Delete node (cascade removes edges) ─────────────────────── + +#[tokio::test] +async fn traversal_delete_node_cascade_removes_edges() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + // Fixture: Alice knows Bob, Alice knows Charlie, Bob knows Diana + let v_before = version_main(&db).await.unwrap(); + + // Delete Bob → cascades to Knows edges involving Bob + mutate_main( + &mut db, + MUTATION_QUERIES, + "remove_person", + ¶ms(&[("$name", "Bob")]), + ) + .await + .unwrap(); + + // Historical: Alice's friends = Bob, Charlie + let historical = db + .run_query_at( + v_before, + FRIENDS_QUERY, + "friends_of", + ¶ms(&[("$name", "Alice")]), + ) + .await + .unwrap(); + assert_eq!(historical.num_rows(), 2); + let hist_names = collect_column_strings(historical.batches(), "f.name"); + assert!(hist_names.contains(&"Bob".to_string())); + assert!(hist_names.contains(&"Charlie".to_string())); + + // Current: Alice's friends = Charlie only (Bob was deleted, edge cascaded) + let current = query_main( + &mut db, + 
FRIENDS_QUERY, + "friends_of", + ¶ms(&[("$name", "Alice")]), + ) + .await + .unwrap(); + assert_eq!(current.num_rows(), 1); + let cur_names = collect_column_strings(current.batches(), "f.name"); + assert_eq!(cur_names, vec!["Charlie"]); +} + +// ─── Branch isolation for point-in-time ──────────────────────────────────── + +#[tokio::test] +async fn branch_point_in_time_isolated_from_main() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut main = init_and_load(&dir).await; + + main.branch_create("feature").await.unwrap(); + let v_main_before = version_main(&main).await.unwrap(); + + // Insert Eve on main + mutate_main( + &mut main, + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "Eve")], &[("$age", 22)]), + ) + .await + .unwrap(); + + // Insert Frank on feature branch + let mut feature = Omnigraph::open(uri).await.unwrap(); + mutate_branch( + &mut feature, + "feature", + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "Frank")], &[("$age", 33)]), + ) + .await + .unwrap(); + + // Historical main at v_main_before: 4 persons, no Eve, no Frank + let hist_main = main + .run_query_at( + v_main_before, + ALL_PERSONS_QUERY, + "all_persons", + &ParamMap::new(), + ) + .await + .unwrap(); + assert_eq!(hist_main.num_rows(), 4); + let hist_names = collect_column_strings(hist_main.batches(), "p.name"); + assert!(!hist_names.contains(&"Eve".to_string())); + assert!(!hist_names.contains(&"Frank".to_string())); + + // Current main: 5 persons (Eve present, Frank not visible on main) + let cur_main = query_main( + &mut main, + ALL_PERSONS_QUERY, + "all_persons", + &ParamMap::new(), + ) + .await + .unwrap(); + assert_eq!(cur_main.num_rows(), 5); + let cur_names = collect_column_strings(cur_main.batches(), "p.name"); + assert!(cur_names.contains(&"Eve".to_string())); + assert!(!cur_names.contains(&"Frank".to_string())); + + // Feature branch: 5 persons (Frank present, Eve not visible on feature) + let 
cur_feature = query_branch( + &mut feature, + "feature", + ALL_PERSONS_QUERY, + "all_persons", + &ParamMap::new(), + ) + .await + .unwrap(); + assert_eq!(cur_feature.num_rows(), 5); + let feat_names = collect_column_strings(cur_feature.batches(), "p.name"); + assert!(feat_names.contains(&"Frank".to_string())); + assert!(!feat_names.contains(&"Eve".to_string())); +} + +// ─── Multi-step version chain: insert → update → delete ──────────────────── + +#[tokio::test] +async fn four_version_chain_insert_update_delete() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + + // v1: baseline (Alice=30, Bob=25, Charlie=35, Diana=28) + let v1 = version_main(&db).await.unwrap(); + + // v2: insert Eve(22) + mutate_main( + &mut db, + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "Eve")], &[("$age", 22)]), + ) + .await + .unwrap(); + let v2 = version_main(&db).await.unwrap(); + + // v3: update Eve's age to 50 + mutate_main( + &mut db, + MUTATION_QUERIES, + "set_age", + &mixed_params(&[("$name", "Eve")], &[("$age", 50)]), + ) + .await + .unwrap(); + let v3 = version_main(&db).await.unwrap(); + + // v4: delete Eve + mutate_main( + &mut db, + MUTATION_QUERIES, + "remove_person", + ¶ms(&[("$name", "Eve")]), + ) + .await + .unwrap(); + + // v1: no Eve, 4 persons + let at_v1 = db + .run_query_at(v1, ALL_PERSONS_QUERY, "all_persons", &ParamMap::new()) + .await + .unwrap(); + assert_eq!(at_v1.num_rows(), 4); + let v1_names = collect_column_strings(at_v1.batches(), "p.name"); + assert!(!v1_names.contains(&"Eve".to_string())); + + // v2: Eve exists with age 22, 5 persons + let at_v2 = db + .run_query_at( + v2, + GET_PERSON_QUERY, + "get_person", + ¶ms(&[("$name", "Eve")]), + ) + .await + .unwrap(); + assert_eq!(at_v2.num_rows(), 1); + let v2_batch = at_v2.concat_batches().unwrap(); + let v2_ages = v2_batch + .column_by_name("p.age") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(v2_ages.value(0), 22); + + // v3: 
Eve exists with age 50 + let at_v3 = db + .run_query_at( + v3, + GET_PERSON_QUERY, + "get_person", + ¶ms(&[("$name", "Eve")]), + ) + .await + .unwrap(); + assert_eq!(at_v3.num_rows(), 1); + let v3_batch = at_v3.concat_batches().unwrap(); + let v3_ages = v3_batch + .column_by_name("p.age") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(v3_ages.value(0), 50); + + // v4 (current): Eve is gone, back to 4 + let current = query_main(&mut db, ALL_PERSONS_QUERY, "all_persons", &ParamMap::new()) + .await + .unwrap(); + assert_eq!(current.num_rows(), 4); + let cur_names = collect_column_strings(current.batches(), "p.name"); + assert!(!cur_names.contains(&"Eve".to_string())); +} diff --git a/crates/omnigraph/tests/runs.rs b/crates/omnigraph/tests/runs.rs new file mode 100644 index 0000000..76fea2c --- /dev/null +++ b/crates/omnigraph/tests/runs.rs @@ -0,0 +1,533 @@ +mod helpers; + +use std::collections::HashMap; + +use arrow_array::{Array, RecordBatch, StringArray, TimestampMicrosecondArray}; +use futures::TryStreamExt; +use lance::Dataset; + +use omnigraph::db::commit_graph::CommitGraph; +use omnigraph::db::{Omnigraph, ReadTarget, RunStatus}; +use omnigraph::error::OmniError; +use omnigraph::loader::{LoadMode, load_jsonl}; + +use helpers::*; + +#[derive(Debug, Clone)] +struct PersistedRun { + run_id: String, + target_branch: String, + run_branch: String, + status: String, + updated_at: i64, +} + +async fn latest_runs(uri: &str) -> Vec { + let runs_uri = format!("{}/_graph_runs.lance", uri); + let ds = Dataset::open(&runs_uri).await.unwrap(); + let batches: Vec = ds + .scan() + .try_into_stream() + .await + .unwrap() + .try_collect() + .await + .unwrap(); + + let mut latest: HashMap = HashMap::new(); + for batch in batches { + let run_ids = batch + .column_by_name("run_id") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let target_branches = batch + .column_by_name("target_branch") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + 
let run_branches = batch + .column_by_name("run_branch") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let statuses = batch + .column_by_name("status") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + let updated_ats = batch + .column_by_name("updated_at") + .unwrap() + .as_any() + .downcast_ref::() + .unwrap(); + + for row in 0..batch.num_rows() { + let record = PersistedRun { + run_id: run_ids.value(row).to_string(), + target_branch: target_branches.value(row).to_string(), + run_branch: run_branches.value(row).to_string(), + status: statuses.value(row).to_string(), + updated_at: updated_ats.value(row), + }; + match latest.get(record.run_id.as_str()) { + Some(existing) if existing.updated_at >= record.updated_at => {} + _ => { + latest.insert(record.run_id.clone(), record); + } + } + } + } + + let mut records = latest.into_values().collect::>(); + records.sort_by(|a, b| a.run_id.cmp(&b.run_id)); + records +} + +#[tokio::test] +async fn begin_run_creates_hidden_internal_branch_and_isolates_writes() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + let base_snapshot = db.resolve_snapshot("main").await.unwrap(); + + let run = db.begin_run("main", Some("test-load")).await.unwrap(); + + assert!(run.run_branch.starts_with("__run__")); + assert_eq!(run.target_branch, "main"); + assert_eq!(run.base_snapshot_id, base_snapshot.as_str()); + assert_eq!(run.status, RunStatus::Running); + assert_eq!(db.branch_list().await.unwrap(), vec!["main"]); + + db.load( + &run.run_branch, + r#"{"type":"Person","data":{"name":"Eve","age":22}}"#, + LoadMode::Append, + ) + .await + .unwrap(); + + let main_qr = db + .query( + ReadTarget::branch("main"), + TEST_QUERIES, + "get_person", + ¶ms(&[("$name", "Eve")]), + ) + .await + .unwrap(); + assert_eq!(main_qr.num_rows(), 0); + + let run_qr = db + .query( + ReadTarget::branch(run.run_branch.as_str()), + TEST_QUERIES, + "get_person", + ¶ms(&[("$name", "Eve")]), + ) + .await + 
.unwrap(); + assert_eq!(run_qr.num_rows(), 1); +} + +#[tokio::test] +async fn publish_run_merges_internal_branch_into_target_and_marks_record() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + let run = db.begin_run("main", Some("publish-test")).await.unwrap(); + + db.load( + &run.run_branch, + r#"{"type":"Person","data":{"name":"Eve","age":22}}"#, + LoadMode::Append, + ) + .await + .unwrap(); + + let published_snapshot = db.publish_run(&run.run_id).await.unwrap(); + let record = db.get_run(&run.run_id).await.unwrap(); + + assert_eq!(record.status, RunStatus::Published); + assert_eq!( + record.published_snapshot_id.as_deref(), + Some(published_snapshot.as_str()) + ); + + let main_qr = db + .query( + ReadTarget::branch("main"), + TEST_QUERIES, + "get_person", + ¶ms(&[("$name", "Eve")]), + ) + .await + .unwrap(); + assert_eq!(main_qr.num_rows(), 1); +} + +#[tokio::test] +async fn abort_run_keeps_target_unchanged_and_preserves_hidden_branch_for_inspection() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + let run = db.begin_run("main", Some("abort-test")).await.unwrap(); + + db.load( + &run.run_branch, + r#"{"type":"Person","data":{"name":"Eve","age":22}}"#, + LoadMode::Append, + ) + .await + .unwrap(); + + let aborted = db.abort_run(&run.run_id).await.unwrap(); + assert_eq!(aborted.status, RunStatus::Aborted); + + let main_qr = db + .query( + ReadTarget::branch("main"), + TEST_QUERIES, + "get_person", + ¶ms(&[("$name", "Eve")]), + ) + .await + .unwrap(); + assert_eq!(main_qr.num_rows(), 0); + + let run_qr = db + .query( + ReadTarget::branch(run.run_branch.as_str()), + TEST_QUERIES, + "get_person", + ¶ms(&[("$name", "Eve")]), + ) + .await + .unwrap(); + assert_eq!(run_qr.num_rows(), 1); +} + +#[tokio::test] +async fn public_branch_apis_reject_internal_run_refs() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + let run = db.begin_run("main", 
Some("guard-test")).await.unwrap(); + + let merge_err = db.branch_merge(&run.run_branch, "main").await.unwrap_err(); + match merge_err { + OmniError::Manifest(message) => assert!(message.message.contains("internal run refs")), + other => panic!("unexpected error: {}", other), + } + + let create_err = db.branch_create(&run.run_branch).await.unwrap_err(); + match create_err { + OmniError::Manifest(message) => assert!(message.message.contains("internal run ref")), + other => panic!("unexpected error: {}", other), + } + + let delete_err = db.branch_delete(&run.run_branch).await.unwrap_err(); + match delete_err { + OmniError::Manifest(message) => assert!(message.message.contains("internal run ref")), + other => panic!("unexpected error: {}", other), + } + + let fork_err = db + .branch_create_from(ReadTarget::branch(run.run_branch.as_str()), "child") + .await + .unwrap_err(); + match fork_err { + OmniError::Manifest(message) => assert!(message.message.contains("internal run ref")), + other => panic!("unexpected error: {}", other), + } +} + +#[tokio::test] +async fn branch_delete_rejects_target_branches_with_active_runs() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + db.branch_create("feature").await.unwrap(); + let run = db.begin_run("feature", Some("delete-guard")).await.unwrap(); + + let err = db.branch_delete("feature").await.unwrap_err(); + assert!(err.to_string().contains(run.run_id.as_str())); + assert!(err.to_string().contains("targeting it is running")); +} + +#[tokio::test] +async fn public_load_uses_hidden_transactional_run_and_publishes_it() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap(); + + let result = load_jsonl(&mut db, TEST_DATA, LoadMode::Overwrite) + .await + .unwrap(); + assert_eq!(result.nodes_loaded.len(), 2); + assert_eq!(result.edges_loaded.len(), 2); + assert_eq!(db.branch_list().await.unwrap(), 
vec!["main"]); + + let runs = latest_runs(uri).await; + assert_eq!(runs.len(), 1); + assert_eq!(runs[0].target_branch, "main"); + assert_eq!(runs[0].status, "published"); + assert!(runs[0].run_branch.starts_with("__run__")); + + let qr = db + .query( + ReadTarget::branch("main"), + TEST_QUERIES, + "get_person", + ¶ms(&[("$name", "Alice")]), + ) + .await + .unwrap(); + assert_eq!(qr.num_rows(), 1); +} + +#[tokio::test] +async fn public_load_preserves_staged_edge_ids_on_publish() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap(); + + load_jsonl(&mut db, TEST_DATA, LoadMode::Overwrite) + .await + .unwrap(); + + let runs = latest_runs(uri).await; + let run_branch = runs[0].run_branch.clone(); + + let mut main_ids = collect_column_strings(&read_table(&db, "edge:Knows").await, "id"); + let mut run_ids = collect_column_strings( + &read_table_branch(&db, run_branch.as_str(), "edge:Knows").await, + "id", + ); + main_ids.sort(); + run_ids.sort(); + assert_eq!(main_ids, run_ids); +} + +#[tokio::test] +async fn failed_public_load_marks_run_failed_and_leaves_target_unchanged() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap(); + + let bad = r#"{"type":"Person","data":{"name":"Alice","age":30}} +{"edge":"Knows","from":"Alice","to":"Missing"}"#; + let err = load_jsonl(&mut db, bad, LoadMode::Overwrite) + .await + .unwrap_err(); + match err { + OmniError::Manifest(message) => assert!(message.message.contains("not found in Person")), + other => panic!("unexpected error: {}", other), + } + + let runs = latest_runs(uri).await; + assert_eq!(runs.len(), 1); + assert_eq!(runs[0].status, "failed"); + assert!(runs[0].run_branch.starts_with("__run__")); + + let snap = snapshot_main(&db).await.unwrap(); + let person_count = snap + .open("node:Person") + .await + .unwrap() + .count_rows(None) + 
.await + .unwrap(); + assert_eq!(person_count, 0); +} + +#[tokio::test] +async fn public_mutation_uses_hidden_transactional_run_and_publishes_it() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = init_and_load(&dir).await; + + let result = db + .mutate( + "main", + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "Eve")], &[("$age", 22)]), + ) + .await + .unwrap(); + assert_eq!(result.affected_nodes, 1); + assert_eq!(result.affected_edges, 0); + + let runs = latest_runs(uri).await; + assert!(!runs.is_empty()); + let latest = runs.last().unwrap(); + assert_eq!(latest.target_branch, "main"); + assert_eq!(latest.status, "published"); + assert!(latest.run_branch.starts_with("__run__")); + + let qr = db + .query( + ReadTarget::branch("main"), + TEST_QUERIES, + "get_person", + ¶ms(&[("$name", "Eve")]), + ) + .await + .unwrap(); + assert_eq!(qr.num_rows(), 1); +} + +#[tokio::test] +async fn public_mutation_preserves_staged_edge_ids_on_publish() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = init_and_load(&dir).await; + + db.mutate( + "main", + MUTATION_QUERIES, + "add_friend", + ¶ms(&[("$from", "Alice"), ("$to", "Diana")]), + ) + .await + .unwrap(); + + let runs = latest_runs(uri).await; + let latest = runs.last().unwrap(); + + let mut main_ids = collect_column_strings(&read_table(&db, "edge:Knows").await, "id"); + let mut run_ids = collect_column_strings( + &read_table_branch(&db, latest.run_branch.as_str(), "edge:Knows").await, + "id", + ); + main_ids.sort(); + run_ids.sort(); + assert_eq!(main_ids, run_ids); +} + +#[tokio::test] +async fn failed_public_mutation_marks_run_failed_and_leaves_target_unchanged() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = init_and_load(&dir).await; + + let err = db + .mutate( + "main", + MUTATION_QUERIES, + "add_friend", + ¶ms(&[("$from", "Alice"), ("$to", 
"Missing")]), + ) + .await + .unwrap_err(); + match err { + OmniError::Manifest(message) => assert!(message.message.contains("not found")), + other => panic!("unexpected error: {}", other), + } + + let runs = latest_runs(uri).await; + assert!(!runs.is_empty()); + let latest = runs.last().unwrap(); + assert_eq!(latest.status, "failed"); + assert!(latest.run_branch.starts_with("__run__")); + + let qr = db + .query( + ReadTarget::branch("main"), + TEST_QUERIES, + "friends_of", + ¶ms(&[("$name", "Alice")]), + ) + .await + .unwrap(); + assert_eq!(qr.num_rows(), 2); +} + +#[tokio::test] +async fn concurrent_conflicting_run_publish_fails_cleanly() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + + let run_a = db.begin_run("main", Some("conflict-a")).await.unwrap(); + let run_b = db.begin_run("main", Some("conflict-b")).await.unwrap(); + + db.mutate( + run_a.run_branch.as_str(), + MUTATION_QUERIES, + "set_age", + &mixed_params(&[("$name", "Alice")], &[("$age", 31)]), + ) + .await + .unwrap(); + db.mutate( + run_b.run_branch.as_str(), + MUTATION_QUERIES, + "set_age", + &mixed_params(&[("$name", "Alice")], &[("$age", 32)]), + ) + .await + .unwrap(); + + db.publish_run(&run_a.run_id).await.unwrap(); + let publish_b = db.publish_run(&run_b.run_id).await; + assert!(publish_b.is_err(), "second conflicting publish should fail"); + let err = publish_b.unwrap_err().to_string(); + assert!( + err.contains("conflict") || err.contains("divergent") || err.contains("Alice"), + "unexpected conflict error: {}", + err + ); + + let alice = db + .query( + ReadTarget::branch("main"), + TEST_QUERIES, + "get_person", + ¶ms(&[("$name", "Alice")]), + ) + .await + .unwrap(); + let rows = alice.to_rust_json(); + assert_eq!(alice.num_rows(), 1); + assert_eq!(rows[0]["p.age"], serde_json::json!(31)); + + let run_a_record = db.get_run(&run_a.run_id).await.unwrap(); + assert_eq!(run_a_record.status, RunStatus::Published); + let run_b_record = 
db.get_run(&run_b.run_id).await.unwrap(); + assert_eq!(run_b_record.status, RunStatus::Running); +} + +#[tokio::test] +async fn public_mutation_records_actor_on_run_and_published_commit() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = init_and_load(&dir).await; + + db.mutate_as( + "main", + MUTATION_QUERIES, + "set_age", + &mixed_params(&[("$name", "Alice")], &[("$age", 31)]), + Some("act-andrew"), + ) + .await + .unwrap(); + + let runs = db.list_runs().await.unwrap(); + let run = runs + .iter() + .find(|run| run.operation_hash.as_deref() == Some("mutation:set_age:branch=main")) + .expect("published mutation run should exist"); + assert_eq!(run.actor_id.as_deref(), Some("act-andrew")); + assert_eq!(run.status, RunStatus::Published); + + let head = CommitGraph::open(uri) + .await + .unwrap() + .head_commit() + .await + .unwrap() + .unwrap(); + assert_eq!(head.actor_id.as_deref(), Some("act-andrew")); +} diff --git a/crates/omnigraph/tests/s3_storage.rs b/crates/omnigraph/tests/s3_storage.rs new file mode 100644 index 0000000..a7c26ea --- /dev/null +++ b/crates/omnigraph/tests/s3_storage.rs @@ -0,0 +1,187 @@ +mod helpers; + +use omnigraph::db::MergeOutcome; +use omnigraph::db::Omnigraph; +use omnigraph::loader::{LoadMode, load_jsonl}; + +use helpers::*; + +#[tokio::test(flavor = "multi_thread")] +async fn s3_compatible_repo_lifecycle_works() { + let Some(uri) = s3_test_repo_uri("omnigraph-runtime") else { + eprintln!("skipping s3 runtime test: OMNIGRAPH_S3_TEST_BUCKET is not set"); + return; + }; + + let mut db = Omnigraph::init(&uri, TEST_SCHEMA).await.unwrap(); + load_jsonl(&mut db, TEST_DATA, LoadMode::Overwrite) + .await + .unwrap(); + + let mut reopened = Omnigraph::open(&uri).await.unwrap(); + let snapshot = reopened.snapshot_of("main").await.unwrap(); + assert!(snapshot.entry("node:Person").is_some()); + assert!(snapshot.entry("edge:Knows").is_some()); + + let alice = query_main( + &mut reopened, + 
TEST_QUERIES, + "get_person", + ¶ms(&[("$name", "Alice")]), + ) + .await + .unwrap() + .to_rust_json(); + assert_eq!(alice[0]["p.name"], "Alice"); + + reopened + .mutate( + "main", + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "RustFS-Eve")], &[("$age", 29)]), + ) + .await + .unwrap(); + + let run = reopened + .begin_run("main", Some("s3-runtime-run")) + .await + .unwrap(); + reopened + .load( + &run.run_branch, + r#"{"type":"Person","data":{"name":"RunOnly","age":31}}"#, + LoadMode::Append, + ) + .await + .unwrap(); + reopened.publish_run(&run.run_id).await.unwrap(); + + let runs = reopened.list_runs().await.unwrap(); + assert!( + runs.iter() + .any(|record| { record.run_id == run.run_id && record.status.as_str() == "published" }), + "expected published run record in {:?}", + runs + ); + + let mut reopened_again = Omnigraph::open(&uri).await.unwrap(); + let eve = query_main( + &mut reopened_again, + TEST_QUERIES, + "get_person", + ¶ms(&[("$name", "RustFS-Eve")]), + ) + .await + .unwrap() + .to_rust_json(); + assert_eq!(eve[0]["p.name"], "RustFS-Eve"); + + let run_only = query_main( + &mut reopened_again, + TEST_QUERIES, + "get_person", + ¶ms(&[("$name", "RunOnly")]), + ) + .await + .unwrap() + .to_rust_json(); + assert_eq!(run_only[0]["p.name"], "RunOnly"); +} + +#[tokio::test(flavor = "multi_thread")] +async fn s3_branch_change_merge_flow_works() { + let Some(uri) = s3_test_repo_uri("omnigraph-branching") else { + eprintln!("skipping s3 branch test: OMNIGRAPH_S3_TEST_BUCKET is not set"); + return; + }; + + let mut main = Omnigraph::init(&uri, TEST_SCHEMA).await.unwrap(); + load_jsonl(&mut main, TEST_DATA, LoadMode::Overwrite) + .await + .unwrap(); + main.branch_create("feature").await.unwrap(); + + let mut feature = Omnigraph::open(&uri).await.unwrap(); + feature + .mutate( + "feature", + MUTATION_QUERIES, + "insert_person", + &mixed_params(&[("$name", "Feature-Eve")], &[("$age", 22)]), + ) + .await + .unwrap(); + + let before_merge = 
query_main( + &mut main, + TEST_QUERIES, + "get_person", + ¶ms(&[("$name", "Feature-Eve")]), + ) + .await + .unwrap(); + assert_eq!(before_merge.num_rows(), 0); + + let outcome = main.branch_merge("feature", "main").await.unwrap(); + assert_eq!(outcome, MergeOutcome::FastForward); + + let mut reopened = Omnigraph::open(&uri).await.unwrap(); + let after_merge = query_main( + &mut reopened, + TEST_QUERIES, + "get_person", + ¶ms(&[("$name", "Feature-Eve")]), + ) + .await + .unwrap() + .to_rust_json(); + assert_eq!(after_merge[0]["p.name"], "Feature-Eve"); + assert_eq!( + reopened.branch_list().await.unwrap(), + vec!["main".to_string(), "feature".to_string()] + ); +} + +#[tokio::test(flavor = "multi_thread")] +async fn s3_public_load_uses_hidden_run_and_publishes() { + let Some(uri) = s3_test_repo_uri("omnigraph-public-load") else { + eprintln!("skipping s3 public load test: OMNIGRAPH_S3_TEST_BUCKET is not set"); + return; + }; + + let mut db = Omnigraph::init(&uri, TEST_SCHEMA).await.unwrap(); + load_jsonl(&mut db, TEST_DATA, LoadMode::Overwrite) + .await + .unwrap(); + + db.load( + "main", + r#"{"type":"Person","data":{"name":"Loaded-Over-S3","age":34}}"#, + LoadMode::Append, + ) + .await + .unwrap(); + + let runs = db.list_runs().await.unwrap(); + assert!( + runs.iter().any(|record| { + record.target_branch == "main" && record.status.as_str() == "published" + }), + "expected published transactional run in {:?}", + runs + ); + + let mut reopened = Omnigraph::open(&uri).await.unwrap(); + let loaded = query_main( + &mut reopened, + TEST_QUERIES, + "get_person", + ¶ms(&[("$name", "Loaded-Over-S3")]), + ) + .await + .unwrap() + .to_rust_json(); + assert_eq!(loaded[0]["p.name"], "Loaded-Over-S3"); +} diff --git a/crates/omnigraph/tests/search.rs b/crates/omnigraph/tests/search.rs new file mode 100644 index 0000000..a611a0f --- /dev/null +++ b/crates/omnigraph/tests/search.rs @@ -0,0 +1,677 @@ +mod helpers; + +use std::env; + +use arrow_array::{Array, StringArray}; +use 
lance_index::{DatasetIndexExt, is_system_index}; +use serial_test::serial; + +use omnigraph::db::Omnigraph; +use omnigraph::loader::{LoadMode, load_jsonl}; +use omnigraph_compiler::query::ast::Literal; +use omnigraph_compiler::result::QueryResult; + +use helpers::*; + +const SEARCH_SCHEMA: &str = include_str!("fixtures/search.pg"); +const SEARCH_DATA: &str = include_str!("fixtures/search.jsonl"); +const SEARCH_QUERIES: &str = include_str!("fixtures/search.gq"); +const MOCK_SEARCH_SCHEMA: &str = r#" +node Doc { + slug: String @key + title: String @index + embedding: Vector(4) @index +} +"#; +const MOCK_SEARCH_QUERIES: &str = r#" +query vector_search_vector($q: Vector(4)) { + match { $d: Doc } + return { $d.slug, $d.title } + order { nearest($d.embedding, $q) } + limit 3 +} + +query vector_search_string($q: String) { + match { $d: Doc } + return { $d.slug, $d.title } + order { nearest($d.embedding, $q) } + limit 3 +} + +query vector_search_literal() { + match { $d: Doc } + return { $d.slug, $d.title } + order { nearest($d.embedding, "alpha") } + limit 3 +} + +query hybrid_search_vector($vq: Vector(4), $tq: String) { + match { $d: Doc } + return { $d.slug, $d.title } + order { rrf(nearest($d.embedding, $vq), bm25($d.title, $tq)) } + limit 3 +} + +query hybrid_search_string($vq: String, $tq: String) { + match { $d: Doc } + return { $d.slug, $d.title } + order { rrf(nearest($d.embedding, $vq), bm25($d.title, $tq)) } + limit 3 +} +"#; +const SEARCH_MUTATIONS: &str = r#" +query insert_doc($slug: String, $title: String, $body: String, $embedding: Vector(4)) { + insert Doc { + slug: $slug, + title: $title, + body: $body, + embedding: $embedding + } +} +"#; + +async fn init_search_db(dir: &tempfile::TempDir) -> Omnigraph { + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, SEARCH_SCHEMA).await.unwrap(); + load_jsonl(&mut db, SEARCH_DATA, LoadMode::Overwrite) + .await + .unwrap(); + db +} + +async fn init_mock_embedding_search_db(dir: 
&tempfile::TempDir) -> Omnigraph { + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, MOCK_SEARCH_SCHEMA).await.unwrap(); + load_jsonl(&mut db, &mock_embedding_seed_data(), LoadMode::Overwrite) + .await + .unwrap(); + db +} + +fn mock_embedding_seed_data() -> String { + [ + ("alpha-doc", "alpha guide", mock_embedding("alpha", 4)), + ("beta-doc", "beta guide", mock_embedding("beta", 4)), + ("gamma-doc", "gamma handbook", mock_embedding("gamma", 4)), + ] + .into_iter() + .map(|(slug, title, embedding)| { + format!( + r#"{{"type":"Doc","data":{{"slug":"{}","title":"{}","embedding":[{}]}}}}"#, + slug, + title, + format_vector(&embedding) + ) + }) + .collect::>() + .join("\n") +} + +fn format_vector(values: &[f32]) -> String { + values + .iter() + .map(|value| format!("{:.8}", value)) + .collect::>() + .join(", ") +} + +fn mock_embedding(input: &str, dim: usize) -> Vec { + let mut seed = fnv1a64(input.as_bytes()); + let mut out = Vec::with_capacity(dim); + for _ in 0..dim { + seed = xorshift64(seed); + let ratio = (seed as f64 / u64::MAX as f64) as f32; + out.push((ratio * 2.0) - 1.0); + } + normalize_vector(out) +} + +fn normalize_vector(mut values: Vec) -> Vec { + let norm = values + .iter() + .map(|value| (*value as f64) * (*value as f64)) + .sum::() + .sqrt() as f32; + if norm > f32::EPSILON { + for value in &mut values { + *value /= norm; + } + } + values +} + +fn fnv1a64(bytes: &[u8]) -> u64 { + let mut hash = 14695981039346656037u64; + for byte in bytes { + hash ^= *byte as u64; + hash = hash.wrapping_mul(1099511628211u64); + } + hash +} + +fn xorshift64(mut x: u64) -> u64 { + x ^= x << 13; + x ^= x >> 7; + x ^= x << 17; + x +} + +fn result_slugs(result: &QueryResult) -> Vec { + let batch = result.concat_batches().unwrap(); + let slugs = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + (0..slugs.len()) + .map(|index| slugs.value(index).to_string()) + .collect() +} + +async fn doc_user_index_count(db: &Omnigraph) -> 
usize { + let ds = snapshot_main(db) + .await + .unwrap() + .open("node:Doc") + .await + .unwrap(); + ds.load_indices() + .await + .unwrap() + .iter() + .filter(|idx| !is_system_index(idx)) + .count() +} + +struct EnvGuard { + saved: Vec<(&'static str, Option)>, +} + +impl EnvGuard { + fn set(vars: &[(&'static str, Option<&str>)]) -> Self { + let saved = vars + .iter() + .map(|(name, _)| (*name, env::var(name).ok())) + .collect::>(); + for (name, value) in vars { + unsafe { + match value { + Some(value) => env::set_var(name, value), + None => env::remove_var(name), + } + } + } + Self { saved } + } +} + +impl Drop for EnvGuard { + fn drop(&mut self) { + for (name, value) in self.saved.drain(..) { + unsafe { + match value { + Some(value) => env::set_var(name, value), + None => env::remove_var(name), + } + } + } + } +} + +// ─── Text search (match_tokens) ───────────────────────────────────────────── + +#[tokio::test] +#[serial] +async fn text_search_filters_results() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_search_db(&dir).await; + + // "Learning" appears in: ml-intro, dl-basics, rl-intro titles + let result = query_main( + &mut db, + SEARCH_QUERIES, + "text_search", + ¶ms(&[("$q", "Learning")]), + ) + .await + .unwrap(); + + assert!( + result.num_rows() > 0, + "expected at least 1 result for 'Learning'" + ); + let batch = result.concat_batches().unwrap(); + let slugs = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + let slug_values: Vec<&str> = (0..slugs.len()).map(|i| slugs.value(i)).collect(); + // Should contain ML and RL intro docs + assert!( + slug_values.contains(&"ml-intro") || slug_values.contains(&"rl-intro"), + "expected learning-related docs, got {:?}", + slug_values + ); +} + +#[tokio::test] +#[serial] +async fn text_search_no_results() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_search_db(&dir).await; + + let result = query_main( + &mut db, + SEARCH_QUERIES, + "text_search", + ¶ms(&[("$q", 
"xyznonexistent")]), + ) + .await + .unwrap(); + + assert_eq!(result.num_rows(), 0); +} + +// ─── Fuzzy search (match_tokens with fuzzy_max_edits) ─────────────────────── + +#[tokio::test] +#[serial] +async fn fuzzy_search_tolerates_typos() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_search_db(&dir).await; + + // "Introductio" (missing 'n') should fuzzy-match "Introduction" with max_edits=2 + let result = query_main( + &mut db, + SEARCH_QUERIES, + "fuzzy_search", + ¶ms(&[("$q", "Introductio")]), + ) + .await + .unwrap(); + + // Fuzzy matching may not work with the default tokenizer on all terms; + // at minimum verify it doesn't error + // If it returns results, great — it matched despite the typo + let _ = result.num_rows(); +} + +// ─── Phrase search (match_phrase) ─────────────────────────────────────────── + +#[tokio::test] +#[serial] +async fn phrase_search_matches_exact_phrase() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_search_db(&dir).await; + + // "neural networks" appears in dl-basics body + let result = query_main( + &mut db, + SEARCH_QUERIES, + "phrase_search", + ¶ms(&[("$q", "neural networks")]), + ) + .await + .unwrap(); + + assert!( + result.num_rows() > 0, + "expected match for 'neural networks'" + ); + let batch = result.concat_batches().unwrap(); + let slugs = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + let slug_values: Vec<&str> = (0..slugs.len()).map(|i| slugs.value(i)).collect(); + assert!( + slug_values.contains(&"dl-basics"), + "expected dl-basics for 'neural networks', got {:?}", + slug_values + ); +} + +#[tokio::test] +#[serial] +async fn phrase_search_is_documented_fts_fallback() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_search_db(&dir).await; + + let result = query_main( + &mut db, + SEARCH_QUERIES, + "phrase_search", + ¶ms(&[("$q", "networks layers")]), + ) + .await + .unwrap(); + + assert!( + result.num_rows() > 0, + "match_text fallback should still 
match FTS tokens" + ); + let batch = result.concat_batches().unwrap(); + let slugs = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + let slug_values: Vec<&str> = (0..slugs.len()).map(|i| slugs.value(i)).collect(); + assert!( + slug_values.contains(&"dl-basics"), + "expected FTS fallback to match dl-basics, got {:?}", + slug_values + ); +} + +// ─── Vector search (nearest) ──────────────────────────────────────────────── + +#[tokio::test] +#[serial] +async fn nearest_returns_k_closest() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_search_db(&dir).await; + + // Query vector [0.1, 0.2, 0.3, 0.4] is identical to ml-intro's embedding + let result = query_main( + &mut db, + SEARCH_QUERIES, + "vector_search", + &vector_param("$q", &[0.1, 0.2, 0.3, 0.4]), + ) + .await + .unwrap(); + + // limit 3 → should return exactly 3 + assert_eq!(result.num_rows(), 3); + + // ml-intro should be the closest (distance=0) + let batch = result.concat_batches().unwrap(); + let slugs = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(slugs.value(0), "ml-intro", "closest should be ml-intro"); +} + +#[tokio::test] +#[serial] +async fn nearest_string_param_matches_explicit_vector_under_mock_embeddings() { + let _guard = EnvGuard::set(&[ + ("OMNIGRAPH_EMBEDDINGS_MOCK", Some("1")), + ("GEMINI_API_KEY", None), + ]); + + let dir = tempfile::tempdir().unwrap(); + let mut db = init_mock_embedding_search_db(&dir).await; + + let explicit = query_main( + &mut db, + MOCK_SEARCH_QUERIES, + "vector_search_vector", + &vector_param("$q", &mock_embedding("alpha", 4)), + ) + .await + .unwrap(); + let embedded = query_main( + &mut db, + MOCK_SEARCH_QUERIES, + "vector_search_string", + ¶ms(&[("$q", "alpha")]), + ) + .await + .unwrap(); + + assert_eq!(result_slugs(&embedded), result_slugs(&explicit)); + assert_eq!(result_slugs(&embedded)[0], "alpha-doc"); +} + +#[tokio::test] +#[serial] +async fn nearest_string_literal_works_under_mock_embeddings() 
{ + let _guard = EnvGuard::set(&[ + ("OMNIGRAPH_EMBEDDINGS_MOCK", Some("1")), + ("GEMINI_API_KEY", None), + ]); + + let dir = tempfile::tempdir().unwrap(); + let mut db = init_mock_embedding_search_db(&dir).await; + + let result = query_main( + &mut db, + MOCK_SEARCH_QUERIES, + "vector_search_literal", + ¶ms(&[]), + ) + .await + .unwrap(); + + assert_eq!(result_slugs(&result)[0], "alpha-doc"); +} + +#[tokio::test] +#[serial] +async fn rrf_with_string_nearest_matches_explicit_vector_under_mock_embeddings() { + let _guard = EnvGuard::set(&[ + ("OMNIGRAPH_EMBEDDINGS_MOCK", Some("1")), + ("GEMINI_API_KEY", None), + ]); + + let dir = tempfile::tempdir().unwrap(); + let mut db = init_mock_embedding_search_db(&dir).await; + + let explicit = query_main( + &mut db, + MOCK_SEARCH_QUERIES, + "hybrid_search_vector", + &vector_and_string_params("$vq", &mock_embedding("alpha", 4), "$tq", "alpha"), + ) + .await + .unwrap(); + let embedded = query_main( + &mut db, + MOCK_SEARCH_QUERIES, + "hybrid_search_string", + ¶ms(&[("$vq", "alpha"), ("$tq", "alpha")]), + ) + .await + .unwrap(); + + assert_eq!(result_slugs(&embedded), result_slugs(&explicit)); + assert_eq!(result_slugs(&embedded)[0], "alpha-doc"); +} + +#[tokio::test] +#[serial] +async fn explicit_vector_nearest_does_not_require_gemini_credentials() { + let _guard = EnvGuard::set(&[ + ("OMNIGRAPH_EMBEDDINGS_MOCK", None), + ("GEMINI_API_KEY", None), + ]); + + let dir = tempfile::tempdir().unwrap(); + let mut db = init_mock_embedding_search_db(&dir).await; + + let result = query_main( + &mut db, + MOCK_SEARCH_QUERIES, + "vector_search_vector", + &vector_param("$q", &mock_embedding("alpha", 4)), + ) + .await + .unwrap(); + + assert_eq!(result_slugs(&result)[0], "alpha-doc"); +} + +#[tokio::test] +#[serial] +async fn string_nearest_requires_gemini_credentials_when_mock_is_disabled() { + let _guard = EnvGuard::set(&[ + ("OMNIGRAPH_EMBEDDINGS_MOCK", None), + ("GEMINI_API_KEY", None), + ]); + + let dir = tempfile::tempdir().unwrap(); 
+ let mut db = init_mock_embedding_search_db(&dir).await; + + let err = query_main( + &mut db, + MOCK_SEARCH_QUERIES, + "vector_search_string", + ¶ms(&[("$q", "alpha")]), + ) + .await + .unwrap_err(); + + assert!(err.to_string().contains("GEMINI_API_KEY")); +} + +// ─── BM25 search ──────────────────────────────────────────────────────────── + +#[tokio::test] +#[serial] +async fn bm25_returns_ranked_results() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_search_db(&dir).await; + + // "Learning" appears in multiple titles + let result = query_main( + &mut db, + SEARCH_QUERIES, + "bm25_search", + ¶ms(&[("$q", "Learning")]), + ) + .await + .unwrap(); + + assert!( + result.num_rows() > 0, + "bm25 should return results for 'Learning'" + ); + assert!(result.num_rows() <= 3, "bm25 should respect limit 3"); +} + +#[tokio::test] +#[serial] +async fn mutation_commit_refreshes_search_indices_without_manual_ensure() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_search_db(&dir).await; + assert_eq!(doc_user_index_count(&db).await, 4); + + let mut mutation_params = vector_param("$embedding", &[0.9, 0.1, 0.1, 0.1]); + mutation_params.insert( + "slug".to_string(), + Literal::String("quasar-notes".to_string()), + ); + mutation_params.insert( + "title".to_string(), + Literal::String("Quasar Notes".to_string()), + ); + mutation_params.insert( + "body".to_string(), + Literal::String("Quasar observations and telescope notes".to_string()), + ); + + db.mutate("main", SEARCH_MUTATIONS, "insert_doc", &mutation_params) + .await + .unwrap(); + + assert_eq!( + doc_user_index_count(&db).await, + 4, + "mutation commit should refresh required indices without duplicating them" + ); + + let result = query_main( + &mut db, + SEARCH_QUERIES, + "text_search", + ¶ms(&[("$q", "Quasar")]), + ) + .await + .unwrap(); + assert!( + result_slugs(&result).contains(&"quasar-notes".to_string()), + "newly inserted row should be searchable without an explicit ensure_indices 
step" + ); +} + +// ─── RRF hybrid search ───────────────────────────────────────────────────── + +#[tokio::test] +#[serial] +async fn rrf_fuses_vector_and_text() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_search_db(&dir).await; + + let result = query_main( + &mut db, + SEARCH_QUERIES, + "hybrid_search", + &vector_and_string_params("$vq", &[0.1, 0.2, 0.3, 0.4], "$tq", "Learning"), + ) + .await + .unwrap(); + + assert!(result.num_rows() > 0, "rrf should return results"); + assert!(result.num_rows() <= 3, "rrf should respect limit 3"); +} + +#[tokio::test] +#[serial] +async fn load_commit_creates_vector_index_for_vector_annotations() { + let schema = r#" +node Doc { + slug: String @key + embedding: Vector(4) @index +} +"#; + let data = r#"{"type": "Doc", "data": {"slug": "a", "embedding": [0.1, 0.2, 0.3, 0.4]}} +{"type": "Doc", "data": {"slug": "b", "embedding": [0.5, 0.6, 0.7, 0.8]}}"#; + + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, schema).await.unwrap(); + load_jsonl(&mut db, data, LoadMode::Overwrite) + .await + .unwrap(); + + let ds = snapshot_main(&db) + .await + .unwrap() + .open("node:Doc") + .await + .unwrap(); + let indices = ds.load_indices().await.unwrap(); + let user_indices: Vec<_> = indices.iter().filter(|idx| !is_system_index(idx)).collect(); + assert_eq!( + user_indices.len(), + 3, + "expected id BTree index plus key-property and vector indices" + ); +} + +#[tokio::test] +#[serial] +async fn load_commit_creates_inverted_indices_for_string_annotations() { + let dir = tempfile::tempdir().unwrap(); + let db = init_search_db(&dir).await; + + let ds = snapshot_main(&db) + .await + .unwrap() + .open("node:Doc") + .await + .unwrap(); + let indices = ds.load_indices().await.unwrap(); + let user_indices: Vec<_> = indices.iter().filter(|idx| !is_system_index(idx)).collect(); + assert_eq!( + user_indices.len(), + 4, + "expected id BTree index plus key-property and 
title/body inverted indices" + ); +} diff --git a/crates/omnigraph/tests/traversal.rs b/crates/omnigraph/tests/traversal.rs new file mode 100644 index 0000000..cc3228f --- /dev/null +++ b/crates/omnigraph/tests/traversal.rs @@ -0,0 +1,398 @@ +mod helpers; + +use arrow_array::{Array, Int32Array, StringArray}; + +use omnigraph::db::Omnigraph; +use omnigraph::loader::{LoadMode, load_jsonl}; +use omnigraph_compiler::ir::ParamMap; + +use helpers::*; + +// ─── Anti-join slow path (predicated negation) ────────────────────────────── + +#[tokio::test] +async fn anti_join_predicated_negation() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + + // "People who do NOT work at Acme" + // Inner pipeline: Expand(worksAt) + Filter(name="Acme") → 2 ops → slow path + let queries = r#" +query not_at_acme() { + match { + $p: Person + not { + $p worksAt $c + $c.name = "Acme" + } + } + return { $p.name } +} +"#; + // Test data: Alice→Acme, Bob→Globex. Charlie and Diana have no WorksAt. 
+ // Expected: everyone except Alice = {Bob, Charlie, Diana} + let result = query_main(&mut db, queries, "not_at_acme", &ParamMap::new()) + .await + .unwrap(); + + let batch = result.concat_batches().unwrap(); + let names = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + let mut names_vec: Vec<&str> = (0..names.len()).map(|i| names.value(i)).collect(); + names_vec.sort(); + assert_eq!(names_vec, vec!["Bob", "Charlie", "Diana"]); +} + +// ─── Variable-length hops ─────────────────────────────────────────────────── + +const CHAIN_SCHEMA: &str = r#" +node Person { name: String @key } +edge Knows: Person -> Person +"#; + +const CHAIN_DATA: &str = r#"{"type": "Person", "data": {"name": "A"}} +{"type": "Person", "data": {"name": "B"}} +{"type": "Person", "data": {"name": "C"}} +{"type": "Person", "data": {"name": "D"}} +{"edge": "Knows", "from": "A", "to": "B"} +{"edge": "Knows", "from": "B", "to": "C"} +{"edge": "Knows", "from": "C", "to": "D"} +"#; + +async fn init_chain(dir: &tempfile::TempDir) -> Omnigraph { + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, CHAIN_SCHEMA).await.unwrap(); + load_jsonl(&mut db, CHAIN_DATA, LoadMode::Overwrite) + .await + .unwrap(); + db +} + +#[tokio::test] +async fn variable_hops_1_to_3() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_chain(&dir).await; + + let queries = r#" +query reachable($name: String) { + match { + $p: Person { name: $name } + $p knows{1,3} $f + } + return { $f.name } +} +"#; + let result = query_main(&mut db, queries, "reachable", ¶ms(&[("$name", "A")])) + .await + .unwrap(); + + let batch = result.concat_batches().unwrap(); + let names = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + let mut names_vec: Vec<&str> = (0..names.len()).map(|i| names.value(i)).collect(); + names_vec.sort(); + // A→B (1 hop), A→B→C (2 hops), A→B→C→D (3 hops) + assert_eq!(names_vec, vec!["B", "C", "D"]); +} + +#[tokio::test] +async fn 
variable_hops_2_to_3() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_chain(&dir).await; + + let queries = r#" +query far_reachable($name: String) { + match { + $p: Person { name: $name } + $p knows{2,3} $f + } + return { $f.name } +} +"#; + let result = query_main( + &mut db, + queries, + "far_reachable", + ¶ms(&[("$name", "A")]), + ) + .await + .unwrap(); + + let batch = result.concat_batches().unwrap(); + let names = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + let mut names_vec: Vec<&str> = (0..names.len()).map(|i| names.value(i)).collect(); + names_vec.sort(); + // Skip 1-hop (B), keep 2-hop (C) and 3-hop (D) + assert_eq!(names_vec, vec!["C", "D"]); +} + +#[tokio::test] +async fn variable_hops_exact_2() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_chain(&dir).await; + + let queries = r#" +query exactly_2($name: String) { + match { + $p: Person { name: $name } + $p knows{2,2} $f + } + return { $f.name } +} +"#; + let result = query_main(&mut db, queries, "exactly_2", ¶ms(&[("$name", "A")])) + .await + .unwrap(); + + let batch = result.concat_batches().unwrap(); + let names = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + let mut names_vec: Vec<&str> = (0..names.len()).map(|i| names.value(i)).collect(); + names_vec.sort(); + // Exactly 2 hops from A: only C (A→B→C) + assert_eq!(names_vec, vec!["C"]); +} + +// ─── Ordering ASC ─────────────────────────────────────────────────────────── + +#[tokio::test] +async fn ordering_ascending() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + + let queries = r#" +query by_age_asc() { + match { $p: Person } + return { $p.name, $p.age } + order { $p.age asc } +} +"#; + let result = query_main(&mut db, queries, "by_age_asc", &ParamMap::new()) + .await + .unwrap(); + + let batch = &result.batches()[0]; + let names = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + let ages = batch + .column(1) + 
.as_any() + .downcast_ref::() + .unwrap(); + + // Bob(25), Diana(28), Alice(30), Charlie(35) — ascending by age + assert_eq!(batch.num_rows(), 4); + assert_eq!(ages.value(0), 25); + assert_eq!(ages.value(1), 28); + assert_eq!(ages.value(2), 30); + assert_eq!(ages.value(3), 35); + + assert_eq!(names.value(0), "Bob"); + assert_eq!(names.value(3), "Charlie"); +} + +// ─── Empty graph traversal ────────────────────────────────────────────────── + +#[tokio::test] +async fn traversal_no_edges_returns_empty() { + let dir = tempfile::tempdir().unwrap(); + let uri = dir.path().to_str().unwrap(); + let mut db = Omnigraph::init(uri, TEST_SCHEMA).await.unwrap(); + + // Load only nodes, no edges + let data = r#"{"type": "Person", "data": {"name": "Alice", "age": 30}} +{"type": "Person", "data": {"name": "Bob", "age": 25}} +{"type": "Company", "data": {"name": "Acme"}}"#; + load_jsonl(&mut db, data, LoadMode::Overwrite) + .await + .unwrap(); + + // Traversal should return empty, not crash + let result = query_main( + &mut db, + TEST_QUERIES, + "friends_of", + ¶ms(&[("$name", "Alice")]), + ) + .await + .unwrap(); + assert_eq!(result.num_rows(), 0); + + // Anti-join: everyone is "unemployed" since no WorksAt edges exist + let result = query_main(&mut db, TEST_QUERIES, "unemployed", &ParamMap::new()) + .await + .unwrap(); + let batch = result.concat_batches().unwrap(); + let names = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(names.len(), 2); // Alice and Bob +} + +// ─── Filter comparison operators ───────────────────────────────────────────── + +#[tokio::test] +async fn filter_less_than() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + + let queries = r#" +query young($age: I32) { + match { + $p: Person + $p.age < $age + } + return { $p.name, $p.age } + order { $p.age asc } +} +"#; + let result = query_main(&mut db, queries, "young", &int_params(&[("$age", 28)])) + .await + .unwrap(); + + // Only Bob (25) is 
< 28 + assert_eq!(result.num_rows(), 1); + let batch = &result.batches()[0]; + let names = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(names.value(0), "Bob"); +} + +#[tokio::test] +async fn filter_greater_equal() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + + let queries = r#" +query at_least_30() { + match { + $p: Person + $p.age >= 30 + } + return { $p.name } + order { $p.age asc } +} +"#; + let result = query_main(&mut db, queries, "at_least_30", &ParamMap::new()) + .await + .unwrap(); + + // Alice (30) and Charlie (35) + assert_eq!(result.num_rows(), 2); + let batch = &result.batches()[0]; + let names = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(names.value(0), "Alice"); + assert_eq!(names.value(1), "Charlie"); +} + +#[tokio::test] +async fn filter_less_equal() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + + let queries = r#" +query at_most_28() { + match { + $p: Person + $p.age <= 28 + } + return { $p.name } + order { $p.age asc } +} +"#; + let result = query_main(&mut db, queries, "at_most_28", &ParamMap::new()) + .await + .unwrap(); + + // Bob (25) and Diana (28) + assert_eq!(result.num_rows(), 2); + let batch = &result.batches()[0]; + let names = batch + .column(0) + .as_any() + .downcast_ref::() + .unwrap(); + assert_eq!(names.value(0), "Bob"); + assert_eq!(names.value(1), "Diana"); +} + +#[tokio::test] +async fn filter_not_equal() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + + let queries = r#" +query not_alice() { + match { + $p: Person + $p.name != "Alice" + } + return { $p.name } + order { $p.name asc } +} +"#; + let result = query_main(&mut db, queries, "not_alice", &ParamMap::new()) + .await + .unwrap(); + + // Bob, Charlie, Diana + assert_eq!(result.num_rows(), 3); + let batch = &result.batches()[0]; + let names = batch + .column(0) + .as_any() + 
.downcast_ref::() + .unwrap(); + let mut name_vec: Vec<&str> = (0..names.len()).map(|i| names.value(i)).collect(); + name_vec.sort(); + assert_eq!(name_vec, vec!["Bob", "Charlie", "Diana"]); +} + +// ─── Error paths ──────────────────────────────────────────────────────────── + +#[tokio::test] +async fn insert_missing_required_property_fails() { + let dir = tempfile::tempdir().unwrap(); + let mut db = init_and_load(&dir).await; + + // Insert Person with no name — name is @key, so this should fail + let queries = r#" +query insert_no_name($age: I32) { + insert Person { age: $age } +} +"#; + let result = mutate_main( + &mut db, + queries, + "insert_no_name", + &int_params(&[("$age", 25)]), + ) + .await; + + assert!(result.is_err(), "insert without @key property should fail"); +} diff --git a/docker/entrypoint.sh b/docker/entrypoint.sh new file mode 100644 index 0000000..83b7d34 --- /dev/null +++ b/docker/entrypoint.sh @@ -0,0 +1,32 @@ +#!/bin/sh +set -eu + +SERVER_BIN="/usr/local/bin/omnigraph-server" + +if [ "$#" -gt 0 ]; then + exec "$SERVER_BIN" "$@" +fi + +bind="${OMNIGRAPH_BIND:-0.0.0.0:8080}" + +if [ -n "${OMNIGRAPH_TARGET_URI:-}" ]; then + exec "$SERVER_BIN" "${OMNIGRAPH_TARGET_URI}" --bind "${bind}" +fi + +if [ -n "${OMNIGRAPH_CONFIG:-}" ]; then + if [ -n "${OMNIGRAPH_TARGET:-}" ]; then + exec "$SERVER_BIN" --config "${OMNIGRAPH_CONFIG}" --target "${OMNIGRAPH_TARGET}" --bind "${bind}" + fi + exec "$SERVER_BIN" --config "${OMNIGRAPH_CONFIG}" --bind "${bind}" +fi + +cat >&2 <<'EOF' +omnigraph-server container startup requires one of: + - OMNIGRAPH_TARGET_URI + - OMNIGRAPH_CONFIG + +Optional: + - OMNIGRAPH_BIND (default: 0.0.0.0:8080) + - OMNIGRAPH_TARGET (used with OMNIGRAPH_CONFIG) +EOF +exit 64 diff --git a/docs/cli.md b/docs/cli.md new file mode 100644 index 0000000..e15d6af --- /dev/null +++ b/docs/cli.md @@ -0,0 +1,89 @@ +# CLI Guide + +## Core Repo Flow + +```bash +omnigraph init --schema ./schema.pg ./repo.omni +omnigraph load --data ./data.jsonl --mode 
overwrite ./repo.omni +omnigraph snapshot ./repo.omni --branch main --json +omnigraph read ./repo.omni --query ./queries.gq --name get_person --params '{"name":"Alice"}' +omnigraph change ./repo.omni --query ./queries.gq --name insert_person --params '{"name":"Mina","age":28}' +``` + +## Branching And Reviewable Data Flows + +```bash +omnigraph branch create --uri ./repo.omni --from main feature-x +omnigraph branch list --uri ./repo.omni +omnigraph branch merge --uri ./repo.omni feature-x --into main + +omnigraph ingest --data ./batch.jsonl --branch review/import-2026-04-09 ./repo.omni +omnigraph export ./repo.omni --branch main --type Person > people.jsonl +omnigraph commit list ./repo.omni --branch main --json +omnigraph commit show --uri ./repo.omni --json +``` + +## Remote Server Mode + +Serve a repo: + +```bash +omnigraph-server ./repo.omni --bind 127.0.0.1:8080 +``` + +Read through the HTTP API: + +```bash +omnigraph read \ + --target http://127.0.0.1:8080 \ + --query ./queries.gq \ + --name get_person \ + --params '{"name":"Alice"}' +``` + +If the server requires auth, set `OMNIGRAPH_SERVER_BEARER_TOKEN` on the server +and configure the matching `bearer_token_env` in `omnigraph.yaml`. 
+ +## Runs, Policy, And Diagnostics + +```bash +omnigraph schema plan --schema ./next.pg ./repo.omni --json +omnigraph policy validate --config ./omnigraph.yaml +omnigraph policy test --config ./omnigraph.yaml +omnigraph policy explain --config ./omnigraph.yaml --actor act-alice --action read --branch main + +omnigraph run list ./repo.omni --json +omnigraph run show --uri ./repo.omni --json +omnigraph run publish --uri ./repo.omni --json +omnigraph run abort --uri ./repo.omni --json +``` + +## Config + +`omnigraph.yaml` lets the CLI and server share named targets, defaults, and +query roots: + +```yaml +targets: + local: + uri: ./demo.omni + dev: + uri: http://127.0.0.1:8080 + bearer_token_env: OMNIGRAPH_BEARER_TOKEN + +cli: + target: local + branch: main + +query: + roots: + - queries + - . +``` + +The config file can also define: + +- server bind defaults +- auth env files +- query aliases for common read and change commands +- `policy.file` for Cedar authorization rules diff --git a/docs/deployment.md b/docs/deployment.md new file mode 100644 index 0000000..73d82f2 --- /dev/null +++ b/docs/deployment.md @@ -0,0 +1,125 @@ +# Deployment + +This doc describes the public runtime contract for self-hosting Omnigraph. It +does not include environment-specific secrets, private infrastructure, or +internal deploy automation. + +## Runtime Modes + +Omnigraph supports two broad deployment shapes: + +- local directory repos +- `s3://` repos on AWS S3 or S3-compatible object stores + +The server binary and container image expose the same HTTP surface. 
+ +## Binary Deployment + +Build or install: + +- `omnigraph` +- `omnigraph-server` + +Run against a local repo: + +```bash +omnigraph-server ./repo.omni --bind 0.0.0.0:8080 +``` + +Run against an object-store-backed repo: + +```bash +OMNIGRAPH_SERVER_BEARER_TOKEN="change-me" \ +AWS_REGION="us-east-1" \ +omnigraph-server s3://my-bucket/repos/example/releases/2026-04-10-v0.1.0 \ + --bind 0.0.0.0:8080 +``` + +## One-Command Local RustFS Bootstrap + +The easiest local S3-backed deployment path is: + +```bash +curl -fsSL https://raw.githubusercontent.com/ModernRelay/omnigraph-public/main/scripts/local-rustfs-bootstrap.sh | bash +``` + +The bootstrap: + +- starts a local RustFS-backed object store +- creates a bucket and S3-backed Omnigraph repo +- loads the checked-in context fixture +- starts `omnigraph-server` on `127.0.0.1:8080` + +Supported behavior: + +- downloads a tagged release binary when one exists for the current platform +- otherwise clones `ModernRelay/omnigraph-public` and builds from source +- reuses an existing RustFS container if it is already running + +Useful overrides: + +- `WORKDIR=/path/to/state` +- `BUCKET=omnigraph-local` +- `PREFIX=repos/context` +- `BIND=127.0.0.1:8080` +- `RUSTFS_CONTAINER_NAME=omnigraph-rustfs-demo` + +The bootstrap expects: + +- Docker +- `curl` +- either a matching release asset or a local Rust toolchain plus `git` + +If `aws` is not installed, the script attempts a user-local AWS CLI install via +`python3 -m pip`. Docker Desktop or another Docker daemon must already be +running. + +## Container Deployment + +Build the image: + +```bash +docker build -t omnigraph-server:local . 
+``` + +Run against a local repo: + +```bash +docker run --rm -p 8080:8080 \ + -v "$PWD/repo.omni:/data/repo.omni" \ + omnigraph-server:local \ + /data/repo.omni --bind 0.0.0.0:8080 +``` + +Run against an S3-backed repo: + +```bash +docker run --rm -p 8080:8080 \ + -e OMNIGRAPH_SERVER_BEARER_TOKEN="change-me" \ + -e AWS_REGION="us-east-1" \ + omnigraph-server:local \ + s3://my-bucket/repos/example/releases/2026-04-10-v0.1.0 \ + --bind 0.0.0.0:8080 +``` + +## Auth + +The server can run unauthenticated for local development, but any shared or +internet-facing deployment should set: + +- `OMNIGRAPH_SERVER_BEARER_TOKEN` + +The health endpoint `/healthz` remains suitable for load balancer health checks. + +## S3-Compatible Storage + +For S3-compatible backends such as RustFS or MinIO, set the usual AWS SDK +environment variables: + +- `AWS_ACCESS_KEY_ID` +- `AWS_SECRET_ACCESS_KEY` +- `AWS_REGION` +- optional `AWS_ENDPOINT_URL` +- optional `AWS_ENDPOINT_URL_S3` +- optional `AWS_ALLOW_HTTP=true` +- optional `AWS_S3_FORCE_PATH_STYLE=true` diff --git a/docs/install.md b/docs/install.md new file mode 100644 index 0000000..841f526 --- /dev/null +++ b/docs/install.md @@ -0,0 +1,66 @@ +# Install + +## Quick Install + +```bash +curl -fsSL https://raw.githubusercontent.com/ModernRelay/omnigraph-public/main/scripts/install.sh | bash +``` + +By default the installer places: + +- `omnigraph` +- `omnigraph-server` + +in `~/.local/bin`. + +If a matching release asset exists for your platform, the installer downloads +and unpacks it. Otherwise it falls back to cloning `ModernRelay/omnigraph-public` +and building from source. 
+ +## Useful Overrides + +Install to a different directory: + +```bash +curl -fsSL https://raw.githubusercontent.com/ModernRelay/omnigraph-public/main/scripts/install.sh | INSTALL_DIR="$HOME/bin" bash +``` + +Force a source build even if a release asset exists: + +```bash +curl -fsSL https://raw.githubusercontent.com/ModernRelay/omnigraph-public/main/scripts/install.sh | FORCE_BUILD=1 bash +``` + +Build from a specific git ref: + +```bash +curl -fsSL https://raw.githubusercontent.com/ModernRelay/omnigraph-public/main/scripts/install.sh | SOURCE_REF=main bash +``` + +## Manual Source Build + +```bash +cargo build --release --locked -p omnigraph-cli -p omnigraph-server +install -m 0755 target/release/omnigraph ~/.local/bin/omnigraph +install -m 0755 target/release/omnigraph-server ~/.local/bin/omnigraph-server +``` + +## Release Assets + +Tagged releases are expected to publish: + +- `omnigraph-linux-x86_64.tar.gz` +- `omnigraph-macos-x86_64.tar.gz` +- `omnigraph-macos-arm64.tar.gz` + +Each archive contains both binaries: + +- `omnigraph` +- `omnigraph-server` + +## Verify The Install + +```bash +omnigraph version +omnigraph-server --help +``` diff --git a/omnigraph.example.yaml b/omnigraph.example.yaml new file mode 100644 index 0000000..f4317d6 --- /dev/null +++ b/omnigraph.example.yaml @@ -0,0 +1,15 @@ +targets: + local: + uri: ./repo.omni + dev: + uri: http://127.0.0.1:8080 + bearer_token_env: OMNIGRAPH_BEARER_TOKEN + +cli: + target: local + branch: main + +query: + roots: + - queries + - . 
diff --git a/rust-toolchain.toml b/rust-toolchain.toml new file mode 100644 index 0000000..2fc3eef --- /dev/null +++ b/rust-toolchain.toml @@ -0,0 +1,3 @@ +[toolchain] +channel = "stable" +profile = "minimal" diff --git a/scripts/install.sh b/scripts/install.sh new file mode 100755 index 0000000..cef9623 --- /dev/null +++ b/scripts/install.sh @@ -0,0 +1,164 @@ +#!/usr/bin/env bash +set -euo pipefail + +REPO_SLUG="${REPO_SLUG:-ModernRelay/omnigraph-public}" +SOURCE_REF="${SOURCE_REF:-main}" +INSTALL_DIR="${INSTALL_DIR:-$HOME/.local/bin}" +FORCE_BUILD="${FORCE_BUILD:-0}" +TMP_ROOT="${TMPDIR:-/tmp}" +WORKDIR="" + +log() { + printf '==> %s\n' "$*" +} + +die() { + printf 'error: %s\n' "$*" >&2 + exit 1 +} + +need_cmd() { + command -v "$1" >/dev/null 2>&1 || die "missing required command: $1" +} + +cleanup() { + if [ -n "${WORKDIR:-}" ] && [ -d "$WORKDIR" ]; then + rm -rf "$WORKDIR" + fi +} + +trap cleanup EXIT + +repo_root_from_shell() { + if [ -f "$PWD/Cargo.toml" ] && [ -d "$PWD/crates" ]; then + printf '%s\n' "$PWD" + return 0 + fi + + if [ -n "${BASH_SOURCE[0]:-}" ] && [ -f "${BASH_SOURCE[0]}" ]; then + local candidate + candidate="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." 
&& pwd)" + if [ -f "$candidate/Cargo.toml" ] && [ -d "$candidate/crates" ]; then + printf '%s\n' "$candidate" + return 0 + fi + fi + + return 1 +} + +latest_release_tag() { + local json + json="$(curl -fsSL "https://api.github.com/repos/$REPO_SLUG/releases/latest" 2>/dev/null || true)" + printf '%s' "$json" | sed -n 's/.*"tag_name":[[:space:]]*"\([^"]*\)".*/\1/p' | head -n 1 +} + +platform_asset_name() { + local os arch + os="$(uname -s)" + arch="$(uname -m)" + + case "$os/$arch" in + Linux/x86_64) + printf 'omnigraph-linux-x86_64.tar.gz\n' + ;; + Darwin/x86_64) + printf 'omnigraph-macos-x86_64.tar.gz\n' + ;; + Darwin/arm64) + printf 'omnigraph-macos-arm64.tar.gz\n' + ;; + *) + return 1 + ;; + esac +} + +install_from_dir() { + mkdir -p "$INSTALL_DIR" + install -m 0755 "$1/omnigraph" "$INSTALL_DIR/omnigraph" + install -m 0755 "$1/omnigraph-server" "$INSTALL_DIR/omnigraph-server" +} + +install_from_release() { + local tag asset archive + + [ "$FORCE_BUILD" = "1" ] && return 1 + + tag="$(latest_release_tag)" + [ -n "$tag" ] || return 1 + + asset="$(platform_asset_name)" || return 1 + WORKDIR="$(mktemp -d "$TMP_ROOT/omnigraph-install.XXXXXX")" + archive="$WORKDIR/$asset" + + log "Downloading $asset from $tag" + curl -fsSL \ + "https://github.com/$REPO_SLUG/releases/download/$tag/$asset" \ + -o "$archive" || return 1 + + tar -C "$WORKDIR" -xzf "$archive" || return 1 + install_from_dir "$WORKDIR" + return 0 +} + +build_from_source() { + local repo_root + repo_root="${1:-}" + + if [ -z "$repo_root" ]; then + need_cmd git + need_cmd cargo + + WORKDIR="$(mktemp -d "$TMP_ROOT/omnigraph-install.XXXXXX")" + repo_root="$WORKDIR/source" + log "Cloning $REPO_SLUG at $SOURCE_REF" + git clone --depth 1 --branch "$SOURCE_REF" "https://github.com/$REPO_SLUG.git" "$repo_root" + fi + + need_cmd cargo + log "Building omnigraph binaries from source" + ( + cd "$repo_root" + cargo build --release --locked -p omnigraph-cli -p omnigraph-server + ) + + install_from_dir 
"$repo_root/target/release" +} + +print_summary() { + cat < %s\n' "$*" +} + +die() { + printf 'error: %s\n' "$*" >&2 + exit 1 +} + +need_cmd() { + command -v "$1" >/dev/null 2>&1 || die "missing required command: $1" +} + +repo_root_from_shell() { + if [ -f "$PWD/Cargo.toml" ] && [ -f "$PWD/crates/omnigraph/tests/fixtures/context.pg" ]; then + printf '%s\n' "$PWD" + return 0 + fi + + if [ -n "${BASH_SOURCE[0]:-}" ] && [ -f "${BASH_SOURCE[0]}" ]; then + local candidate + candidate="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" + if [ -f "$candidate/Cargo.toml" ] && [ -f "$candidate/crates/omnigraph/tests/fixtures/context.pg" ]; then + printf '%s\n' "$candidate" + return 0 + fi + fi + + return 1 +} + +latest_release_tag() { + local json + json="$(curl -fsSL "https://api.github.com/repos/$REPO_SLUG/releases/latest" 2>/dev/null || true)" + printf '%s' "$json" | sed -n 's/.*"tag_name":[[:space:]]*"\([^"]*\)".*/\1/p' | head -n 1 +} + +platform_asset_name() { + local os arch + os="$(uname -s)" + arch="$(uname -m)" + + case "$os/$arch" in + Linux/x86_64) + printf 'omnigraph-linux-x86_64.tar.gz\n' + ;; + Darwin/x86_64) + printf 'omnigraph-macos-x86_64.tar.gz\n' + ;; + Darwin/arm64) + printf 'omnigraph-macos-arm64.tar.gz\n' + ;; + *) + return 1 + ;; + esac +} + +ensure_aws_cli() { + if command -v aws >/dev/null 2>&1; then + AWS_BIN="$(command -v aws)" + return + fi + + need_cmd python3 + + if ! 
python3 -m pip --version >/dev/null 2>&1; then + python3 -m ensurepip --upgrade --user >/dev/null 2>&1 || die "aws cli not found and python3 pip bootstrap failed" + fi + + log "Installing a user-local AWS CLI" + python3 -m pip install --user awscli >/dev/null + export PATH="$HOME/.local/bin:$PATH" + + command -v aws >/dev/null 2>&1 || die "aws cli installation succeeded but aws was not found on PATH" + AWS_BIN="$(command -v aws)" +} + +download_fixture_files() { + local ref="$1" + local fixture_target="$WORKDIR/fixtures" + mkdir -p "$fixture_target" + + for file in context.pg context.jsonl; do + curl -fsSL \ + "https://raw.githubusercontent.com/$REPO_SLUG/$ref/crates/omnigraph/tests/fixtures/$file" \ + -o "$fixture_target/$file" || return 1 + done + + FIXTURE_DIR="$fixture_target" +} + +download_release_binaries() { + local tag asset archive_dir archive_path + + [ "$FORCE_BUILD" = "1" ] && return 1 + + tag="$(latest_release_tag)" + [ -n "$tag" ] || return 1 + + asset="$(platform_asset_name)" || return 1 + archive_dir="$WORKDIR/release" + archive_path="$archive_dir/$asset" + mkdir -p "$archive_dir" "$WORKDIR/bin" + + log "Downloading release asset $asset from $tag" + curl -fsSL \ + "https://github.com/$REPO_SLUG/releases/download/$tag/$asset" \ + -o "$archive_path" || return 1 + tar -C "$WORKDIR/bin" -xzf "$archive_path" || return 1 + + BIN_DIR="$WORKDIR/bin" + download_fixture_files "$tag" || return 1 +} + +build_from_source() { + local repo_root + repo_root="${1:-}" + + if [ -z "$repo_root" ]; then + need_cmd git + need_cmd cargo + + repo_root="$WORKDIR/source" + if [ ! 
-d "$repo_root/.git" ]; then + log "Cloning $REPO_SLUG at $SOURCE_REF" + git clone --depth 1 --branch "$SOURCE_REF" "https://github.com/$REPO_SLUG.git" "$repo_root" + fi + fi + + need_cmd cargo + log "Building omnigraph binaries from source" + ( + cd "$repo_root" + cargo build --release --locked -p omnigraph-cli -p omnigraph-server + ) + + BIN_DIR="$repo_root/target/release" + FIXTURE_DIR="$repo_root/crates/omnigraph/tests/fixtures" +} + +setup_binaries() { + local repo_root + repo_root="$(repo_root_from_shell || true)" + + if [ -n "${OMNIGRAPH_BIN_DIR:-}" ]; then + BIN_DIR="$OMNIGRAPH_BIN_DIR" + if [ -n "${OMNIGRAPH_FIXTURE_DIR:-}" ]; then + FIXTURE_DIR="$OMNIGRAPH_FIXTURE_DIR" + elif [ -n "$repo_root" ]; then + FIXTURE_DIR="$repo_root/crates/omnigraph/tests/fixtures" + fi + elif [ -n "$repo_root" ]; then + build_from_source "$repo_root" + elif ! download_release_binaries; then + build_from_source + fi + + [ -x "$BIN_DIR/omnigraph" ] || die "omnigraph binary not found in $BIN_DIR" + [ -x "$BIN_DIR/omnigraph-server" ] || die "omnigraph-server binary not found in $BIN_DIR" + [ -f "$FIXTURE_DIR/context.pg" ] || die "context fixture schema not found in $FIXTURE_DIR" + [ -f "$FIXTURE_DIR/context.jsonl" ] || die "context fixture data not found in $FIXTURE_DIR" +} + +start_rustfs() { + mkdir -p "$RUSTFS_DATA_DIR" + + if docker ps --format '{{.Names}}' | grep -qx "$RUSTFS_CONTAINER_NAME"; then + log "Reusing existing RustFS container $RUSTFS_CONTAINER_NAME" + return + fi + + if docker ps -a --format '{{.Names}}' | grep -qx "$RUSTFS_CONTAINER_NAME"; then + log "Removing stopped RustFS container $RUSTFS_CONTAINER_NAME" + docker rm -f "$RUSTFS_CONTAINER_NAME" >/dev/null + fi + + log "Starting RustFS on $AWS_ENDPOINT_URL_S3" + docker run -d \ + --name "$RUSTFS_CONTAINER_NAME" \ + -p 9000:9000 \ + -p 9001:9001 \ + -v "$RUSTFS_DATA_DIR:/data" \ + -e RUSTFS_ACCESS_KEY="$AWS_ACCESS_KEY_ID" \ + -e RUSTFS_SECRET_KEY="$AWS_SECRET_ACCESS_KEY" \ + "$RUSTFS_IMAGE" \ + /data >/dev/null 
+} + +wait_for_rustfs() { + local attempt + for attempt in $(seq 1 30); do + if "$AWS_BIN" --endpoint-url "$AWS_ENDPOINT_URL_S3" s3api list-buckets >/dev/null 2>&1; then + return + fi + sleep 2 + done + + docker logs "$RUSTFS_CONTAINER_NAME" || true + die "RustFS did not become ready" +} + +ensure_bucket() { + log "Ensuring bucket $BUCKET exists" + "$AWS_BIN" --endpoint-url "$AWS_ENDPOINT_URL_S3" \ + s3api create-bucket --bucket "$BUCKET" >/dev/null 2>&1 || true +} + +initialize_repo() { + if "$BIN_DIR/omnigraph" snapshot "$REPO_URI" --json >/dev/null 2>&1; then + log "Reusing existing repo at $REPO_URI" + return + fi + + log "Initializing repo at $REPO_URI" + "$BIN_DIR/omnigraph" init --schema "$FIXTURE_DIR/context.pg" "$REPO_URI" + + log "Loading context fixture into $REPO_URI" + "$BIN_DIR/omnigraph" load --data "$FIXTURE_DIR/context.jsonl" "$REPO_URI" +} + +start_server() { + mkdir -p "$WORKDIR" + + if [ -f "$SERVER_PID_FILE" ] && kill -0 "$(cat "$SERVER_PID_FILE")" >/dev/null 2>&1; then + log "Stopping existing server process $(cat "$SERVER_PID_FILE")" + kill "$(cat "$SERVER_PID_FILE")" >/dev/null 2>&1 || true + sleep 1 + fi + + log "Starting omnigraph-server on $BIND" + nohup "$BIN_DIR/omnigraph-server" "$REPO_URI" --bind "$BIND" >"$SERVER_LOG" 2>&1 & + echo "$!" 
> "$SERVER_PID_FILE" +} + +wait_for_server() { + local bind_host bind_port health_host base_url + bind_host="${BIND%:*}" + bind_port="${BIND##*:}" + health_host="$bind_host" + if [ "$health_host" = "0.0.0.0" ]; then + health_host="127.0.0.1" + fi + base_url="http://$health_host:$bind_port" + + for _ in $(seq 1 30); do + if curl -fsSL "$base_url/healthz" >/dev/null 2>&1; then + printf '%s\n' "$base_url" + return + fi + sleep 1 + done + + cat "$SERVER_LOG" >&2 || true + die "omnigraph-server did not pass /healthz" +} + +print_summary() { + local base_url="$1" + + cat </dev/null 2>&1 || die "docker is installed but the daemon is not reachable; start Docker Desktop or another daemon and rerun" + + export AWS_ACCESS_KEY_ID + export AWS_SECRET_ACCESS_KEY + export AWS_REGION + export AWS_ENDPOINT_URL + export AWS_ENDPOINT_URL_S3 + export AWS_ALLOW_HTTP + export AWS_S3_FORCE_PATH_STYLE + + mkdir -p "$WORKDIR" + + setup_binaries + ensure_aws_cli + start_rustfs + wait_for_rustfs + ensure_bucket + initialize_repo + start_server + print_summary "$(wait_for_server)" +} + +main "$@"