# omnigraph/.github/workflows/ci.yml

name: CI

# Triggers: every pull request, pushes to `main`, pushes of `v*` tags, and
# manual runs from the Actions UI.
on:
  pull_request:
  push:
    branches:
      - main
    tags:
      - "v*"
  workflow_dispatch:

# Superseded runs are cancelled for pull requests only. Pull-request runs
# share one group per PR ref, so a new push to the PR cancels the stale run.
# Every other event (push to `main`, tags, workflow_dispatch) gets a group
# unique to the run, so it is never cancelled or displaced from the queue.
# This matters because `classify_changes` diffs only
# `github.event.before..sha` on push: if the run for a code commit were
# cancelled by a following text-only push, that later run would classify
# itself as text-only and the code commit would never get the heavy jobs.
concurrency:
  group: ci-${{ github.workflow }}-${{ github.event_name == 'pull_request' && github.ref || github.run_id }}
  cancel-in-progress: true

jobs:
  # Inspects the changed paths of the triggering event and publishes two
  # string outputs ('true' / 'false') that downstream jobs gate on:
  #   run_full_ci   - at least one changed path is not a text/doc file
  #   run_rustfs_ci - the RustFS S3 integration job should run
  # Every "cannot tell" situation below resolves to 'true' for both outputs.
  classify_changes:
    name: Classify Changes
    runs-on: ubuntu-latest
    permissions:
      contents: read
    outputs:
      run_full_ci: ${{ steps.filter.outputs.run_full_ci }}
      run_rustfs_ci: ${{ steps.filter.outputs.run_rustfs_ci }}
    steps:
      - name: Checkout source
        uses: actions/checkout@v5.0.1
        with:
          # Full history, so the base/head commits diffed below are
          # available locally.
          fetch-depth: 0

      - name: Detect text-only changes
        id: filter
        # Event data is passed through env vars rather than interpolated
        # into the script text.
        env:
          BEFORE_SHA: ${{ github.event.before }}
          EVENT_NAME: ${{ github.event_name }}
          PR_BASE_SHA: ${{ github.event.pull_request.base.sha }}
          PR_HEAD_SHA: ${{ github.event.pull_request.head.sha }}
          REF_TYPE: ${{ github.ref_type }}
        run: |
          set -euo pipefail

          # Manual runs and tag pushes always run everything.
          if [[ "$EVENT_NAME" == "workflow_dispatch" || "$REF_TYPE" == "tag" ]]; then
            echo "run_full_ci=true" >> "$GITHUB_OUTPUT"
            echo "run_rustfs_ci=true" >> "$GITHUB_OUTPUT"
            exit 0
          fi

          # Pick the two commits to diff: PR base/head for pull requests,
          # otherwise the push's before/after pair.
          if [[ "$EVENT_NAME" == "pull_request" ]]; then
            base="$PR_BASE_SHA"
            head="$PR_HEAD_SHA"
          else
            base="$BEFORE_SHA"
            head="$GITHUB_SHA"
            # An all-zero "before" SHA is replaced by head's first parent.
            # If that cannot be resolved, base ends up empty and the guard
            # below runs everything.
            if [[ "$base" == "0000000000000000000000000000000000000000" ]]; then
              base="$(git rev-parse "${head}^" 2>/dev/null || true)"
            fi
          fi

          # No usable base commit: run everything.
          if [[ -z "${base:-}" ]]; then
            echo "run_full_ci=true" >> "$GITHUB_OUTPUT"
            echo "run_rustfs_ci=true" >> "$GITHUB_OUTPUT"
            exit 0
          fi

          # Empty change list: run everything. A failing `git diff` also
          # lands here, because the exit status of a process substitution
          # is not checked and `changed` is simply left empty.
          mapfile -t changed < <(git diff --name-only "$base" "$head")
          if [[ "${#changed[@]}" -eq 0 ]]; then
            echo "run_full_ci=true" >> "$GITHUB_OUTPUT"
            echo "run_rustfs_ci=true" >> "$GITHUB_OUTPUT"
            exit 0
          fi

          run_full_ci=false
          run_rustfs_ci=false
          for path in "${changed[@]}"; do
            # Any path without one of these text extensions enables full CI.
            case "$path" in
              *.md|*.mdx|*.txt|*.rst|*.adoc) ;;
              *)
                run_full_ci=true
                ;;
            esac

            # On non-PR events every changed path enables RustFS CI,
            # including text-only ones; the path list below is PR-only.
            if [[ "$EVENT_NAME" != "pull_request" ]]; then
              run_rustfs_ci=true
              continue
            fi

            # Pull requests: only the paths listed here enable RustFS CI.
            # NOTE(review): rustfs_integration also `needs: test`, and `test`
            # does not run on pull_request, so this list currently has no
            # effect on PR runs - confirm whether it should be kept.
            case "$path" in
              .github/workflows/ci.yml|Cargo.toml|Cargo.lock|crates/*/Cargo.toml) run_rustfs_ci=true ;;
              crates/omnigraph/src/storage.rs) run_rustfs_ci=true ;;
              crates/omnigraph/src/db/manifest.rs|crates/omnigraph/src/db/manifest/*) run_rustfs_ci=true ;;
              crates/omnigraph/tests/s3_storage.rs|crates/omnigraph/tests/write_cost_s3.rs|crates/omnigraph/tests/helpers/*) run_rustfs_ci=true ;;
              crates/omnigraph/src/table_store.rs|crates/omnigraph/src/instrumentation.rs) run_rustfs_ci=true ;;
              crates/omnigraph-cluster/src/store.rs|crates/omnigraph-cluster/src/serve.rs) run_rustfs_ci=true ;;
              crates/omnigraph-cluster/tests/s3_cluster.rs) run_rustfs_ci=true ;;
              crates/omnigraph-server/tests/s3.rs|crates/omnigraph-server/tests/support/*) run_rustfs_ci=true ;;
              crates/omnigraph-cli/tests/system_local.rs) run_rustfs_ci=true ;;
            esac
          done

          # Log the inspected paths, then publish the final classification.
          printf 'Changed files:\n'
          printf '  %s\n' "${changed[@]}"
          echo "run_full_ci=$run_full_ci" >> "$GITHUB_OUTPUT"
          echo "run_rustfs_ci=$run_rustfs_ci" >> "$GITHUB_OUTPUT"

  # Runs scripts/check-agents-md.sh against a fresh checkout. The job has no
  # `needs:` and no `if:`, so it runs on every trigger of this workflow and
  # is not affected by the text-only classification.
  check_agents_md:
    name: Check AGENTS.md Links
    runs-on: ubuntu-latest
    permissions:
      contents: read
    steps:
      - name: Checkout source
        uses: actions/checkout@v5.0.1

      - name: Verify AGENTS.md ↔ docs/ cross-links
        run: bash scripts/check-agents-md.sh

  # Runs docker/entrypoint_test.sh with `sh` against a fresh checkout. Like
  # check_agents_md, it has no `needs:` or `if:`, so it runs on every
  # trigger of this workflow.
  entrypoint_test:
    name: Container Entrypoint
    runs-on: ubuntu-latest
    permissions:
      contents: read
    steps:
      - name: Checkout source
        uses: actions/checkout@v5.0.1

      - name: Verify omnigraph-server entrypoint arg composition
        run: sh docker/entrypoint_test.sh

  test:
    name: Test Workspace
    needs: classify_changes
    # PR latency: the full workspace + failpoints build/test is the slowest
    # gate (~15min warm, up to the 75min ceiling cold) and dominated PR
    # turnaround. It now runs only on push to `main` (post-merge), on tags,
    # and on manual `workflow_dispatch` — NOT on pull_request. Trade-off
    # accepted deliberately: a regression is caught on the `main` run after
    # merge rather than before it, so `main` can briefly go red. Mitigations:
    # (1) `Test Workspace` is removed from required PR checks in
    #     `.github/branch-protection.json` (a required check that never
    #     reports would leave every PR permanently pending);
    # (2) run the full suite locally before merging risky changes
    #     (`cargo test --workspace --locked`), or trigger this workflow via
    #     the Actions "Run workflow" button (workflow_dispatch) on your branch;
    # (3) openapi.json is no longer auto-regenerated on PRs (that step lived
    #     here) — regenerate it locally for server/API changes
    #     (`OMNIGRAPH_UPDATE_OPENAPI=1 cargo test -p omnigraph-server --test openapi`)
    #     or the strict drift check fails the post-merge `main` run.
    if: github.event_name != 'pull_request'
    runs-on: ubuntu-latest
    # 75, not 45: a cold rust-cache (every Cargo.lock change) costs a full
    # workspace + failpoints-feature build on a 2-core runner, which now
    # exceeds 45 minutes on slow runner days. A timed-out run never SAVES
    # its cache, so an undersized budget self-perpetuates: every retry
    # starts cold and dies the same way (observed 2026-06-11, four runs).
    # Warm-cache runs stay ~15 minutes; this is headroom, not a target.
    timeout-minutes: 75
    # Read-only token. The only step that needed `contents: write` pushed a
    # regenerated openapi.json to PR branches; it was gated on
    # `github.event_name == 'pull_request'` and so could never run under the
    # job-level `if` above. It has been removed.
    permissions:
      contents: read
    env:
      CARGO_TERM_COLOR: always
    steps:
      # The job itself always runs (and succeeds) on non-PR events so that
      # jobs which `needs: test` are not skipped; for text-only changes every
      # heavy step below is skipped individually instead.
      - name: Skip heavy CI for text-only changes
        if: needs.classify_changes.outputs.run_full_ci != 'true'
        run: echo "Text-only change detected; skipping workspace test run."

      # Default checkout of the commit that triggered the run (push to
      # `main`, tag, or the ref selected for workflow_dispatch).
      - name: Checkout source
        if: needs.classify_changes.outputs.run_full_ci == 'true'
        uses: actions/checkout@v5.0.1

      - name: Install system dependencies
        if: needs.classify_changes.outputs.run_full_ci == 'true'
        run: |
          sudo apt-get update
          sudo apt-get install -y protobuf-compiler libprotobuf-dev

      - name: Install Rust stable
        if: needs.classify_changes.outputs.run_full_ci == 'true'
        uses: dtolnay/rust-toolchain@stable
        with:
          toolchain: stable

      - name: Cache Rust build data
        if: needs.classify_changes.outputs.run_full_ci == 'true'
        uses: Swatinem/rust-cache@v2
        with:
          workspaces: |
            . -> target

      - name: Run workspace tests
        if: needs.classify_changes.outputs.run_full_ci == 'true'
        # Explicitly empty: this job never runs on pull_request, so the
        # openapi drift test always runs in strict-check mode here. The
        # previous PR-conditional expression evaluated to '' on every event
        # this job accepts, so the effective value is unchanged.
        env:
          OMNIGRAPH_UPDATE_OPENAPI: ""
        run: cargo test --workspace --locked

      - name: Run failpoints feature tests
        if: needs.classify_changes.outputs.run_full_ci == 'true'
        # Run after the workspace test so the build cache is warm —
        # enabling --features failpoints is just an incremental rebuild
        # of the target crate + the small `fail` crate, not the full
        # dep tree (lance, datafusion). A separate job with its own
        # cache key would be a fresh ~20min build on first run; this
        # is ~30s on a warm cache. The cluster feature does not enable
        # omnigraph/failpoints, so each line rebuilds only its crate.
        run: |
          cargo test --locked -p omnigraph-engine --features failpoints --test failpoints
          cargo test --locked -p omnigraph-cluster --features failpoints --test failpoints

  # Builds and then tests omnigraph-server with the `aws` cargo feature
  # enabled. Unlike `test`, this job has no job-level `if`, so it also runs
  # on pull requests; for text-only changes each heavy step is skipped via
  # its own `if` and the job still completes successfully.
  test_aws_feature:
    name: Test omnigraph-server --features aws
    needs: classify_changes
    runs-on: ubuntu-latest
    timeout-minutes: 30
    permissions:
      contents: read
    env:
      CARGO_TERM_COLOR: always
    steps:
      - name: Skip for text-only changes
        if: needs.classify_changes.outputs.run_full_ci != 'true'
        run: echo "Text-only change detected; skipping aws feature build."

      - name: Checkout source
        if: needs.classify_changes.outputs.run_full_ci == 'true'
        uses: actions/checkout@v5.0.1

      - name: Install system dependencies
        if: needs.classify_changes.outputs.run_full_ci == 'true'
        run: |
          sudo apt-get update
          sudo apt-get install -y protobuf-compiler libprotobuf-dev

      - name: Install Rust stable
        if: needs.classify_changes.outputs.run_full_ci == 'true'
        uses: dtolnay/rust-toolchain@stable
        with:
          toolchain: stable

      - name: Cache Rust build data
        if: needs.classify_changes.outputs.run_full_ci == 'true'
        uses: Swatinem/rust-cache@v2
        with:
          workspaces: |
            . -> target
          # Extra cache-key component for this job (see the rust-cache
          # `key` input); the other jobs in this file do not set one.
          key: aws-feature

      # Plain build first, then the test build and run, both with the same
      # feature flag.
      - name: Build omnigraph-server with aws feature
        if: needs.classify_changes.outputs.run_full_ci == 'true'
        run: cargo build --locked -p omnigraph-server --features aws

      - name: Test omnigraph-server with aws feature
        if: needs.classify_changes.outputs.run_full_ci == 'true'
        run: cargo test --locked -p omnigraph-server --features aws

  # Runs the S3-backed test targets against a local RustFS container
  # listening on 127.0.0.1:9000. The job-level env points the AWS-style
  # client settings at that container and names the test bucket/prefix.
  rustfs_integration:
    name: RustFS S3 Integration
    # `needs: test` means this is push-/dispatch-only too: on pull_request the
    # `test` job is skipped, so this dependent is skipped with it. S3
    # integration runs post-merge on `main`, alongside the workspace suite.
    needs:
      - classify_changes
      - test
    if: needs.classify_changes.outputs.run_rustfs_ci == 'true'
    runs-on: ubuntu-latest
    timeout-minutes: 75
    permissions:
      contents: read
    env:
      AWS_ACCESS_KEY_ID: rustfsadmin
      AWS_SECRET_ACCESS_KEY: rustfsadmin
      AWS_REGION: us-east-1
      AWS_ENDPOINT_URL: http://127.0.0.1:9000
      AWS_ENDPOINT_URL_S3: http://127.0.0.1:9000
      AWS_ALLOW_HTTP: "true"
      AWS_S3_FORCE_PATH_STYLE: "true"
      OMNIGRAPH_S3_TEST_BUCKET: omnigraph-ci
      OMNIGRAPH_S3_TEST_PREFIX: github-actions
      CARGO_TERM_COLOR: always
    steps:
      - name: Checkout source
        uses: actions/checkout@v5.0.1

      - name: Install system dependencies
        run: |
          sudo apt-get update
          sudo apt-get install -y protobuf-compiler libprotobuf-dev python3-pip

      - name: Install Rust stable
        uses: dtolnay/rust-toolchain@stable
        with:
          toolchain: stable

      - name: Cache Rust build data
        uses: Swatinem/rust-cache@v2
        with:
          workspaces: |
            . -> target

      - name: Start RustFS
        # Pinned to 1.0.0-beta.8 (2026-06-10). beta.4+ refuses "default"
        # credentials (rustfsadmin/rustfsadmin) unless
        # RUSTFS_ALLOW_INSECURE_DEFAULT_CREDENTIALS=true is set — fine for
        # an ephemeral CI container. The three S3 suites were validated
        # against the beta.8 binary locally before this bump. Keep the pin
        # explicit (never `latest`) so upgrades are deliberate.
        run: |
          docker rm -f rustfs >/dev/null 2>&1 || true
          docker run -d \
            --name rustfs \
            -p 9000:9000 \
            -p 9001:9001 \
            -e RUSTFS_ACCESS_KEY="${AWS_ACCESS_KEY_ID}" \
            -e RUSTFS_SECRET_KEY="${AWS_SECRET_ACCESS_KEY}" \
            -e RUSTFS_ALLOW_INSECURE_DEFAULT_CREDENTIALS=true \
            rustfs/rustfs:1.0.0-beta.8 \
            /data

      - name: Install AWS CLI
        run: |
          python3 -m pip install --user awscli
          echo "$HOME/.local/bin" >> "$GITHUB_PATH"

      - name: Create RustFS test bucket
        # Fail here, with a clear message, if RustFS never answers or the
        # bucket is not usable. Otherwise the problem would only surface
        # later as unrelated-looking test failures.
        run: |
          # Poll until the S3 endpoint answers (up to 30 attempts, 2s apart).
          ready=false
          for _ in $(seq 1 30); do
            if aws --endpoint-url "${AWS_ENDPOINT_URL_S3}" s3api list-buckets >/dev/null 2>&1; then
              ready=true
              break
            fi
            sleep 2
          done
          if [ "$ready" != "true" ]; then
            echo "::error::RustFS did not answer list-buckets after 30 attempts"
            exit 1
          fi

          # create-bucket stays best-effort (it fails if the bucket already
          # exists); the head-bucket call below is the actual gate.
          aws --endpoint-url "${AWS_ENDPOINT_URL_S3}" \
            s3api create-bucket \
            --bucket "${OMNIGRAPH_S3_TEST_BUCKET}" >/dev/null 2>&1 || true
          aws --endpoint-url "${AWS_ENDPOINT_URL_S3}" \
            s3api head-bucket \
            --bucket "${OMNIGRAPH_S3_TEST_BUCKET}" \
            || { echo "::error::test bucket '${OMNIGRAPH_S3_TEST_BUCKET}' is not available on RustFS"; exit 1; }

      - name: Run RustFS storage tests
        run: cargo test --locked -p omnigraph-engine --test s3_storage -- --nocapture

      - name: Run RustFS write-path cost gate (RFC-013 step 3a opener)
        run: cargo test --locked -p omnigraph-engine --test write_cost_s3 -- --nocapture

      - name: Run RustFS server smoke
        # No name filter: every test in the s3 target is bucket-gated, and a
        # filter matching nothing passes vacuously (which silently ran zero
        # tests here for a while — the old filter said s3_repo, the test
        # said s3_graph).
        run: cargo test --locked -p omnigraph-server --test s3 -- --nocapture

      - name: Run RustFS cluster e2e
        run: cargo test --locked -p omnigraph-cluster --test s3_cluster -- --nocapture

      - name: Run RustFS CLI smoke
        run: cargo test --locked -p omnigraph-cli --test system_local local_cli_s3_end_to_end_init_load_read_flow -- --nocapture

      - name: Run RustFS recovery-sidecar lifecycle
        # Sidecar put/list/delete through the S3 storage backend on a
        # real bucket (the failpoint only wedges the publisher; the
        # sidecar I/O is exercised for real). Name filter `s3_` matches
        # the bucket-gated tests in the failpoints target only; the
        # grep guards against the filter going vacuous (cargo passes
        # with 0 tests matched) if those tests are ever renamed.
        #
        # Output is streamed through `tee` into a log file instead of being
        # captured with `output=$(...)`: the step shell runs with `-e`, so a
        # failing command substitution aborted the script before the
        # captured output was ever printed. With `pipefail`, a cargo failure
        # still fails the step, after its output has been shown.
        run: |
          set -o pipefail
          log="$(mktemp)"
          cargo test --locked -p omnigraph-engine --features failpoints --test failpoints s3_ -- --nocapture 2>&1 | tee "$log"
          grep -Eq "test result: ok\. [1-9][0-9]* passed" "$log" \
            || { echo "::error::filter 's3_' matched no tests — vacuous pass"; exit 1; }

      - name: Dump RustFS logs on failure
        if: failure()
        run: docker logs rustfs