diff --git a/research/lance-autoresearch/Cargo.toml b/research/lance-autoresearch/Cargo.toml index ba47235..96d72d4 100644 --- a/research/lance-autoresearch/Cargo.toml +++ b/research/lance-autoresearch/Cargo.toml @@ -1,3 +1,9 @@ +# Empty `[workspace]` section so cargo treats this directory as its own +# workspace root and does NOT walk up to the parent omnigraph workspace. +# Without this, cargo from inside `research/lance-autoresearch/` will try to +# resolve omnigraph's dependencies even though we're excluded as a member. +[workspace] + [package] name = "lance-autoresearch" version = "0.1.0" diff --git a/research/lance-autoresearch/README.md b/research/lance-autoresearch/README.md index a0e9ba8..c2573fd 100644 --- a/research/lance-autoresearch/README.md +++ b/research/lance-autoresearch/README.md @@ -7,18 +7,34 @@ Modeled on Andrej Karpathy's [`nanochat-research`](https://x.com/karpathy/status/1855651423497650238) three-file contract: -- **Immutable bench** — `src/bin/run_experiment.rs` + `src/fixture.rs` + `src/reference.rs`. - The agent cannot touch these. -- **Mutable kernel** — `src/kernels.rs`. The agent's playground. Starts as a clean - scalar PQ L2 implementation matching Lance's algorithm; the agent's job is to +- **Immutable bench** — `src/bin/run_experiment.rs` + `src/inputs.rs` + + `src/reference.rs`. The agent cannot touch these. +- **Mutable kernel** — `src/kernels.rs`. The agent's playground. Starts as a + scalar baseline matching Lance's PQ L2 algorithm shape; the agent's job is to beat it. -- **Human-iterated program** — `program.md`. The "skill" the agent reads at the - start of every session. The human refines it between runs. +- **Human-iterated program** — `program.md`. The "skill" the agent reads at + the start of every session. The human refines it between runs. -The optimization target is the PQ L2 distance kernel for f32 dense vectors on -SIFT1M-shaped data (128-d, 16 sub-vectors × 256 centroids, 8-bit codes, top-10 -retrieval). The eval oracle is **recall@10 against SIFT1M's published ground -truth** at fixed kernel shape, with `geomean_ns_per_query` as the speed metric. +## Dataset-independent by design + +Every other ANN benchmark you've seen is "compete on this fixed dataset" +(SIFT1M, GIST1M, DEEP1B). That conflates two things: *kernel correctness* +(the math) and *kernel speed under one specific data distribution*. An LLM +agent given recall@K as the oracle has incentive to overfit to the dataset's +quirks. + +We split them: + +- **Correctness** = bit-equivalent (`max_abs_err ≤ 1e-4`) match to a scalar + reference kernel, on diverse generated inputs (Gaussian, uniform, sparse, + large-dynamic-range, mostly-zero) × multiple PQ shapes. This is mathematical + equivalence; there's no dataset to overfit. Lossy techniques fail this gate. +- **Speed** = geomean ns/query across multiple PQ shapes × + multiple data distributions. A kernel that wins on one distribution and + regresses on another fails the worst-case guard. + +By construction, an "improvement" generalizes across distributions and shapes. +There is no `wget sift.tar.gz` step; the harness is fully self-contained. ## Why a separate repo @@ -27,21 +43,14 @@ version and consumes its kernels via the public crate API. Improvements live one layer below: in Lance itself. A standalone repo with no OmniGraph dep keeps the optimization target pure (only the kernel changes), keeps the license clean for upstream contribution (dual MIT/Apache-2.0 → Apache-2.0 PRs to Lance), and -keeps the agent's working set tiny (~600 lines). +keeps the agent's working set tiny. ## Quick start ```bash -# 1. (optional but recommended) Download SIFT1M + train + freeze the PQ codebook. -# Takes ~5–10 min; ~250 MB on disk. Skipping it falls back to a synthetic -# deterministic dataset (1024 base / 64 queries) — useful for smoke-testing -# the harness but not representative of real workloads. -bash scripts/prepare_fixtures.sh - -# 2. Run the baseline. cargo run --release --bin run_experiment -# 3. Or run with Claude Code / Codex: +# Or run with Claude Code / Codex: # Open the repo in your agent of choice and prompt: # Hi, have a look at program.md and let's kick off a new experiment. ``` @@ -53,43 +62,51 @@ cargo run --release --bin run_experiment | `src/kernels.rs` | **mutable** | the agent | | `src/bin/run_experiment.rs` | immutable | — | | `src/reference.rs` | immutable | — | -| `src/fixture.rs` | immutable | — | +| `src/inputs.rs` | immutable | — | +| `src/lib.rs` | immutable (shared types) | — | | `benches/pq_l2.rs` | immutable | — | -| `scripts/prepare_fixtures.sh` | immutable | — | | `program.md` | human-iterated | the human, between runs | | `results.tsv` | append-only | the agent, per trial (gitignored) | ## The metric -`run_experiment` prints a fixed-format block: +`run_experiment` runs two phases per trial: a correctness check and a +multi-shape × multi-distribution speed measurement. Output looks like: ``` +correctness: pass --- -source: sift1m -num_base: 1000000 -num_queries: 1000 -recall_at_10: 0.9421 -geomean_ns_per_query: 184273 -peak_mem_mb: 42.1 -total_seconds: 21.7 +correctness: pass +shapes_tested: (128,16,256) (256,16,256) (768,96,256) +distributions_tested: clustered uniform sparse +geomean_ns_per_query: 18234 +worst_ns_per_query: 24515 ((768,96,256), sparse) +best_ns_per_query: 12876 ((128,16,256), clustered) +per_combo_geomean_ns: + (128,16,256) clustered -> 12876 ns + (128,16,256) uniform -> 13441 ns + ... +peak_mem_mb: 28.4 +total_seconds: 12.3 ``` A kernel is "kept" iff: -- `recall_at_10` is within 0.005 of the seeded scalar baseline (and ≥ 0.50 hard floor) -- `geomean_ns_per_query` is strictly better than the previous best-kept kernel +- Correctness phase passes (mathematical equivalence to scalar reference) +- `geomean_ns_per_query` strictly better than the previous best-kept kernel +- `worst_ns_per_query` ≤ 1.05 × the previous best-kept kernel's worst - `total_seconds` ≤ 600 See `program.md` for the full loop spec. ## Upstream contribution path -When a commit clears the keep bar by a meaningful margin (≥10% speedup with -recall in-band), the human reviews the diff, ports the technique against -[`lance-format/lance`](https://github.com/lance-format/lance) HEAD, runs Lance's -own test suite, and opens a PR. Because `src/kernels.rs` is dual MIT/Apache-2.0 -licensed and algorithmically modeled on Lance's existing path, the upstream PR -inherits Apache-2.0 cleanly. +When a commit clears the keep bar by a meaningful margin (≥10% geomean +speedup with worst-case guard intact), the human reviews the diff, ports the +technique against [`lance-format/lance`](https://github.com/lance-format/lance) +HEAD, runs Lance's own test suite, and opens a PR. Because `src/kernels.rs` is +dual MIT/Apache-2.0 licensed and algorithmically modeled on Lance's existing +path, the upstream PR inherits Apache-2.0 cleanly. ## License diff --git a/research/lance-autoresearch/benches/pq_l2.rs b/research/lance-autoresearch/benches/pq_l2.rs index d4068b9..716b133 100644 --- a/research/lance-autoresearch/benches/pq_l2.rs +++ b/research/lance-autoresearch/benches/pq_l2.rs @@ -1,6 +1,5 @@ -//! Criterion benchmark — runs the same kernels the agent edits, but with -//! statistical sampling. Use this for stable speed comparisons; the -//! `run_experiment` binary is the agent's per-trial harness. +//! Criterion benchmark — same kernels the agent edits, with statistical sampling. +//! Use this for stable speed comparisons; `run_experiment` is the per-trial harness. //! //! `cargo bench --bench pq_l2` @@ -8,48 +7,44 @@ use std::hint::black_box; use criterion::{Criterion, criterion_group, criterion_main}; -use lance_autoresearch::fixture::Fixture; -use lance_autoresearch::kernels::{TopKHeap, compute_distance_table_l2, probe_pq_l2_top_k}; -use lance_autoresearch::{DIM, NUM_SUB_VECTORS}; +use lance_autoresearch::inputs::{SHAPES, SPEED_TOP_K, speed_workloads}; +use lance_autoresearch::kernels::PqKernel; fn bench_pq_l2(c: &mut Criterion) { - let fix = Fixture::load_or_synthesize().expect("fixture"); + let workloads = speed_workloads(0xBE3C_C0DE_F1AC_BABE); - let q = &fix.query_vectors[..DIM]; - let table0 = compute_distance_table_l2(q, &fix.codebook); + for wl in &workloads { + let kernel = PqKernel::new(wl.shape, &wl.codebook); + let q = &wl.queries[..wl.shape.dim]; + let table0 = kernel.distance_table(q); - c.bench_function("compute_distance_table_l2", |b| { - b.iter(|| { - let t = compute_distance_table_l2(black_box(q), black_box(&fix.codebook)); - black_box(t); + let label_shape = format!( + "{}x{}x{}", + wl.shape.dim, wl.shape.num_sub_vectors, wl.shape.num_centroids + ); + let label_dist = format!("{:?}", wl.distribution).to_lowercase(); + let id = format!("{label_shape}/{label_dist}"); + + c.bench_function(&format!("distance_table/{id}"), |b| { + b.iter(|| { + let t = kernel.distance_table(black_box(q)); + black_box(t); + }); }); - }); - - c.bench_function("probe_pq_l2_top_k", |b| { - b.iter(|| { - let mut heap = TopKHeap::new(); - probe_pq_l2_top_k( - black_box(&table0), - black_box(&fix.codes), - black_box(fix.num_base), - &mut heap, - ); - black_box(heap); + c.bench_function(&format!("probe_top_k/{id}"), |b| { + b.iter(|| { + let r = kernel.probe_top_k( + black_box(&table0), + black_box(&wl.codes), + black_box(wl.num_vectors), + black_box(SPEED_TOP_K), + ); + black_box(r); + }); }); - }); + } - c.bench_function("end_to_end_one_query", |b| { - b.iter(|| { - let t = compute_distance_table_l2(black_box(q), black_box(&fix.codebook)); - let mut heap = TopKHeap::new(); - probe_pq_l2_top_k(&t, black_box(&fix.codes), black_box(fix.num_base), &mut heap); - black_box(heap); - }); - }); - - // Reference: silence unused warning for NUM_SUB_VECTORS in case the bench is - // ever stubbed out — keeps the constant import meaningful. - let _ = NUM_SUB_VECTORS; + let _ = SHAPES; } criterion_group!(benches, bench_pq_l2); diff --git a/research/lance-autoresearch/program.md b/research/lance-autoresearch/program.md index d7920c4..73f73e3 100644 --- a/research/lance-autoresearch/program.md +++ b/research/lance-autoresearch/program.md @@ -1,9 +1,23 @@ # Lance PQ L2 kernel research — agent instructions -You are an autonomous research assistant. Your job is to improve the kernel(s) in -`src/kernels.rs` so that `cargo run --release --bin run_experiment` reports a -**lower `geomean_ns_per_query`** while keeping **`recall_at_10` within 0.005 of -the seeded baseline** (and never below the hard floor 0.50). +You are an autonomous research assistant. Your job is to improve `src/kernels.rs` +so that `cargo run --release --bin run_experiment` reports a **lower +`geomean_ns_per_query`** while: + +1. The **correctness phase passes** — your kernel's distance values must match the + scalar reference within `MAX_ABS_ERR = 1e-4`, and the top-K must be + tie-tolerant equivalent on every input the bench generates. +2. The `worst_ns_per_query` does **not regress more than 5%** against the + last-kept kernel — if you win on one (shape × distribution) and lose + significantly on another, the change isn't a generalizable improvement. + +This bench is intentionally **dataset-independent**: there is no fixed dataset. +The correctness oracle is mathematical equivalence to the scalar reference, +checked across multiple PQ shapes and synthetic input distributions +(Gaussian / uniform / sparse / large-dynamic-range / mostly-zero). The speed +oracle is the geomean across multiple shapes × distributions, with worst-case +guarded. A win that depends on a specific data distribution or PQ shape will +fail to clear the bar by construction. Read this file end-to-end before doing anything else. Then run setup, then the loop. @@ -14,93 +28,97 @@ Read this file end-to-end before doing anything else. Then run setup, then the l - `program.md` (this file) - `src/lib.rs` - `src/kernels.rs` *(the only file you may edit)* + - `src/reference.rs` + - `src/inputs.rs` - `src/bin/run_experiment.rs` - - `src/fixture.rs` -2. Confirm fixtures are present. SIFT1M lives under `~/.cache/lance-autoresearch/`. - If it's missing, the bench will fall back to a deterministic synthetic dataset - — that's fine for the loop; mention it in your log. If you want SIFT1M, run - `bash scripts/prepare_fixtures.sh` (one-time, ~5–10 min, ~250 MB download). -3. Ensure `results.tsv` exists. If not, create it with this header line: +2. Ensure `results.tsv` exists. If not, create it with this header line: ``` - commit timestamp source num_base recall_at_10 geomean_ns_per_query peak_mem_mb total_seconds keep description + commit timestamp correctness geomean_ns worst_ns worst_combo best_ns best_combo peak_mem_mb total_seconds keep description ``` -4. Run the baseline trial: `cargo run --release --bin run_experiment > run.log 2>&1`. - Parse `run.log` and append a row to `results.tsv` with `keep=baseline`, - `description="seeded scalar PQ-L2 baseline"`. This is your reference number. -5. Commit the baseline row with a one-line message like `baseline: `. +3. Run the baseline trial: `cargo run --release --bin run_experiment > run.log 2>&1`. + Confirm `correctness: pass`. Parse `run.log` and append a row to `results.tsv` + with `keep=baseline` and `description="seeded scalar PQ-L2 baseline"`. This + is your reference number. +4. Commit the baseline row with a one-line message like `baseline: `. ## What you CAN do - Modify **`src/kernels.rs`** freely. You may: - - Reorder loops, change iteration order over codes or sub-vectors. - - Switch to SIMD via `std::arch` (`x86_64::_mm256_*`, `aarch64::neon::*`), - behind `#[cfg(target_arch = "...")]` gates. Always keep a portable scalar - fallback so the kernel compiles everywhere. - - Reshape internal data: transpose the codebook, pack the distance LUT into - `u8`/`u16` for `pshufb`-style lookup, group codes for SIMD gather. + - Pre-process the codebook in `PqKernel::new` (transpose layouts, cache + `c·c` for the FMA trick, pack the codebook for register-resident lookup, + etc.). This cost is paid once per dataset and amortized across queries — + the bench measures per-query, not per-(build + query). + - Reorder loops, switch internal data layouts, drop down to `std::arch` + intrinsics under `#[cfg(target_arch = ...)]` gates. **Always keep a + portable scalar fallback** so the kernel compiles everywhere. - Use `unsafe` if needed; document the invariants you're relying on. - - Mark hot functions `#[inline]` or split them; add private helpers freely. -- Add `#[cfg(test)] mod tests { ... }` inside `src/kernels.rs` if you want - property checks against the scalar path. + - Mark hot functions `#[inline]`; add private helpers freely. + - Add `#[cfg(test)] mod tests { ... }` inside `src/kernels.rs` if you want + in-file property checks. ## What you CANNOT do -- Do **not** modify `src/lib.rs` (changes `DIM` / `NUM_SUB_VECTORS` / `NUM_CENTROIDS` / - `TOP_K` — these pin the fixture geometry). -- Do **not** modify `src/bin/run_experiment.rs`, `src/reference.rs`, `src/fixture.rs`, - `benches/pq_l2.rs`, `scripts/prepare_fixtures.sh`, or `Cargo.toml`. -- Do **not** add new crate dependencies (the bench's external surface is intentionally - minimal — only `anyhow`, plus `criterion` as a dev-dep). -- Do **not** delete or alter the public API of `kernels.rs`: - - `pub type DistanceTable` - - `pub fn compute_distance_table_l2(query: &[f32], codebook: &[f32]) -> DistanceTable` - - `pub fn probe_pq_l2_top_k(table: &DistanceTable, codes: &[u8], num_vectors: usize, out: &mut TopKHeap)` - - `pub struct TopKHeap` with `new() / push / into_sorted` +- Do **not** modify `src/lib.rs` (`PqShape` and the tolerance constants are + shared with the immutable scaffolding). +- Do **not** modify `src/bin/run_experiment.rs`, `src/reference.rs`, + `src/inputs.rs`, `benches/pq_l2.rs`, or `Cargo.toml`. +- Do **not** add new crate dependencies. +- Do **not** alter the public API of `kernels::PqKernel`: + - `PqKernel::new(shape: PqShape, codebook: &[f32]) -> Self` + - `PqKernel::shape(&self) -> &PqShape` + - `PqKernel::distance_table(&self, query: &[f32]) -> Vec` + - `PqKernel::probe_top_k(&self, table: &[f32], codes: &[u8], num_vectors: usize, k: usize) -> Vec<(u32, f32)>` +- Do **not** introduce lossy techniques (LUT u8/u16 quantization, asymmetric- + distance approximation, etc.) — the correctness phase asserts exact-up-to-ε + match against the scalar reference. If you want to explore a lossy track, + surface that in a separate kernel and propose a track extension. ## The metric -Minimize `geomean_ns_per_query` (geometric mean of per-query wall-clock from the -benched queries, rounded to a u64 ns) subject to: +Minimize `geomean_ns_per_query` (geometric mean of per-query wall-clock across +all timed queries, all shapes, all distributions) subject to: -1. `recall_at_10 >= baseline_recall_at_10 - 0.005` -2. `recall_at_10 >= 0.50` (hard floor; below this the bench exits non-zero) -3. `total_seconds <= 600` -4. Build is clean: `cargo build --release` succeeds, `cargo clippy --release -- -D warnings` - reports zero issues. (Run `cargo clippy --release` before each commit.) +1. Correctness phase: **pass** (exit-2 otherwise). +2. `worst_ns_per_query` ≤ 1.05 × the last-kept kernel's worst. +3. `total_seconds` ≤ 600. +4. Build is clean: `cargo build --release` succeeds, `cargo clippy --release + --all-targets -- -D warnings` reports zero issues. Ties break toward simpler code. If two kernels report the same speed within -noise (~3%), prefer the one with fewer lines or less `unsafe`. +~3% noise, prefer fewer lines / less `unsafe`. -## Lance-PQ-specific priors +## Lance-PQ-specific priors (lossless directions) -These are the directions known to pay off on this kernel shape. Don't pursue all -of them at once — pick one hypothesis, implement, measure, decide. +These directions are known to pay off without compromising arithmetic accuracy. +Pick one hypothesis at a time; implement; measure; decide. -- **Codebook layout for the table-build step.** The reference layout is - `[m][k][d]`. For a fixed query, iterating over centroids stays in cache, but - the inner loop over `d` is short (8 floats). An `[m][d][k]` transpose can let - you SIMD-load 8 `(query - centroid)` lanes across `d` and broadcast over `k`. -- **LUT packing for the probe step.** The probe is dominated by `acc += - table[m][codes[off+m]]` × 16. Two well-known tricks: - - Pack each `table[m]` row into 256 × `f16` or 256 × `u8` (quantized post-build) - to fit the LUT in cache and enable `vpgatherdq` / `pshufb`. - - Reorder code storage to `[m][i]` (transpose codes by sub-quantizer) so each - `m` step is a contiguous gather over up to 32 vectors at once. -- **Top-K integration.** `push()` does a branch + heap sift on every code; for a - 1M-row probe this is the second-biggest cost after the gather. Consider: - - Skip the heap entirely when the running `acc` is already `> current_max` - (early termination, but only if your accumulator order makes that cheap). - - Block the probe (e.g., 1024 codes at a time), find the local top-K with a - branchless scan, then merge into the global heap. -- **Prefetch.** A `_mm_prefetch(codes.as_ptr().add(off + 64), _MM_HINT_T0)` ahead - of the gather is usually pure win at 1M scale where codes don't all fit in L2. -- **FMA in the table build.** The diff–square–sum sequence is - `(q - c)·(q - c)` per element — that's `(q*q) - 2qc + c*c`. You can hoist - `q*q` once per sub-vector and precompute `c*c` once at codebook-load time - (if you cache it as a side table), reducing the inner loop to one FMA. - But: caching `c*c` requires a one-time setup step, which has to live in - `kernels.rs` since you cannot touch the fixture; either lazy-init via - `OnceLock>` or rebuild every call (probably not worth it). +- **Codebook layout.** The reference layout is `[m][k][d]`. For a fixed query, + iterating over centroids stays in cache, but the inner loop over `d` is + short. Transposing to `[m][d][k]` lets you SIMD-load 8 `(query - centroid)` + lanes across `d` and broadcast over `k`. Do the transpose in `PqKernel::new` + once. +- **Cache `c·c`.** The diff–square–sum is `(q - c)·(q - c) = q·q - 2qc + c·c`. + Hoist `q·q` per sub-vector, precompute `c·c` once at codebook-load time. + Inner loop becomes one FMA (`-2qc + cc`). Watch the sign / accumulator + ordering so the rounding stays within tolerance. +- **Probe layout.** The probe is dominated by `acc += table[m][codes[off+m]]` + × `num_sub_vectors`. Transposing codes to `[m][i]` (one row per sub-quantizer, + contiguous over base index) lets you process up to 32+ vectors per inner + iteration with `vpgatherdq`-style loads. +- **Top-K integration.** `push()` does a branch + heap sift on every code. + At 50k probes per query × 9 (shape × dist) combos that's the second-biggest + cost after the gather. Block the probe (e.g., 512 codes at a time), find the + local top-K with a branchless pass, then merge into the global heap. +- **Prefetch.** A `_mm_prefetch(codes.as_ptr().add(off + 64), _MM_HINT_T0)` + ahead of the gather is usually pure win at 50k+ scale where codes don't all + fit in L2. +- **FMA chains for table build.** The diff–square–sum maps cleanly to FMA on + AVX2/NEON. Even without intrinsics, structuring the inner loop so `rustc` + emits FMA helps. +- **Avoid the `Vec` allocation in the hot path.** `distance_table` allocates a + fresh `Vec` per call. Returning a fixed-capacity buffer is a public-API + change you can't make — but you can reuse a thread-local scratch buffer + internally if it speeds the build. ## The loop @@ -108,9 +126,10 @@ Once setup is done, repeat indefinitely: 1. **Observe state.** Read the last ~5 rows of `results.tsv`. Note which ideas have been tried, what won, what regressed. Form a hypothesis with one - sentence stating the change and the predicted effect on speed and recall. + sentence stating the change and the predicted effect on speed and + correctness. 2. **Edit `src/kernels.rs`.** Keep the diff focused on the one hypothesis. -3. **Build and lint.** Run: +3. **Build and lint.** ``` cargo build --release cargo clippy --release --all-targets -- -D warnings @@ -120,26 +139,28 @@ Once setup is done, repeat indefinitely: ``` cargo run --release --bin run_experiment > run.log 2>&1 ``` -5. **Parse the result.** Extract `recall_at_10`, `geomean_ns_per_query`, - `peak_mem_mb`, `total_seconds` from `run.log`. Compute the deltas vs. baseline. +5. **Parse the result.** Extract `correctness`, `geomean_ns_per_query`, + `worst_ns_per_query` (with combo), `peak_mem_mb`, `total_seconds`. Compute + deltas vs. baseline. 6. **Decide keep or revert.** - - **Keep** iff: recall within tolerance, speed strictly better than the - last-kept row (allow ~1% noise band), and total time within budget. - - **Revert** otherwise: `git restore src/kernels.rs` (or commit and `git - revert` if you want the revert in history). Note what failed. + - **Keep** iff: `correctness: pass`, geomean strictly better than the + last-kept row (allow ~1% noise band), and `worst_ns_per_query` ≤ 1.05 × + last-kept's worst. + - **Revert** otherwise: `git restore src/kernels.rs` (or commit and + `git revert` if you want the revert in history). Note what failed. 7. **Log.** Append one row to `results.tsv`: ``` - + ``` -8. **Commit.** Use a one-line message describing the change and the headline - number, e.g. `transpose codebook; 184k → 142k ns/query (recall 0.94)`. +8. **Commit.** One-line message describing the change and the headline number, + e.g. `transpose codebook in new(); 18.2k → 14.1k geomean ns (worst -8%)`. ## Hygiene -- Always commit `src/kernels.rs` changes; never commit `results.tsv` or `run.log` - (they're gitignored). -- If a change fails to build, do not commit. Iterate until it builds, or revert - cleanly. +- Always commit `src/kernels.rs` changes; never commit `results.tsv` or + `run.log` (they're gitignored). +- If a change fails to build, do not commit. Iterate until it builds, or + revert cleanly. - If two consecutive ideas regress, take a beat: re-read the last ~10 rows of `results.tsv` and update your mental model before proposing the next. - Per-trial cap: 10 minutes. If `cargo run` is still going after 10 min, kill it diff --git a/research/lance-autoresearch/scripts/prepare_fixtures.sh b/research/lance-autoresearch/scripts/prepare_fixtures.sh deleted file mode 100755 index a75a34d..0000000 --- a/research/lance-autoresearch/scripts/prepare_fixtures.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/env bash -# IMMUTABLE. One-time SIFT1M fixture preparation. -# -# Downloads SIFT1M from the Texmex corpus (Inria), extracts the f32 vector -# files + ground-truth, then runs the in-tree fixture builder to train a -# product-quantization codebook and encode the base set. All artifacts are -# written under ~/.cache/lance-autoresearch/ so they survive between trials -# but stay out of git. -# -# Total time: ~5–10 min on a fresh laptop. ~250 MB download. - -set -euo pipefail - -CACHE_DIR="${HOME}/.cache/lance-autoresearch" -SIFT_URL="ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz" -SIFT_URL_MIRROR="https://huggingface.co/datasets/qbo-odp/sift1m/resolve/main/sift.tar.gz" - -mkdir -p "${CACHE_DIR}" -cd "${CACHE_DIR}" - -if [[ ! -f sift_base.fvecs || ! -f sift_query.fvecs || ! -f sift_groundtruth.ivecs ]]; then - echo "[prepare_fixtures] downloading SIFT1M..." - if [[ ! -f sift.tar.gz ]]; then - curl --fail -L -o sift.tar.gz "${SIFT_URL}" || \ - curl --fail -L -o sift.tar.gz "${SIFT_URL_MIRROR}" - fi - echo "[prepare_fixtures] extracting..." - tar xzf sift.tar.gz - mv sift/sift_base.fvecs ./sift_base.fvecs - mv sift/sift_query.fvecs ./sift_query.fvecs - mv sift/sift_groundtruth.ivecs ./sift_groundtruth.ivecs - rm -rf sift sift.tar.gz -fi - -if [[ ! -f pq_codebook.bin || ! -f pq_codes.bin ]]; then - echo "[prepare_fixtures] training PQ codebook + encoding base..." - # The fixture builder is run as a `cargo test` with a marker env var so we - # don't have to add a second binary just for one-time setup. The test reads - # SIFT1M, calls the in-tree `train_codebook` + `encode`, and writes the - # frozen artifacts next to the dataset. - cd "$(dirname "$0")/.." - LANCE_AUTORESEARCH_BUILD_FIXTURES=1 cargo test --release --lib build_fixtures -- --ignored --nocapture -fi - -echo "[prepare_fixtures] done — fixtures in ${CACHE_DIR}" -ls -la "${CACHE_DIR}" diff --git a/research/lance-autoresearch/src/bin/run_experiment.rs b/research/lance-autoresearch/src/bin/run_experiment.rs index 6629cd8..0f0de21 100644 --- a/research/lance-autoresearch/src/bin/run_experiment.rs +++ b/research/lance-autoresearch/src/bin/run_experiment.rs @@ -2,95 +2,113 @@ //! //! Run with: `cargo run --release --bin run_experiment > run.log 2>&1` //! -//! Loads (or synthesizes) the fixture, calls the kernels in `src/kernels.rs`, -//! and prints a fixed-format result block the agent can grep: +//! Two phases: +//! +//! PHASE 1 — CORRECTNESS. For every (shape × input distribution) in the +//! correctness battery, build the agent kernel and the scalar reference, +//! compare distance tables (max abs err must be ≤ MAX_ABS_ERR) and top-K +//! results (must be tie-tolerant equivalent within TOPK_DIST_TOL). Any +//! single failure → exit 2 ("correctness"). +//! +//! PHASE 2 — SPEED. For every (shape × data distribution) speed workload, +//! build the agent kernel once, then time `distance_table + probe_top_k` +//! for each query. Report per-(shape × distribution) geomean ns, plus +//! global geomean / worst / best across all timed queries. +//! +//! Output (fixed format the agent can grep): //! //! --- -//! source: sift1m | synthetic -//! num_base: 1000000 -//! num_queries: 1000 -//! recall_at_10: 0.9421 -//! geomean_ns_per_query: 184273 -//! peak_mem_mb: 42.1 -//! total_seconds: 21.7 +//! correctness: pass | fail +//! shapes_tested: (128,16,256) (256,16,256) (768,96,256) +//! distributions_tested: clustered uniform sparse +//! geomean_ns_per_query: 18234 +//! worst_ns_per_query: 24515 (768x96, sparse) +//! best_ns_per_query: 12876 (128x16, clustered) +//! peak_mem_mb: 28.4 +//! total_seconds: 12.3 //! //! Exit codes: -//! 0 — ran to completion, recall above floor, within time budget. -//! 2 — recall below floor (kernel is broken). +//! 0 — both phases passed within time budget. +//! 2 — correctness failure (agent kernel disagrees with reference). //! 3 — total wall-clock exceeded budget. //! 1 — any other error. -use std::collections::HashSet; use std::time::Instant; -use anyhow::Result; +use lance_autoresearch::inputs::{ + DISTRIBUTIONS, DataDistribution, SHAPES, SpeedWorkload, correctness_battery, speed_workloads, +}; +use lance_autoresearch::kernels::PqKernel; +use lance_autoresearch::reference::{ScalarReference, max_abs_err, topk_consistent}; +use lance_autoresearch::{MAX_ABS_ERR, PqShape, TOPK_DIST_TOL}; -use lance_autoresearch::fixture::Fixture; -use lance_autoresearch::kernels::{TopKHeap, compute_distance_table_l2, probe_pq_l2_top_k}; -use lance_autoresearch::{DIM, TOP_K}; - -const MAX_QUERIES_BENCHED: usize = 1000; +// Any constants; the only requirement is that they're pinned across trials so +// the inputs and the timings are reproducible. +const CORRECTNESS_SEED: u64 = 0xC0FF_EEC0_DEBE_EFFE; +const SPEED_SEED: u64 = 0x5EED_F1AC_BABE_FACE; const TIME_BUDGET_SECS: u64 = 600; -const RECALL_FLOOR: f32 = 0.50; fn main() { - match real_main() { - Ok(()) => {} - Err(e) => { - eprintln!("error: {e:#}"); - std::process::exit(1); - } - } -} - -fn real_main() -> Result<()> { let start = Instant::now(); - let fix = Fixture::load_or_synthesize()?; - let n_q = MAX_QUERIES_BENCHED.min(fix.num_query); - let mut hits = 0usize; - let mut total_relevant = 0usize; - let mut per_query_ns: Vec = Vec::with_capacity(n_q); - - for qi in 0..n_q { - let q = &fix.query_vectors[qi * DIM..(qi + 1) * DIM]; - - let t0 = Instant::now(); - let table = compute_distance_table_l2(q, &fix.codebook); - let mut heap = TopKHeap::new(); - probe_pq_l2_top_k(&table, &fix.codes, fix.num_base, &mut heap); - per_query_ns.push(t0.elapsed().as_nanos() as u64); - - let candidates: Vec = heap.into_sorted().into_iter().map(|(id, _)| id).collect(); - let truth_slice = - &fix.groundtruth[qi * fix.top_k_truth..qi * fix.top_k_truth + TOP_K.min(fix.top_k_truth)]; - let truth_set: HashSet = truth_slice.iter().copied().collect(); - for c in &candidates { - if truth_set.contains(c) { - hits += 1; - } - } - total_relevant += TOP_K; + if let Err(e) = run_correctness() { + eprintln!("---"); + eprintln!("correctness: fail"); + eprintln!("first_failure: {e}"); + eprintln!("total_seconds: {:.2}", start.elapsed().as_secs_f64()); + std::process::exit(2); } + println!("correctness: pass"); + + let workloads = speed_workloads(SPEED_SEED); + let report = run_speed(&workloads); - let recall = hits as f32 / total_relevant as f32; - let geomean_ns = geomean(&per_query_ns); let elapsed = start.elapsed(); let mem_mb = peak_rss_mb(); println!("---"); - println!("source: {}", fix.source_str()); - println!("num_base: {}", fix.num_base); - println!("num_queries: {n_q}"); - println!("recall_at_10: {recall:.4}"); - println!("geomean_ns_per_query: {geomean_ns}"); - println!("peak_mem_mb: {mem_mb:.1}"); - println!("total_seconds: {:.2}", elapsed.as_secs_f64()); - - if recall < RECALL_FLOOR { - eprintln!("FAIL: recall@10 {recall:.4} below floor {RECALL_FLOOR:.4}"); - std::process::exit(2); + println!("correctness: pass"); + println!( + "shapes_tested: {}", + SHAPES + .iter() + .map(format_shape) + .collect::>() + .join(" ") + ); + println!( + "distributions_tested: {}", + DISTRIBUTIONS + .iter() + .map(format_dist) + .collect::>() + .join(" ") + ); + println!("geomean_ns_per_query: {}", report.geomean_ns); + println!( + "worst_ns_per_query: {} ({}, {})", + report.worst_ns, + format_shape(&report.worst_shape), + format_dist(&report.worst_dist) + ); + println!( + "best_ns_per_query: {} ({}, {})", + report.best_ns, + format_shape(&report.best_shape), + format_dist(&report.best_dist) + ); + println!("per_combo_geomean_ns:"); + for combo in &report.per_combo { + println!( + " {} {:<10} -> {} ns", + format_shape(&combo.shape), + format_dist(&combo.dist), + combo.geomean_ns + ); } + println!("peak_mem_mb: {mem_mb:.1}"); + println!("total_seconds: {:.2}", elapsed.as_secs_f64()); + if elapsed.as_secs() > TIME_BUDGET_SECS { eprintln!( "FAIL: total wall-clock {}s exceeds budget {}s", @@ -99,10 +117,99 @@ fn real_main() -> Result<()> { ); std::process::exit(3); } +} +fn run_correctness() -> Result<(), String> { + let cases = correctness_battery(CORRECTNESS_SEED); + for case in &cases { + let agent = PqKernel::new(case.shape, &case.codebook); + let reference = ScalarReference::new(case.shape, &case.codebook); + + let agent_table = agent.distance_table(&case.query); + let ref_table = reference.distance_table(&case.query); + let table_err = max_abs_err(&agent_table, &ref_table); + if table_err > MAX_ABS_ERR { + return Err(format!( + "case={}/{} distance_table max_abs_err={table_err} > {MAX_ABS_ERR}", + format_shape(&case.shape), + case.label + )); + } + + let agent_topk = agent.probe_top_k(&agent_table, &case.codes, case.num_vectors, case.k); + let ref_topk = reference.probe_top_k(&ref_table, &case.codes, case.num_vectors, case.k); + if let Err(diag) = topk_consistent(&agent_topk, &ref_topk, TOPK_DIST_TOL) { + return Err(format!( + "case={}/{} top_k disagreement: {diag}", + format_shape(&case.shape), + case.label + )); + } + } Ok(()) } +struct ComboReport { + shape: PqShape, + dist: DataDistribution, + geomean_ns: u64, +} + +struct SpeedReport { + geomean_ns: u64, + worst_ns: u64, + worst_shape: PqShape, + worst_dist: DataDistribution, + best_ns: u64, + best_shape: PqShape, + best_dist: DataDistribution, + per_combo: Vec, +} + +fn run_speed(workloads: &[SpeedWorkload]) -> SpeedReport { + let mut all_timings: Vec = Vec::new(); + let mut per_combo: Vec = Vec::new(); + + let mut worst = (0u64, SHAPES[0], DISTRIBUTIONS[0]); + let mut best = (u64::MAX, SHAPES[0], DISTRIBUTIONS[0]); + + for wl in workloads { + let kernel = PqKernel::new(wl.shape, &wl.codebook); + let mut combo_timings: Vec = Vec::with_capacity(wl.num_queries); + for qi in 0..wl.num_queries { + let q = &wl.queries[qi * wl.shape.dim..(qi + 1) * wl.shape.dim]; + let t0 = Instant::now(); + let table = kernel.distance_table(q); + let _hits = kernel.probe_top_k(&table, &wl.codes, wl.num_vectors, wl.k); + combo_timings.push(t0.elapsed().as_nanos() as u64); + } + let combo_geo = geomean(&combo_timings); + per_combo.push(ComboReport { + shape: wl.shape, + dist: wl.distribution, + geomean_ns: combo_geo, + }); + if combo_geo > worst.0 { + worst = (combo_geo, wl.shape, wl.distribution); + } + if combo_geo < best.0 { + best = (combo_geo, wl.shape, wl.distribution); + } + all_timings.extend(combo_timings); + } + + SpeedReport { + geomean_ns: geomean(&all_timings), + worst_ns: worst.0, + worst_shape: worst.1, + worst_dist: worst.2, + best_ns: best.0, + best_shape: best.1, + best_dist: best.2, + per_combo, + } +} + fn geomean(xs: &[u64]) -> u64 { if xs.is_empty() { return 0; @@ -114,6 +221,19 @@ fn geomean(xs: &[u64]) -> u64 { (sum_ln / xs.len() as f64).exp() as u64 } +fn format_shape(s: &PqShape) -> String { + format!("({},{},{})", s.dim, s.num_sub_vectors, s.num_centroids) +} + +fn format_dist(d: &DataDistribution) -> String { + match d { + DataDistribution::Clustered => "clustered", + DataDistribution::Uniform => "uniform", + DataDistribution::Sparse => "sparse", + } + .to_string() +} + #[cfg(target_os = "linux")] fn peak_rss_mb() -> f64 { let Ok(s) = std::fs::read_to_string("/proc/self/status") else { diff --git a/research/lance-autoresearch/src/fixture.rs b/research/lance-autoresearch/src/fixture.rs deleted file mode 100644 index 65e15fb..0000000 --- a/research/lance-autoresearch/src/fixture.rs +++ /dev/null @@ -1,449 +0,0 @@ -//! IMMUTABLE. Fixture loader. -//! -//! The bench runs against one of: -//! - SIFT1M (preferred; 128-d, 1M base, 10k queries, published ground truth) -//! loaded from `~/.cache/lance-autoresearch/{sift_base,sift_query,sift_groundtruth}.fvecs|.ivecs` -//! plus pre-trained frozen artifacts `pq_codebook.bin` and `pq_codes.bin`. -//! - A synthetic fallback (1024 base / 64 queries, deterministic seed) so the -//! harness is smoke-testable without any external download. -//! -//! Run `scripts/prepare_fixtures.sh` once to populate the SIFT1M fixtures. - -use std::fs; -use std::io::{BufReader, Read}; -use std::path::{Path, PathBuf}; - -use anyhow::{Context, Result, anyhow}; - -use crate::reference::brute_force_top_k_l2; -use crate::{DIM, NUM_CENTROIDS, NUM_SUB_VECTORS, SUB_VECTOR_DIM}; - -pub const SYNTHETIC_NUM_BASE: usize = 1024; -pub const SYNTHETIC_NUM_QUERY: usize = 64; -pub const SYNTHETIC_TOP_K_TRUTH: usize = 32; -const KMEANS_ITERS: usize = 12; - -pub enum FixtureSource { - Sift1M, - Synthetic { seed: u64 }, -} - -pub struct Fixture { - pub base_vectors: Vec, - pub query_vectors: Vec, - pub codebook: Vec, - pub codes: Vec, - pub groundtruth: Vec, - pub num_base: usize, - pub num_query: usize, - pub top_k_truth: usize, - pub source: FixtureSource, -} - -impl Fixture { - /// Try SIFT1M first; fall back to a deterministic synthetic dataset. - pub fn load_or_synthesize() -> Result { - let dir = cache_dir(); - if dir.join("sift_base.fvecs").exists() - && dir.join("sift_query.fvecs").exists() - && dir.join("sift_groundtruth.ivecs").exists() - && dir.join("pq_codebook.bin").exists() - && dir.join("pq_codes.bin").exists() - { - Self::load_sift1m(&dir) - } else { - Self::synthesize(SYNTHETIC_NUM_BASE, SYNTHETIC_NUM_QUERY, 0xC0FFEE_C0FFEE) - } - } - - pub fn source_str(&self) -> &'static str { - match self.source { - FixtureSource::Sift1M => "sift1m", - FixtureSource::Synthetic { .. } => "synthetic", - } - } - - fn load_sift1m(dir: &Path) -> Result { - let base_vectors = read_fvecs(&dir.join("sift_base.fvecs"))?; - let query_vectors = read_fvecs(&dir.join("sift_query.fvecs"))?; - let (groundtruth, top_k_truth) = read_ivecs(&dir.join("sift_groundtruth.ivecs"))?; - let codebook = read_f32_bin(&dir.join("pq_codebook.bin"))?; - let codes = read_u8_bin(&dir.join("pq_codes.bin"))?; - - let num_base = base_vectors.len() / DIM; - let num_query = query_vectors.len() / DIM; - if codebook.len() != NUM_SUB_VECTORS * NUM_CENTROIDS * SUB_VECTOR_DIM { - return Err(anyhow!( - "codebook size mismatch: got {}, expected {}", - codebook.len(), - NUM_SUB_VECTORS * NUM_CENTROIDS * SUB_VECTOR_DIM - )); - } - if codes.len() != num_base * NUM_SUB_VECTORS { - return Err(anyhow!( - "codes size mismatch: got {}, expected {}", - codes.len(), - num_base * NUM_SUB_VECTORS - )); - } - - Ok(Self { - base_vectors, - query_vectors, - codebook, - codes, - groundtruth, - num_base, - num_query, - top_k_truth, - source: FixtureSource::Sift1M, - }) - } - - fn synthesize(num_base: usize, num_query: usize, seed: u64) -> Result { - let mut rng = SplitMix64::new(seed); - // Cluster the base set so PQ has structure to compress and queries have - // meaningful nearest neighbors. With i.i.d. Gaussian noise the asymptotic - // recall of PQ is near-chance; with cluster-shaped data PQ tracks the - // true top-K closely, which is what we want when smoke-testing kernels. - let base_vectors = gen_clustered(num_base, DIM, 32, 0.15, &mut rng); - // Queries are perturbed base points so they have a true near-neighbor. - let query_vectors = gen_query_near_base(&base_vectors, num_base, num_query, &mut rng); - - let codebook = train_codebook(&base_vectors, num_base, &mut rng); - let codes = encode(&base_vectors, num_base, &codebook); - - let mut groundtruth = Vec::with_capacity(num_query * SYNTHETIC_TOP_K_TRUTH); - for qi in 0..num_query { - let q = &query_vectors[qi * DIM..(qi + 1) * DIM]; - let top = brute_force_top_k_l2(q, &base_vectors, num_base, SYNTHETIC_TOP_K_TRUTH); - groundtruth.extend(top.iter().map(|(id, _)| *id)); - } - - Ok(Self { - base_vectors, - query_vectors, - codebook, - codes, - groundtruth, - num_base, - num_query, - top_k_truth: SYNTHETIC_TOP_K_TRUTH, - source: FixtureSource::Synthetic { seed }, - }) - } -} - -pub fn cache_dir() -> PathBuf { - let home = std::env::var_os("HOME") - .map(PathBuf::from) - .unwrap_or_else(|| PathBuf::from("/tmp")); - home.join(".cache").join("lance-autoresearch") -} - -fn read_fvecs(path: &Path) -> Result> { - let bytes = fs::read(path).with_context(|| format!("reading {}", path.display()))?; - let mut out = Vec::with_capacity(bytes.len() / 4); - let mut i = 0; - while i < bytes.len() { - if i + 4 > bytes.len() { - return Err(anyhow!("truncated fvecs header at offset {i}")); - } - let dim = u32::from_le_bytes([bytes[i], bytes[i + 1], bytes[i + 2], bytes[i + 3]]) as usize; - if dim != DIM { - return Err(anyhow!("fvecs dim {dim} != expected {DIM}")); - } - i += 4; - let row_bytes = dim * 4; - if i + row_bytes > bytes.len() { - return Err(anyhow!("truncated fvecs row at offset {i}")); - } - for d in 0..dim { - let off = i + d * 4; - out.push(f32::from_le_bytes([ - bytes[off], - bytes[off + 1], - bytes[off + 2], - bytes[off + 3], - ])); - } - i += row_bytes; - } - Ok(out) -} - -fn read_ivecs(path: &Path) -> Result<(Vec, usize)> { - let bytes = fs::read(path).with_context(|| format!("reading {}", path.display()))?; - let mut out = Vec::new(); - let mut top_k: Option = None; - let mut i = 0; - while i < bytes.len() { - if i + 4 > bytes.len() { - return Err(anyhow!("truncated ivecs header")); - } - let dim = u32::from_le_bytes([bytes[i], bytes[i + 1], bytes[i + 2], bytes[i + 3]]) as usize; - i += 4; - if let Some(k) = top_k { - if k != dim { - return Err(anyhow!("ivecs rows have varying widths {k} vs {dim}")); - } - } else { - top_k = Some(dim); - } - let row_bytes = dim * 4; - if i + row_bytes > bytes.len() { - return Err(anyhow!("truncated ivecs row")); - } - for d in 0..dim { - let off = i + d * 4; - out.push(u32::from_le_bytes([ - bytes[off], - bytes[off + 1], - bytes[off + 2], - bytes[off + 3], - ])); - } - i += row_bytes; - } - Ok((out, top_k.unwrap_or(0))) -} - -fn read_f32_bin(path: &Path) -> Result> { - let f = fs::File::open(path).with_context(|| format!("opening {}", path.display()))?; - let mut r = BufReader::new(f); - let mut bytes = Vec::new(); - r.read_to_end(&mut bytes)?; - if bytes.len() % 4 != 0 { - return Err(anyhow!("f32 binary file not a multiple of 4 bytes")); - } - Ok(bytes - .chunks_exact(4) - .map(|b| f32::from_le_bytes([b[0], b[1], b[2], b[3]])) - .collect()) -} - -fn read_u8_bin(path: &Path) -> Result> { - fs::read(path).with_context(|| format!("reading {}", path.display())) -} - -/// xorshift-ish deterministic PRNG (SplitMix64). Vendored small enough to avoid -/// a `rand` dep — the fixture must be reproducible bit-for-bit. -struct SplitMix64 { - state: u64, -} - -impl SplitMix64 { - fn new(seed: u64) -> Self { - Self { state: seed } - } - fn next_u64(&mut self) -> u64 { - self.state = self.state.wrapping_add(0x9E37_79B9_7F4A_7C15); - let mut z = self.state; - z = (z ^ (z >> 30)).wrapping_mul(0xBF58_476D_1CE4_E5B9); - z = (z ^ (z >> 27)).wrapping_mul(0x94D0_49BB_1331_11EB); - z ^ (z >> 31) - } - fn next_f32(&mut self) -> f32 { - let bits = (self.next_u64() >> 40) as u32; - bits as f32 / ((1u32 << 24) as f32) - } - /// Box-Muller standard normal. - fn next_normal(&mut self) -> f32 { - let mut u1 = self.next_f32(); - if u1 < 1e-7 { - u1 = 1e-7; - } - let u2 = self.next_f32(); - (-2.0 * u1.ln()).sqrt() * (std::f32::consts::TAU * u2).cos() - } -} - -fn gen_vectors(n: usize, d: usize, rng: &mut SplitMix64) -> Vec { - let mut out = Vec::with_capacity(n * d); - for _ in 0..n * d { - out.push(rng.next_normal()); - } - out -} - -/// Generate `n` vectors of dim `d` as a Gaussian mixture: `num_clusters` random -/// centers, then `n/num_clusters` points per center perturbed by N(0, noise). -fn gen_clustered(n: usize, d: usize, num_clusters: usize, noise: f32, rng: &mut SplitMix64) -> Vec { - let centers = gen_vectors(num_clusters, d, rng); - let mut out = Vec::with_capacity(n * d); - for i in 0..n { - let ci = i % num_clusters; - let center = ¢ers[ci * d..(ci + 1) * d]; - for &c in center { - out.push(c + noise * rng.next_normal()); - } - } - out -} - -/// Generate query vectors by picking `n_query` random base points and perturbing -/// them. Guarantees each query has true near neighbors in the base set. -fn gen_query_near_base( - base: &[f32], - num_base: usize, - n_query: usize, - rng: &mut SplitMix64, -) -> Vec { - let mut out = Vec::with_capacity(n_query * DIM); - for _ in 0..n_query { - let src = (rng.next_u64() as usize) % num_base; - let src_off = src * DIM; - for d in 0..DIM { - out.push(base[src_off + d] + 0.05 * rng.next_normal()); - } - } - out -} - -/// Train a product-quantization codebook by per-subspace k-means. -fn train_codebook(base: &[f32], num_base: usize, rng: &mut SplitMix64) -> Vec { - let mut codebook = vec![0.0f32; NUM_SUB_VECTORS * NUM_CENTROIDS * SUB_VECTOR_DIM]; - - let k = NUM_CENTROIDS.min(num_base); - if k == 0 { - return codebook; - } - - for m in 0..NUM_SUB_VECTORS { - for ki in 0..k { - let src = (rng.next_u64() as usize) % num_base; - let src_off = src * DIM + m * SUB_VECTOR_DIM; - let dst_off = m * NUM_CENTROIDS * SUB_VECTOR_DIM + ki * SUB_VECTOR_DIM; - codebook[dst_off..dst_off + SUB_VECTOR_DIM] - .copy_from_slice(&base[src_off..src_off + SUB_VECTOR_DIM]); - } - - let mut assignments = vec![0u8; num_base]; - for _iter in 0..KMEANS_ITERS { - for i in 0..num_base { - let sub = &base[i * DIM + m * SUB_VECTOR_DIM..i * DIM + (m + 1) * SUB_VECTOR_DIM]; - let mut best_k = 0u8; - let mut best_d = f32::INFINITY; - for ki in 0..k { - let c_off = m * NUM_CENTROIDS * SUB_VECTOR_DIM + ki * SUB_VECTOR_DIM; - let mut acc = 0.0f32; - for d in 0..SUB_VECTOR_DIM { - let diff = sub[d] - codebook[c_off + d]; - acc += diff * diff; - } - if acc < best_d { - best_d = acc; - best_k = ki as u8; - } - } - assignments[i] = best_k; - } - - let mut sums = vec![0.0f32; k * SUB_VECTOR_DIM]; - let mut counts = vec![0u32; k]; - for i in 0..num_base { - let ki = assignments[i] as usize; - let sub = &base[i * DIM + m * SUB_VECTOR_DIM..i * DIM + (m + 1) * SUB_VECTOR_DIM]; - for d in 0..SUB_VECTOR_DIM { - sums[ki * SUB_VECTOR_DIM + d] += sub[d]; - } - counts[ki] += 1; - } - for ki in 0..k { - let c_off = m * NUM_CENTROIDS * SUB_VECTOR_DIM + ki * SUB_VECTOR_DIM; - if counts[ki] == 0 { - let src = (rng.next_u64() as usize) % num_base; - let src_off = src * DIM + m * SUB_VECTOR_DIM; - codebook[c_off..c_off + SUB_VECTOR_DIM] - .copy_from_slice(&base[src_off..src_off + SUB_VECTOR_DIM]); - } else { - let inv = 1.0 / counts[ki] as f32; - for d in 0..SUB_VECTOR_DIM { - codebook[c_off + d] = sums[ki * SUB_VECTOR_DIM + d] * inv; - } - } - } - } - } - - codebook -} - -fn encode(base: &[f32], num_base: usize, codebook: &[f32]) -> Vec { - let mut out = vec![0u8; num_base * NUM_SUB_VECTORS]; - for i in 0..num_base { - for m in 0..NUM_SUB_VECTORS { - let sub = &base[i * DIM + m * SUB_VECTOR_DIM..i * DIM + (m + 1) * SUB_VECTOR_DIM]; - let mut best_k = 0u8; - let mut best_d = f32::INFINITY; - for ki in 0..NUM_CENTROIDS { - let c_off = m * NUM_CENTROIDS * SUB_VECTOR_DIM + ki * SUB_VECTOR_DIM; - let mut acc = 0.0f32; - for d in 0..SUB_VECTOR_DIM { - let diff = sub[d] - codebook[c_off + d]; - acc += diff * diff; - } - if acc < best_d { - best_d = acc; - best_k = ki as u8; - } - } - out[i * NUM_SUB_VECTORS + m] = best_k; - } - } - out -} - -#[cfg(test)] -mod tests { - //! Fixture-builder tests. The default smoke test exercises the synthetic path - //! end-to-end. `build_fixtures` is `#[ignore]` — it runs only when invoked - //! explicitly by `scripts/prepare_fixtures.sh` and writes the frozen SIFT1M - //! PQ artifacts to `~/.cache/lance-autoresearch/`. - use super::*; - use std::io::Write; - - #[test] - fn synthetic_fixture_is_self_consistent() { - let fix = Fixture::synthesize(256, 8, 0xDEADBEEF).unwrap(); - assert_eq!(fix.base_vectors.len(), 256 * DIM); - assert_eq!(fix.codebook.len(), NUM_SUB_VECTORS * NUM_CENTROIDS * SUB_VECTOR_DIM); - assert_eq!(fix.codes.len(), 256 * NUM_SUB_VECTORS); - assert_eq!(fix.groundtruth.len(), 8 * SYNTHETIC_TOP_K_TRUTH); - for &id in &fix.groundtruth { - assert!((id as usize) < 256); - } - } - - #[test] - #[ignore] - fn build_fixtures() { - if std::env::var("LANCE_AUTORESEARCH_BUILD_FIXTURES").is_err() { - eprintln!("skipping: set LANCE_AUTORESEARCH_BUILD_FIXTURES=1 to run"); - return; - } - let dir = cache_dir(); - let base = read_fvecs(&dir.join("sift_base.fvecs")).expect("read sift_base"); - let num_base = base.len() / DIM; - eprintln!("[build_fixtures] training PQ codebook on {num_base} vectors..."); - - let mut rng = SplitMix64::new(0x0005_1F74_F1AC); - let codebook = train_codebook(&base, num_base, &mut rng); - let codes = encode(&base, num_base, &codebook); - - let codebook_bytes: Vec = codebook - .iter() - .flat_map(|f| f.to_le_bytes()) - .collect(); - std::fs::File::create(dir.join("pq_codebook.bin")) - .unwrap() - .write_all(&codebook_bytes) - .unwrap(); - std::fs::File::create(dir.join("pq_codes.bin")) - .unwrap() - .write_all(&codes) - .unwrap(); - eprintln!("[build_fixtures] wrote {} centroids × {} bytes codebook, {} bytes codes", - NUM_SUB_VECTORS * NUM_CENTROIDS, SUB_VECTOR_DIM * 4, codes.len()); - } -} diff --git a/research/lance-autoresearch/src/inputs.rs b/research/lance-autoresearch/src/inputs.rs new file mode 100644 index 0000000..f153028 --- /dev/null +++ b/research/lance-autoresearch/src/inputs.rs @@ -0,0 +1,381 @@ +//! IMMUTABLE. Diverse test-data + workload generators. +//! +//! Two surfaces. `correctness_battery(seed)` yields several +//! `(shape, query, codebook, codes)` tuples spanning Gaussian, uniform, sparse, +//! large-dynamic-range, and mostly-zero distributions. Used by the correctness +//! phase to pin the agent's kernel against the scalar reference. Same seed +//! produces the same battery, so a regression surfaces in one trial. +//! +//! `speed_workloads(seed)` yields larger `(shape, queries, codebook, codes)` +//! workloads spanning multiple PQ shapes × data distributions. Used by the +//! speed phase. Fixed seed produces reproducible timings across trials. +//! +//! There is no fixed dataset (no SIFT1M, no codebook fixture). Everything is +//! generated deterministically per call. The PQ codebook for each workload is +//! "trained" with a fast pseudo-k-means (random-init + 2 Lloyd iterations) so +//! the codebook is shape-appropriate, not random. + +use crate::PqShape; + +/// PQ shapes the bench evaluates. The agent's kernel must produce correct +/// output and competitive speed on every one. +pub const SHAPES: &[PqShape] = &[ + PqShape::new(128, 16, 256), // SIFT-like; sub_vec_dim = 8 + PqShape::new(256, 16, 256), // larger dim, sub_vec_dim = 16 + PqShape::new(768, 96, 256), // BERT-base-like; sub_vec_dim = 8, large codebook +]; + +/// Number of base vectors used in each speed workload. Kept moderate so a full +/// trial finishes in seconds, not minutes — the benchmark is per-query speed, +/// not throughput, so absolute scale doesn't change the comparison. +pub const SPEED_NUM_BASE: usize = 20_000; + +/// Number of queries timed per (shape, distribution) speed workload. +pub const SPEED_NUM_QUERIES: usize = 32; + +/// Training-set size for codebook k-means. Decoupled from base size so a +/// (768, 96) shape doesn't pay 96 × 256 × 50k inner-loop ops just to build +/// fixtures. K-means converges fine on a few thousand training samples. +const CODEBOOK_TRAIN_SIZE: usize = 4096; + +pub const SPEED_TOP_K: usize = 10; + +#[derive(Clone, Copy, Debug)] +pub enum DataDistribution { + /// Gaussian-mixture clusters with low intra-cluster noise. SIFT-like. + Clustered, + /// Uniform on [-1, 1]. No structure for PQ to exploit. + Uniform, + /// 90% of values are exactly zero, the rest are Gaussian. Stresses + /// branch-on-data optimizations. + Sparse, +} + +pub const DISTRIBUTIONS: &[DataDistribution] = &[ + DataDistribution::Clustered, + DataDistribution::Uniform, + DataDistribution::Sparse, +]; + +pub struct CorrectnessCase { + pub label: &'static str, + pub shape: PqShape, + pub query: Vec, + pub codebook: Vec, + pub codes: Vec, + pub num_vectors: usize, + pub k: usize, +} + +pub struct SpeedWorkload { + pub shape: PqShape, + pub distribution: DataDistribution, + pub queries: Vec, // [num_queries × dim] flat + pub codebook: Vec, + pub codes: Vec, + pub num_queries: usize, + pub num_vectors: usize, + pub k: usize, +} + +pub fn correctness_battery(seed: u64) -> Vec { + let mut out = Vec::new(); + let kinds: &[(InputKind, &'static str)] = &[ + (InputKind::Gaussian, "gaussian"), + (InputKind::Uniform, "uniform"), + (InputKind::Sparse, "sparse"), + (InputKind::LargeDynamicRange, "large_dynamic_range"), + (InputKind::MostlyZero, "mostly_zero"), + ]; + for &shape in SHAPES { + for (kind, label) in kinds { + let mut rng = SplitMix64::new(seed ^ shape_hash(shape) ^ kind_hash(*kind)); + let num_vectors = 4096; + let codebook = build_codebook(shape, *kind, CODEBOOK_TRAIN_SIZE, &mut rng); + let base = gen_vectors(num_vectors, shape.dim, *kind, &mut rng); + let codes = encode(shape, num_vectors, &base, &codebook); + let query = gen_vectors(1, shape.dim, *kind, &mut rng); + out.push(CorrectnessCase { + label, + shape, + query, + codebook, + codes, + num_vectors, + k: 10, + }); + } + } + out +} + +pub fn speed_workloads(seed: u64) -> Vec { + let mut out = Vec::new(); + for &shape in SHAPES { + for &dist in DISTRIBUTIONS { + let mut rng = SplitMix64::new(seed ^ shape_hash(shape) ^ dist_hash(dist)); + let kind = match dist { + DataDistribution::Clustered => InputKind::Clustered, + DataDistribution::Uniform => InputKind::Uniform, + DataDistribution::Sparse => InputKind::Sparse, + }; + let codebook = build_codebook(shape, kind, CODEBOOK_TRAIN_SIZE, &mut rng); + let base = gen_vectors(SPEED_NUM_BASE, shape.dim, kind, &mut rng); + let codes = encode(shape, SPEED_NUM_BASE, &base, &codebook); + let queries = gen_vectors(SPEED_NUM_QUERIES, shape.dim, kind, &mut rng); + out.push(SpeedWorkload { + shape, + distribution: dist, + queries, + codebook, + codes, + num_queries: SPEED_NUM_QUERIES, + num_vectors: SPEED_NUM_BASE, + k: SPEED_TOP_K, + }); + } + } + out +} + +#[derive(Clone, Copy, Debug)] +enum InputKind { + Gaussian, + Uniform, + Sparse, + LargeDynamicRange, + MostlyZero, + Clustered, +} + +fn gen_vectors(n: usize, dim: usize, kind: InputKind, rng: &mut SplitMix64) -> Vec { + let total = n * dim; + let mut out = Vec::with_capacity(total); + match kind { + InputKind::Gaussian => { + for _ in 0..total { + out.push(rng.next_normal()); + } + } + InputKind::Uniform => { + for _ in 0..total { + out.push(rng.next_f32() * 2.0 - 1.0); + } + } + InputKind::Sparse => { + for _ in 0..total { + let v = if rng.next_f32() < 0.10 { rng.next_normal() } else { 0.0 }; + out.push(v); + } + } + InputKind::LargeDynamicRange => { + for _ in 0..total { + // Mix three orders of magnitude. + let exp = match (rng.next_u64() % 3) as u32 { + 0 => -3, + 1 => 0, + _ => 3, + }; + out.push(rng.next_normal() * (10.0f32).powi(exp)); + } + } + InputKind::MostlyZero => { + for _ in 0..total { + let v = if rng.next_f32() < 0.01 { rng.next_normal() } else { 0.0 }; + out.push(v); + } + } + InputKind::Clustered => { + // 32 cluster centers, tight noise, evenly-distributed assignments. + let num_clusters = 32usize; + let centers: Vec = (0..num_clusters * dim).map(|_| rng.next_normal()).collect(); + for i in 0..n { + let ci = i % num_clusters; + let center = ¢ers[ci * dim..(ci + 1) * dim]; + for &c in center { + out.push(c + 0.10 * rng.next_normal()); + } + } + } + } + out +} + +/// Build a PQ codebook by random-init plus two Lloyd refinements per subspace. +/// Cheap; not optimal; good enough that codes are non-degenerate so probe-time +/// access patterns look realistic. +fn build_codebook(shape: PqShape, kind: InputKind, num_train: usize, rng: &mut SplitMix64) -> Vec { + let svd = shape.sub_vector_dim(); + let train = gen_vectors(num_train, shape.dim, kind, rng); + let mut codebook = vec![0.0f32; shape.codebook_len()]; + let k = shape.num_centroids.min(num_train); + + for m in 0..shape.num_sub_vectors { + // Init centroids from random training samples. + for ki in 0..k { + let src = (rng.next_u64() as usize) % num_train; + let src_off = src * shape.dim + m * svd; + let dst_off = m * shape.num_centroids * svd + ki * svd; + codebook[dst_off..dst_off + svd] + .copy_from_slice(&train[src_off..src_off + svd]); + } + + let mut assignments = vec![0u8; num_train]; + for _iter in 0..2 { + for i in 0..num_train { + let sub = &train[i * shape.dim + m * svd..i * shape.dim + (m + 1) * svd]; + let mut best_k = 0u8; + let mut best_d = f32::INFINITY; + for ki in 0..k { + let c_off = m * shape.num_centroids * svd + ki * svd; + let mut acc = 0.0f32; + for d in 0..svd { + let diff = sub[d] - codebook[c_off + d]; + acc += diff * diff; + } + if acc < best_d { + best_d = acc; + best_k = ki as u8; + } + } + assignments[i] = best_k; + } + let mut sums = vec![0.0f32; k * svd]; + let mut counts = vec![0u32; k]; + for i in 0..num_train { + let ki = assignments[i] as usize; + let sub = &train[i * shape.dim + m * svd..i * shape.dim + (m + 1) * svd]; + for d in 0..svd { + sums[ki * svd + d] += sub[d]; + } + counts[ki] += 1; + } + for ki in 0..k { + let c_off = m * shape.num_centroids * svd + ki * svd; + if counts[ki] == 0 { + let src = (rng.next_u64() as usize) % num_train; + let src_off = src * shape.dim + m * svd; + codebook[c_off..c_off + svd] + .copy_from_slice(&train[src_off..src_off + svd]); + } else { + let inv = 1.0 / counts[ki] as f32; + for d in 0..svd { + codebook[c_off + d] = sums[ki * svd + d] * inv; + } + } + } + } + } + codebook +} + +fn encode(shape: PqShape, n: usize, base: &[f32], codebook: &[f32]) -> Vec { + let svd = shape.sub_vector_dim(); + let mut out = vec![0u8; n * shape.num_sub_vectors]; + for i in 0..n { + for m in 0..shape.num_sub_vectors { + let sub = &base[i * shape.dim + m * svd..i * shape.dim + (m + 1) * svd]; + let mut best_k = 0u8; + let mut best_d = f32::INFINITY; + for ki in 0..shape.num_centroids { + let c_off = m * shape.num_centroids * svd + ki * svd; + let mut acc = 0.0f32; + for d in 0..svd { + let diff = sub[d] - codebook[c_off + d]; + acc += diff * diff; + } + if acc < best_d { + best_d = acc; + best_k = ki as u8; + } + } + out[i * shape.num_sub_vectors + m] = best_k; + } + } + out +} + +/// SplitMix64 — small, deterministic; bit-for-bit reproducible across machines. +struct SplitMix64 { + state: u64, +} + +impl SplitMix64 { + fn new(seed: u64) -> Self { + Self { state: seed } + } + fn next_u64(&mut self) -> u64 { + self.state = self.state.wrapping_add(0x9E37_79B9_7F4A_7C15); + let mut z = self.state; + z = (z ^ (z >> 30)).wrapping_mul(0xBF58_476D_1CE4_E5B9); + z = (z ^ (z >> 27)).wrapping_mul(0x94D0_49BB_1331_11EB); + z ^ (z >> 31) + } + fn next_f32(&mut self) -> f32 { + let bits = (self.next_u64() >> 40) as u32; + bits as f32 / ((1u32 << 24) as f32) + } + fn next_normal(&mut self) -> f32 { + let mut u1 = self.next_f32(); + if u1 < 1e-7 { + u1 = 1e-7; + } + let u2 = self.next_f32(); + (-2.0 * u1.ln()).sqrt() * (std::f32::consts::TAU * u2).cos() + } +} + +fn shape_hash(s: PqShape) -> u64 { + (s.dim as u64).wrapping_mul(0x9E37_79B9_7F4A_7C15) + ^ (s.num_sub_vectors as u64).wrapping_mul(0xBF58_476D_1CE4_E5B9) + ^ (s.num_centroids as u64).wrapping_mul(0x94D0_49BB_1331_11EB) +} + +fn kind_hash(k: InputKind) -> u64 { + let tag: u64 = match k { + InputKind::Gaussian => 0x11, + InputKind::Uniform => 0x22, + InputKind::Sparse => 0x33, + InputKind::LargeDynamicRange => 0x44, + InputKind::MostlyZero => 0x55, + InputKind::Clustered => 0x66, + }; + tag.wrapping_mul(0xDEAD_BEEF_CAFE_F00D) +} + +fn dist_hash(d: DataDistribution) -> u64 { + let tag: u64 = match d { + DataDistribution::Clustered => 0x101, + DataDistribution::Uniform => 0x202, + DataDistribution::Sparse => 0x303, + }; + tag.wrapping_mul(0xFEED_FACE_BABE_CAFE) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn correctness_battery_is_deterministic() { + let a = correctness_battery(0xABCD); + let b = correctness_battery(0xABCD); + assert_eq!(a.len(), b.len()); + for (x, y) in a.iter().zip(&b) { + assert_eq!(x.shape, y.shape); + assert_eq!(x.codes, y.codes); + assert_eq!(x.query, y.query); + } + } + + #[test] + fn speed_workloads_match_shapes() { + let w = speed_workloads(0x1234); + assert_eq!(w.len(), SHAPES.len() * DISTRIBUTIONS.len()); + for wl in w { + assert_eq!(wl.codebook.len(), wl.shape.codebook_len()); + assert_eq!(wl.codes.len(), wl.num_vectors * wl.shape.num_sub_vectors); + assert_eq!(wl.queries.len(), wl.num_queries * wl.shape.dim); + } + } +} diff --git a/research/lance-autoresearch/src/kernels.rs b/research/lance-autoresearch/src/kernels.rs index a6517a8..c4c1a70 100644 --- a/research/lance-autoresearch/src/kernels.rs +++ b/research/lance-autoresearch/src/kernels.rs @@ -3,114 +3,143 @@ // AGENT'S PLAYGROUND. This is the file you (the agent) modify. // // Algorithmically modeled on the L2 path in lance-linalg's distance / pq modules -// (Lance 4.x, Apache-2.0; see https://github.com/lance-format/lance). It is *not* -// a verbatim vendored copy — pulling in lance-linalg's private helpers as deps -// would couple this harness to crate internals and slow rebuilds. The baseline is -// intentionally a clean scalar implementation of the same algorithm Lance uses: -// build an asymmetric distance LUT, then probe every PQ-encoded vector via 16 -// table lookups + an accumulator. Beating the baseline (and porting wins back -// upstream) is the point of this repo. +// (Lance 4.x, Apache-2.0; https://github.com/lance-format/lance). It is *not* a +// verbatim vendored copy — pulling in lance-linalg's private helpers as deps +// would couple this harness to crate internals and slow rebuilds. The baseline +// is intentionally a clean scalar implementation of the same algorithm Lance +// uses: pre-process the codebook into a kernel context, build an asymmetric +// distance LUT per query, then probe every PQ-encoded vector via +// `num_sub_vectors` table lookups + an accumulator. // -// PUBLIC API CONTRACT (must remain stable so `bin/run_experiment.rs` keeps building): -// - DistanceTable type alias -// - compute_distance_table_l2(query, codebook) -> DistanceTable -// - probe_pq_l2_top_k(table, codes, num_vectors, &mut TopKHeap) -// - TopKHeap::new() / push / into_sorted +// PUBLIC API CONTRACT (must remain stable so the bench keeps building): +// - `pub struct PqKernel` +// - `PqKernel::new(shape: PqShape, codebook: &[f32]) -> Self` +// - `PqKernel::shape(&self) -> &PqShape` +// - `PqKernel::distance_table(&self, query: &[f32]) -> Vec` +// - `PqKernel::probe_top_k(&self, table: &[f32], codes: &[u8], num_vectors: usize, k: usize) -> Vec<(u32, f32)>` // -// You may add private helpers, switch internal data layouts (e.g. transpose the -// codebook for vectorized table-build, pack the LUT for `pshufb`), drop down to -// `std::arch` intrinsics behind cfg gates, mark functions `#[inline]`, etc. -// You may NOT change `DIM` / `NUM_SUB_VECTORS` / `NUM_CENTROIDS` / `TOP_K` -// (those are pinned by the fixture geometry in `lib.rs`). +// What you CAN do: +// - Pre-process the codebook in `new` (transpose, pack into u8 LUT, cache c·c +// for the FMA trick, etc.). Build cost is paid once per dataset, amortized +// across queries — the bench measures per-query, not per-(build + query). +// - Reorder loops, switch internal data layouts, drop down to `std::arch` +// intrinsics under `#[cfg(target_arch = ...)]` gates (always keep a +// portable scalar fallback so this compiles everywhere). +// - Use `unsafe` if needed; document the invariants. +// - Add private helpers freely. +// +// What you CANNOT do: +// - Change the public API above. +// - Modify lib.rs / reference.rs / inputs.rs / run_experiment.rs / benches/. +// - Lose accuracy — the correctness phase asserts max_abs_err ≤ 1e-4 against +// the scalar reference on every input. Lossy techniques (u8-LUT +// quantization, etc.) will fail the gate; if you want to explore them, +// surface a separate kernel and propose a "lossy track" extension to the +// human. -use crate::{NUM_CENTROIDS, NUM_SUB_VECTORS, SUB_VECTOR_DIM, TOP_K}; +use crate::PqShape; -/// Precomputed asymmetric L2 distance table. -/// -/// Indexed as `table[sub_vector_idx][centroid_idx]`. Each entry is the squared -/// L2 distance from the query's `m`-th sub-vector to the `k`-th centroid of the -/// `m`-th sub-quantizer. -pub type DistanceTable = [[f32; NUM_CENTROIDS]; NUM_SUB_VECTORS]; +/// Kernel context. Pre-computed state derived from the codebook lives here. +pub struct PqKernel { + shape: PqShape, + /// Codebook stored in `[m][k][d]` layout. + codebook: Vec, +} -/// Build the asymmetric distance table for one query against the codebook. -/// -/// `codebook` layout: contiguous `[NUM_SUB_VECTORS][NUM_CENTROIDS][SUB_VECTOR_DIM]`. -#[allow(clippy::needless_range_loop)] -pub fn compute_distance_table_l2(query: &[f32], codebook: &[f32]) -> DistanceTable { - debug_assert_eq!(query.len(), NUM_SUB_VECTORS * SUB_VECTOR_DIM); - debug_assert_eq!( - codebook.len(), - NUM_SUB_VECTORS * NUM_CENTROIDS * SUB_VECTOR_DIM - ); +impl PqKernel { + /// Build a kernel context from a codebook. Pre-processing time is amortized + /// across all queries against this kernel. + /// + /// `codebook` layout: `[num_sub_vectors][num_centroids][sub_vector_dim]` flat. + pub fn new(shape: PqShape, codebook: &[f32]) -> Self { + debug_assert_eq!(codebook.len(), shape.codebook_len()); + Self { + shape, + codebook: codebook.to_vec(), + } + } - let mut table = [[0.0f32; NUM_CENTROIDS]; NUM_SUB_VECTORS]; - for m in 0..NUM_SUB_VECTORS { - let q_sub = &query[m * SUB_VECTOR_DIM..(m + 1) * SUB_VECTOR_DIM]; - let cb_offset = m * NUM_CENTROIDS * SUB_VECTOR_DIM; - for k in 0..NUM_CENTROIDS { - let base = cb_offset + k * SUB_VECTOR_DIM; - let mut acc = 0.0f32; - for d in 0..SUB_VECTOR_DIM { - let diff = q_sub[d] - codebook[base + d]; - acc += diff * diff; + pub fn shape(&self) -> &PqShape { + &self.shape + } + + /// Asymmetric L2 distance table for one query. + /// + /// Layout of returned `Vec`: `[num_sub_vectors][num_centroids]` flat + /// (`table[m * num_centroids + k]`). + #[allow(clippy::needless_range_loop)] + pub fn distance_table(&self, query: &[f32]) -> Vec { + let s = &self.shape; + let svd = s.sub_vector_dim(); + debug_assert_eq!(query.len(), s.dim); + + let mut table = vec![0.0f32; s.distance_table_len()]; + for m in 0..s.num_sub_vectors { + let q_sub = &query[m * svd..(m + 1) * svd]; + let cb_off = m * s.num_centroids * svd; + let tbl_off = m * s.num_centroids; + for k in 0..s.num_centroids { + let base = cb_off + k * svd; + let mut acc = 0.0f32; + for d in 0..svd { + let diff = q_sub[d] - self.codebook[base + d]; + acc += diff * diff; + } + table[tbl_off + k] = acc; } - table[m][k] = acc; } + table } - table -} -/// Probe every PQ-encoded vector and accumulate the top-K minimum distances. -/// -/// `codes` layout: `[num_vectors][NUM_SUB_VECTORS]` packed; one byte per sub-quantizer. -pub fn probe_pq_l2_top_k( - table: &DistanceTable, - codes: &[u8], - num_vectors: usize, - out: &mut TopKHeap, -) { - debug_assert_eq!(codes.len(), num_vectors * NUM_SUB_VECTORS); + /// Probe `num_vectors` PQ-encoded vectors and return top-K by ascending + /// distance. + /// + /// `codes` layout: `[num_vectors][num_sub_vectors]` flat. + pub fn probe_top_k( + &self, + table: &[f32], + codes: &[u8], + num_vectors: usize, + k: usize, + ) -> Vec<(u32, f32)> { + let s = &self.shape; + debug_assert_eq!(table.len(), s.distance_table_len()); + debug_assert_eq!(codes.len(), num_vectors * s.num_sub_vectors); - for i in 0..num_vectors { - let off = i * NUM_SUB_VECTORS; - let mut acc = 0.0f32; - for m in 0..NUM_SUB_VECTORS { - let k = codes[off + m] as usize; - acc += table[m][k]; + let mut heap = TopKHeap::with_capacity(k); + for i in 0..num_vectors { + let off = i * s.num_sub_vectors; + let mut acc = 0.0f32; + for m in 0..s.num_sub_vectors { + let c = codes[off + m] as usize; + acc += table[m * s.num_centroids + c]; + } + heap.push(i as u32, acc); } - out.push(i as u32, acc); + heap.into_sorted() } } -/// Fixed-capacity max-heap that keeps the K *smallest*-distance entries seen. -/// -/// Root is the largest of the K kept distances, so deciding whether to admit a -/// new entry is one comparison. -pub struct TopKHeap { - entries: [(u32, f32); TOP_K], - len: usize, -} - -impl Default for TopKHeap { - fn default() -> Self { - Self::new() - } +/// Variable-capacity max-heap that keeps the K *smallest*-distance entries. +/// Root is the largest of the K kept distances, so admission is one comparison. +struct TopKHeap { + cap: usize, + entries: Vec<(u32, f32)>, } impl TopKHeap { - pub fn new() -> Self { + fn with_capacity(k: usize) -> Self { Self { - entries: [(u32::MAX, f32::INFINITY); TOP_K], - len: 0, + cap: k, + entries: Vec::with_capacity(k), } } #[inline] - pub fn push(&mut self, id: u32, dist: f32) { - if self.len < TOP_K { - self.entries[self.len] = (id, dist); - self.len += 1; - if self.len == TOP_K { + fn push(&mut self, id: u32, dist: f32) { + if self.entries.len() < self.cap { + self.entries.push((id, dist)); + if self.entries.len() == self.cap { self.heapify(); } return; @@ -122,20 +151,21 @@ impl TopKHeap { } fn heapify(&mut self) { - for i in (0..TOP_K / 2).rev() { + for i in (0..self.entries.len() / 2).rev() { self.sift_down(i); } } fn sift_down(&mut self, mut i: usize) { + let len = self.entries.len(); loop { let l = 2 * i + 1; let r = 2 * i + 2; let mut largest = i; - if l < self.len && self.entries[l].1 > self.entries[largest].1 { + if l < len && self.entries[l].1 > self.entries[largest].1 { largest = l; } - if r < self.len && self.entries[r].1 > self.entries[largest].1 { + if r < len && self.entries[r].1 > self.entries[largest].1 { largest = r; } if largest == i { @@ -146,9 +176,9 @@ impl TopKHeap { } } - pub fn into_sorted(self) -> Vec<(u32, f32)> { - let mut v: Vec<_> = self.entries[..self.len].to_vec(); - v.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal)); - v + fn into_sorted(mut self) -> Vec<(u32, f32)> { + self.entries + .sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal)); + self.entries } } diff --git a/research/lance-autoresearch/src/lib.rs b/research/lance-autoresearch/src/lib.rs index 5da7fea..b050c03 100644 --- a/research/lance-autoresearch/src/lib.rs +++ b/research/lance-autoresearch/src/lib.rs @@ -1,21 +1,56 @@ //! Lance autoresearch harness — public API for the bench binary, benchmarks, and tests. //! -//! Layout mirrors Karpathy's nanochat-research / autoresearch three-file contract: +//! Contract (Karpathy-style three files): //! -//! - `kernels` — the AGENT'S PLAYGROUND. May be rewritten freely. -//! - `reference` — IMMUTABLE. Exact brute-force baseline used to certify recall. -//! - `fixture` — IMMUTABLE. Dataset + frozen codebook loader. +//! - `kernels` — the AGENT'S PLAYGROUND. Modify freely. +//! - `reference` — IMMUTABLE. Scalar reference kernel. Defines the math. +//! - `inputs` — IMMUTABLE. Diverse test-data + workload generators, +//! deterministic per fixed seed, varied across the input battery. //! -//! Constants are global because the agent shouldn't have to thread sizes through -//! its kernel — they pin the optimization target (SIFT1M-shaped: 128-d f32, -//! 16 sub-vectors × 256 centroids × 8-d, top-10). +//! The optimization target is dataset-independent: the agent's kernel must match +//! the scalar reference within `MAX_ABS_ERR` on every input the bench generates, +//! and minimize geomean ns/query across multiple PQ shapes and data +//! distributions. There is no fixed dataset; an "improvement" by construction +//! generalizes across distributions and shapes. -pub mod fixture; +pub mod inputs; pub mod kernels; pub mod reference; -pub const DIM: usize = 128; -pub const NUM_SUB_VECTORS: usize = 16; -pub const NUM_CENTROIDS: usize = 256; -pub const SUB_VECTOR_DIM: usize = DIM / NUM_SUB_VECTORS; -pub const TOP_K: usize = 10; +/// Geometry of a PQ index: vector dimension, number of sub-quantizers, centroids +/// per sub-quantizer. We pin nbits=8 (256 centroids) — the dominant Lance code +/// path. `dim` must be divisible by `num_sub_vectors`. +#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)] +pub struct PqShape { + pub dim: usize, + pub num_sub_vectors: usize, + pub num_centroids: usize, +} + +impl PqShape { + pub const fn new(dim: usize, num_sub_vectors: usize, num_centroids: usize) -> Self { + Self { + dim, + num_sub_vectors, + num_centroids, + } + } + pub const fn sub_vector_dim(&self) -> usize { + self.dim / self.num_sub_vectors + } + pub const fn distance_table_len(&self) -> usize { + self.num_sub_vectors * self.num_centroids + } + pub const fn codebook_len(&self) -> usize { + self.num_sub_vectors * self.num_centroids * self.sub_vector_dim() + } +} + +/// Tolerance for the agent kernel's distance values vs. the scalar reference. +/// Loose enough to permit legal SIMD-accumulator reordering; tight enough to +/// catch real arithmetic bugs. +pub const MAX_ABS_ERR: f32 = 1e-4; + +/// Tolerance for top-K *distances* (id sets are compared with tie-tolerance — +/// see `reference::topk_consistent`). +pub const TOPK_DIST_TOL: f32 = 1e-4; diff --git a/research/lance-autoresearch/src/reference.rs b/research/lance-autoresearch/src/reference.rs index 3a7d60c..72a7c54 100644 --- a/research/lance-autoresearch/src/reference.rs +++ b/research/lance-autoresearch/src/reference.rs @@ -1,35 +1,130 @@ -//! IMMUTABLE. Brute-force exact L2 top-K. Used at fixture-build time to compute -//! synthetic-dataset ground truth (against which the agent's PQ-approximate -//! kernel is then scored for recall). For SIFT1M fixtures we use the published -//! ground-truth file instead and never call this at bench-time. +//! IMMUTABLE. Scalar reference kernel — defines the math the agent must match. +//! +//! Same public API as `kernels::PqKernel`, intentionally — it's the bit-for-bit +//! oracle the bench compares against. The implementation here is deliberately +//! plain: no SIMD, no preprocessing, no cleverness. If `kernels::PqKernel` +//! produces a distance value off by more than `MAX_ABS_ERR` from this, the +//! correctness phase fails and the agent's edit is rejected. -use crate::DIM; +use crate::PqShape; -/// Brute-force exact top-K by squared L2. Returns `(id, distance)` ascending. -/// -/// Quadratic in `num_vectors`; only used by the fixture builder, not the hot path. -pub fn brute_force_top_k_l2( - query: &[f32], - base: &[f32], - num_vectors: usize, - k: usize, -) -> Vec<(u32, f32)> { - assert_eq!(query.len(), DIM); - assert_eq!(base.len(), num_vectors * DIM); - - let mut dists: Vec<(u32, f32)> = (0..num_vectors) - .map(|i| { - let v = &base[i * DIM..(i + 1) * DIM]; - let mut acc = 0.0f32; - for d in 0..DIM { - let diff = query[d] - v[d]; - acc += diff * diff; - } - (i as u32, acc) - }) - .collect(); - - dists.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal)); - dists.truncate(k); - dists +pub struct ScalarReference { + shape: PqShape, + codebook: Vec, +} + +impl ScalarReference { + pub fn new(shape: PqShape, codebook: &[f32]) -> Self { + assert_eq!(codebook.len(), shape.codebook_len()); + Self { + shape, + codebook: codebook.to_vec(), + } + } + + pub fn shape(&self) -> &PqShape { + &self.shape + } + + #[allow(clippy::needless_range_loop)] + pub fn distance_table(&self, query: &[f32]) -> Vec { + let s = &self.shape; + let svd = s.sub_vector_dim(); + assert_eq!(query.len(), s.dim); + + let mut table = vec![0.0f32; s.distance_table_len()]; + for m in 0..s.num_sub_vectors { + let q_sub = &query[m * svd..(m + 1) * svd]; + let cb_off = m * s.num_centroids * svd; + let tbl_off = m * s.num_centroids; + for k in 0..s.num_centroids { + let base = cb_off + k * svd; + let mut acc = 0.0f32; + for d in 0..svd { + let diff = q_sub[d] - self.codebook[base + d]; + acc += diff * diff; + } + table[tbl_off + k] = acc; + } + } + table + } + + pub fn probe_top_k( + &self, + table: &[f32], + codes: &[u8], + num_vectors: usize, + k: usize, + ) -> Vec<(u32, f32)> { + let s = &self.shape; + assert_eq!(table.len(), s.distance_table_len()); + assert_eq!(codes.len(), num_vectors * s.num_sub_vectors); + + let mut all: Vec<(u32, f32)> = (0..num_vectors) + .map(|i| { + let off = i * s.num_sub_vectors; + let mut acc = 0.0f32; + for m in 0..s.num_sub_vectors { + let c = codes[off + m] as usize; + acc += table[m * s.num_centroids + c]; + } + (i as u32, acc) + }) + .collect(); + all.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal)); + all.truncate(k); + all + } +} + +/// Compare two distance tables and report the worst absolute element error. +pub fn max_abs_err(a: &[f32], b: &[f32]) -> f32 { + assert_eq!(a.len(), b.len()); + a.iter() + .zip(b) + .map(|(x, y)| (x - y).abs()) + .fold(0.0f32, f32::max) +} + +/// Check two top-K results are equivalent up to: +/// - shared distance tolerance `dist_tol` +/// - distance-tied id substitution (if two candidates have equal distances, +/// either order is acceptable) +/// +/// Returns `Ok(())` on match, or `Err(diagnostic_string)` describing the first +/// disagreement found. +pub fn topk_consistent( + agent: &[(u32, f32)], + reference: &[(u32, f32)], + dist_tol: f32, +) -> Result<(), String> { + if agent.len() != reference.len() { + return Err(format!( + "topk length mismatch: agent={} reference={}", + agent.len(), + reference.len() + )); + } + for (i, ((a_id, a_d), (r_id, r_d))) in agent.iter().zip(reference).enumerate() { + if (a_d - r_d).abs() > dist_tol { + return Err(format!( + "topk[{i}] distance mismatch: agent=({a_id}, {a_d}) reference=({r_id}, {r_d}) | err={}", + (a_d - r_d).abs() + )); + } + if a_id != r_id { + // Different id at same rank is acceptable iff this distance is tied + // with a neighbor in either result — we accept any permutation + // within a tied-distance band. + let agent_neighbor_match = agent.iter().any(|(id, d)| id == r_id && (d - r_d).abs() <= dist_tol); + let ref_neighbor_match = reference.iter().any(|(id, d)| id == a_id && (d - a_d).abs() <= dist_tol); + if !agent_neighbor_match || !ref_neighbor_match { + return Err(format!( + "topk[{i}] id mismatch with no tie-break excuse: agent=({a_id}, {a_d}) reference=({r_id}, {r_d})" + )); + } + } + } + Ok(()) }