research: lance-autoresearch — PQ L2 kernel autoresearch harness

Stand up a standalone Rust project under research/lance-autoresearch/ for LLM-driven optimization of Lance's PQ L2 distance kernels, following Karpathy's three-file autoresearch contract: - src/kernels.rs (mutable, the agent's playground): scalar baseline PQ L2 distance + top-K matching Lance 4.x's algorithm shape (16 sub-vectors, 256 centroids, 8-bit codes, 128-d f32). - src/{fixture,reference,bin/run_experiment}.rs (immutable): SIFT1M loader (fvecs/ivecs + frozen codebook) with deterministic synthetic fallback, brute-force ground truth, fixed-format result block with recall@10 floor + time-budget exits. - program.md (human-iterated): the skill the agent reads each session — setup, what it can / cannot edit, the metric, Lance-PQ-specific priors, the keep/revert loop. Smoke tests pass: baseline build clean, recall@10 = 0.66 on synthetic above the 0.50 floor (exit 0), broken kernel triggers floor failure (exit 2), clippy -D warnings clean. Excludes research/ from omnigraph workspace so the nested project doesn't enter omnigraph's cargo build graph. Licensed dual MIT / Apache-2.0 to keep the upstream-PR path to lance-format/lance clean. https://claude.ai/code/session_01Aq8kBUcjmEPobcEufnWbW5
2026-06-09 01:35:18 +02:00 · 2026-05-14 22:38:39 +00:00 · 2026-05-14 22:38:39 +00:00 · ed376af7d8
commit ed376af7d8
parent 0de7fb3057
15 changed files with 1418 additions and 0 deletions
--- a/Cargo.toml
+++ b/Cargo.toml
@ -6,6 +6,11 @@ members = [
    "crates/omnigraph-cli",
    "crates/omnigraph-server",
 ]
+exclude = [
+    # `research/` holds standalone experimental projects with their own
+    # workspaces. They must not be picked up by the omnigraph workspace build.
+    "research",
+]
 default-members = [
    "crates/omnigraph",
    "crates/omnigraph-cli",
--- a/research/lance-autoresearch/.gitignore
+++ b/research/lance-autoresearch/.gitignore
@ -0,0 +1,7 @@
+target/
+Cargo.lock
+results.tsv
+run.log
+.DS_Store
+*.swp
+data/
--- a/research/lance-autoresearch/Cargo.toml
+++ b/research/lance-autoresearch/Cargo.toml
@ -0,0 +1,30 @@
+[package]
+name = "lance-autoresearch"
+version = "0.1.0"
+edition = "2024"
+license = "MIT OR Apache-2.0"
+description = "Autoresearch-style harness for evolving Lance PQ L2 distance kernels via LLM agents."
+publish = false
+
+[lib]
+path = "src/lib.rs"
+
+[[bin]]
+name = "run_experiment"
+path = "src/bin/run_experiment.rs"
+
+[[bench]]
+name = "pq_l2"
+harness = false
+
+[dependencies]
+anyhow = "1"
+
+[dev-dependencies]
+criterion = { version = "0.5", default-features = false, features = ["plotters", "cargo_bench_support"] }
+
+[profile.release]
+opt-level = 3
+lto = "thin"
+codegen-units = 1
+debug = 1
--- a/research/lance-autoresearch/LICENSE-APACHE
+++ b/research/lance-autoresearch/LICENSE-APACHE
@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for describing the origin of the Work and
+      reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Support. While redistributing the Work or
+      Derivative Works thereof, You may choose to offer, and charge a
+      fee for, acceptance of support, warranty, indemnity, or other
+      liability obligations and/or rights consistent with this License.
+      However, in accepting such obligations, You may act only on Your
+      own behalf and on Your sole responsibility, not on behalf of any
+      other Contributor, and only if You agree to indemnify, defend, and
+      hold each Contributor harmless for any liability incurred by, or
+      claims asserted against, such Contributor by reason of your
+      accepting any such warranty or support.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright 2026 lance-autoresearch contributors
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
--- a/research/lance-autoresearch/LICENSE-MIT
+++ b/research/lance-autoresearch/LICENSE-MIT
@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2026 lance-autoresearch contributors
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/research/lance-autoresearch/README.md
+++ b/research/lance-autoresearch/README.md
@ -0,0 +1,101 @@
+# lance-autoresearch
+
+An autoresearch-style harness for evolving [Lance](https://github.com/lance-format/lance)
+PQ L2 distance kernels via LLM coding agents (Claude Code, Codex, Cursor).
+
+Modeled on Andrej Karpathy's
+[`nanochat-research`](https://x.com/karpathy/status/1855651423497650238)
+three-file contract:
+
+- **Immutable bench** — `src/bin/run_experiment.rs` + `src/fixture.rs` + `src/reference.rs`.
+  The agent cannot touch these.
+- **Mutable kernel** — `src/kernels.rs`. The agent's playground. Starts as a clean
+  scalar PQ L2 implementation matching Lance's algorithm; the agent's job is to
+  beat it.
+- **Human-iterated program** — `program.md`. The "skill" the agent reads at the
+  start of every session. The human refines it between runs.
+
+The optimization target is the PQ L2 distance kernel for f32 dense vectors on
+SIFT1M-shaped data (128-d, 16 sub-vectors × 256 centroids, 8-bit codes, top-10
+retrieval). The eval oracle is **recall@10 against SIFT1M's published ground
+truth** at fixed kernel shape, with `geomean_ns_per_query` as the speed metric.
+
+## Why a standalone project
+
+OmniGraph (the graph engine that motivated this) pins Lance at a released
+version and consumes its kernels via the public crate API. Improvements live one
+layer below: in Lance itself. A standalone project with no OmniGraph dep (it is
+excluded from the OmniGraph cargo workspace) keeps the optimization target pure
+(only the kernel changes), keeps the license clean for upstream contribution
+(dual MIT/Apache-2.0 → Apache-2.0 PRs to Lance), and keeps the agent's working
+set tiny (~600 lines).
+
+## Quick start
+
+```bash
+# 1. (optional but recommended) Download SIFT1M + train + freeze the PQ codebook.
+#    Takes ~5–10 min; ~250 MB download. Skipping it falls back to a synthetic
+#    deterministic dataset (1024 base / 64 queries) — useful for smoke-testing
+#    the harness but not representative of real workloads.
+bash scripts/prepare_fixtures.sh
+
+# 2. Run the baseline.
+cargo run --release --bin run_experiment
+
+# 3. Or run with Claude Code / Codex:
+#    Open the repo in your agent of choice and prompt:
+#       Hi, have a look at program.md and let's kick off a new experiment.
+```
+
+## File ownership
+
+| File | Mutability | Edited by |
+|---|---|---|
+| `src/kernels.rs` | **mutable** | the agent |
+| `src/bin/run_experiment.rs` | immutable | — |
+| `src/reference.rs` | immutable | — |
+| `src/fixture.rs` | immutable | — |
+| `benches/pq_l2.rs` | immutable | — |
+| `scripts/prepare_fixtures.sh` | immutable | — |
+| `program.md` | human-iterated | the human, between runs |
+| `results.tsv` | append-only | the agent, per trial (gitignored) |
+
+## The metric
+
+`run_experiment` prints a fixed-format block:
+
+```
+---
+source:               sift1m
+num_base:             1000000
+num_queries:          1000
+recall_at_10:         0.9421
+geomean_ns_per_query: 184273
+peak_mem_mb:          42.1
+total_seconds:        21.7
+```
+
+A kernel is "kept" iff:
+
+- `recall_at_10` is within 0.005 of the seeded scalar baseline (and ≥ 0.50 hard floor)
+- `geomean_ns_per_query` is strictly better than the previous best-kept kernel
+- `total_seconds` ≤ 600
+
+See `program.md` for the full loop spec.
+
+## Upstream contribution path
+
+When a commit clears the keep bar by a meaningful margin (≥10% speedup with
+recall in-band), the human reviews the diff, ports the technique against
+[`lance-format/lance`](https://github.com/lance-format/lance) HEAD, runs Lance's
+own test suite, and opens a PR. Because `src/kernels.rs` is dual MIT/Apache-2.0
+licensed and algorithmically modeled on Lance's existing path, the upstream PR
+inherits Apache-2.0 cleanly.
+
+## License
+
+Dual-licensed under either of:
+
+- MIT license ([LICENSE-MIT](LICENSE-MIT))
+- Apache License, Version 2.0 ([LICENSE-APACHE](LICENSE-APACHE))
+
+at your option.
--- a/research/lance-autoresearch/benches/pq_l2.rs
+++ b/research/lance-autoresearch/benches/pq_l2.rs
@ -0,0 +1,56 @@
+//! Criterion benchmark — runs the same kernels the agent edits, but with
+//! statistical sampling. Use this for stable speed comparisons; the
+//! `run_experiment` binary is the agent's per-trial harness.
+//!
+//! `cargo bench --bench pq_l2`
+
+use std::hint::black_box;
+
+use criterion::{Criterion, criterion_group, criterion_main};
+
+use lance_autoresearch::fixture::Fixture;
+use lance_autoresearch::kernels::{TopKHeap, compute_distance_table_l2, probe_pq_l2_top_k};
+use lance_autoresearch::{DIM, NUM_SUB_VECTORS};
+
+/// Registers the Criterion benchmarks for the PQ L2 kernels.
+///
+/// Three benchmarks are registered, all driven by the first query vector of
+/// the fixture:
+///
+/// - `compute_distance_table_l2` — distance-table build only.
+/// - `probe_pq_l2_top_k` — probe only, against a table built once up front.
+/// - `end_to_end_one_query` — table build followed by the probe.
+///
+/// Panics if the fixture cannot be loaded or synthesized.
+fn bench_pq_l2(c: &mut Criterion) {
+    let fixture = Fixture::load_or_synthesize().expect("fixture");
+
+    // The first `DIM` floats of the query buffer form the query used everywhere below.
+    let query = &fixture.query_vectors[..DIM];
+    // Built once so the probe-only benchmark does not pay for the table build.
+    let prebuilt_table = compute_distance_table_l2(query, &fixture.codebook);
+
+    c.bench_function("compute_distance_table_l2", |bencher| {
+        bencher.iter(|| {
+            black_box(compute_distance_table_l2(
+                black_box(query),
+                black_box(&fixture.codebook),
+            ));
+        });
+    });
+
+    c.bench_function("probe_pq_l2_top_k", |bencher| {
+        bencher.iter(|| {
+            let mut top_k = TopKHeap::new();
+            probe_pq_l2_top_k(
+                black_box(&prebuilt_table),
+                black_box(&fixture.codes),
+                black_box(fixture.num_base),
+                &mut top_k,
+            );
+            black_box(top_k);
+        });
+    });
+
+    c.bench_function("end_to_end_one_query", |bencher| {
+        bencher.iter(|| {
+            let table = compute_distance_table_l2(black_box(query), black_box(&fixture.codebook));
+            let mut top_k = TopKHeap::new();
+            probe_pq_l2_top_k(
+                &table,
+                black_box(&fixture.codes),
+                black_box(fixture.num_base),
+                &mut top_k,
+            );
+            black_box(top_k);
+        });
+    });
+
+    // `NUM_SUB_VECTORS` is imported but not otherwise used in this function;
+    // this no-op use keeps the import from triggering an unused-import warning.
+    let _ = NUM_SUB_VECTORS;
+}
+
+criterion_group!(benches, bench_pq_l2);
+criterion_main!(benches);
--- a/research/lance-autoresearch/program.md
+++ b/research/lance-autoresearch/program.md
@ -0,0 +1,151 @@
+# Lance PQ L2 kernel research — agent instructions
+
+You are an autonomous research assistant. Your job is to improve the kernel(s) in
+`src/kernels.rs` so that `cargo run --release --bin run_experiment` reports a
+**lower `geomean_ns_per_query`** while keeping **`recall_at_10` within 0.005 of
+the seeded baseline** (and never below the hard floor 0.50).
+
+Read this file end-to-end before doing anything else. Then run setup, then the loop.
+
+## Setup (do once at the start of every session)
+
+1. Read these files, in this order:
+   - `README.md`
+   - `program.md` (this file)
+   - `src/lib.rs`
+   - `src/kernels.rs` *(the only file you may edit)*
+   - `src/bin/run_experiment.rs`
+   - `src/fixture.rs`
+2. Confirm fixtures are present. SIFT1M lives under `~/.cache/lance-autoresearch/`.
+   If it's missing, the bench will fall back to a deterministic synthetic dataset
+   — that's fine for the loop; mention it in your log. If you want SIFT1M, run
+   `bash scripts/prepare_fixtures.sh` (one-time, ~5–10 min, ~250 MB download).
+3. Ensure `results.tsv` exists. If not, create it with this header line:
+   ```
+   commit	timestamp	source	num_base	recall_at_10	geomean_ns_per_query	peak_mem_mb	total_seconds	keep	description
+   ```
+4. Run the baseline trial: `cargo run --release --bin run_experiment > run.log 2>&1`.
+   Parse `run.log` and append a row to `results.tsv` with `keep=baseline`,
+   `description="seeded scalar PQ-L2 baseline"`. This is your reference number.
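+5. Mark the baseline in git with a one-line commit message like `baseline: <numbers>`.
+   `results.tsv` and `run.log` are gitignored, so the row itself is never committed;
+   if there is nothing else staged, use `git commit --allow-empty`.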
+
+## What you CAN do
+
+- Modify **`src/kernels.rs`** freely. You may:
+  - Reorder loops, change iteration order over codes or sub-vectors.
+  - Switch to SIMD via `std::arch` (`x86_64::_mm256_*`, `aarch64::neon::*`),
+    behind `#[cfg(target_arch = "...")]` gates. Always keep a portable scalar
+    fallback so the kernel compiles everywhere.
+  - Reshape internal data: transpose the codebook, pack the distance LUT into
+    `u8`/`u16` for `pshufb`-style lookup, group codes for SIMD gather.
+  - Use `unsafe` if needed; document the invariants you're relying on.
+  - Mark hot functions `#[inline]` or split them; add private helpers freely.
+- Add `#[cfg(test)] mod tests { ... }` inside `src/kernels.rs` if you want
+  property checks against the scalar path.
+
+## What you CANNOT do
+
+- Do **not** modify `src/lib.rs`: it defines `DIM` / `NUM_SUB_VECTORS` / `NUM_CENTROIDS` /
+  `TOP_K`, and these constants pin the fixture geometry.
+- Do **not** modify `src/bin/run_experiment.rs`, `src/reference.rs`, `src/fixture.rs`,
+  `benches/pq_l2.rs`, `scripts/prepare_fixtures.sh`, or `Cargo.toml`.
+- Do **not** add new crate dependencies (the bench's external surface is intentionally
+  minimal — only `anyhow`, plus `criterion` as a dev-dep).
+- Do **not** delete or alter the public API of `kernels.rs`:
+  - `pub type DistanceTable`
+  - `pub fn compute_distance_table_l2(query: &[f32], codebook: &[f32]) -> DistanceTable`
+  - `pub fn probe_pq_l2_top_k(table: &DistanceTable, codes: &[u8], num_vectors: usize, out: &mut TopKHeap)`
+  - `pub struct TopKHeap` with `new() / push / into_sorted`
+
+## The metric
+
+Minimize `geomean_ns_per_query` (geometric mean of per-query wall-clock from the
+benched queries, rounded to a u64 ns) subject to:
+
+1. `recall_at_10 >= baseline_recall_at_10 - 0.005`
+2. `recall_at_10 >= 0.50` (hard floor; below this the bench exits non-zero)
+3. `total_seconds <= 600`
+4. Build is clean: `cargo build --release` succeeds and
+   `cargo clippy --release --all-targets -- -D warnings` reports zero issues.
+   (Run both before each commit; they are the same commands as step 3 of the loop.)
+
+Ties break toward simpler code. If two kernels report the same speed within
+noise (~3%), prefer the one with fewer lines or less `unsafe`.
+
+## Lance-PQ-specific priors
+
+These are the directions known to pay off on this kernel shape. Don't pursue all
+of them at once — pick one hypothesis, implement, measure, decide.
+
+- **Codebook layout for the table-build step.** The reference layout is
+  `[m][k][d]`. For a fixed query, iterating over centroids stays in cache, but
+  the inner loop over `d` is short (8 floats). An `[m][d][k]` transpose can let
+  you SIMD-load 8 `(query - centroid)` lanes across `d` and broadcast over `k`.
+- **LUT packing for the probe step.** The probe is dominated by `acc +=
+  table[m][codes[off+m]]` × 16. Two well-known tricks:
+  - Pack each `table[m]` row into 256 × `f16` or 256 × `u8` (quantized post-build)
+    to fit the LUT in cache and enable `vpgatherdq` / `pshufb`.
+  - Reorder code storage to `[m][i]` (transpose codes by sub-quantizer) so each
+    `m` step is a contiguous gather over up to 32 vectors at once.
+- **Top-K integration.** `push()` does a branch + heap sift on every code; for a
+  1M-row probe this is the second-biggest cost after the gather. Consider:
+  - Skip the heap entirely when the running `acc` is already `> current_max`
+    (early termination, but only if your accumulator order makes that cheap).
+  - Block the probe (e.g., 1024 codes at a time), find the local top-K with a
+    branchless scan, then merge into the global heap.
+- **Prefetch.** A `_mm_prefetch(codes.as_ptr().add(off + 64), _MM_HINT_T0)` ahead
+  of the gather is usually pure win at 1M scale where codes don't all fit in L2.
+- **FMA in the table build.** The diff–square–sum sequence is
+  `(q - c)·(q - c)` per element — that's `(q*q) - 2qc + c*c`. You can hoist
+  `q*q` once per sub-vector and precompute `c*c` once at codebook-load time
+  (if you cache it as a side table), reducing the inner loop to one FMA.
+  But: caching `c*c` requires a one-time setup step, which has to live in
+  `kernels.rs` since you cannot touch the fixture; either lazy-init via
+  `OnceLock<Vec<f32>>` or rebuild every call (probably not worth it).
+
+## The loop
+
+Once setup is done, repeat indefinitely:
+
+1. **Observe state.** Read the last ~5 rows of `results.tsv`. Note which ideas
+   have been tried, what won, what regressed. Form a hypothesis with one
+   sentence stating the change and the predicted effect on speed and recall.
+2. **Edit `src/kernels.rs`.** Keep the diff focused on the one hypothesis.
+3. **Build and lint.** Run:
+   ```
+   cargo build --release
+   cargo clippy --release --all-targets -- -D warnings
+   ```
+   If either fails, fix and try again — do not commit broken state.
+4. **Run the trial.**
+   ```
+   cargo run --release --bin run_experiment > run.log 2>&1
+   ```
+5. **Parse the result.** Extract `recall_at_10`, `geomean_ns_per_query`,
+   `peak_mem_mb`, `total_seconds` from `run.log`. Compute the deltas vs. baseline.
+6. **Decide keep or revert.**
+   - **Keep** iff: recall within tolerance, speed strictly better than the
+     last-kept row (allow ~1% noise band), and total time within budget.
+   - **Revert** otherwise: `git restore src/kernels.rs` (or commit and `git
+     revert` if you want the revert in history). Note what failed.
+7. **Log.** Append one row to `results.tsv`:
+   ```
+   <short_sha>	<iso8601>	<source>	<num_base>	<recall>	<geomean_ns>	<peak_mem>	<elapsed>	<keep|revert>	<one-line description>
+   ```
+8. **Commit.** Use a one-line message describing the change and the headline
+   number, e.g. `transpose codebook; 184k → 142k ns/query (recall 0.94)`.
+
+## Hygiene
+
+- Always commit `src/kernels.rs` changes; never commit `results.tsv` or `run.log`
+  (they're gitignored).
+- If a change fails to build, do not commit. Iterate until it builds, or revert
+  cleanly.
+- If two consecutive ideas regress, take a beat: re-read the last ~10 rows of
+  `results.tsv` and update your mental model before proposing the next.
+- Per-trial cap: 10 minutes. If `cargo run` is still going after 10 min, kill it
+  and mark the trial as `timeout`.
+
+## Never stop
+
+Keep going until interrupted. Each loop iteration is one hypothesis, one edit,
+one measurement, one commit. No multi-step plans across iterations.
--- a/research/lance-autoresearch/rust-toolchain.toml
+++ b/research/lance-autoresearch/rust-toolchain.toml
@ -0,0 +1,3 @@
+[toolchain]
+channel = "stable"
+components = ["rustfmt", "clippy"]
--- a/research/lance-autoresearch/scripts/prepare_fixtures.sh
+++ b/research/lance-autoresearch/scripts/prepare_fixtures.sh
@ -0,0 +1,46 @@
+#!/usr/bin/env bash
+# IMMUTABLE. One-time SIFT1M fixture preparation.
+#
+# Downloads SIFT1M from the Texmex corpus (Inria), extracts the f32 vector
+# files + ground-truth, then runs the in-tree fixture builder to train a
+# product-quantization codebook and encode the base set. All artifacts are
+# written under ~/.cache/lance-autoresearch/ so they survive between trials
+# but stay out of git.
+#
+# Total time: ~5–10 min on a fresh laptop. ~250 MB download.
+
+set -euo pipefail
+
+# Resolve the project root to an absolute path *before* changing directory.
+# The script is normally invoked through a relative path
+# (`bash scripts/prepare_fixtures.sh`); once we `cd` into the cache directory
+# below, a relative `$0` would resolve against the cache directory instead of
+# the project and the later `cd` back would fail.
+PROJECT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+
+CACHE_DIR="${HOME}/.cache/lance-autoresearch"
+SIFT_URL="ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz"
+SIFT_URL_MIRROR="https://huggingface.co/datasets/qbo-odp/sift1m/resolve/main/sift.tar.gz"
+
+mkdir -p "${CACHE_DIR}"
+cd "${CACHE_DIR}"
+
+# Step 1: fetch and unpack the raw dataset unless all three files are present.
+if [[ ! -f sift_base.fvecs || ! -f sift_query.fvecs || ! -f sift_groundtruth.ivecs ]]; then
+  echo "[prepare_fixtures] downloading SIFT1M..."
+  if [[ ! -f sift.tar.gz ]]; then
+    # Try the primary FTP source first; fall back to the HTTPS mirror.
+    curl --fail -L -o sift.tar.gz "${SIFT_URL}" || \
+      curl --fail -L -o sift.tar.gz "${SIFT_URL_MIRROR}"
+  fi
+  echo "[prepare_fixtures] extracting..."
+  tar xzf sift.tar.gz
+  mv sift/sift_base.fvecs        ./sift_base.fvecs
+  mv sift/sift_query.fvecs       ./sift_query.fvecs
+  mv sift/sift_groundtruth.ivecs ./sift_groundtruth.ivecs
+  rm -rf sift sift.tar.gz
+fi
+
+# Step 2: train + freeze the PQ artifacts unless both already exist in the cache.
+if [[ ! -f pq_codebook.bin || ! -f pq_codes.bin ]]; then
+  echo "[prepare_fixtures] training PQ codebook + encoding base..."
+  # The fixture builder is run as a `cargo test` with a marker env var so we
+  # don't have to add a second binary just for one-time setup. The test reads
+  # SIFT1M, calls the in-tree `train_codebook` + `encode`, and writes the
+  # frozen artifacts next to the dataset.
+  # cargo must run from the project root, hence the absolute PROJECT_DIR.
+  cd "${PROJECT_DIR}"
+  LANCE_AUTORESEARCH_BUILD_FIXTURES=1 cargo test --release --lib build_fixtures -- --ignored --nocapture
+fi
+
+echo "[prepare_fixtures] done — fixtures in ${CACHE_DIR}"
+ls -la "${CACHE_DIR}"
--- a/research/lance-autoresearch/src/bin/run_experiment.rs
+++ b/research/lance-autoresearch/src/bin/run_experiment.rs
@ -0,0 +1,138 @@
+//! IMMUTABLE entry point — the single command the agent invokes per trial.
+//!
+//! Run with:  `cargo run --release --bin run_experiment > run.log 2>&1`
+//!
+//! Loads (or synthesizes) the fixture, calls the kernels in `src/kernels.rs`,
+//! and prints a fixed-format result block the agent can grep:
+//!
+//!     ---
+//!     source:               sift1m | synthetic
+//!     num_base:             1000000
+//!     num_queries:          1000
+//!     recall_at_10:         0.9421
+//!     geomean_ns_per_query: 184273
+//!     peak_mem_mb:          42.1
+//!     total_seconds:        21.70
+//!
+//! Exit codes:
+//!   0  — ran to completion, recall above floor, within time budget.
+//!   2  — recall below floor (kernel is broken).
+//!   3  — total wall-clock exceeded budget.
+//!   1  — any other error.
+
+use std::collections::HashSet;
+use std::time::Instant;
+
+use anyhow::Result;
+
+use lance_autoresearch::fixture::Fixture;
+use lance_autoresearch::kernels::{TopKHeap, compute_distance_table_l2, probe_pq_l2_top_k};
+use lance_autoresearch::{DIM, TOP_K};
+
+/// Upper bound on how many fixture queries are benchmarked in one run.
+const MAX_QUERIES_BENCHED: usize = 1000;
+/// Wall-clock budget for the whole run, in seconds; exceeding it exits with code 3.
+const TIME_BUDGET_SECS: u64 = 600;
+/// Minimum acceptable recall@10; a lower recall exits with code 2.
+const RECALL_FLOOR: f32 = 0.50;
+
+fn main() {
+    match real_main() {
+        Ok(()) => {}
+        Err(e) => {
+            eprintln!("error: {e:#}");
+            std::process::exit(1);
+        }
+    }
+}
+
+/// Runs one benchmark trial and prints the fixed-format result block.
+///
+/// For each benchmarked query the distance table is built and the PQ codes are
+/// probed; only those two kernel calls (plus the heap construction) are timed.
+/// Recall is then scored against the first `min(TOP_K, top_k_truth)` ground
+/// truth ids of that query.
+///
+/// # Errors
+///
+/// Returns an error when the fixture cannot be loaded, when it contains no
+/// queries or no ground truth (recall would be undefined), or when its query
+/// or ground-truth buffers are shorter than the declared shape.
+///
+/// # Exits
+///
+/// Terminates the process with code 2 when recall is below `RECALL_FLOOR`, and
+/// with code 3 when the total wall-clock time exceeds `TIME_BUDGET_SECS`.
+fn real_main() -> Result<()> {
+    let start = Instant::now();
+    let fix = Fixture::load_or_synthesize()?;
+
+    let n_q = MAX_QUERIES_BENCHED.min(fix.num_query);
+    // Number of ground-truth ids actually scored per query.
+    let truth_k = TOP_K.min(fix.top_k_truth);
+    if n_q == 0 || truth_k == 0 {
+        // Without this guard recall would be 0/0 = NaN, and `NaN < floor` is
+        // false, so an empty fixture would silently pass the recall check.
+        anyhow::bail!(
+            "fixture has {n_q} queries and {truth_k} scorable ground-truth ids per query; \
+             recall is undefined"
+        );
+    }
+    if fix.query_vectors.len() < n_q * DIM || fix.groundtruth.len() < n_q * fix.top_k_truth {
+        anyhow::bail!(
+            "fixture buffers are shorter than declared: {} query floats, {} ground-truth ids \
+             for {n_q} queries",
+            fix.query_vectors.len(),
+            fix.groundtruth.len()
+        );
+    }
+
+    let mut hits = 0usize;
+    let mut total_relevant = 0usize;
+    let mut per_query_ns: Vec<u64> = Vec::with_capacity(n_q);
+
+    for qi in 0..n_q {
+        let q = &fix.query_vectors[qi * DIM..(qi + 1) * DIM];
+
+        let t0 = Instant::now();
+        let table = compute_distance_table_l2(q, &fix.codebook);
+        let mut heap = TopKHeap::new();
+        probe_pq_l2_top_k(&table, &fix.codes, fix.num_base, &mut heap);
+        per_query_ns.push(t0.elapsed().as_nanos() as u64);
+
+        // Collect candidates into a set and cap them at TOP_K, so a kernel
+        // that reports the same id several times (or more than TOP_K ids)
+        // cannot inflate its hit count.
+        let candidates: HashSet<u32> = heap
+            .into_sorted()
+            .into_iter()
+            .take(TOP_K)
+            .map(|(id, _)| id)
+            .collect();
+        let row = qi * fix.top_k_truth;
+        let truth_set: HashSet<u32> = fix.groundtruth[row..row + truth_k].iter().copied().collect();
+        hits += candidates.intersection(&truth_set).count();
+        // The denominator must match the number of truth ids scored above.
+        total_relevant += truth_k;
+    }
+
+    let recall = hits as f32 / total_relevant as f32;
+    let geomean_ns = geomean(&per_query_ns);
+    let elapsed = start.elapsed();
+    let mem_mb = peak_rss_mb();
+
+    println!("---");
+    println!("source:               {}", fix.source_str());
+    println!("num_base:             {}", fix.num_base);
+    println!("num_queries:          {n_q}");
+    println!("recall_at_10:         {recall:.4}");
+    println!("geomean_ns_per_query: {geomean_ns}");
+    println!("peak_mem_mb:          {mem_mb:.1}");
+    println!("total_seconds:        {:.2}", elapsed.as_secs_f64());
+
+    if recall < RECALL_FLOOR {
+        eprintln!("FAIL: recall@10 {recall:.4} below floor {RECALL_FLOOR:.4}");
+        std::process::exit(2);
+    }
+    // Compare fractional seconds: truncating to whole seconds would let a run
+    // of e.g. 600.9 s pass a 600 s budget.
+    if elapsed.as_secs_f64() > TIME_BUDGET_SECS as f64 {
+        eprintln!(
+            "FAIL: total wall-clock {:.2}s exceeds budget {}s",
+            elapsed.as_secs_f64(),
+            TIME_BUDGET_SECS
+        );
+        std::process::exit(3);
+    }
+
+    Ok(())
+}
+
+/// Geometric mean of `xs`, truncated to an integer.
+///
+/// Returns 0 for an empty slice. Each sample is clamped to at least 1 before
+/// taking its logarithm, so a zero sample contributes `ln(1) = 0`.
+fn geomean(xs: &[u64]) -> u64 {
+    match xs.len() {
+        0 => 0,
+        count => {
+            let log_sum = xs
+                .iter()
+                .fold(0.0f64, |acc, &sample| acc + (sample.max(1) as f64).ln());
+            (log_sum / count as f64).exp() as u64
+        }
+    }
+}
+
+/// Peak resident set size of this process in MiB, or 0.0 when unavailable.
+///
+/// Reads the `VmHWM` ("high water mark") line of `/proc/self/status`, which is
+/// reported in kB. `VmPeak` is deliberately not used: it is the peak *virtual*
+/// size, which includes address space that was reserved but never resident.
+#[cfg(target_os = "linux")]
+fn peak_rss_mb() -> f64 {
+    let Ok(status) = std::fs::read_to_string("/proc/self/status") else {
+        return 0.0;
+    };
+    status
+        .lines()
+        .find_map(|line| line.strip_prefix("VmHWM:"))
+        .and_then(|rest| rest.split_whitespace().next())
+        .and_then(|kb| kb.parse::<f64>().ok())
+        .map_or(0.0, |kb| kb / 1024.0)
+}
+
+/// Peak memory is only measured on Linux; other platforms report 0.0.
+#[cfg(not(target_os = "linux"))]
+fn peak_rss_mb() -> f64 {
+    0.0
+}
--- a/research/lance-autoresearch/src/fixture.rs
+++ b/research/lance-autoresearch/src/fixture.rs
@ -0,0 +1,449 @@
+//! IMMUTABLE. Fixture loader.
+//!
+//! The bench runs against one of:
+//!   - SIFT1M (preferred; 128-d, 1M base, 10k queries, published ground truth)
+//!     loaded from `~/.cache/lance-autoresearch/{sift_base,sift_query,sift_groundtruth}.fvecs|.ivecs`
+//!     plus pre-trained frozen artifacts `pq_codebook.bin` and `pq_codes.bin`.
+//!   - A synthetic fallback (1024 base / 64 queries, deterministic seed) so the
+//!     harness is smoke-testable without any external download.
+//!
+//! Run `scripts/prepare_fixtures.sh` once to populate the SIFT1M fixtures.
+
+use std::fs;
+use std::io::{BufReader, Read};
+use std::path::{Path, PathBuf};
+
+use anyhow::{Context, Result, anyhow};
+
+use crate::reference::brute_force_top_k_l2;
+use crate::{DIM, NUM_CENTROIDS, NUM_SUB_VECTORS, SUB_VECTOR_DIM};
+
+/// Number of base vectors in the synthetic fallback dataset.
+pub const SYNTHETIC_NUM_BASE: usize = 1024;
+/// Number of query vectors in the synthetic fallback dataset.
+pub const SYNTHETIC_NUM_QUERY: usize = 64;
+/// Number of exact nearest-neighbor ids requested per synthetic query.
+pub const SYNTHETIC_TOP_K_TRUTH: usize = 32;
+/// Number of assign/update rounds run per sub-quantizer in `train_codebook`.
+const KMEANS_ITERS: usize = 12;
+
+/// Where a [`Fixture`] came from.
+pub enum FixtureSource {
+    /// Loaded from the SIFT1M files and frozen PQ artifacts in the cache directory.
+    Sift1M,
+    /// Generated in-process from the given PRNG seed.
+    Synthetic { seed: u64 },
+}
+
+/// A complete benchmark dataset: raw vectors, PQ artifacts and ground truth.
+pub struct Fixture {
+    /// Base vectors, flattened: `num_base` rows of `DIM` floats.
+    pub base_vectors: Vec<f32>,
+    /// Query vectors, flattened: `num_query` rows of `DIM` floats.
+    pub query_vectors: Vec<f32>,
+    /// PQ codebook, `NUM_SUB_VECTORS * NUM_CENTROIDS * SUB_VECTOR_DIM` floats.
+    pub codebook: Vec<f32>,
+    /// PQ codes for the base set, `NUM_SUB_VECTORS` bytes per base vector.
+    pub codes: Vec<u8>,
+    /// Ground-truth neighbor ids, flattened with `top_k_truth` ids per query.
+    pub groundtruth: Vec<u32>,
+    /// Number of base vectors.
+    pub num_base: usize,
+    /// Number of query vectors.
+    pub num_query: usize,
+    /// Row width of `groundtruth`.
+    pub top_k_truth: usize,
+    /// Origin of this fixture (SIFT1M files or synthetic generator).
+    pub source: FixtureSource,
+}
+
+impl Fixture {
+    /// Try SIFT1M first; fall back to a deterministic synthetic dataset.
+    ///
+    /// SIFT1M is used only when all five files (three dataset files plus the
+    /// frozen codebook and codes) exist in [`cache_dir`].
+    ///
+    /// # Errors
+    ///
+    /// Propagates any error from loading or validating the chosen fixture.
+    pub fn load_or_synthesize() -> Result<Self> {
+        let dir = cache_dir();
+        let required = [
+            "sift_base.fvecs",
+            "sift_query.fvecs",
+            "sift_groundtruth.ivecs",
+            "pq_codebook.bin",
+            "pq_codes.bin",
+        ];
+        if required.iter().all(|name| dir.join(name).exists()) {
+            Self::load_sift1m(&dir)
+        } else {
+            Self::synthesize(SYNTHETIC_NUM_BASE, SYNTHETIC_NUM_QUERY, 0xC0FFEE_C0FFEE)
+        }
+    }
+
+    /// Short label for the fixture origin, as printed in the result block.
+    pub fn source_str(&self) -> &'static str {
+        match self.source {
+            FixtureSource::Sift1M => "sift1m",
+            FixtureSource::Synthetic { .. } => "synthetic",
+        }
+    }
+
+    /// Load SIFT1M plus the frozen PQ artifacts from `dir` and validate that
+    /// their shapes agree with each other and with the crate geometry.
+    ///
+    /// # Errors
+    ///
+    /// Fails when a file cannot be read or parsed, when the dataset is empty,
+    /// when the codebook or codes have an unexpected size, or when the ground
+    /// truth has fewer rows than there are queries.
+    fn load_sift1m(dir: &Path) -> Result<Self> {
+        let base_vectors = read_fvecs(&dir.join("sift_base.fvecs"))?;
+        let query_vectors = read_fvecs(&dir.join("sift_query.fvecs"))?;
+        let (groundtruth, top_k_truth) = read_ivecs(&dir.join("sift_groundtruth.ivecs"))?;
+        let codebook = read_f32_bin(&dir.join("pq_codebook.bin"))?;
+        let codes = read_u8_bin(&dir.join("pq_codes.bin"))?;
+
+        let num_base = base_vectors.len() / DIM;
+        let num_query = query_vectors.len() / DIM;
+        if num_base == 0 || num_query == 0 {
+            return Err(anyhow!(
+                "SIFT1M fixture is empty: {num_base} base vectors, {num_query} queries"
+            ));
+        }
+        if codebook.len() != NUM_SUB_VECTORS * NUM_CENTROIDS * SUB_VECTOR_DIM {
+            return Err(anyhow!(
+                "codebook size mismatch: got {}, expected {}",
+                codebook.len(),
+                NUM_SUB_VECTORS * NUM_CENTROIDS * SUB_VECTOR_DIM
+            ));
+        }
+        if codes.len() != num_base * NUM_SUB_VECTORS {
+            return Err(anyhow!(
+                "codes size mismatch: got {}, expected {}",
+                codes.len(),
+                num_base * NUM_SUB_VECTORS
+            ));
+        }
+        // Consumers index `groundtruth` as `query * top_k_truth + rank`; catch a
+        // short or empty file here instead of panicking mid-benchmark.
+        if top_k_truth == 0 || groundtruth.len() / top_k_truth < num_query {
+            return Err(anyhow!(
+                "ground truth shape mismatch: {} ids at width {} for {} queries",
+                groundtruth.len(),
+                top_k_truth,
+                num_query
+            ));
+        }
+
+        Ok(Self {
+            base_vectors,
+            query_vectors,
+            codebook,
+            codes,
+            groundtruth,
+            num_base,
+            num_query,
+            top_k_truth,
+            source: FixtureSource::Sift1M,
+        })
+    }
+
+    /// Build a deterministic synthetic fixture from `seed`.
+    ///
+    /// The ground-truth width is `min(SYNTHETIC_TOP_K_TRUTH, num_base)`, since
+    /// a query cannot have more neighbors than there are base vectors.
+    ///
+    /// # Errors
+    ///
+    /// Fails when `num_base` is zero (queries are derived from base points).
+    fn synthesize(num_base: usize, num_query: usize, seed: u64) -> Result<Self> {
+        if num_base == 0 {
+            return Err(anyhow!("synthetic fixture needs at least one base vector"));
+        }
+        let mut rng = SplitMix64::new(seed);
+        // Cluster the base set so PQ has structure to compress and queries have
+        // meaningful nearest neighbors. With i.i.d. Gaussian noise the asymptotic
+        // recall of PQ is near-chance; with cluster-shaped data PQ tracks the
+        // true top-K closely, which is what we want when smoke-testing kernels.
+        let base_vectors = gen_clustered(num_base, DIM, 32, 0.15, &mut rng);
+        // Queries are perturbed base points so they have a true near-neighbor.
+        let query_vectors = gen_query_near_base(&base_vectors, num_base, num_query, &mut rng);
+
+        let codebook = train_codebook(&base_vectors, num_base, &mut rng);
+        let codes = encode(&base_vectors, num_base, &codebook);
+
+        // `brute_force_top_k_l2` returns at most `num_base` results, so the row
+        // width has to be clamped to keep the flattened layout consistent.
+        let top_k_truth = SYNTHETIC_TOP_K_TRUTH.min(num_base);
+        let mut groundtruth = Vec::with_capacity(num_query * top_k_truth);
+        for q in query_vectors.chunks_exact(DIM) {
+            let top = brute_force_top_k_l2(q, &base_vectors, num_base, top_k_truth);
+            groundtruth.extend(top.iter().map(|(id, _)| *id));
+        }
+
+        Ok(Self {
+            base_vectors,
+            query_vectors,
+            codebook,
+            codes,
+            groundtruth,
+            num_base,
+            num_query,
+            top_k_truth,
+            source: FixtureSource::Synthetic { seed },
+        })
+    }
+}
+
+/// Directory holding the cached fixtures: `$HOME/.cache/lance-autoresearch`,
+/// with `/tmp` standing in for the home directory when `HOME` is unset.
+pub fn cache_dir() -> PathBuf {
+    let mut dir = match std::env::var_os("HOME") {
+        Some(home) => PathBuf::from(home),
+        None => PathBuf::from("/tmp"),
+    };
+    dir.push(".cache");
+    dir.push("lance-autoresearch");
+    dir
+}
+
+/// Parse an `.fvecs` file into one flat `Vec<f32>`.
+///
+/// Each record is a little-endian `u32` dimension followed by that many
+/// little-endian `f32` values. Every record must have dimension `DIM`.
+///
+/// # Errors
+///
+/// Fails when the file cannot be read, a record header or row is truncated,
+/// or a record's dimension differs from `DIM`.
+fn read_fvecs(path: &Path) -> Result<Vec<f32>> {
+    let bytes = fs::read(path).with_context(|| format!("reading {}", path.display()))?;
+    let mut values = Vec::with_capacity(bytes.len() / 4);
+    let mut pos = 0;
+    while pos < bytes.len() {
+        let Some(header) = bytes.get(pos..pos + 4) else {
+            return Err(anyhow!("truncated fvecs header at offset {}", pos));
+        };
+        let dim = u32::from_le_bytes([header[0], header[1], header[2], header[3]]) as usize;
+        if dim != DIM {
+            return Err(anyhow!("fvecs dim {dim} != expected {DIM}"));
+        }
+        pos += 4;
+
+        let row_len = dim * 4;
+        let Some(row) = bytes.get(pos..pos + row_len) else {
+            return Err(anyhow!("truncated fvecs row at offset {}", pos));
+        };
+        values.extend(
+            row.chunks_exact(4)
+                .map(|w| f32::from_le_bytes([w[0], w[1], w[2], w[3]])),
+        );
+        pos += row_len;
+    }
+    Ok(values)
+}
+
+/// Parse an `.ivecs` file into a flat `Vec<u32>` plus the common row width.
+///
+/// Each record is a little-endian `u32` width followed by that many
+/// little-endian `u32` values. All records must share one width; an empty
+/// file yields an empty vector and width 0.
+///
+/// # Errors
+///
+/// Fails when the file cannot be read, a record header or row is truncated,
+/// or two records have different widths.
+fn read_ivecs(path: &Path) -> Result<(Vec<u32>, usize)> {
+    let bytes = fs::read(path).with_context(|| format!("reading {}", path.display()))?;
+    let mut ids = Vec::new();
+    let mut width: Option<usize> = None;
+    let mut pos = 0;
+    while pos < bytes.len() {
+        let Some(header) = bytes.get(pos..pos + 4) else {
+            return Err(anyhow!("truncated ivecs header"));
+        };
+        let dim = u32::from_le_bytes([header[0], header[1], header[2], header[3]]) as usize;
+        pos += 4;
+
+        // The first record fixes the width; every later one must match it.
+        match width {
+            Some(k) if k != dim => {
+                return Err(anyhow!("ivecs rows have varying widths {k} vs {dim}"));
+            }
+            Some(_) => {}
+            None => width = Some(dim),
+        }
+
+        let row_len = dim * 4;
+        let Some(row) = bytes.get(pos..pos + row_len) else {
+            return Err(anyhow!("truncated ivecs row"));
+        };
+        ids.extend(
+            row.chunks_exact(4)
+                .map(|w| u32::from_le_bytes([w[0], w[1], w[2], w[3]])),
+        );
+        pos += row_len;
+    }
+    Ok((ids, width.unwrap_or(0)))
+}
+
+/// Read a headerless file of little-endian `f32` values.
+///
+/// # Errors
+///
+/// Fails when the file cannot be opened or read, or when its length is not a
+/// multiple of four bytes.
+fn read_f32_bin(path: &Path) -> Result<Vec<f32>> {
+    let file = fs::File::open(path).with_context(|| format!("opening {}", path.display()))?;
+    let mut raw = Vec::new();
+    BufReader::new(file).read_to_end(&mut raw)?;
+    if raw.len() % 4 != 0 {
+        return Err(anyhow!("f32 binary file not a multiple of 4 bytes"));
+    }
+
+    let mut values = Vec::with_capacity(raw.len() / 4);
+    for word in raw.chunks_exact(4) {
+        values.push(f32::from_le_bytes([word[0], word[1], word[2], word[3]]));
+    }
+    Ok(values)
+}
+
+/// Read a whole file as raw bytes, attaching the path to any I/O error.
+fn read_u8_bin(path: &Path) -> Result<Vec<u8>> {
+    fs::read(path).with_context(|| format!("reading {}", path.display()))
+}
+
+/// Deterministic SplitMix64 PRNG. Vendored (it is tiny) to avoid a `rand`
+/// dependency — the fixture must be reproducible bit-for-bit.
+struct SplitMix64 {
+    state: u64,
+}
+
+impl SplitMix64 {
+    /// Create a generator whose stream is fully determined by `seed`.
+    fn new(seed: u64) -> Self {
+        SplitMix64 { state: seed }
+    }
+
+    /// Advance the state by the golden-ratio increment and return the mixed
+    /// 64-bit output.
+    fn next_u64(&mut self) -> u64 {
+        self.state = self.state.wrapping_add(0x9E37_79B9_7F4A_7C15);
+        let a = self.state;
+        let b = (a ^ (a >> 30)).wrapping_mul(0xBF58_476D_1CE4_E5B9);
+        let c = (b ^ (b >> 27)).wrapping_mul(0x94D0_49BB_1331_11EB);
+        c ^ (c >> 31)
+    }
+
+    /// Uniform `f32` in `[0, 1)`, built from the top 24 bits of one draw.
+    fn next_f32(&mut self) -> f32 {
+        let top24 = (self.next_u64() >> 40) as u32;
+        let scale = (1u32 << 24) as f32;
+        top24 as f32 / scale
+    }
+
+    /// Box-Muller standard normal.
+    ///
+    /// Consumes two uniform draws; the first is clamped away from zero so its
+    /// logarithm stays finite.
+    fn next_normal(&mut self) -> f32 {
+        let first = self.next_f32();
+        let radius_src = if first < 1e-7 { 1e-7 } else { first };
+        let angle_src = self.next_f32();
+        let radius = (-2.0 * radius_src.ln()).sqrt();
+        radius * (std::f32::consts::TAU * angle_src).cos()
+    }
+}
+
+/// Generate `n * d` standard-normal samples, drawn sequentially from `rng`.
+fn gen_vectors(n: usize, d: usize, rng: &mut SplitMix64) -> Vec<f32> {
+    let total = n * d;
+    let mut samples = Vec::with_capacity(total);
+    samples.extend((0..total).map(|_| rng.next_normal()));
+    samples
+}
+
+/// Generate `n` vectors of dim `d` as a Gaussian mixture.
+///
+/// `num_clusters` centers are drawn first; point `i` is then assigned to
+/// center `i % num_clusters` and offset per dimension by `noise` times a
+/// standard-normal draw.
+fn gen_clustered(n: usize, d: usize, num_clusters: usize, noise: f32, rng: &mut SplitMix64) -> Vec<f32> {
+    let centers = gen_vectors(num_clusters, d, rng);
+    let mut points = Vec::with_capacity(n * d);
+    for point in 0..n {
+        let cluster = point % num_clusters;
+        let center = &centers[cluster * d..(cluster + 1) * d];
+        points.extend(center.iter().map(|&coord| coord + noise * rng.next_normal()));
+    }
+    points
+}
+
+/// Generate `n_query` queries, each a randomly chosen base point with every
+/// coordinate offset by 0.05 times a standard-normal draw, so each query has
+/// true near neighbors in the base set.
+fn gen_query_near_base(
+    base: &[f32],
+    num_base: usize,
+    n_query: usize,
+    rng: &mut SplitMix64,
+) -> Vec<f32> {
+    let mut queries = Vec::with_capacity(n_query * DIM);
+    for _ in 0..n_query {
+        // The row is picked before any noise is drawn for it.
+        let pick = (rng.next_u64() as usize) % num_base;
+        let row = &base[pick * DIM..(pick + 1) * DIM];
+        queries.extend(row.iter().map(|&coord| coord + 0.05 * rng.next_normal()));
+    }
+    queries
+}
+
+/// Train a product-quantization codebook by per-subspace k-means.
+///
+/// Returns a buffer of `NUM_SUB_VECTORS * NUM_CENTROIDS * SUB_VECTOR_DIM`
+/// floats, indexed `[sub_vector][centroid][dim]`. Only the first
+/// `min(NUM_CENTROIDS, num_base)` centroids of each sub-quantizer are trained;
+/// the rest stay zero. The result depends on the exact sequence of draws from
+/// `rng`, so statement order here is part of the reproducibility contract.
+fn train_codebook(base: &[f32], num_base: usize, rng: &mut SplitMix64) -> Vec<f32> {
+    let mut codebook = vec![0.0f32; NUM_SUB_VECTORS * NUM_CENTROIDS * SUB_VECTOR_DIM];
+
+    let k = NUM_CENTROIDS.min(num_base);
+    if k == 0 {
+        // No base vectors: return the all-zero codebook.
+        return codebook;
+    }
+
+    for m in 0..NUM_SUB_VECTORS {
+        // Initialization: seed each centroid from a randomly drawn base row
+        // (rows may repeat).
+        for ki in 0..k {
+            let src = (rng.next_u64() as usize) % num_base;
+            let src_off = src * DIM + m * SUB_VECTOR_DIM;
+            let dst_off = m * NUM_CENTROIDS * SUB_VECTOR_DIM + ki * SUB_VECTOR_DIM;
+            codebook[dst_off..dst_off + SUB_VECTOR_DIM]
+                .copy_from_slice(&base[src_off..src_off + SUB_VECTOR_DIM]);
+        }
+
+        let mut assignments = vec![0u8; num_base];
+        for _iter in 0..KMEANS_ITERS {
+            // Assignment step: nearest centroid by squared L2; on ties the
+            // lowest centroid index wins because the comparison is strict.
+            for i in 0..num_base {
+                let sub = &base[i * DIM + m * SUB_VECTOR_DIM..i * DIM + (m + 1) * SUB_VECTOR_DIM];
+                let mut best_k = 0u8;
+                let mut best_d = f32::INFINITY;
+                for ki in 0..k {
+                    let c_off = m * NUM_CENTROIDS * SUB_VECTOR_DIM + ki * SUB_VECTOR_DIM;
+                    let mut acc = 0.0f32;
+                    for d in 0..SUB_VECTOR_DIM {
+                        let diff = sub[d] - codebook[c_off + d];
+                        acc += diff * diff;
+                    }
+                    if acc < best_d {
+                        best_d = acc;
+                        // `ki < k <= NUM_CENTROIDS`, so this fits in a byte
+                        // with the crate's 256-centroid geometry.
+                        best_k = ki as u8;
+                    }
+                }
+                assignments[i] = best_k;
+            }
+
+            // Update step: accumulate per-centroid sums and member counts.
+            let mut sums = vec![0.0f32; k * SUB_VECTOR_DIM];
+            let mut counts = vec![0u32; k];
+            for i in 0..num_base {
+                let ki = assignments[i] as usize;
+                let sub = &base[i * DIM + m * SUB_VECTOR_DIM..i * DIM + (m + 1) * SUB_VECTOR_DIM];
+                for d in 0..SUB_VECTOR_DIM {
+                    sums[ki * SUB_VECTOR_DIM + d] += sub[d];
+                }
+                counts[ki] += 1;
+            }
+            for ki in 0..k {
+                let c_off = m * NUM_CENTROIDS * SUB_VECTOR_DIM + ki * SUB_VECTOR_DIM;
+                if counts[ki] == 0 {
+                    // Empty cluster: re-seed it from a random base row.
+                    let src = (rng.next_u64() as usize) % num_base;
+                    let src_off = src * DIM + m * SUB_VECTOR_DIM;
+                    codebook[c_off..c_off + SUB_VECTOR_DIM]
+                        .copy_from_slice(&base[src_off..src_off + SUB_VECTOR_DIM]);
+                } else {
+                    // Non-empty cluster: move the centroid to the member mean.
+                    let inv = 1.0 / counts[ki] as f32;
+                    for d in 0..SUB_VECTOR_DIM {
+                        codebook[c_off + d] = sums[ki * SUB_VECTOR_DIM + d] * inv;
+                    }
+                }
+            }
+        }
+    }
+
+    codebook
+}
+
+/// Encode every base vector as `NUM_SUB_VECTORS` one-byte centroid indices.
+///
+/// For each sub-vector the nearest centroid by squared L2 is chosen among all
+/// `NUM_CENTROIDS` entries of the matching sub-quantizer; on ties the lowest
+/// index wins. Output layout is `[num_base][NUM_SUB_VECTORS]`.
+fn encode(base: &[f32], num_base: usize, codebook: &[f32]) -> Vec<u8> {
+    let mut codes = vec![0u8; num_base * NUM_SUB_VECTORS];
+    for (row, code_row) in codes.chunks_exact_mut(NUM_SUB_VECTORS).enumerate() {
+        for (m, slot) in code_row.iter_mut().enumerate() {
+            let start = row * DIM + m * SUB_VECTOR_DIM;
+            let sub = &base[start..start + SUB_VECTOR_DIM];
+            let quantizer = m * NUM_CENTROIDS * SUB_VECTOR_DIM;
+
+            let mut nearest = 0u8;
+            let mut nearest_dist = f32::INFINITY;
+            for ki in 0..NUM_CENTROIDS {
+                let c_start = quantizer + ki * SUB_VECTOR_DIM;
+                let centroid = &codebook[c_start..c_start + SUB_VECTOR_DIM];
+                let dist = sub.iter().zip(centroid).fold(0.0f32, |acc, (&x, &c)| {
+                    let diff = x - c;
+                    acc + diff * diff
+                });
+                if dist < nearest_dist {
+                    nearest_dist = dist;
+                    nearest = ki as u8;
+                }
+            }
+            *slot = nearest;
+        }
+    }
+    codes
+}
+
+#[cfg(test)]
+mod tests {
+    //! Fixture-builder tests. The default smoke test exercises the synthetic path
+    //! end-to-end. `build_fixtures` is `#[ignore]` — it runs only when invoked
+    //! explicitly by `scripts/prepare_fixtures.sh` and writes the frozen SIFT1M
+    //! PQ artifacts to `~/.cache/lance-autoresearch/`.
+    use super::*;
+    use std::io::Write;
+
+    /// A small synthetic fixture must have buffers of the documented sizes and
+    /// ground-truth ids that all point inside the base set.
+    #[test]
+    fn synthetic_fixture_is_self_consistent() {
+        let fix = Fixture::synthesize(256, 8, 0xDEADBEEF).unwrap();
+        assert_eq!(fix.base_vectors.len(), 256 * DIM);
+        assert_eq!(fix.codebook.len(), NUM_SUB_VECTORS * NUM_CENTROIDS * SUB_VECTOR_DIM);
+        assert_eq!(fix.codes.len(), 256 * NUM_SUB_VECTORS);
+        assert_eq!(fix.groundtruth.len(), 8 * SYNTHETIC_TOP_K_TRUTH);
+        for &id in &fix.groundtruth {
+            assert!((id as usize) < 256);
+        }
+    }
+
+    /// One-time fixture builder disguised as an ignored test: trains the PQ
+    /// codebook on `sift_base.fvecs`, encodes the base set, and writes
+    /// `pq_codebook.bin` / `pq_codes.bin` into the cache directory.
+    #[test]
+    #[ignore]
+    fn build_fixtures() {
+        // Second guard on top of `#[ignore]`: do nothing unless the marker
+        // environment variable is set.
+        if std::env::var("LANCE_AUTORESEARCH_BUILD_FIXTURES").is_err() {
+            eprintln!("skipping: set LANCE_AUTORESEARCH_BUILD_FIXTURES=1 to run");
+            return;
+        }
+        let dir = cache_dir();
+        let base = read_fvecs(&dir.join("sift_base.fvecs")).expect("read sift_base");
+        let num_base = base.len() / DIM;
+        eprintln!("[build_fixtures] training PQ codebook on {num_base} vectors...");
+
+        // Fixed seed so the artifacts are identical on every rebuild.
+        let mut rng = SplitMix64::new(0x0005_1F74_F1AC);
+        let codebook = train_codebook(&base, num_base, &mut rng);
+        let codes = encode(&base, num_base, &codebook);
+
+        // The codebook is stored as raw little-endian f32, matching `read_f32_bin`.
+        let codebook_bytes: Vec<u8> = codebook
+            .iter()
+            .flat_map(|f| f.to_le_bytes())
+            .collect();
+        std::fs::File::create(dir.join("pq_codebook.bin"))
+            .unwrap()
+            .write_all(&codebook_bytes)
+            .unwrap();
+        std::fs::File::create(dir.join("pq_codes.bin"))
+            .unwrap()
+            .write_all(&codes)
+            .unwrap();
+        eprintln!("[build_fixtures] wrote {} centroids × {} bytes codebook, {} bytes codes",
+            NUM_SUB_VECTORS * NUM_CENTROIDS, SUB_VECTOR_DIM * 4, codes.len());
+    }
+}
--- a/research/lance-autoresearch/src/kernels.rs
+++ b/research/lance-autoresearch/src/kernels.rs
@ -0,0 +1,154 @@
+// SPDX-License-Identifier: Apache-2.0
+//
+// AGENT'S PLAYGROUND. This is the file you (the agent) modify.
+//
+// Algorithmically modeled on the L2 path in lance-linalg's distance / pq modules
+// (Lance 4.x, Apache-2.0; see https://github.com/lance-format/lance). It is *not*
+// a verbatim vendored copy — pulling in lance-linalg's private helpers as deps
+// would couple this harness to crate internals and slow rebuilds. The baseline is
+// intentionally a clean scalar implementation of the same algorithm Lance uses:
+// build an asymmetric distance LUT, then probe every PQ-encoded vector via 16
+// table lookups + an accumulator. Beating the baseline (and porting wins back
+// upstream) is the point of this repo.
+//
+// PUBLIC API CONTRACT (must remain stable so `bin/run_experiment.rs` keeps building):
+//   - DistanceTable type alias
+//   - compute_distance_table_l2(query, codebook) -> DistanceTable
+//   - probe_pq_l2_top_k(table, codes, num_vectors, &mut TopKHeap)
+//   - TopKHeap::new() / push / into_sorted
+//
+// You may add private helpers, switch internal data layouts (e.g. transpose the
+// codebook for vectorized table-build, pack the LUT for `pshufb`), drop down to
+// `std::arch` intrinsics behind cfg gates, mark functions `#[inline]`, etc.
+// You may NOT change `DIM` / `NUM_SUB_VECTORS` / `NUM_CENTROIDS` / `TOP_K`
+// (those are pinned by the fixture geometry in `lib.rs`).
+
+use crate::{NUM_CENTROIDS, NUM_SUB_VECTORS, SUB_VECTOR_DIM, TOP_K};
+
+/// Precomputed asymmetric L2 distance table.
+///
+/// Indexed as `table[sub_vector_idx][centroid_idx]`. Each entry is the squared
+/// L2 distance from the query's `m`-th sub-vector to the `k`-th centroid of the
+/// `m`-th sub-quantizer.
+///
+/// This is a plain fixed-size array of `NUM_SUB_VECTORS * NUM_CENTROIDS` floats,
+/// so passing or returning it by value copies the whole table.
+pub type DistanceTable = [[f32; NUM_CENTROIDS]; NUM_SUB_VECTORS];
+
+/// Build the asymmetric distance table for one query against the codebook.
+///
+/// `query` holds `NUM_SUB_VECTORS` consecutive sub-vectors of `SUB_VECTOR_DIM`
+/// floats. `codebook` layout: contiguous
+/// `[NUM_SUB_VECTORS][NUM_CENTROIDS][SUB_VECTOR_DIM]`.
+///
+/// # Panics
+///
+/// Panics when `query` or `codebook` is too short for that layout (and, in
+/// debug builds, when either length is not exactly as expected).
+pub fn compute_distance_table_l2(query: &[f32], codebook: &[f32]) -> DistanceTable {
+    debug_assert_eq!(query.len(), NUM_SUB_VECTORS * SUB_VECTOR_DIM);
+    debug_assert_eq!(
+        codebook.len(),
+        NUM_SUB_VECTORS * NUM_CENTROIDS * SUB_VECTOR_DIM
+    );
+
+    const QUANTIZER_LEN: usize = NUM_CENTROIDS * SUB_VECTOR_DIM;
+
+    let mut table = [[0.0f32; NUM_CENTROIDS]; NUM_SUB_VECTORS];
+    for (m, row) in table.iter_mut().enumerate() {
+        let q_sub = &query[m * SUB_VECTOR_DIM..(m + 1) * SUB_VECTOR_DIM];
+        let quantizer = &codebook[m * QUANTIZER_LEN..(m + 1) * QUANTIZER_LEN];
+        for (slot, centroid) in row.iter_mut().zip(quantizer.chunks_exact(SUB_VECTOR_DIM)) {
+            *slot = q_sub.iter().zip(centroid).fold(0.0f32, |acc, (&q, &c)| {
+                let diff = q - c;
+                acc + diff * diff
+            });
+        }
+    }
+    table
+}
+
+/// Probe every PQ-encoded vector and accumulate the top-K minimum distances.
+///
+/// `codes` layout: `[num_vectors][NUM_SUB_VECTORS]` packed; one byte per
+/// sub-quantizer. The approximate distance of vector `i` is the sum of one
+/// table entry per sub-quantizer, and `(i, distance)` is offered to `out`.
+///
+/// # Panics
+///
+/// Panics when `codes` holds fewer than `num_vectors * NUM_SUB_VECTORS` bytes.
+pub fn probe_pq_l2_top_k(
+    table: &DistanceTable,
+    codes: &[u8],
+    num_vectors: usize,
+    out: &mut TopKHeap,
+) {
+    debug_assert_eq!(codes.len(), num_vectors * NUM_SUB_VECTORS);
+
+    let packed = &codes[..num_vectors * NUM_SUB_VECTORS];
+    for (id, code_row) in packed.chunks_exact(NUM_SUB_VECTORS).enumerate() {
+        let dist = code_row
+            .iter()
+            .zip(table.iter())
+            .fold(0.0f32, |acc, (&code, lut)| acc + lut[code as usize]);
+        out.push(id as u32, dist);
+    }
+}
+
+/// Fixed-capacity max-heap that keeps the `TOP_K` *smallest*-distance entries seen.
+///
+/// Once full, the root holds the largest kept distance, so admitting or
+/// rejecting a new entry costs a single comparison against the root.
+pub struct TopKHeap {
+    entries: [(u32, f32); TOP_K],
+    len: usize,
+}
+
+impl Default for TopKHeap {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+impl TopKHeap {
+    /// Create an empty heap; unused slots hold `(u32::MAX, +inf)` placeholders.
+    pub fn new() -> Self {
+        let placeholder = (u32::MAX, f32::INFINITY);
+        Self {
+            entries: [placeholder; TOP_K],
+            len: 0,
+        }
+    }
+
+    /// Offer `(id, dist)` to the heap.
+    ///
+    /// While the heap is filling, entries are appended unordered and the heap
+    /// property is established once, when the last slot is taken. Afterwards an
+    /// entry is admitted only if it is strictly closer than the current root.
+    #[inline]
+    pub fn push(&mut self, id: u32, dist: f32) {
+        if self.len == TOP_K {
+            if dist < self.entries[0].1 {
+                self.entries[0] = (id, dist);
+                self.sift_down(0);
+            }
+            return;
+        }
+
+        self.entries[self.len] = (id, dist);
+        self.len += 1;
+        if self.len == TOP_K {
+            self.heapify();
+        }
+    }
+
+    /// Establish the max-heap property over all slots, bottom-up.
+    fn heapify(&mut self) {
+        let mut node = TOP_K / 2;
+        while node > 0 {
+            node -= 1;
+            self.sift_down(node);
+        }
+    }
+
+    /// Move the entry at `start` down until neither child has a larger distance.
+    fn sift_down(&mut self, start: usize) {
+        let mut parent = start;
+        loop {
+            let mut top = parent;
+            // Left child first, then right, each compared with the best so far.
+            for child in [2 * parent + 1, 2 * parent + 2] {
+                if child < self.len && self.entries[child].1 > self.entries[top].1 {
+                    top = child;
+                }
+            }
+            if top == parent {
+                break;
+            }
+            self.entries.swap(parent, top);
+            parent = top;
+        }
+    }
+
+    /// Consume the heap and return the kept entries by ascending distance.
+    pub fn into_sorted(self) -> Vec<(u32, f32)> {
+        let mut kept = self.entries[..self.len].to_vec();
+        kept.sort_by(|lhs, rhs| {
+            lhs.1
+                .partial_cmp(&rhs.1)
+                .unwrap_or(std::cmp::Ordering::Equal)
+        });
+        kept
+    }
+}
--- a/research/lance-autoresearch/src/lib.rs
+++ b/research/lance-autoresearch/src/lib.rs
@ -0,0 +1,21 @@
+//! Lance autoresearch harness — public API for the bench binary, benchmarks, and tests.
+//!
+//! Layout mirrors Karpathy's nanochat-research / autoresearch three-file contract:
+//!
+//! - `kernels`   — the AGENT'S PLAYGROUND. May be rewritten freely.
+//! - `reference` — IMMUTABLE. Exact brute-force baseline used to certify recall.
+//! - `fixture`   — IMMUTABLE. Dataset + frozen codebook loader.
+//!
+//! Constants are global because the agent shouldn't have to thread sizes through
+//! its kernel — they pin the optimization target (SIFT1M-shaped: 128-d f32,
+//! 16 sub-vectors × 256 centroids × 8-d, top-10).
+
+pub mod fixture;
+pub mod kernels;
+pub mod reference;
+
+/// Dimensionality of every base and query vector.
+pub const DIM: usize = 128;
+/// Number of PQ sub-vectors (sub-quantizers) each vector is split into.
+pub const NUM_SUB_VECTORS: usize = 16;
+/// Number of centroids per sub-quantizer.
+pub const NUM_CENTROIDS: usize = 256;
+/// Dimensionality of one sub-vector, derived from the two constants above.
+pub const SUB_VECTOR_DIM: usize = DIM / NUM_SUB_VECTORS;
+/// Number of nearest neighbors each query returns.
+pub const TOP_K: usize = 10;
--- a/research/lance-autoresearch/src/reference.rs
+++ b/research/lance-autoresearch/src/reference.rs
@ -0,0 +1,35 @@
+//! IMMUTABLE. Brute-force exact L2 top-K. Used at fixture-build time to compute
+//! synthetic-dataset ground truth (against which the agent's PQ-approximate
+//! kernel is then scored for recall). For SIFT1M fixtures we use the published
+//! ground-truth file instead and never call this at bench-time.
+
+use crate::DIM;
+
+/// Brute-force exact top-K by squared L2. Returns `(id, distance)` ascending.
+///
+/// Computes the distance from `query` to every base vector, sorts all of them,
+/// and keeps the first `k`. Only used by the fixture builder, not the hot path.
+///
+/// # Panics
+///
+/// Panics when `query.len() != DIM` or `base.len() != num_vectors * DIM`.
+pub fn brute_force_top_k_l2(
+    query: &[f32],
+    base: &[f32],
+    num_vectors: usize,
+    k: usize,
+) -> Vec<(u32, f32)> {
+    assert_eq!(query.len(), DIM);
+    assert_eq!(base.len(), num_vectors * DIM);
+
+    let mut scored: Vec<(u32, f32)> = Vec::with_capacity(num_vectors);
+    for (id, row) in base.chunks_exact(DIM).enumerate() {
+        let dist = query.iter().zip(row).fold(0.0f32, |acc, (&q, &v)| {
+            let diff = q - v;
+            acc + diff * diff
+        });
+        scored.push((id as u32, dist));
+    }
+
+    // Stable sort: equal distances keep ascending id order.
+    scored.sort_by(|lhs, rhs| {
+        lhs.1
+            .partial_cmp(&rhs.1)
+            .unwrap_or(std::cmp::Ordering::Equal)
+    });
+    scored.truncate(k);
+    scored
+}