diff --git a/Cargo.toml b/Cargo.toml index 761f29b..9a1c5a3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,6 +6,11 @@ members = [ "crates/omnigraph-cli", "crates/omnigraph-server", ] +exclude = [ + # `research/` holds standalone experimental projects with their own + # workspaces. They must not be picked up by the omnigraph workspace build. + "research", +] default-members = [ "crates/omnigraph", "crates/omnigraph-cli", diff --git a/research/lance-autoresearch/.gitignore b/research/lance-autoresearch/.gitignore new file mode 100644 index 0000000..0ad1c67 --- /dev/null +++ b/research/lance-autoresearch/.gitignore @@ -0,0 +1,7 @@ +target/ +Cargo.lock +results.tsv +run.log +.DS_Store +*.swp +data/ diff --git a/research/lance-autoresearch/Cargo.toml b/research/lance-autoresearch/Cargo.toml new file mode 100644 index 0000000..ba47235 --- /dev/null +++ b/research/lance-autoresearch/Cargo.toml @@ -0,0 +1,30 @@ +[package] +name = "lance-autoresearch" +version = "0.1.0" +edition = "2024" +license = "MIT OR Apache-2.0" +description = "Autoresearch-style harness for evolving Lance PQ L2 distance kernels via LLM agents." +publish = false + +[lib] +path = "src/lib.rs" + +[[bin]] +name = "run_experiment" +path = "src/bin/run_experiment.rs" + +[[bench]] +name = "pq_l2" +harness = false + +[dependencies] +anyhow = "1" + +[dev-dependencies] +criterion = { version = "0.5", default-features = false, features = ["plotters", "cargo_bench_support"] } + +[profile.release] +opt-level = 3 +lto = "thin" +codegen-units = 1 +debug = 1 diff --git a/research/lance-autoresearch/LICENSE-APACHE b/research/lance-autoresearch/LICENSE-APACHE new file mode 100644 index 0000000..3c268b1 --- /dev/null +++ b/research/lance-autoresearch/LICENSE-APACHE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. 
+ + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of 
the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for describing the origin of the Work and + reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Support. While redistributing the Work or + Derivative Works thereof, You may choose to offer, and charge a + fee for, acceptance of support, warranty, indemnity, or other + liability obligations and/or rights consistent with this License. + However, in accepting such obligations, You may act only on Your + own behalf and on Your sole responsibility, not on behalf of any + other Contributor, and only if You agree to indemnify, defend, and + hold each Contributor harmless for any liability incurred by, or + claims asserted against, such Contributor by reason of your + accepting any such warranty or support. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2026 lance-autoresearch contributors + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
diff --git a/research/lance-autoresearch/LICENSE-MIT b/research/lance-autoresearch/LICENSE-MIT new file mode 100644 index 0000000..b20da1e --- /dev/null +++ b/research/lance-autoresearch/LICENSE-MIT @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2026 lance-autoresearch contributors + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/research/lance-autoresearch/README.md b/research/lance-autoresearch/README.md new file mode 100644 index 0000000..a0e9ba8 --- /dev/null +++ b/research/lance-autoresearch/README.md @@ -0,0 +1,101 @@ +# lance-autoresearch + +An autoresearch-style harness for evolving [Lance](https://github.com/lance-format/lance) +PQ L2 distance kernels via LLM coding agents (Claude Code, Codex, Cursor). + +Modeled on Andrej Karpathy's +[`nanochat-research`](https://x.com/karpathy/status/1855651423497650238) +three-file contract: + +- **Immutable bench** — `src/bin/run_experiment.rs` + `src/fixture.rs` + `src/reference.rs`. + The agent cannot touch these. 
+- **Mutable kernel** — `src/kernels.rs`. The agent's playground. Starts as a clean + scalar PQ L2 implementation matching Lance's algorithm; the agent's job is to + beat it. +- **Human-iterated program** — `program.md`. The "skill" the agent reads at the + start of every session. The human refines it between runs. + +The optimization target is the PQ L2 distance kernel for f32 dense vectors on +SIFT1M-shaped data (128-d, 16 sub-vectors × 256 centroids, 8-bit codes, top-10 +retrieval). The eval oracle is **recall@10 against SIFT1M's published ground +truth** at fixed kernel shape, with `geomean_ns_per_query` as the speed metric. + +## Why a separate repo + +OmniGraph (the graph engine that motivated this) pins Lance at a released +version and consumes its kernels via the public crate API. Improvements live one +layer below: in Lance itself. A standalone repo with no OmniGraph dep keeps the +optimization target pure (only the kernel changes), keeps the license clean for +upstream contribution (dual MIT/Apache-2.0 → Apache-2.0 PRs to Lance), and +keeps the agent's working set tiny (~600 lines). + +## Quick start + +```bash +# 1. (optional but recommended) Download SIFT1M + train + freeze the PQ codebook. +# Takes ~5–10 min; ~250 MB on disk. Skipping it falls back to a synthetic +# deterministic dataset (1024 base / 64 queries) — useful for smoke-testing +# the harness but not representative of real workloads. +bash scripts/prepare_fixtures.sh + +# 2. Run the baseline. +cargo run --release --bin run_experiment + +# 3. Or run with Claude Code / Codex: +# Open the repo in your agent of choice and prompt: +# Hi, have a look at program.md and let's kick off a new experiment. 
+``` + +## File ownership + +| File | Mutability | Edited by | +|---|---|---| +| `src/kernels.rs` | **mutable** | the agent | +| `src/bin/run_experiment.rs` | immutable | — | +| `src/reference.rs` | immutable | — | +| `src/fixture.rs` | immutable | — | +| `benches/pq_l2.rs` | immutable | — | +| `scripts/prepare_fixtures.sh` | immutable | — | +| `program.md` | human-iterated | the human, between runs | +| `results.tsv` | append-only | the agent, per trial (gitignored) | + +## The metric + +`run_experiment` prints a fixed-format block: + +``` +--- +source: sift1m +num_base: 1000000 +num_queries: 1000 +recall_at_10: 0.9421 +geomean_ns_per_query: 184273 +peak_mem_mb: 42.1 +total_seconds: 21.7 +``` + +A kernel is "kept" iff: + +- `recall_at_10` is within 0.005 of the seeded scalar baseline (and ≥ 0.50 hard floor) +- `geomean_ns_per_query` is strictly better than the previous best-kept kernel +- `total_seconds` ≤ 600 + +See `program.md` for the full loop spec. + +## Upstream contribution path + +When a commit clears the keep bar by a meaningful margin (≥10% speedup with +recall in-band), the human reviews the diff, ports the technique against +[`lance-format/lance`](https://github.com/lance-format/lance) HEAD, runs Lance's +own test suite, and opens a PR. Because `src/kernels.rs` is dual MIT/Apache-2.0 +licensed and algorithmically modeled on Lance's existing path, the upstream PR +inherits Apache-2.0 cleanly. + +## License + +Dual-licensed under either of: + +- MIT license ([LICENSE-MIT](LICENSE-MIT)) +- Apache License, Version 2.0 ([LICENSE-APACHE](LICENSE-APACHE)) + +at your option. diff --git a/research/lance-autoresearch/benches/pq_l2.rs b/research/lance-autoresearch/benches/pq_l2.rs new file mode 100644 index 0000000..d4068b9 --- /dev/null +++ b/research/lance-autoresearch/benches/pq_l2.rs @@ -0,0 +1,56 @@ +//! Criterion benchmark — runs the same kernels the agent edits, but with +//! statistical sampling. Use this for stable speed comparisons; the +//! 
`run_experiment` binary is the agent's per-trial harness. +//! +//! `cargo bench --bench pq_l2` + +use std::hint::black_box; + +use criterion::{Criterion, criterion_group, criterion_main}; + +use lance_autoresearch::fixture::Fixture; +use lance_autoresearch::kernels::{TopKHeap, compute_distance_table_l2, probe_pq_l2_top_k}; +use lance_autoresearch::{DIM, NUM_SUB_VECTORS}; + +fn bench_pq_l2(c: &mut Criterion) { + let fix = Fixture::load_or_synthesize().expect("fixture"); + + let q = &fix.query_vectors[..DIM]; // first query vector only + let table0 = compute_distance_table_l2(q, &fix.codebook); // built once, reused by the probe-only bench + + c.bench_function("compute_distance_table_l2", |b| { + b.iter(|| { + let t = compute_distance_table_l2(black_box(q), black_box(&fix.codebook)); + black_box(t); + }); + }); + + c.bench_function("probe_pq_l2_top_k", |b| { + b.iter(|| { + let mut heap = TopKHeap::new(); + probe_pq_l2_top_k( + black_box(&table0), + black_box(&fix.codes), + black_box(fix.num_base), + &mut heap, + ); + black_box(heap); + }); + }); + + c.bench_function("end_to_end_one_query", |b| { + b.iter(|| { + let t = compute_distance_table_l2(black_box(q), black_box(&fix.codebook)); + let mut heap = TopKHeap::new(); + probe_pq_l2_top_k(&t, black_box(&fix.codes), black_box(fix.num_base), &mut heap); + black_box(heap); + }); + }); + + // `NUM_SUB_VECTORS` is imported above but not otherwise used in this bench; + // this no-op read keeps the import from raising an unused-import warning. + let _ = NUM_SUB_VECTORS; +} + +criterion_group!(benches, bench_pq_l2); +criterion_main!(benches); diff --git a/research/lance-autoresearch/program.md b/research/lance-autoresearch/program.md new file mode 100644 index 0000000..d7920c4 --- /dev/null +++ b/research/lance-autoresearch/program.md @@ -0,0 +1,151 @@ +# Lance PQ L2 kernel research — agent instructions + +You are an autonomous research assistant. 
Your job is to improve the kernel(s) in +`src/kernels.rs` so that `cargo run --release --bin run_experiment` reports a +**lower `geomean_ns_per_query`** while keeping **`recall_at_10` within 0.005 of +the seeded baseline** (and never below the hard floor 0.50). + +Read this file end-to-end before doing anything else. Then run setup, then the loop. + +## Setup (do once at the start of every session) + +1. Read these files, in this order: + - `README.md` + - `program.md` (this file) + - `src/lib.rs` + - `src/kernels.rs` *(the only file you may edit)* + - `src/bin/run_experiment.rs` + - `src/fixture.rs` +2. Confirm fixtures are present. SIFT1M lives under `~/.cache/lance-autoresearch/`. + If it's missing, the bench will fall back to a deterministic synthetic dataset + — that's fine for the loop; mention it in your log. If you want SIFT1M, run + `bash scripts/prepare_fixtures.sh` (one-time, ~5–10 min, ~250 MB download). +3. Ensure `results.tsv` exists. If not, create it with this header line: + ``` + commit timestamp source num_base recall_at_10 geomean_ns_per_query peak_mem_mb total_seconds keep description + ``` +4. Run the baseline trial: `cargo run --release --bin run_experiment > run.log 2>&1`. + Parse `run.log` and append a row to `results.tsv` with `keep=baseline`, + `description="seeded scalar PQ-L2 baseline"`. This is your reference number. +5. Commit the baseline with a one-line message like `baseline: <geomean_ns_per_query> ns/query`. `results.tsv` is gitignored, so make this an empty marker commit (`git commit --allow-empty`) rather than force-adding the row. + +## What you CAN do + +- Modify **`src/kernels.rs`** freely. You may: + - Reorder loops, change iteration order over codes or sub-vectors. + - Switch to SIMD via `std::arch` (`x86_64::_mm256_*`, `aarch64::v*` NEON intrinsics), + behind `#[cfg(target_arch = "...")]` gates. Always keep a portable scalar + fallback so the kernel compiles everywhere. + - Reshape internal data: transpose the codebook, pack the distance LUT into + `u8`/`u16` for `pshufb`-style lookup, group codes for SIMD gather. + - Use `unsafe` if needed; document the invariants you're relying on. 
+ - Mark hot functions `#[inline]` or split them; add private helpers freely. +- Add `#[cfg(test)] mod tests { ... }` inside `src/kernels.rs` if you want + property checks against the scalar path. + +## What you CANNOT do + +- Do **not** modify `src/lib.rs` (it defines `DIM` / `NUM_SUB_VECTORS` / `NUM_CENTROIDS` / + `TOP_K` — these pin the fixture geometry). +- Do **not** modify `src/bin/run_experiment.rs`, `src/reference.rs`, `src/fixture.rs`, + `benches/pq_l2.rs`, `scripts/prepare_fixtures.sh`, or `Cargo.toml`. +- Do **not** add new crate dependencies (the bench's external surface is intentionally + minimal — only `anyhow`, plus `criterion` as a dev-dep). +- Do **not** delete or alter the public API of `kernels.rs`: + - `pub type DistanceTable` + - `pub fn compute_distance_table_l2(query: &[f32], codebook: &[f32]) -> DistanceTable` + - `pub fn probe_pq_l2_top_k(table: &DistanceTable, codes: &[u8], num_vectors: usize, out: &mut TopKHeap)` + - `pub struct TopKHeap` with `new() / push / into_sorted` + +## The metric + +Minimize `geomean_ns_per_query` (geometric mean of per-query wall-clock from the +benched queries, truncated to a u64 ns) subject to: + +1. `recall_at_10 >= baseline_recall_at_10 - 0.005` +2. `recall_at_10 >= 0.50` (hard floor; below this the bench exits non-zero) +3. `total_seconds <= 600` +4. Build is clean: `cargo build --release` succeeds, `cargo clippy --release --all-targets -- -D warnings` + reports zero issues. (Run that exact clippy command before each commit.) + +Ties break toward simpler code. If two kernels report the same speed within +noise (~3%), prefer the one with fewer lines or less `unsafe`. + +## Lance-PQ-specific priors + +These are the directions known to pay off on this kernel shape. Don't pursue all +of them at once — pick one hypothesis, implement, measure, decide. + +- **Codebook layout for the table-build step.** The reference layout is + `[m][k][d]`. 
For a fixed query, iterating over centroids stays in cache, but + the inner loop over `d` is short (8 floats). An `[m][d][k]` transpose can let + you SIMD-load 8 `(query - centroid)` lanes across `k` and broadcast the query element for each `d`. +- **LUT packing for the probe step.** The probe is dominated by `acc += + table[m][codes[off+m]]` × 16. Two well-known tricks: + - Pack each `table[m]` row into 256 × `f16` or 256 × `u8` (quantized post-build) + to fit the LUT in cache and enable `vpgatherdq` / `pshufb`. + - Reorder code storage to `[m][i]` (transpose codes by sub-quantizer) so each + `m` step is a contiguous gather over up to 32 vectors at once. +- **Top-K integration.** `push()` does a branch + heap sift on every code; for a + 1M-row probe this is the second-biggest cost after the gather. Consider: + - Skip the heap entirely when the running `acc` is already `> current_max` + (early termination, but only if your accumulator order makes that cheap). + - Block the probe (e.g., 1024 codes at a time), find the local top-K with a + branchless scan, then merge into the global heap. +- **Prefetch.** A `_mm_prefetch(codes.as_ptr().add(off + 64), _MM_HINT_T0)` ahead + of the gather is usually pure win at 1M scale where codes don't all fit in L2. +- **FMA in the table build.** The diff–square–sum sequence is + `(q - c)·(q - c)` per element — that's `(q*q) - 2qc + c*c`. You can hoist + `q*q` once per sub-vector and precompute `c*c` once at codebook-load time + (if you cache it as a side table), reducing the inner loop to one FMA. + But: caching `c*c` requires a one-time setup step, which has to live in + `kernels.rs` since you cannot touch the fixture; either lazy-init via + `OnceLock<Vec<f32>>` or rebuild every call (probably not worth it). + +## The loop + +Once setup is done, repeat indefinitely: + +1. **Observe state.** Read the last ~5 rows of `results.tsv`. Note which ideas + have been tried, what won, what regressed. 
Form a hypothesis with one + sentence stating the change and the predicted effect on speed and recall. +2. **Edit `src/kernels.rs`.** Keep the diff focused on the one hypothesis. +3. **Build and lint.** Run: + ``` + cargo build --release + cargo clippy --release --all-targets -- -D warnings + ``` + If either fails, fix and try again — do not commit broken state. +4. **Run the trial.** + ``` + cargo run --release --bin run_experiment > run.log 2>&1 + ``` +5. **Parse the result.** Extract `recall_at_10`, `geomean_ns_per_query`, + `peak_mem_mb`, `total_seconds` from `run.log`. Compute the deltas vs. baseline. +6. **Decide keep or revert.** + - **Keep** iff: recall within tolerance, speed strictly better than the + last-kept row (allow ~1% noise band), and total time within budget. + - **Revert** otherwise: `git restore src/kernels.rs` (or commit and `git + revert` if you want the revert in history). Note what failed. +7. **Log.** Append one row to `results.tsv`: + ``` + <commit> <timestamp> <source> <num_base> <recall_at_10> <geomean_ns_per_query> <peak_mem_mb> <total_seconds> <keep> <description> + ``` +8. **Commit.** Use a one-line message describing the change and the headline + number, e.g. `transpose codebook; 184k → 142k ns/query (recall 0.94)`. + +## Hygiene + +- Always commit `src/kernels.rs` changes; never commit `results.tsv` or `run.log` + (they're gitignored). +- If a change fails to build, do not commit. Iterate until it builds, or revert + cleanly. +- If two consecutive ideas regress, take a beat: re-read the last ~10 rows of + `results.tsv` and update your mental model before proposing the next. +- Per-trial cap: 10 minutes. If `cargo run` is still going after 10 min, kill it + and mark the trial as `timeout`. + +## Never stop + +Keep going until interrupted. Each loop iteration is one hypothesis, one edit, +one measurement, one commit. No multi-step plans across iterations. 
diff --git a/research/lance-autoresearch/rust-toolchain.toml b/research/lance-autoresearch/rust-toolchain.toml new file mode 100644 index 0000000..73cb934 --- /dev/null +++ b/research/lance-autoresearch/rust-toolchain.toml @@ -0,0 +1,3 @@ +[toolchain] +channel = "stable" +components = ["rustfmt", "clippy"] diff --git a/research/lance-autoresearch/scripts/prepare_fixtures.sh b/research/lance-autoresearch/scripts/prepare_fixtures.sh new file mode 100755 index 0000000..a75a34d --- /dev/null +++ b/research/lance-autoresearch/scripts/prepare_fixtures.sh @@ -0,0 +1,48 @@ +#!/usr/bin/env bash +# IMMUTABLE. One-time SIFT1M fixture preparation. +# +# Downloads SIFT1M from the Texmex corpus (Inria), extracts the f32 vector +# files + ground-truth, then runs the in-tree fixture builder to train a +# product-quantization codebook and encode the base set. All artifacts are +# written under ~/.cache/lance-autoresearch/ so they survive between trials +# but stay out of git. +# +# Total time: ~5–10 min on a fresh laptop. ~250 MB download. + +set -euo pipefail + +CACHE_DIR="${HOME}/.cache/lance-autoresearch" +SIFT_URL="ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz" +SIFT_URL_MIRROR="https://huggingface.co/datasets/qbo-odp/sift1m/resolve/main/sift.tar.gz" +# Resolve the project root now: the script path may be relative to the caller's +# working directory, which is lost once we `cd` into the cache directory below. +PROJECT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" + +mkdir -p "${CACHE_DIR}" +cd "${CACHE_DIR}" + +if [[ ! -f sift_base.fvecs || ! -f sift_query.fvecs || ! -f sift_groundtruth.ivecs ]]; then + echo "[prepare_fixtures] downloading SIFT1M..." + if [[ ! -f sift.tar.gz ]]; then + curl --fail -L -o sift.tar.gz "${SIFT_URL}" || \ + curl --fail -L -o sift.tar.gz "${SIFT_URL_MIRROR}" + fi + echo "[prepare_fixtures] extracting..." + tar xzf sift.tar.gz + mv sift/sift_base.fvecs ./sift_base.fvecs + mv sift/sift_query.fvecs ./sift_query.fvecs + mv sift/sift_groundtruth.ivecs ./sift_groundtruth.ivecs + rm -rf sift sift.tar.gz +fi + +if [[ ! -f pq_codebook.bin || ! -f pq_codes.bin ]]; then + echo "[prepare_fixtures] training PQ codebook + encoding base..." 
+ # The fixture builder is run as a `cargo test` with a marker env var so we + # don't have to add a second binary just for one-time setup. The test reads + # SIFT1M, calls the in-tree `train_codebook` + `encode`, and writes the + # frozen artifacts next to the dataset. + cd "${PROJECT_DIR}" + LANCE_AUTORESEARCH_BUILD_FIXTURES=1 cargo test --release --lib build_fixtures -- --ignored --nocapture +fi + +echo "[prepare_fixtures] done — fixtures in ${CACHE_DIR}" +ls -la "${CACHE_DIR}" diff --git a/research/lance-autoresearch/src/bin/run_experiment.rs b/research/lance-autoresearch/src/bin/run_experiment.rs new file mode 100644 index 0000000..6629cd8 --- /dev/null +++ b/research/lance-autoresearch/src/bin/run_experiment.rs @@ -0,0 +1,138 @@ +//! IMMUTABLE entry point — the single command the agent invokes per trial. +//! +//! Run with: `cargo run --release --bin run_experiment > run.log 2>&1` +//! +//! Loads (or synthesizes) the fixture, calls the kernels in `src/kernels.rs`, +//! and prints a fixed-format result block the agent can grep: +//! +//! --- +//! source: sift1m | synthetic +//! num_base: 1000000 +//! num_queries: 1000 +//! recall_at_10: 0.9421 +//! geomean_ns_per_query: 184273 +//! peak_mem_mb: 42.1 +//! total_seconds: 21.7 +//! +//! Exit codes: +//! 0 — ran to completion, recall above floor, within time budget. +//! 2 — recall below floor (kernel is broken). +//! 3 — total wall-clock exceeded budget. +//! 1 — any other error. 
+ +use std::collections::HashSet; +use std::time::Instant; + +use anyhow::Result; + +use lance_autoresearch::fixture::Fixture; +use lance_autoresearch::kernels::{TopKHeap, compute_distance_table_l2, probe_pq_l2_top_k}; +use lance_autoresearch::{DIM, TOP_K}; + +const MAX_QUERIES_BENCHED: usize = 1000; +const TIME_BUDGET_SECS: u64 = 600; +const RECALL_FLOOR: f32 = 0.50; + +fn main() { // any error surfaced by real_main maps to exit code 1 + match real_main() { + Ok(()) => {} + Err(e) => { + eprintln!("error: {e:#}"); + std::process::exit(1); + } + } +} + +fn real_main() -> Result<()> { // runs the trial, prints the result block, exits 2/3 on floor/budget failure + let start = Instant::now(); + let fix = Fixture::load_or_synthesize()?; + + let n_q = MAX_QUERIES_BENCHED.min(fix.num_query); + let mut hits = 0usize; + let mut total_relevant = 0usize; + let mut per_query_ns: Vec<u64> = Vec::with_capacity(n_q); + + for qi in 0..n_q { + let q = &fix.query_vectors[qi * DIM..(qi + 1) * DIM]; + + let t0 = Instant::now(); + let table = compute_distance_table_l2(q, &fix.codebook); + let mut heap = TopKHeap::new(); + probe_pq_l2_top_k(&table, &fix.codes, fix.num_base, &mut heap); + per_query_ns.push(t0.elapsed().as_nanos() as u64); // timed region: table build + probe only + + let candidates: Vec<_> = heap.into_sorted().into_iter().map(|(id, _)| id).collect(); + let truth_slice = + &fix.groundtruth[qi * fix.top_k_truth..qi * fix.top_k_truth + TOP_K.min(fix.top_k_truth)]; + let truth_set: HashSet<_> = truth_slice.iter().copied().collect(); + for c in &candidates { + if truth_set.contains(c) { + hits += 1; + } + } + total_relevant += TOP_K; + } + + let recall = hits as f32 / total_relevant as f32; + let geomean_ns = geomean(&per_query_ns); + let elapsed = start.elapsed(); + let mem_mb = peak_rss_mb(); + + println!("---"); + println!("source: {}", fix.source_str()); + println!("num_base: {}", fix.num_base); + println!("num_queries: {n_q}"); + println!("recall_at_10: {recall:.4}"); + println!("geomean_ns_per_query: {geomean_ns}"); + println!("peak_mem_mb: {mem_mb:.1}"); + println!("total_seconds: {:.2}", elapsed.as_secs_f64()); + + if recall < RECALL_FLOOR { + 
eprintln!("FAIL: recall@10 {recall:.4} below floor {RECALL_FLOOR:.4}"); + std::process::exit(2); + } + if elapsed.as_secs_f64() > TIME_BUDGET_SECS as f64 { // fractional compare: whole-second truncation would let e.g. 600.9 s pass + eprintln!( + "FAIL: total wall-clock {:.2}s exceeds budget {}s", + elapsed.as_secs_f64(), + TIME_BUDGET_SECS + ); + std::process::exit(3); + } + + Ok(()) +} + +fn geomean(xs: &[u64]) -> u64 { // geometric mean via the mean of natural logs; 0 for empty input + if xs.is_empty() { + return 0; + } + let mut sum_ln = 0.0f64; + for &x in xs { + sum_ln += (x.max(1) as f64).ln(); // clamp 0 to 1 so ln() stays finite + } + (sum_ln / xs.len() as f64).exp() as u64 // cast truncates toward zero +} + +#[cfg(target_os = "linux")] +fn peak_rss_mb() -> f64 { // peak resident set size from /proc/self/status; 0.0 if unreadable or absent + let Ok(s) = std::fs::read_to_string("/proc/self/status") else { + return 0.0; + }; + for line in s.lines() { + if let Some(rest) = line.strip_prefix("VmHWM:") { // VmHWM is the peak RSS ("high water mark"); VmPeak is peak *virtual* size + let kb: f64 = rest + .split_whitespace() + .next() + .and_then(|t| t.parse().ok()) + .unwrap_or(0.0); + return kb / 1024.0; // kB -> MiB + } + } + 0.0 +} + +#[cfg(not(target_os = "linux"))] +fn peak_rss_mb() -> f64 { + 0.0 +} diff --git a/research/lance-autoresearch/src/fixture.rs b/research/lance-autoresearch/src/fixture.rs new file mode 100644 index 0000000..65e15fb --- /dev/null +++ b/research/lance-autoresearch/src/fixture.rs @@ -0,0 +1,449 @@ +//! IMMUTABLE. Fixture loader. +//! +//! The bench runs against one of: +//! - SIFT1M (preferred; 128-d, 1M base, 10k queries, published ground truth) +//! loaded from `~/.cache/lance-autoresearch/{sift_base,sift_query,sift_groundtruth}.fvecs|.ivecs` +//! plus pre-trained frozen artifacts `pq_codebook.bin` and `pq_codes.bin`. +//! - A synthetic fallback (1024 base / 64 queries, deterministic seed) so the +//! harness is smoke-testable without any external download. +//! +//! Run `scripts/prepare_fixtures.sh` once to populate the SIFT1M fixtures. 
+ +use std::fs; +use std::io::{BufReader, Read}; +use std::path::{Path, PathBuf}; + +use anyhow::{Context, Result, anyhow}; + +use crate::reference::brute_force_top_k_l2; +use crate::{DIM, NUM_CENTROIDS, NUM_SUB_VECTORS, SUB_VECTOR_DIM}; + +pub const SYNTHETIC_NUM_BASE: usize = 1024; +pub const SYNTHETIC_NUM_QUERY: usize = 64; +pub const SYNTHETIC_TOP_K_TRUTH: usize = 32; +const KMEANS_ITERS: usize = 12; + +pub enum FixtureSource { + Sift1M, + Synthetic { seed: u64 }, +} + +pub struct Fixture { + pub base_vectors: Vec, + pub query_vectors: Vec, + pub codebook: Vec, + pub codes: Vec, + pub groundtruth: Vec, + pub num_base: usize, + pub num_query: usize, + pub top_k_truth: usize, + pub source: FixtureSource, +} + +impl Fixture { + /// Try SIFT1M first; fall back to a deterministic synthetic dataset. + pub fn load_or_synthesize() -> Result { + let dir = cache_dir(); + if dir.join("sift_base.fvecs").exists() + && dir.join("sift_query.fvecs").exists() + && dir.join("sift_groundtruth.ivecs").exists() + && dir.join("pq_codebook.bin").exists() + && dir.join("pq_codes.bin").exists() + { + Self::load_sift1m(&dir) + } else { + Self::synthesize(SYNTHETIC_NUM_BASE, SYNTHETIC_NUM_QUERY, 0xC0FFEE_C0FFEE) + } + } + + pub fn source_str(&self) -> &'static str { + match self.source { + FixtureSource::Sift1M => "sift1m", + FixtureSource::Synthetic { .. 
} => "synthetic", + } + } + + fn load_sift1m(dir: &Path) -> Result { + let base_vectors = read_fvecs(&dir.join("sift_base.fvecs"))?; + let query_vectors = read_fvecs(&dir.join("sift_query.fvecs"))?; + let (groundtruth, top_k_truth) = read_ivecs(&dir.join("sift_groundtruth.ivecs"))?; + let codebook = read_f32_bin(&dir.join("pq_codebook.bin"))?; + let codes = read_u8_bin(&dir.join("pq_codes.bin"))?; + + let num_base = base_vectors.len() / DIM; + let num_query = query_vectors.len() / DIM; + if codebook.len() != NUM_SUB_VECTORS * NUM_CENTROIDS * SUB_VECTOR_DIM { + return Err(anyhow!( + "codebook size mismatch: got {}, expected {}", + codebook.len(), + NUM_SUB_VECTORS * NUM_CENTROIDS * SUB_VECTOR_DIM + )); + } + if codes.len() != num_base * NUM_SUB_VECTORS { + return Err(anyhow!( + "codes size mismatch: got {}, expected {}", + codes.len(), + num_base * NUM_SUB_VECTORS + )); + } + + Ok(Self { + base_vectors, + query_vectors, + codebook, + codes, + groundtruth, + num_base, + num_query, + top_k_truth, + source: FixtureSource::Sift1M, + }) + } + + fn synthesize(num_base: usize, num_query: usize, seed: u64) -> Result { + let mut rng = SplitMix64::new(seed); + // Cluster the base set so PQ has structure to compress and queries have + // meaningful nearest neighbors. With i.i.d. Gaussian noise the asymptotic + // recall of PQ is near-chance; with cluster-shaped data PQ tracks the + // true top-K closely, which is what we want when smoke-testing kernels. + let base_vectors = gen_clustered(num_base, DIM, 32, 0.15, &mut rng); + // Queries are perturbed base points so they have a true near-neighbor. 
+ let query_vectors = gen_query_near_base(&base_vectors, num_base, num_query, &mut rng); + + let codebook = train_codebook(&base_vectors, num_base, &mut rng); + let codes = encode(&base_vectors, num_base, &codebook); + + let mut groundtruth = Vec::with_capacity(num_query * SYNTHETIC_TOP_K_TRUTH); + for qi in 0..num_query { + let q = &query_vectors[qi * DIM..(qi + 1) * DIM]; + let top = brute_force_top_k_l2(q, &base_vectors, num_base, SYNTHETIC_TOP_K_TRUTH); + groundtruth.extend(top.iter().map(|(id, _)| *id)); + } + + Ok(Self { + base_vectors, + query_vectors, + codebook, + codes, + groundtruth, + num_base, + num_query, + top_k_truth: SYNTHETIC_TOP_K_TRUTH, + source: FixtureSource::Synthetic { seed }, + }) + } +} + +pub fn cache_dir() -> PathBuf { + let home = std::env::var_os("HOME") + .map(PathBuf::from) + .unwrap_or_else(|| PathBuf::from("/tmp")); + home.join(".cache").join("lance-autoresearch") +} + +fn read_fvecs(path: &Path) -> Result> { + let bytes = fs::read(path).with_context(|| format!("reading {}", path.display()))?; + let mut out = Vec::with_capacity(bytes.len() / 4); + let mut i = 0; + while i < bytes.len() { + if i + 4 > bytes.len() { + return Err(anyhow!("truncated fvecs header at offset {i}")); + } + let dim = u32::from_le_bytes([bytes[i], bytes[i + 1], bytes[i + 2], bytes[i + 3]]) as usize; + if dim != DIM { + return Err(anyhow!("fvecs dim {dim} != expected {DIM}")); + } + i += 4; + let row_bytes = dim * 4; + if i + row_bytes > bytes.len() { + return Err(anyhow!("truncated fvecs row at offset {i}")); + } + for d in 0..dim { + let off = i + d * 4; + out.push(f32::from_le_bytes([ + bytes[off], + bytes[off + 1], + bytes[off + 2], + bytes[off + 3], + ])); + } + i += row_bytes; + } + Ok(out) +} + +fn read_ivecs(path: &Path) -> Result<(Vec, usize)> { + let bytes = fs::read(path).with_context(|| format!("reading {}", path.display()))?; + let mut out = Vec::new(); + let mut top_k: Option = None; + let mut i = 0; + while i < bytes.len() { + if i + 4 > 
bytes.len() { + return Err(anyhow!("truncated ivecs header")); + } + let dim = u32::from_le_bytes([bytes[i], bytes[i + 1], bytes[i + 2], bytes[i + 3]]) as usize; + i += 4; + if let Some(k) = top_k { + if k != dim { + return Err(anyhow!("ivecs rows have varying widths {k} vs {dim}")); + } + } else { + top_k = Some(dim); + } + let row_bytes = dim * 4; + if i + row_bytes > bytes.len() { + return Err(anyhow!("truncated ivecs row")); + } + for d in 0..dim { + let off = i + d * 4; + out.push(u32::from_le_bytes([ + bytes[off], + bytes[off + 1], + bytes[off + 2], + bytes[off + 3], + ])); + } + i += row_bytes; + } + Ok((out, top_k.unwrap_or(0))) +} + +fn read_f32_bin(path: &Path) -> Result> { + let f = fs::File::open(path).with_context(|| format!("opening {}", path.display()))?; + let mut r = BufReader::new(f); + let mut bytes = Vec::new(); + r.read_to_end(&mut bytes)?; + if bytes.len() % 4 != 0 { + return Err(anyhow!("f32 binary file not a multiple of 4 bytes")); + } + Ok(bytes + .chunks_exact(4) + .map(|b| f32::from_le_bytes([b[0], b[1], b[2], b[3]])) + .collect()) +} + +fn read_u8_bin(path: &Path) -> Result> { + fs::read(path).with_context(|| format!("reading {}", path.display())) +} + +/// xorshift-ish deterministic PRNG (SplitMix64). Vendored small enough to avoid +/// a `rand` dep — the fixture must be reproducible bit-for-bit. +struct SplitMix64 { + state: u64, +} + +impl SplitMix64 { + fn new(seed: u64) -> Self { + Self { state: seed } + } + fn next_u64(&mut self) -> u64 { + self.state = self.state.wrapping_add(0x9E37_79B9_7F4A_7C15); + let mut z = self.state; + z = (z ^ (z >> 30)).wrapping_mul(0xBF58_476D_1CE4_E5B9); + z = (z ^ (z >> 27)).wrapping_mul(0x94D0_49BB_1331_11EB); + z ^ (z >> 31) + } + fn next_f32(&mut self) -> f32 { + let bits = (self.next_u64() >> 40) as u32; + bits as f32 / ((1u32 << 24) as f32) + } + /// Box-Muller standard normal. 
+ fn next_normal(&mut self) -> f32 { + let mut u1 = self.next_f32(); + if u1 < 1e-7 { + u1 = 1e-7; + } + let u2 = self.next_f32(); + (-2.0 * u1.ln()).sqrt() * (std::f32::consts::TAU * u2).cos() + } +} + +fn gen_vectors(n: usize, d: usize, rng: &mut SplitMix64) -> Vec { + let mut out = Vec::with_capacity(n * d); + for _ in 0..n * d { + out.push(rng.next_normal()); + } + out +} + +/// Generate `n` vectors of dim `d` as a Gaussian mixture: `num_clusters` random +/// centers, then `n/num_clusters` points per center perturbed by N(0, noise). +fn gen_clustered(n: usize, d: usize, num_clusters: usize, noise: f32, rng: &mut SplitMix64) -> Vec { + let centers = gen_vectors(num_clusters, d, rng); + let mut out = Vec::with_capacity(n * d); + for i in 0..n { + let ci = i % num_clusters; + let center = ¢ers[ci * d..(ci + 1) * d]; + for &c in center { + out.push(c + noise * rng.next_normal()); + } + } + out +} + +/// Generate query vectors by picking `n_query` random base points and perturbing +/// them. Guarantees each query has true near neighbors in the base set. +fn gen_query_near_base( + base: &[f32], + num_base: usize, + n_query: usize, + rng: &mut SplitMix64, +) -> Vec { + let mut out = Vec::with_capacity(n_query * DIM); + for _ in 0..n_query { + let src = (rng.next_u64() as usize) % num_base; + let src_off = src * DIM; + for d in 0..DIM { + out.push(base[src_off + d] + 0.05 * rng.next_normal()); + } + } + out +} + +/// Train a product-quantization codebook by per-subspace k-means. 
+fn train_codebook(base: &[f32], num_base: usize, rng: &mut SplitMix64) -> Vec { + let mut codebook = vec![0.0f32; NUM_SUB_VECTORS * NUM_CENTROIDS * SUB_VECTOR_DIM]; + + let k = NUM_CENTROIDS.min(num_base); + if k == 0 { + return codebook; + } + + for m in 0..NUM_SUB_VECTORS { + for ki in 0..k { + let src = (rng.next_u64() as usize) % num_base; + let src_off = src * DIM + m * SUB_VECTOR_DIM; + let dst_off = m * NUM_CENTROIDS * SUB_VECTOR_DIM + ki * SUB_VECTOR_DIM; + codebook[dst_off..dst_off + SUB_VECTOR_DIM] + .copy_from_slice(&base[src_off..src_off + SUB_VECTOR_DIM]); + } + + let mut assignments = vec![0u8; num_base]; + for _iter in 0..KMEANS_ITERS { + for i in 0..num_base { + let sub = &base[i * DIM + m * SUB_VECTOR_DIM..i * DIM + (m + 1) * SUB_VECTOR_DIM]; + let mut best_k = 0u8; + let mut best_d = f32::INFINITY; + for ki in 0..k { + let c_off = m * NUM_CENTROIDS * SUB_VECTOR_DIM + ki * SUB_VECTOR_DIM; + let mut acc = 0.0f32; + for d in 0..SUB_VECTOR_DIM { + let diff = sub[d] - codebook[c_off + d]; + acc += diff * diff; + } + if acc < best_d { + best_d = acc; + best_k = ki as u8; + } + } + assignments[i] = best_k; + } + + let mut sums = vec![0.0f32; k * SUB_VECTOR_DIM]; + let mut counts = vec![0u32; k]; + for i in 0..num_base { + let ki = assignments[i] as usize; + let sub = &base[i * DIM + m * SUB_VECTOR_DIM..i * DIM + (m + 1) * SUB_VECTOR_DIM]; + for d in 0..SUB_VECTOR_DIM { + sums[ki * SUB_VECTOR_DIM + d] += sub[d]; + } + counts[ki] += 1; + } + for ki in 0..k { + let c_off = m * NUM_CENTROIDS * SUB_VECTOR_DIM + ki * SUB_VECTOR_DIM; + if counts[ki] == 0 { + let src = (rng.next_u64() as usize) % num_base; + let src_off = src * DIM + m * SUB_VECTOR_DIM; + codebook[c_off..c_off + SUB_VECTOR_DIM] + .copy_from_slice(&base[src_off..src_off + SUB_VECTOR_DIM]); + } else { + let inv = 1.0 / counts[ki] as f32; + for d in 0..SUB_VECTOR_DIM { + codebook[c_off + d] = sums[ki * SUB_VECTOR_DIM + d] * inv; + } + } + } + } + } + + codebook +} + +fn encode(base: &[f32], 
num_base: usize, codebook: &[f32]) -> Vec { + let mut out = vec![0u8; num_base * NUM_SUB_VECTORS]; + for i in 0..num_base { + for m in 0..NUM_SUB_VECTORS { + let sub = &base[i * DIM + m * SUB_VECTOR_DIM..i * DIM + (m + 1) * SUB_VECTOR_DIM]; + let mut best_k = 0u8; + let mut best_d = f32::INFINITY; + for ki in 0..NUM_CENTROIDS { + let c_off = m * NUM_CENTROIDS * SUB_VECTOR_DIM + ki * SUB_VECTOR_DIM; + let mut acc = 0.0f32; + for d in 0..SUB_VECTOR_DIM { + let diff = sub[d] - codebook[c_off + d]; + acc += diff * diff; + } + if acc < best_d { + best_d = acc; + best_k = ki as u8; + } + } + out[i * NUM_SUB_VECTORS + m] = best_k; + } + } + out +} + +#[cfg(test)] +mod tests { + //! Fixture-builder tests. The default smoke test exercises the synthetic path + //! end-to-end. `build_fixtures` is `#[ignore]` — it runs only when invoked + //! explicitly by `scripts/prepare_fixtures.sh` and writes the frozen SIFT1M + //! PQ artifacts to `~/.cache/lance-autoresearch/`. + use super::*; + use std::io::Write; + + #[test] + fn synthetic_fixture_is_self_consistent() { + let fix = Fixture::synthesize(256, 8, 0xDEADBEEF).unwrap(); + assert_eq!(fix.base_vectors.len(), 256 * DIM); + assert_eq!(fix.codebook.len(), NUM_SUB_VECTORS * NUM_CENTROIDS * SUB_VECTOR_DIM); + assert_eq!(fix.codes.len(), 256 * NUM_SUB_VECTORS); + assert_eq!(fix.groundtruth.len(), 8 * SYNTHETIC_TOP_K_TRUTH); + for &id in &fix.groundtruth { + assert!((id as usize) < 256); + } + } + + #[test] + #[ignore] + fn build_fixtures() { + if std::env::var("LANCE_AUTORESEARCH_BUILD_FIXTURES").is_err() { + eprintln!("skipping: set LANCE_AUTORESEARCH_BUILD_FIXTURES=1 to run"); + return; + } + let dir = cache_dir(); + let base = read_fvecs(&dir.join("sift_base.fvecs")).expect("read sift_base"); + let num_base = base.len() / DIM; + eprintln!("[build_fixtures] training PQ codebook on {num_base} vectors..."); + + let mut rng = SplitMix64::new(0x0005_1F74_F1AC); + let codebook = train_codebook(&base, num_base, &mut rng); + let codes = 
encode(&base, num_base, &codebook); + + let codebook_bytes: Vec = codebook + .iter() + .flat_map(|f| f.to_le_bytes()) + .collect(); + std::fs::File::create(dir.join("pq_codebook.bin")) + .unwrap() + .write_all(&codebook_bytes) + .unwrap(); + std::fs::File::create(dir.join("pq_codes.bin")) + .unwrap() + .write_all(&codes) + .unwrap(); + eprintln!("[build_fixtures] wrote {} centroids × {} bytes codebook, {} bytes codes", + NUM_SUB_VECTORS * NUM_CENTROIDS, SUB_VECTOR_DIM * 4, codes.len()); + } +} diff --git a/research/lance-autoresearch/src/kernels.rs b/research/lance-autoresearch/src/kernels.rs new file mode 100644 index 0000000..a6517a8 --- /dev/null +++ b/research/lance-autoresearch/src/kernels.rs @@ -0,0 +1,154 @@ +// SPDX-License-Identifier: Apache-2.0 +// +// AGENT'S PLAYGROUND. This is the file you (the agent) modify. +// +// Algorithmically modeled on the L2 path in lance-linalg's distance / pq modules +// (Lance 4.x, Apache-2.0; see https://github.com/lance-format/lance). It is *not* +// a verbatim vendored copy — pulling in lance-linalg's private helpers as deps +// would couple this harness to crate internals and slow rebuilds. The baseline is +// intentionally a clean scalar implementation of the same algorithm Lance uses: +// build an asymmetric distance LUT, then probe every PQ-encoded vector via 16 +// table lookups + an accumulator. Beating the baseline (and porting wins back +// upstream) is the point of this repo. +// +// PUBLIC API CONTRACT (must remain stable so `bin/run_experiment.rs` keeps building): +// - DistanceTable type alias +// - compute_distance_table_l2(query, codebook) -> DistanceTable +// - probe_pq_l2_top_k(table, codes, num_vectors, &mut TopKHeap) +// - TopKHeap::new() / push / into_sorted +// +// You may add private helpers, switch internal data layouts (e.g. transpose the +// codebook for vectorized table-build, pack the LUT for `pshufb`), drop down to +// `std::arch` intrinsics behind cfg gates, mark functions `#[inline]`, etc. 
+// You may NOT change `DIM` / `NUM_SUB_VECTORS` / `NUM_CENTROIDS` / `TOP_K` +// (those are pinned by the fixture geometry in `lib.rs`). + +use crate::{NUM_CENTROIDS, NUM_SUB_VECTORS, SUB_VECTOR_DIM, TOP_K}; + +/// Precomputed asymmetric L2 distance table. +/// +/// Indexed as `table[sub_vector_idx][centroid_idx]`. Each entry is the squared +/// L2 distance from the query's `m`-th sub-vector to the `k`-th centroid of the +/// `m`-th sub-quantizer. +pub type DistanceTable = [[f32; NUM_CENTROIDS]; NUM_SUB_VECTORS]; + +/// Build the asymmetric distance table for one query against the codebook. +/// +/// `codebook` layout: contiguous `[NUM_SUB_VECTORS][NUM_CENTROIDS][SUB_VECTOR_DIM]`. +#[allow(clippy::needless_range_loop)] +pub fn compute_distance_table_l2(query: &[f32], codebook: &[f32]) -> DistanceTable { + debug_assert_eq!(query.len(), NUM_SUB_VECTORS * SUB_VECTOR_DIM); + debug_assert_eq!( + codebook.len(), + NUM_SUB_VECTORS * NUM_CENTROIDS * SUB_VECTOR_DIM + ); + + let mut table = [[0.0f32; NUM_CENTROIDS]; NUM_SUB_VECTORS]; + for m in 0..NUM_SUB_VECTORS { + let q_sub = &query[m * SUB_VECTOR_DIM..(m + 1) * SUB_VECTOR_DIM]; + let cb_offset = m * NUM_CENTROIDS * SUB_VECTOR_DIM; + for k in 0..NUM_CENTROIDS { + let base = cb_offset + k * SUB_VECTOR_DIM; + let mut acc = 0.0f32; + for d in 0..SUB_VECTOR_DIM { + let diff = q_sub[d] - codebook[base + d]; + acc += diff * diff; + } + table[m][k] = acc; + } + } + table +} + +/// Probe every PQ-encoded vector and accumulate the top-K minimum distances. +/// +/// `codes` layout: `[num_vectors][NUM_SUB_VECTORS]` packed; one byte per sub-quantizer. 
+pub fn probe_pq_l2_top_k( + table: &DistanceTable, + codes: &[u8], + num_vectors: usize, + out: &mut TopKHeap, +) { + debug_assert_eq!(codes.len(), num_vectors * NUM_SUB_VECTORS); + + for i in 0..num_vectors { + let off = i * NUM_SUB_VECTORS; + let mut acc = 0.0f32; + for m in 0..NUM_SUB_VECTORS { + let k = codes[off + m] as usize; + acc += table[m][k]; + } + out.push(i as u32, acc); + } +} + +/// Fixed-capacity max-heap that keeps the K *smallest*-distance entries seen. +/// +/// Root is the largest of the K kept distances, so deciding whether to admit a +/// new entry is one comparison. +pub struct TopKHeap { + entries: [(u32, f32); TOP_K], + len: usize, +} + +impl Default for TopKHeap { + fn default() -> Self { + Self::new() + } +} + +impl TopKHeap { + pub fn new() -> Self { + Self { + entries: [(u32::MAX, f32::INFINITY); TOP_K], + len: 0, + } + } + + #[inline] + pub fn push(&mut self, id: u32, dist: f32) { + if self.len < TOP_K { + self.entries[self.len] = (id, dist); + self.len += 1; + if self.len == TOP_K { + self.heapify(); + } + return; + } + if dist < self.entries[0].1 { + self.entries[0] = (id, dist); + self.sift_down(0); + } + } + + fn heapify(&mut self) { + for i in (0..TOP_K / 2).rev() { + self.sift_down(i); + } + } + + fn sift_down(&mut self, mut i: usize) { + loop { + let l = 2 * i + 1; + let r = 2 * i + 2; + let mut largest = i; + if l < self.len && self.entries[l].1 > self.entries[largest].1 { + largest = l; + } + if r < self.len && self.entries[r].1 > self.entries[largest].1 { + largest = r; + } + if largest == i { + return; + } + self.entries.swap(i, largest); + i = largest; + } + } + + pub fn into_sorted(self) -> Vec<(u32, f32)> { + let mut v: Vec<_> = self.entries[..self.len].to_vec(); + v.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal)); + v + } +} diff --git a/research/lance-autoresearch/src/lib.rs b/research/lance-autoresearch/src/lib.rs new file mode 100644 index 0000000..5da7fea --- /dev/null +++ 
b/research/lance-autoresearch/src/lib.rs @@ -0,0 +1,21 @@ +//! Lance autoresearch harness — public API for the bench binary, benchmarks, and tests. +//! +//! Layout mirrors Karpathy's nanochat-research / autoresearch three-file contract: +//! +//! - `kernels` — the AGENT'S PLAYGROUND. May be rewritten freely. +//! - `reference` — IMMUTABLE. Exact brute-force baseline used to certify recall. +//! - `fixture` — IMMUTABLE. Dataset + frozen codebook loader. +//! +//! Constants are global because the agent shouldn't have to thread sizes through +//! its kernel — they pin the optimization target (SIFT1M-shaped: 128-d f32, +//! 16 sub-vectors × 256 centroids × 8-d, top-10). + +pub mod fixture; +pub mod kernels; +pub mod reference; + +pub const DIM: usize = 128; +pub const NUM_SUB_VECTORS: usize = 16; +pub const NUM_CENTROIDS: usize = 256; +pub const SUB_VECTOR_DIM: usize = DIM / NUM_SUB_VECTORS; +pub const TOP_K: usize = 10; diff --git a/research/lance-autoresearch/src/reference.rs b/research/lance-autoresearch/src/reference.rs new file mode 100644 index 0000000..3a7d60c --- /dev/null +++ b/research/lance-autoresearch/src/reference.rs @@ -0,0 +1,35 @@ +//! IMMUTABLE. Brute-force exact L2 top-K. Used at fixture-build time to compute +//! synthetic-dataset ground truth (against which the agent's PQ-approximate +//! kernel is then scored for recall). For SIFT1M fixtures we use the published +//! ground-truth file instead and never call this at bench-time. + +use crate::DIM; + +/// Brute-force exact top-K by squared L2. Returns `(id, distance)` ascending. +/// +/// Quadratic in `num_vectors`; only used by the fixture builder, not the hot path. 
+pub fn brute_force_top_k_l2( + query: &[f32], + base: &[f32], + num_vectors: usize, + k: usize, +) -> Vec<(u32, f32)> { + assert_eq!(query.len(), DIM); + assert_eq!(base.len(), num_vectors * DIM); + + let mut dists: Vec<(u32, f32)> = (0..num_vectors) + .map(|i| { + let v = &base[i * DIM..(i + 1) * DIM]; + let mut acc = 0.0f32; + for d in 0..DIM { + let diff = query[d] - v[d]; + acc += diff * diff; + } + (i as u32, acc) + }) + .collect(); + + dists.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal)); + dists.truncate(k); + dists +}