diff --git a/research/lance-autoresearch/crates/pq-l2/benches/pq_l2.rs b/research/lance-autoresearch/crates/pq-l2/benches/pq_l2.rs index 022cacc..44fd20a 100644 --- a/research/lance-autoresearch/crates/pq-l2/benches/pq_l2.rs +++ b/research/lance-autoresearch/crates/pq-l2/benches/pq_l2.rs @@ -16,7 +16,8 @@ fn bench_pq_l2(c: &mut Criterion) { for wl in &workloads { let kernel = PqKernel::new(wl.shape, &wl.codebook); let q = &wl.queries[..wl.shape.dim]; - let table0 = kernel.distance_table(q); + let mut table0 = vec![0.0f32; wl.shape.distance_table_len()]; + kernel.distance_table(q, &mut table0); let label_shape = format!( "{}x{}x{}", @@ -26,9 +27,10 @@ fn bench_pq_l2(c: &mut Criterion) { let id = format!("{label_shape}/{label_dist}"); c.bench_function(&format!("distance_table/{id}"), |b| { + let mut scratch = vec![0.0f32; wl.shape.distance_table_len()]; b.iter(|| { - let t = kernel.distance_table(black_box(q)); - black_box(t); + kernel.distance_table(black_box(q), black_box(&mut scratch)); + black_box(&scratch); }); }); c.bench_function(&format!("probe_top_k/{id}"), |b| { diff --git a/research/lance-autoresearch/crates/pq-l2/program.md b/research/lance-autoresearch/crates/pq-l2/program.md index 1b8b9ec..4137f53 100644 --- a/research/lance-autoresearch/crates/pq-l2/program.md +++ b/research/lance-autoresearch/crates/pq-l2/program.md @@ -38,7 +38,7 @@ pub struct PqKernel { /* agent's private fields */ } impl PqKernel { pub fn new(shape: PqShape, codebook: &[f32]) -> Self; - pub fn distance_table(&self, query: &[f32]) -> Vec<f32>; + pub fn distance_table(&self, query: &[f32], out: &mut [f32]); pub fn probe_top_k(&self, table: &[f32], codes: &[u8], num_vectors: usize, k: usize) -> Vec<(u32, f32)>; } ``` @@ -91,7 +91,11 @@ to combine multiple ideas at once. - **FMA chains for table build.** The diff–square–sum maps cleanly to FMA on AVX2/NEON. Even without intrinsics, structuring the inner loop so `rustc` emits FMA helps. 
-- **Avoid the `Vec` allocation in the hot path.** `distance_table` allocates - a fresh `Vec` per call. The public API is fixed (returns `Vec`), - but you can reuse a thread-local scratch buffer internally and copy to a - `Vec` at the boundary if it speeds the build. +- **Reduce `probe_top_k`'s `Vec<(u32, f32)>` allocation.** `distance_table`'s + output buffer is already pre-allocated by the caller (the bench reuses + one `&mut [f32]` per workload), so allocation isn't on that hot path. + `probe_top_k` still allocates a `Vec<(u32, f32)>` for the result. K is + small (10–100) so this is a single small alloc per query, but on the + SIMD'd kernel it can be a measurable fraction. A heap that uses a + fixed-size `[(u32, f32); MAX_K]` internally and only allocates the + result `Vec` at the boundary is one option. diff --git a/research/lance-autoresearch/crates/pq-l2/src/bin/run_experiment.rs b/research/lance-autoresearch/crates/pq-l2/src/bin/run_experiment.rs index 86ed8b0..8208849 100644 --- a/research/lance-autoresearch/crates/pq-l2/src/bin/run_experiment.rs +++ b/research/lance-autoresearch/crates/pq-l2/src/bin/run_experiment.rs @@ -125,8 +125,10 @@ fn run_correctness() -> Result<(), String> { let agent = PqKernel::new(case.shape, &case.codebook); let reference = ScalarReference::new(case.shape, &case.codebook); - let agent_table = agent.distance_table(&case.query); - let ref_table = reference.distance_table(&case.query); + let mut agent_table = vec![0.0f32; case.shape.distance_table_len()]; + let mut ref_table = vec![0.0f32; case.shape.distance_table_len()]; + agent.distance_table(&case.query, &mut agent_table); + reference.distance_table(&case.query, &mut ref_table); let table_err = max_abs_err(&agent_table, &ref_table); if table_err > MAX_ABS_ERR { return Err(format!( @@ -175,13 +177,32 @@ fn run_speed(workloads: &[SpeedWorkload]) -> SpeedReport { for wl in workloads { let kernel = PqKernel::new(wl.shape, &wl.codebook); + // Distance-table buffer reused across 
queries — the alloc must stay + // out of the per-query timing so allocator-pressure improvements + // don't masquerade as kernel improvements. + let mut table = vec![0.0f32; wl.shape.distance_table_len()]; + + // Warmup: one untimed query primes caches (codes, codebook) and the + // CPU branch predictor before measurement starts. The first query + // otherwise pays cold-cache cost on the codes array, which for + // (768, 96, 256) is ~1.9 MB and exceeds L2 on many laptops. + { + let q = &wl.queries[..wl.shape.dim]; + kernel.distance_table(q, &mut table); + let hits = kernel.probe_top_k(&table, &wl.codes, wl.num_vectors, wl.k); + std::hint::black_box(hits); + } + let mut combo_timings: Vec<u64> = Vec::with_capacity(wl.num_queries); for qi in 0..wl.num_queries { let q = &wl.queries[qi * wl.shape.dim..(qi + 1) * wl.shape.dim]; let t0 = Instant::now(); - let table = kernel.distance_table(q); - let _hits = kernel.probe_top_k(&table, &wl.codes, wl.num_vectors, wl.k); + kernel.distance_table(q, &mut table); + let hits = kernel.probe_top_k(&table, &wl.codes, wl.num_vectors, wl.k); combo_timings.push(t0.elapsed().as_nanos() as u64); + // black_box prevents LTO from DCE-ing the heap maintenance work + // when the binary is the only consumer of `hits`. 
+ std::hint::black_box(hits); } let combo_geo = geomean(&combo_timings); per_combo.push(ComboReport { diff --git a/research/lance-autoresearch/crates/pq-l2/src/kernels.rs b/research/lance-autoresearch/crates/pq-l2/src/kernels.rs index 3e42e2a..3f1fd18 100644 --- a/research/lance-autoresearch/crates/pq-l2/src/kernels.rs +++ b/research/lance-autoresearch/crates/pq-l2/src/kernels.rs @@ -14,7 +14,7 @@ // PUBLIC API CONTRACT (must remain stable so the bench keeps building): // - `pub struct PqKernel` // - `PqKernel::new(shape: PqShape, codebook: &[f32]) -> Self` -// - `PqKernel::distance_table(&self, query: &[f32]) -> Vec<f32>` +// - `PqKernel::distance_table(&self, query: &[f32], out: &mut [f32])` // - `PqKernel::probe_top_k(&self, table: &[f32], codes: &[u8], num_vectors: usize, k: usize) -> Vec<(u32, f32)>` // // What you CAN do: @@ -58,17 +58,19 @@ impl PqKernel { } } - /// Asymmetric L2 distance table for one query. + /// Write the asymmetric L2 distance table for one query into `out`. /// - /// Layout of returned `Vec`: `[num_sub_vectors][num_centroids]` flat - /// (`table[m * num_centroids + k]`). + /// `out` layout: `[num_sub_vectors][num_centroids]` flat + /// (`out[m * num_centroids + k]`). Caller pre-allocates `out` with length + /// `shape.distance_table_len()`; the bench reuses one buffer across all + /// queries so allocator cost stays out of the per-query timing. 
#[allow(clippy::needless_range_loop)] - pub fn distance_table(&self, query: &[f32]) -> Vec<f32> { + pub fn distance_table(&self, query: &[f32], out: &mut [f32]) { let s = &self.shape; let svd = s.sub_vector_dim(); debug_assert_eq!(query.len(), s.dim); + debug_assert_eq!(out.len(), s.distance_table_len()); - let mut table = vec![0.0f32; s.distance_table_len()]; for m in 0..s.num_sub_vectors { let q_sub = &query[m * svd..(m + 1) * svd]; let cb_off = m * s.num_centroids * svd; @@ -80,10 +82,9 @@ impl PqKernel { let diff = q_sub[d] - self.codebook[base + d]; acc += diff * diff; } - table[tbl_off + k] = acc; + out[tbl_off + k] = acc; } } - table } /// Probe `num_vectors` PQ-encoded vectors and return top-K by ascending diff --git a/research/lance-autoresearch/crates/pq-l2/src/reference.rs b/research/lance-autoresearch/crates/pq-l2/src/reference.rs index 57cfd84..aa04de9 100644 --- a/research/lance-autoresearch/crates/pq-l2/src/reference.rs +++ b/research/lance-autoresearch/crates/pq-l2/src/reference.rs @@ -25,12 +25,12 @@ impl ScalarReference { } #[allow(clippy::needless_range_loop)] - pub fn distance_table(&self, query: &[f32]) -> Vec<f32> { + pub fn distance_table(&self, query: &[f32], out: &mut [f32]) { let s = &self.shape; let svd = s.sub_vector_dim(); assert_eq!(query.len(), s.dim); + assert_eq!(out.len(), s.distance_table_len()); - let mut table = vec![0.0f32; s.distance_table_len()]; for m in 0..s.num_sub_vectors { let q_sub = &query[m * svd..(m + 1) * svd]; let cb_off = m * s.num_centroids * svd; @@ -42,10 +42,9 @@ impl ScalarReference { let diff = q_sub[d] - self.codebook[base + d]; acc += diff * diff; } - table[tbl_off + k] = acc; + out[tbl_off + k] = acc; } } - table } pub fn probe_top_k( diff --git a/research/lance-autoresearch/docs/targets/pq-l2.md b/research/lance-autoresearch/docs/targets/pq-l2.md index 7ac7daf..279e6da 100644 --- a/research/lance-autoresearch/docs/targets/pq-l2.md +++ b/research/lance-autoresearch/docs/targets/pq-l2.md @@ -12,9 +12,12 @@ 
generalizable speedups against it. Two functions in `crates/pq-l2/src/kernels.rs`: -- `PqKernel::distance_table(query)` — builds the asymmetric distance table - (`[num_sub_vectors][num_centroids]`) for one query against the codebook. - Cost: `num_sub_vectors × num_centroids × sub_vector_dim` MAC ops per query. +- `PqKernel::distance_table(query, &mut out)` — writes the asymmetric + distance table (`[num_sub_vectors][num_centroids]`) for one query against + the codebook into a caller-provided `&mut [f32]` buffer (the bench + pre-allocates and reuses one buffer per workload so allocator cost stays + out of the per-query timing). Cost: + `num_sub_vectors × num_centroids × sub_vector_dim` MAC ops per query. - `PqKernel::probe_top_k(table, codes, num_vectors, k)` — probes `num_vectors` PQ-encoded vectors, accumulates per-vector distance via `num_sub_vectors` table lookups, returns top-K. Cost: