diff --git a/research/lance-autoresearch/crates/pq-l2/benches/pq_l2.rs b/research/lance-autoresearch/crates/pq-l2/benches/pq_l2.rs
index 022cacc..44fd20a 100644
--- a/research/lance-autoresearch/crates/pq-l2/benches/pq_l2.rs
+++ b/research/lance-autoresearch/crates/pq-l2/benches/pq_l2.rs
@@ -16,7 +16,8 @@ fn bench_pq_l2(c: &mut Criterion) {
     for wl in &workloads {
         let kernel = PqKernel::new(wl.shape, &wl.codebook);
         let q = &wl.queries[..wl.shape.dim];
-        let table0 = kernel.distance_table(q);
+        let mut table0 = vec![0.0f32; wl.shape.distance_table_len()];
+        kernel.distance_table(q, &mut table0);
 
         let label_shape = format!(
             "{}x{}x{}",
@@ -26,9 +27,10 @@ fn bench_pq_l2(c: &mut Criterion) {
         let id = format!("{label_shape}/{label_dist}");
 
         c.bench_function(&format!("distance_table/{id}"), |b| {
+            let mut scratch = vec![0.0f32; wl.shape.distance_table_len()];
             b.iter(|| {
-                let t = kernel.distance_table(black_box(q));
-                black_box(t);
+                kernel.distance_table(black_box(q), black_box(&mut scratch));
+                black_box(&scratch);
             });
         });
         c.bench_function(&format!("probe_top_k/{id}"), |b| {
diff --git a/research/lance-autoresearch/crates/pq-l2/program.md b/research/lance-autoresearch/crates/pq-l2/program.md
index 1b8b9ec..4137f53 100644
--- a/research/lance-autoresearch/crates/pq-l2/program.md
+++ b/research/lance-autoresearch/crates/pq-l2/program.md
@@ -38,7 +38,7 @@ pub struct PqKernel { /* agent's private fields */ }
 
 impl PqKernel {
     pub fn new(shape: PqShape, codebook: &[f32]) -> Self;
-    pub fn distance_table(&self, query: &[f32]) -> Vec<f32>;
+    pub fn distance_table(&self, query: &[f32], out: &mut [f32]);
     pub fn probe_top_k(&self, table: &[f32], codes: &[u8], num_vectors: usize, k: usize) -> Vec<(u32, f32)>;
 }
 ```
@@ -91,7 +91,11 @@ to combine multiple ideas at once.
 - **FMA chains for table build.** The diff–square–sum maps cleanly to FMA
   on AVX2/NEON. Even without intrinsics, structuring the inner loop so
   `rustc` emits FMA helps.
-- **Avoid the `Vec` allocation in the hot path.** `distance_table` allocates
-  a fresh `Vec<f32>` per call. The public API is fixed (returns `Vec<f32>`),
-  but you can reuse a thread-local scratch buffer internally and copy to a
-  `Vec` at the boundary if it speeds the build.
+- **Reduce `probe_top_k`'s `Vec<(u32, f32)>` allocation.** `distance_table`'s
+  output buffer is already pre-allocated by the caller (the bench reuses
+  one `&mut [f32]` per workload), so allocation isn't on that hot path.
+  `probe_top_k` still allocates a `Vec<(u32, f32)>` for the result. `k` is
+  small (10–100), so this is one small alloc per query, but once the kernel
+  is vectorized it can be a measurable fraction of per-query time. A heap
+  that uses a fixed-size `[(u32, f32); MAX_K]` internally and only
+  allocates the result `Vec` at the boundary is one option.
diff --git a/research/lance-autoresearch/crates/pq-l2/src/bin/run_experiment.rs b/research/lance-autoresearch/crates/pq-l2/src/bin/run_experiment.rs
index 86ed8b0..8208849 100644
--- a/research/lance-autoresearch/crates/pq-l2/src/bin/run_experiment.rs
+++ b/research/lance-autoresearch/crates/pq-l2/src/bin/run_experiment.rs
@@ -125,8 +125,10 @@ fn run_correctness() -> Result<(), String> {
         let agent = PqKernel::new(case.shape, &case.codebook);
         let reference = ScalarReference::new(case.shape, &case.codebook);
 
-        let agent_table = agent.distance_table(&case.query);
-        let ref_table = reference.distance_table(&case.query);
+        let mut agent_table = vec![0.0f32; case.shape.distance_table_len()];
+        let mut ref_table = vec![0.0f32; case.shape.distance_table_len()];
+        agent.distance_table(&case.query, &mut agent_table);
+        reference.distance_table(&case.query, &mut ref_table);
         let table_err = max_abs_err(&agent_table, &ref_table);
         if table_err > MAX_ABS_ERR {
             return Err(format!(
@@ -175,13 +177,32 @@ fn run_speed(workloads: &[SpeedWorkload]) -> SpeedReport {
 
     for wl in workloads {
         let kernel = PqKernel::new(wl.shape, &wl.codebook);
+        // Distance-table buffer reused across queries — the alloc must stay
+        // out of the per-query timing so allocator-pressure improvements
+        // don't masquerade as kernel improvements.
+        let mut table = vec![0.0f32; wl.shape.distance_table_len()];
+
+        // Warmup: one untimed query primes caches (codes, codebook) and the
+        // CPU branch predictor before measurement starts. The first query
+        // otherwise pays cold-cache cost on the codes array, which for
+        // (768, 96, 256) is ~1.9 MB and exceeds L2 on many laptops.
+        {
+            let q = &wl.queries[..wl.shape.dim];
+            kernel.distance_table(q, &mut table);
+            let hits = kernel.probe_top_k(&table, &wl.codes, wl.num_vectors, wl.k);
+            std::hint::black_box(hits);
+        }
+
         let mut combo_timings: Vec<u64> = Vec::with_capacity(wl.num_queries);
         for qi in 0..wl.num_queries {
             let q = &wl.queries[qi * wl.shape.dim..(qi + 1) * wl.shape.dim];
             let t0 = Instant::now();
-            let table = kernel.distance_table(q);
-            let _hits = kernel.probe_top_k(&table, &wl.codes, wl.num_vectors, wl.k);
+            kernel.distance_table(q, &mut table);
+            let hits = kernel.probe_top_k(&table, &wl.codes, wl.num_vectors, wl.k);
             combo_timings.push(t0.elapsed().as_nanos() as u64);
+            // black_box keeps the optimizer (e.g. under LTO) from discarding the
+            // heap maintenance work as dead code, since nothing else reads `hits`.
+            std::hint::black_box(hits);
         }
         let combo_geo = geomean(&combo_timings);
         per_combo.push(ComboReport {
diff --git a/research/lance-autoresearch/crates/pq-l2/src/kernels.rs b/research/lance-autoresearch/crates/pq-l2/src/kernels.rs
index 3e42e2a..3f1fd18 100644
--- a/research/lance-autoresearch/crates/pq-l2/src/kernels.rs
+++ b/research/lance-autoresearch/crates/pq-l2/src/kernels.rs
@@ -14,7 +14,7 @@
 // PUBLIC API CONTRACT (must remain stable so the bench keeps building):
 //   - `pub struct PqKernel`
 //   - `PqKernel::new(shape: PqShape, codebook: &[f32]) -> Self`
-//   - `PqKernel::distance_table(&self, query: &[f32]) -> Vec<f32>`
+//   - `PqKernel::distance_table(&self, query: &[f32], out: &mut [f32])`
 //   - `PqKernel::probe_top_k(&self, table: &[f32], codes: &[u8], num_vectors: usize, k: usize) -> Vec<(u32, f32)>`
 //
 // What you CAN do:
@@ -58,17 +58,19 @@ impl PqKernel {
         }
     }
 
-    /// Asymmetric L2 distance table for one query.
+    /// Write the asymmetric L2 distance table for one query into `out`.
     ///
-    /// Layout of returned `Vec<f32>`: `[num_sub_vectors][num_centroids]` flat
-    /// (`table[m * num_centroids + k]`).
+    /// `out` layout: `[num_sub_vectors][num_centroids]` flat
+    /// (`out[m * num_centroids + k]`). Caller pre-allocates `out` with length
+    /// `shape.distance_table_len()`; the bench reuses one buffer across all
+    /// queries so allocator cost stays out of the per-query timing.
     #[allow(clippy::needless_range_loop)]
-    pub fn distance_table(&self, query: &[f32]) -> Vec<f32> {
+    pub fn distance_table(&self, query: &[f32], out: &mut [f32]) {
         let s = &self.shape;
         let svd = s.sub_vector_dim();
         debug_assert_eq!(query.len(), s.dim);
+        debug_assert_eq!(out.len(), s.distance_table_len());
 
-        let mut table = vec![0.0f32; s.distance_table_len()];
         for m in 0..s.num_sub_vectors {
             let q_sub = &query[m * svd..(m + 1) * svd];
             let cb_off = m * s.num_centroids * svd;
@@ -80,10 +82,9 @@ impl PqKernel {
                     let diff = q_sub[d] - self.codebook[base + d];
                     acc += diff * diff;
                 }
-                table[tbl_off + k] = acc;
+                out[tbl_off + k] = acc;
             }
         }
-        table
     }
 
     /// Probe `num_vectors` PQ-encoded vectors and return top-K by ascending
diff --git a/research/lance-autoresearch/crates/pq-l2/src/reference.rs b/research/lance-autoresearch/crates/pq-l2/src/reference.rs
index 57cfd84..aa04de9 100644
--- a/research/lance-autoresearch/crates/pq-l2/src/reference.rs
+++ b/research/lance-autoresearch/crates/pq-l2/src/reference.rs
@@ -25,12 +25,12 @@ impl ScalarReference {
     }
 
     #[allow(clippy::needless_range_loop)]
-    pub fn distance_table(&self, query: &[f32]) -> Vec<f32> {
+    pub fn distance_table(&self, query: &[f32], out: &mut [f32]) {
         let s = &self.shape;
         let svd = s.sub_vector_dim();
         assert_eq!(query.len(), s.dim);
+        assert_eq!(out.len(), s.distance_table_len());
 
-        let mut table = vec![0.0f32; s.distance_table_len()];
         for m in 0..s.num_sub_vectors {
             let q_sub = &query[m * svd..(m + 1) * svd];
             let cb_off = m * s.num_centroids * svd;
@@ -42,10 +42,9 @@ impl ScalarReference {
                     let diff = q_sub[d] - self.codebook[base + d];
                     acc += diff * diff;
                 }
-                table[tbl_off + k] = acc;
+                out[tbl_off + k] = acc;
             }
         }
-        table
     }
 
     pub fn probe_top_k(
diff --git a/research/lance-autoresearch/docs/targets/pq-l2.md b/research/lance-autoresearch/docs/targets/pq-l2.md
index 7ac7daf..279e6da 100644
--- a/research/lance-autoresearch/docs/targets/pq-l2.md
+++ b/research/lance-autoresearch/docs/targets/pq-l2.md
@@ -12,9 +12,12 @@ generalizable speedups against it.
 
 Two functions in `crates/pq-l2/src/kernels.rs`:
 
-- `PqKernel::distance_table(query)` — builds the asymmetric distance table
-  (`[num_sub_vectors][num_centroids]`) for one query against the codebook.
-  Cost: `num_sub_vectors × num_centroids × sub_vector_dim` MAC ops per query.
+- `PqKernel::distance_table(query, &mut out)` — writes the asymmetric
+  distance table (`[num_sub_vectors][num_centroids]`) for one query against
+  the codebook into a caller-provided `&mut [f32]` buffer (the bench
+  pre-allocates and reuses one buffer per workload so allocator cost stays
+  out of the per-query timing). Cost:
+  `num_sub_vectors × num_centroids × sub_vector_dim` MAC ops per query.
 - `PqKernel::probe_top_k(table, codes, num_vectors, k)` — probes
   `num_vectors` PQ-encoded vectors, accumulates per-vector distance via
   `num_sub_vectors` table lookups, returns top-K. Cost: