omnigraph/research/lance-autoresearch/src/lib.rs

//! Lance autoresearch harness — public API for the bench binary, benchmarks, and tests.
//!
//! Contract (Karpathy-style three files):
//!
//! - `kernels` — the AGENT'S PLAYGROUND. Modify freely.
//! - `reference` — IMMUTABLE. Scalar reference kernel. Defines the math.
//! - `inputs` — IMMUTABLE. Diverse test-data + workload generators,
//!   deterministic per fixed seed, varied across the input battery.
//!
//! The optimization target is dataset-independent: the agent's kernel must match
//! the scalar reference within `MAX_ABS_ERR` on every input the bench generates,
//! and minimize geomean ns/query across multiple PQ shapes and data
//! distributions. There is no fixed dataset; an "improvement" by construction
//! generalizes across distributions and shapes.

pub mod inputs;
pub mod kernels;
pub mod reference;

/// Geometry of a PQ index: vector dimension, number of sub-quantizers, centroids
/// per sub-quantizer. We pin nbits=8 (256 centroids) — the dominant Lance code
/// path. `dim` must be divisible by `num_sub_vectors`.
#[derive(Clone, Copy, Debug, Eq, PartialEq, Hash)]
pub struct PqShape {
    pub dim: usize,
    pub num_sub_vectors: usize,
    pub num_centroids: usize,
}

impl PqShape {
    pub const fn new(dim: usize, num_sub_vectors: usize, num_centroids: usize) -> Self {
        Self {
            dim,
            num_sub_vectors,
            num_centroids,
        }
    }
    pub const fn sub_vector_dim(&self) -> usize {
        self.dim / self.num_sub_vectors
    }
    pub const fn distance_table_len(&self) -> usize {
        self.num_sub_vectors * self.num_centroids
    }
    pub const fn codebook_len(&self) -> usize {
        self.num_sub_vectors * self.num_centroids * self.sub_vector_dim()
    }
}

/// Tolerance for the agent kernel's distance values vs. the scalar reference.
/// Loose enough to permit legal SIMD-accumulator reordering; tight enough to
/// catch real arithmetic bugs.
pub const MAX_ABS_ERR: f32 = 1e-4;

/// Tolerance for top-K *distances* (id sets are compared with tie-tolerance —
/// see `reference::topk_consistent`).
pub const TOPK_DIST_TOL: f32 = 1e-4;