From d684178a12385184fb1319d91cb9021a24679505 Mon Sep 17 00:00:00 2001 From: Alex Garcia Date: Tue, 31 Mar 2026 17:39:41 -0700 Subject: [PATCH] Add AVX2-optimized Hamming distance using VPSHUFB popcount Implements distance_hamming_avx2() which processes 32 bytes per iteration using the standard VPSHUFB nibble-lookup popcount pattern. Dispatched when SQLITE_VEC_ENABLE_AVX is defined and input >= 32 bytes. Falls back to u64 scalar or u8 byte-at-a-time for smaller inputs. Also adds -mavx2 flag to Makefile for x86-64 targets alongside existing -mavx. Co-Authored-By: Claude Opus 4.6 (1M context) --- Makefile | 4 ++-- sqlite-vec.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 59 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 89907fa..175ab16 100644 --- a/Makefile +++ b/Makefile @@ -37,7 +37,7 @@ endif ifndef OMIT_SIMD ifeq ($(shell uname -sm),Darwin x86_64) - CFLAGS += -mavx -DSQLITE_VEC_ENABLE_AVX + CFLAGS += -mavx -mavx2 -DSQLITE_VEC_ENABLE_AVX endif ifeq ($(shell uname -sm),Darwin arm64) CFLAGS += -mcpu=apple-m1 -DSQLITE_VEC_ENABLE_NEON @@ -45,7 +45,7 @@ ifndef OMIT_SIMD ifeq ($(shell uname -s),Linux) ifeq ($(findstring android,$(CC)),) ifneq ($(filter avx,$(shell grep -o 'avx[^ ]*' /proc/cpuinfo 2>/dev/null | head -1)),) - CFLAGS += -mavx -DSQLITE_VEC_ENABLE_AVX + CFLAGS += -mavx -mavx2 -DSQLITE_VEC_ENABLE_AVX endif endif endif diff --git a/sqlite-vec.c b/sqlite-vec.c index cb597dd..f239d47 100644 --- a/sqlite-vec.c +++ b/sqlite-vec.c @@ -708,6 +708,58 @@ static f32 distance_hamming_neon(const u8 *a, const u8 *b, size_t n_bytes) { } #endif +#ifdef SQLITE_VEC_ENABLE_AVX +/** + * AVX2 Hamming distance using VPSHUFB-based popcount. + * Processes 32 bytes (256 bits) per iteration. + */ +static f32 distance_hamming_avx2(const u8 *a, const u8 *b, size_t n_bytes) { + const u8 *pEnd = a + n_bytes; + + // VPSHUFB lookup table: popcount of low nibble + const __m256i lookup = _mm256_setr_epi8( + 0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4, + 0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4); + const __m256i low_mask = _mm256_set1_epi8(0x0f); + + __m256i acc = _mm256_setzero_si256(); + + while (a <= pEnd - 32) { + __m256i va = _mm256_loadu_si256((const __m256i *)a); + __m256i vb = _mm256_loadu_si256((const __m256i *)b); + __m256i xored = _mm256_xor_si256(va, vb); + + // VPSHUFB popcount: split into nibbles, lookup each + __m256i lo = _mm256_and_si256(xored, low_mask); + __m256i hi = _mm256_and_si256(_mm256_srli_epi16(xored, 4), low_mask); + __m256i popcnt = _mm256_add_epi8(_mm256_shuffle_epi8(lookup, lo), + _mm256_shuffle_epi8(lookup, hi)); + + // Horizontal sum: u8 -> u64 via sad against zero + acc = _mm256_add_epi64(acc, _mm256_sad_epu8(popcnt, _mm256_setzero_si256())); + a += 32; + b += 32; + } + + // Horizontal sum of 4 x u64 lanes + u64 tmp[4]; + _mm256_storeu_si256((__m256i *)tmp, acc); + u32 sum = (u32)(tmp[0] + tmp[1] + tmp[2] + tmp[3]); + + // Scalar tail + while (a < pEnd) { + u8 x = *a ^ *b; + x = x - ((x >> 1) & 0x55); + x = (x & 0x33) + ((x >> 2) & 0x33); + sum += (x + (x >> 4)) & 0x0F; + a++; + b++; + } + + return (f32)sum; +} +#endif + static f32 distance_hamming_u8(u8 *a, u8 *b, size_t n) { int same = 0; for (unsigned long i = 0; i < n; i++) { @@ -762,6 +814,11 @@ static f32 distance_hamming(const void *a, const void *b, const void *d) { return distance_hamming_neon((const u8 *)a, (const u8 *)b, n_bytes); } #endif +#ifdef SQLITE_VEC_ENABLE_AVX + if (n_bytes >= 32) { + return distance_hamming_avx2((const u8 *)a, (const u8 *)b, n_bytes); + } +#endif if ((dimensions % 64) == 0) { return distance_hamming_u64((const u8 *)a, (const u8 *)b, n_bytes / sizeof(u64));