Add AVX2-optimized Hamming distance using VPSHUFB popcount

Implements distance_hamming_avx2() which processes 32 bytes per
iteration using the standard VPSHUFB nibble-lookup popcount pattern.
Dispatched when SQLITE_VEC_ENABLE_AVX is defined and input >= 32
bytes. Falls back to u64 scalar or u8 byte-at-a-time for smaller
inputs.

Also adds -mavx2 flag to Makefile for x86-64 targets alongside
existing -mavx.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Alex Garcia 2026-03-31 17:39:41 -07:00
parent d033bf5728
commit d684178a12
2 changed files with 59 additions and 2 deletions

View file

@ -37,7 +37,7 @@ endif
ifndef OMIT_SIMD ifndef OMIT_SIMD
ifeq ($(shell uname -sm),Darwin x86_64) ifeq ($(shell uname -sm),Darwin x86_64)
CFLAGS += -mavx -DSQLITE_VEC_ENABLE_AVX CFLAGS += -mavx -mavx2 -DSQLITE_VEC_ENABLE_AVX
endif endif
ifeq ($(shell uname -sm),Darwin arm64) ifeq ($(shell uname -sm),Darwin arm64)
CFLAGS += -mcpu=apple-m1 -DSQLITE_VEC_ENABLE_NEON CFLAGS += -mcpu=apple-m1 -DSQLITE_VEC_ENABLE_NEON
@ -45,7 +45,7 @@ ifndef OMIT_SIMD
ifeq ($(shell uname -s),Linux) ifeq ($(shell uname -s),Linux)
ifeq ($(findstring android,$(CC)),) ifeq ($(findstring android,$(CC)),)
ifneq ($(filter avx,$(shell grep -o 'avx[^ ]*' /proc/cpuinfo 2>/dev/null | head -1)),) ifneq ($(filter avx,$(shell grep -o 'avx[^ ]*' /proc/cpuinfo 2>/dev/null | head -1)),)
CFLAGS += -mavx -DSQLITE_VEC_ENABLE_AVX CFLAGS += -mavx -mavx2 -DSQLITE_VEC_ENABLE_AVX
endif endif
endif endif
endif endif

View file

@ -708,6 +708,58 @@ static f32 distance_hamming_neon(const u8 *a, const u8 *b, size_t n_bytes) {
} }
#endif #endif
#ifdef SQLITE_VEC_ENABLE_AVX
/**
* AVX2 Hamming distance using VPSHUFB-based popcount.
* Processes 32 bytes (256 bits) per iteration.
*/
static f32 distance_hamming_avx2(const u8 *a, const u8 *b, size_t n_bytes) {
const u8 *pEnd = a + n_bytes;
// VPSHUFB lookup table: popcount of low nibble
const __m256i lookup = _mm256_setr_epi8(
0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,
0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4);
const __m256i low_mask = _mm256_set1_epi8(0x0f);
__m256i acc = _mm256_setzero_si256();
while (a <= pEnd - 32) {
__m256i va = _mm256_loadu_si256((const __m256i *)a);
__m256i vb = _mm256_loadu_si256((const __m256i *)b);
__m256i xored = _mm256_xor_si256(va, vb);
// VPSHUFB popcount: split into nibbles, lookup each
__m256i lo = _mm256_and_si256(xored, low_mask);
__m256i hi = _mm256_and_si256(_mm256_srli_epi16(xored, 4), low_mask);
__m256i popcnt = _mm256_add_epi8(_mm256_shuffle_epi8(lookup, lo),
_mm256_shuffle_epi8(lookup, hi));
// Horizontal sum: u8 -> u64 via sad against zero
acc = _mm256_add_epi64(acc, _mm256_sad_epu8(popcnt, _mm256_setzero_si256()));
a += 32;
b += 32;
}
// Horizontal sum of 4 x u64 lanes
u64 tmp[4];
_mm256_storeu_si256((__m256i *)tmp, acc);
u32 sum = (u32)(tmp[0] + tmp[1] + tmp[2] + tmp[3]);
// Scalar tail
while (a < pEnd) {
u8 x = *a ^ *b;
x = x - ((x >> 1) & 0x55);
x = (x & 0x33) + ((x >> 2) & 0x33);
sum += (x + (x >> 4)) & 0x0F;
a++;
b++;
}
return (f32)sum;
}
#endif
static f32 distance_hamming_u8(u8 *a, u8 *b, size_t n) { static f32 distance_hamming_u8(u8 *a, u8 *b, size_t n) {
int same = 0; int same = 0;
for (unsigned long i = 0; i < n; i++) { for (unsigned long i = 0; i < n; i++) {
@ -762,6 +814,11 @@ static f32 distance_hamming(const void *a, const void *b, const void *d) {
return distance_hamming_neon((const u8 *)a, (const u8 *)b, n_bytes); return distance_hamming_neon((const u8 *)a, (const u8 *)b, n_bytes);
} }
#endif #endif
#ifdef SQLITE_VEC_ENABLE_AVX
if (n_bytes >= 32) {
return distance_hamming_avx2((const u8 *)a, (const u8 *)b, n_bytes);
}
#endif
if ((dimensions % 64) == 0) { if ((dimensions % 64) == 0) {
return distance_hamming_u64((const u8 *)a, (const u8 *)b, n_bytes / sizeof(u64)); return distance_hamming_u64((const u8 *)a, (const u8 *)b, n_bytes / sizeof(u64));