mirror of
https://github.com/asg017/sqlite-vec.git
synced 2026-04-25 08:46:49 +02:00
Add AVX2-optimized Hamming distance using VPSHUFB popcount
Implements distance_hamming_avx2() which processes 32 bytes per iteration using the standard VPSHUFB nibble-lookup popcount pattern. Dispatched when SQLITE_VEC_ENABLE_AVX is defined and input >= 32 bytes. Falls back to u64 scalar or u8 byte-at-a-time for smaller inputs. Also adds -mavx2 flag to Makefile for x86-64 targets alongside existing -mavx. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
d033bf5728
commit
d684178a12
2 changed files with 59 additions and 2 deletions
4
Makefile
4
Makefile
|
|
@ -37,7 +37,7 @@ endif
|
||||||
|
|
||||||
ifndef OMIT_SIMD
|
ifndef OMIT_SIMD
|
||||||
ifeq ($(shell uname -sm),Darwin x86_64)
|
ifeq ($(shell uname -sm),Darwin x86_64)
|
||||||
CFLAGS += -mavx -DSQLITE_VEC_ENABLE_AVX
|
CFLAGS += -mavx -mavx2 -DSQLITE_VEC_ENABLE_AVX
|
||||||
endif
|
endif
|
||||||
ifeq ($(shell uname -sm),Darwin arm64)
|
ifeq ($(shell uname -sm),Darwin arm64)
|
||||||
CFLAGS += -mcpu=apple-m1 -DSQLITE_VEC_ENABLE_NEON
|
CFLAGS += -mcpu=apple-m1 -DSQLITE_VEC_ENABLE_NEON
|
||||||
|
|
@ -45,7 +45,7 @@ ifndef OMIT_SIMD
|
||||||
ifeq ($(shell uname -s),Linux)
|
ifeq ($(shell uname -s),Linux)
|
||||||
ifeq ($(findstring android,$(CC)),)
|
ifeq ($(findstring android,$(CC)),)
|
||||||
ifneq ($(filter avx,$(shell grep -o 'avx[^ ]*' /proc/cpuinfo 2>/dev/null | head -1)),)
|
ifneq ($(filter avx,$(shell grep -o 'avx[^ ]*' /proc/cpuinfo 2>/dev/null | head -1)),)
|
||||||
CFLAGS += -mavx -DSQLITE_VEC_ENABLE_AVX
|
CFLAGS += -mavx -mavx2 -DSQLITE_VEC_ENABLE_AVX
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
|
||||||
57
sqlite-vec.c
57
sqlite-vec.c
|
|
@ -708,6 +708,58 @@ static f32 distance_hamming_neon(const u8 *a, const u8 *b, size_t n_bytes) {
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef SQLITE_VEC_ENABLE_AVX
|
||||||
|
/**
|
||||||
|
* AVX2 Hamming distance using VPSHUFB-based popcount.
|
||||||
|
* Processes 32 bytes (256 bits) per iteration.
|
||||||
|
*/
|
||||||
|
static f32 distance_hamming_avx2(const u8 *a, const u8 *b, size_t n_bytes) {
|
||||||
|
const u8 *pEnd = a + n_bytes;
|
||||||
|
|
||||||
|
// VPSHUFB lookup table: popcount of low nibble
|
||||||
|
const __m256i lookup = _mm256_setr_epi8(
|
||||||
|
0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,
|
||||||
|
0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4);
|
||||||
|
const __m256i low_mask = _mm256_set1_epi8(0x0f);
|
||||||
|
|
||||||
|
__m256i acc = _mm256_setzero_si256();
|
||||||
|
|
||||||
|
while (a <= pEnd - 32) {
|
||||||
|
__m256i va = _mm256_loadu_si256((const __m256i *)a);
|
||||||
|
__m256i vb = _mm256_loadu_si256((const __m256i *)b);
|
||||||
|
__m256i xored = _mm256_xor_si256(va, vb);
|
||||||
|
|
||||||
|
// VPSHUFB popcount: split into nibbles, lookup each
|
||||||
|
__m256i lo = _mm256_and_si256(xored, low_mask);
|
||||||
|
__m256i hi = _mm256_and_si256(_mm256_srli_epi16(xored, 4), low_mask);
|
||||||
|
__m256i popcnt = _mm256_add_epi8(_mm256_shuffle_epi8(lookup, lo),
|
||||||
|
_mm256_shuffle_epi8(lookup, hi));
|
||||||
|
|
||||||
|
// Horizontal sum: u8 -> u64 via sad against zero
|
||||||
|
acc = _mm256_add_epi64(acc, _mm256_sad_epu8(popcnt, _mm256_setzero_si256()));
|
||||||
|
a += 32;
|
||||||
|
b += 32;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Horizontal sum of 4 x u64 lanes
|
||||||
|
u64 tmp[4];
|
||||||
|
_mm256_storeu_si256((__m256i *)tmp, acc);
|
||||||
|
u32 sum = (u32)(tmp[0] + tmp[1] + tmp[2] + tmp[3]);
|
||||||
|
|
||||||
|
// Scalar tail
|
||||||
|
while (a < pEnd) {
|
||||||
|
u8 x = *a ^ *b;
|
||||||
|
x = x - ((x >> 1) & 0x55);
|
||||||
|
x = (x & 0x33) + ((x >> 2) & 0x33);
|
||||||
|
sum += (x + (x >> 4)) & 0x0F;
|
||||||
|
a++;
|
||||||
|
b++;
|
||||||
|
}
|
||||||
|
|
||||||
|
return (f32)sum;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
static f32 distance_hamming_u8(u8 *a, u8 *b, size_t n) {
|
static f32 distance_hamming_u8(u8 *a, u8 *b, size_t n) {
|
||||||
int same = 0;
|
int same = 0;
|
||||||
for (unsigned long i = 0; i < n; i++) {
|
for (unsigned long i = 0; i < n; i++) {
|
||||||
|
|
@ -762,6 +814,11 @@ static f32 distance_hamming(const void *a, const void *b, const void *d) {
|
||||||
return distance_hamming_neon((const u8 *)a, (const u8 *)b, n_bytes);
|
return distance_hamming_neon((const u8 *)a, (const u8 *)b, n_bytes);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
#ifdef SQLITE_VEC_ENABLE_AVX
|
||||||
|
if (n_bytes >= 32) {
|
||||||
|
return distance_hamming_avx2((const u8 *)a, (const u8 *)b, n_bytes);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
if ((dimensions % 64) == 0) {
|
if ((dimensions % 64) == 0) {
|
||||||
return distance_hamming_u64((const u8 *)a, (const u8 *)b, n_bytes / sizeof(u64));
|
return distance_hamming_u64((const u8 *)a, (const u8 *)b, n_bytes / sizeof(u64));
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue