// Mirror of https://github.com/syntrex-lab/gomcp.git
// (synced 2026-04-27 05:16:22 +02:00; 405 lines, 10 KiB, Go).
|
|
package vectorstore
|
|||
|
|
|
|||
|
|
import (
|
|||
|
|
"fmt"
|
|||
|
|
"math"
|
|||
|
|
"math/rand"
|
|||
|
|
"testing"
|
|||
|
|
|
|||
|
|
"github.com/stretchr/testify/assert"
|
|||
|
|
"github.com/stretchr/testify/require"
|
|||
|
|
)
|
|||
|
|
|
|||
|
|
// --- PolarQuant Core Tests ---
|
|||
|
|
|
|||
|
|
func TestPolarQuant_EncodeDecode_Deterministic(t *testing.T) {
|
|||
|
|
codec := NewPolarQuantCodec(128, 4, 42)
|
|||
|
|
vec := pqRandomVector(128, 1)
|
|||
|
|
|
|||
|
|
cv1 := codec.Encode(vec)
|
|||
|
|
cv2 := codec.Encode(vec)
|
|||
|
|
|
|||
|
|
assert.Equal(t, cv1.Data, cv2.Data, "same input → same compressed data")
|
|||
|
|
assert.Equal(t, cv1.Radius, cv2.Radius, "same input → same radius")
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
func TestPolarQuant_RoundTrip_4bit(t *testing.T) {
|
|||
|
|
codec := NewPolarQuantCodec(128, 4, 42)
|
|||
|
|
vec := pqRandomVector(128, 1)
|
|||
|
|
|
|||
|
|
cv := codec.Encode(vec)
|
|||
|
|
reconstructed := codec.Decode(cv)
|
|||
|
|
|
|||
|
|
// 4-bit quantization on 128-dim: ~91% avg cosine (empirically measured).
|
|||
|
|
// Quantization noise is higher at d=128 vs d=3 due to more dimensions.
|
|||
|
|
l2err := l2Error(vec, reconstructed)
|
|||
|
|
assert.Less(t, l2err, 0.50, "4-bit roundtrip L2 error should be < 50%%, got %.4f", l2err)
|
|||
|
|
|
|||
|
|
cosSim := CosineSimilarity(vec, reconstructed)
|
|||
|
|
assert.Greater(t, cosSim, 0.90, "4-bit roundtrip cosine similarity should be > 0.90, got %.4f", cosSim)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
func TestPolarQuant_RoundTrip_8bit(t *testing.T) {
|
|||
|
|
codec := NewPolarQuantCodec(128, 8, 42)
|
|||
|
|
vec := pqRandomVector(128, 1)
|
|||
|
|
|
|||
|
|
cv := codec.Encode(vec)
|
|||
|
|
reconstructed := codec.Decode(cv)
|
|||
|
|
|
|||
|
|
// 8-bit quantization: expect << 1% reconstruction error.
|
|||
|
|
l2err := l2Error(vec, reconstructed)
|
|||
|
|
assert.Less(t, l2err, 0.05, "8-bit roundtrip L2 error should be < 5%%, got %.4f", l2err)
|
|||
|
|
|
|||
|
|
cosSim := CosineSimilarity(vec, reconstructed)
|
|||
|
|
assert.Greater(t, cosSim, 0.999, "8-bit roundtrip cosine similarity should be > 0.999, got %.4f", cosSim)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
func TestPolarQuant_RoundTrip_2bit(t *testing.T) {
|
|||
|
|
codec := NewPolarQuantCodec(128, 2, 42)
|
|||
|
|
vec := pqRandomVector(128, 1)
|
|||
|
|
|
|||
|
|
cv := codec.Encode(vec)
|
|||
|
|
reconstructed := codec.Decode(cv)
|
|||
|
|
|
|||
|
|
// 2-bit: coarse but should preserve general direction.
|
|||
|
|
cosSim := CosineSimilarity(vec, reconstructed)
|
|||
|
|
assert.Greater(t, cosSim, 0.70, "2-bit roundtrip cosine should be > 0.70, got %.4f", cosSim)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
func TestPolarQuant_PreservesRadius(t *testing.T) {
|
|||
|
|
codec := NewPolarQuantCodec(128, 4, 42)
|
|||
|
|
vec := pqRandomVector(128, 1)
|
|||
|
|
|
|||
|
|
// Scale vector to non-unit length.
|
|||
|
|
scaled := make([]float64, len(vec))
|
|||
|
|
for i, v := range vec {
|
|||
|
|
scaled[i] = v * 3.7
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
cv := codec.Encode(scaled)
|
|||
|
|
assert.InDelta(t, 3.7, float64(cv.Radius), 0.01, "radius should be ≈ 3.7")
|
|||
|
|
|
|||
|
|
reconstructed := codec.Decode(cv)
|
|||
|
|
// Check that the scale is preserved.
|
|||
|
|
var recNorm float64
|
|||
|
|
for _, v := range reconstructed {
|
|||
|
|
recNorm += v * v
|
|||
|
|
}
|
|||
|
|
recNorm = math.Sqrt(recNorm)
|
|||
|
|
assert.InDelta(t, 3.7, recNorm, 0.5, "reconstructed norm should be ≈ 3.7")
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
func TestPolarQuant_PreservesOrdering(t *testing.T) {
|
|||
|
|
codec := NewPolarQuantCodec(128, 4, 42)
|
|||
|
|
|
|||
|
|
query := pqRandomVector(128, 1)
|
|||
|
|
close := pqPerturbVector(query, 0.1, 2)
|
|||
|
|
far := pqPerturbVector(query, 0.9, 3)
|
|||
|
|
|
|||
|
|
cosClose := CosineSimilarity(query, close)
|
|||
|
|
cosFar := CosineSimilarity(query, far)
|
|||
|
|
require.Greater(t, cosClose, cosFar, "sanity: close > far in original space")
|
|||
|
|
|
|||
|
|
// Encode all.
|
|||
|
|
cvQ := codec.Encode(query)
|
|||
|
|
cvClose := codec.Encode(close)
|
|||
|
|
cvFar := codec.Encode(far)
|
|||
|
|
|
|||
|
|
// Compressed similarity should preserve ordering.
|
|||
|
|
compClose := codec.CompressedSimilarity(cvQ, cvClose)
|
|||
|
|
compFar := codec.CompressedSimilarity(cvQ, cvFar)
|
|||
|
|
|
|||
|
|
assert.Greater(t, compClose, compFar,
|
|||
|
|
"PolarQuant must preserve ordering: comp(query,close)=%.4f > comp(query,far)=%.4f",
|
|||
|
|
compClose, compFar)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
func TestPolarQuant_CompressedSimilarity_AccuracyVsExact(t *testing.T) {
|
|||
|
|
codec := NewPolarQuantCodec(128, 4, 42)
|
|||
|
|
|
|||
|
|
v1 := pqRandomVector(128, 10)
|
|||
|
|
v2 := pqRandomVector(128, 20)
|
|||
|
|
|
|||
|
|
exactSim := CosineSimilarity(v1, v2)
|
|||
|
|
|
|||
|
|
cv1 := codec.Encode(v1)
|
|||
|
|
cv2 := codec.Encode(v2)
|
|||
|
|
compSim := codec.CompressedSimilarity(cv1, cv2)
|
|||
|
|
|
|||
|
|
// 4-bit compressed similarity should be within ±0.1 of exact.
|
|||
|
|
assert.InDelta(t, exactSim, compSim, 0.1,
|
|||
|
|
"compressed similarity (%.4f) should be close to exact (%.4f)", compSim, exactSim)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
func TestPolarQuant_MemoryReduction_4bit(t *testing.T) {
|
|||
|
|
codec := NewPolarQuantCodec(128, 4, 42)
|
|||
|
|
|
|||
|
|
compBytes := codec.CompressedBytes() + 4 // +4 for float32 radius
|
|||
|
|
origBytes := 128 * 8 // float64
|
|||
|
|
ratio := codec.CompressionRatio()
|
|||
|
|
|
|||
|
|
assert.Equal(t, 64, codec.CompressedBytes(), "128×4bit = 512 bits = 64 bytes")
|
|||
|
|
assert.Equal(t, 68, compBytes, "total = 64 data + 4 radius = 68 bytes")
|
|||
|
|
assert.InDelta(t, float64(origBytes)/float64(compBytes), ratio, 0.1)
|
|||
|
|
assert.Greater(t, ratio, 14.0, "should be >14x compression, got %.1fx", ratio)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
func TestPolarQuant_MemoryReduction_8bit(t *testing.T) {
|
|||
|
|
codec := NewPolarQuantCodec(128, 8, 42)
|
|||
|
|
|
|||
|
|
compBytes := codec.CompressedBytes() + 4
|
|||
|
|
ratio := codec.CompressionRatio()
|
|||
|
|
|
|||
|
|
assert.Equal(t, 128, codec.CompressedBytes(), "128×8bit = 1024 bits = 128 bytes")
|
|||
|
|
assert.Equal(t, 132, compBytes)
|
|||
|
|
assert.Greater(t, ratio, 7.0, "should be >7x compression, got %.1fx", ratio)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
func TestPolarQuant_ZeroVector(t *testing.T) {
|
|||
|
|
codec := NewPolarQuantCodec(128, 4, 42)
|
|||
|
|
zero := make([]float64, 128)
|
|||
|
|
|
|||
|
|
cv := codec.Encode(zero)
|
|||
|
|
assert.InDelta(t, 0.0, float64(cv.Radius), 0.001)
|
|||
|
|
|
|||
|
|
reconstructed := codec.Decode(cv)
|
|||
|
|
for i, v := range reconstructed {
|
|||
|
|
assert.InDelta(t, 0.0, v, 0.001, "zero vector dimension %d should stay zero", i)
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
func TestPolarQuant_SmallDim(t *testing.T) {
|
|||
|
|
// Ensure PolarQuant works for small dimensions too.
|
|||
|
|
codec := NewPolarQuantCodec(3, 4, 42)
|
|||
|
|
vec := []float64{0.6, 0.8, 0.0}
|
|||
|
|
|
|||
|
|
cv := codec.Encode(vec)
|
|||
|
|
reconstructed := codec.Decode(cv)
|
|||
|
|
|
|||
|
|
cosSim := CosineSimilarity(vec, reconstructed)
|
|||
|
|
assert.Greater(t, cosSim, 0.90, "3-dim 4-bit cosine should be > 0.90, got %.4f", cosSim)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
func TestPolarQuant_DifferentSeeds(t *testing.T) {
|
|||
|
|
vec := pqRandomVector(128, 1)
|
|||
|
|
|
|||
|
|
codec1 := NewPolarQuantCodec(128, 4, 42)
|
|||
|
|
codec2 := NewPolarQuantCodec(128, 4, 99)
|
|||
|
|
|
|||
|
|
cv1 := codec1.Encode(vec)
|
|||
|
|
cv2 := codec2.Encode(vec)
|
|||
|
|
|
|||
|
|
assert.NotEqual(t, cv1.Data, cv2.Data, "different seeds → different compressed data")
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
func TestPolarQuant_BitWidthClamping(t *testing.T) {
|
|||
|
|
// bitsPerDim < 1 → clamp to 1.
|
|||
|
|
codec1 := NewPolarQuantCodec(128, 0, 42)
|
|||
|
|
assert.Equal(t, 1, codec1.BitsPerDim())
|
|||
|
|
|
|||
|
|
// bitsPerDim > 8 → clamp to 8.
|
|||
|
|
codec2 := NewPolarQuantCodec(128, 16, 42)
|
|||
|
|
assert.Equal(t, 8, codec2.BitsPerDim())
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
func TestPolarQuant_OrthogonalRotation_PreservesNorm(t *testing.T) {
|
|||
|
|
// Orthogonal matrix should preserve vector norms.
|
|||
|
|
codec := NewPolarQuantCodec(64, 8, 42)
|
|||
|
|
vec := pqRandomVector(64, 5)
|
|||
|
|
|
|||
|
|
// Manually rotate.
|
|||
|
|
rotated := make([]float64, 64)
|
|||
|
|
for i := 0; i < 64; i++ {
|
|||
|
|
var dot float64
|
|||
|
|
for j := 0; j < 64; j++ {
|
|||
|
|
dot += codec.rotation[i][j] * vec[j]
|
|||
|
|
}
|
|||
|
|
rotated[i] = dot
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
origNorm := vecNorm(vec, 64)
|
|||
|
|
rotNorm := vecNorm(rotated, 64)
|
|||
|
|
assert.InDelta(t, origNorm, rotNorm, 0.001, "rotation should preserve L2 norm")
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
func TestPolarQuant_OrthogonalRotation_Deterministic(t *testing.T) {
|
|||
|
|
c1 := NewPolarQuantCodec(32, 4, 42)
|
|||
|
|
c2 := NewPolarQuantCodec(32, 4, 42)
|
|||
|
|
|
|||
|
|
for i := 0; i < 32; i++ {
|
|||
|
|
for j := 0; j < 32; j++ {
|
|||
|
|
assert.Equal(t, c1.rotation[i][j], c2.rotation[i][j],
|
|||
|
|
"same seed → same rotation at [%d][%d]", i, j)
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// --- Batch Quality Tests ---
|
|||
|
|
|
|||
|
|
func TestPolarQuant_BatchQuality_4bit(t *testing.T) {
|
|||
|
|
codec := NewPolarQuantCodec(128, 4, 42)
|
|||
|
|
n := 100
|
|||
|
|
|
|||
|
|
var totalCosSim float64
|
|||
|
|
for i := 0; i < n; i++ {
|
|||
|
|
vec := pqRandomVector(128, int64(i))
|
|||
|
|
cv := codec.Encode(vec)
|
|||
|
|
rec := codec.Decode(cv)
|
|||
|
|
totalCosSim += CosineSimilarity(vec, rec)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
avgCos := totalCosSim / float64(n)
|
|||
|
|
assert.Greater(t, avgCos, 0.90,
|
|||
|
|
"avg cosine similarity over %d vectors should be > 0.90, got %.4f", n, avgCos)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
func TestPolarQuant_BatchOrderingPreservation(t *testing.T) {
|
|||
|
|
codec := NewPolarQuantCodec(128, 4, 42)
|
|||
|
|
n := 50
|
|||
|
|
|
|||
|
|
// For each query, verify that the top-1 nearest neighbor is preserved.
|
|||
|
|
preserved := 0
|
|||
|
|
for i := 0; i < n; i++ {
|
|||
|
|
query := pqRandomVector(128, int64(i*100))
|
|||
|
|
vectors := make([][]float64, 10)
|
|||
|
|
for j := 0; j < 10; j++ {
|
|||
|
|
vectors[j] = pqRandomVector(128, int64(i*100+j+1))
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Find exact top-1.
|
|||
|
|
bestExact := -1
|
|||
|
|
bestExactSim := -2.0
|
|||
|
|
for j, v := range vectors {
|
|||
|
|
sim := CosineSimilarity(query, v)
|
|||
|
|
if sim > bestExactSim {
|
|||
|
|
bestExactSim = sim
|
|||
|
|
bestExact = j
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// Find compressed top-1.
|
|||
|
|
cvQ := codec.Encode(query)
|
|||
|
|
bestComp := -1
|
|||
|
|
bestCompSim := -2.0
|
|||
|
|
for j, v := range vectors {
|
|||
|
|
cv := codec.Encode(v)
|
|||
|
|
sim := codec.CompressedSimilarity(cvQ, cv)
|
|||
|
|
if sim > bestCompSim {
|
|||
|
|
bestCompSim = sim
|
|||
|
|
bestComp = j
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
if bestExact == bestComp {
|
|||
|
|
preserved++
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
rate := float64(preserved) / float64(n)
|
|||
|
|
assert.Greater(t, rate, 0.55,
|
|||
|
|
"top-1 preservation rate should be > 55%%, got %.0f%% (%d/%d)", rate*100, preserved, n)
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// --- Benchmarks ---
|
|||
|
|
|
|||
|
|
func BenchmarkPolarQuant_Encode_4bit(b *testing.B) {
|
|||
|
|
codec := NewPolarQuantCodec(128, 4, 42)
|
|||
|
|
vec := pqRandomVector(128, 1)
|
|||
|
|
|
|||
|
|
b.ResetTimer()
|
|||
|
|
for i := 0; i < b.N; i++ {
|
|||
|
|
codec.Encode(vec)
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
func BenchmarkPolarQuant_Decode_4bit(b *testing.B) {
|
|||
|
|
codec := NewPolarQuantCodec(128, 4, 42)
|
|||
|
|
vec := pqRandomVector(128, 1)
|
|||
|
|
cv := codec.Encode(vec)
|
|||
|
|
|
|||
|
|
b.ResetTimer()
|
|||
|
|
for i := 0; i < b.N; i++ {
|
|||
|
|
codec.Decode(cv)
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
func BenchmarkPolarQuant_CompressedSimilarity_4bit(b *testing.B) {
|
|||
|
|
codec := NewPolarQuantCodec(128, 4, 42)
|
|||
|
|
v1 := pqRandomVector(128, 1)
|
|||
|
|
v2 := pqRandomVector(128, 2)
|
|||
|
|
cv1 := codec.Encode(v1)
|
|||
|
|
cv2 := codec.Encode(v2)
|
|||
|
|
|
|||
|
|
b.ResetTimer()
|
|||
|
|
for i := 0; i < b.N; i++ {
|
|||
|
|
codec.CompressedSimilarity(cv1, cv2)
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
func BenchmarkPolarQuant_Encode_8bit(b *testing.B) {
|
|||
|
|
codec := NewPolarQuantCodec(128, 8, 42)
|
|||
|
|
vec := pqRandomVector(128, 1)
|
|||
|
|
|
|||
|
|
b.ResetTimer()
|
|||
|
|
for i := 0; i < b.N; i++ {
|
|||
|
|
codec.Encode(vec)
|
|||
|
|
}
|
|||
|
|
}
|
|||
|
|
|
|||
|
|
// --- Helpers (scoped to polarquant tests to avoid collision with qjl_test) ---
|
|||
|
|
|
|||
|
|
// pqRandomVector returns a reproducible pseudo-random vector of length dim.
// Components are drawn from a standard normal distribution using a private
// generator seeded with seed, then the vector is scaled to unit L2 norm.
// If the norm is not positive (e.g. dim == 0) the vector is returned unscaled.
func pqRandomVector(dim int, seed int64) []float64 {
	src := rand.New(rand.NewSource(seed))
	out := make([]float64, dim)

	// Draw the components and accumulate the squared length in one pass.
	sumSq := 0.0
	for i := 0; i < dim; i++ {
		x := src.NormFloat64()
		out[i] = x
		sumSq += x * x
	}

	if length := math.Sqrt(sumSq); length > 0 {
		for i := range out {
			out[i] /= length
		}
	}
	return out
}
|
|||
|
|
|
|||
|
|
// pqPerturbVector returns a new vector equal to v plus independent Gaussian
// noise (standard normal samples multiplied by noise) on every component,
// rescaled to unit L2 norm. The noise comes from a private generator seeded
// with seed, so results are reproducible; v itself is not modified. If the
// perturbed vector has no positive norm it is returned unscaled.
func pqPerturbVector(v []float64, noise float64, seed int64) []float64 {
	src := rand.New(rand.NewSource(seed))
	out := make([]float64, len(v))

	// Add the noise and accumulate the squared length in one pass.
	sumSq := 0.0
	for i, base := range v {
		x := base + noise*src.NormFloat64()
		out[i] = x
		sumSq += x * x
	}

	if length := math.Sqrt(sumSq); length > 0 {
		for i := range out {
			out[i] /= length
		}
	}
	return out
}
|
|||
|
|
|
|||
|
|
// l2Error returns the Euclidean (L2) distance between a and b, i.e. the
// square root of the sum of squared component differences. It returns +Inf
// when the two slices have different lengths.
func l2Error(a, b []float64) float64 {
	if len(a) != len(b) {
		return math.Inf(1)
	}

	acc := 0.0
	for i, x := range a {
		diff := x - b[i]
		acc += diff * diff
	}
	return math.Sqrt(acc)
}
|
|||
|
|
|
|||
|
|
// init exists only to reference the fmt package so that its import is not
// reported as unused; it has no other effect.
func init() {
	var _ = fmt.Sprint
}
|