gomcp/internal/domain/vectorstore/fts5_embedder.go

103 lines
2.3 KiB
Go

package vectorstore
import (
"context"
"math"
"strings"
"unicode/utf8"
)
// FTS5Embedder is a pure-Go fallback embedder that builds vectors from
// character n-gram frequencies instead of neural embeddings, so it requires
// no external dependencies.
//
// Quality is lower than MiniLM but sufficient for basic intent matching.
// It is used when the ONNX runtime is not available → [ORACLE: DEGRADED].
type FTS5Embedder struct {
// ngramSize is the number of runes in each character n-gram window.
ngramSize int
// dimension is the length of every output vector, and therefore the number
// of hash buckets that features are projected into.
dimension int
}
// NewFTS5Embedder returns a fallback embedder configured for character
// tri-grams (n=3) that are hash-projected into a 128-dimensional vector.
func NewFTS5Embedder() *FTS5Embedder {
	embedder := new(FTS5Embedder)
	embedder.ngramSize = 3   // Tri-grams.
	embedder.dimension = 128 // Hash-projected dimension.
	return embedder
}
// Embed maps text to an L2-normalized feature vector of length e.dimension.
//
// The input is trimmed and lowercased, then two kinds of features are hashed
// with FNV-1a (modulo the dimension) into buckets and counted:
//   - every window of e.ngramSize consecutive runes, and
//   - every whitespace-separated word of at least two runes, prefixed with
//     "w:" so that it hashes differently from an n-gram with the same text.
//
// Input that yields no features (empty after trimming, or shorter than one
// n-gram with no qualifying word) produces an all-zero vector. The context
// argument is ignored and the returned error is always nil.
func (e *FTS5Embedder) Embed(_ context.Context, text string) ([]float64, error) {
	out := make([]float64, e.dimension)

	normalized := strings.ToLower(strings.TrimSpace(text))
	if normalized == "" {
		return out, nil
	}

	buckets := uint32(e.dimension)

	// Character-level features: slide a window of ngramSize runes (not bytes)
	// across the text so multi-byte characters are never split.
	chars := []rune(normalized)
	for end := e.ngramSize; end <= len(chars); end++ {
		gram := string(chars[end-e.ngramSize : end])
		out[fnvHash(gram)%buckets]++
	}

	// Word-level features sharpen discrimination between texts that share
	// many n-grams; single-rune words are skipped.
	for _, word := range strings.Fields(normalized) {
		if utf8.RuneCountInString(word) < 2 {
			continue
		}
		out[fnvHash("w:"+word)%buckets]++
	}

	// Scale to unit L2 length. A zero length means no feature was recorded,
	// in which case the vector is left as all zeros.
	sumSquares := 0.0
	for _, x := range out {
		sumSquares += x * x
	}
	if length := math.Sqrt(sumSquares); length > 0 {
		for i := range out {
			out[i] /= length
		}
	}
	return out, nil
}
// Dimension returns the length of the vectors produced by Embed, i.e. the
// embedder's configured dimension (128 when built with NewFTS5Embedder).
func (e *FTS5Embedder) Dimension() int {
return e.dimension
}
// Name returns the embedder identifier "fts5:trigram-128d".
// The string is hard-coded; it matches the tri-gram, 128-dimension
// configuration set by NewFTS5Embedder rather than being derived from the
// struct fields.
func (e *FTS5Embedder) Name() string {
return "fts5:trigram-128d"
}
// Mode always returns OracleModeDegraded, marking this embedder as the
// lower-quality fallback rather than the neural embedder.
func (e *FTS5Embedder) Mode() OracleMode {
return OracleModeDegraded
}
// fnvHash returns the 32-bit FNV-1a hash of s, computed over its raw bytes:
// starting from the offset basis, each byte is XORed into the state and the
// state is then multiplied by the FNV prime (wrapping modulo 2^32).
func fnvHash(s string) uint32 {
	const (
		fnvOffsetBasis uint32 = 2166136261
		fnvPrime       uint32 = 16777619
	)
	sum := fnvOffsetBasis
	for _, b := range []byte(s) {
		sum = (sum ^ uint32(b)) * fnvPrime
	}
	return sum
}