mirror of
https://github.com/syntrex-lab/gomcp.git
synced 2026-04-28 05:46:22 +02:00
103 lines
2.3 KiB
Go
103 lines
2.3 KiB
Go
package vectorstore
|
|
|
|
import (
|
|
"context"
|
|
"math"
|
|
"strings"
|
|
"unicode/utf8"
|
|
)
|
|
|
|
// FTS5Embedder is a pure-Go fallback embedder that builds character n-gram
// frequency vectors instead of neural embeddings. It needs no external
// dependencies.
//
// Quality is lower than MiniLM but sufficient for basic intent matching.
// Used when the ONNX runtime is not available → [ORACLE: DEGRADED].
type FTS5Embedder struct {
	ngramSize int // Length, in runes, of each character n-gram window.
	dimension int // Number of hash buckets, i.e. the length of the output vector.
}
|
|
|
|
// NewFTS5Embedder creates a fallback embedder with character n-grams.
|
|
// Uses tri-grams (n=3) projected to a fixed dimension via hashing.
|
|
func NewFTS5Embedder() *FTS5Embedder {
|
|
return &FTS5Embedder{
|
|
ngramSize: 3,
|
|
dimension: 128, // Hash-projected dimension.
|
|
}
|
|
}
|
|
|
|
// Embed generates a character n-gram frequency vector.
|
|
// Text is lowercased, split into n-grams, each hashed to a bucket.
|
|
func (e *FTS5Embedder) Embed(_ context.Context, text string) ([]float64, error) {
|
|
text = strings.ToLower(strings.TrimSpace(text))
|
|
if text == "" {
|
|
return make([]float64, e.dimension), nil
|
|
}
|
|
|
|
vec := make([]float64, e.dimension)
|
|
runes := []rune(text)
|
|
|
|
// Generate character n-grams and hash into buckets.
|
|
count := 0
|
|
for i := 0; i <= len(runes)-e.ngramSize; i++ {
|
|
ngram := string(runes[i : i+e.ngramSize])
|
|
bucket := fnvHash(ngram) % uint32(e.dimension)
|
|
vec[bucket]++
|
|
count++
|
|
}
|
|
|
|
// Also add word-level features for better discrimination.
|
|
words := strings.Fields(text)
|
|
for _, w := range words {
|
|
if utf8.RuneCountInString(w) >= 2 {
|
|
bucket := fnvHash("w:"+w) % uint32(e.dimension)
|
|
vec[bucket]++
|
|
count++
|
|
}
|
|
}
|
|
|
|
// L2-normalize the vector.
|
|
if count > 0 {
|
|
var norm float64
|
|
for _, v := range vec {
|
|
norm += v * v
|
|
}
|
|
norm = math.Sqrt(norm)
|
|
if norm > 0 {
|
|
for i := range vec {
|
|
vec[i] /= norm
|
|
}
|
|
}
|
|
}
|
|
|
|
return vec, nil
|
|
}
|
|
|
|
// Dimension returns the fixed output dimension (128).
|
|
func (e *FTS5Embedder) Dimension() int {
|
|
return e.dimension
|
|
}
|
|
|
|
// Name returns the embedder identifier.
|
|
func (e *FTS5Embedder) Name() string {
|
|
return "fts5:trigram-128d"
|
|
}
|
|
|
|
// Mode returns DEGRADED — this is a fallback embedder.
|
|
func (e *FTS5Embedder) Mode() OracleMode {
|
|
return OracleModeDegraded
|
|
}
|
|
|
|
// fnvHash computes the 32-bit FNV-1a hash of the bytes of s.
func fnvHash(s string) uint32 {
	const (
		offset32 uint32 = 2166136261 // FNV 32-bit offset basis.
		prime32  uint32 = 16777619   // FNV 32-bit prime.
	)

	h := offset32
	for i := 0; i < len(s); i++ {
		// FNV-1a order: XOR the byte in first, then multiply by the prime.
		h ^= uint32(s[i])
		h *= prime32
	}
	return h
}
|