mirror of
https://github.com/samvallad33/vestige.git
synced 2026-04-30 11:26:22 +02:00
Initial commit: Vestige v1.0.0 - Cognitive memory MCP server
FSRS-6 spaced repetition, spreading activation, synaptic tagging, hippocampal indexing, and 130 years of memory research. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
commit
f9c60eb5a7
169 changed files with 97206 additions and 0 deletions
377
tests/e2e/src/mocks/mock_embedding.rs
Normal file
377
tests/e2e/src/mocks/mock_embedding.rs
Normal file
|
|
@ -0,0 +1,377 @@
|
|||
//! Mock Embedding Service using FxHash
|
||||
//!
|
||||
//! Provides deterministic embeddings for testing without requiring
|
||||
//! the actual fastembed model. Uses FxHash for fast, consistent hashing.
|
||||
//!
|
||||
//! Key properties:
|
||||
//! - Deterministic: Same input always produces same embedding
|
||||
//! - Fast: No ML model loading/inference
|
||||
//! - Semantic similarity: Similar strings produce similar embeddings
|
||||
//! - Normalized: All embeddings have unit length
|
||||
|
||||
use std::collections::HashMap;
|
||||
|
||||
/// Dimensions for mock embeddings (matches BGE-base-en-v1.5)
|
||||
pub const MOCK_EMBEDDING_DIM: usize = 768;
|
||||
|
||||
/// FxHash implementation (fast, non-cryptographic hash)
|
||||
/// Based on Firefox's hash function
|
||||
fn fx_hash(data: &[u8]) -> u64 {
|
||||
const SEED: u64 = 0x517cc1b727220a95;
|
||||
let mut hash = SEED;
|
||||
for &byte in data {
|
||||
hash = hash.rotate_left(5) ^ (byte as u64);
|
||||
hash = hash.wrapping_mul(SEED);
|
||||
}
|
||||
hash
|
||||
}
|
||||
|
||||
/// Mock embedding service for testing
|
||||
///
|
||||
/// Produces deterministic embeddings based on text content using FxHash.
|
||||
/// Designed to approximate real embedding behavior:
|
||||
/// - Similar texts produce similar embeddings
|
||||
/// - Different texts produce different embeddings
|
||||
/// - Embeddings are normalized to unit length
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```rust,ignore
|
||||
/// let service = MockEmbeddingService::new();
|
||||
///
|
||||
/// let emb1 = service.embed("hello world");
|
||||
/// let emb2 = service.embed("hello world");
|
||||
/// let emb3 = service.embed("goodbye world");
|
||||
///
|
||||
/// // Same input = same output
|
||||
/// assert_eq!(emb1, emb2);
|
||||
///
|
||||
/// // Different input = different output
|
||||
/// assert_ne!(emb1, emb3);
|
||||
///
|
||||
/// // But similar inputs have higher similarity
|
||||
/// let sim_same = service.cosine_similarity(&emb1, &emb2);
|
||||
/// let sim_diff = service.cosine_similarity(&emb1, &emb3);
|
||||
/// assert!(sim_same > sim_diff);
|
||||
/// ```
|
||||
pub struct MockEmbeddingService {
|
||||
/// Cache for computed embeddings
|
||||
cache: HashMap<String, Vec<f32>>,
|
||||
/// Whether to use word-level hashing for better semantic similarity
|
||||
semantic_mode: bool,
|
||||
}
|
||||
|
||||
impl Default for MockEmbeddingService {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl MockEmbeddingService {
|
||||
/// Create a new mock embedding service
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
cache: HashMap::new(),
|
||||
semantic_mode: true,
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a service without semantic mode (pure hash-based)
|
||||
pub fn new_simple() -> Self {
|
||||
Self {
|
||||
cache: HashMap::new(),
|
||||
semantic_mode: false,
|
||||
}
|
||||
}
|
||||
|
||||
/// Embed text into a vector
|
||||
pub fn embed(&mut self, text: &str) -> Vec<f32> {
|
||||
// Check cache first
|
||||
if let Some(cached) = self.cache.get(text) {
|
||||
return cached.clone();
|
||||
}
|
||||
|
||||
let embedding = if self.semantic_mode {
|
||||
self.semantic_embed(text)
|
||||
} else {
|
||||
self.simple_embed(text)
|
||||
};
|
||||
|
||||
self.cache.insert(text.to_string(), embedding.clone());
|
||||
embedding
|
||||
}
|
||||
|
||||
/// Simple hash-based embedding
|
||||
fn simple_embed(&self, text: &str) -> Vec<f32> {
|
||||
let mut embedding = vec![0.0f32; MOCK_EMBEDDING_DIM];
|
||||
let normalized = text.to_lowercase();
|
||||
|
||||
// Use multiple hash seeds for different dimensions
|
||||
for (i, chunk) in embedding.chunks_mut(64).enumerate() {
|
||||
let seed_text = format!("{}:{}", i, normalized);
|
||||
let hash = fx_hash(seed_text.as_bytes());
|
||||
|
||||
for (j, val) in chunk.iter_mut().enumerate() {
|
||||
// Generate pseudo-random float from hash
|
||||
let shifted = hash.rotate_left((j * 5) as u32);
|
||||
*val = ((shifted as f32 / u64::MAX as f32) * 2.0) - 1.0;
|
||||
}
|
||||
}
|
||||
|
||||
normalize(&mut embedding);
|
||||
embedding
|
||||
}
|
||||
|
||||
/// Semantic-aware embedding (word-level hashing)
|
||||
fn semantic_embed(&self, text: &str) -> Vec<f32> {
|
||||
let mut embedding = vec![0.0f32; MOCK_EMBEDDING_DIM];
|
||||
let normalized = text.to_lowercase();
|
||||
|
||||
// Tokenize into words
|
||||
let words: Vec<&str> = normalized
|
||||
.split(|c: char| !c.is_alphanumeric())
|
||||
.filter(|w| !w.is_empty())
|
||||
.collect();
|
||||
|
||||
if words.is_empty() {
|
||||
// Fall back to simple embedding for empty text
|
||||
return self.simple_embed(text);
|
||||
}
|
||||
|
||||
// Each word contributes to the embedding
|
||||
for word in &words {
|
||||
let word_hash = fx_hash(word.as_bytes());
|
||||
|
||||
// Map word to a sparse set of dimensions
|
||||
for i in 0..16 {
|
||||
let dim = ((word_hash >> (i * 4)) as usize) % MOCK_EMBEDDING_DIM;
|
||||
let sign = if (word_hash >> (i + 48)) & 1 == 0 { 1.0 } else { -1.0 };
|
||||
let magnitude = ((word_hash >> (i * 2)) as f32 % 100.0) / 100.0 + 0.5;
|
||||
embedding[dim] += sign * magnitude;
|
||||
}
|
||||
}
|
||||
|
||||
// Add position-aware component for word order sensitivity
|
||||
for (pos, word) in words.iter().enumerate() {
|
||||
let pos_hash = fx_hash(format!("{}:{}", pos, word).as_bytes());
|
||||
let dim = (pos_hash as usize) % MOCK_EMBEDDING_DIM;
|
||||
let weight = 1.0 / (pos as f32 + 1.0);
|
||||
embedding[dim] += weight;
|
||||
}
|
||||
|
||||
// Add character n-gram features for subword similarity
|
||||
let chars: Vec<char> = normalized.chars().collect();
|
||||
for i in 0..chars.len().saturating_sub(2) {
|
||||
let trigram: String = chars[i..i + 3].iter().collect();
|
||||
let hash = fx_hash(trigram.as_bytes());
|
||||
let dim = (hash as usize) % MOCK_EMBEDDING_DIM;
|
||||
embedding[dim] += 0.1;
|
||||
}
|
||||
|
||||
normalize(&mut embedding);
|
||||
embedding
|
||||
}
|
||||
|
||||
/// Calculate cosine similarity between two embeddings
|
||||
pub fn cosine_similarity(&self, a: &[f32], b: &[f32]) -> f32 {
|
||||
if a.len() != b.len() {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
let dot: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
|
||||
let norm_a: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
|
||||
let norm_b: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();
|
||||
|
||||
if norm_a == 0.0 || norm_b == 0.0 {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
dot / (norm_a * norm_b)
|
||||
}
|
||||
|
||||
/// Calculate euclidean distance between two embeddings
|
||||
pub fn euclidean_distance(&self, a: &[f32], b: &[f32]) -> f32 {
|
||||
if a.len() != b.len() {
|
||||
return f32::MAX;
|
||||
}
|
||||
|
||||
a.iter()
|
||||
.zip(b.iter())
|
||||
.map(|(x, y)| (x - y).powi(2))
|
||||
.sum::<f32>()
|
||||
.sqrt()
|
||||
}
|
||||
|
||||
/// Find most similar embedding from a set
|
||||
pub fn find_most_similar<'a>(
|
||||
&self,
|
||||
query: &[f32],
|
||||
candidates: &'a [(String, Vec<f32>)],
|
||||
) -> Option<(&'a str, f32)> {
|
||||
candidates
|
||||
.iter()
|
||||
.map(|(id, emb)| (id.as_str(), self.cosine_similarity(query, emb)))
|
||||
.max_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal))
|
||||
}
|
||||
|
||||
/// Clear the embedding cache
|
||||
pub fn clear_cache(&mut self) {
|
||||
self.cache.clear();
|
||||
}
|
||||
|
||||
/// Get cache size
|
||||
pub fn cache_size(&self) -> usize {
|
||||
self.cache.len()
|
||||
}
|
||||
|
||||
/// Check if service is ready (always true for mock)
|
||||
pub fn is_ready(&self) -> bool {
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
/// Normalize a vector to unit length
|
||||
fn normalize(v: &mut [f32]) {
|
||||
let norm: f32 = v.iter().map(|x| x * x).sum::<f32>().sqrt();
|
||||
if norm > 0.0 {
|
||||
for x in v.iter_mut() {
|
||||
*x /= norm;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_deterministic_embedding() {
|
||||
let mut service = MockEmbeddingService::new();
|
||||
|
||||
let emb1 = service.embed("hello world");
|
||||
let emb2 = service.embed("hello world");
|
||||
|
||||
assert_eq!(emb1, emb2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_different_texts_different_embeddings() {
|
||||
let mut service = MockEmbeddingService::new();
|
||||
|
||||
let emb1 = service.embed("hello world");
|
||||
let emb2 = service.embed("goodbye universe");
|
||||
|
||||
assert_ne!(emb1, emb2);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_embedding_dimension() {
|
||||
let mut service = MockEmbeddingService::new();
|
||||
let emb = service.embed("test text");
|
||||
|
||||
assert_eq!(emb.len(), MOCK_EMBEDDING_DIM);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_normalized_embeddings() {
|
||||
let mut service = MockEmbeddingService::new();
|
||||
let emb = service.embed("test normalization");
|
||||
|
||||
let norm: f32 = emb.iter().map(|x| x * x).sum::<f32>().sqrt();
|
||||
assert!((norm - 1.0).abs() < 0.001);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_semantic_similarity() {
|
||||
let mut service = MockEmbeddingService::new();
|
||||
|
||||
let emb_dog = service.embed("the dog runs fast");
|
||||
let emb_cat = service.embed("the cat runs fast");
|
||||
let emb_car = service.embed("machine learning algorithms");
|
||||
|
||||
let sim_animals = service.cosine_similarity(&emb_dog, &emb_cat);
|
||||
let sim_different = service.cosine_similarity(&emb_dog, &emb_car);
|
||||
|
||||
// Similar sentences should have higher similarity
|
||||
assert!(sim_animals > sim_different);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_cosine_similarity_range() {
|
||||
let mut service = MockEmbeddingService::new();
|
||||
|
||||
let emb1 = service.embed("test one");
|
||||
let emb2 = service.embed("test two");
|
||||
|
||||
let sim = service.cosine_similarity(&emb1, &emb2);
|
||||
|
||||
// Cosine similarity should be in [-1, 1]
|
||||
assert!(sim >= -1.0 && sim <= 1.0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_self_similarity() {
|
||||
let mut service = MockEmbeddingService::new();
|
||||
let emb = service.embed("self similarity test");
|
||||
|
||||
let sim = service.cosine_similarity(&emb, &emb);
|
||||
assert!((sim - 1.0).abs() < 0.001);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_caching() {
|
||||
let mut service = MockEmbeddingService::new();
|
||||
assert_eq!(service.cache_size(), 0);
|
||||
|
||||
service.embed("text one");
|
||||
assert_eq!(service.cache_size(), 1);
|
||||
|
||||
service.embed("text one"); // Should use cache
|
||||
assert_eq!(service.cache_size(), 1);
|
||||
|
||||
service.embed("text two");
|
||||
assert_eq!(service.cache_size(), 2);
|
||||
|
||||
service.clear_cache();
|
||||
assert_eq!(service.cache_size(), 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_find_most_similar() {
|
||||
let mut service = MockEmbeddingService::new();
|
||||
|
||||
let query = service.embed("programming code");
|
||||
let candidates = vec![
|
||||
("doc1".to_string(), service.embed("python programming language")),
|
||||
("doc2".to_string(), service.embed("cooking recipes")),
|
||||
("doc3".to_string(), service.embed("software development code")),
|
||||
];
|
||||
|
||||
let result = service.find_most_similar(&query, &candidates);
|
||||
assert!(result.is_some());
|
||||
|
||||
// Should find a programming-related document
|
||||
let (id, _) = result.unwrap();
|
||||
assert!(id == "doc1" || id == "doc3");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_empty_text() {
|
||||
let mut service = MockEmbeddingService::new();
|
||||
let emb = service.embed("");
|
||||
|
||||
assert_eq!(emb.len(), MOCK_EMBEDDING_DIM);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_simple_mode() {
|
||||
let mut service = MockEmbeddingService::new_simple();
|
||||
let emb = service.embed("test simple mode");
|
||||
|
||||
assert_eq!(emb.len(), MOCK_EMBEDDING_DIM);
|
||||
|
||||
// Verify normalization
|
||||
let norm: f32 = emb.iter().map(|x| x * x).sum::<f32>().sqrt();
|
||||
assert!((norm - 1.0).abs() < 0.001);
|
||||
}
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue