mirror of
https://github.com/samvallad33/vestige.git
synced 2026-05-17 18:35:17 +02:00
Initial commit: Vestige v1.0.0 - Cognitive memory MCP server
FSRS-6 spaced repetition, spreading activation, synaptic tagging, hippocampal indexing, and 130 years of memory research. Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
commit
f9c60eb5a7
169 changed files with 97206 additions and 0 deletions
573
tests/e2e/src/mocks/fixtures.rs
Normal file
573
tests/e2e/src/mocks/fixtures.rs
Normal file
|
|
@ -0,0 +1,573 @@
|
|||
//! Test Data Factory
|
||||
//!
|
||||
//! Provides utilities for generating realistic test data:
|
||||
//! - Memory nodes with various properties
|
||||
//! - Batch generation for stress testing
|
||||
//! - Pre-built scenarios for common test cases
|
||||
|
||||
use chrono::{DateTime, Duration, Utc};
|
||||
use vestige_core::{KnowledgeNode, Rating, Storage};
|
||||
|
||||
/// Helper to create IngestInput (works around non_exhaustive)
|
||||
fn make_ingest_input(
|
||||
content: String,
|
||||
node_type: String,
|
||||
tags: Vec<String>,
|
||||
sentiment_score: f64,
|
||||
sentiment_magnitude: f64,
|
||||
source: Option<String>,
|
||||
valid_from: Option<DateTime<Utc>>,
|
||||
valid_until: Option<DateTime<Utc>>,
|
||||
) -> vestige_core::IngestInput {
|
||||
let mut input = vestige_core::IngestInput::default();
|
||||
input.content = content;
|
||||
input.node_type = node_type;
|
||||
input.tags = tags;
|
||||
input.sentiment_score = sentiment_score;
|
||||
input.sentiment_magnitude = sentiment_magnitude;
|
||||
input.source = source;
|
||||
input.valid_from = valid_from;
|
||||
input.valid_until = valid_until;
|
||||
input
|
||||
}
|
||||
|
||||
/// Namespace for test-data constructors.
///
/// All functionality is exposed as associated functions; the type carries no
/// state. The constructors ingest memories into a caller-supplied `Storage`
/// and hand back the created nodes, their IDs, or a `TestScenario`.
///
/// # Example
///
/// ```rust,ignore
/// let mut storage = Storage::new(Some(path))?;
///
/// // One memory
/// let node = TestDataFactory::create_memory(&mut storage, "test content");
///
/// // Many memories
/// let ids = TestDataFactory::create_batch(&mut storage, 100);
///
/// // A pre-built scenario
/// let scenario = TestDataFactory::create_decay_scenario(&mut storage);
/// ```
pub struct TestDataFactory;
|
||||
|
||||
/// Configuration for batch memory generation
#[derive(Debug, Clone)]
pub struct BatchConfig {
    /// Number of memories to create
    pub count: usize,
    /// Node type applied to every memory. When `None`, the batch generator
    /// cycles deterministically through a fixed list of node types based on
    /// the item index (it is not random).
    pub node_type: Option<String>,
    /// Base content prefix; the item index is appended to it
    pub content_prefix: String,
    /// Tags to apply to every memory
    pub tags: Vec<String>,
    /// Whether to derive a sentiment score/magnitude from the item index
    pub with_sentiment: bool,
    /// Whether to attach temporal validity windows
    pub with_temporal: bool,
}
|
||||
|
||||
impl Default for BatchConfig {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
count: 10,
|
||||
node_type: None,
|
||||
content_prefix: "Test memory".to_string(),
|
||||
tags: vec![],
|
||||
with_sentiment: false,
|
||||
with_temporal: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// A group of related nodes produced by one of the scenario constructors.
#[derive(Debug)]
pub struct TestScenario {
    /// IDs of the nodes that were created, in creation order
    pub node_ids: Vec<String>,
    /// Human-readable summary of what the scenario contains
    pub description: String,
    /// Free-form key/value pairs that tests can assert against
    /// (the scenario constructors store node IDs or counts here)
    pub metadata: std::collections::HashMap<String, String>,
}
|
||||
|
||||
impl TestDataFactory {
|
||||
// ========================================================================
|
||||
// SINGLE MEMORY CREATION
|
||||
// ========================================================================
|
||||
|
||||
/// Create a simple memory with content
|
||||
pub fn create_memory(storage: &mut Storage, content: &str) -> Option<KnowledgeNode> {
|
||||
let input = make_ingest_input(
|
||||
content.to_string(),
|
||||
"fact".to_string(),
|
||||
vec![],
|
||||
0.0,
|
||||
0.0,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
);
|
||||
storage.ingest(input).ok()
|
||||
}
|
||||
|
||||
/// Create a memory with full configuration
|
||||
pub fn create_memory_full(
|
||||
storage: &mut Storage,
|
||||
content: &str,
|
||||
node_type: &str,
|
||||
source: Option<&str>,
|
||||
tags: Vec<&str>,
|
||||
sentiment_score: f64,
|
||||
sentiment_magnitude: f64,
|
||||
) -> Option<KnowledgeNode> {
|
||||
let input = make_ingest_input(
|
||||
content.to_string(),
|
||||
node_type.to_string(),
|
||||
tags.iter().map(|s| s.to_string()).collect(),
|
||||
sentiment_score,
|
||||
sentiment_magnitude,
|
||||
source.map(String::from),
|
||||
None,
|
||||
None,
|
||||
);
|
||||
storage.ingest(input).ok()
|
||||
}
|
||||
|
||||
/// Create a memory with temporal validity
|
||||
pub fn create_temporal_memory(
|
||||
storage: &mut Storage,
|
||||
content: &str,
|
||||
valid_from: Option<DateTime<Utc>>,
|
||||
valid_until: Option<DateTime<Utc>>,
|
||||
) -> Option<KnowledgeNode> {
|
||||
let input = make_ingest_input(
|
||||
content.to_string(),
|
||||
"fact".to_string(),
|
||||
vec![],
|
||||
0.0,
|
||||
0.0,
|
||||
None,
|
||||
valid_from,
|
||||
valid_until,
|
||||
);
|
||||
storage.ingest(input).ok()
|
||||
}
|
||||
|
||||
/// Create an emotional memory
|
||||
pub fn create_emotional_memory(
|
||||
storage: &mut Storage,
|
||||
content: &str,
|
||||
sentiment: f64,
|
||||
magnitude: f64,
|
||||
) -> Option<KnowledgeNode> {
|
||||
let input = make_ingest_input(
|
||||
content.to_string(),
|
||||
"event".to_string(),
|
||||
vec![],
|
||||
sentiment,
|
||||
magnitude,
|
||||
None,
|
||||
None,
|
||||
None,
|
||||
);
|
||||
storage.ingest(input).ok()
|
||||
}
|
||||
|
||||
// ========================================================================
|
||||
// BATCH CREATION
|
||||
// ========================================================================
|
||||
|
||||
/// Create a batch of memories
|
||||
pub fn create_batch(storage: &mut Storage, count: usize) -> Vec<String> {
|
||||
Self::create_batch_with_config(storage, BatchConfig { count, ..Default::default() })
|
||||
}
|
||||
|
||||
/// Create a batch with custom configuration
|
||||
pub fn create_batch_with_config(storage: &mut Storage, config: BatchConfig) -> Vec<String> {
|
||||
let node_types = ["fact", "concept", "procedure", "event", "code"];
|
||||
let mut ids = Vec::with_capacity(config.count);
|
||||
|
||||
for i in 0..config.count {
|
||||
let node_type = config
|
||||
.node_type
|
||||
.clone()
|
||||
.unwrap_or_else(|| node_types[i % node_types.len()].to_string());
|
||||
|
||||
let sentiment_score = if config.with_sentiment {
|
||||
((i as f64) / (config.count as f64) * 2.0) - 1.0
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
|
||||
let sentiment_magnitude = if config.with_sentiment {
|
||||
(i as f64) / (config.count as f64)
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
|
||||
let (valid_from, valid_until) = if config.with_temporal {
|
||||
let now = Utc::now();
|
||||
if i % 3 == 0 {
|
||||
(Some(now - Duration::days(30)), Some(now + Duration::days(30)))
|
||||
} else if i % 3 == 1 {
|
||||
(Some(now - Duration::days(60)), Some(now - Duration::days(30)))
|
||||
} else {
|
||||
(None, None)
|
||||
}
|
||||
} else {
|
||||
(None, None)
|
||||
};
|
||||
|
||||
let input = make_ingest_input(
|
||||
format!("{} {}", config.content_prefix, i),
|
||||
node_type,
|
||||
config.tags.clone(),
|
||||
sentiment_score,
|
||||
sentiment_magnitude,
|
||||
None,
|
||||
valid_from,
|
||||
valid_until,
|
||||
);
|
||||
|
||||
if let Ok(node) = storage.ingest(input) {
|
||||
ids.push(node.id);
|
||||
}
|
||||
}
|
||||
|
||||
ids
|
||||
}
|
||||
|
||||
// ========================================================================
|
||||
// SCENARIO CREATION
|
||||
// ========================================================================
|
||||
|
||||
/// Create a scenario for testing memory decay
|
||||
pub fn create_decay_scenario(storage: &mut Storage) -> TestScenario {
|
||||
let mut ids = Vec::new();
|
||||
let mut metadata = std::collections::HashMap::new();
|
||||
|
||||
// High stability memory (should decay slowly)
|
||||
let high_stab = Self::create_memory_full(
|
||||
storage,
|
||||
"Well-learned fact about photosynthesis",
|
||||
"fact",
|
||||
Some("biology textbook"),
|
||||
vec!["biology", "science"],
|
||||
0.3,
|
||||
0.5,
|
||||
);
|
||||
if let Some(node) = high_stab {
|
||||
metadata.insert("high_stability".to_string(), node.id.clone());
|
||||
ids.push(node.id);
|
||||
}
|
||||
|
||||
// Low stability memory (should decay quickly)
|
||||
let low_stab = Self::create_memory(storage, "Random fact I just learned");
|
||||
if let Some(node) = low_stab {
|
||||
metadata.insert("low_stability".to_string(), node.id.clone());
|
||||
ids.push(node.id);
|
||||
}
|
||||
|
||||
// Emotional memory (decay should be affected by sentiment)
|
||||
let emotional = Self::create_emotional_memory(
|
||||
storage,
|
||||
"Important life event",
|
||||
0.9,
|
||||
0.95,
|
||||
);
|
||||
if let Some(node) = emotional {
|
||||
metadata.insert("emotional".to_string(), node.id.clone());
|
||||
ids.push(node.id);
|
||||
}
|
||||
|
||||
TestScenario {
|
||||
node_ids: ids,
|
||||
description: "Decay testing scenario with varied stability".to_string(),
|
||||
metadata,
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a scenario for testing review scheduling
|
||||
pub fn create_scheduling_scenario(storage: &mut Storage) -> TestScenario {
|
||||
let mut ids = Vec::new();
|
||||
let mut metadata = std::collections::HashMap::new();
|
||||
|
||||
// New card (never reviewed)
|
||||
let new_card = Self::create_memory(storage, "Brand new memory");
|
||||
if let Some(node) = new_card {
|
||||
metadata.insert("new".to_string(), node.id.clone());
|
||||
ids.push(node.id);
|
||||
}
|
||||
|
||||
// Learning card (few reviews)
|
||||
if let Some(node) = Self::create_memory(storage, "Learning memory") {
|
||||
let _ = storage.mark_reviewed(&node.id, Rating::Good);
|
||||
metadata.insert("learning".to_string(), node.id.clone());
|
||||
ids.push(node.id);
|
||||
}
|
||||
|
||||
// Review card (many reviews)
|
||||
if let Some(node) = Self::create_memory(storage, "Well-reviewed memory") {
|
||||
for _ in 0..5 {
|
||||
let _ = storage.mark_reviewed(&node.id, Rating::Good);
|
||||
}
|
||||
metadata.insert("review".to_string(), node.id.clone());
|
||||
ids.push(node.id);
|
||||
}
|
||||
|
||||
// Relearning card (had lapses)
|
||||
if let Some(node) = Self::create_memory(storage, "Struggling memory") {
|
||||
let _ = storage.mark_reviewed(&node.id, Rating::Good);
|
||||
let _ = storage.mark_reviewed(&node.id, Rating::Again);
|
||||
metadata.insert("relearning".to_string(), node.id.clone());
|
||||
ids.push(node.id);
|
||||
}
|
||||
|
||||
TestScenario {
|
||||
node_ids: ids,
|
||||
description: "Scheduling scenario with cards in different learning states".to_string(),
|
||||
metadata,
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a scenario for testing search
|
||||
pub fn create_search_scenario(storage: &mut Storage) -> TestScenario {
|
||||
let mut ids = Vec::new();
|
||||
let mut metadata = std::collections::HashMap::new();
|
||||
|
||||
// Programming memories
|
||||
for content in [
|
||||
"Rust programming language uses ownership for memory safety",
|
||||
"Python is great for data science and machine learning",
|
||||
"JavaScript runs in web browsers and Node.js",
|
||||
] {
|
||||
if let Some(node) = Self::create_memory_full(
|
||||
storage,
|
||||
content,
|
||||
"fact",
|
||||
Some("programming docs"),
|
||||
vec!["programming", "code"],
|
||||
0.0,
|
||||
0.0,
|
||||
) {
|
||||
ids.push(node.id);
|
||||
}
|
||||
}
|
||||
metadata.insert("programming_count".to_string(), "3".to_string());
|
||||
|
||||
// Science memories
|
||||
for content in [
|
||||
"Mitochondria is the powerhouse of the cell",
|
||||
"DNA contains genetic information",
|
||||
"Gravity is the force of attraction between masses",
|
||||
] {
|
||||
if let Some(node) = Self::create_memory_full(
|
||||
storage,
|
||||
content,
|
||||
"fact",
|
||||
Some("science textbook"),
|
||||
vec!["science"],
|
||||
0.0,
|
||||
0.0,
|
||||
) {
|
||||
ids.push(node.id);
|
||||
}
|
||||
}
|
||||
metadata.insert("science_count".to_string(), "3".to_string());
|
||||
|
||||
// Recipe memories
|
||||
for content in [
|
||||
"To make pasta, boil water and add salt",
|
||||
"Chocolate cake requires cocoa powder and eggs",
|
||||
] {
|
||||
if let Some(node) = Self::create_memory_full(
|
||||
storage,
|
||||
content,
|
||||
"procedure",
|
||||
Some("cookbook"),
|
||||
vec!["cooking", "recipes"],
|
||||
0.0,
|
||||
0.0,
|
||||
) {
|
||||
ids.push(node.id);
|
||||
}
|
||||
}
|
||||
metadata.insert("recipe_count".to_string(), "2".to_string());
|
||||
|
||||
TestScenario {
|
||||
node_ids: ids,
|
||||
description: "Search scenario with categorized content".to_string(),
|
||||
metadata,
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a scenario for testing temporal queries
|
||||
pub fn create_temporal_scenario(storage: &mut Storage) -> TestScenario {
|
||||
let now = Utc::now();
|
||||
let mut ids = Vec::new();
|
||||
let mut metadata = std::collections::HashMap::new();
|
||||
|
||||
// Currently valid
|
||||
if let Some(node) = Self::create_temporal_memory(
|
||||
storage,
|
||||
"Currently valid memory",
|
||||
Some(now - Duration::days(10)),
|
||||
Some(now + Duration::days(10)),
|
||||
) {
|
||||
metadata.insert("current".to_string(), node.id.clone());
|
||||
ids.push(node.id);
|
||||
}
|
||||
|
||||
// Expired
|
||||
if let Some(node) = Self::create_temporal_memory(
|
||||
storage,
|
||||
"Expired memory",
|
||||
Some(now - Duration::days(60)),
|
||||
Some(now - Duration::days(30)),
|
||||
) {
|
||||
metadata.insert("expired".to_string(), node.id.clone());
|
||||
ids.push(node.id);
|
||||
}
|
||||
|
||||
// Future
|
||||
if let Some(node) = Self::create_temporal_memory(
|
||||
storage,
|
||||
"Future memory",
|
||||
Some(now + Duration::days(30)),
|
||||
Some(now + Duration::days(60)),
|
||||
) {
|
||||
metadata.insert("future".to_string(), node.id.clone());
|
||||
ids.push(node.id);
|
||||
}
|
||||
|
||||
// No bounds (always valid)
|
||||
if let Some(node) = Self::create_temporal_memory(
|
||||
storage,
|
||||
"Always valid memory",
|
||||
None,
|
||||
None,
|
||||
) {
|
||||
metadata.insert("always_valid".to_string(), node.id.clone());
|
||||
ids.push(node.id);
|
||||
}
|
||||
|
||||
TestScenario {
|
||||
node_ids: ids,
|
||||
description: "Temporal scenario with different validity periods".to_string(),
|
||||
metadata,
|
||||
}
|
||||
}
|
||||
|
||||
// ========================================================================
|
||||
// UTILITY METHODS
|
||||
// ========================================================================
|
||||
|
||||
/// Picks a node type from a fixed nine-entry table.
///
/// The choice is fully determined by `seed` (`seed` modulo the table
/// length), so equal seeds always yield the same type.
pub fn random_node_type(seed: usize) -> &'static str {
    const TYPES: [&str; 9] = [
        "fact",
        "concept",
        "procedure",
        "event",
        "relationship",
        "quote",
        "code",
        "question",
        "insight",
    ];

    let slot = seed % TYPES.len();
    TYPES[slot]
}
|
||||
|
||||
/// Generate lorem ipsum-like content
///
/// Returns `words` space-separated words drawn from a fixed 20-word
/// vocabulary. Word `i` is the vocabulary entry at `(seed + 7 * i)` modulo
/// the vocabulary length, so the output is fully determined by the
/// arguments. `words == 0` yields an empty string.
///
/// Any `seed` is accepted, including values near `usize::MAX`: both operands
/// are reduced modulo the vocabulary length before they are combined, so the
/// index arithmetic cannot overflow.
pub fn lorem_content(words: usize, seed: usize) -> String {
    const WORDS: [&str; 20] = [
        "the", "memory", "learning", "knowledge", "algorithm",
        "data", "system", "process", "function", "method",
        "class", "object", "variable", "constant", "type",
        "structure", "pattern", "design", "architecture", "code",
    ];

    let len = WORDS.len();
    let start = seed % len;

    (0..words)
        // (start + 7 * (i mod len)) mod len == (seed + 7 * i) mod len,
        // with every intermediate value bounded by 8 * len.
        .map(|i| WORDS[(start + (i % len) * 7) % len])
        .collect::<Vec<_>>()
        .join(" ")
}
|
||||
|
||||
/// Generate tags
///
/// Returns `count` tags taken consecutively (wrapping around) from a fixed
/// ten-entry list, starting at position `seed` modulo the list length. When
/// `count` exceeds the list length, tags repeat.
///
/// Any `seed` is accepted, including values near `usize::MAX`: both operands
/// are reduced modulo the list length before they are added, so the index
/// arithmetic cannot overflow.
pub fn generate_tags(count: usize, seed: usize) -> Vec<String> {
    const TAGS: [&str; 10] = [
        "important", "review", "todo", "concept", "fact",
        "code", "note", "idea", "question", "reference",
    ];

    let len = TAGS.len();
    let start = seed % len;

    (0..count)
        .map(|i| TAGS[(start + i % len) % len].to_string())
        .collect()
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::{tempdir, TempDir};

    /// Opens a `Storage` backed by `test.db` inside a fresh temporary directory.
    ///
    /// The `TempDir` guard is returned together with the storage because the
    /// directory is removed as soon as the guard is dropped; callers must keep
    /// it alive for as long as they use the storage. The guard comes first in
    /// the tuple so that, when destructured into two locals, the storage is
    /// dropped before the directory is removed.
    fn create_test_storage() -> (TempDir, Storage) {
        let dir = tempdir().unwrap();
        let db_path = dir.path().join("test.db");
        let storage = Storage::new(Some(db_path)).unwrap();
        (dir, storage)
    }

    #[test]
    fn test_create_memory() {
        let (_dir, mut storage) = create_test_storage();
        let node = TestDataFactory::create_memory(&mut storage, "test content");

        assert!(node.is_some());
        assert_eq!(node.unwrap().content, "test content");
    }

    #[test]
    fn test_create_batch() {
        let (_dir, mut storage) = create_test_storage();
        let ids = TestDataFactory::create_batch(&mut storage, 10);

        assert_eq!(ids.len(), 10);

        let stats = storage.get_stats().unwrap();
        assert_eq!(stats.total_nodes, 10);
    }

    #[test]
    fn test_create_decay_scenario() {
        let (_dir, mut storage) = create_test_storage();
        let scenario = TestDataFactory::create_decay_scenario(&mut storage);

        assert!(!scenario.node_ids.is_empty());
        assert!(scenario.metadata.contains_key("high_stability"));
        assert!(scenario.metadata.contains_key("low_stability"));
        assert!(scenario.metadata.contains_key("emotional"));
    }

    #[test]
    fn test_create_scheduling_scenario() {
        let (_dir, mut storage) = create_test_storage();
        let scenario = TestDataFactory::create_scheduling_scenario(&mut storage);

        assert!(!scenario.node_ids.is_empty());
        assert!(scenario.metadata.contains_key("new"));
        assert!(scenario.metadata.contains_key("learning"));
        assert!(scenario.metadata.contains_key("review"));
        assert!(scenario.metadata.contains_key("relearning"));
    }

    #[test]
    fn test_lorem_content() {
        let content = TestDataFactory::lorem_content(10, 42);
        let words: Vec<_> = content.split_whitespace().collect();

        assert_eq!(words.len(), 10);
    }

    #[test]
    fn test_generate_tags() {
        let tags = TestDataFactory::generate_tags(5, 0);

        assert_eq!(tags.len(), 5);
        assert!(tags.iter().all(|t| !t.is_empty()));
    }
}
|
||||
377
tests/e2e/src/mocks/mock_embedding.rs
Normal file
377
tests/e2e/src/mocks/mock_embedding.rs
Normal file
|
|
@ -0,0 +1,377 @@
|
|||
//! Mock Embedding Service using FxHash
|
||||
//!
|
||||
//! Provides deterministic embeddings for testing without requiring
|
||||
//! the actual fastembed model. Uses FxHash for fast, consistent hashing.
|
||||
//!
|
||||
//! Key properties:
|
||||
//! - Deterministic: Same input always produces same embedding
|
||||
//! - Fast: No ML model loading/inference
|
||||
//! - Semantic similarity: Similar strings produce similar embeddings
|
||||
//! - Normalized: All embeddings have unit length
|
||||
|
||||
use std::collections::HashMap;
|
||||
|
||||
/// Length of every vector produced by the mock embedding service
/// (chosen to match BGE-base-en-v1.5).
pub const MOCK_EMBEDDING_DIM: usize = 768;
|
||||
|
||||
/// Byte-at-a-time FxHash-style hash (fast, non-cryptographic).
///
/// The state starts at the multiplier constant; each byte is mixed in by
/// rotating the state left by 5 bits, XOR-ing the byte, and multiplying by
/// the constant with wrap-around. Empty input returns the constant itself.
fn fx_hash(data: &[u8]) -> u64 {
    const SEED: u64 = 0x517cc1b727220a95;

    data.iter().fold(SEED, |state, &byte| {
        (state.rotate_left(5) ^ u64::from(byte)).wrapping_mul(SEED)
    })
}
|
||||
|
||||
/// Mock embedding service for testing
///
/// Produces deterministic embeddings based on text content using FxHash.
/// Designed to approximate real embedding behavior:
/// - Similar texts produce similar embeddings
/// - Different texts produce different embeddings
/// - Embeddings are normalized to unit length
///
/// # Example
///
/// ```rust,ignore
/// // `embed` caches its results and therefore needs a mutable service.
/// let mut service = MockEmbeddingService::new();
///
/// let emb1 = service.embed("hello world");
/// let emb2 = service.embed("hello world");
/// let emb3 = service.embed("goodbye world");
///
/// // Same input = same output
/// assert_eq!(emb1, emb2);
///
/// // Different input = different output
/// assert_ne!(emb1, emb3);
///
/// // But similar inputs have higher similarity
/// let sim_same = service.cosine_similarity(&emb1, &emb2);
/// let sim_diff = service.cosine_similarity(&emb1, &emb3);
/// assert!(sim_same > sim_diff);
/// ```
pub struct MockEmbeddingService {
    /// Cache for computed embeddings, keyed by the exact input text
    cache: HashMap<String, Vec<f32>>,
    /// Whether to use word-level hashing for better semantic similarity
    semantic_mode: bool,
}
|
||||
|
||||
impl Default for MockEmbeddingService {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
impl MockEmbeddingService {
|
||||
/// Create a new mock embedding service
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
cache: HashMap::new(),
|
||||
semantic_mode: true,
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a service without semantic mode (pure hash-based)
|
||||
pub fn new_simple() -> Self {
|
||||
Self {
|
||||
cache: HashMap::new(),
|
||||
semantic_mode: false,
|
||||
}
|
||||
}
|
||||
|
||||
/// Embed text into a vector
|
||||
pub fn embed(&mut self, text: &str) -> Vec<f32> {
|
||||
// Check cache first
|
||||
if let Some(cached) = self.cache.get(text) {
|
||||
return cached.clone();
|
||||
}
|
||||
|
||||
let embedding = if self.semantic_mode {
|
||||
self.semantic_embed(text)
|
||||
} else {
|
||||
self.simple_embed(text)
|
||||
};
|
||||
|
||||
self.cache.insert(text.to_string(), embedding.clone());
|
||||
embedding
|
||||
}
|
||||
|
||||
/// Simple hash-based embedding
|
||||
fn simple_embed(&self, text: &str) -> Vec<f32> {
|
||||
let mut embedding = vec![0.0f32; MOCK_EMBEDDING_DIM];
|
||||
let normalized = text.to_lowercase();
|
||||
|
||||
// Use multiple hash seeds for different dimensions
|
||||
for (i, chunk) in embedding.chunks_mut(64).enumerate() {
|
||||
let seed_text = format!("{}:{}", i, normalized);
|
||||
let hash = fx_hash(seed_text.as_bytes());
|
||||
|
||||
for (j, val) in chunk.iter_mut().enumerate() {
|
||||
// Generate pseudo-random float from hash
|
||||
let shifted = hash.rotate_left((j * 5) as u32);
|
||||
*val = ((shifted as f32 / u64::MAX as f32) * 2.0) - 1.0;
|
||||
}
|
||||
}
|
||||
|
||||
normalize(&mut embedding);
|
||||
embedding
|
||||
}
|
||||
|
||||
/// Semantic-aware embedding (word-level hashing)
|
||||
fn semantic_embed(&self, text: &str) -> Vec<f32> {
|
||||
let mut embedding = vec![0.0f32; MOCK_EMBEDDING_DIM];
|
||||
let normalized = text.to_lowercase();
|
||||
|
||||
// Tokenize into words
|
||||
let words: Vec<&str> = normalized
|
||||
.split(|c: char| !c.is_alphanumeric())
|
||||
.filter(|w| !w.is_empty())
|
||||
.collect();
|
||||
|
||||
if words.is_empty() {
|
||||
// Fall back to simple embedding for empty text
|
||||
return self.simple_embed(text);
|
||||
}
|
||||
|
||||
// Each word contributes to the embedding
|
||||
for word in &words {
|
||||
let word_hash = fx_hash(word.as_bytes());
|
||||
|
||||
// Map word to a sparse set of dimensions
|
||||
for i in 0..16 {
|
||||
let dim = ((word_hash >> (i * 4)) as usize) % MOCK_EMBEDDING_DIM;
|
||||
let sign = if (word_hash >> (i + 48)) & 1 == 0 { 1.0 } else { -1.0 };
|
||||
let magnitude = ((word_hash >> (i * 2)) as f32 % 100.0) / 100.0 + 0.5;
|
||||
embedding[dim] += sign * magnitude;
|
||||
}
|
||||
}
|
||||
|
||||
// Add position-aware component for word order sensitivity
|
||||
for (pos, word) in words.iter().enumerate() {
|
||||
let pos_hash = fx_hash(format!("{}:{}", pos, word).as_bytes());
|
||||
let dim = (pos_hash as usize) % MOCK_EMBEDDING_DIM;
|
||||
let weight = 1.0 / (pos as f32 + 1.0);
|
||||
embedding[dim] += weight;
|
||||
}
|
||||
|
||||
// Add character n-gram features for subword similarity
|
||||
let chars: Vec<char> = normalized.chars().collect();
|
||||
for i in 0..chars.len().saturating_sub(2) {
|
||||
let trigram: String = chars[i..i + 3].iter().collect();
|
||||
let hash = fx_hash(trigram.as_bytes());
|
||||
let dim = (hash as usize) % MOCK_EMBEDDING_DIM;
|
||||
embedding[dim] += 0.1;
|
||||
}
|
||||
|
||||
normalize(&mut embedding);
|
||||
embedding
|
||||
}
|
||||
|
||||
/// Calculate cosine similarity between two embeddings
|
||||
pub fn cosine_similarity(&self, a: &[f32], b: &[f32]) -> f32 {
|
||||
if a.len() != b.len() {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
let dot: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
|
||||
let norm_a: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
|
||||
let norm_b: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();
|
||||
|
||||
if norm_a == 0.0 || norm_b == 0.0 {
|
||||
return 0.0;
|
||||
}
|
||||
|
||||
dot / (norm_a * norm_b)
|
||||
}
|
||||
|
||||
/// Calculate euclidean distance between two embeddings
|
||||
pub fn euclidean_distance(&self, a: &[f32], b: &[f32]) -> f32 {
|
||||
if a.len() != b.len() {
|
||||
return f32::MAX;
|
||||
}
|
||||
|
||||
a.iter()
|
||||
.zip(b.iter())
|
||||
.map(|(x, y)| (x - y).powi(2))
|
||||
.sum::<f32>()
|
||||
.sqrt()
|
||||
}
|
||||
|
||||
/// Find most similar embedding from a set
|
||||
pub fn find_most_similar<'a>(
|
||||
&self,
|
||||
query: &[f32],
|
||||
candidates: &'a [(String, Vec<f32>)],
|
||||
) -> Option<(&'a str, f32)> {
|
||||
candidates
|
||||
.iter()
|
||||
.map(|(id, emb)| (id.as_str(), self.cosine_similarity(query, emb)))
|
||||
.max_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal))
|
||||
}
|
||||
|
||||
/// Clear the embedding cache
|
||||
pub fn clear_cache(&mut self) {
|
||||
self.cache.clear();
|
||||
}
|
||||
|
||||
/// Get cache size
|
||||
pub fn cache_size(&self) -> usize {
|
||||
self.cache.len()
|
||||
}
|
||||
|
||||
/// Check if service is ready (always true for mock)
|
||||
pub fn is_ready(&self) -> bool {
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
/// Scales `v` in place to unit Euclidean length.
///
/// A vector whose length is not strictly positive (for example all zeros)
/// is left untouched.
fn normalize(v: &mut [f32]) {
    let length = v.iter().map(|x| x * x).sum::<f32>().sqrt();

    if length > 0.0 {
        v.iter_mut().for_each(|component| *component /= length);
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Euclidean length of an embedding, used by the normalization checks.
    fn length_of(embedding: &[f32]) -> f32 {
        embedding.iter().map(|x| x * x).sum::<f32>().sqrt()
    }

    /// Embedding the same text twice yields identical vectors.
    #[test]
    fn test_deterministic_embedding() {
        let mut svc = MockEmbeddingService::new();

        let first = svc.embed("hello world");
        let second = svc.embed("hello world");

        assert_eq!(first, second);
    }

    /// Unrelated texts yield different vectors.
    #[test]
    fn test_different_texts_different_embeddings() {
        let mut svc = MockEmbeddingService::new();

        let hello = svc.embed("hello world");
        let goodbye = svc.embed("goodbye universe");

        assert_ne!(hello, goodbye);
    }

    /// Vectors have the advertised dimensionality.
    #[test]
    fn test_embedding_dimension() {
        let mut svc = MockEmbeddingService::new();

        assert_eq!(svc.embed("test text").len(), MOCK_EMBEDDING_DIM);
    }

    /// Semantic-mode vectors have unit length.
    #[test]
    fn test_normalized_embeddings() {
        let mut svc = MockEmbeddingService::new();
        let vector = svc.embed("test normalization");

        assert!((length_of(&vector) - 1.0).abs() < 0.001);
    }

    /// Sentences sharing most words score higher than unrelated ones.
    #[test]
    fn test_semantic_similarity() {
        let mut svc = MockEmbeddingService::new();

        let dog = svc.embed("the dog runs fast");
        let cat = svc.embed("the cat runs fast");
        let unrelated = svc.embed("machine learning algorithms");

        let close = svc.cosine_similarity(&dog, &cat);
        let far = svc.cosine_similarity(&dog, &unrelated);

        // Similar sentences should have higher similarity
        assert!(close > far);
    }

    /// Cosine similarity stays within its mathematical bounds.
    #[test]
    fn test_cosine_similarity_range() {
        let mut svc = MockEmbeddingService::new();

        let one = svc.embed("test one");
        let two = svc.embed("test two");
        let score = svc.cosine_similarity(&one, &two);

        // Cosine similarity should be in [-1, 1]
        assert!((-1.0..=1.0).contains(&score));
    }

    /// A vector compared with itself scores 1.
    #[test]
    fn test_self_similarity() {
        let mut svc = MockEmbeddingService::new();
        let vector = svc.embed("self similarity test");

        let score = svc.cosine_similarity(&vector, &vector);
        assert!((score - 1.0).abs() < 0.001);
    }

    /// The cache grows per distinct text and can be emptied.
    #[test]
    fn test_caching() {
        let mut svc = MockEmbeddingService::new();
        assert_eq!(svc.cache_size(), 0);

        svc.embed("text one");
        assert_eq!(svc.cache_size(), 1);

        // Repeating a text must be served from the cache.
        svc.embed("text one");
        assert_eq!(svc.cache_size(), 1);

        svc.embed("text two");
        assert_eq!(svc.cache_size(), 2);

        svc.clear_cache();
        assert_eq!(svc.cache_size(), 0);
    }

    /// The best match for a programming query is a programming document.
    #[test]
    fn test_find_most_similar() {
        let mut svc = MockEmbeddingService::new();

        let query = svc.embed("programming code");
        let corpus = vec![
            ("doc1".to_string(), svc.embed("python programming language")),
            ("doc2".to_string(), svc.embed("cooking recipes")),
            ("doc3".to_string(), svc.embed("software development code")),
        ];

        let best = svc.find_most_similar(&query, &corpus);
        assert!(best.is_some());

        // Should find a programming-related document
        let (winner, _) = best.unwrap();
        assert!(winner == "doc1" || winner == "doc3");
    }

    /// Empty input still yields a full-size vector.
    #[test]
    fn test_empty_text() {
        let mut svc = MockEmbeddingService::new();

        assert_eq!(svc.embed("").len(), MOCK_EMBEDDING_DIM);
    }

    /// Simple mode yields full-size, unit-length vectors.
    #[test]
    fn test_simple_mode() {
        let mut svc = MockEmbeddingService::new_simple();
        let vector = svc.embed("test simple mode");

        assert_eq!(vector.len(), MOCK_EMBEDDING_DIM);

        // Verify normalization
        assert!((length_of(&vector) - 1.0).abs() < 0.001);
    }
}
|
||||
11
tests/e2e/src/mocks/mod.rs
Normal file
11
tests/e2e/src/mocks/mod.rs
Normal file
|
|
@ -0,0 +1,11 @@
|
|||
//! Mock Services Module
|
||||
//!
|
||||
//! Provides mock implementations for testing:
|
||||
//! - `MockEmbeddingService` - Deterministic embeddings using FxHash
|
||||
//! - `TestDataFactory` - Generate test data with realistic properties
|
||||
|
||||
mod fixtures;
|
||||
mod mock_embedding;
|
||||
|
||||
pub use fixtures::TestDataFactory;
|
||||
pub use mock_embedding::MockEmbeddingService;
|
||||
Loading…
Add table
Add a link
Reference in a new issue