diff --git a/crates/vestige-core/Cargo.toml b/crates/vestige-core/Cargo.toml
index 45fd40f..908f063 100644
--- a/crates/vestige-core/Cargo.toml
+++ b/crates/vestige-core/Cargo.toml
@@ -60,7 +60,7 @@ notify = "8"
 # ============================================================================
 # OPTIONAL: Embeddings (fastembed v5 - local ONNX inference, 2026 bleeding edge)
 # ============================================================================
-# BGE-base-en-v1.5: 768 dimensions, 85%+ Top-5 accuracy (vs 56% for MiniLM)
+# nomic-embed-text-v1.5: 768 dimensions, 8192 token context, Matryoshka support
 fastembed = { version = "5", optional = true }
 
 # ============================================================================
diff --git a/crates/vestige-core/src/embeddings/local.rs b/crates/vestige-core/src/embeddings/local.rs
index 254566a..98e0985 100644
--- a/crates/vestige-core/src/embeddings/local.rs
+++ b/crates/vestige-core/src/embeddings/local.rs
@@ -1,14 +1,15 @@
 //! Local Semantic Embeddings
 //!
 //! Uses fastembed v5 for local ONNX-based embedding generation.
-//! Default model: BGE-base-en-v1.5 (768 dimensions, 85%+ Top-5 accuracy)
+//! Default model: Nomic Embed Text v1.5 (768 dimensions, Matryoshka support)
 //!
 //! ## 2026 GOD TIER UPGRADE
 //!
-//! Upgraded from all-MiniLM-L6-v2 (384d, 56% accuracy) to BGE-base-en-v1.5:
-//! - +30% retrieval accuracy
-//! - 768 dimensions for richer semantic representation
+//! Upgraded to nomic-embed-text-v1.5:
+//! - 768 dimensions with Matryoshka representation learning
+//! - 8192 token context window (vs 512 for most models)
 //! - State-of-the-art MTEB benchmark performance
+//! - Fully open source with training data released
 
 use fastembed::{EmbeddingModel, InitOptions, TextEmbedding};
 use std::sync::{Mutex, OnceLock};
@@ -17,8 +18,8 @@ use std::sync::{Mutex, OnceLock};
 // CONSTANTS
 // ============================================================================
 
-/// Embedding dimensions for the default model (BGE-base-en-v1.5)
-/// Upgraded from 384 (MiniLM) to 768 (BGE) for +30% accuracy
+/// Embedding dimensions for the default model (nomic-embed-text-v1.5)
+/// 768 dimensions with Matryoshka support (can truncate to 256/512 if needed)
 pub const EMBEDDING_DIMENSIONS: usize = 768;
 
 /// Maximum text length for embedding (truncated if longer)
@@ -35,19 +36,19 @@ pub const BATCH_SIZE: usize = 32;
 static EMBEDDING_MODEL_RESULT: OnceLock<Result<Mutex<TextEmbedding>, String>> = OnceLock::new();
 
 /// Initialize the global embedding model
-/// Using BGE-base-en-v1.5 (768d) - 2026 GOD TIER upgrade from MiniLM-L6-v2
+/// Using nomic-embed-text-v1.5 (768d) - 8192 token context, Matryoshka support
 fn get_model() -> Result<&'static Mutex<TextEmbedding>, EmbeddingError> {
     let result = EMBEDDING_MODEL_RESULT.get_or_init(|| {
-        // BGE-base-en-v1.5: 768 dimensions, 85%+ Top-5 accuracy
-        // Massive upgrade from MiniLM-L6-v2 (384d, 56% accuracy)
+        // nomic-embed-text-v1.5: 768 dimensions, 8192 token context
+        // Matryoshka representation learning, fully open source
         let options =
-            InitOptions::new(EmbeddingModel::BGEBaseENV15).with_show_download_progress(true);
+            InitOptions::new(EmbeddingModel::NomicEmbedTextV15).with_show_download_progress(true);
 
         TextEmbedding::try_new(options)
             .map(Mutex::new)
             .map_err(|e| {
                 format!(
-                    "Failed to initialize BGE-base-en-v1.5 embedding model: {}. \
+                    "Failed to initialize nomic-embed-text-v1.5 embedding model: {}. \
                      Ensure ONNX runtime is available and model files can be downloaded.",
                     e
                 )
@@ -197,7 +198,7 @@ impl EmbeddingService {
 
     /// Get the model name
     pub fn model_name(&self) -> &'static str {
-        "BAAI/bge-base-en-v1.5"
+        "nomic-ai/nomic-embed-text-v1.5"
     }
 
     /// Get the embedding dimensions
diff --git a/crates/vestige-core/src/embeddings/mod.rs b/crates/vestige-core/src/embeddings/mod.rs
index dadfdd5..2dec019 100644
--- a/crates/vestige-core/src/embeddings/mod.rs
+++ b/crates/vestige-core/src/embeddings/mod.rs
@@ -4,7 +4,7 @@
 //! No external API calls required - 100% local and private.
 //!
 //! Supports:
-//! - Text embedding generation (768-dimensional vectors via BGE-base-en-v1.5)
+//! - Text embedding generation (768-dimensional vectors via nomic-embed-text-v1.5)
 //! - Cosine similarity computation
 //! - Batch embedding for efficiency
 //! - Hybrid multi-model fusion (future)
diff --git a/crates/vestige-core/src/lib.rs b/crates/vestige-core/src/lib.rs
index ce86347..f9f9236 100644
--- a/crates/vestige-core/src/lib.rs
+++ b/crates/vestige-core/src/lib.rs
@@ -4,7 +4,7 @@
 //!
 //! - **FSRS-6**: 21-parameter spaced repetition (30% more efficient than SM-2)
 //! - **Dual-Strength Model**: Bjork & Bjork (1992) storage/retrieval strength
-//! - **Semantic Embeddings**: Local fastembed v5 (BGE-base-en-v1.5, 768 dimensions)
+//! - **Semantic Embeddings**: Local fastembed v5 (nomic-embed-text-v1.5, 768 dimensions)
 //! - **HNSW Vector Search**: USearch (20x faster than FAISS)
 //! - **Temporal Memory**: Bi-temporal model with validity periods
 //! - **Hybrid Search**: RRF fusion of keyword (BM25/FTS5) + semantic
@@ -394,9 +394,9 @@ pub const VERSION: &str = env!("CARGO_PKG_VERSION");
 /// FSRS algorithm version (6 = 21 parameters)
 pub const FSRS_VERSION: u8 = 6;
 
-/// Default embedding model (2026 GOD TIER: BGE-base-en-v1.5)
-/// Upgraded from all-MiniLM-L6-v2 for +30% retrieval accuracy
-pub const DEFAULT_EMBEDDING_MODEL: &str = "BAAI/bge-base-en-v1.5";
+/// Default embedding model (2026 GOD TIER: nomic-embed-text-v1.5)
+/// 8192 token context, Matryoshka support, fully open source
+pub const DEFAULT_EMBEDDING_MODEL: &str = "nomic-ai/nomic-embed-text-v1.5";
 
 // ============================================================================
 // PRELUDE
diff --git a/crates/vestige-mcp/README.md b/crates/vestige-mcp/README.md
index 7c59f9f..10bdfa3 100644
--- a/crates/vestige-mcp/README.md
+++ b/crates/vestige-mcp/README.md
@@ -6,7 +6,7 @@ A bleeding-edge Rust MCP (Model Context Protocol) server for Vestige - providing
 
 - **FSRS-6 Algorithm**: State-of-the-art spaced repetition (21 parameters, personalized decay)
 - **Dual-Strength Memory Model**: Based on Bjork & Bjork 1992 cognitive science research
-- **Local Semantic Embeddings**: BGE-base-en-v1.5 (768d) via fastembed v5 (no external API)
+- **Local Semantic Embeddings**: nomic-embed-text-v1.5 (768d) via fastembed v5 (no external API)
 - **HNSW Vector Search**: USearch-based, 20x faster than FAISS
 - **Hybrid Search**: BM25 + semantic with RRF fusion
 - **Codebase Memory**: Remember patterns, decisions, and context