Switch embedding model from BGE to nomic-embed-text-v1.5

- Replace BGE-base-en-v1.5 with nomic-embed-text-v1.5
- 8192 token context window (vs 512 for BGE)
- Matryoshka representation learning support
- Fully open source with training data released
- Same 768 dimensions, no schema changes required

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Author: Sam Valladares
Date:   2026-01-25 03:11:15 -06:00
Parent: 449d60754a
Commit: 5337efdfa7
5 changed files with 20 additions and 19 deletions
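The Matryoshka support called out above means an embedding can be shortened to its leading dimensions and re-normalized, trading accuracy for storage, without retraining or a schema change. A minimal sketch of that truncation step; `truncate_embedding` is a hypothetical helper for illustration, not part of this crate:

```rust
// Sketch of Matryoshka-style truncation: keep the first `dims` components
// of an embedding and re-normalize so cosine similarity remains meaningful.
// `truncate_embedding` is illustrative, not this crate's API.
fn truncate_embedding(embedding: &[f32], dims: usize) -> Vec<f32> {
    let head = &embedding[..dims.min(embedding.len())];
    let norm = head.iter().map(|x| x * x).sum::<f32>().sqrt();
    if norm == 0.0 {
        return head.to_vec();
    }
    head.iter().map(|x| x / norm).collect()
}

fn main() {
    // Stand-in for a real 768-dimensional model output.
    let full: Vec<f32> = (0..768).map(|i| ((i % 7) as f32) - 3.0).collect();
    let short = truncate_embedding(&full, 256);
    assert_eq!(short.len(), 256);
    let norm: f32 = short.iter().map(|x| x * x).sum::<f32>().sqrt();
    // After truncation the vector is unit-length again.
    assert!((norm - 1.0).abs() < 1e-5);
    println!("truncated to {} dims, norm = {:.4}", short.len(), norm);
}
```

Because all 768-dimension vectors stay valid, truncation can be applied lazily at query time rather than at ingest.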

@@ -60,7 +60,7 @@ notify = "8"
 # ============================================================================
 # OPTIONAL: Embeddings (fastembed v5 - local ONNX inference, 2026 bleeding edge)
 # ============================================================================
-# BGE-base-en-v1.5: 768 dimensions, 85%+ Top-5 accuracy (vs 56% for MiniLM)
+# nomic-embed-text-v1.5: 768 dimensions, 8192 token context, Matryoshka support
 fastembed = { version = "5", optional = true }
 # ============================================================================

@@ -1,14 +1,15 @@
 //! Local Semantic Embeddings
 //!
 //! Uses fastembed v5 for local ONNX-based embedding generation.
-//! Default model: BGE-base-en-v1.5 (768 dimensions, 85%+ Top-5 accuracy)
+//! Default model: Nomic Embed Text v1.5 (768 dimensions, Matryoshka support)
 //!
 //! ## 2026 GOD TIER UPGRADE
 //!
-//! Upgraded from all-MiniLM-L6-v2 (384d, 56% accuracy) to BGE-base-en-v1.5:
-//! - +30% retrieval accuracy
-//! - 768 dimensions for richer semantic representation
+//! Upgraded to nomic-embed-text-v1.5:
+//! - 768 dimensions with Matryoshka representation learning
+//! - 8192 token context window (vs 512 for most models)
+//! - State-of-the-art MTEB benchmark performance
+//! - Fully open source with training data released
 use fastembed::{EmbeddingModel, InitOptions, TextEmbedding};
 use std::sync::{Mutex, OnceLock};
@@ -17,8 +18,8 @@ use std::sync::{Mutex, OnceLock};
 // CONSTANTS
 // ============================================================================
-/// Embedding dimensions for the default model (BGE-base-en-v1.5)
-/// Upgraded from 384 (MiniLM) to 768 (BGE) for +30% accuracy
+/// Embedding dimensions for the default model (nomic-embed-text-v1.5)
+/// 768 dimensions with Matryoshka support (can truncate to 256/512 if needed)
 pub const EMBEDDING_DIMENSIONS: usize = 768;
 /// Maximum text length for embedding (truncated if longer)
@@ -35,19 +36,19 @@ pub const BATCH_SIZE: usize = 32;
 static EMBEDDING_MODEL_RESULT: OnceLock<Result<Mutex<TextEmbedding>, String>> = OnceLock::new();
 /// Initialize the global embedding model
-/// Using BGE-base-en-v1.5 (768d) - 2026 GOD TIER upgrade from MiniLM-L6-v2
+/// Using nomic-embed-text-v1.5 (768d) - 8192 token context, Matryoshka support
 fn get_model() -> Result<std::sync::MutexGuard<'static, TextEmbedding>, EmbeddingError> {
     let result = EMBEDDING_MODEL_RESULT.get_or_init(|| {
-        // BGE-base-en-v1.5: 768 dimensions, 85%+ Top-5 accuracy
-        // Massive upgrade from MiniLM-L6-v2 (384d, 56% accuracy)
+        // nomic-embed-text-v1.5: 768 dimensions, 8192 token context
+        // Matryoshka representation learning, fully open source
         let options =
-            InitOptions::new(EmbeddingModel::BGEBaseENV15).with_show_download_progress(true);
+            InitOptions::new(EmbeddingModel::NomicEmbedTextV15).with_show_download_progress(true);
         TextEmbedding::try_new(options)
             .map(Mutex::new)
             .map_err(|e| {
                 format!(
-                    "Failed to initialize BGE-base-en-v1.5 embedding model: {}. \
+                    "Failed to initialize nomic-embed-text-v1.5 embedding model: {}. \
                     Ensure ONNX runtime is available and model files can be downloaded.",
                     e
                 )
@@ -197,7 +198,7 @@ impl EmbeddingService {
     /// Get the model name
     pub fn model_name(&self) -> &'static str {
-        "BAAI/bge-base-en-v1.5"
+        "nomic-ai/nomic-embed-text-v1.5"
     }
     /// Get the embedding dimensions

@@ -4,7 +4,7 @@
 //! No external API calls required - 100% local and private.
 //!
 //! Supports:
-//! - Text embedding generation (768-dimensional vectors via BGE-base-en-v1.5)
+//! - Text embedding generation (768-dimensional vectors via nomic-embed-text-v1.5)
 //! - Cosine similarity computation
 //! - Batch embedding for efficiency
 //! - Hybrid multi-model fusion (future)

@@ -4,7 +4,7 @@
 //!
 //! - **FSRS-6**: 21-parameter spaced repetition (30% more efficient than SM-2)
 //! - **Dual-Strength Model**: Bjork & Bjork (1992) storage/retrieval strength
-//! - **Semantic Embeddings**: Local fastembed v5 (BGE-base-en-v1.5, 768 dimensions)
+//! - **Semantic Embeddings**: Local fastembed v5 (nomic-embed-text-v1.5, 768 dimensions)
 //! - **HNSW Vector Search**: USearch (20x faster than FAISS)
 //! - **Temporal Memory**: Bi-temporal model with validity periods
 //! - **Hybrid Search**: RRF fusion of keyword (BM25/FTS5) + semantic
@@ -394,9 +394,9 @@ pub const VERSION: &str = env!("CARGO_PKG_VERSION");
 /// FSRS algorithm version (6 = 21 parameters)
 pub const FSRS_VERSION: u8 = 6;
-/// Default embedding model (2026 GOD TIER: BGE-base-en-v1.5)
-/// Upgraded from all-MiniLM-L6-v2 for +30% retrieval accuracy
-pub const DEFAULT_EMBEDDING_MODEL: &str = "BAAI/bge-base-en-v1.5";
+/// Default embedding model (2026 GOD TIER: nomic-embed-text-v1.5)
+/// 8192 token context, Matryoshka support, fully open source
+pub const DEFAULT_EMBEDDING_MODEL: &str = "nomic-ai/nomic-embed-text-v1.5";
 // ============================================================================
 // PRELUDE

@@ -6,7 +6,7 @@ A bleeding-edge Rust MCP (Model Context Protocol) server for Vestige - providing
 - **FSRS-6 Algorithm**: State-of-the-art spaced repetition (21 parameters, personalized decay)
 - **Dual-Strength Memory Model**: Based on Bjork & Bjork 1992 cognitive science research
-- **Local Semantic Embeddings**: BGE-base-en-v1.5 (768d) via fastembed v5 (no external API)
+- **Local Semantic Embeddings**: nomic-embed-text-v1.5 (768d) via fastembed v5 (no external API)
 - **HNSW Vector Search**: USearch-based, 20x faster than FAISS
 - **Hybrid Search**: BM25 + semantic with RRF fusion
 - **Codebase Memory**: Remember patterns, decisions, and context
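The README's hybrid-search bullet mentions RRF fusion of BM25 and semantic results. A minimal sketch of reciprocal rank fusion, where each ranked list contributes 1 / (k + rank) per document and the fused score is the sum; `rrf_fuse` and the conventional k = 60 default are illustrative, not this crate's API:

```rust
use std::collections::HashMap;

// Reciprocal Rank Fusion: combine any number of ranked lists by summing
// 1 / (k + rank) per document. `rrf_fuse` is a hypothetical helper.
fn rrf_fuse(rankings: &[Vec<&str>], k: f64) -> Vec<(String, f64)> {
    let mut scores: HashMap<String, f64> = HashMap::new();
    for ranking in rankings {
        for (rank, doc) in ranking.iter().enumerate() {
            // rank is 0-based here, so the first hit scores 1 / (k + 1).
            *scores.entry((*doc).to_string()).or_insert(0.0) +=
                1.0 / (k + rank as f64 + 1.0);
        }
    }
    let mut fused: Vec<(String, f64)> = scores.into_iter().collect();
    fused.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
    fused
}

fn main() {
    let bm25 = vec!["a", "b", "c"];
    let semantic = vec!["b", "a", "d"];
    let fused = rrf_fuse(&[bm25, semantic], 60.0);
    // "a" and "b" rank highly in both lists, so one of them leads.
    assert!(fused[0].0 == "a" || fused[0].0 == "b");
    println!("{:?}", fused);
}
```

Because RRF only uses ranks, not raw scores, it needs no calibration between BM25's unbounded scores and cosine similarity's [-1, 1] range, which is why it is a common default for keyword-plus-semantic hybrids.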