Switch embedding model from BGE to nomic-embed-text-v1.5

- Replace BGE-base-en-v1.5 with nomic-embed-text-v1.5
- 8192 token context window (vs 512 for BGE)
- Matryoshka representation learning support
- Fully open source with training data released
- Same 768 dimensions, no schema changes required

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-05-11 08:42:36 +02:00 · 2026-01-25 03:11:15 -06:00 · 2026-01-25 03:11:15 -06:00 · 5337efdfa7
commit 5337efdfa7
parent 449d60754a
5 changed files with 20 additions and 19 deletions
--- a/crates/vestige-core/src/embeddings/local.rs
+++ b/crates/vestige-core/src/embeddings/local.rs
@@ -1,14 +1,15 @@
 //! Local Semantic Embeddings
 //!
 //! Uses fastembed v5 for local ONNX-based embedding generation.
-//! Default model: BGE-base-en-v1.5 (768 dimensions, 85%+ Top-5 accuracy)
+//! Default model: Nomic Embed Text v1.5 (768 dimensions, Matryoshka support)
 //!
 //! ## 2026 GOD TIER UPGRADE
 //!
-//! Upgraded from all-MiniLM-L6-v2 (384d, 56% accuracy) to BGE-base-en-v1.5:
-//! - +30% retrieval accuracy
-//! - 768 dimensions for richer semantic representation
+//! Upgraded to nomic-embed-text-v1.5:
+//! - 768 dimensions with Matryoshka representation learning
+//! - 8192 token context window (vs 512 for most models)
 //! - State-of-the-art MTEB benchmark performance
+//! - Fully open source with training data released

 use fastembed::{EmbeddingModel, InitOptions, TextEmbedding};
 use std::sync::{Mutex, OnceLock};
@@ -17,8 +18,8 @@ use std::sync::{Mutex, OnceLock};
 // CONSTANTS
 // ============================================================================

-/// Embedding dimensions for the default model (BGE-base-en-v1.5)
-/// Upgraded from 384 (MiniLM) to 768 (BGE) for +30% accuracy
+/// Embedding dimensions for the default model (nomic-embed-text-v1.5)
+/// 768 dimensions with Matryoshka support (can truncate to 256/512 if needed)
 pub const EMBEDDING_DIMENSIONS: usize = 768;

 /// Maximum text length for embedding (truncated if longer)
@@ -35,19 +36,19 @@ pub const BATCH_SIZE: usize = 32;
 static EMBEDDING_MODEL_RESULT: OnceLock<Result<Mutex<TextEmbedding>, String>> = OnceLock::new();

 /// Initialize the global embedding model
-/// Using BGE-base-en-v1.5 (768d) - 2026 GOD TIER upgrade from MiniLM-L6-v2
+/// Using nomic-embed-text-v1.5 (768d) - 8192 token context, Matryoshka support
 fn get_model() -> Result<std::sync::MutexGuard<'static, TextEmbedding>, EmbeddingError> {
    let result = EMBEDDING_MODEL_RESULT.get_or_init(|| {
-        // BGE-base-en-v1.5: 768 dimensions, 85%+ Top-5 accuracy
-        // Massive upgrade from MiniLM-L6-v2 (384d, 56% accuracy)
+        // nomic-embed-text-v1.5: 768 dimensions, 8192 token context
+        // Matryoshka representation learning, fully open source
        let options =
-            InitOptions::new(EmbeddingModel::BGEBaseENV15).with_show_download_progress(true);
+            InitOptions::new(EmbeddingModel::NomicEmbedTextV15).with_show_download_progress(true);

        TextEmbedding::try_new(options)
            .map(Mutex::new)
            .map_err(|e| {
                format!(
-                    "Failed to initialize BGE-base-en-v1.5 embedding model: {}. \
+                    "Failed to initialize nomic-embed-text-v1.5 embedding model: {}. \
                    Ensure ONNX runtime is available and model files can be downloaded.",
                    e
                )
@@ -197,7 +198,7 @@ impl EmbeddingService {

    /// Get the model name
    pub fn model_name(&self) -> &'static str {
-        "BAAI/bge-base-en-v1.5"
+        "nomic-ai/nomic-embed-text-v1.5"
    }

    /// Get the embedding dimensions
--- a/crates/vestige-core/src/embeddings/mod.rs
+++ b/crates/vestige-core/src/embeddings/mod.rs
@@ -4,7 +4,7 @@
 //! No external API calls required - 100% local and private.
 //!
 //! Supports:
-//! - Text embedding generation (768-dimensional vectors via BGE-base-en-v1.5)
+//! - Text embedding generation (768-dimensional vectors via nomic-embed-text-v1.5)
 //! - Cosine similarity computation
 //! - Batch embedding for efficiency
 //! - Hybrid multi-model fusion (future)