mirror of
https://github.com/samvallad33/vestige.git
synced 2026-04-25 00:36:22 +02:00
Switch embedding model from BGE to nomic-embed-text-v1.5
- Replace BGE-base-en-v1.5 with nomic-embed-text-v1.5
- 8192 token context window (vs 512 for BGE)
- Matryoshka representation learning support
- Fully open source with training data released
- Same 768 dimensions, no schema changes required

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
449d60754a
commit
5337efdfa7
5 changed files with 20 additions and 19 deletions
|
|
@@ -60,7 +60,7 @@ notify = "8"
|
|||
# ============================================================================
|
||||
# OPTIONAL: Embeddings (fastembed v5 - local ONNX inference, 2026 bleeding edge)
|
||||
# ============================================================================
|
||||
# BGE-base-en-v1.5: 768 dimensions, 85%+ Top-5 accuracy (vs 56% for MiniLM)
|
||||
# nomic-embed-text-v1.5: 768 dimensions, 8192 token context, Matryoshka support
|
||||
fastembed = { version = "5", optional = true }
|
||||
|
||||
# ============================================================================
|
||||
|
|
|
|||
|
|
@@ -1,14 +1,15 @@
|
|||
//! Local Semantic Embeddings
|
||||
//!
|
||||
//! Uses fastembed v5 for local ONNX-based embedding generation.
|
||||
//! Default model: BGE-base-en-v1.5 (768 dimensions, 85%+ Top-5 accuracy)
|
||||
//! Default model: Nomic Embed Text v1.5 (768 dimensions, Matryoshka support)
|
||||
//!
|
||||
//! ## 2026 GOD TIER UPGRADE
|
||||
//!
|
||||
//! Upgraded from all-MiniLM-L6-v2 (384d, 56% accuracy) to BGE-base-en-v1.5:
|
||||
//! - +30% retrieval accuracy
|
||||
//! - 768 dimensions for richer semantic representation
|
||||
//! Upgraded to nomic-embed-text-v1.5:
|
||||
//! - 768 dimensions with Matryoshka representation learning
|
||||
//! - 8192 token context window (vs 512 for most models)
|
||||
//! - State-of-the-art MTEB benchmark performance
|
||||
//! - Fully open source with training data released
|
||||
|
||||
use fastembed::{EmbeddingModel, InitOptions, TextEmbedding};
|
||||
use std::sync::{Mutex, OnceLock};
|
||||
|
|
@@ -17,8 +18,8 @@ use std::sync::{Mutex, OnceLock};
|
|||
// CONSTANTS
|
||||
// ============================================================================
|
||||
|
||||
/// Embedding dimensions for the default model (BGE-base-en-v1.5)
|
||||
/// Upgraded from 384 (MiniLM) to 768 (BGE) for +30% accuracy
|
||||
/// Embedding dimensions for the default model (nomic-embed-text-v1.5)
|
||||
/// 768 dimensions with Matryoshka support (can truncate to 256/512 if needed)
|
||||
pub const EMBEDDING_DIMENSIONS: usize = 768;
|
||||
|
||||
/// Maximum text length for embedding (truncated if longer)
|
||||
|
|
@@ -35,19 +36,19 @@ pub const BATCH_SIZE: usize = 32;
|
|||
static EMBEDDING_MODEL_RESULT: OnceLock<Result<Mutex<TextEmbedding>, String>> = OnceLock::new();
|
||||
|
||||
/// Initialize the global embedding model
|
||||
/// Using BGE-base-en-v1.5 (768d) - 2026 GOD TIER upgrade from MiniLM-L6-v2
|
||||
/// Using nomic-embed-text-v1.5 (768d) - 8192 token context, Matryoshka support
|
||||
fn get_model() -> Result<std::sync::MutexGuard<'static, TextEmbedding>, EmbeddingError> {
|
||||
let result = EMBEDDING_MODEL_RESULT.get_or_init(|| {
|
||||
// BGE-base-en-v1.5: 768 dimensions, 85%+ Top-5 accuracy
|
||||
// Massive upgrade from MiniLM-L6-v2 (384d, 56% accuracy)
|
||||
// nomic-embed-text-v1.5: 768 dimensions, 8192 token context
|
||||
// Matryoshka representation learning, fully open source
|
||||
let options =
|
||||
InitOptions::new(EmbeddingModel::BGEBaseENV15).with_show_download_progress(true);
|
||||
InitOptions::new(EmbeddingModel::NomicEmbedTextV15).with_show_download_progress(true);
|
||||
|
||||
TextEmbedding::try_new(options)
|
||||
.map(Mutex::new)
|
||||
.map_err(|e| {
|
||||
format!(
|
||||
"Failed to initialize BGE-base-en-v1.5 embedding model: {}. \
|
||||
"Failed to initialize nomic-embed-text-v1.5 embedding model: {}. \
|
||||
Ensure ONNX runtime is available and model files can be downloaded.",
|
||||
e
|
||||
)
|
||||
|
|
@@ -197,7 +198,7 @@ impl EmbeddingService {
|
|||
|
||||
/// Get the model name
|
||||
pub fn model_name(&self) -> &'static str {
|
||||
"BAAI/bge-base-en-v1.5"
|
||||
"nomic-ai/nomic-embed-text-v1.5"
|
||||
}
|
||||
|
||||
/// Get the embedding dimensions
|
||||
|
|
|
|||
|
|
@@ -4,7 +4,7 @@
|
|||
//! No external API calls required - 100% local and private.
|
||||
//!
|
||||
//! Supports:
|
||||
//! - Text embedding generation (768-dimensional vectors via BGE-base-en-v1.5)
|
||||
//! - Text embedding generation (768-dimensional vectors via nomic-embed-text-v1.5)
|
||||
//! - Cosine similarity computation
|
||||
//! - Batch embedding for efficiency
|
||||
//! - Hybrid multi-model fusion (future)
|
||||
|
|
|
|||
|
|
@@ -4,7 +4,7 @@
|
|||
//!
|
||||
//! - **FSRS-6**: 21-parameter spaced repetition (30% more efficient than SM-2)
|
||||
//! - **Dual-Strength Model**: Bjork & Bjork (1992) storage/retrieval strength
|
||||
//! - **Semantic Embeddings**: Local fastembed v5 (BGE-base-en-v1.5, 768 dimensions)
|
||||
//! - **Semantic Embeddings**: Local fastembed v5 (nomic-embed-text-v1.5, 768 dimensions)
|
||||
//! - **HNSW Vector Search**: USearch (20x faster than FAISS)
|
||||
//! - **Temporal Memory**: Bi-temporal model with validity periods
|
||||
//! - **Hybrid Search**: RRF fusion of keyword (BM25/FTS5) + semantic
|
||||
|
|
@@ -394,9 +394,9 @@ pub const VERSION: &str = env!("CARGO_PKG_VERSION");
|
|||
/// FSRS algorithm version (6 = 21 parameters)
|
||||
pub const FSRS_VERSION: u8 = 6;
|
||||
|
||||
/// Default embedding model (2026 GOD TIER: BGE-base-en-v1.5)
|
||||
/// Upgraded from all-MiniLM-L6-v2 for +30% retrieval accuracy
|
||||
pub const DEFAULT_EMBEDDING_MODEL: &str = "BAAI/bge-base-en-v1.5";
|
||||
/// Default embedding model (2026 GOD TIER: nomic-embed-text-v1.5)
|
||||
/// 8192 token context, Matryoshka support, fully open source
|
||||
pub const DEFAULT_EMBEDDING_MODEL: &str = "nomic-ai/nomic-embed-text-v1.5";
|
||||
|
||||
// ============================================================================
|
||||
// PRELUDE
|
||||
|
|
|
|||
|
|
@@ -6,7 +6,7 @@ A bleeding-edge Rust MCP (Model Context Protocol) server for Vestige - providing
|
|||
|
||||
- **FSRS-6 Algorithm**: State-of-the-art spaced repetition (21 parameters, personalized decay)
|
||||
- **Dual-Strength Memory Model**: Based on Bjork & Bjork 1992 cognitive science research
|
||||
- **Local Semantic Embeddings**: BGE-base-en-v1.5 (768d) via fastembed v5 (no external API)
|
||||
- **Local Semantic Embeddings**: nomic-embed-text-v1.5 (768d) via fastembed v5 (no external API)
|
||||
- **HNSW Vector Search**: USearch-based, 20x faster than FAISS
|
||||
- **Hybrid Search**: BM25 + semantic with RRF fusion
|
||||
- **Codebase Memory**: Remember patterns, decisions, and context
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue