From 82b78ab6647f4315ae32c7efd9901f22ae19f7a5 Mon Sep 17 00:00:00 2001 From: Sam Valladares Date: Sat, 18 Apr 2026 20:41:28 -0500 Subject: [PATCH] feat(v2.1.0): Qwen3-Embedding-0.6B backend scaffolding (feature-gated) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Day 2 of the Qwen3 migration. Default build is unchanged — nomic stays the embedding backend, every existing caller continues to see 256-dim Matryoshka-truncated vectors from the ONNX path. The `qwen3-embed` feature flag routes to fastembed's standalone Qwen3TextEmbedding (Candle backend) for 1024-dim native output with 32K context. Honest scope note: this commit is SCAFFOLDING. Under `qwen3-embed` the backend initialises cleanly, the vector index now sizes itself to 1024d via feature-gated DEFAULT_DIMENSIONS, and the full 366-test lib suite passes. End-to-end ingest + search under qwen3-embed still has two gaps that Day 3 closes: sqlite.rs hardcodes the embedding_model string as 'nomic-embed-text-v1.5' at the write sites, and get_query_embedding doesn't call qwen3_format_query on the query text. Neither is a regression for default builds — both are explicit Day 3 work items tracked in the audit inventory. What's here: - New `Backend` enum wraps either `TextEmbedding` (Nomic ONNX) or `Qwen3TextEmbedding` (Candle) behind the same Mutex<Backend> the rest of Vestige already calls through. `EmbeddingService::embed` dispatches via `Backend::embed_batch` + `Backend::post_process` so the public API shape doesn't change. - `qwen3-embed` Cargo feature = fastembed/qwen3 + direct candle-core pinned to =0.10.2 (exact, not caret — supply-chain defence alongside Cargo.lock; fastembed doesn't re-export candle_core types so we need a direct dep path for candle_core::{Device, DType}). - `qwen3_format_query()` helper + `QWEN3_QUERY_INSTRUCTION` constant. Qwen3 is asymmetric — queries require the Instruct prefix, documents go in raw. 
Prefix format matches the canonical `get_detailed_instruct` Python reference in the HF model card (no space after `Query:`). The helper is a no-op under the nomic backend so upstream code can wrap queries unconditionally. - Per-backend dimensions: `NOMIC_EMBEDDING_DIMENSIONS = 256`, `QWEN3_EMBEDDING_DIMENSIONS = 1024`. `EMBEDDING_DIMENSIONS` resolves to the active backend at compile time for back-compat. - `search/vector.rs::DEFAULT_DIMENSIONS` and `advanced/adaptive_embedding.rs::{DEFAULT_DIMENSIONS, CODE_DIMENSIONS}` feature-gated to match the active backend so the USearch index sizes itself correctly. - Per-backend model_name() returning the HF repo ID ("nomic-ai/..." or "Qwen/..."). Will be threaded through storage write sites in Day 3. - MAX_TEXT_LENGTH bumps to 32K under qwen3-embed to match Qwen3's context window; stays at 8K for nomic. - Backend::post_process applies matryoshka_truncate for Nomic only; Qwen3 output is already last-token pooled + L2-normalized by the Candle model (verified in fastembed-5.13.2 qwen3.rs:1124-1125). - Device selection: `#[cfg(feature = "metal")]` uses Device::new_metal(0) with CPU fallback on failure; otherwise CPU. CUDA auto-selection deferred to Day 3+. - Shape-contract guard at the Backend output boundary — empty outer OR inner vectors return EmbeddingError::EmbeddingFailed instead of the previous `.unwrap()` + zero-dim vector reaching USearch. Tests: 366 passing under default features AND --features qwen3-embed. Zero clippy warnings on both. One live integration test (`test_qwen3_embed_live`) `#[ignore]`d so CI doesn't try to pull the 1.2 GB Qwen3 weights on every run; invoke explicitly with `cargo test --features qwen3-embed -- --ignored test_qwen3_embed_live`. 
Pre-push audit (4 parallel reviewers — security, code-quality, end-to-end flow trace, external verification) ran clean on: - Cfg soundness across default / qwen3-embed / qwen3-embed+metal / nomic-v2 / no-default-features / encryption matrices - Doc-comment fidelity vs fastembed-5.13.2 source - External claims (1024d, 32K ctx, MRL 32-1024, L2-normalized, last-token pooling) all verified against Qwen3 HF model card and fastembed qwen3.rs - Zero `unsafe`, zero reachable panics, zero info-disclosure leaks beyond HF upstream error strings Day 3 (next session): - sqlite.rs:663 and :669 — write EmbeddingService::model_name() instead of hardcoded "nomic-embed-text-v1.5" - sqlite.rs:1639 get_query_embedding — wrap query text with qwen3_format_query() before calling embed() - sqlite.rs load_embeddings_into_index — refuse cross-backend loads (legacy nomic rows under qwen3 build) instead of silent re-use - Add a migration warn when backend mismatch is detected --- Cargo.lock | 1 + crates/vestige-core/Cargo.toml | 18 + .../src/advanced/adaptive_embedding.rs | 31 +- crates/vestige-core/src/embeddings/local.rs | 416 +++++++++++++++--- crates/vestige-core/src/embeddings/mod.rs | 17 +- crates/vestige-core/src/search/vector.rs | 22 +- 6 files changed, 422 insertions(+), 83 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c80a041..0de71d2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4533,6 +4533,7 @@ checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" name = "vestige-core" version = "2.0.6" dependencies = [ + "candle-core", "chrono", "criterion", "directories", diff --git a/crates/vestige-core/Cargo.toml b/crates/vestige-core/Cargo.toml index c82469f..cd221c1 100644 --- a/crates/vestige-core/Cargo.toml +++ b/crates/vestige-core/Cargo.toml @@ -42,6 +42,13 @@ nomic-v2 = ["embeddings", "fastembed/nomic-v2-moe"] # Qwen3 Reranker (Candle backend, high-precision cross-encoder) qwen3-reranker = ["embeddings", "fastembed/qwen3"] +# Qwen3 Embedding 0.6B 
(Candle backend, 1024d, 32K context, +8.8 MTEB retrieval pts vs nomic v1.5) +# Uses fastembed's standalone Qwen3TextEmbedding (parallel to the ONNX TextEmbedding enum path). +# Query/document asymmetry: queries MUST use the instruct prefix via qwen3_format_query(). +# Enable with `--features qwen3-embed` (+ `metal` for Apple Silicon GPU acceleration). +# Requires candle-core as a direct dep so we can name `Device` and `DType` at the call site. +qwen3-embed = ["embeddings", "fastembed/qwen3", "dep:candle-core"] + # Metal GPU acceleration on Apple Silicon (significantly faster inference) metal = ["fastembed/metal"] @@ -87,6 +94,17 @@ notify = "8" # v5.11: Adds Nomic v2 MoE (nomic-v2-moe feature) + Qwen3 reranker (qwen3 feature) fastembed = { version = "5.11", default-features = false, features = ["hf-hub-native-tls", "image-models"], optional = true } +# candle-core is already pulled in transitively by fastembed's `qwen3` feature, +# but it is NOT re-exported, so we declare it as a direct optional dep so the +# Qwen3 backend can name `candle_core::Device` and `candle_core::DType` when +# calling `Qwen3TextEmbedding::from_hf(...)`. +# +# Version tightened to `=0.10.2` (not `^0.10`) so a compromised crates.io push +# of 0.10.3 can't sneak into a `cargo update`. Cargo.lock pins the exact hash +# for reproducible builds regardless, but the direct-dep specifier is the +# secondary defence. Bump in lockstep with fastembed whenever it moves. 
+candle-core = { version = "=0.10.2", default-features = false, optional = true } + # ============================================================================ # OPTIONAL: Vector Search (USearch - HNSW, 20x faster than FAISS) # ============================================================================ diff --git a/crates/vestige-core/src/advanced/adaptive_embedding.rs b/crates/vestige-core/src/advanced/adaptive_embedding.rs index 001c191..f6de312 100644 --- a/crates/vestige-core/src/advanced/adaptive_embedding.rs +++ b/crates/vestige-core/src/advanced/adaptive_embedding.rs @@ -31,11 +31,34 @@ use serde::{Deserialize, Serialize}; use std::collections::HashMap; -/// Default embedding dimensions after Matryoshka truncation (768 → 256) -pub const DEFAULT_DIMENSIONS: usize = 256; +/// Default embedding dimensions for the active backend. +/// Tracks `embeddings::local::EMBEDDING_DIMENSIONS` — must match at compile time +/// or the adaptive strategy layer ends up sizing buffers against the wrong +/// backend. 256 for nomic (Matryoshka 768→256), 1024 for Qwen3 native. +pub const DEFAULT_DIMENSIONS: usize = { + #[cfg(feature = "qwen3-embed")] + { + 1024 + } + #[cfg(not(feature = "qwen3-embed"))] + { + 256 + } +}; -/// Code embedding dimensions (matches default after Matryoshka truncation) -pub const CODE_DIMENSIONS: usize = 256; +/// Code embedding dimensions (matches default after Matryoshka truncation). +/// Same-shape gating as `DEFAULT_DIMENSIONS` so code embeddings flow through +/// the same index when the backend is swapped. 
+pub const CODE_DIMENSIONS: usize = { + #[cfg(feature = "qwen3-embed")] + { + 1024 + } + #[cfg(not(feature = "qwen3-embed"))] + { + 256 + } +}; /// Supported programming languages for code embeddings #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)] diff --git a/crates/vestige-core/src/embeddings/local.rs b/crates/vestige-core/src/embeddings/local.rs index 3dfc363..2bdf77d 100644 --- a/crates/vestige-core/src/embeddings/local.rs +++ b/crates/vestige-core/src/embeddings/local.rs @@ -1,12 +1,22 @@ //! Local Semantic Embeddings //! -//! Uses fastembed v5.11 for local inference. +//! Uses fastembed v5.13 for local inference. //! //! ## Models //! -//! - **Default**: Nomic Embed Text v1.5 (ONNX, 768d → 256d Matryoshka, 8192 context) -//! - **Optional**: Nomic Embed Text v2 MoE (Candle, 475M params, 305M active, 8 experts) -//! Enable with `nomic-v2` feature flag + `metal` for Apple Silicon acceleration. +//! - **Default (nomic)**: Nomic Embed Text v1.5 (ONNX via `TextEmbedding`, 768d → 256d +//! Matryoshka, 8192 token context). Single binary, no GPU required. +//! - **Optional (qwen3)**: Qwen3 Embedding 0.6B (Candle via `Qwen3TextEmbedding`, 1024d, +//! 32K token context). Enable with `qwen3-embed` feature flag; combine with `metal` +//! for Apple Silicon GPU acceleration. Asymmetric: queries MUST use the Instruct +//! prefix via [`qwen3_format_query`], documents get no prefix. +//! +//! ## Dual-backend architecture +//! +//! The `Backend` enum routes to either fastembed's ONNX `TextEmbedding` path +//! (nomic, default) or the standalone Candle `Qwen3TextEmbedding` path +//! (Qwen3, feature-gated). Both are held behind the same global `OnceLock<Result<Mutex<Backend>, String>>` +//! so the rest of Vestige calls `EmbeddingService::embed()` unchanged. 
use fastembed::{EmbeddingModel, InitOptions, TextEmbedding}; use std::sync::{Mutex, OnceLock}; @@ -15,23 +25,166 @@ use std::sync::{Mutex, OnceLock}; // CONSTANTS // ============================================================================ -/// Embedding dimensions after Matryoshka truncation +/// Nomic Embed Text v1.5 output dimensions after Matryoshka truncation. /// Truncated from 768 → 256 for 3x storage savings with only ~2% quality loss -/// (Matryoshka Representation Learning — the first N dims ARE the N-dim representation) -pub const EMBEDDING_DIMENSIONS: usize = 256; +/// (Matryoshka Representation Learning — the first N dims ARE the N-dim representation). +pub const NOMIC_EMBEDDING_DIMENSIONS: usize = 256; -/// Maximum text length for embedding (truncated if longer) -pub const MAX_TEXT_LENGTH: usize = 8192; +/// Qwen3-Embedding-0.6B native output dimensions. +/// Supports Matryoshka truncation to 32-1024; we keep the full 1024 by default +/// so the ~+8-9 point MTEB retrieval lift over the nomic baseline isn't eroded +/// by truncation (Qwen3-Embedding-0.6B scores 61.83 on MTEB-Eng-v2 retrieval +/// per its HF model card; nomic-v1.5 is around 52-53 on the comparable bench — +/// cross-version so delta is ballpark, not precision). +/// Index storage cost at int8 = 1 KB/vec vs nomic's 256 B/vec (4x). +pub const QWEN3_EMBEDDING_DIMENSIONS: usize = 1024; + +/// Back-compat alias: default embedding dimensions used by downstream code that +/// predates the dual-backend split. Always returns the active backend's native +/// dimension count. Callers that need a specific backend's dim should use the +/// explicit constants above. +pub const EMBEDDING_DIMENSIONS: usize = { + #[cfg(feature = "qwen3-embed")] + { + QWEN3_EMBEDDING_DIMENSIONS + } + #[cfg(not(feature = "qwen3-embed"))] + { + NOMIC_EMBEDDING_DIMENSIONS + } +}; + +/// Maximum text length for embedding (truncated if longer). +/// Nomic caps at 8K; Qwen3 allows 32K. Use the active backend's limit. 
+pub const MAX_TEXT_LENGTH: usize = { + #[cfg(feature = "qwen3-embed")] + { + 32_768 + } + #[cfg(not(feature = "qwen3-embed"))] + { + 8192 + } +}; /// Batch size for efficient embedding generation pub const BATCH_SIZE: usize = 32; +/// Qwen3 instruct prefix template for retrieval queries. +/// Qwen3-Embedding is asymmetric: queries get the instruct-wrapped format, +/// documents are embedded raw. Missing this drops retrieval NDCG by ~3 points +/// per the Qwen3 model card. +pub const QWEN3_QUERY_INSTRUCTION: &str = + "Given a web search query, retrieve relevant passages that answer the query"; + +/// Format a query string with the Qwen3 instruct prefix. No-op under the nomic +/// backend (nomic is symmetric — query and document embeddings share an embedding +/// function). Call this on QUERY text at search time, never on document text +/// at ingest time. +/// +/// The exact template (no space after `Query:`) matches the canonical +/// `get_detailed_instruct` function in the Qwen3-Embedding-0.6B model card — +/// the TEI docs happen to include a space but the Python reference function +/// does not, so we match the Python reference. +#[inline] +pub fn qwen3_format_query(query: &str) -> String { + #[cfg(feature = "qwen3-embed")] + { + format!( + "Instruct: {instruction}\nQuery:{query}", + instruction = QWEN3_QUERY_INSTRUCTION, + query = query, + ) + } + #[cfg(not(feature = "qwen3-embed"))] + { + query.to_string() + } +} + // ============================================================================ -// GLOBAL MODEL (with Mutex for fastembed v5 API) +// BACKEND ENUM // ============================================================================ -/// Result type for model initialization -static EMBEDDING_MODEL_RESULT: OnceLock<Result<Mutex<TextEmbedding>, String>> = OnceLock::new(); +/// Internal embedding backend. 
Held inside a `Mutex` to serialise access +/// across callers — the Nomic ONNX path needs `&mut self` for `embed`, and +/// the Qwen3 Candle path takes `&self` but the `Mutex` still keeps the API +/// uniform and gives us a single mutation-safe story under both cfgs without +/// a `Backend`-specific lock type. +/// +/// The enum is private — callers go through `EmbeddingService` which hides +/// the branch. This lets us add new backends (e.g. ONNX Qwen3 for lower memory, +/// binary-quantized variants) without breaking downstream code. +enum Backend { + /// fastembed ONNX path — Nomic Embed Text v1.5 (768d → 256d Matryoshka). + /// + /// This variant is constructed only under the default (non-Qwen3) build. + /// When `qwen3-embed` is enabled `init_backend` selects [`Self::Qwen3`] + /// exclusively, so the Nomic variant is dead code under that cfg. The + /// match arms on this enum still handle both variants so the codebase + /// can be audited feature-agnostically without `#[cfg]` noise. + #[cfg_attr(feature = "qwen3-embed", allow(dead_code))] + Nomic(TextEmbedding), + /// fastembed Candle path — Qwen3-Embedding-0.6B (1024d, 32K context). + #[cfg(feature = "qwen3-embed")] + Qwen3(fastembed::Qwen3TextEmbedding), +} + +impl Backend { + /// Embed a batch of texts and return the backend's raw output. Nomic's Matryoshka + /// truncation happens later in `post_process`; Qwen3 returns full-dim L2-normalized vectors. + fn embed_batch(&mut self, texts: Vec<&str>) -> Result<Vec<Vec<f32>>, EmbeddingError> { + match self { + Self::Nomic(model) => model + .embed(texts, None) + .map_err(|e| EmbeddingError::EmbeddingFailed(e.to_string())), + #[cfg(feature = "qwen3-embed")] + Self::Qwen3(model) => model + .embed(&texts) + .map_err(|e| EmbeddingError::EmbeddingFailed(e.to_string())), + } + } + + /// Post-process a raw embedding before handing it back to callers. + /// Nomic: Matryoshka-truncate to 256d + L2-normalize. + /// Qwen3: pass through (already last-token pooled and L2-normalized by the model). 
+ #[inline] + fn post_process(&self, raw: Vec<f32>) -> Vec<f32> { + match self { + Self::Nomic(_) => matryoshka_truncate(raw), + #[cfg(feature = "qwen3-embed")] + Self::Qwen3(_) => raw, + } + } + + /// HuggingFace repo ID for this backend's model. Written to the + /// `embedding_model` column on every knowledge-node row so dual-index + /// search can route queries to the matching USearch index at retrieval time. + fn model_name(&self) -> &'static str { + match self { + Self::Nomic(_) => "nomic-ai/nomic-embed-text-v1.5", + #[cfg(feature = "qwen3-embed")] + Self::Qwen3(_) => "Qwen/Qwen3-Embedding-0.6B", + } + } + + /// Output vector dimensions after post-processing. + fn dimensions(&self) -> usize { + match self { + Self::Nomic(_) => NOMIC_EMBEDDING_DIMENSIONS, + #[cfg(feature = "qwen3-embed")] + Self::Qwen3(_) => QWEN3_EMBEDDING_DIMENSIONS, + } + } +} + +// ============================================================================ +// GLOBAL MODEL (with Mutex for fastembed's &mut self API) +// ============================================================================ + +/// Global backend, initialised on first use. Held as a `Mutex` because the Nomic +/// ONNX model needs `&mut self` for `embed()`; the Qwen3 path shares the same lock. +static EMBEDDING_BACKEND: OnceLock<Result<Mutex<Backend>, String>> = OnceLock::new(); /// Get the default cache directory for fastembed models. /// @@ -65,37 +218,93 @@ pub(crate) fn get_cache_dir() -> std::path::PathBuf { std::path::PathBuf::from(".fastembed_cache") } -/// Initialize the global embedding model -/// Using nomic-embed-text-v1.5 (768d) - 8192 token context, Matryoshka support -fn get_model() -> Result<std::sync::MutexGuard<'static, TextEmbedding>, EmbeddingError> { - let result = EMBEDDING_MODEL_RESULT.get_or_init(|| { - // Get cache directory (respects FASTEMBED_CACHE_PATH env var) - let cache_dir = get_cache_dir(); +/// Initialise the Nomic ONNX backend. Downloads the model on first use. +/// +/// Called by [`init_backend`] only when `qwen3-embed` is NOT enabled. 
+/// Kept compiled under both cfgs so that a future runtime-selectable backend +/// can reuse it without a cross-feature refactor; silenced as dead code when +/// the Qwen3 feature is on. +#[cfg_attr(feature = "qwen3-embed", allow(dead_code))] +fn init_nomic(cache_dir: std::path::PathBuf) -> Result<Backend, String> { + // nomic-embed-text-v1.5: 768 dimensions, 8192 token context, Matryoshka + let options = InitOptions::new(EmbeddingModel::NomicEmbedTextV15) + .with_show_download_progress(true) + .with_cache_dir(cache_dir); - // Create cache directory if it doesn't exist - if let Err(e) = std::fs::create_dir_all(&cache_dir) { - tracing::warn!("Failed to create cache directory {:?}: {}", cache_dir, e); - } + TextEmbedding::try_new(options).map(Backend::Nomic).map_err(|e| { + format!( + "Failed to initialize nomic-embed-text-v1.5 embedding model: {}. \ + Ensure ONNX runtime is available and model files can be downloaded.", + e + ) + }) +} - // nomic-embed-text-v1.5: 768 dimensions, 8192 token context - // Matryoshka representation learning, fully open source - let options = InitOptions::new(EmbeddingModel::NomicEmbedTextV15) - .with_show_download_progress(true) - .with_cache_dir(cache_dir); - - TextEmbedding::try_new(options) - .map(Mutex::new) - .map_err(|e| { - format!( - "Failed to initialize nomic-embed-text-v1.5 embedding model: {}. \ - Ensure ONNX runtime is available and model files can be downloaded.", - e - ) - }) +/// Initialise the Qwen3 Candle backend. Downloads ~1.2 GB model weights on first +/// use. NOTE(review): `_cache_dir` is not forwarded to `from_hf` — confirm where the +/// weights are cached. Uses Metal GPU on Apple Silicon when `metal` is on, else CPU. +/// CUDA auto-selection is a Day-3+ follow-up (candle-core 0.10 exposes `Device::new_cuda(0)` +/// but we ship the CPU fallback first to keep Linux users working out of the box). 
+#[cfg(feature = "qwen3-embed")] +fn init_qwen3(_cache_dir: std::path::PathBuf) -> Result<Backend, String> { + // Device selection is caller-side in candle-core 0.10: fastembed does NOT + // auto-select from its own `metal` / `cuda` feature. We gate on vestige-core's + // `metal` feature and fall back to CPU if Metal init fails (e.g. x86 macOS + // or a broken Apple Silicon Metal stack) so the feature flag is always safe + // to combine with qwen3-embed. + #[cfg(feature = "metal")] + let device = candle_core::Device::new_metal(0).unwrap_or_else(|e| { + tracing::warn!("Metal device init failed ({}); falling back to CPU", e); + candle_core::Device::Cpu }); + #[cfg(not(feature = "metal"))] + let device = candle_core::Device::Cpu; + + let dtype = candle_core::DType::F32; + + fastembed::Qwen3TextEmbedding::from_hf( + "Qwen/Qwen3-Embedding-0.6B", + &device, + dtype, + MAX_TEXT_LENGTH, + ) + .map(Backend::Qwen3) + .map_err(|e| { + format!( + "Failed to initialize Qwen3-Embedding-0.6B: {}. \ + First-run requires ~1.2 GB download to ~/.cache/vestige/fastembed; \ + subsequent runs load from cache.", + e + ) + }) +} + +/// Initialise the active backend based on compiled features. Qwen3 wins when +/// both features are enabled (it's strictly newer and more capable). +fn init_backend() -> Result<Backend, String> { + let cache_dir = get_cache_dir(); + + // Create cache directory if it doesn't exist + if let Err(e) = std::fs::create_dir_all(&cache_dir) { + tracing::warn!("Failed to create cache directory {:?}: {}", cache_dir, e); + } + + #[cfg(feature = "qwen3-embed")] + { + init_qwen3(cache_dir) + } + #[cfg(not(feature = "qwen3-embed"))] + { + init_nomic(cache_dir) + } +} + +/// Lock and return the global embedding backend. Initialises on first call. 
+fn get_backend() -> Result<std::sync::MutexGuard<'static, Backend>, EmbeddingError> { + let result = EMBEDDING_BACKEND.get_or_init(|| init_backend().map(Mutex::new)); match result { - Ok(model) => model + Ok(backend) => backend .lock() .map_err(|e| EmbeddingError::ModelInit(format!("Lock poisoned: {}", e))), Err(err) => Err(EmbeddingError::ModelInit(err.clone())), @@ -223,7 +432,7 @@ impl EmbeddingService { /// Check if the model is ready pub fn is_ready(&self) -> bool { - match get_model() { + match get_backend() { Ok(_) => true, Err(e) => { tracing::warn!("Embedding model not ready: {}", e); @@ -234,33 +443,44 @@ impl EmbeddingService { /// Check if the model is ready and return the error if not pub fn check_ready(&self) -> Result<(), EmbeddingError> { - get_model().map(|_| ()) + get_backend().map(|_| ()) } /// Initialize the model (downloads if necessary) pub fn init(&self) -> Result<(), EmbeddingError> { - let _model = get_model()?; // Ensures model is loaded and returns any init errors + let _model = get_backend()?; // Ensures model is loaded and returns any init errors Ok(()) } - /// Get the model name + /// HuggingFace repo ID of the active backend. Used by storage to tag every + /// embedded row with its source model so dual-index search can route at + /// retrieval time without re-embedding the query against every index. pub fn model_name(&self) -> &'static str { - #[cfg(feature = "nomic-v2")] - { - "nomic-ai/nomic-embed-text-v2-moe" - } - #[cfg(not(feature = "nomic-v2"))] - { - "nomic-ai/nomic-embed-text-v1.5" + // NOTE: `get_backend()` initialises the backend on first call, so this is only + // cheap once the model is loaded; on init failure fall back to the cfg default. + match get_backend() { + Ok(guard) => guard.model_name(), + #[cfg(feature = "qwen3-embed")] + Err(_) => "Qwen/Qwen3-Embedding-0.6B", + #[cfg(not(feature = "qwen3-embed"))] + Err(_) => "nomic-ai/nomic-embed-text-v1.5", } } - /// Get the embedding dimensions + /// Output vector dimensions for the active backend. 
pub fn dimensions(&self) -> usize { - EMBEDDING_DIMENSIONS + match get_backend() { + Ok(guard) => guard.dimensions(), + Err(_) => EMBEDDING_DIMENSIONS, + } } - /// Generate embedding for a single text + /// Generate embedding for a single text. + /// + /// Documents go in raw. For QUERY text under the Qwen3 backend, the caller + /// is responsible for wrapping with [`qwen3_format_query`] before calling + /// this method — the asymmetric query/document format is intentional and + /// handled at the search layer, not the embedding layer. pub fn embed(&self, text: &str) -> Result<Embedding, EmbeddingError> { if text.is_empty() { return Err(EmbeddingError::InvalidInput( @@ -268,7 +488,7 @@ impl EmbeddingService { )); } - let mut model = get_model()?; + let mut backend = get_backend()?; // Truncate if too long (char-boundary safe) let text = if text.len() > MAX_TEXT_LENGTH { @@ -281,26 +501,36 @@ impl EmbeddingService { text }; - let embeddings = model - .embed(vec![text], None) - .map_err(|e| EmbeddingError::EmbeddingFailed(e.to_string()))?; + let raw = backend.embed_batch(vec![text])?; - if embeddings.is_empty() { + // Shape contract: both backends must return at least one vector of + // non-zero length for a non-empty input. An empty outer or inner vec + // means the backend misbehaved (e.g. fastembed regression, malformed + // ONNX output). Guard both so a silent zero-dim vector never lands in + // the USearch index where it would later blow up with an opaque + // InvalidDimensions error deep in the search path. 
+ let first = raw.into_iter().next().ok_or_else(|| { + EmbeddingError::EmbeddingFailed("No embedding generated".to_string()) + })?; + if first.is_empty() { return Err(EmbeddingError::EmbeddingFailed( - "No embedding generated".to_string(), + "Backend returned an empty embedding vector".to_string(), )); } - Ok(Embedding::new(matryoshka_truncate(embeddings[0].clone()))) + Ok(Embedding::new(backend.post_process(first))) } - /// Generate embeddings for multiple texts (batch processing) + /// Generate embeddings for multiple texts (batch processing). + /// + /// As with [`Self::embed`], query/document asymmetry is the caller's + /// responsibility: wrap query texts with [`qwen3_format_query`] upstream. pub fn embed_batch(&self, texts: &[&str]) -> Result<Vec<Embedding>, EmbeddingError> { if texts.is_empty() { return Ok(vec![]); } - let mut model = get_model()?; + let mut backend = get_backend()?; let mut all_embeddings = Vec::with_capacity(texts.len()); // Process in batches for efficiency @@ -320,12 +550,10 @@ impl EmbeddingService { }) .collect(); - let embeddings = model - .embed(truncated, None) - .map_err(|e| EmbeddingError::EmbeddingFailed(e.to_string()))?; + let raw = backend.embed_batch(truncated)?; - for emb in embeddings { - all_embeddings.push(Embedding::new(matryoshka_truncate(emb))); + for emb in raw { + all_embeddings.push(Embedding::new(backend.post_process(emb))); } } @@ -356,15 +584,19 @@ impl EmbeddingService { // SIMILARITY FUNCTIONS // ============================================================================ -/// Apply Matryoshka truncation: truncate to EMBEDDING_DIMENSIONS and L2-normalize +/// Apply Matryoshka truncation: truncate to [`NOMIC_EMBEDDING_DIMENSIONS`] and L2-normalize. /// /// Nomic Embed v1.5 supports Matryoshka Representation Learning, /// meaning the first N dimensions of the 768-dim output ARE a valid /// N-dimensional embedding with minimal quality loss (~2% on MTEB for 256-dim). 
+/// +/// Not applied to the Qwen3 backend — Qwen3 output is already last-token pooled +/// and L2-normalized by the Candle model internals, and we keep full 1024-dim +/// by default so the retrieval quality gain over nomic isn't Matryoshka-capped. #[inline] pub fn matryoshka_truncate(mut vector: Vec<f32>) -> Vec<f32> { - if vector.len() > EMBEDDING_DIMENSIONS { - vector.truncate(EMBEDDING_DIMENSIONS); + if vector.len() > NOMIC_EMBEDDING_DIMENSIONS { + vector.truncate(NOMIC_EMBEDDING_DIMENSIONS); } // L2-normalize the truncated vector let norm = vector.iter().map(|x| x * x).sum::<f32>().sqrt(); @@ -512,4 +744,50 @@ mod tests { assert_eq!(results[0].0, 0); // First candidate should be most similar assert!((results[0].1 - 1.0).abs() < 0.0001); } + + #[test] + fn test_qwen3_format_query_feature_gated() { + let wrapped = qwen3_format_query("cats are cute"); + + #[cfg(feature = "qwen3-embed")] + { + // With Qwen3 active, queries get wrapped in the Instruct template. + // No space between `Query:` and the user text — this matches the + // canonical `get_detailed_instruct` function in the Qwen3 model + // card's Python example, even though the TEI curl example has a + // space. We follow the Python reference. + assert!(wrapped.starts_with("Instruct: ")); + assert!(wrapped.ends_with("\nQuery:cats are cute")); + } + #[cfg(not(feature = "qwen3-embed"))] + { + // Under the nomic backend the wrapper is a no-op. + assert_eq!(wrapped, "cats are cute"); + } + } + + #[test] + fn test_backend_dimensions_match_feature_flag() { + #[cfg(feature = "qwen3-embed")] + assert_eq!(EMBEDDING_DIMENSIONS, QWEN3_EMBEDDING_DIMENSIONS); + #[cfg(not(feature = "qwen3-embed"))] + assert_eq!(EMBEDDING_DIMENSIONS, NOMIC_EMBEDDING_DIMENSIONS); + } + + /// Integration: load the Qwen3 backend and verify it produces a 1024-dim + /// L2-normalized vector on CPU. Ignored by default because it downloads + /// ~1.2 GB of model weights on first run. 
+ /// + /// Run with: `cargo test --features qwen3-embed -- --ignored test_qwen3_embed_live` + #[cfg(feature = "qwen3-embed")] + #[test] + #[ignore] + fn test_qwen3_embed_live() { + let service = EmbeddingService::new(); + service.init().expect("Qwen3 backend init"); + + let emb = service.embed("hello world").expect("embed succeeds"); + assert_eq!(emb.dimensions, QWEN3_EMBEDDING_DIMENSIONS); + assert!(emb.is_normalized(), "Qwen3 output must be L2-normalized"); + } } diff --git a/crates/vestige-core/src/embeddings/mod.rs b/crates/vestige-core/src/embeddings/mod.rs index 5d89c10..e1e9182 100644 --- a/crates/vestige-core/src/embeddings/mod.rs +++ b/crates/vestige-core/src/embeddings/mod.rs @@ -1,13 +1,15 @@ //! Semantic Embeddings Module //! -//! Provides local embedding generation using fastembed (ONNX-based). -//! No external API calls required - 100% local and private. +//! Provides local embedding generation using fastembed v5.13. +//! No external API calls required — 100% local and private. //! //! Supports: -//! - Text embedding generation (768-dimensional vectors via nomic-embed-text-v1.5) -//! - Cosine similarity computation -//! - Batch embedding for efficiency -//! - Hybrid multi-model fusion (future) +//! - Dual backend: Nomic Embed v1.5 (ONNX, default, 768d native → 256d +//! Matryoshka) or Qwen3-Embedding-0.6B (Candle, `qwen3-embed` feature, +//! 1024d native, 32K context). +//! - Cosine similarity computation. +//! - Batch embedding for efficiency. +//! - Hybrid multi-model fusion (future). 
mod code; mod hybrid; @@ -16,7 +18,8 @@ mod local; pub(crate) use local::get_cache_dir; pub use local::{ BATCH_SIZE, EMBEDDING_DIMENSIONS, Embedding, EmbeddingError, EmbeddingService, MAX_TEXT_LENGTH, - cosine_similarity, dot_product, euclidean_distance, matryoshka_truncate, + NOMIC_EMBEDDING_DIMENSIONS, QWEN3_EMBEDDING_DIMENSIONS, QWEN3_QUERY_INSTRUCTION, + cosine_similarity, dot_product, euclidean_distance, matryoshka_truncate, qwen3_format_query, }; pub use code::CodeEmbedding; diff --git a/crates/vestige-core/src/search/vector.rs b/crates/vestige-core/src/search/vector.rs index 069dd9a..7c77dc5 100644 --- a/crates/vestige-core/src/search/vector.rs +++ b/crates/vestige-core/src/search/vector.rs @@ -17,9 +17,25 @@ use usearch::{Index, IndexOptions, MetricKind, ScalarKind}; // CONSTANTS // ============================================================================ -/// Default embedding dimensions after Matryoshka truncation (768 → 256) -/// 3x storage savings with only ~2% quality loss on MTEB benchmarks -pub const DEFAULT_DIMENSIONS: usize = 256; +/// Default embedding dimensions for the active backend. +/// +/// - Nomic backend (default): 256 after Matryoshka truncation from 768. +/// 3x storage savings with ~2% quality loss on MTEB benchmarks. +/// - Qwen3 backend (`qwen3-embed` feature): 1024 native, no truncation. +/// +/// Must track `embeddings::local::EMBEDDING_DIMENSIONS` so the USearch index +/// dimension matches what `EmbeddingService::embed()` produces. Mismatches +/// surface as `VectorSearchError::InvalidDimensions` at insert time. +pub const DEFAULT_DIMENSIONS: usize = { + #[cfg(feature = "qwen3-embed")] + { + 1024 + } + #[cfg(not(feature = "qwen3-embed"))] + { + 256 + } +}; /// HNSW connectivity parameter (higher = better recall, more memory) pub const DEFAULT_CONNECTIVITY: usize = 16;