diff --git a/surfsense_backend/app/config/__init__.py b/surfsense_backend/app/config/__init__.py
index 68c65a818..a03ef5f8a 100644
--- a/surfsense_backend/app/config/__init__.py
+++ b/surfsense_backend/app/config/__init__.py
@@ -327,6 +327,7 @@ class Config:
         EMBEDDING_MODEL,
         **embedding_kwargs,
     )
+    is_local_embedding_model = "://" not in (EMBEDDING_MODEL or "")
     chunker_instance = RecursiveChunker(
         chunk_size=getattr(embedding_model_instance, "max_seq_length", 512)
     )
diff --git a/surfsense_backend/app/utils/document_converters.py b/surfsense_backend/app/utils/document_converters.py
index c96cb698d..6a59990f5 100644
--- a/surfsense_backend/app/utils/document_converters.py
+++ b/surfsense_backend/app/utils/document_converters.py
@@ -59,13 +59,16 @@ def embed_texts(texts: list[str]) -> list[np.ndarray]:
     """Batch-embed multiple texts in a single call.

     Each text is truncated to fit the model's context window before embedding.
-    Uses ``embed_batch`` under the hood, which every chonkie provider
-    (OpenAI, Azure, Cohere, SentenceTransformers, etc.) optimizes
-    into fewer API calls / GPU passes than sequential ``embed``.
+    For API-based models (``://`` in the model string) this uses
+    ``embed_batch`` to collapse many network round-trips into one.
+    For local models (SentenceTransformers) it falls back to sequential
+    ``embed`` calls to avoid padding overhead.
     """
     if not texts:
         return []
     truncated = [truncate_for_embedding(t) for t in texts]
+    if config.is_local_embedding_model:
+        return [config.embedding_model_instance.embed(t) for t in truncated]
     return config.embedding_model_instance.embed_batch(truncated)
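
For reference, a minimal, self-contained sketch of the dispatch this diff introduces. The StubEmbedder class and the example model strings are hypothetical stand-ins for chonkie's embedding providers; only the "://" heuristic and the embed/embed_batch split come from the diff itself.

import numpy as np


class StubEmbedder:
    """Hypothetical stand-in for a chonkie embedding provider."""

    def embed(self, text: str) -> np.ndarray:
        # One text per call: how the local path embeds sequentially.
        return np.zeros(3)

    def embed_batch(self, texts: list[str]) -> list[np.ndarray]:
        # Many texts per call: one round-trip for API-based providers.
        return [np.zeros(3) for _ in texts]


def embed_texts(model: str, embedder: StubEmbedder, texts: list[str]) -> list[np.ndarray]:
    # Same heuristic as the diff: API models are addressed by URL-like
    # strings; a bare model name means a local model.
    if "://" not in (model or ""):
        # Local path: sequential embed, so short texts are not padded
        # out to the length of the longest text in the batch.
        return [embedder.embed(t) for t in texts]
    # API path: embed_batch collapses N network round-trips into one.
    return embedder.embed_batch(texts)


embedder = StubEmbedder()
embed_texts("sentence-transformers/all-MiniLM-L6-v2", embedder, ["a", "b"])  # local path
embed_texts("openai://text-embedding-3-small", embedder, ["a", "b"])         # batched API path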