perf: conditional batch embedding — batch for API, sequential for local

2026-07-22 23:31:12 +02:00 · 2026-03-09 19:12:43 +02:00 · 2026-03-09 19:12:43 +02:00 · 6eabfe2396
commit 6eabfe2396
parent a49a4db6d6
2 changed files with 7 additions and 3 deletions
--- a/surfsense_backend/app/config/__init__.py
+++ b/surfsense_backend/app/config/__init__.py
@@ -327,6 +327,7 @@ class Config:
        EMBEDDING_MODEL,
        **embedding_kwargs,
    )
+    is_local_embedding_model = "://" not in (EMBEDDING_MODEL or "")
    chunker_instance = RecursiveChunker(
        chunk_size=getattr(embedding_model_instance, "max_seq_length", 512)
    )
--- a/surfsense_backend/app/utils/document_converters.py
+++ b/surfsense_backend/app/utils/document_converters.py
@@ -59,13 +59,16 @@ def embed_texts(texts: list[str]) -> list[np.ndarray]:
    """Batch-embed multiple texts in a single call.

    Each text is truncated to fit the model's context window before embedding.
-    Uses ``embed_batch`` under the hood, which every chonkie provider
-    (OpenAI, Azure, Cohere, SentenceTransformers, etc.) optimizes
-    into fewer API calls / GPU passes than sequential ``embed``.
+    For API-based models (``://`` in the model string) this uses
+    ``embed_batch`` to collapse many network round-trips into one.
+    For local models (SentenceTransformers) it falls back to sequential
+    ``embed`` calls to avoid padding overhead.
    """
    if not texts:
        return []
    truncated = [truncate_for_embedding(t) for t in texts]
+    if config.is_local_embedding_model:
+        return [config.embedding_model_instance.embed(t) for t in truncated]
    return config.embedding_model_instance.embed_batch(truncated)