diff --git a/api/requirements.txt b/api/requirements.txt
index 9b49c1d..2aca2b6 100644
--- a/api/requirements.txt
+++ b/api/requirements.txt
@@ -14,5 +14,4 @@ sentry-sdk[fastapi]==2.38.0
sqlalchemy[asyncio]==2.0.43
msgpack==1.1.2
docling[rapidocr]==2.68.0
-sentence-transformers==5.2.0
pgvector==0.4.2
diff --git a/api/routes/knowledge_base.py b/api/routes/knowledge_base.py
index 4553efd..fbe4381 100644
--- a/api/routes/knowledge_base.py
+++ b/api/routes/knowledge_base.py
@@ -103,9 +103,8 @@ async def process_document(
The document status will be updated from 'pending' -> 'processing' -> 'completed' or 'failed'.
- Embedding Services:
- * openai (default): High-quality 1536-dimensional embeddings (requires OPENAI_API_KEY)
- * sentence_transformer: Free, offline-capable, 384-dimensional embeddings
+ Embedding:
+ Uses OpenAI embeddings. The model is taken from the user's embeddings configuration and defaults to text-embedding-3-small (1536-dimensional). Requires an API key configured in Model Configurations > Embedding.
Access Control:
* Users can only process documents in their organization.
@@ -134,12 +133,11 @@ async def process_document(
request.s3_key,
user.selected_organization_id,
128, # max_tokens (default)
- request.embedding_service,
)
logger.info(
f"Created document {request.document_uuid} (id={document.id}) and enqueued processing "
- f"with {request.embedding_service} embeddings, org {user.selected_organization_id}"
+ f"with OpenAI embeddings, org {user.selected_organization_id}"
)
return DocumentResponseSchema(
diff --git a/api/schemas/knowledge_base.py b/api/schemas/knowledge_base.py
index 0cb1a72..363c7d0 100644
--- a/api/schemas/knowledge_base.py
+++ b/api/schemas/knowledge_base.py
@@ -1,7 +1,7 @@
"""Pydantic schemas for knowledge base operations."""
from datetime import datetime
-from typing import Any, Dict, List, Literal, Optional
+from typing import Any, Dict, List, Optional
from pydantic import BaseModel, Field
@@ -29,11 +29,6 @@ class ProcessDocumentRequestSchema(BaseModel):
document_uuid: str = Field(..., description="Document UUID to process")
s3_key: str = Field(..., description="S3 key of the uploaded file")
- embedding_service: Literal["sentence_transformer", "openai"] = Field(
- default="openai",
- description="Embedding service to use for processing. "
- "Options: 'openai' (default, 1536-dim, requires API key) or 'sentence_transformer' (free, 384-dim)",
- )
class DocumentResponseSchema(BaseModel):
diff --git a/api/services/gen_ai/__init__.py b/api/services/gen_ai/__init__.py
index 9a25a6f..4d5b8fe 100644
--- a/api/services/gen_ai/__init__.py
+++ b/api/services/gen_ai/__init__.py
@@ -4,14 +4,12 @@ from .embedding import (
BaseEmbeddingService,
EmbeddingAPIKeyNotConfiguredError,
OpenAIEmbeddingService,
- SentenceTransformerEmbeddingService,
)
from .json_parser import parse_llm_json
__all__ = [
"BaseEmbeddingService",
"EmbeddingAPIKeyNotConfiguredError",
- "SentenceTransformerEmbeddingService",
"OpenAIEmbeddingService",
"parse_llm_json",
]
diff --git a/api/services/gen_ai/embedding/__init__.py b/api/services/gen_ai/embedding/__init__.py
index c47c66a..f6a4f18 100644
--- a/api/services/gen_ai/embedding/__init__.py
+++ b/api/services/gen_ai/embedding/__init__.py
@@ -2,11 +2,9 @@
from .base import BaseEmbeddingService
from .openai_service import EmbeddingAPIKeyNotConfiguredError, OpenAIEmbeddingService
-from .sentence_transformer_service import SentenceTransformerEmbeddingService
__all__ = [
"BaseEmbeddingService",
"EmbeddingAPIKeyNotConfiguredError",
- "SentenceTransformerEmbeddingService",
"OpenAIEmbeddingService",
]
diff --git a/api/services/gen_ai/embedding/sentence_transformer_service.py b/api/services/gen_ai/embedding/sentence_transformer_service.py
deleted file mode 100644
index f2773a2..0000000
--- a/api/services/gen_ai/embedding/sentence_transformer_service.py
+++ /dev/null
@@ -1,350 +0,0 @@
-"""Sentence Transformer embedding service.
-
-This module provides document processing capabilities using:
-- Sentence-transformers for embeddings (all-MiniLM-L6-v2)
-- Docling for document conversion and chunking
-- pgvector for vector similarity search
-
-Setup for offline usage:
-1. First run: Downloads and caches models to ~/.cache/sentence_transformers
-2. Subsequent runs: Uses cached models (no internet needed)
-3. For fully offline mode: Set TRANSFORMERS_OFFLINE=1 and HF_HUB_OFFLINE=1
-"""
-
-import os
-from pathlib import Path
-from typing import Any, Dict, List, Optional
-
-from docling.chunking import HybridChunker
-from docling.document_converter import DocumentConverter
-from docling_core.transforms.chunker.tokenizer.huggingface import HuggingFaceTokenizer
-from loguru import logger
-from sentence_transformers import SentenceTransformer
-from transformers import AutoTokenizer
-
-from api.db.db_client import DBClient
-from api.db.models import KnowledgeBaseChunkModel
-
-from .base import BaseEmbeddingService
-
-# Set environment variables for model caching
-os.environ.setdefault("TRANSFORMERS_OFFLINE", "0")
-os.environ.setdefault("HF_HUB_OFFLINE", "0")
-os.environ.setdefault(
- "SENTENCE_TRANSFORMERS_HOME", os.path.expanduser("~/.cache/sentence_transformers")
-)
-
-# Model configuration
-DEFAULT_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"
-EMBEDDING_DIMENSION = 384 # Dimension for all-MiniLM-L6-v2
-
-
-class SentenceTransformerEmbeddingService(BaseEmbeddingService):
- """Embedding service using Sentence Transformers."""
-
- def __init__(
- self,
- db_client: DBClient,
- model_id: str = DEFAULT_MODEL_ID,
- max_tokens: int = 512,
- ):
- """Initialize the Sentence Transformer embedding service.
-
- Args:
- db_client: Database client for storing documents and chunks
- model_id: Sentence-transformers model ID (default: all-MiniLM-L6-v2)
- max_tokens: Maximum number of tokens per chunk (default: 512)
- Note: This applies to the contextualized text (with headings/captions)
- """
- self.db = db_client
- self.model_id = model_id
- self.max_tokens = max_tokens
-
- # Initialize embedding model
- logger.info(f"Loading embedding model: {model_id}")
- try:
- # Try to load from cache first (local_files_only=True)
- self.embedding_model = SentenceTransformer(
- model_id,
- cache_folder=os.environ.get("SENTENCE_TRANSFORMERS_HOME"),
- local_files_only=True,
- )
- logger.info("Loaded model from cache")
- except Exception as e:
- logger.warning(f"Model not in cache, downloading: {e}")
- # If not in cache, download it (this will cache it for next time)
- self.embedding_model = SentenceTransformer(
- model_id,
- cache_folder=os.environ.get("SENTENCE_TRANSFORMERS_HOME"),
- )
- logger.info("Model downloaded and cached")
-
- # Initialize tokenizer for chunking with max_tokens
- logger.info(f"Loading tokenizer: {model_id} with max_tokens={max_tokens}")
- try:
- # Try to load from cache first
- self.tokenizer = HuggingFaceTokenizer(
- tokenizer=AutoTokenizer.from_pretrained(
- model_id,
- local_files_only=True,
- ),
- max_tokens=max_tokens,
- )
- logger.info("Loaded tokenizer from cache")
- except Exception as e:
- logger.warning(f"Tokenizer not in cache, downloading: {e}")
- # If not in cache, download it
- self.tokenizer = HuggingFaceTokenizer(
- tokenizer=AutoTokenizer.from_pretrained(model_id),
- max_tokens=max_tokens,
- )
- logger.info("Tokenizer downloaded and cached")
-
- # Initialize chunker
- logger.info(f"Initializing HybridChunker with max_tokens={max_tokens}")
- self.chunker = HybridChunker(tokenizer=self.tokenizer)
-
- # Initialize document converter
- self.converter = DocumentConverter()
-
- def get_model_id(self) -> str:
- """Return the model identifier."""
- return self.model_id
-
- def get_embedding_dimension(self) -> int:
- """Return the embedding dimension."""
- return EMBEDDING_DIMENSION
-
- async def embed_texts(self, texts: List[str]) -> List[List[float]]:
- """Embed a batch of texts.
-
- Args:
- texts: List of text strings to embed
-
- Returns:
- List of embedding vectors (each vector is a list of floats)
- """
- embeddings = self.embedding_model.encode(
- texts,
- show_progress_bar=False,
- convert_to_numpy=True,
- )
- return [embedding.tolist() for embedding in embeddings]
-
- async def embed_query(self, query: str) -> List[float]:
- """Embed a single query text.
-
- Args:
- query: Query text to embed
-
- Returns:
- Embedding vector as list of floats
- """
- embedding = self.embedding_model.encode([query])[0]
- return embedding.tolist()
-
- async def search_similar_chunks(
- self,
- query: str,
- organization_id: int,
- limit: int = 5,
- document_uuids: Optional[List[str]] = None,
- ) -> List[Dict[str, Any]]:
- """Search for similar chunks using vector similarity.
-
- Returns top-k most similar chunks without any threshold filtering.
- Apply similarity thresholds and reranking at the application layer.
-
- Args:
- query: Search query text
- organization_id: Organization ID for scoping
- limit: Maximum number of results to return
- document_uuids: Optional list of document UUIDs to filter by
-
- Returns:
- List of dictionaries with chunk data and similarity scores
- """
- # Generate query embedding
- query_embedding = await self.embed_query(query)
-
- # Perform vector similarity search
- results = await self.db.search_similar_chunks(
- query_embedding=query_embedding,
- organization_id=organization_id,
- limit=limit,
- document_uuids=document_uuids,
- embedding_model=self.model_id,
- )
-
- return results
-
- async def process_document(
- self,
- file_path: str,
- organization_id: int,
- created_by: int,
- custom_metadata: dict = None,
- ):
- """Process a document: convert, chunk, embed, and store in database.
-
- Args:
- file_path: Path to the document file
- organization_id: Organization ID for scoping
- created_by: User ID who uploaded the document
- custom_metadata: Optional custom metadata dictionary
-
- Returns:
- The created document record
- """
- try:
- # Extract file metadata
- filename = Path(file_path).name
- file_hash = self.db.compute_file_hash(file_path)
- file_size = os.path.getsize(file_path)
- mime_type = self.db.get_mime_type(file_path)
-
- # Check if document already exists
- existing_doc = await self.db.get_document_by_hash(
- file_hash, organization_id
- )
- if existing_doc:
- logger.info(f"Document already exists: {filename} (hash: {file_hash})")
- return existing_doc
-
- # Create document record
- doc_record = await self.db.create_document(
- organization_id=organization_id,
- created_by=created_by,
- filename=filename,
- file_size_bytes=file_size,
- file_hash=file_hash,
- mime_type=mime_type,
- custom_metadata=custom_metadata or {},
- )
-
- logger.info(f"Processing document: {filename}")
-
- # Update status to processing
- await self.db.update_document_status(doc_record.id, "processing")
-
- # Step 1: Convert document using docling
- logger.info("Converting document with docling...")
- conversion_result = self.converter.convert(file_path)
- doc = conversion_result.document
-
- # Store docling metadata
- docling_metadata = {
- "num_pages": len(doc.pages) if hasattr(doc, "pages") else None,
- "document_type": type(doc).__name__,
- }
-
- # Step 2: Chunk the document
- logger.info(f"Chunking document with max_tokens={self.max_tokens}...")
- chunks = list(self.chunker.chunk(dl_doc=doc))
- total_chunks = len(chunks)
-
- logger.info(f"Generated {total_chunks} chunks")
-
- # Step 3: Process each chunk
- chunk_texts = []
- chunk_records = []
- token_counts = []
-
- for i, chunk in enumerate(chunks):
- # Get chunk text
- chunk_text = chunk.text
-
- # Get contextualized text (enriched with surrounding context)
- contextualized_text = self.chunker.contextualize(chunk=chunk)
-
- # Calculate actual token count using the tokenizer
- text_to_tokenize = (
- contextualized_text if contextualized_text else chunk_text
- )
- token_count = len(
- self.tokenizer.tokenizer.encode(
- text_to_tokenize, add_special_tokens=False
- )
- )
- token_counts.append(token_count)
-
- # Prepare chunk metadata
- chunk_metadata = {}
- if hasattr(chunk, "meta") and chunk.meta:
- chunk_metadata = {
- "doc_items": (
- [str(item) for item in chunk.meta.doc_items]
- if hasattr(chunk.meta, "doc_items")
- else []
- ),
- "headings": (
- chunk.meta.headings
- if hasattr(chunk.meta, "headings")
- else []
- ),
- }
-
- # Create chunk record (without embedding yet)
- chunk_record = KnowledgeBaseChunkModel(
- document_id=doc_record.id,
- organization_id=organization_id,
- chunk_text=chunk_text,
- contextualized_text=contextualized_text,
- chunk_index=i,
- chunk_metadata=chunk_metadata,
- embedding_model=self.model_id,
- embedding_dimension=EMBEDDING_DIMENSION,
- token_count=token_count,
- )
-
- chunk_records.append(chunk_record)
- # Use contextualized text for embedding if available
- chunk_texts.append(text_to_tokenize)
-
- # Log chunk statistics
- if token_counts:
- avg_tokens = sum(token_counts) / len(token_counts)
- min_tokens = min(token_counts)
- max_tokens = max(token_counts)
- logger.info("Chunk token statistics:")
- logger.info(f" - Average: {avg_tokens:.1f} tokens")
- logger.info(f" - Min: {min_tokens} tokens")
- logger.info(f" - Max: {max_tokens} tokens")
-
- # Step 4: Generate embeddings in batch
- logger.info("Generating embeddings...")
- embeddings = await self.embed_texts(chunk_texts)
-
- # Step 5: Attach embeddings to chunk records
- for chunk_record, embedding in zip(chunk_records, embeddings):
- chunk_record.embedding = embedding
-
- # Step 6: Save all chunks in batch
- logger.info("Storing chunks in database...")
- await self.db.create_chunks_batch(chunk_records)
-
- # Update document status to completed
- await self.db.update_document_status(
- doc_record.id,
- "completed",
- total_chunks=total_chunks,
- docling_metadata=docling_metadata,
- )
-
- logger.info(f"Successfully processed document: {filename}")
- logger.info(f" - Total chunks: {total_chunks}")
- logger.info(f" - Document ID: {doc_record.id}")
- logger.info(f" - Document UUID: {doc_record.document_uuid}")
-
- return doc_record
-
- except Exception as e:
- logger.error(f"Error processing document: {e}")
-
- # Update document status to failed if it exists
- if "doc_record" in locals():
- await self.db.update_document_status(
- doc_record.id, "failed", error_message=str(e)
- )
-
- raise
diff --git a/api/tasks/knowledge_base_processing.py b/api/tasks/knowledge_base_processing.py
index 7bef401..9dc925b 100644
--- a/api/tasks/knowledge_base_processing.py
+++ b/api/tasks/knowledge_base_processing.py
@@ -2,7 +2,6 @@
import os
import tempfile
-from typing import Literal
from docling.chunking import HybridChunker
from docling.document_converter import DocumentConverter
@@ -12,13 +11,10 @@ from transformers import AutoTokenizer
from api.db import db_client
from api.db.models import KnowledgeBaseChunkModel
-from api.services.gen_ai import (
- OpenAIEmbeddingService,
- SentenceTransformerEmbeddingService,
-)
+from api.services.gen_ai import OpenAIEmbeddingService
from api.services.storage import storage_fs
-# For tokenization/chunking - use SentenceTransformer tokenizer as baseline
+# Tokenizer model used only for tokenization/chunking; embeddings are generated by the OpenAI embedding service
TOKENIZER_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
@@ -28,7 +24,6 @@ async def process_knowledge_base_document(
s3_key: str,
organization_id: int,
max_tokens: int = 128,
- embedding_service: Literal["sentence_transformer", "openai"] = "openai",
):
"""Process a knowledge base document: download, chunk, embed, and store.
@@ -38,9 +33,6 @@ async def process_knowledge_base_document(
s3_key: S3 key where the file is stored
organization_id: Organization ID
max_tokens: Maximum number of tokens per chunk (default: 128)
- embedding_service: Embedding service to use (default: "openai")
- - "openai": Use OpenAI text-embedding-3-small (1536-dim, requires API key)
- - "sentence_transformer": Use SentenceTransformer (all-MiniLM-L6-v2, 384-dim, free)
"""
logger.info(
f"Starting knowledge base document processing for document_id={document_id}, "
@@ -125,56 +117,42 @@ async def process_knowledge_base_document(
mime_type=mime_type,
)
- # Initialize the embedding service based on the parameter
- if embedding_service == "openai":
- logger.info(
- f"Initializing OpenAI embedding service with max_tokens={max_tokens}"
+ # Initialize the OpenAI embedding service
+ logger.info(
+ f"Initializing OpenAI embedding service with max_tokens={max_tokens}"
+ )
+ # Try to get user's embeddings configuration
+ embeddings_api_key = None
+ embeddings_model = None
+ if document.created_by:
+ user_config = await db_client.get_user_configurations(
+ document.created_by
)
- # Try to get user's embeddings configuration
- embeddings_api_key = None
- embeddings_model = None
- if document.created_by:
- user_config = await db_client.get_user_configurations(
- document.created_by
+ if user_config.embeddings:
+ embeddings_api_key = user_config.embeddings.api_key
+ embeddings_model = user_config.embeddings.model
+ logger.info(
+ f"Using user embeddings config: model={embeddings_model}"
)
- if user_config.embeddings:
- embeddings_api_key = user_config.embeddings.api_key
- embeddings_model = user_config.embeddings.model
- logger.info(
- f"Using user embeddings config: model={embeddings_model}"
- )
- # Check if API key is configured
- if not embeddings_api_key:
- error_message = (
- "OpenAI API key not configured. Please set your API key in "
- "Model Configurations > Embedding to process documents."
- )
- logger.warning(f"Document {document_id}: {error_message}")
- await db_client.update_document_status(
- document_id, "failed", error_message=error_message
- )
- return
+ # Check if API key is configured
+ if not embeddings_api_key:
+ error_message = (
+ "OpenAI API key not configured. Please set your API key in "
+ "Model Configurations > Embedding to process documents."
+ )
+ logger.warning(f"Document {document_id}: {error_message}")
+ await db_client.update_document_status(
+ document_id, "failed", error_message=error_message
+ )
+ return
- service = OpenAIEmbeddingService(
- db_client=db_client,
- max_tokens=max_tokens,
- api_key=embeddings_api_key,
- model_id=embeddings_model or "text-embedding-3-small",
- )
- elif embedding_service == "sentence_transformer":
- logger.info(
- f"Initializing SentenceTransformer embedding service with max_tokens={max_tokens}"
- )
- service = SentenceTransformerEmbeddingService(
- db_client=db_client,
- max_tokens=max_tokens,
- )
- else:
- raise ValueError(
- f"Invalid embedding_service: {embedding_service}. "
- f"Must be 'sentence_transformer' or 'openai'"
- )
+ service = OpenAIEmbeddingService(
+ db_client=db_client,
+ max_tokens=max_tokens,
+ api_key=embeddings_api_key,
+ model_id=embeddings_model or "text-embedding-3-small",
+ )
# Step 1: Convert document with docling
logger.info("Converting document with docling")
@@ -265,8 +243,8 @@ async def process_knowledge_base_document(
logger.info(f" - Min: {min_tokens} tokens")
logger.info(f" - Max: {max_tokens_actual} tokens")
- # Step 6: Generate embeddings using the embedding service
- logger.info(f"Generating embeddings using {embedding_service}")
+ # Step 6: Generate embeddings using OpenAI
+ logger.info(f"Generating embeddings using {service.get_model_id()}")
embeddings = await service.embed_texts(chunk_texts)
# Step 7: Attach embeddings to chunk records
diff --git a/docs/deployment/custom-domain.mdx b/docs/deployment/custom-domain.mdx
index e2bedbb..b25dacc 100644
--- a/docs/deployment/custom-domain.mdx
+++ b/docs/deployment/custom-domain.mdx
@@ -54,7 +54,14 @@ You should see your server's IP address in the response.
## Step 2: Quick Setup (Recommended)
-Once your DNS is configured, run the automated setup script that handles the rest:
+Once your DNS is configured, run the automated setup script that handles the rest.
+
+
+Run this command from the same directory where you ran `setup_remote.sh`. That directory should contain the `dograh/` directory with the artifacts created by `setup_remote.sh`.
+
+
+Do not move the `dograh/` directory to a different location after running `setup_custom_domain.sh`: the script installs an automatic certificate renewal script as a certbot renewal hook that points to the `dograh/` directory.
+
```bash
curl -o setup_custom_domain.sh https://raw.githubusercontent.com/dograh-hq/dograh/main/scripts/setup_custom_domain.sh && chmod +x setup_custom_domain.sh && sudo ./setup_custom_domain.sh
diff --git a/docs/deployment/docker.mdx b/docs/deployment/docker.mdx
index 2936f0a..abc6c6e 100644
--- a/docs/deployment/docker.mdx
+++ b/docs/deployment/docker.mdx
@@ -106,6 +106,7 @@ The setup script creates the following files in the `dograh/` directory:
| File | Purpose |
|------|---------|
| `docker-compose.yaml` | Main Docker Compose configuration |
+| `turnserver.conf` | TURN server configuration |
| `nginx.conf` | nginx reverse proxy configuration with your IP |
| `generate_certificate.sh` | Script to regenerate SSL certificates |
| `certs/local.crt` | Self-signed SSL certificate |
diff --git a/scripts/setup_custom_domain.sh b/scripts/setup_custom_domain.sh
index 4b6e2b4..90fb386 100755
--- a/scripts/setup_custom_domain.sh
+++ b/scripts/setup_custom_domain.sh
@@ -245,8 +245,21 @@ server {
NGINX_EOF
echo -e "${GREEN}✓ nginx.conf updated${NC}"
+# Update .env file with domain name
+echo -e "${BLUE}[6/8] Updating environment variables...${NC}"
+if [[ -f ".env" ]]; then
+ # Update BACKEND_API_ENDPOINT to use domain
+ sed -i.bak "s|^BACKEND_API_ENDPOINT=.*|BACKEND_API_ENDPOINT=https://$DOMAIN_NAME|" .env
+ # Update TURN_HOST to use domain
+ sed -i.bak "s|^TURN_HOST=.*|TURN_HOST=$DOMAIN_NAME|" .env
+ rm -f .env.bak
+ echo -e "${GREEN}✓ .env updated with domain name${NC}"
+else
+ echo -e "${YELLOW}⚠ .env file not found - skipping environment update${NC}"
+fi
+
# Setup auto-renewal
-echo -e "${BLUE}[6/7] Setting up automatic certificate renewal...${NC}"
+echo -e "${BLUE}[7/8] Setting up automatic certificate renewal...${NC}"
DOGRAH_PATH=$(pwd)
# Create renewal hook script that copies new certificates and restarts nginx
@@ -268,7 +281,7 @@ certbot renew --dry-run --quiet && echo -e "${GREEN}✓ Auto-renewal configured
# Start Dograh services
echo ""
-echo -e "${BLUE}[7/7] Starting Dograh services...${NC}"
+echo -e "${BLUE}[8/8] Starting Dograh services...${NC}"
docker compose --profile remote up -d --pull always
echo ""
@@ -287,6 +300,7 @@ echo -e " Auto-renewal: Enabled (certificates renew automatically)"
echo ""
echo -e "${YELLOW}Files modified:${NC}"
echo " - dograh/nginx.conf (updated with domain name)"
+echo " - dograh/.env (BACKEND_API_ENDPOINT and TURN_HOST updated)"
echo " - dograh/certs/local.crt (SSL certificate)"
echo " - dograh/certs/local.key (SSL private key)"
echo " - /etc/letsencrypt/renewal-hooks/deploy/dograh-reload.sh (renewal hook)"