diff --git a/surfsense_backend/app/agents/podcaster/utils.py b/surfsense_backend/app/agents/podcaster/utils.py
index 1a460827b..96ea1d51e 100644
--- a/surfsense_backend/app/agents/podcaster/utils.py
+++ b/surfsense_backend/app/agents/podcaster/utils.py
@@ -63,6 +63,17 @@ def get_voice_for_provider(provider: str, speaker_id: int) -> dict | str:
             },
         }
         return vertex_voices.get(speaker_id, vertex_voices[0])
+    elif provider_type == "azure":
+        # Azure OpenAI voice mapping - simple string values
+        azure_voices = {
+            0: "alloy",  # Default/intro voice
+            1: "echo",  # First speaker
+            2: "fable",  # Second speaker
+            3: "onyx",  # Third speaker
+            4: "nova",  # Fourth speaker
+            5: "shimmer",  # Fifth speaker
+        }
+        return azure_voices.get(speaker_id, "alloy")
 
     else:
         # Default fallback to OpenAI format for unknown providers
diff --git a/surfsense_backend/app/config/__init__.py b/surfsense_backend/app/config/__init__.py
index ec78dfbac..548a22031 100644
--- a/surfsense_backend/app/config/__init__.py
+++ b/surfsense_backend/app/config/__init__.py
@@ -1,6 +1,7 @@
 import os
 import shutil
 from pathlib import Path
+from typing import Any
 
 from chonkie import AutoEmbeddings, CodeChunker, RecursiveChunker
 from chonkie.embeddings.azure_openai import AzureOpenAIEmbeddings
@@ -8,8 +9,46 @@ from chonkie.embeddings.registry import EmbeddingsRegistry
 from dotenv import load_dotenv
 from rerankers import Reranker
+
+# Monkey patch AzureOpenAIEmbeddings to fix parameter order issue
+# This is a temporary workaround until the upstream chonkie library is fixed
+class FixedAzureOpenAIEmbeddings(AzureOpenAIEmbeddings):
+    """Wrapper around AzureOpenAIEmbeddings with fixed parameter order."""
+
+    def __init__(
+        self,
+        model: str = "text-embedding-3-small",
+        azure_endpoint: str | None = None,
+        tokenizer: Any | None = None,
+        dimension: int | None = None,
+        azure_api_key: str | None = None,
+        api_version: str = "2024-10-21",
+        deployment: str | None = None,
+        max_retries: int = 3,
+        timeout: float = 60.0,
+        batch_size: int = 128,
+        **kwargs: dict[str, Any],
+    ):
+        """Initialize with model as first parameter to avoid conflicts."""
+        # Delegate to the parent entirely via keyword arguments, resolving
+        # azure_endpoint from AZURE_OPENAI_ENDPOINT when it is not supplied
+        super().__init__(
+            azure_endpoint=azure_endpoint or os.getenv("AZURE_OPENAI_ENDPOINT", ""),
+            model=model,
+            tokenizer=tokenizer,
+            dimension=dimension,
+            azure_api_key=azure_api_key,
+            api_version=api_version,
+            deployment=deployment,
+            max_retries=max_retries,
+            timeout=timeout,
+            batch_size=batch_size,
+            **kwargs,
+        )
+
+
 
 # TODO: Fix this in chonkie upstream
-# Register Azure OpenAI embeddings with pattern
+# Register our fixed Azure OpenAI embeddings with pattern
 # This automatically infers the following arguments from their corresponding environment variables if they are not provided:
 # - `api_key` from `AZURE_OPENAI_API_KEY`
 # - `organization` from `OPENAI_ORG_ID`
@@ -17,11 +56,11 @@ from rerankers import Reranker
 # - `azure_ad_token` from `AZURE_OPENAI_AD_TOKEN`
 # - `api_version` from `OPENAI_API_VERSION`
 # - `azure_endpoint` from `AZURE_OPENAI_ENDPOINT`
-EmbeddingsRegistry.register_provider("azure_openai", AzureOpenAIEmbeddings)
-EmbeddingsRegistry.register_pattern(r"^text-embedding-", AzureOpenAIEmbeddings)
-EmbeddingsRegistry.register_model("text-embedding-ada-002", AzureOpenAIEmbeddings)
-EmbeddingsRegistry.register_model("text-embedding-3-small", AzureOpenAIEmbeddings)
-EmbeddingsRegistry.register_model("text-embedding-3-large", AzureOpenAIEmbeddings)
+EmbeddingsRegistry.register_provider("azure_openai", FixedAzureOpenAIEmbeddings)
+EmbeddingsRegistry.register_pattern(r"^text-embedding-", FixedAzureOpenAIEmbeddings)
+EmbeddingsRegistry.register_model("text-embedding-ada-002", FixedAzureOpenAIEmbeddings)
+EmbeddingsRegistry.register_model("text-embedding-3-small", FixedAzureOpenAIEmbeddings)
+EmbeddingsRegistry.register_model("text-embedding-3-large", FixedAzureOpenAIEmbeddings)
 
 # Get the base directory of the project
@@ -83,7 +122,21 @@ class Config:
 
     # Chonkie Configuration | Edit this to your needs
     EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL")
-    embedding_model_instance = AutoEmbeddings.get_embeddings(EMBEDDING_MODEL)
+    # Azure OpenAI credentials from environment variables
+    AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
+    AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
+
+    # Pass Azure credentials to embeddings when using Azure OpenAI
+    embedding_kwargs = {}
+    if AZURE_OPENAI_ENDPOINT:
+        embedding_kwargs["azure_endpoint"] = AZURE_OPENAI_ENDPOINT
+    if AZURE_OPENAI_API_KEY:
+        embedding_kwargs["azure_api_key"] = AZURE_OPENAI_API_KEY
+
+    embedding_model_instance = AutoEmbeddings.get_embeddings(
+        EMBEDDING_MODEL,
+        **embedding_kwargs,
+    )
     chunker_instance = RecursiveChunker(
         chunk_size=getattr(embedding_model_instance, "max_seq_length", 512)
     )
diff --git a/surfsense_backend/app/services/page_limit_service.py b/surfsense_backend/app/services/page_limit_service.py
index bc3f87f18..080d05b5d 100644
--- a/surfsense_backend/app/services/page_limit_service.py
+++ b/surfsense_backend/app/services/page_limit_service.py
@@ -98,7 +98,7 @@ class PageLimitService:
 
         # Get user
         result = await self.session.execute(select(User).where(User.id == user_id))
-        user = result.scalar_one_or_none()
+        user = result.unique().scalar_one_or_none()
 
         if not user:
             raise ValueError(f"User with ID {user_id} not found")
diff --git a/surfsense_backend/scripts/docker/entrypoint.sh b/surfsense_backend/scripts/docker/entrypoint.sh
index 32754d3e9..969970541 100644
--- a/surfsense_backend/scripts/docker/entrypoint.sh
+++ b/surfsense_backend/scripts/docker/entrypoint.sh
@@ -19,7 +19,7 @@ backend_pid=$!
 sleep 5
 
 echo "Starting Celery Worker..."
-celery -A app.celery_app worker --loglevel=info --concurrency=1 --pool=solo &
+celery -A app.celery_app worker --loglevel=info &
 celery_worker_pid=$!
 
 # Wait a bit for worker to initialize