refactor: updated chonkie & removed junk code

- Removed all references to the Serper API from the codebase, including related components, hooks, and schemas.
- Updated `pyproject.toml` to reflect the new version of `chonkie` and other dependencies.
- Cleaned up the configuration and connector management to streamline the application.
2026-06-24 21:38:09 +02:00 · 2025-12-19 23:26:45 -08:00 · 2025-12-19 23:26:45 -08:00 · 086048a4db
commit 086048a4db
parent 70ca585379
18 changed files with 344 additions and 701 deletions
--- a/surfsense_backend/app/config/init.py
+++ b/surfsense_backend/app/config/init.py
@@ -1,69 +1,12 @@
 import os
 import shutil
 from pathlib import Path
-from typing import Any

 import yaml
 from chonkie import AutoEmbeddings, CodeChunker, RecursiveChunker
-from chonkie.embeddings.azure_openai import AzureOpenAIEmbeddings
-from chonkie.embeddings.registry import EmbeddingsRegistry
 from dotenv import load_dotenv
 from rerankers import Reranker

-
-# Monkey patch AzureOpenAIEmbeddings to fix parameter order issue
-# This is a temporary workaround until the upstream chonkie library is fixed
-class FixedAzureOpenAIEmbeddings(AzureOpenAIEmbeddings):
-    """Wrapper around AzureOpenAIEmbeddings with fixed parameter order."""
-
-    def __init__(
-        self,
-        model: str = "text-embedding-3-small",
-        azure_endpoint: str | None = None,
-        tokenizer: Any | None = None,
-        dimension: int | None = None,
-        azure_api_key: str | None = None,
-        api_version: str = "2024-10-21",
-        deployment: str | None = None,
-        max_retries: int = 3,
-        timeout: float = 60.0,
-        batch_size: int = 128,
-        **kwargs: dict[str, Any],
-    ):
-        """Initialize with model as first parameter to avoid conflicts."""
-        # Call parent's __init__ by explicitly passing azure_endpoint as first arg
-        # to maintain compatibility with the original signature
-        super().__init__(
-            azure_endpoint=azure_endpoint or os.getenv("AZURE_OPENAI_ENDPOINT", ""),
-            model=model,
-            tokenizer=tokenizer,
-            dimension=dimension,
-            azure_api_key=azure_api_key,
-            api_version=api_version,
-            deployment=deployment,
-            max_retries=max_retries,
-            timeout=timeout,
-            batch_size=batch_size,
-            **kwargs,
-        )
-
-
-# TODO: Fix this in chonkie upstream
-# Register our fixed Azure OpenAI embeddings with pattern
-# This automatically infers the following arguments from their corresponding environment variables if they are not provided:
-# - `api_key` from `AZURE_OPENAI_API_KEY`
-# - `organization` from `OPENAI_ORG_ID`
-# - `project` from `OPENAI_PROJECT_ID`
-# - `azure_ad_token` from `AZURE_OPENAI_AD_TOKEN`
-# - `api_version` from `OPENAI_API_VERSION`
-# - `azure_endpoint` from `AZURE_OPENAI_ENDPOINT`
-EmbeddingsRegistry.register_provider("azure_openai", FixedAzureOpenAIEmbeddings)
-EmbeddingsRegistry.register_pattern(r"^text-embedding-", FixedAzureOpenAIEmbeddings)
-EmbeddingsRegistry.register_model("text-embedding-ada-002", FixedAzureOpenAIEmbeddings)
-EmbeddingsRegistry.register_model("text-embedding-3-small", FixedAzureOpenAIEmbeddings)
-EmbeddingsRegistry.register_model("text-embedding-3-large", FixedAzureOpenAIEmbeddings)
-
-
 # Get the base directory of the project
 BASE_DIR = Path(__file__).resolve().parent.parent.parent

--- a/surfsense_backend/app/schemas/google_auth_credentials.py
+++ b/surfsense_backend/app/schemas/google_auth_credentials.py
@ -1,18 +0,0 @@
-from datetime import UTC, datetime
-
-from pydantic import BaseModel
-
-
-class GoogleAuthCredentialsBase(BaseModel):
-    token: str
-    refresh_token: str
-    token_uri: str
-    client_id: str
-    expiry: datetime
-    scopes: list[str]
-    client_secret: str
-
-    @property
-    def expired(self) -> bool:
-        """Check if the credentials have expired."""
-        return self.expiry <= datetime.now(UTC)
--- a/surfsense_backend/app/tasks/podcast_tasks.py
+++ b/surfsense_backend/app/tasks/podcast_tasks.py
@@ -8,13 +8,6 @@ from app.db import Chat, Podcast
 from app.services.task_logging_service import TaskLoggingService


-async def generate_document_podcast(
-    session: AsyncSession, document_id: int, search_space_id: int, user_id: int
-):
-    # TODO: Need to fetch the document chunks, then concatenate them and pass them to the podcast generation model
-    pass
-
-
 async def generate_chat_podcast(
    session: AsyncSession,
    chat_id: int,
--- a/surfsense_backend/pyproject.toml
+++ b/surfsense_backend/pyproject.toml
@@ -41,7 +41,6 @@ dependencies = [
    "celery[redis]>=5.5.3",
    "flower>=2.0.1",
    "redis>=5.2.1",
-    "chonkie[all]>=1.4.0",
    "firecrawl-py>=4.9.0",
    "boto3>=1.35.0",
    "langchain-community>=0.3.31",
@@ -54,6 +53,7 @@ dependencies = [
    "deepagents>=0.3.0",
    "trafilatura>=2.0.0",
    "fastapi-users[oauth,sqlalchemy]>=15.0.3",
+    "chonkie[all]>=1.5.0",
 ]

 [dependency-groups]
--- a/surfsense_backend/uv.lock
+++ b/surfsense_backend/uv.lock