mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-24 21:38:09 +02:00
refactor: updated chonkie & removed junk code
- Removed all references to the Serper API from the codebase, including related components, hooks, and schemas. - Updated the `pyproject.toml` to reflect the new version of `chonkie` and other dependencies. - Cleaned up the configuration and connector management to streamline the application.
This commit is contained in:
parent
70ca585379
commit
086048a4db
18 changed files with 344 additions and 701 deletions
|
|
@ -1,69 +1,12 @@
|
|||
import os
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import yaml
|
||||
from chonkie import AutoEmbeddings, CodeChunker, RecursiveChunker
|
||||
from chonkie.embeddings.azure_openai import AzureOpenAIEmbeddings
|
||||
from chonkie.embeddings.registry import EmbeddingsRegistry
|
||||
from dotenv import load_dotenv
|
||||
from rerankers import Reranker
|
||||
|
||||
|
||||
# Monkey patch AzureOpenAIEmbeddings to fix parameter order issue
|
||||
# This is a temporary workaround until the upstream chonkie library is fixed
|
||||
class FixedAzureOpenAIEmbeddings(AzureOpenAIEmbeddings):
    """Azure OpenAI embeddings whose constructor accepts ``model`` first.

    Temporary workaround for a parameter-order issue in the upstream
    ``AzureOpenAIEmbeddings`` constructor: every argument is forwarded to the
    parent by keyword, so the parent's positional order no longer matters to
    callers of this class.
    """

    def __init__(
        self,
        model: str = "text-embedding-3-small",
        azure_endpoint: str | None = None,
        tokenizer: Any | None = None,
        dimension: int | None = None,
        azure_api_key: str | None = None,
        api_version: str = "2024-10-21",
        deployment: str | None = None,
        max_retries: int = 3,
        timeout: float = 60.0,
        batch_size: int = 128,
        **kwargs: Any,
    ) -> None:
        """Initialize the embeddings with ``model`` as the first parameter.

        Args:
            model: Embedding model name.
            azure_endpoint: Azure endpoint. When falsy, the value of the
                ``AZURE_OPENAI_ENDPOINT`` environment variable is used, or an
                empty string if that variable is unset.
            tokenizer: Forwarded unchanged to the parent constructor.
            dimension: Forwarded unchanged to the parent constructor.
            azure_api_key: Forwarded unchanged to the parent constructor.
            api_version: Forwarded unchanged to the parent constructor.
            deployment: Forwarded unchanged to the parent constructor.
            max_retries: Forwarded unchanged to the parent constructor.
            timeout: Forwarded unchanged to the parent constructor.
            batch_size: Forwarded unchanged to the parent constructor.
            **kwargs: Any additional keyword arguments, forwarded unchanged.
        """
        # Keyword-only forwarding sidesteps the parent's positional ordering,
        # where azure_endpoint (not model) comes first.
        super().__init__(
            azure_endpoint=azure_endpoint or os.getenv("AZURE_OPENAI_ENDPOINT", ""),
            model=model,
            tokenizer=tokenizer,
            dimension=dimension,
            azure_api_key=azure_api_key,
            api_version=api_version,
            deployment=deployment,
            max_retries=max_retries,
            timeout=timeout,
            batch_size=batch_size,
            **kwargs,
        )
|
||||
|
||||
|
||||
# TODO: Fix this in chonkie upstream
|
||||
# Register our fixed Azure OpenAI embeddings with pattern
|
||||
# This automatically infers the following arguments from their corresponding environment variables if they are not provided:
|
||||
# - `api_key` from `AZURE_OPENAI_API_KEY`
|
||||
# - `organization` from `OPENAI_ORG_ID`
|
||||
# - `project` from `OPENAI_PROJECT_ID`
|
||||
# - `azure_ad_token` from `AZURE_OPENAI_AD_TOKEN`
|
||||
# - `api_version` from `OPENAI_API_VERSION`
|
||||
# - `azure_endpoint` from `AZURE_OPENAI_ENDPOINT`
|
||||
# Point the "azure_openai" provider, the "text-embedding-" name pattern, and
# the three explicit model names at the patched embeddings class.
EmbeddingsRegistry.register_provider("azure_openai", FixedAzureOpenAIEmbeddings)
EmbeddingsRegistry.register_pattern(r"^text-embedding-", FixedAzureOpenAIEmbeddings)
for _model_name in (
    "text-embedding-ada-002",
    "text-embedding-3-small",
    "text-embedding-3-large",
):
    EmbeddingsRegistry.register_model(_model_name, FixedAzureOpenAIEmbeddings)
# Do not leave the loop variable behind as a module attribute.
del _model_name
|
||||
|
||||
|
||||
# Get the base directory of the project
|
||||
# Resolve this file's absolute path, then step up three directory levels.
BASE_DIR = Path(__file__).resolve().parent.parent.parent
|
||||
|
||||
|
|
|
|||
|
|
@ -1,18 +0,0 @@
|
|||
from datetime import UTC, datetime
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
|
||||
class GoogleAuthCredentialsBase(BaseModel):
    """Schema holding the fields of a Google OAuth credential set."""

    token: str
    refresh_token: str
    token_uri: str
    client_id: str
    expiry: datetime
    scopes: list[str]
    client_secret: str

    @property
    def expired(self) -> bool:
        """Check if the credentials have expired.

        Returns:
            True when ``expiry`` is at or before the current UTC time.
        """
        expiry = self.expiry
        # Comparing a naive datetime with an aware one raises TypeError, so a
        # naive expiry is given an explicit timezone first.
        # NOTE(review): naive values are assumed to be UTC - confirm against
        # whatever produces these credentials.
        if expiry.tzinfo is None or expiry.utcoffset() is None:
            expiry = expiry.replace(tzinfo=UTC)
        return expiry <= datetime.now(UTC)
|
||||
|
|
@ -8,13 +8,6 @@ from app.db import Chat, Podcast
|
|||
from app.services.task_logging_service import TaskLoggingService
|
||||
|
||||
|
||||
async def generate_document_podcast(
    session: AsyncSession,
    document_id: int,
    search_space_id: int,
    user_id: int,
):
    """Placeholder for document-based podcast generation; currently a no-op.

    TODO: Need to fetch the document chunks, then concatenate them and pass
    them to the podcast generation model.
    """
    return None
|
||||
|
||||
|
||||
async def generate_chat_podcast(
|
||||
session: AsyncSession,
|
||||
chat_id: int,
|
||||
|
|
|
|||
|
|
@ -41,7 +41,6 @@ dependencies = [
|
|||
"celery[redis]>=5.5.3",
|
||||
"flower>=2.0.1",
|
||||
"redis>=5.2.1",
|
||||
"chonkie[all]>=1.4.0",
|
||||
"firecrawl-py>=4.9.0",
|
||||
"boto3>=1.35.0",
|
||||
"langchain-community>=0.3.31",
|
||||
|
|
@ -54,6 +53,7 @@ dependencies = [
|
|||
"deepagents>=0.3.0",
|
||||
"trafilatura>=2.0.0",
|
||||
"fastapi-users[oauth,sqlalchemy]>=15.0.3",
|
||||
"chonkie[all]>=1.5.0",
|
||||
]
|
||||
|
||||
[dependency-groups]
|
||||
|
|
|
|||
599
surfsense_backend/uv.lock
generated
599
surfsense_backend/uv.lock
generated
File diff suppressed because it is too large
Load diff
Loading…
Add table
Add a link
Reference in a new issue