mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-26 21:39:43 +02:00
chore: updated chonkie and temp fix for azure embeddings registry
- TODO: Raise a PR upstream (chonkie) so the fix is included in the next version
This commit is contained in:
parent
5addc317f0
commit
71e4860495
4 changed files with 650 additions and 284 deletions
|
|
@ -39,6 +39,19 @@ AIRTABLE_CLIENT_SECRET=your_airtable_client_secret
|
||||||
AIRTABLE_REDIRECT_URI=http://localhost:8000/api/v1/auth/airtable/connector/callback
|
AIRTABLE_REDIRECT_URI=http://localhost:8000/api/v1/auth/airtable/connector/callback
|
||||||
|
|
||||||
# Embedding Model
|
# Embedding Model
|
||||||
|
# Examples:
|
||||||
|
# # Get sentence transformers embeddings
|
||||||
|
# embeddings = AutoEmbeddings.get_embeddings("sentence-transformers/all-MiniLM-L6-v2")
|
||||||
|
|
||||||
|
# # Get OpenAI embeddings
|
||||||
|
# embeddings = AutoEmbeddings.get_embeddings("openai://text-embedding-ada-002", api_key="...")
|
||||||
|
|
||||||
|
# # Get Anthropic embeddings
|
||||||
|
# embeddings = AutoEmbeddings.get_embeddings("anthropic://claude-v1", api_key="...")
|
||||||
|
|
||||||
|
# # Get Cohere embeddings
|
||||||
|
# embeddings = AutoEmbeddings.get_embeddings("cohere://embed-english-light-v3.0", api_key="...")
|
||||||
|
|
||||||
EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
|
EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
|
||||||
|
|
||||||
RERANKERS_MODEL_NAME=ms-marco-MiniLM-L-12-v2
|
RERANKERS_MODEL_NAME=ms-marco-MiniLM-L-12-v2
|
||||||
|
|
|
||||||
|
|
@ -3,9 +3,27 @@ import shutil
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from chonkie import AutoEmbeddings, CodeChunker, RecursiveChunker
|
from chonkie import AutoEmbeddings, CodeChunker, RecursiveChunker
|
||||||
|
from chonkie.embeddings.azure_openai import AzureOpenAIEmbeddings
|
||||||
|
from chonkie.embeddings.registry import EmbeddingsRegistry
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
from rerankers import Reranker
|
from rerankers import Reranker
|
||||||
|
|
||||||
|
# TODO: Temporary workaround for the Azure embeddings registry; remove once this is fixed in chonkie upstream
|
||||||
|
# Register Azure OpenAI embeddings by provider name, by model-name pattern, and by explicit model names
|
||||||
|
# This automatically infers the following arguments from their corresponding environment variables if they are not provided:
|
||||||
|
# - `api_key` from `AZURE_OPENAI_API_KEY`
|
||||||
|
# - `organization` from `OPENAI_ORG_ID`
|
||||||
|
# - `project` from `OPENAI_PROJECT_ID`
|
||||||
|
# - `azure_ad_token` from `AZURE_OPENAI_AD_TOKEN`
|
||||||
|
# - `api_version` from `OPENAI_API_VERSION`
|
||||||
|
# - `azure_endpoint` from `AZURE_OPENAI_ENDPOINT`
|
||||||
|
EmbeddingsRegistry.register_provider("azure_openai", AzureOpenAIEmbeddings)
|
||||||
|
EmbeddingsRegistry.register_pattern(r"^text-embedding-", AzureOpenAIEmbeddings)
|
||||||
|
EmbeddingsRegistry.register_model("text-embedding-ada-002", AzureOpenAIEmbeddings)
|
||||||
|
EmbeddingsRegistry.register_model("text-embedding-3-small", AzureOpenAIEmbeddings)
|
||||||
|
EmbeddingsRegistry.register_model("text-embedding-3-large", AzureOpenAIEmbeddings)
|
||||||
|
|
||||||
|
|
||||||
# Get the base directory of the project
|
# Get the base directory of the project
|
||||||
BASE_DIR = Path(__file__).resolve().parent.parent.parent
|
BASE_DIR = Path(__file__).resolve().parent.parent.parent
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -7,7 +7,6 @@ requires-python = ">=3.12"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"alembic>=1.13.0",
|
"alembic>=1.13.0",
|
||||||
"asyncpg>=0.30.0",
|
"asyncpg>=0.30.0",
|
||||||
"chonkie[all]>=1.0.6",
|
|
||||||
"discord-py>=2.5.2",
|
"discord-py>=2.5.2",
|
||||||
"docling>=2.15.0",
|
"docling>=2.15.0",
|
||||||
"fastapi>=0.115.8",
|
"fastapi>=0.115.8",
|
||||||
|
|
@ -48,6 +47,7 @@ dependencies = [
|
||||||
"celery[redis]>=5.5.3",
|
"celery[redis]>=5.5.3",
|
||||||
"flower>=2.0.1",
|
"flower>=2.0.1",
|
||||||
"redis>=5.2.1",
|
"redis>=5.2.1",
|
||||||
|
"chonkie[all]>=1.4.0",
|
||||||
]
|
]
|
||||||
|
|
||||||
[dependency-groups]
|
[dependency-groups]
|
||||||
|
|
|
||||||
901
surfsense_backend/uv.lock
generated
901
surfsense_backend/uv.lock
generated
File diff suppressed because it is too large
Load diff
Loading…
Add table
Add a link
Reference in a new issue