mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-24 21:38:09 +02:00
Merge remote-tracking branch 'upstream/dev' into feat/web-search
This commit is contained in:
commit
60d12b0a70
45 changed files with 377 additions and 198 deletions
|
|
@ -0,0 +1,23 @@
|
|||
"""Add MINIMAX to LiteLLMProvider enum
|
||||
|
||||
Revision ID: 106
|
||||
Revises: 105
|
||||
"""
|
||||
|
||||
from collections.abc import Sequence
|
||||
|
||||
from alembic import op
|
||||
|
||||
revision: str = "106"
|
||||
down_revision: str | None = "105"
|
||||
branch_labels: str | Sequence[str] | None = None
|
||||
depends_on: str | Sequence[str] | None = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
op.execute("COMMIT")
|
||||
op.execute("ALTER TYPE litellmprovider ADD VALUE IF NOT EXISTS 'MINIMAX'")
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
pass
|
||||
|
|
@ -59,6 +59,7 @@ PROVIDER_MAP = {
|
|||
"DATABRICKS": "databricks",
|
||||
"COMETAPI": "cometapi",
|
||||
"HUGGINGFACE": "huggingface",
|
||||
"MINIMAX": "openai",
|
||||
"CUSTOM": "custom",
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -183,6 +183,23 @@ global_llm_configs:
|
|||
use_default_system_instructions: true
|
||||
citations_enabled: true
|
||||
|
||||
# Example: MiniMax M2.5 - High-performance with 204K context window
|
||||
- id: -8
|
||||
name: "Global MiniMax M2.5"
|
||||
description: "MiniMax M2.5 with 204K context window and competitive pricing"
|
||||
provider: "MINIMAX"
|
||||
model_name: "MiniMax-M2.5"
|
||||
api_key: "your-minimax-api-key-here"
|
||||
api_base: "https://api.minimax.io/v1"
|
||||
rpm: 60
|
||||
tpm: 100000
|
||||
litellm_params:
|
||||
temperature: 1.0 # MiniMax requires temperature in (0.0, 1.0], cannot be 0
|
||||
max_tokens: 4000
|
||||
system_instructions: ""
|
||||
use_default_system_instructions: true
|
||||
citations_enabled: true
|
||||
|
||||
# =============================================================================
|
||||
# Image Generation Configuration
|
||||
# =============================================================================
|
||||
|
|
|
|||
|
|
@ -463,7 +463,7 @@ async def _process_gmail_messages_phase2(
|
|||
"connector_id": connector_id,
|
||||
"source": "composio",
|
||||
}
|
||||
safe_set_chunks(document, chunks)
|
||||
await safe_set_chunks(session, document, chunks)
|
||||
document.updated_at = get_current_timestamp()
|
||||
document.status = DocumentStatus.ready()
|
||||
|
||||
|
|
|
|||
|
|
@ -477,7 +477,7 @@ async def index_composio_google_calendar(
|
|||
"connector_id": connector_id,
|
||||
"source": "composio",
|
||||
}
|
||||
safe_set_chunks(document, chunks)
|
||||
await safe_set_chunks(session, document, chunks)
|
||||
document.updated_at = get_current_timestamp()
|
||||
document.status = DocumentStatus.ready()
|
||||
|
||||
|
|
|
|||
|
|
@ -1112,7 +1112,7 @@ async def _index_composio_drive_delta_sync(
|
|||
"connector_id": connector_id,
|
||||
"source": "composio",
|
||||
}
|
||||
safe_set_chunks(document, chunks)
|
||||
await safe_set_chunks(session, document, chunks)
|
||||
document.updated_at = get_current_timestamp()
|
||||
document.status = DocumentStatus.ready()
|
||||
|
||||
|
|
@ -1520,7 +1520,7 @@ async def _index_composio_drive_full_scan(
|
|||
"connector_id": connector_id,
|
||||
"source": "composio",
|
||||
}
|
||||
safe_set_chunks(document, chunks)
|
||||
await safe_set_chunks(session, document, chunks)
|
||||
document.updated_at = get_current_timestamp()
|
||||
document.status = DocumentStatus.ready()
|
||||
|
||||
|
|
|
|||
|
|
@ -215,6 +215,7 @@ class LiteLLMProvider(StrEnum):
|
|||
COMETAPI = "COMETAPI"
|
||||
HUGGINGFACE = "HUGGINGFACE"
|
||||
GITHUB_MODELS = "GITHUB_MODELS"
|
||||
MINIMAX = "MINIMAX"
|
||||
CUSTOM = "CUSTOM"
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -1,3 +1,4 @@
|
|||
import asyncio
|
||||
import time
|
||||
from datetime import datetime
|
||||
|
||||
|
|
@ -49,7 +50,7 @@ class ChucksHybridSearchRetriever:
|
|||
# Get embedding for the query
|
||||
embedding_model = config.embedding_model_instance
|
||||
t_embed = time.perf_counter()
|
||||
query_embedding = embedding_model.embed(query_text)
|
||||
query_embedding = await asyncio.to_thread(embedding_model.embed, query_text)
|
||||
perf.debug(
|
||||
"[chunk_search] vector_search embedding in %.3fs",
|
||||
time.perf_counter() - t_embed,
|
||||
|
|
@ -195,7 +196,7 @@ class ChucksHybridSearchRetriever:
|
|||
if query_embedding is None:
|
||||
embedding_model = config.embedding_model_instance
|
||||
t_embed = time.perf_counter()
|
||||
query_embedding = embedding_model.embed(query_text)
|
||||
query_embedding = await asyncio.to_thread(embedding_model.embed, query_text)
|
||||
perf.debug(
|
||||
"[chunk_search] hybrid_search embedding in %.3fs",
|
||||
time.perf_counter() - t_embed,
|
||||
|
|
@ -427,4 +428,4 @@ class ChucksHybridSearchRetriever:
|
|||
search_space_id,
|
||||
document_type,
|
||||
)
|
||||
return final_docs
|
||||
return final_docs
|
||||
|
|
@ -1,11 +1,10 @@
|
|||
import logging
|
||||
from datetime import datetime
|
||||
|
||||
from sqlalchemy import delete
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.connectors.linear_connector import LinearConnector
|
||||
from app.db import Chunk, Document
|
||||
from app.db import Document
|
||||
from app.services.llm_service import get_user_long_context_llm
|
||||
from app.utils.document_converters import (
|
||||
create_document_chunks,
|
||||
|
|
@ -105,10 +104,6 @@ class LinearKBSyncService:
|
|||
)
|
||||
summary_embedding = embed_text(summary_content)
|
||||
|
||||
await self.db_session.execute(
|
||||
delete(Chunk).where(Chunk.document_id == document.id)
|
||||
)
|
||||
|
||||
chunks = await create_document_chunks(issue_content)
|
||||
|
||||
document.title = f"{issue_identifier}: {issue_title}"
|
||||
|
|
@ -131,7 +126,7 @@ class LinearKBSyncService:
|
|||
"connector_id": connector_id,
|
||||
}
|
||||
flag_modified(document, "document_metadata")
|
||||
safe_set_chunks(document, chunks)
|
||||
await safe_set_chunks(self.db_session, document, chunks)
|
||||
document.updated_at = get_current_timestamp()
|
||||
|
||||
await self.db_session.commit()
|
||||
|
|
|
|||
|
|
@ -85,6 +85,7 @@ PROVIDER_MAP = {
|
|||
"ZHIPU": "openai",
|
||||
"GITHUB_MODELS": "github",
|
||||
"HUGGINGFACE": "huggingface",
|
||||
"MINIMAX": "openai",
|
||||
"CUSTOM": "custom",
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -127,6 +127,7 @@ async def validate_llm_config(
|
|||
"ALIBABA_QWEN": "openai",
|
||||
"MOONSHOT": "openai",
|
||||
"ZHIPU": "openai", # GLM needs special handling
|
||||
"MINIMAX": "openai",
|
||||
"GITHUB_MODELS": "github",
|
||||
}
|
||||
provider_prefix = provider_map.get(provider, provider.lower())
|
||||
|
|
@ -277,6 +278,7 @@ async def get_search_space_llm_instance(
|
|||
"ALIBABA_QWEN": "openai",
|
||||
"MOONSHOT": "openai",
|
||||
"ZHIPU": "openai",
|
||||
"MINIMAX": "openai",
|
||||
}
|
||||
provider_prefix = provider_map.get(
|
||||
global_config["provider"], global_config["provider"].lower()
|
||||
|
|
@ -350,6 +352,7 @@ async def get_search_space_llm_instance(
|
|||
"ALIBABA_QWEN": "openai",
|
||||
"MOONSHOT": "openai",
|
||||
"ZHIPU": "openai",
|
||||
"MINIMAX": "openai",
|
||||
"GITHUB_MODELS": "github",
|
||||
}
|
||||
provider_prefix = provider_map.get(
|
||||
|
|
|
|||
|
|
@ -1,10 +1,9 @@
|
|||
import logging
|
||||
from datetime import datetime
|
||||
|
||||
from sqlalchemy import delete
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.db import Chunk, Document
|
||||
from app.db import Document
|
||||
from app.services.llm_service import get_user_long_context_llm
|
||||
from app.utils.document_converters import (
|
||||
create_document_chunks,
|
||||
|
|
@ -130,11 +129,6 @@ class NotionKBSyncService:
|
|||
summary_content = f"Notion Page: {document.document_metadata.get('page_title')}\n\n{full_content}"
|
||||
summary_embedding = embed_text(summary_content)
|
||||
|
||||
logger.debug(f"Deleting old chunks for document {document_id}")
|
||||
await self.db_session.execute(
|
||||
delete(Chunk).where(Chunk.document_id == document.id)
|
||||
)
|
||||
|
||||
logger.debug("Creating new chunks")
|
||||
chunks = await create_document_chunks(full_content)
|
||||
logger.debug(f"Created {len(chunks)} chunks")
|
||||
|
|
@ -147,7 +141,7 @@ class NotionKBSyncService:
|
|||
**document.document_metadata,
|
||||
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||||
}
|
||||
safe_set_chunks(document, chunks)
|
||||
await safe_set_chunks(self.db_session, document, chunks)
|
||||
document.updated_at = get_current_timestamp()
|
||||
|
||||
logger.debug("Committing changes to database")
|
||||
|
|
|
|||
|
|
@ -432,7 +432,7 @@ async def index_airtable_records(
|
|||
"table_name": item["table_name"],
|
||||
"connector_id": connector_id,
|
||||
}
|
||||
safe_set_chunks(document, chunks)
|
||||
await safe_set_chunks(session, document, chunks)
|
||||
document.updated_at = get_current_timestamp()
|
||||
document.status = DocumentStatus.ready()
|
||||
|
||||
|
|
|
|||
|
|
@ -28,45 +28,37 @@ def get_current_timestamp() -> datetime:
|
|||
return datetime.now(UTC)
|
||||
|
||||
|
||||
def safe_set_chunks(document: Document, chunks: list) -> None:
|
||||
async def safe_set_chunks(
|
||||
session: "AsyncSession", document: Document, chunks: list
|
||||
) -> None:
|
||||
"""
|
||||
Safely assign chunks to a document without triggering lazy loading.
|
||||
Delete old chunks and assign new ones to a document.
|
||||
|
||||
ALWAYS use this instead of `document.chunks = chunks` to avoid
|
||||
SQLAlchemy async errors (MissingGreenlet / greenlet_spawn).
|
||||
|
||||
Why this is needed:
|
||||
- Direct assignment `document.chunks = chunks` triggers SQLAlchemy to
|
||||
load the OLD chunks first (for comparison/orphan detection)
|
||||
- This lazy loading fails in async context with asyncpg driver
|
||||
- set_committed_value bypasses this by setting the value directly
|
||||
|
||||
This function is safe regardless of how the document was loaded
|
||||
(with or without selectinload).
|
||||
This replaces direct ``document.chunks = chunks`` which triggers lazy
|
||||
loading (and MissingGreenlet errors in async contexts). It also
|
||||
explicitly deletes pre-existing chunks so they don't accumulate across
|
||||
repeated re-indexes — ``set_committed_value`` bypasses SQLAlchemy's
|
||||
delete-orphan cascade.
|
||||
|
||||
Args:
|
||||
document: The Document object to update
|
||||
chunks: List of Chunk objects to assign
|
||||
|
||||
Example:
|
||||
# Instead of: document.chunks = chunks (DANGEROUS!)
|
||||
safe_set_chunks(document, chunks) # Always safe
|
||||
session: The current async database session.
|
||||
document: The Document object to update.
|
||||
chunks: List of Chunk objects to assign.
|
||||
"""
|
||||
from sqlalchemy.orm import object_session
|
||||
from sqlalchemy import delete
|
||||
from sqlalchemy.orm.attributes import set_committed_value
|
||||
|
||||
# Keep relationship assignment lazy-load-safe.
|
||||
set_committed_value(document, "chunks", chunks)
|
||||
from app.db import Chunk
|
||||
|
||||
# Ensure chunk rows are actually persisted.
|
||||
# set_committed_value bypasses normal unit-of-work tracking, so we need to
|
||||
# explicitly attach chunk objects to the current session.
|
||||
session = object_session(document)
|
||||
if session is not None:
|
||||
if document.id is not None:
|
||||
for chunk in chunks:
|
||||
chunk.document_id = document.id
|
||||
session.add_all(chunks)
|
||||
if document.id is not None:
|
||||
await session.execute(
|
||||
delete(Chunk).where(Chunk.document_id == document.id)
|
||||
)
|
||||
for chunk in chunks:
|
||||
chunk.document_id = document.id
|
||||
|
||||
set_committed_value(document, "chunks", chunks)
|
||||
session.add_all(chunks)
|
||||
|
||||
|
||||
def parse_date_flexible(date_str: str) -> datetime:
|
||||
|
|
|
|||
|
|
@ -430,7 +430,7 @@ async def index_bookstack_pages(
|
|||
document.content_hash = item["content_hash"]
|
||||
document.embedding = summary_embedding
|
||||
document.document_metadata = doc_metadata
|
||||
safe_set_chunks(document, chunks)
|
||||
await safe_set_chunks(session, document, chunks)
|
||||
document.updated_at = get_current_timestamp()
|
||||
document.status = DocumentStatus.ready()
|
||||
|
||||
|
|
|
|||
|
|
@ -439,7 +439,7 @@ async def index_clickup_tasks(
|
|||
"connector_id": connector_id,
|
||||
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||||
}
|
||||
safe_set_chunks(document, chunks)
|
||||
await safe_set_chunks(session, document, chunks)
|
||||
document.updated_at = get_current_timestamp()
|
||||
document.status = DocumentStatus.ready()
|
||||
|
||||
|
|
|
|||
|
|
@ -413,7 +413,7 @@ async def index_confluence_pages(
|
|||
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||||
"connector_id": connector_id,
|
||||
}
|
||||
safe_set_chunks(document, chunks)
|
||||
await safe_set_chunks(session, document, chunks)
|
||||
document.updated_at = get_current_timestamp()
|
||||
document.status = DocumentStatus.ready()
|
||||
|
||||
|
|
|
|||
|
|
@ -690,7 +690,7 @@ async def index_discord_messages(
|
|||
"indexed_at": datetime.now(UTC).strftime("%Y-%m-%d %H:%M:%S"),
|
||||
"connector_id": connector_id,
|
||||
}
|
||||
safe_set_chunks(document, chunks)
|
||||
await safe_set_chunks(session, document, chunks)
|
||||
document.updated_at = get_current_timestamp()
|
||||
document.status = DocumentStatus.ready()
|
||||
|
||||
|
|
|
|||
|
|
@ -386,7 +386,7 @@ async def index_elasticsearch_documents(
|
|||
document.content_hash = item["content_hash"]
|
||||
document.unique_identifier_hash = item["unique_identifier_hash"]
|
||||
document.document_metadata = metadata
|
||||
safe_set_chunks(document, chunks)
|
||||
await safe_set_chunks(session, document, chunks)
|
||||
document.updated_at = get_current_timestamp()
|
||||
document.status = DocumentStatus.ready()
|
||||
|
||||
|
|
|
|||
|
|
@ -415,7 +415,7 @@ async def index_github_repos(
|
|||
document.content_hash = item["content_hash"]
|
||||
document.embedding = summary_embedding
|
||||
document.document_metadata = doc_metadata
|
||||
safe_set_chunks(document, chunks_data)
|
||||
await safe_set_chunks(session, document, chunks_data)
|
||||
document.updated_at = get_current_timestamp()
|
||||
document.status = DocumentStatus.ready()
|
||||
|
||||
|
|
|
|||
|
|
@ -528,7 +528,7 @@ async def index_google_calendar_events(
|
|||
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||||
"connector_id": connector_id,
|
||||
}
|
||||
safe_set_chunks(document, chunks)
|
||||
await safe_set_chunks(session, document, chunks)
|
||||
document.updated_at = get_current_timestamp()
|
||||
document.status = DocumentStatus.ready()
|
||||
|
||||
|
|
|
|||
|
|
@ -451,7 +451,7 @@ async def index_google_gmail_messages(
|
|||
"date": item["date_str"],
|
||||
"connector_id": connector_id,
|
||||
}
|
||||
safe_set_chunks(document, chunks)
|
||||
await safe_set_chunks(session, document, chunks)
|
||||
document.updated_at = get_current_timestamp()
|
||||
document.status = DocumentStatus.ready()
|
||||
|
||||
|
|
|
|||
|
|
@ -393,7 +393,7 @@ async def index_jira_issues(
|
|||
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||||
"connector_id": connector_id,
|
||||
}
|
||||
safe_set_chunks(document, chunks)
|
||||
await safe_set_chunks(session, document, chunks)
|
||||
document.updated_at = get_current_timestamp()
|
||||
document.status = DocumentStatus.ready()
|
||||
|
||||
|
|
|
|||
|
|
@ -431,7 +431,7 @@ async def index_linear_issues(
|
|||
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||||
"connector_id": connector_id,
|
||||
}
|
||||
safe_set_chunks(document, chunks)
|
||||
await safe_set_chunks(session, document, chunks)
|
||||
document.updated_at = get_current_timestamp()
|
||||
document.status = DocumentStatus.ready()
|
||||
|
||||
|
|
|
|||
|
|
@ -488,7 +488,7 @@ async def index_luma_events(
|
|||
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||||
"connector_id": connector_id,
|
||||
}
|
||||
safe_set_chunks(document, chunks)
|
||||
await safe_set_chunks(session, document, chunks)
|
||||
document.updated_at = get_current_timestamp()
|
||||
document.status = DocumentStatus.ready()
|
||||
|
||||
|
|
|
|||
|
|
@ -479,7 +479,7 @@ async def index_notion_pages(
|
|||
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||||
"connector_id": connector_id,
|
||||
}
|
||||
safe_set_chunks(document, chunks)
|
||||
await safe_set_chunks(session, document, chunks)
|
||||
document.updated_at = get_current_timestamp()
|
||||
document.status = DocumentStatus.ready()
|
||||
|
||||
|
|
|
|||
|
|
@ -571,7 +571,7 @@ async def index_obsidian_vault(
|
|||
document.content_hash = content_hash
|
||||
document.embedding = embedding
|
||||
document.document_metadata = document_metadata
|
||||
safe_set_chunks(document, chunks)
|
||||
await safe_set_chunks(session, document, chunks)
|
||||
document.updated_at = get_current_timestamp()
|
||||
document.status = DocumentStatus.ready()
|
||||
|
||||
|
|
|
|||
|
|
@ -564,7 +564,7 @@ async def index_slack_messages(
|
|||
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||||
"connector_id": connector_id,
|
||||
}
|
||||
safe_set_chunks(document, chunks)
|
||||
await safe_set_chunks(session, document, chunks)
|
||||
document.updated_at = get_current_timestamp()
|
||||
document.status = DocumentStatus.ready()
|
||||
|
||||
|
|
|
|||
|
|
@ -603,7 +603,7 @@ async def index_teams_messages(
|
|||
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||||
"connector_id": connector_id,
|
||||
}
|
||||
safe_set_chunks(document, chunks)
|
||||
await safe_set_chunks(session, document, chunks)
|
||||
document.updated_at = get_current_timestamp()
|
||||
document.status = DocumentStatus.ready()
|
||||
|
||||
|
|
|
|||
|
|
@ -410,7 +410,7 @@ async def index_crawled_urls(
|
|||
"indexed_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
|
||||
"connector_id": connector_id,
|
||||
}
|
||||
safe_set_chunks(document, chunks)
|
||||
await safe_set_chunks(session, document, chunks)
|
||||
document.status = DocumentStatus.ready() # READY status
|
||||
document.updated_at = get_current_timestamp()
|
||||
|
||||
|
|
|
|||
|
|
@ -14,45 +14,37 @@ from app.db import Document
|
|||
md = MarkdownifyTransformer()
|
||||
|
||||
|
||||
def safe_set_chunks(document: Document, chunks: list) -> None:
|
||||
async def safe_set_chunks(
|
||||
session: "AsyncSession", document: Document, chunks: list
|
||||
) -> None:
|
||||
"""
|
||||
Safely assign chunks to a document without triggering lazy loading.
|
||||
Delete old chunks and assign new ones to a document.
|
||||
|
||||
ALWAYS use this instead of `document.chunks = chunks` to avoid
|
||||
SQLAlchemy async errors (MissingGreenlet / greenlet_spawn).
|
||||
|
||||
Why this is needed:
|
||||
- Direct assignment `document.chunks = chunks` triggers SQLAlchemy to
|
||||
load the OLD chunks first (for comparison/orphan detection)
|
||||
- This lazy loading fails in async context with asyncpg driver
|
||||
- set_committed_value bypasses this by setting the value directly
|
||||
|
||||
This function is safe regardless of how the document was loaded
|
||||
(with or without selectinload).
|
||||
This replaces direct ``document.chunks = chunks`` which triggers lazy
|
||||
loading (and MissingGreenlet errors in async contexts). It also
|
||||
explicitly deletes pre-existing chunks so they don't accumulate across
|
||||
repeated re-indexes — ``set_committed_value`` bypasses SQLAlchemy's
|
||||
delete-orphan cascade.
|
||||
|
||||
Args:
|
||||
document: The Document object to update
|
||||
chunks: List of Chunk objects to assign
|
||||
|
||||
Example:
|
||||
# Instead of: document.chunks = chunks (DANGEROUS!)
|
||||
safe_set_chunks(document, chunks) # Always safe
|
||||
session: The current async database session.
|
||||
document: The Document object to update.
|
||||
chunks: List of Chunk objects to assign.
|
||||
"""
|
||||
from sqlalchemy.orm import object_session
|
||||
from sqlalchemy import delete
|
||||
from sqlalchemy.orm.attributes import set_committed_value
|
||||
|
||||
# Keep relationship assignment lazy-load-safe.
|
||||
set_committed_value(document, "chunks", chunks)
|
||||
from app.db import Chunk
|
||||
|
||||
# Ensure chunk rows are actually persisted.
|
||||
# set_committed_value bypasses normal unit-of-work tracking, so we need to
|
||||
# explicitly attach chunk objects to the current session.
|
||||
session = object_session(document)
|
||||
if session is not None:
|
||||
if document.id is not None:
|
||||
for chunk in chunks:
|
||||
chunk.document_id = document.id
|
||||
session.add_all(chunks)
|
||||
if document.id is not None:
|
||||
await session.execute(
|
||||
delete(Chunk).where(Chunk.document_id == document.id)
|
||||
)
|
||||
for chunk in chunks:
|
||||
chunk.document_id = document.id
|
||||
|
||||
set_committed_value(document, "chunks", chunks)
|
||||
session.add_all(chunks)
|
||||
|
||||
|
||||
def get_current_timestamp() -> datetime:
|
||||
|
|
|
|||
|
|
@ -227,7 +227,7 @@ async def add_circleback_meeting_document(
|
|||
if summary_embedding is not None:
|
||||
document.embedding = summary_embedding
|
||||
document.document_metadata = document_metadata
|
||||
safe_set_chunks(document, chunks)
|
||||
await safe_set_chunks(session, document, chunks)
|
||||
document.source_markdown = markdown_content
|
||||
document.content_needs_reindexing = False
|
||||
document.updated_at = get_current_timestamp()
|
||||
|
|
|
|||
|
|
@ -21,6 +21,7 @@ from app.utils.document_converters import (
|
|||
from .base import (
|
||||
check_document_by_unique_identifier,
|
||||
get_current_timestamp,
|
||||
safe_set_chunks,
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -154,7 +155,7 @@ async def add_extension_received_document(
|
|||
existing_document.content_hash = content_hash
|
||||
existing_document.embedding = summary_embedding
|
||||
existing_document.document_metadata = content.metadata.model_dump()
|
||||
existing_document.chunks = chunks
|
||||
await safe_set_chunks(session, existing_document, chunks)
|
||||
existing_document.source_markdown = combined_document_string
|
||||
existing_document.updated_at = get_current_timestamp()
|
||||
|
||||
|
|
|
|||
|
|
@ -35,6 +35,7 @@ from .base import (
|
|||
check_document_by_unique_identifier,
|
||||
check_duplicate_document,
|
||||
get_current_timestamp,
|
||||
safe_set_chunks,
|
||||
)
|
||||
from .markdown_processor import add_received_markdown_file_document
|
||||
|
||||
|
|
@ -488,7 +489,7 @@ async def add_received_file_document_using_unstructured(
|
|||
"FILE_NAME": file_name,
|
||||
"ETL_SERVICE": "UNSTRUCTURED",
|
||||
}
|
||||
existing_document.chunks = chunks
|
||||
await safe_set_chunks(session, existing_document, chunks)
|
||||
existing_document.source_markdown = file_in_markdown
|
||||
existing_document.content_needs_reindexing = False
|
||||
existing_document.updated_at = get_current_timestamp()
|
||||
|
|
@ -622,7 +623,7 @@ async def add_received_file_document_using_llamacloud(
|
|||
"FILE_NAME": file_name,
|
||||
"ETL_SERVICE": "LLAMACLOUD",
|
||||
}
|
||||
existing_document.chunks = chunks
|
||||
await safe_set_chunks(session, existing_document, chunks)
|
||||
existing_document.source_markdown = file_in_markdown
|
||||
existing_document.content_needs_reindexing = False
|
||||
existing_document.updated_at = get_current_timestamp()
|
||||
|
|
@ -777,7 +778,7 @@ async def add_received_file_document_using_docling(
|
|||
"FILE_NAME": file_name,
|
||||
"ETL_SERVICE": "DOCLING",
|
||||
}
|
||||
existing_document.chunks = chunks
|
||||
await safe_set_chunks(session, existing_document, chunks)
|
||||
existing_document.source_markdown = file_in_markdown
|
||||
existing_document.content_needs_reindexing = False
|
||||
existing_document.updated_at = get_current_timestamp()
|
||||
|
|
|
|||
|
|
@ -21,6 +21,7 @@ from .base import (
|
|||
check_document_by_unique_identifier,
|
||||
check_duplicate_document,
|
||||
get_current_timestamp,
|
||||
safe_set_chunks,
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -258,7 +259,7 @@ async def add_received_markdown_file_document(
|
|||
existing_document.document_metadata = {
|
||||
"FILE_NAME": file_name,
|
||||
}
|
||||
existing_document.chunks = chunks
|
||||
await safe_set_chunks(session, existing_document, chunks)
|
||||
existing_document.source_markdown = file_in_markdown
|
||||
existing_document.updated_at = get_current_timestamp()
|
||||
existing_document.status = DocumentStatus.ready() # Mark as ready
|
||||
|
|
|
|||
|
|
@ -419,7 +419,7 @@ async def add_youtube_video_document(
|
|||
"author": video_data.get("author_name", "Unknown"),
|
||||
"thumbnail": video_data.get("thumbnail_url", ""),
|
||||
}
|
||||
safe_set_chunks(document, chunks)
|
||||
await safe_set_chunks(session, document, chunks)
|
||||
document.source_markdown = combined_document_string
|
||||
document.status = DocumentStatus.ready() # READY status - fully processed
|
||||
document.updated_at = get_current_timestamp()
|
||||
|
|
|
|||
|
|
@ -13,12 +13,32 @@ from sqlalchemy import select
|
|||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
from sqlalchemy.orm import selectinload
|
||||
|
||||
from sqlalchemy import delete as sa_delete
|
||||
from sqlalchemy.orm.attributes import set_committed_value
|
||||
|
||||
from app.config import config
|
||||
from app.db import SurfsenseDocsChunk, SurfsenseDocsDocument, async_session_maker
|
||||
from app.utils.document_converters import embed_text
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
async def _safe_set_docs_chunks(
|
||||
session: AsyncSession, document: SurfsenseDocsDocument, chunks: list
|
||||
) -> None:
|
||||
"""safe_set_chunks variant for the SurfsenseDocsDocument/Chunk models."""
|
||||
if document.id is not None:
|
||||
await session.execute(
|
||||
sa_delete(SurfsenseDocsChunk).where(
|
||||
SurfsenseDocsChunk.document_id == document.id
|
||||
)
|
||||
)
|
||||
for chunk in chunks:
|
||||
chunk.document_id = document.id
|
||||
|
||||
set_committed_value(document, "chunks", chunks)
|
||||
session.add_all(chunks)
|
||||
|
||||
# Path to docs relative to project root
|
||||
DOCS_DIR = (
|
||||
Path(__file__).resolve().parent.parent.parent.parent
|
||||
|
|
@ -156,7 +176,7 @@ async def index_surfsense_docs(session: AsyncSession) -> tuple[int, int, int, in
|
|||
existing_doc.content = content
|
||||
existing_doc.content_hash = content_hash
|
||||
existing_doc.embedding = embed_text(content)
|
||||
existing_doc.chunks = chunks
|
||||
await _safe_set_docs_chunks(session, existing_doc, chunks)
|
||||
existing_doc.updated_at = datetime.now(UTC)
|
||||
|
||||
updated += 1
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue