mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-06 20:15:17 +02:00
Merge upstream/dev
This commit is contained in:
commit
8bdfd00a15
191 changed files with 3301 additions and 4079 deletions
|
|
@ -1,20 +1,15 @@
|
|||
"""
|
||||
Unified document save/update logic for file processors.
|
||||
"""
|
||||
"""Unified document save/update logic for file processors."""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
|
||||
from sqlalchemy.exc import SQLAlchemyError
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.db import Document, DocumentStatus, DocumentType
|
||||
from app.services.llm_service import get_user_long_context_llm
|
||||
from app.utils.document_converters import (
|
||||
create_document_chunks,
|
||||
embed_text,
|
||||
generate_content_hash,
|
||||
generate_document_summary,
|
||||
)
|
||||
|
||||
from ._helpers import (
|
||||
|
|
@ -24,59 +19,6 @@ from ._helpers import (
|
|||
)
|
||||
from .base import get_current_timestamp, safe_set_chunks
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Summary generation
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
async def _generate_summary(
    markdown_content: str,
    file_name: str,
    etl_service: str,
    user_llm,
    enable_summary: bool,
) -> tuple[str, list[float]]:
    """
    Build the summary text for a file document together with its embedding.

    Three paths are possible:

    * ``enable_summary`` is false: no LLM call is made; the summary is the
      file name followed by the first 4000 characters of the markdown.
    * ``etl_service == "DOCLING"``: the Docling service produces the summary
      text, which is then prefixed with a metadata header before embedding.
    * any other ETL service: delegate entirely to
      ``generate_document_summary``.

    Args:
        markdown_content: Full markdown text of the document.
        file_name: Name of the file; used in the preview, the metadata
            header, and as the Docling document title.
        etl_service: Name of the ETL service that produced the markdown.
        user_llm: LLM object forwarded to the summarisation helpers.
        enable_summary: When false, skip LLM summarisation entirely.

    Returns:
        A ``(summary_text, embedding)`` pair.
    """
    if not enable_summary:
        # Summaries disabled: embed a truncated preview of the raw markdown.
        preview = f"File: {file_name}\n\n{markdown_content[:4000]}"
        preview_embedding = await asyncio.to_thread(embed_text, preview)
        return preview, preview_embedding

    # The same metadata feeds both the Docling header and the standard helper.
    summary_metadata = {
        "file_name": file_name,
        "etl_service": etl_service,
        "document_type": "File Document",
    }

    if etl_service != "DOCLING":
        # Standard summary (Unstructured / LlamaCloud / others)
        return await generate_document_summary(
            markdown_content, user_llm, summary_metadata
        )

    # Imported at call time, so it is only loaded on the Docling path.
    from app.services.docling_service import create_docling_service

    docling_summary = await create_docling_service().process_large_document_summary(
        content=markdown_content, llm=user_llm, document_title=file_name
    )

    # Metadata header: one bold "Key: value" line per non-empty entry.
    header_lines = ["# DOCUMENT METADATA"]
    header_lines.extend(
        f"**{key.replace('_', ' ').title()}:** {value}"
        for key, value in summary_metadata.items()
        if value
    )

    enhanced_summary = (
        "\n".join(header_lines) + "\n\n# DOCUMENT SUMMARY\n\n" + docling_summary
    )
    return enhanced_summary, await asyncio.to_thread(embed_text, enhanced_summary)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Unified save function
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
@ -90,7 +32,6 @@ async def save_file_document(
|
|||
user_id: str,
|
||||
etl_service: str,
|
||||
connector: dict | None = None,
|
||||
enable_summary: bool = True,
|
||||
) -> Document | None:
|
||||
"""
|
||||
Process and store a file document with deduplication and migration support.
|
||||
|
|
@ -106,7 +47,6 @@ async def save_file_document(
|
|||
user_id: ID of the user
|
||||
etl_service: Name of the ETL service (UNSTRUCTURED, LLAMACLOUD, DOCLING)
|
||||
connector: Optional connector info for Google Drive files
|
||||
enable_summary: Whether to generate an AI summary
|
||||
|
||||
Returns:
|
||||
Document object if successful, None if duplicate detected
|
||||
|
|
@ -133,24 +73,16 @@ async def save_file_document(
|
|||
if should_skip:
|
||||
return doc
|
||||
|
||||
user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
|
||||
if not user_llm:
|
||||
raise RuntimeError(
|
||||
f"No long context LLM configured for user {user_id} "
|
||||
f"in search space {search_space_id}"
|
||||
)
|
||||
|
||||
summary_content, summary_embedding = await _generate_summary(
|
||||
markdown_content, file_name, etl_service, user_llm, enable_summary
|
||||
)
|
||||
document_content = f"File: {file_name}\n\n{markdown_content[:4000]}"
|
||||
document_embedding = embed_text(document_content)
|
||||
chunks = await create_document_chunks(markdown_content)
|
||||
doc_metadata = {"FILE_NAME": file_name, "ETL_SERVICE": etl_service}
|
||||
|
||||
if existing_document:
|
||||
existing_document.title = file_name
|
||||
existing_document.content = summary_content
|
||||
existing_document.content = document_content
|
||||
existing_document.content_hash = content_hash
|
||||
existing_document.embedding = summary_embedding
|
||||
existing_document.embedding = document_embedding
|
||||
existing_document.document_metadata = doc_metadata
|
||||
await safe_set_chunks(session, existing_document, chunks)
|
||||
existing_document.source_markdown = markdown_content
|
||||
|
|
@ -171,8 +103,8 @@ async def save_file_document(
|
|||
title=file_name,
|
||||
document_type=doc_type,
|
||||
document_metadata=doc_metadata,
|
||||
content=summary_content,
|
||||
embedding=summary_embedding,
|
||||
content=document_content,
|
||||
embedding=document_embedding,
|
||||
chunks=chunks,
|
||||
content_hash=content_hash,
|
||||
unique_identifier_hash=primary_hash,
|
||||
|
|
|
|||
|
|
@ -25,11 +25,10 @@ from app.db import (
|
|||
SearchSourceConnectorType,
|
||||
SearchSpace,
|
||||
)
|
||||
from app.services.llm_service import get_document_summary_llm
|
||||
from app.utils.document_converters import (
|
||||
create_document_chunks,
|
||||
embed_text,
|
||||
generate_content_hash,
|
||||
generate_document_summary,
|
||||
generate_unique_identifier_hash,
|
||||
)
|
||||
|
||||
|
|
@ -176,34 +175,8 @@ async def add_circleback_meeting_document(
|
|||
# PHASE 3: Process the document content
|
||||
# =======================================================================
|
||||
|
||||
# Get LLM for generating summary
|
||||
llm = await get_document_summary_llm(session, search_space_id)
|
||||
if not llm:
|
||||
logger.warning(
|
||||
f"No LLM configured for search space {search_space_id}. Using content as summary."
|
||||
)
|
||||
# Use first 1000 chars as summary if no LLM available
|
||||
summary_content = (
|
||||
markdown_content[:1000] + "..."
|
||||
if len(markdown_content) > 1000
|
||||
else markdown_content
|
||||
)
|
||||
summary_embedding = None
|
||||
else:
|
||||
# Generate summary with metadata
|
||||
summary_metadata = {
|
||||
"meeting_name": meeting_name,
|
||||
"meeting_id": meeting_id,
|
||||
"document_type": "Circleback Meeting",
|
||||
**{
|
||||
k: v
|
||||
for k, v in metadata.items()
|
||||
if isinstance(v, str | int | float | bool)
|
||||
},
|
||||
}
|
||||
summary_content, summary_embedding = await generate_document_summary(
|
||||
markdown_content, llm, summary_metadata
|
||||
)
|
||||
summary_content = markdown_content
|
||||
summary_embedding = embed_text(summary_content)
|
||||
|
||||
# Process chunks
|
||||
chunks = await create_document_chunks(markdown_content)
|
||||
|
|
@ -224,8 +197,7 @@ async def add_circleback_meeting_document(
|
|||
document.title = meeting_name
|
||||
document.content = summary_content
|
||||
document.content_hash = content_hash
|
||||
if summary_embedding is not None:
|
||||
document.embedding = summary_embedding
|
||||
document.embedding = summary_embedding
|
||||
document.document_metadata = document_metadata
|
||||
await safe_set_chunks(session, document, chunks)
|
||||
document.source_markdown = markdown_content
|
||||
|
|
|
|||
|
|
@ -9,12 +9,11 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
|||
|
||||
from app.db import Document, DocumentType
|
||||
from app.schemas import ExtensionDocumentContent
|
||||
from app.services.llm_service import get_user_long_context_llm
|
||||
from app.services.task_logging_service import TaskLoggingService
|
||||
from app.utils.document_converters import (
|
||||
create_document_chunks,
|
||||
embed_text,
|
||||
generate_content_hash,
|
||||
generate_document_summary,
|
||||
generate_unique_identifier_hash,
|
||||
)
|
||||
|
||||
|
|
@ -123,26 +122,8 @@ async def add_extension_received_document(
|
|||
f"Content changed for URL {content.metadata.VisitedWebPageURL}. Updating document."
|
||||
)
|
||||
|
||||
# Get user's long context LLM (needed for both create and update)
|
||||
user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
|
||||
if not user_llm:
|
||||
raise RuntimeError(
|
||||
f"No long context LLM configured for user {user_id} in search space {search_space_id}"
|
||||
)
|
||||
|
||||
# Generate summary with metadata
|
||||
document_metadata = {
|
||||
"session_id": content.metadata.BrowsingSessionId,
|
||||
"url": content.metadata.VisitedWebPageURL,
|
||||
"title": content.metadata.VisitedWebPageTitle,
|
||||
"referrer": content.metadata.VisitedWebPageReffererURL,
|
||||
"timestamp": content.metadata.VisitedWebPageDateWithTimeInISOString,
|
||||
"duration_ms": content.metadata.VisitedWebPageVisitDurationInMilliseconds,
|
||||
"document_type": "Browser Extension Capture",
|
||||
}
|
||||
summary_content, summary_embedding = await generate_document_summary(
|
||||
combined_document_string, user_llm, document_metadata
|
||||
)
|
||||
summary_content = combined_document_string
|
||||
summary_embedding = embed_text(summary_content)
|
||||
|
||||
# Process chunks
|
||||
chunks = await create_document_chunks(content.pageContent)
|
||||
|
|
|
|||
|
|
@ -10,7 +10,7 @@ from __future__ import annotations
|
|||
import contextlib
|
||||
import logging
|
||||
import os
|
||||
from dataclasses import dataclass, field
|
||||
from dataclasses import dataclass
|
||||
|
||||
from fastapi import HTTPException
|
||||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
|
@ -49,12 +49,6 @@ class _ProcessingContext:
|
|||
notification: Notification | None = None
|
||||
use_vision_llm: bool = False
|
||||
processing_mode: str = "basic"
|
||||
enable_summary: bool = field(init=False)
|
||||
|
||||
def __post_init__(self) -> None:
|
||||
self.enable_summary = (
|
||||
self.connector.get("enable_summary", True) if self.connector else True
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
@ -262,7 +256,6 @@ async def _process_document_upload(ctx: _ProcessingContext) -> Document | None:
|
|||
ctx.user_id,
|
||||
etl_result.etl_service,
|
||||
ctx.connector,
|
||||
enable_summary=ctx.enable_summary,
|
||||
)
|
||||
|
||||
if result:
|
||||
|
|
@ -467,7 +460,6 @@ async def process_file_in_background_with_document(
|
|||
log_entry: Log,
|
||||
connector: dict | None = None,
|
||||
notification: Notification | None = None,
|
||||
should_summarize: bool = False,
|
||||
use_vision_llm: bool = False,
|
||||
processing_mode: str = "basic",
|
||||
) -> Document | None:
|
||||
|
|
@ -483,7 +475,6 @@ async def process_file_in_background_with_document(
|
|||
from app.indexing_pipeline.adapters.file_upload_adapter import (
|
||||
UploadDocumentAdapter,
|
||||
)
|
||||
from app.services.llm_service import get_user_long_context_llm
|
||||
from app.utils.document_converters import generate_content_hash
|
||||
|
||||
from .base import check_duplicate_document
|
||||
|
|
@ -523,8 +514,6 @@ async def process_file_in_background_with_document(
|
|||
stage="chunking",
|
||||
)
|
||||
|
||||
user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
|
||||
|
||||
adapter = UploadDocumentAdapter(session)
|
||||
await adapter.index(
|
||||
markdown_content=markdown_content,
|
||||
|
|
@ -532,8 +521,6 @@ async def process_file_in_background_with_document(
|
|||
etl_service=etl_service,
|
||||
search_space_id=search_space_id,
|
||||
user_id=user_id,
|
||||
llm=user_llm,
|
||||
should_summarize=should_summarize,
|
||||
)
|
||||
|
||||
if billable_pages > 0:
|
||||
|
|
|
|||
|
|
@ -8,12 +8,11 @@ from sqlalchemy.exc import SQLAlchemyError
|
|||
from sqlalchemy.ext.asyncio import AsyncSession
|
||||
|
||||
from app.db import Document, DocumentStatus, DocumentType
|
||||
from app.services.llm_service import get_user_long_context_llm
|
||||
from app.services.task_logging_service import TaskLoggingService
|
||||
from app.utils.document_converters import (
|
||||
create_document_chunks,
|
||||
embed_text,
|
||||
generate_content_hash,
|
||||
generate_document_summary,
|
||||
)
|
||||
|
||||
from ._helpers import (
|
||||
|
|
@ -183,21 +182,8 @@ async def add_received_markdown_file_document(
|
|||
return doc
|
||||
# Content changed - continue to update
|
||||
|
||||
# Get user's long context LLM (needed for both create and update)
|
||||
user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
|
||||
if not user_llm:
|
||||
raise RuntimeError(
|
||||
f"No long context LLM configured for user {user_id} in search space {search_space_id}"
|
||||
)
|
||||
|
||||
# Generate summary with metadata
|
||||
document_metadata = {
|
||||
"file_name": file_name,
|
||||
"document_type": "Markdown File Document",
|
||||
}
|
||||
summary_content, summary_embedding = await generate_document_summary(
|
||||
file_in_markdown, user_llm, document_metadata
|
||||
)
|
||||
summary_content = f"File: {file_name}\n\n{file_in_markdown[:4000]}"
|
||||
summary_embedding = embed_text(summary_content)
|
||||
|
||||
# Process chunks
|
||||
chunks = await create_document_chunks(file_in_markdown)
|
||||
|
|
|
|||
|
|
@ -17,12 +17,11 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
|||
from youtube_transcript_api import YouTubeTranscriptApi
|
||||
|
||||
from app.db import Document, DocumentStatus, DocumentType
|
||||
from app.services.llm_service import get_user_long_context_llm
|
||||
from app.services.task_logging_service import TaskLoggingService
|
||||
from app.utils.document_converters import (
|
||||
create_document_chunks,
|
||||
embed_text,
|
||||
generate_content_hash,
|
||||
generate_document_summary,
|
||||
generate_unique_identifier_hash,
|
||||
)
|
||||
from app.utils.proxy_config import get_requests_proxies
|
||||
|
|
@ -355,40 +354,8 @@ async def add_youtube_video_document(
|
|||
await session.commit()
|
||||
return document
|
||||
|
||||
# Get LLM for summary generation
|
||||
await task_logger.log_task_progress(
|
||||
log_entry,
|
||||
f"Preparing for summary generation: {video_data.get('title', 'YouTube Video')}",
|
||||
{"stage": "llm_setup"},
|
||||
)
|
||||
|
||||
# Get user's long context LLM
|
||||
user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
|
||||
if not user_llm:
|
||||
raise RuntimeError(
|
||||
f"No long context LLM configured for user {user_id} in search space {search_space_id}"
|
||||
)
|
||||
|
||||
# Generate summary
|
||||
await task_logger.log_task_progress(
|
||||
log_entry,
|
||||
f"Generating summary for video: {video_data.get('title', 'YouTube Video')}",
|
||||
{"stage": "summary_generation"},
|
||||
)
|
||||
|
||||
# Generate summary with metadata
|
||||
document_metadata_for_summary = {
|
||||
"url": url,
|
||||
"video_id": video_id,
|
||||
"title": video_data.get("title", "YouTube Video"),
|
||||
"author": video_data.get("author_name", "Unknown"),
|
||||
"thumbnail": video_data.get("thumbnail_url", ""),
|
||||
"document_type": "YouTube Video Document",
|
||||
"has_transcript": "No captions available" not in transcript_text,
|
||||
}
|
||||
summary_content, summary_embedding = await generate_document_summary(
|
||||
combined_document_string, user_llm, document_metadata_for_summary
|
||||
)
|
||||
summary_content = combined_document_string
|
||||
summary_embedding = embed_text(summary_content)
|
||||
|
||||
# Process chunks
|
||||
await task_logger.log_task_progress(
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue