Merge upstream/dev

This commit is contained in:
CREDO23 2026-06-05 19:18:12 +02:00
commit 8bdfd00a15
191 changed files with 3301 additions and 4079 deletions

View file

@ -9,7 +9,6 @@ from app.utils.document_converters import (
create_document_chunks,
embed_text,
generate_content_hash,
generate_document_summary,
generate_unique_identifier_hash,
)
@ -65,29 +64,11 @@ class ConfluenceKBSyncService:
if dup:
content_hash = unique_hash
from app.services.llm_service import get_user_long_context_llm
user_llm = await get_user_long_context_llm(
self.db_session,
user_id,
search_space_id,
disable_streaming=True,
)
doc_metadata_for_summary = {
"page_title": page_title,
"space_id": space_id,
"document_type": "Confluence Page",
"connector_type": "Confluence",
}
if user_llm:
summary_content, summary_embedding = await generate_document_summary(
page_content, user_llm, doc_metadata_for_summary
)
else:
summary_content = f"Confluence Page: {page_title}\n\n{page_content}"
summary_embedding = embed_text(summary_content)
summary_content = f"Confluence Page: {page_title}\n\n{page_content}"
summary_embedding = embed_text(summary_content)
chunks = await create_document_chunks(page_content)
now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
@ -185,25 +166,10 @@ class ConfluenceKBSyncService:
space_id = (document.document_metadata or {}).get("space_id", "")
from app.services.llm_service import get_user_long_context_llm
user_llm = await get_user_long_context_llm(
self.db_session, user_id, search_space_id, disable_streaming=True
)
if user_llm:
doc_meta = {
"page_title": page_title,
"space_id": space_id,
"document_type": "Confluence Page",
"connector_type": "Confluence",
}
summary_content, summary_embedding = await generate_document_summary(
page_content, user_llm, doc_meta
)
else:
summary_content = f"Confluence Page: {page_title}\n\n{page_content}"
summary_embedding = embed_text(summary_content)
summary_content = f"Confluence Page: {page_title}\n\n{page_content}"
summary_embedding = embed_text(summary_content)
chunks = await create_document_chunks(page_content)

View file

@ -191,149 +191,6 @@ class DoclingService:
logger.error(f"Full traceback: {traceback.format_exc()}")
raise RuntimeError(f"Docling processing failed: {e}") from e
async def process_large_document_summary(
self, content: str, llm, document_title: str = "Document"
) -> str:
"""
Process large documents using chunked LLM summarization.
Args:
content: The full document content
llm: The language model to use for summarization
document_title: Title of the document for context
Returns:
Final summary of the document
"""
# Large document threshold (100K characters ≈ 25K tokens)
large_document_threshold = 100_000
if len(content) <= large_document_threshold:
# For smaller documents, use direct processing
logger.info(
f"📄 Document size: {len(content)} chars - using direct processing"
)
from app.prompts import SUMMARY_PROMPT_TEMPLATE
summary_chain = SUMMARY_PROMPT_TEMPLATE | llm
result = await summary_chain.ainvoke({"document": content})
return result.content
logger.info(
f"📚 Large document detected: {len(content)} chars - using chunked processing"
)
# Import chunker from config
# Create LLM-optimized chunks (8K tokens max for safety)
from chonkie import OverlapRefinery, RecursiveChunker
from langchain_core.prompts import PromptTemplate
llm_chunker = RecursiveChunker(
chunk_size=8000 # Conservative for most LLMs
)
# Apply overlap refinery for context preservation (10% overlap = 800 tokens)
overlap_refinery = OverlapRefinery(
context_size=0.1, # 10% overlap for context preservation
method="suffix", # Add next chunk context to current chunk
)
# First chunk the content, then apply overlap refinery
initial_chunks = llm_chunker.chunk(content)
chunks = overlap_refinery.refine(initial_chunks)
total_chunks = len(chunks)
logger.info(f"📄 Split into {total_chunks} chunks for LLM processing")
# Template for chunk processing
chunk_template = PromptTemplate(
input_variables=["chunk", "chunk_number", "total_chunks"],
template="""<INSTRUCTIONS>
You are summarizing chunk {chunk_number} of {total_chunks} from a large document.
Create a comprehensive summary of this document chunk. Focus on:
- Key concepts, facts, and information
- Important details and context
- Main topics and themes
Provide a clear, structured summary that captures the essential content.
Chunk {chunk_number}/{total_chunks}:
<document_chunk>
{chunk}
</document_chunk>
</INSTRUCTIONS>""",
)
# Process each chunk individually
chunk_summaries = []
for i, chunk in enumerate(chunks, 1):
try:
logger.info(
f"🔄 Processing chunk {i}/{total_chunks} ({len(chunk.text)} chars)"
)
chunk_chain = chunk_template | llm
chunk_result = await chunk_chain.ainvoke(
{
"chunk": chunk.text,
"chunk_number": i,
"total_chunks": total_chunks,
}
)
chunk_summary = chunk_result.content
chunk_summaries.append(f"=== Section {i} ===\n{chunk_summary}")
logger.info(f"✅ Completed chunk {i}/{total_chunks}")
except Exception as e:
logger.error(f"❌ Failed to process chunk {i}/{total_chunks}: {e}")
chunk_summaries.append(f"=== Section {i} ===\n[Processing failed]")
# Combine summaries into final document summary
logger.info(f"🔄 Combining {len(chunk_summaries)} chunk summaries")
try:
combine_template = PromptTemplate(
input_variables=["summaries", "document_title"],
template="""<INSTRUCTIONS>
You are combining multiple section summaries into a final comprehensive document summary.
Create a unified, coherent summary from the following section summaries of "{document_title}".
Ensure:
- Logical flow and organization
- No redundancy or repetition
- Comprehensive coverage of all key points
- Professional, objective tone
<section_summaries>
{summaries}
</section_summaries>
</INSTRUCTIONS>""",
)
combined_summaries = "\n\n".join(chunk_summaries)
combine_chain = combine_template | llm
final_result = await combine_chain.ainvoke(
{"summaries": combined_summaries, "document_title": document_title}
)
final_summary = final_result.content
logger.info(
f"✅ Large document processing complete: {len(final_summary)} chars summary"
)
return final_summary
except Exception as e:
logger.error(f"❌ Failed to combine summaries: {e}")
# Fallback: return concatenated chunk summaries
fallback_summary = "\n\n".join(chunk_summaries)
logger.warning("⚠️ Using fallback combined summary")
return fallback_summary
def create_docling_service() -> DoclingService:
"""Create a Docling service instance."""

View file

@ -9,7 +9,6 @@ from app.utils.document_converters import (
create_document_chunks,
embed_text,
generate_content_hash,
generate_document_summary,
)
logger = logging.getLogger(__name__)
@ -72,29 +71,11 @@ class DropboxKBSyncService:
)
content_hash = unique_hash
from app.services.llm_service import get_user_long_context_llm
user_llm = await get_user_long_context_llm(
self.db_session,
user_id,
search_space_id,
disable_streaming=True,
)
doc_metadata_for_summary = {
"file_name": file_name,
"document_type": "Dropbox File",
"connector_type": "Dropbox",
}
if user_llm:
summary_content, summary_embedding = await generate_document_summary(
indexable_content, user_llm, doc_metadata_for_summary
)
else:
logger.warning("No LLM configured — using fallback summary")
summary_content = f"Dropbox File: {file_name}\n\n{indexable_content}"
summary_embedding = embed_text(summary_content)
summary_content = f"Dropbox File: {file_name}\n\n{indexable_content}"
summary_embedding = embed_text(summary_content)
chunks = await create_document_chunks(indexable_content)
now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

View file

@ -9,7 +9,6 @@ from app.utils.document_converters import (
create_document_chunks,
embed_text,
generate_content_hash,
generate_document_summary,
generate_unique_identifier_hash,
)
@ -78,30 +77,11 @@ class GmailKBSyncService:
)
content_hash = unique_hash
from app.services.llm_service import get_user_long_context_llm
user_llm = await get_user_long_context_llm(
self.db_session,
user_id,
search_space_id,
disable_streaming=True,
)
doc_metadata_for_summary = {
"subject": subject,
"sender": sender,
"document_type": "Gmail Message",
"connector_type": "Gmail",
}
if user_llm:
summary_content, summary_embedding = await generate_document_summary(
indexable_content, user_llm, doc_metadata_for_summary
)
else:
logger.warning("No LLM configured -- using fallback summary")
summary_content = f"Gmail Message: {subject}\n\n{indexable_content}"
summary_embedding = await asyncio.to_thread(embed_text, summary_content)
summary_content = f"Gmail Message: {subject}\n\n{indexable_content}"
summary_embedding = await asyncio.to_thread(embed_text, summary_content)
chunks = await create_document_chunks(indexable_content)
now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

View file

@ -19,7 +19,6 @@ from app.utils.document_converters import (
create_document_chunks,
embed_text,
generate_content_hash,
generate_document_summary,
generate_unique_identifier_hash,
)
@ -90,33 +89,13 @@ class GoogleCalendarKBSyncService:
)
content_hash = unique_hash
from app.services.llm_service import get_user_long_context_llm
user_llm = await get_user_long_context_llm(
self.db_session,
user_id,
search_space_id,
disable_streaming=True,
summary_content = (
f"Google Calendar Event: {event_summary}\n\n{indexable_content}"
)
doc_metadata_for_summary = {
"event_summary": event_summary,
"start_time": start_time,
"end_time": end_time,
"document_type": "Google Calendar Event",
"connector_type": "Google Calendar",
}
if user_llm:
summary_content, summary_embedding = await generate_document_summary(
indexable_content, user_llm, doc_metadata_for_summary
)
else:
logger.warning("No LLM configured -- using fallback summary")
summary_content = (
f"Google Calendar Event: {event_summary}\n\n{indexable_content}"
)
summary_embedding = await asyncio.to_thread(embed_text, summary_content)
summary_embedding = await asyncio.to_thread(embed_text, summary_content)
chunks = await create_document_chunks(indexable_content)
now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
@ -273,29 +252,13 @@ class GoogleCalendarKBSyncService:
if not indexable_content:
return {"status": "error", "message": "Event produced empty content"}
from app.services.llm_service import get_user_long_context_llm
user_llm = await get_user_long_context_llm(
self.db_session, user_id, search_space_id, disable_streaming=True
summary_content = (
f"Google Calendar Event: {event_summary}\n\n{indexable_content}"
)
doc_metadata_for_summary = {
"event_summary": event_summary,
"start_time": start_time,
"end_time": end_time,
"document_type": "Google Calendar Event",
"connector_type": "Google Calendar",
}
if user_llm:
summary_content, summary_embedding = await generate_document_summary(
indexable_content, user_llm, doc_metadata_for_summary
)
else:
summary_content = (
f"Google Calendar Event: {event_summary}\n\n{indexable_content}"
)
summary_embedding = await asyncio.to_thread(embed_text, summary_content)
summary_embedding = await asyncio.to_thread(embed_text, summary_content)
chunks = await create_document_chunks(indexable_content)
now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

View file

@ -8,7 +8,6 @@ from app.utils.document_converters import (
create_document_chunks,
embed_text,
generate_content_hash,
generate_document_summary,
generate_unique_identifier_hash,
)
@ -74,32 +73,13 @@ class GoogleDriveKBSyncService:
)
content_hash = unique_hash
from app.services.llm_service import get_user_long_context_llm
user_llm = await get_user_long_context_llm(
self.db_session,
user_id,
search_space_id,
disable_streaming=True,
summary_content = (
f"Google Drive File: {file_name}\n\n{indexable_content}"
)
doc_metadata_for_summary = {
"file_name": file_name,
"mime_type": mime_type,
"document_type": "Google Drive File",
"connector_type": "Google Drive",
}
if user_llm:
summary_content, summary_embedding = await generate_document_summary(
indexable_content, user_llm, doc_metadata_for_summary
)
else:
logger.warning("No LLM configured — using fallback summary")
summary_content = (
f"Google Drive File: {file_name}\n\n{indexable_content}"
)
summary_embedding = embed_text(summary_content)
summary_embedding = embed_text(summary_content)
chunks = await create_document_chunks(indexable_content)
now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

View file

@ -9,7 +9,6 @@ from app.utils.document_converters import (
create_document_chunks,
embed_text,
generate_content_hash,
generate_document_summary,
generate_unique_identifier_hash,
)
@ -84,32 +83,13 @@ class LinearKBSyncService:
)
content_hash = unique_hash
from app.services.llm_service import get_user_long_context_llm
user_llm = await get_user_long_context_llm(
self.db_session,
user_id,
search_space_id,
disable_streaming=True,
summary_content = (
f"Linear Issue {issue_identifier}: {issue_title}\n\n{issue_content}"
)
doc_metadata_for_summary = {
"issue_id": issue_identifier,
"issue_title": issue_title,
"document_type": "Linear Issue",
"connector_type": "Linear",
}
if user_llm:
summary_content, summary_embedding = await generate_document_summary(
issue_content, user_llm, doc_metadata_for_summary
)
else:
logger.warning("No LLM configured — using fallback summary")
summary_content = (
f"Linear Issue {issue_identifier}: {issue_title}\n\n{issue_content}"
)
summary_embedding = embed_text(summary_content)
summary_embedding = embed_text(summary_content)
chunks = await create_document_chunks(issue_content)
now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
@ -227,30 +207,12 @@ class LinearKBSyncService:
comment_count = len(formatted_issue.get("comments", []))
formatted_issue.get("description", "")
from app.services.llm_service import get_user_long_context_llm
user_llm = await get_user_long_context_llm(
self.db_session, user_id, search_space_id, disable_streaming=True
summary_content = (
f"Linear Issue {issue_identifier}: {issue_title}\n\n{issue_content}"
)
if user_llm:
document_metadata_for_summary = {
"issue_id": issue_identifier,
"issue_title": issue_title,
"state": state,
"priority": priority,
"comment_count": comment_count,
"document_type": "Linear Issue",
"connector_type": "Linear",
}
summary_content, summary_embedding = await generate_document_summary(
issue_content, user_llm, document_metadata_for_summary
)
else:
summary_content = (
f"Linear Issue {issue_identifier}: {issue_title}\n\n{issue_content}"
)
summary_embedding = embed_text(summary_content)
summary_embedding = embed_text(summary_content)
chunks = await create_document_chunks(issue_content)

View file

@ -68,7 +68,6 @@ def _is_interactive_auth_provider(
class LLMRole:
AGENT = "agent" # For agent/chat operations
DOCUMENT_SUMMARY = "document_summary" # For document summarization
def get_global_llm_config(llm_config_id: int) -> dict | None:
@ -268,7 +267,7 @@ async def get_search_space_llm_instance(
Args:
session: Database session
search_space_id: Search Space ID
role: LLM role ('agent' or 'document_summary')
role: LLM role ('agent')
Returns:
ChatLiteLLM or ChatLiteLLMRouter instance, or None if not found
@ -285,11 +284,8 @@ async def get_search_space_llm_instance(
return None
# Get the appropriate LLM config ID based on role
llm_config_id = None
if role == LLMRole.AGENT:
llm_config_id = search_space.agent_llm_id
elif role == LLMRole.DOCUMENT_SUMMARY:
llm_config_id = search_space.document_summary_llm_id
else:
logger.error(f"Invalid LLM role: {role}")
return None
@ -476,20 +472,13 @@ async def get_search_space_llm_instance(
async def get_agent_llm(
session: AsyncSession, search_space_id: int
) -> ChatLiteLLM | ChatLiteLLMRouter | None:
"""Get the search space's agent LLM instance for chat operations."""
return await get_search_space_llm_instance(session, search_space_id, LLMRole.AGENT)
async def get_document_summary_llm(
session: AsyncSession, search_space_id: int, disable_streaming: bool = False
) -> ChatLiteLLM | ChatLiteLLMRouter | None:
"""Get the search space's document summary LLM instance."""
"""Get the search space's agent LLM instance for chat operations."""
return await get_search_space_llm_instance(
session,
search_space_id,
LLMRole.DOCUMENT_SUMMARY,
LLMRole.AGENT,
disable_streaming=disable_streaming,
)
@ -655,22 +644,6 @@ async def get_vision_llm(
return None
# Backward-compatible alias (LLM preferences are now per-search-space, not per-user)
async def get_user_long_context_llm(
session: AsyncSession,
user_id: str,
search_space_id: int,
disable_streaming: bool = False,
) -> ChatLiteLLM | ChatLiteLLMRouter | None:
"""
Deprecated: Use get_document_summary_llm instead.
The user_id parameter is ignored as LLM preferences are now per-search-space.
"""
return await get_document_summary_llm(
session, search_space_id, disable_streaming=disable_streaming
)
def get_planner_llm() -> ChatLiteLLM | None:
"""Return a planner LLM instance from the first global config marked
``is_planner: true``, or ``None`` if no planner config is defined.

View file

@ -8,7 +8,6 @@ from app.utils.document_converters import (
create_document_chunks,
embed_text,
generate_content_hash,
generate_document_summary,
generate_unique_identifier_hash,
)
@ -73,30 +72,11 @@ class NotionKBSyncService:
)
content_hash = unique_hash
from app.services.llm_service import get_user_long_context_llm
user_llm = await get_user_long_context_llm(
self.db_session,
user_id,
search_space_id,
disable_streaming=True,
)
doc_metadata_for_summary = {
"page_title": page_title,
"page_id": page_id,
"document_type": "Notion Page",
"connector_type": "Notion",
}
if user_llm:
summary_content, summary_embedding = await generate_document_summary(
markdown_content, user_llm, doc_metadata_for_summary
)
else:
logger.warning("No LLM configured — using fallback summary")
summary_content = f"Notion Page: {page_title}\n\n{markdown_content}"
summary_embedding = embed_text(summary_content)
summary_content = f"Notion Page: {page_title}\n\n{markdown_content}"
summary_embedding = embed_text(summary_content)
chunks = await create_document_chunks(markdown_content)
now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
@ -245,31 +225,11 @@ class NotionKBSyncService:
f"Final content length: {len(full_content)} chars, verified={content_verified}"
)
from app.services.llm_service import get_user_long_context_llm
logger.debug("Generating summary and embeddings")
user_llm = await get_user_long_context_llm(
self.db_session,
user_id,
search_space_id,
disable_streaming=True, # disable streaming to avoid leaking into the chat
)
if user_llm:
document_metadata_for_summary = {
"page_title": document.document_metadata.get("page_title"),
"page_id": document.document_metadata.get("page_id"),
"document_type": "Notion Page",
"connector_type": "Notion",
}
summary_content, summary_embedding = await generate_document_summary(
full_content, user_llm, document_metadata_for_summary
)
logger.debug(f"Generated summary length: {len(summary_content)} chars")
else:
logger.warning("No LLM configured - using fallback summary")
summary_content = f"Notion Page: {document.document_metadata.get('page_title')}\n\n{full_content}"
summary_embedding = embed_text(summary_content)
summary_content = f"Notion Page: {document.document_metadata.get('page_title')}\n\n{full_content}"
summary_embedding = embed_text(summary_content)
logger.debug("Creating new chunks")
chunks = await create_document_chunks(full_content)

View file

@ -233,18 +233,6 @@ async def _resolve_attachment_vision_llm(
return await get_vision_llm(session, search_space_id)
async def _resolve_summary_llm(
session: AsyncSession, *, user_id: str, search_space_id: int, should_summarize: bool
):
"""Fetch summary LLM only when indexing summary is enabled."""
if not should_summarize:
return None
from app.services.llm_service import get_user_long_context_llm
return await get_user_long_context_llm(session, user_id, search_space_id)
def _require_extracted_attachment_content(
*, content: str, etl_meta: dict[str, Any], path: str
) -> str:
@ -349,13 +337,6 @@ async def upsert_note(
path=payload.path,
)
llm = await _resolve_summary_llm(
session,
user_id=str(user_id),
search_space_id=search_space_id,
should_summarize=connector.enable_summary,
)
document_string = _build_document_string(
payload, vault_name, content_override=content_for_index
)
@ -374,8 +355,6 @@ async def upsert_note(
search_space_id=search_space_id,
connector_id=connector.id,
created_by_id=str(user_id),
should_summarize=connector.enable_summary,
fallback_summary=f"Obsidian Note: {payload.name}\n\n{content_for_index}",
metadata=metadata,
)
@ -388,7 +367,7 @@ async def upsert_note(
document = prepared[0]
return await pipeline.index(document, connector_doc, llm)
return await pipeline.index(document, connector_doc)
async def rename_note(

View file

@ -10,7 +10,6 @@ from app.utils.document_converters import (
create_document_chunks,
embed_text,
generate_content_hash,
generate_document_summary,
)
logger = logging.getLogger(__name__)
@ -73,30 +72,11 @@ class OneDriveKBSyncService:
)
content_hash = unique_hash
from app.services.llm_service import get_user_long_context_llm
user_llm = await get_user_long_context_llm(
self.db_session,
user_id,
search_space_id,
disable_streaming=True,
)
doc_metadata_for_summary = {
"file_name": file_name,
"mime_type": mime_type,
"document_type": "OneDrive File",
"connector_type": "OneDrive",
}
if user_llm:
summary_content, summary_embedding = await generate_document_summary(
indexable_content, user_llm, doc_metadata_for_summary
)
else:
logger.warning("No LLM configured — using fallback summary")
summary_content = f"OneDrive File: {file_name}\n\n{indexable_content}"
summary_embedding = await asyncio.to_thread(embed_text, summary_content)
summary_content = f"OneDrive File: {file_name}\n\n{indexable_content}"
summary_embedding = await asyncio.to_thread(embed_text, summary_content)
chunks = await create_document_chunks(indexable_content)
now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

View file

@ -18,7 +18,6 @@ class TaskDispatcher(Protocol):
filename: str,
search_space_id: int,
user_id: str,
should_summarize: bool = False,
use_vision_llm: bool = False,
processing_mode: str = "basic",
) -> None: ...
@ -35,7 +34,6 @@ class CeleryTaskDispatcher:
filename: str,
search_space_id: int,
user_id: str,
should_summarize: bool = False,
use_vision_llm: bool = False,
processing_mode: str = "basic",
) -> None:
@ -49,7 +47,6 @@ class CeleryTaskDispatcher:
filename=filename,
search_space_id=search_space_id,
user_id=user_id,
should_summarize=should_summarize,
use_vision_llm=use_vision_llm,
processing_mode=processing_mode,
)