Merge pull request #1467 from AnishSarkar22/feat/ui-fixes

feat: improve chat navigation, automation UI, and summary-free indexing
This commit is contained in:
Rohan Verma 2026-06-04 13:55:50 -07:00 committed by GitHub
commit c2b8b3ac5e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
197 changed files with 3331 additions and 4051 deletions

View file

@ -54,6 +54,17 @@ USER_COLS = [
"premium_credit_micros_used",
]
# Columns of the automation_runs table that are listed for it in the
# publication DDL; rendered with _cols(AUTOMATION_RUN_COLS) where the
# SET TABLE statement is assembled.
AUTOMATION_RUN_COLS = [
"id",
"automation_id",
"trigger_id",
"status",
"step_results",
"started_at",
"finished_at",
"created_at",
]
def _has_zero_version(conn, table: str) -> bool:
return (
conn.execute(
@ -150,7 +161,8 @@ def _build_set_table_ddl(
f"new_chat_messages, "
f"chat_comments, "
f"chat_session_state, "
f'"user" ({_cols(user_cols)})'
f'"user" ({_cols(user_cols)}), '
f"automation_runs ({_cols(AUTOMATION_RUN_COLS)})"
)
@ -523,7 +535,7 @@ def downgrade() -> None:
if exists:
documents_has_zero_ver = _has_zero_version(conn, "documents")
user_has_zero_ver = _has_zero_version(conn, "user")
# Restore the publication shape from migration 143.
# Restore the publication shape from migration 148.
doc_cols = DOCUMENT_COLS + (['"_0_version"'] if documents_has_zero_ver else [])
user_cols = USER_COLS + (['"_0_version"'] if user_has_zero_ver else [])
ddl = (
@ -535,7 +547,8 @@ def downgrade() -> None:
f"new_chat_messages, "
f"chat_comments, "
f"chat_session_state, "
f'"user" ({_cols(user_cols)})'
f'"user" ({_cols(user_cols)}), '
f"automation_runs ({_cols(AUTOMATION_RUN_COLS)})"
)
tx = conn.begin_nested() if conn.in_transaction() else conn.begin()
with tx:

View file

@ -0,0 +1,134 @@
"""remove document summary llm settings
Revision ID: 154
Revises: 153
"""
from collections.abc import Sequence
import sqlalchemy as sa
from alembic import op
# Alembic revision identifiers (this revision and the one it revises).
revision: str = "154"
down_revision: str | None = "153"
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
# Name of the PostgreSQL publication whose table list is re-applied by this
# migration (looked up in pg_publication, rewritten with ALTER PUBLICATION).
PUBLICATION_NAME = "zero_publication"
# Columns of "documents" listed in the publication. '"_0_version"' is appended
# at runtime by _set_table_ddl when the table has that column.
DOCUMENT_COLS = [
"id",
"title",
"document_type",
"search_space_id",
"folder_id",
"created_by_id",
"status",
"created_at",
"updated_at",
]
# Columns of "user" listed in the publication; same runtime '"_0_version"'
# handling as DOCUMENT_COLS.
USER_COLS = [
"id",
"pages_limit",
"pages_used",
"premium_credit_micros_limit",
"premium_credit_micros_used",
]
# Columns of "automation_runs" listed in the publication. Used as-is: no
# '"_0_version"' check is performed for this table.
AUTOMATION_RUN_COLS = [
"id",
"automation_id",
"trigger_id",
"status",
"step_results",
"started_at",
"finished_at",
"created_at",
]
def _column_exists(conn, table: str, column: str) -> bool:
    """Return True when ``table`` has a column named ``column``.

    The lookup is limited to schemas on the connection's search path.
    ``information_schema.columns`` spans every schema the role can see, so
    matching on ``table_name`` alone could be satisfied by a same-named table
    in an unrelated schema and send the idempotency guards in
    ``upgrade``/``downgrade`` down the wrong branch.

    Args:
        conn: Connection used by the migration (the result of ``op.get_bind()``).
        table: Unquoted table name, e.g. ``"user"``.
        column: Unquoted column name to look for.

    Returns:
        True if a matching row exists in ``information_schema.columns``.
    """
    row = conn.execute(
        sa.text(
            "SELECT 1 FROM information_schema.columns "
            # current_schemas(false) = search-path schemas, without implicit
            # system schemas; unqualified DDL in this migration resolves there.
            "WHERE table_schema = ANY (current_schemas(false)) "
            "AND table_name = :table AND column_name = :column"
        ),
        {"table": table, "column": column},
    ).fetchone()
    return row is not None
def _has_zero_version(conn, table: str) -> bool:
    """Return True when ``table`` carries a ``_0_version`` column."""
    zero_version_column = "_0_version"
    return _column_exists(conn, table, zero_version_column)
def _set_table_ddl(conn) -> str:
    """Build the ``ALTER PUBLICATION ... SET TABLE`` statement for the publication.

    ``documents`` and ``"user"`` are published with explicit column lists,
    extended by ``"_0_version"`` when the table has that column;
    ``automation_runs`` is published with its fixed column list; the remaining
    tables are listed without a column list.

    Args:
        conn: Connection used to check for the ``_0_version`` columns.

    Returns:
        The complete DDL statement as a string.
    """

    def _entry(relation: str, columns: list[str]) -> str:
        # Render one "<relation> (<col>, <col>, ...)" publication member.
        return f"{relation} ({', '.join(columns)})"

    zero_version = '"_0_version"'
    # Checked in the same order as the tables appear in the statement.
    document_columns = (
        [*DOCUMENT_COLS, zero_version]
        if _has_zero_version(conn, "documents")
        else DOCUMENT_COLS
    )
    user_columns = (
        [*USER_COLS, zero_version] if _has_zero_version(conn, "user") else USER_COLS
    )

    members = (
        "notifications",
        _entry("documents", document_columns),
        "folders",
        "search_source_connectors",
        "new_chat_messages",
        "chat_comments",
        "chat_session_state",
        _entry('"user"', user_columns),
        _entry("automation_runs", AUTOMATION_RUN_COLS),
    )
    return "ALTER PUBLICATION " + PUBLICATION_NAME + " SET TABLE " + ", ".join(members)
def _resync_zero_publication(tag: str) -> None:
    """Re-apply the publication's table list, if the publication exists.

    Returns without doing anything when no publication named
    ``PUBLICATION_NAME`` is found in ``pg_publication``. Otherwise, inside a
    savepoint (connection already in a transaction) or a new transaction, it
    sets the publication comment to ``pre-<tag>``, runs the statement from
    ``_set_table_ddl``, then sets the comment to ``post-<tag>``.

    Args:
        tag: Label embedded in both publication comments. It is interpolated
            directly into the SQL text, so only trusted constants belong here.
    """
    bind = op.get_bind()
    publication_row = bind.execute(
        sa.text("SELECT 1 FROM pg_publication WHERE pubname = :name"),
        {"name": PUBLICATION_NAME},
    ).fetchone()
    if not publication_row:
        return

    if bind.in_transaction():
        scope = bind.begin_nested()
    else:
        scope = bind.begin()

    comment_prefix = f"COMMENT ON PUBLICATION {PUBLICATION_NAME} IS "
    with scope:
        bind.execute(sa.text(f"{comment_prefix}'pre-{tag}'"))
        # The DDL is built here, between the two comments, so its column
        # lookups run inside the same transaction scope.
        bind.execute(sa.text(_set_table_ddl(bind)))
        bind.execute(sa.text(f"{comment_prefix}'post-{tag}'"))
def upgrade() -> None:
    """Drop the document-summary columns, then re-apply the publication.

    Each column is dropped only if it is currently present, so the step can be
    re-run against a database where it has already been removed.
    """
    bind = op.get_bind()
    obsolete_columns = (
        ("searchspaces", "document_summary_llm_id"),
        ("search_source_connectors", "enable_summary"),
    )
    for table_name, column_name in obsolete_columns:
        if _column_exists(bind, table_name, column_name):
            op.drop_column(table_name, column_name)
    _resync_zero_publication("154-summary-removal")
def downgrade() -> None:
    """Re-create the document-summary columns, then re-apply the publication.

    ``searchspaces.document_summary_llm_id`` comes back as a nullable Integer
    with server default ``0``; ``search_source_connectors.enable_summary``
    comes back as a non-null Boolean with server default ``false``. A column is
    added only when it is not already present.
    """
    bind = op.get_bind()

    summary_llm_missing = not _column_exists(
        bind, "searchspaces", "document_summary_llm_id"
    )
    if summary_llm_missing:
        summary_llm_column = sa.Column(
            "document_summary_llm_id",
            sa.Integer(),
            nullable=True,
            server_default="0",
        )
        op.add_column("searchspaces", summary_llm_column)

    enable_summary_missing = not _column_exists(
        bind, "search_source_connectors", "enable_summary"
    )
    if enable_summary_missing:
        enable_summary_column = sa.Column(
            "enable_summary",
            sa.Boolean(),
            nullable=False,
            server_default=sa.text("false"),
        )
        op.add_column("search_source_connectors", enable_summary_column)

    _resync_zero_publication("154-summary-removal-downgrade")

View file

@ -16,7 +16,7 @@ from app.agents.shared.receipt import make_receipt
from app.agents.shared.receipt_command import with_receipt
from app.db import Report, shielded_async_session
from app.services.connector_service import ConnectorService
from app.services.llm_service import get_document_summary_llm
from app.services.llm_service import get_agent_llm
logger = logging.getLogger(__name__)
@ -546,7 +546,7 @@ def create_generate_report_tool(
Factory function to create the generate_report tool with injected dependencies.
The tool generates a Markdown report inline using the search space's
document summary LLM, saves it to the database, and returns immediately.
agent LLM, saves it to the database, and returns immediately.
Uses short-lived database sessions for each DB operation so no connection
is held during the long LLM API call.
@ -767,7 +767,7 @@ def create_generate_report_tool(
"creating standalone report"
)
llm = await get_document_summary_llm(read_session, search_space_id)
llm = await get_agent_llm(read_session, search_space_id)
# read_session closed — connection returned to pool
if not llm:

View file

@ -17,7 +17,7 @@ from langgraph.types import Command
from app.agents.shared.receipt import make_receipt
from app.agents.shared.receipt_command import with_receipt
from app.db import Report, shielded_async_session
from app.services.llm_service import get_document_summary_llm
from app.services.llm_service import get_agent_llm
logger = logging.getLogger(__name__)
@ -578,7 +578,7 @@ def create_generate_resume_tool(
f"(group {report_group_id})"
)
llm = await get_document_summary_llm(read_session, search_space_id)
llm = await get_agent_llm(read_session, search_space_id)
if not llm:
error_msg = (

View file

@ -35,7 +35,7 @@ from langchain_core.tools import tool
from app.db import Report, shielded_async_session
from app.services.connector_service import ConnectorService
from app.services.llm_service import get_document_summary_llm
from app.services.llm_service import get_agent_llm
logger = logging.getLogger(__name__)
@ -565,7 +565,7 @@ def create_generate_report_tool(
Factory function to create the generate_report tool with injected dependencies.
The tool generates a Markdown report inline using the search space's
document summary LLM, saves it to the database, and returns immediately.
agent LLM, saves it to the database, and returns immediately.
Uses short-lived database sessions for each DB operation so no connection
is held during the long LLM API call.
@ -768,7 +768,7 @@ def create_generate_report_tool(
"creating standalone report"
)
llm = await get_document_summary_llm(read_session, search_space_id)
llm = await get_agent_llm(read_session, search_space_id)
# read_session closed — connection returned to pool
if not llm:

View file

@ -26,7 +26,7 @@ from langchain_core.messages import HumanMessage
from langchain_core.tools import tool
from app.db import Report, shielded_async_session
from app.services.llm_service import get_document_summary_llm
from app.services.llm_service import get_agent_llm
logger = logging.getLogger(__name__)
@ -547,7 +547,7 @@ def create_generate_resume_tool(
f"(group {report_group_id})"
)
llm = await get_document_summary_llm(read_session, search_space_id)
llm = await get_agent_llm(read_session, search_space_id)
if not llm:
error_msg = (

View file

@ -31,12 +31,10 @@ async def create_podcast_transcript(
search_space_id = configuration.search_space_id
user_prompt = configuration.user_prompt
# Get search space's document summary LLM
# Use the search space's agent LLM for podcast transcript generation.
llm = await get_agent_llm(state.db_session, search_space_id)
if not llm:
error_message = (
f"No document summary LLM configured for search space {search_space_id}"
)
error_message = f"No agent LLM configured for search space {search_space_id}"
print(error_message)
raise RuntimeError(error_message)

View file

@ -103,7 +103,7 @@ def init_worker(**kwargs):
"""Initialize the LLM Router and Image Gen Router when a Celery worker process starts.
This ensures the Auto mode (LiteLLM Router) is available for background tasks
like document summarization and image generation.
like agent workflows and image generation.
"""
from app.observability.bootstrap import init_otel

View file

@ -141,7 +141,6 @@ async def download_and_process_file(
task_logger: TaskLoggingService,
log_entry: Log,
connector_id: int | None = None,
enable_summary: bool = True,
) -> tuple[Any, str | None, dict[str, Any] | None]:
"""
Download Google Drive file and process using Surfsense file processors.
@ -215,8 +214,6 @@ async def download_and_process_file(
"source_connector": "google_drive",
},
}
# Include connector_id for de-indexing support
connector_info["enable_summary"] = enable_summary
if connector_id is not None:
connector_info["connector_id"] = connector_id

View file

@ -1781,9 +1781,6 @@ class SearchSpace(BaseModel, TimestampMixin):
agent_llm_id = Column(
Integer, nullable=True, default=0
) # For agent/chat operations, defaults to Auto mode
document_summary_llm_id = Column(
Integer, nullable=True, default=0
) # For document summarization, defaults to Auto mode
image_generation_config_id = Column(
Integer, nullable=True, default=0
) # For image generation, defaults to Auto mode
@ -1951,12 +1948,6 @@ class SearchSourceConnector(BaseModel, TimestampMixin):
last_indexed_at = Column(TIMESTAMP(timezone=True), nullable=True)
config = Column(JSON, nullable=False)
# Summary generation (LLM-based) - disabled by default to save resources.
# When enabled, improves hybrid search quality at the cost of LLM calls.
enable_summary = Column(
Boolean, nullable=False, default=False, server_default="false"
)
# Vision LLM for image files - disabled by default to save cost/time.
# When enabled, images are described via a vision language model instead
# of falling back to the document parser.
@ -2972,7 +2963,7 @@ async def shielded_async_session():
async def setup_indexes():
async with engine.begin() as conn:
# Create indexes
# Document Summary Indexes
# Document embedding indexes
await conn.execute(
text(
"CREATE INDEX IF NOT EXISTS document_vector_index ON documents USING hnsw (embedding public.vector_cosine_ops)"

View file

@ -18,8 +18,6 @@ class UploadDocumentAdapter:
etl_service: str,
search_space_id: int,
user_id: str,
llm,
should_summarize: bool = False,
) -> None:
connector_doc = ConnectorDocument(
title=filename,
@ -29,9 +27,7 @@ class UploadDocumentAdapter:
search_space_id=search_space_id,
created_by_id=user_id,
connector_id=None,
should_summarize=should_summarize,
should_use_code_chunker=False,
fallback_summary=markdown_content[:4000],
metadata={
"FILE_NAME": filename,
"ETL_SERVICE": etl_service,
@ -43,7 +39,7 @@ class UploadDocumentAdapter:
if not documents:
raise RuntimeError("prepare_for_indexing returned no documents")
indexed = await self._service.index(documents[0], connector_doc, llm)
indexed = await self._service.index(documents[0], connector_doc)
if not DocumentStatus.is_state(indexed.status, DocumentStatus.READY):
raise RuntimeError(indexed.status.get("reason", "Indexing failed"))
@ -51,7 +47,7 @@ class UploadDocumentAdapter:
indexed.content_needs_reindexing = False
await self._session.commit()
async def reindex(self, document: Document, llm) -> None:
async def reindex(self, document: Document) -> None:
"""Re-index an existing document after its source_markdown has been updated."""
if not document.source_markdown:
raise RuntimeError("Document has no source_markdown to reindex")
@ -66,15 +62,13 @@ class UploadDocumentAdapter:
search_space_id=document.search_space_id,
created_by_id=str(document.created_by_id),
connector_id=document.connector_id,
should_summarize=True,
should_use_code_chunker=False,
fallback_summary=document.source_markdown[:4000],
metadata=metadata,
)
document.content_hash = compute_content_hash(connector_doc)
indexed = await self._service.index(document, connector_doc, llm)
indexed = await self._service.index(document, connector_doc)
if not DocumentStatus.is_state(indexed.status, DocumentStatus.READY):
raise RuntimeError(indexed.status.get("reason", "Reindexing failed"))

View file

@ -11,9 +11,7 @@ class ConnectorDocument(BaseModel):
unique_id: str
document_type: DocumentType
search_space_id: int = Field(gt=0)
should_summarize: bool = True
should_use_code_chunker: bool = False
fallback_summary: str | None = None
metadata: dict = {}
connector_id: int | None = None
created_by_id: str

View file

@ -1,30 +0,0 @@
from app.prompts import SUMMARY_PROMPT_TEMPLATE
from app.utils.document_converters import optimize_content_for_context_window
async def summarize_document(
    source_markdown: str, llm, metadata: dict | None = None
) -> str:
    """Summarize a document with an LLM and prepend a metadata header if given.

    Args:
        source_markdown: Document text to summarize.
        llm: Model piped after ``SUMMARY_PROMPT_TEMPLATE``; its ``model``
            attribute (default ``"gpt-3.5-turbo"``) is handed to
            ``optimize_content_for_context_window``.
        metadata: Optional mapping. It is embedded in the prompt and, when
            non-empty, its truthy entries are rendered as a header above the
            summary.

    Returns:
        The summary text, preceded by a ``# DOCUMENT METADATA`` section when
        ``metadata`` is non-empty.
    """
    model_name = getattr(llm, "model", "gpt-3.5-turbo")
    # Presumably trims the text to fit the model's context window — the helper
    # is defined elsewhere; confirm there.
    fitted_content = optimize_content_for_context_window(
        source_markdown, metadata, model_name
    )

    chain = SUMMARY_PROMPT_TEMPLATE | llm
    prompt_document = (
        f"<DOCUMENT><DOCUMENT_METADATA>\n\n{metadata}\n\n</DOCUMENT_METADATA>"
        f"\n\n<DOCUMENT_CONTENT>\n\n{fitted_content}\n\n</DOCUMENT_CONTENT></DOCUMENT>"
    )
    response = await chain.ainvoke({"document": prompt_document})
    summary_text = response.content

    if not metadata:
        return summary_text

    header_lines = ["# DOCUMENT METADATA"]
    header_lines.extend(
        f"**{key.replace('_', ' ').title()}:** {value}"
        for key, value in metadata.items()
        if value  # entries with empty/falsy values are left out of the header
    )
    header = "\n".join(header_lines)
    return f"{header}\n\n# DOCUMENT SUMMARY\n\n{summary_text}"

View file

@ -31,7 +31,6 @@ from app.indexing_pipeline.document_persistence import (
attach_chunks_to_document,
rollback_and_persist_failure,
)
from app.indexing_pipeline.document_summarizer import summarize_document
from app.indexing_pipeline.exceptions import (
EMBEDDING_ERRORS,
PERMANENT_LLM_ERRORS,
@ -203,9 +202,7 @@ class IndexingPipelineService:
await self.session.commit()
async def index_batch(
self, connector_docs: list[ConnectorDocument], llm
) -> list[Document]:
async def index_batch(self, connector_docs: list[ConnectorDocument]) -> list[Document]:
"""Convenience method: prepare_for_indexing then index each document.
Indexers that need heartbeat callbacks or custom per-document logic
@ -218,7 +215,7 @@ class IndexingPipelineService:
connector_doc = doc_map.get(document.unique_identifier_hash)
if connector_doc is None:
continue
result = await self.index(document, connector_doc, llm)
result = await self.index(document, connector_doc)
results.append(result)
return results
@ -350,11 +347,9 @@ class IndexingPipelineService:
await self.session.rollback()
return []
async def index(
self, document: Document, connector_doc: ConnectorDocument, llm
) -> Document:
async def index(self, document: Document, connector_doc: ConnectorDocument) -> Document:
"""
Run summarization, embedding, and chunking for a document and persist the results.
Run deterministic content storage, embedding, and chunking for a document.
"""
ctx = PipelineLogContext(
connector_id=connector_doc.connector_id,
@ -379,20 +374,7 @@ class IndexingPipelineService:
document.status = DocumentStatus.processing()
await self.session.commit()
t_step = time.perf_counter()
if connector_doc.should_summarize and llm is not None:
content = await summarize_document(
connector_doc.source_markdown, llm, connector_doc.metadata
)
perf.info(
"[indexing] summarize_document doc=%d in %.3fs",
document.id,
time.perf_counter() - t_step,
)
elif connector_doc.should_summarize and connector_doc.fallback_summary:
content = connector_doc.fallback_summary
else:
content = connector_doc.source_markdown
content = connector_doc.source_markdown
await self.session.execute(
delete(Chunk).where(Chunk.document_id == document.id)
@ -523,7 +505,6 @@ class IndexingPipelineService:
async def index_batch_parallel(
self,
connector_docs: list[ConnectorDocument],
get_llm: Callable[[AsyncSession], Awaitable],
*,
max_concurrency: int = 4,
on_heartbeat: Callable[[int], Awaitable[None]] | None = None,
@ -532,8 +513,8 @@ class IndexingPipelineService:
"""Index documents in parallel with bounded concurrency.
Phase 1 (serial): prepare_for_indexing using self.session.
Phase 2 (parallel): index each document in an isolated session,
bounded by a semaphore to avoid overwhelming APIs/DB.
Phase 2 (parallel): index each document in an isolated session, bounded
by a semaphore to avoid overwhelming embedding APIs/DB.
"""
logger = logging.getLogger(__name__)
perf = get_perf_logger()
@ -577,9 +558,8 @@ class IndexingPipelineService:
failed_count += 1
return document
llm = await get_llm(isolated_session)
iso_pipeline = IndexingPipelineService(isolated_session)
result = await iso_pipeline.index(refetched, connector_doc, llm)
result = await iso_pipeline.index(refetched, connector_doc)
async with lock:
if DocumentStatus.is_state(

View file

@ -125,7 +125,6 @@ async def create_documents(
async def create_documents_file_upload(
files: list[UploadFile],
search_space_id: int = Form(...),
should_summarize: bool = Form(False),
use_vision_llm: bool = Form(False),
processing_mode: str = Form("basic"),
session: AsyncSession = Depends(get_async_session),
@ -309,7 +308,6 @@ async def create_documents_file_upload(
filename=filename,
search_space_id=search_space_id,
user_id=str(user.id),
should_summarize=should_summarize,
use_vision_llm=use_vision_llm,
processing_mode=validated_mode.value,
)
@ -1586,7 +1584,6 @@ async def folder_upload(
search_space_id: int = Form(...),
relative_paths: str = Form(...),
root_folder_id: int | None = Form(None),
enable_summary: bool = Form(False),
use_vision_llm: bool = Form(False),
processing_mode: str = Form("basic"),
session: AsyncSession = Depends(get_async_session),
@ -1719,7 +1716,6 @@ async def folder_upload(
user_id=str(user.id),
folder_name=folder_name,
root_folder_id=root_folder_id,
enable_summary=enable_summary,
use_vision_llm=use_vision_llm,
file_mappings=list(file_mappings),
processing_mode=validated_mode.value,

View file

@ -617,9 +617,6 @@ async def get_llm_preferences(
# Get full config objects for each role
agent_llm = await _get_llm_config_by_id(session, search_space.agent_llm_id)
document_summary_llm = await _get_llm_config_by_id(
session, search_space.document_summary_llm_id
)
image_generation_config = await _get_image_gen_config_by_id(
session, search_space.image_generation_config_id
)
@ -629,11 +626,9 @@ async def get_llm_preferences(
return LLMPreferencesRead(
agent_llm_id=search_space.agent_llm_id,
document_summary_llm_id=search_space.document_summary_llm_id,
image_generation_config_id=search_space.image_generation_config_id,
vision_llm_config_id=search_space.vision_llm_config_id,
agent_llm=agent_llm,
document_summary_llm=document_summary_llm,
image_generation_config=image_generation_config,
vision_llm_config=vision_llm_config,
)
@ -707,9 +702,6 @@ async def update_llm_preferences(
# Get full config objects for response
agent_llm = await _get_llm_config_by_id(session, search_space.agent_llm_id)
document_summary_llm = await _get_llm_config_by_id(
session, search_space.document_summary_llm_id
)
image_generation_config = await _get_image_gen_config_by_id(
session, search_space.image_generation_config_id
)
@ -719,11 +711,9 @@ async def update_llm_preferences(
return LLMPreferencesRead(
agent_llm_id=search_space.agent_llm_id,
document_summary_llm_id=search_space.document_summary_llm_id,
image_generation_config_id=search_space.image_generation_config_id,
vision_llm_config_id=search_space.vision_llm_config_id,
agent_llm=agent_llm,
document_summary_llm=document_summary_llm,
image_generation_config=image_generation_config,
vision_llm_config=vision_llm_config,
)

View file

@ -221,9 +221,6 @@ class LLMPreferencesRead(BaseModel):
agent_llm_id: int | None = Field(
None, description="ID of the LLM config to use for agent/chat tasks"
)
document_summary_llm_id: int | None = Field(
None, description="ID of the LLM config to use for document summarization"
)
image_generation_config_id: int | None = Field(
None, description="ID of the image generation config to use"
)
@ -234,9 +231,6 @@ class LLMPreferencesRead(BaseModel):
agent_llm: dict[str, Any] | None = Field(
None, description="Full config for agent LLM"
)
document_summary_llm: dict[str, Any] | None = Field(
None, description="Full config for document summary LLM"
)
image_generation_config: dict[str, Any] | None = Field(
None, description="Full config for image generation"
)
@ -253,9 +247,6 @@ class LLMPreferencesUpdate(BaseModel):
agent_llm_id: int | None = Field(
None, description="ID of the LLM config to use for agent/chat tasks"
)
document_summary_llm_id: int | None = Field(
None, description="ID of the LLM config to use for document summarization"
)
image_generation_config_id: int | None = Field(
None, description="ID of the image generation config to use"
)

View file

@ -16,7 +16,6 @@ class SearchSourceConnectorBase(BaseModel):
is_indexable: bool
last_indexed_at: datetime | None = None
config: dict[str, Any]
enable_summary: bool = False
enable_vision_llm: bool = False
periodic_indexing_enabled: bool = False
indexing_frequency_minutes: int | None = None
@ -67,7 +66,6 @@ class SearchSourceConnectorUpdate(BaseModel):
is_indexable: bool | None = None
last_indexed_at: datetime | None = None
config: dict[str, Any] | None = None
enable_summary: bool | None = None
enable_vision_llm: bool | None = None
periodic_indexing_enabled: bool | None = None
indexing_frequency_minutes: int | None = None

View file

@ -9,7 +9,6 @@ from app.utils.document_converters import (
create_document_chunks,
embed_text,
generate_content_hash,
generate_document_summary,
generate_unique_identifier_hash,
)
@ -65,29 +64,11 @@ class ConfluenceKBSyncService:
if dup:
content_hash = unique_hash
from app.services.llm_service import get_user_long_context_llm
user_llm = await get_user_long_context_llm(
self.db_session,
user_id,
search_space_id,
disable_streaming=True,
)
doc_metadata_for_summary = {
"page_title": page_title,
"space_id": space_id,
"document_type": "Confluence Page",
"connector_type": "Confluence",
}
if user_llm:
summary_content, summary_embedding = await generate_document_summary(
page_content, user_llm, doc_metadata_for_summary
)
else:
summary_content = f"Confluence Page: {page_title}\n\n{page_content}"
summary_embedding = embed_text(summary_content)
summary_content = f"Confluence Page: {page_title}\n\n{page_content}"
summary_embedding = embed_text(summary_content)
chunks = await create_document_chunks(page_content)
now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
@ -185,25 +166,10 @@ class ConfluenceKBSyncService:
space_id = (document.document_metadata or {}).get("space_id", "")
from app.services.llm_service import get_user_long_context_llm
user_llm = await get_user_long_context_llm(
self.db_session, user_id, search_space_id, disable_streaming=True
)
if user_llm:
doc_meta = {
"page_title": page_title,
"space_id": space_id,
"document_type": "Confluence Page",
"connector_type": "Confluence",
}
summary_content, summary_embedding = await generate_document_summary(
page_content, user_llm, doc_meta
)
else:
summary_content = f"Confluence Page: {page_title}\n\n{page_content}"
summary_embedding = embed_text(summary_content)
summary_content = f"Confluence Page: {page_title}\n\n{page_content}"
summary_embedding = embed_text(summary_content)
chunks = await create_document_chunks(page_content)

View file

@ -191,149 +191,6 @@ class DoclingService:
logger.error(f"Full traceback: {traceback.format_exc()}")
raise RuntimeError(f"Docling processing failed: {e}") from e
async def process_large_document_summary(
    self, content: str, llm, document_title: str = "Document"
) -> str:
    """
    Summarize a document with an LLM, chunking the content when it is large.

    Content of at most 100,000 characters goes through
    ``SUMMARY_PROMPT_TEMPLATE`` in a single call. Longer content is split into
    overlapping chunks, each chunk is summarized separately, and the section
    summaries are merged by one further LLM call.

    Args:
        content: The full document content
        llm: The language model to use for summarization
        document_title: Title of the document, quoted in the merge prompt

    Returns:
        The summary text. A chunk that fails is represented by a
        "[Processing failed]" section; if the merge call fails, the section
        summaries joined by blank lines are returned instead.
    """
    content_length = len(content)
    # Cutoff in characters (noted by the original author as roughly 25K tokens).
    direct_limit = 100_000

    if content_length <= direct_limit:
        # Small enough for a single prompt.
        logger.info(
            f"📄 Document size: {content_length} chars - using direct processing"
        )
        from app.prompts import SUMMARY_PROMPT_TEMPLATE

        direct_response = await (SUMMARY_PROMPT_TEMPLATE | llm).ainvoke(
            {"document": content}
        )
        return direct_response.content

    logger.info(
        f"📚 Large document detected: {content_length} chars - using chunked processing"
    )

    from chonkie import OverlapRefinery, RecursiveChunker
    from langchain_core.prompts import PromptTemplate

    # Chunk first, then add overlap so each chunk carries context from its neighbor.
    splitter = RecursiveChunker(
        chunk_size=8000  # Conservative for most LLMs
    )
    overlapper = OverlapRefinery(
        context_size=0.1,  # 10% overlap for context preservation
        method="suffix",  # Add next chunk context to current chunk
    )
    pieces = overlapper.refine(splitter.chunk(content))
    total_chunks = len(pieces)
    logger.info(f"📄 Split into {total_chunks} chunks for LLM processing")

    # Prompt used for every individual chunk.
    per_chunk_prompt = PromptTemplate(
        input_variables=["chunk", "chunk_number", "total_chunks"],
        template="""<INSTRUCTIONS>
You are summarizing chunk {chunk_number} of {total_chunks} from a large document.
Create a comprehensive summary of this document chunk. Focus on:
- Key concepts, facts, and information
- Important details and context
- Main topics and themes
Provide a clear, structured summary that captures the essential content.
Chunk {chunk_number}/{total_chunks}:
<document_chunk>
{chunk}
</document_chunk>
</INSTRUCTIONS>""",
    )

    # Summarize chunk by chunk; one failing chunk must not abort the rest.
    section_summaries = []
    for position, piece in enumerate(pieces, 1):
        section_header = f"=== Section {position} ==="
        try:
            logger.info(
                f"🔄 Processing chunk {position}/{total_chunks} ({len(piece.text)} chars)"
            )
            piece_response = await (per_chunk_prompt | llm).ainvoke(
                {
                    "chunk": piece.text,
                    "chunk_number": position,
                    "total_chunks": total_chunks,
                }
            )
            section_summaries.append(f"{section_header}\n{piece_response.content}")
            logger.info(f"✅ Completed chunk {position}/{total_chunks}")
        except Exception as e:
            logger.error(f"❌ Failed to process chunk {position}/{total_chunks}: {e}")
            section_summaries.append(f"{section_header}\n[Processing failed]")

    # Merge the section summaries into one final summary.
    logger.info(f"🔄 Combining {len(section_summaries)} chunk summaries")
    try:
        merge_prompt = PromptTemplate(
            input_variables=["summaries", "document_title"],
            template="""<INSTRUCTIONS>
You are combining multiple section summaries into a final comprehensive document summary.
Create a unified, coherent summary from the following section summaries of "{document_title}".
Ensure:
- Logical flow and organization
- No redundancy or repetition
- Comprehensive coverage of all key points
- Professional, objective tone
<section_summaries>
{summaries}
</section_summaries>
</INSTRUCTIONS>""",
        )
        joined_sections = "\n\n".join(section_summaries)
        merged_response = await (merge_prompt | llm).ainvoke(
            {"summaries": joined_sections, "document_title": document_title}
        )
        overall_summary = merged_response.content
        logger.info(
            f"✅ Large document processing complete: {len(overall_summary)} chars summary"
        )
        return overall_summary
    except Exception as e:
        logger.error(f"❌ Failed to combine summaries: {e}")
        # Fallback: return concatenated chunk summaries
        concatenated = "\n\n".join(section_summaries)
        logger.warning("⚠️ Using fallback combined summary")
        return concatenated
def create_docling_service() -> DoclingService:
"""Create a Docling service instance."""

View file

@ -9,7 +9,6 @@ from app.utils.document_converters import (
create_document_chunks,
embed_text,
generate_content_hash,
generate_document_summary,
)
logger = logging.getLogger(__name__)
@ -72,29 +71,11 @@ class DropboxKBSyncService:
)
content_hash = unique_hash
from app.services.llm_service import get_user_long_context_llm
user_llm = await get_user_long_context_llm(
self.db_session,
user_id,
search_space_id,
disable_streaming=True,
)
doc_metadata_for_summary = {
"file_name": file_name,
"document_type": "Dropbox File",
"connector_type": "Dropbox",
}
if user_llm:
summary_content, summary_embedding = await generate_document_summary(
indexable_content, user_llm, doc_metadata_for_summary
)
else:
logger.warning("No LLM configured — using fallback summary")
summary_content = f"Dropbox File: {file_name}\n\n{indexable_content}"
summary_embedding = embed_text(summary_content)
summary_content = f"Dropbox File: {file_name}\n\n{indexable_content}"
summary_embedding = embed_text(summary_content)
chunks = await create_document_chunks(indexable_content)
now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

View file

@ -9,7 +9,6 @@ from app.utils.document_converters import (
create_document_chunks,
embed_text,
generate_content_hash,
generate_document_summary,
generate_unique_identifier_hash,
)
@ -78,30 +77,11 @@ class GmailKBSyncService:
)
content_hash = unique_hash
from app.services.llm_service import get_user_long_context_llm
user_llm = await get_user_long_context_llm(
self.db_session,
user_id,
search_space_id,
disable_streaming=True,
)
doc_metadata_for_summary = {
"subject": subject,
"sender": sender,
"document_type": "Gmail Message",
"connector_type": "Gmail",
}
if user_llm:
summary_content, summary_embedding = await generate_document_summary(
indexable_content, user_llm, doc_metadata_for_summary
)
else:
logger.warning("No LLM configured -- using fallback summary")
summary_content = f"Gmail Message: {subject}\n\n{indexable_content}"
summary_embedding = await asyncio.to_thread(embed_text, summary_content)
summary_content = f"Gmail Message: {subject}\n\n{indexable_content}"
summary_embedding = await asyncio.to_thread(embed_text, summary_content)
chunks = await create_document_chunks(indexable_content)
now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

View file

@ -19,7 +19,6 @@ from app.utils.document_converters import (
create_document_chunks,
embed_text,
generate_content_hash,
generate_document_summary,
generate_unique_identifier_hash,
)
@ -90,33 +89,13 @@ class GoogleCalendarKBSyncService:
)
content_hash = unique_hash
from app.services.llm_service import get_user_long_context_llm
user_llm = await get_user_long_context_llm(
self.db_session,
user_id,
search_space_id,
disable_streaming=True,
summary_content = (
f"Google Calendar Event: {event_summary}\n\n{indexable_content}"
)
doc_metadata_for_summary = {
"event_summary": event_summary,
"start_time": start_time,
"end_time": end_time,
"document_type": "Google Calendar Event",
"connector_type": "Google Calendar",
}
if user_llm:
summary_content, summary_embedding = await generate_document_summary(
indexable_content, user_llm, doc_metadata_for_summary
)
else:
logger.warning("No LLM configured -- using fallback summary")
summary_content = (
f"Google Calendar Event: {event_summary}\n\n{indexable_content}"
)
summary_embedding = await asyncio.to_thread(embed_text, summary_content)
summary_embedding = await asyncio.to_thread(embed_text, summary_content)
chunks = await create_document_chunks(indexable_content)
now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
@ -273,29 +252,13 @@ class GoogleCalendarKBSyncService:
if not indexable_content:
return {"status": "error", "message": "Event produced empty content"}
from app.services.llm_service import get_user_long_context_llm
user_llm = await get_user_long_context_llm(
self.db_session, user_id, search_space_id, disable_streaming=True
summary_content = (
f"Google Calendar Event: {event_summary}\n\n{indexable_content}"
)
doc_metadata_for_summary = {
"event_summary": event_summary,
"start_time": start_time,
"end_time": end_time,
"document_type": "Google Calendar Event",
"connector_type": "Google Calendar",
}
if user_llm:
summary_content, summary_embedding = await generate_document_summary(
indexable_content, user_llm, doc_metadata_for_summary
)
else:
summary_content = (
f"Google Calendar Event: {event_summary}\n\n{indexable_content}"
)
summary_embedding = await asyncio.to_thread(embed_text, summary_content)
summary_embedding = await asyncio.to_thread(embed_text, summary_content)
chunks = await create_document_chunks(indexable_content)
now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

View file

@ -8,7 +8,6 @@ from app.utils.document_converters import (
create_document_chunks,
embed_text,
generate_content_hash,
generate_document_summary,
generate_unique_identifier_hash,
)
@ -74,32 +73,13 @@ class GoogleDriveKBSyncService:
)
content_hash = unique_hash
from app.services.llm_service import get_user_long_context_llm
user_llm = await get_user_long_context_llm(
self.db_session,
user_id,
search_space_id,
disable_streaming=True,
summary_content = (
f"Google Drive File: {file_name}\n\n{indexable_content}"
)
doc_metadata_for_summary = {
"file_name": file_name,
"mime_type": mime_type,
"document_type": "Google Drive File",
"connector_type": "Google Drive",
}
if user_llm:
summary_content, summary_embedding = await generate_document_summary(
indexable_content, user_llm, doc_metadata_for_summary
)
else:
logger.warning("No LLM configured — using fallback summary")
summary_content = (
f"Google Drive File: {file_name}\n\n{indexable_content}"
)
summary_embedding = embed_text(summary_content)
summary_embedding = embed_text(summary_content)
chunks = await create_document_chunks(indexable_content)
now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

View file

@ -9,7 +9,6 @@ from app.utils.document_converters import (
create_document_chunks,
embed_text,
generate_content_hash,
generate_document_summary,
generate_unique_identifier_hash,
)
@ -84,32 +83,13 @@ class LinearKBSyncService:
)
content_hash = unique_hash
from app.services.llm_service import get_user_long_context_llm
user_llm = await get_user_long_context_llm(
self.db_session,
user_id,
search_space_id,
disable_streaming=True,
summary_content = (
f"Linear Issue {issue_identifier}: {issue_title}\n\n{issue_content}"
)
doc_metadata_for_summary = {
"issue_id": issue_identifier,
"issue_title": issue_title,
"document_type": "Linear Issue",
"connector_type": "Linear",
}
if user_llm:
summary_content, summary_embedding = await generate_document_summary(
issue_content, user_llm, doc_metadata_for_summary
)
else:
logger.warning("No LLM configured — using fallback summary")
summary_content = (
f"Linear Issue {issue_identifier}: {issue_title}\n\n{issue_content}"
)
summary_embedding = embed_text(summary_content)
summary_embedding = embed_text(summary_content)
chunks = await create_document_chunks(issue_content)
now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
@ -227,30 +207,12 @@ class LinearKBSyncService:
comment_count = len(formatted_issue.get("comments", []))
formatted_issue.get("description", "")
from app.services.llm_service import get_user_long_context_llm
user_llm = await get_user_long_context_llm(
self.db_session, user_id, search_space_id, disable_streaming=True
summary_content = (
f"Linear Issue {issue_identifier}: {issue_title}\n\n{issue_content}"
)
if user_llm:
document_metadata_for_summary = {
"issue_id": issue_identifier,
"issue_title": issue_title,
"state": state,
"priority": priority,
"comment_count": comment_count,
"document_type": "Linear Issue",
"connector_type": "Linear",
}
summary_content, summary_embedding = await generate_document_summary(
issue_content, user_llm, document_metadata_for_summary
)
else:
summary_content = (
f"Linear Issue {issue_identifier}: {issue_title}\n\n{issue_content}"
)
summary_embedding = embed_text(summary_content)
summary_embedding = embed_text(summary_content)
chunks = await create_document_chunks(issue_content)

View file

@ -68,7 +68,6 @@ def _is_interactive_auth_provider(
# Role identifiers that get_search_space_llm_instance compares against to
# decide which of the search space's LLM config ids to load.
class LLMRole:
AGENT = "agent" # For agent/chat operations
DOCUMENT_SUMMARY = "document_summary" # For document summarization
def get_global_llm_config(llm_config_id: int) -> dict | None:
@ -266,7 +265,7 @@ async def get_search_space_llm_instance(
Args:
session: Database session
search_space_id: Search Space ID
role: LLM role ('agent' or 'document_summary')
role: LLM role ('agent')
Returns:
ChatLiteLLM or ChatLiteLLMRouter instance, or None if not found
@ -283,11 +282,8 @@ async def get_search_space_llm_instance(
return None
# Get the appropriate LLM config ID based on role
llm_config_id = None
if role == LLMRole.AGENT:
llm_config_id = search_space.agent_llm_id
elif role == LLMRole.DOCUMENT_SUMMARY:
llm_config_id = search_space.document_summary_llm_id
else:
logger.error(f"Invalid LLM role: {role}")
return None
@ -470,20 +466,13 @@ async def get_search_space_llm_instance(
async def get_agent_llm(
session: AsyncSession, search_space_id: int
) -> ChatLiteLLM | ChatLiteLLMRouter | None:
"""Get the search space's agent LLM instance for chat operations."""
return await get_search_space_llm_instance(session, search_space_id, LLMRole.AGENT)
async def get_document_summary_llm(
session: AsyncSession, search_space_id: int, disable_streaming: bool = False
) -> ChatLiteLLM | ChatLiteLLMRouter | None:
"""Get the search space's document summary LLM instance."""
"""Get the search space's agent LLM instance for chat operations."""
return await get_search_space_llm_instance(
session,
search_space_id,
LLMRole.DOCUMENT_SUMMARY,
LLMRole.AGENT,
disable_streaming=disable_streaming,
)
@ -645,22 +634,6 @@ async def get_vision_llm(
return None
# Backward-compatible alias (LLM preferences are now per-search-space, not per-user)
async def get_user_long_context_llm(
    session: AsyncSession,
    user_id: str,
    search_space_id: int,
    disable_streaming: bool = False,
) -> ChatLiteLLM | ChatLiteLLMRouter | None:
    """Deprecated alias that forwards to ``get_document_summary_llm``.

    ``user_id`` is accepted only so existing callers keep working; it is not
    passed on, because the lookup is keyed by ``search_space_id`` alone.
    """
    summary_llm = await get_document_summary_llm(
        session,
        search_space_id,
        disable_streaming=disable_streaming,
    )
    return summary_llm
def get_planner_llm() -> ChatLiteLLM | None:
"""Return a planner LLM instance from the first global config marked
``is_planner: true``, or ``None`` if no planner config is defined.

View file

@ -8,7 +8,6 @@ from app.utils.document_converters import (
create_document_chunks,
embed_text,
generate_content_hash,
generate_document_summary,
generate_unique_identifier_hash,
)
@ -73,30 +72,11 @@ class NotionKBSyncService:
)
content_hash = unique_hash
from app.services.llm_service import get_user_long_context_llm
user_llm = await get_user_long_context_llm(
self.db_session,
user_id,
search_space_id,
disable_streaming=True,
)
doc_metadata_for_summary = {
"page_title": page_title,
"page_id": page_id,
"document_type": "Notion Page",
"connector_type": "Notion",
}
if user_llm:
summary_content, summary_embedding = await generate_document_summary(
markdown_content, user_llm, doc_metadata_for_summary
)
else:
logger.warning("No LLM configured — using fallback summary")
summary_content = f"Notion Page: {page_title}\n\n{markdown_content}"
summary_embedding = embed_text(summary_content)
summary_content = f"Notion Page: {page_title}\n\n{markdown_content}"
summary_embedding = embed_text(summary_content)
chunks = await create_document_chunks(markdown_content)
now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
@ -245,31 +225,11 @@ class NotionKBSyncService:
f"Final content length: {len(full_content)} chars, verified={content_verified}"
)
from app.services.llm_service import get_user_long_context_llm
logger.debug("Generating summary and embeddings")
user_llm = await get_user_long_context_llm(
self.db_session,
user_id,
search_space_id,
disable_streaming=True, # disable streaming to avoid leaking into the chat
)
if user_llm:
document_metadata_for_summary = {
"page_title": document.document_metadata.get("page_title"),
"page_id": document.document_metadata.get("page_id"),
"document_type": "Notion Page",
"connector_type": "Notion",
}
summary_content, summary_embedding = await generate_document_summary(
full_content, user_llm, document_metadata_for_summary
)
logger.debug(f"Generated summary length: {len(summary_content)} chars")
else:
logger.warning("No LLM configured - using fallback summary")
summary_content = f"Notion Page: {document.document_metadata.get('page_title')}\n\n{full_content}"
summary_embedding = embed_text(summary_content)
summary_content = f"Notion Page: {document.document_metadata.get('page_title')}\n\n{full_content}"
summary_embedding = embed_text(summary_content)
logger.debug("Creating new chunks")
chunks = await create_document_chunks(full_content)

View file

@ -233,18 +233,6 @@ async def _resolve_attachment_vision_llm(
return await get_vision_llm(session, search_space_id)
async def _resolve_summary_llm(
    session: AsyncSession, *, user_id: str, search_space_id: int, should_summarize: bool
):
    """Return the summary LLM when summarization is requested, otherwise ``None``.

    The LLM service is imported lazily, and only on the path that needs it.
    """
    if should_summarize:
        from app.services.llm_service import get_user_long_context_llm

        summary_llm = await get_user_long_context_llm(
            session, user_id, search_space_id
        )
        return summary_llm

    return None
def _require_extracted_attachment_content(
*, content: str, etl_meta: dict[str, Any], path: str
) -> str:
@ -349,13 +337,6 @@ async def upsert_note(
path=payload.path,
)
llm = await _resolve_summary_llm(
session,
user_id=str(user_id),
search_space_id=search_space_id,
should_summarize=connector.enable_summary,
)
document_string = _build_document_string(
payload, vault_name, content_override=content_for_index
)
@ -374,8 +355,6 @@ async def upsert_note(
search_space_id=search_space_id,
connector_id=connector.id,
created_by_id=str(user_id),
should_summarize=connector.enable_summary,
fallback_summary=f"Obsidian Note: {payload.name}\n\n{content_for_index}",
metadata=metadata,
)
@ -388,7 +367,7 @@ async def upsert_note(
document = prepared[0]
return await pipeline.index(document, connector_doc, llm)
return await pipeline.index(document, connector_doc)
async def rename_note(

View file

@ -10,7 +10,6 @@ from app.utils.document_converters import (
create_document_chunks,
embed_text,
generate_content_hash,
generate_document_summary,
)
logger = logging.getLogger(__name__)
@ -73,30 +72,11 @@ class OneDriveKBSyncService:
)
content_hash = unique_hash
from app.services.llm_service import get_user_long_context_llm
user_llm = await get_user_long_context_llm(
self.db_session,
user_id,
search_space_id,
disable_streaming=True,
)
doc_metadata_for_summary = {
"file_name": file_name,
"mime_type": mime_type,
"document_type": "OneDrive File",
"connector_type": "OneDrive",
}
if user_llm:
summary_content, summary_embedding = await generate_document_summary(
indexable_content, user_llm, doc_metadata_for_summary
)
else:
logger.warning("No LLM configured — using fallback summary")
summary_content = f"OneDrive File: {file_name}\n\n{indexable_content}"
summary_embedding = await asyncio.to_thread(embed_text, summary_content)
summary_content = f"OneDrive File: {file_name}\n\n{indexable_content}"
summary_embedding = await asyncio.to_thread(embed_text, summary_content)
chunks = await create_document_chunks(indexable_content)
now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

View file

@ -18,7 +18,6 @@ class TaskDispatcher(Protocol):
filename: str,
search_space_id: int,
user_id: str,
should_summarize: bool = False,
use_vision_llm: bool = False,
processing_mode: str = "basic",
) -> None: ...
@ -35,7 +34,6 @@ class CeleryTaskDispatcher:
filename: str,
search_space_id: int,
user_id: str,
should_summarize: bool = False,
use_vision_llm: bool = False,
processing_mode: str = "basic",
) -> None:
@ -49,7 +47,6 @@ class CeleryTaskDispatcher:
filename=filename,
search_space_id=search_space_id,
user_id=user_id,
should_summarize=should_summarize,
use_vision_llm=use_vision_llm,
processing_mode=processing_mode,
)

View file

@ -9,7 +9,6 @@ from sqlalchemy.orm import selectinload
from app.celery_app import celery_app
from app.db import Document
from app.indexing_pipeline.adapters.file_upload_adapter import UploadDocumentAdapter
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService
from app.tasks.celery_tasks import get_celery_session_maker, run_async_celery_task
@ -68,12 +67,8 @@ async def _reindex_document(document_id: int, user_id: str):
logger.info(f"Reindexing document {document_id} ({document.title})")
user_llm = await get_user_long_context_llm(
session, user_id, document.search_space_id
)
adapter = UploadDocumentAdapter(session)
await adapter.reindex(document=document, llm=user_llm)
await adapter.reindex(document=document)
await task_logger.log_task_success(
log_entry,

View file

@ -765,7 +765,6 @@ def process_file_upload_with_document_task(
filename: str,
search_space_id: int,
user_id: str,
should_summarize: bool = False,
use_vision_llm: bool = False,
processing_mode: str = "basic",
):
@ -782,7 +781,6 @@ def process_file_upload_with_document_task(
filename: Original filename
search_space_id: ID of the search space
user_id: ID of the user
should_summarize: Whether to generate an LLM summary
"""
import traceback
@ -814,7 +812,6 @@ def process_file_upload_with_document_task(
filename,
search_space_id,
user_id,
should_summarize=should_summarize,
use_vision_llm=use_vision_llm,
processing_mode=processing_mode,
)
@ -850,7 +847,6 @@ async def _process_file_with_document(
filename: str,
search_space_id: int,
user_id: str,
should_summarize: bool = False,
use_vision_llm: bool = False,
processing_mode: str = "basic",
):
@ -954,7 +950,6 @@ async def _process_file_with_document(
task_logger=task_logger,
log_entry=log_entry,
notification=notification,
should_summarize=should_summarize,
use_vision_llm=use_vision_llm,
processing_mode=processing_mode,
)
@ -1258,7 +1253,6 @@ def index_local_folder_task(
exclude_patterns: list[str] | None = None,
file_extensions: list[str] | None = None,
root_folder_id: int | None = None,
enable_summary: bool = False,
target_file_paths: list[str] | None = None,
):
"""Celery task to index a local folder. Config is passed directly — no connector row."""
@ -1271,7 +1265,6 @@ def index_local_folder_task(
exclude_patterns=exclude_patterns,
file_extensions=file_extensions,
root_folder_id=root_folder_id,
enable_summary=enable_summary,
target_file_paths=target_file_paths,
)
)
@ -1285,7 +1278,6 @@ async def _index_local_folder_async(
exclude_patterns: list[str] | None = None,
file_extensions: list[str] | None = None,
root_folder_id: int | None = None,
enable_summary: bool = False,
target_file_paths: list[str] | None = None,
):
"""Run local folder indexing with notification + heartbeat."""
@ -1343,8 +1335,7 @@ async def _index_local_folder_async(
exclude_patterns=exclude_patterns,
file_extensions=file_extensions,
root_folder_id=root_folder_id,
enable_summary=enable_summary,
target_file_paths=target_file_paths,
target_file_paths=target_file_paths,
on_heartbeat_callback=_heartbeat_progress
if (is_batch or is_full_scan)
else None,
@ -1400,7 +1391,6 @@ def index_uploaded_folder_files_task(
user_id: str,
folder_name: str,
root_folder_id: int,
enable_summary: bool,
file_mappings: list[dict],
use_vision_llm: bool = False,
processing_mode: str = "basic",
@ -1412,7 +1402,6 @@ def index_uploaded_folder_files_task(
user_id=user_id,
folder_name=folder_name,
root_folder_id=root_folder_id,
enable_summary=enable_summary,
file_mappings=file_mappings,
use_vision_llm=use_vision_llm,
processing_mode=processing_mode,
@ -1425,7 +1414,6 @@ async def _index_uploaded_folder_files_async(
user_id: str,
folder_name: str,
root_folder_id: int,
enable_summary: bool,
file_mappings: list[dict],
use_vision_llm: bool = False,
processing_mode: str = "basic",
@ -1475,8 +1463,7 @@ async def _index_uploaded_folder_files_async(
user_id=user_id,
folder_name=folder_name,
root_folder_id=root_folder_id,
enable_summary=enable_summary,
file_mappings=file_mappings,
file_mappings=file_mappings,
on_heartbeat_callback=_heartbeat_progress,
use_vision_llm=use_vision_llm,
processing_mode=processing_mode,
@ -1563,12 +1550,10 @@ async def _ai_sort_search_space_async(search_space_id: int, user_id: str):
t_start = time.perf_counter()
try:
from app.services.ai_file_sort_service import ai_sort_all_documents
from app.services.llm_service import get_document_summary_llm
from app.services.llm_service import get_agent_llm
async with get_celery_session_maker()() as session:
llm = await get_document_summary_llm(
session, search_space_id, disable_streaming=True
)
llm = await get_agent_llm(session, search_space_id, disable_streaming=True)
if llm is None:
logger.warning(
"No LLM configured for search_space=%d, skipping AI sort",
@ -1604,7 +1589,7 @@ def ai_sort_document_task(self, search_space_id: int, user_id: str, document_id:
async def _ai_sort_document_async(search_space_id: int, user_id: str, document_id: int):
from app.db import Document
from app.services.ai_file_sort_service import ai_sort_document
from app.services.llm_service import get_document_summary_llm
from app.services.llm_service import get_agent_llm
async with get_celery_session_maker()() as session:
document = await session.get(Document, document_id)
@ -1612,9 +1597,7 @@ async def _ai_sort_document_async(search_space_id: int, user_id: str, document_i
logger.warning("Document %d not found, skipping AI sort", document_id)
return
llm = await get_document_summary_llm(
session, search_space_id, disable_streaming=True
)
llm = await get_agent_llm(session, search_space_id, disable_streaming=True)
if llm is None:
logger.warning(
"No LLM for search_space=%d, skipping AI sort of doc=%d",

View file

@ -62,6 +62,7 @@ async def build_new_chat_input_state(
user_image_data_urls: list[str] | None,
mentioned_document_ids: list[int] | None,
mentioned_folder_ids: list[int] | None,
mentioned_connectors: list[dict[str, Any]] | None,
mentioned_documents: list[dict[str, Any]] | None,
needs_history_bootstrap: bool,
thread_visibility: ChatVisibility,
@ -110,6 +111,7 @@ async def build_new_chat_input_state(
final_query = _render_query_with_context(
agent_user_query=agent_user_query,
mentioned_connectors=mentioned_connectors,
recent_reports=recent_reports,
)
@ -196,11 +198,16 @@ async def _resolve_mentions_for_query(
def _render_query_with_context(
*,
agent_user_query: str,
mentioned_connectors: list[dict[str, Any]] | None,
recent_reports: list[Report],
) -> str:
"""Prepend recent-reports XML block to the user query."""
"""Prepend connector/report XML context blocks to the user query."""
context_parts: list[str] = []
connector_context = _render_mentioned_connectors(mentioned_connectors)
if connector_context:
context_parts.append(connector_context)
if recent_reports:
report_lines: list[str] = []
for r in recent_reports:
@ -225,3 +232,40 @@ def _render_query_with_context(
return f"{context}\n\n<user_query>{agent_user_query}</user_query>"
return agent_user_query
def _render_mentioned_connectors(
mentioned_connectors: list[dict[str, Any]] | None,
) -> str | None:
"""Render selected connector account metadata for connector-backed tools."""
if not mentioned_connectors:
return None
connector_lines: list[str] = []
for connector in mentioned_connectors:
if not isinstance(connector, dict):
continue
connector_id = connector.get("id")
connector_type = connector.get("connector_type") or connector.get(
"document_type"
)
account_name = connector.get("account_name") or connector.get("title")
if connector_id is None or connector_type is None:
continue
connector_lines.append(
f' - connector_id={connector_id}, connector_type="{connector_type}", '
f'account_name="{account_name or ""}"'
)
if not connector_lines:
return None
return (
"<mentioned_connectors>\n"
"The user selected these exact connector accounts with @. "
"These entries are selection metadata, not retrieved connector content. "
"When a connector-backed tool needs an account, use the matching "
"connector_id from this list if the tool supports connector_id:\n"
+ "\n".join(connector_lines)
+ "\n</mentioned_connectors>"
)

View file

@ -124,6 +124,8 @@ async def stream_new_chat(
llm_config_id: int = -1,
mentioned_document_ids: list[int] | None = None,
mentioned_folder_ids: list[int] | None = None,
mentioned_connector_ids: list[int] | None = None,
mentioned_connectors: list[dict[str, Any]] | None = None,
mentioned_documents: list[dict[str, Any]] | None = None,
checkpoint_id: str | None = None,
needs_history_bootstrap: bool = False,
@ -435,6 +437,7 @@ async def stream_new_chat(
user_image_data_urls=user_image_data_urls,
mentioned_document_ids=mentioned_document_ids,
mentioned_folder_ids=mentioned_folder_ids,
mentioned_connectors=mentioned_connectors,
mentioned_documents=mentioned_documents,
needs_history_bootstrap=needs_history_bootstrap,
thread_visibility=visibility,
@ -588,6 +591,8 @@ async def stream_new_chat(
mentioned_document_ids=mentioned_document_ids,
accepted_folder_ids=accepted_folder_ids,
mentioned_folder_ids=mentioned_folder_ids,
mentioned_connector_ids=mentioned_connector_ids,
mentioned_connectors=mentioned_connectors,
request_id=request_id,
turn_id=stream_result.turn_id,
)

View file

@ -8,6 +8,8 @@ mention lists / request ids / turn ids without rebuilding the graph.
from __future__ import annotations
from typing import Any
from app.agents.new_chat.context import SurfSenseContextSchema
@ -17,6 +19,8 @@ def build_new_chat_runtime_context(
mentioned_document_ids: list[int] | None,
accepted_folder_ids: list[int],
mentioned_folder_ids: list[int] | None,
mentioned_connector_ids: list[int] | None,
mentioned_connectors: list[dict[str, Any]] | None,
request_id: str | None,
turn_id: str,
) -> SurfSenseContextSchema:
@ -31,6 +35,8 @@ def build_new_chat_runtime_context(
search_space_id=search_space_id,
mentioned_document_ids=list(mentioned_document_ids or []),
mentioned_folder_ids=list(accepted_folder_ids or mentioned_folder_ids or []),
mentioned_connector_ids=list(mentioned_connector_ids or []),
mentioned_connectors=list(mentioned_connectors or []),
request_id=request_id,
turn_id=turn_id,
)

View file

@ -14,13 +14,11 @@ from sqlalchemy.ext.asyncio import AsyncSession
from app.connectors.airtable_history import AirtableHistoryConnector
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import (
create_document_chunks,
embed_text,
generate_content_hash,
generate_document_summary,
generate_unique_identifier_hash,
)
@ -394,29 +392,10 @@ async def index_airtable_records(
document.status = DocumentStatus.processing()
await session.commit()
# Heavy processing (LLM, embeddings, chunks)
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
# Heavy processing (embeddings, chunks)
if user_llm and connector.enable_summary:
document_metadata_for_summary = {
"record_id": item["record_id"],
"created_time": item["record"].get("CREATED_TIME()", ""),
"document_type": "Airtable Record",
"connector_type": "Airtable",
}
(
summary_content,
summary_embedding,
) = await generate_document_summary(
item["markdown_content"],
user_llm,
document_metadata_for_summary,
)
else:
summary_content = f"Airtable Record: {item['record_id']}\n\n{item['markdown_content']}"
summary_embedding = embed_text(summary_content)
summary_content = f"Airtable Record: {item['record_id']}\n\n{item['markdown_content']}"
summary_embedding = embed_text(summary_content)
chunks = await create_document_chunks(item["markdown_content"])

View file

@ -15,13 +15,11 @@ from sqlalchemy.ext.asyncio import AsyncSession
from app.connectors.bookstack_connector import BookStackConnector
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import (
create_document_chunks,
embed_text,
generate_content_hash,
generate_document_summary,
generate_unique_identifier_hash,
)
@ -384,10 +382,7 @@ async def index_bookstack_pages(
document.status = DocumentStatus.processing()
await session.commit()
# Heavy processing (LLM, embeddings, chunks)
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
# Heavy processing (embeddings, chunks)
# Build document metadata
doc_metadata = {
@ -403,23 +398,8 @@ async def index_bookstack_pages(
"connector_id": connector_id,
}
if user_llm and connector.enable_summary:
summary_metadata = {
"page_name": item["page_name"],
"page_id": item["page_id"],
"book_id": item["book_id"],
"document_type": "BookStack Page",
"connector_type": "BookStack",
}
(
summary_content,
summary_embedding,
) = await generate_document_summary(
item["full_content"], user_llm, summary_metadata
)
else:
summary_content = f"BookStack Page: {item['page_name']}\n\nBook ID: {item['book_id']}\n\n{item['full_content']}"
summary_embedding = embed_text(summary_content)
summary_content = f"BookStack Page: {item['page_name']}\n\nBook ID: {item['book_id']}\n\n{item['full_content']}"
summary_embedding = embed_text(summary_content)
# Process chunks - using the full page content
chunks = await create_document_chunks(item["full_content"])

View file

@ -16,13 +16,11 @@ from sqlalchemy.ext.asyncio import AsyncSession
from app.connectors.clickup_history import ClickUpHistoryConnector
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import (
create_document_chunks,
embed_text,
generate_content_hash,
generate_document_summary,
generate_unique_identifier_hash,
)
@ -393,32 +391,10 @@ async def index_clickup_tasks(
document.status = DocumentStatus.processing()
await session.commit()
# Heavy processing (LLM, embeddings, chunks)
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
# Heavy processing (embeddings, chunks)
if user_llm and connector.enable_summary:
document_metadata_for_summary = {
"task_id": item["task_id"],
"task_name": item["task_name"],
"task_status": item["task_status"],
"task_priority": item["task_priority"],
"task_list": item["task_list_name"],
"task_space": item["task_space_name"],
"assignees": len(item["task_assignees"]),
"document_type": "ClickUp Task",
"connector_type": "ClickUp",
}
(
summary_content,
summary_embedding,
) = await generate_document_summary(
item["task_content"], user_llm, document_metadata_for_summary
)
else:
summary_content = item["task_content"]
summary_embedding = embed_text(item["task_content"])
summary_content = item["task_content"]
summary_embedding = embed_text(item["task_content"])
chunks = await create_document_chunks(item["task_content"])

View file

@ -14,7 +14,6 @@ from app.indexing_pipeline.indexing_pipeline_service import (
IndexingPipelineService,
PlaceholderInfo,
)
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService
from .base import (
@ -36,7 +35,6 @@ def _build_connector_doc(
connector_id: int,
search_space_id: int,
user_id: str,
enable_summary: bool,
) -> ConnectorDocument:
"""Map a raw Confluence page dict to a ConnectorDocument."""
page_id = page.get("id", "")
@ -54,10 +52,6 @@ def _build_connector_doc(
"connector_type": "Confluence",
}
fallback_summary = (
f"Confluence Page: {page_title}\n\nSpace ID: {space_id}\n\n{full_content}"
)
return ConnectorDocument(
title=page_title,
source_markdown=full_content,
@ -66,8 +60,6 @@ def _build_connector_doc(
search_space_id=search_space_id,
connector_id=connector_id,
created_by_id=user_id,
should_summarize=enable_summary,
fallback_summary=fallback_summary,
metadata=metadata,
)
@ -268,8 +260,7 @@ async def index_confluence_pages(
connector_id=connector_id,
search_space_id=search_space_id,
user_id=user_id,
enable_summary=connector.enable_summary,
)
)
with session.no_autoflush:
duplicate_by_content = await check_duplicate_document_by_hash(
@ -297,12 +288,8 @@ async def index_confluence_pages(
await pipeline.migrate_legacy_docs(connector_docs)
async def _get_llm(s: AsyncSession):
return await get_user_long_context_llm(s, user_id, search_space_id)
_, documents_indexed, documents_failed = await pipeline.index_batch_parallel(
connector_docs,
_get_llm,
max_concurrency=3,
on_heartbeat=on_heartbeat_callback,
heartbeat_interval=HEARTBEAT_INTERVAL_SECONDS,

View file

@ -27,7 +27,6 @@ from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnector
from app.indexing_pipeline.connector_document import ConnectorDocument
from app.indexing_pipeline.document_hashing import compute_identifier_hash
from app.indexing_pipeline.indexing_pipeline_service import IndexingPipelineService
from app.services.llm_service import get_user_long_context_llm
from app.services.page_limit_service import PageLimitService
from app.services.task_logging_service import TaskLoggingService
from app.tasks.connector_indexers.base import (
@ -126,7 +125,6 @@ def _build_connector_doc(
connector_id: int,
search_space_id: int,
user_id: str,
enable_summary: bool,
) -> ConnectorDocument:
file_id = file.get("id", "")
file_name = file.get("name", "Unknown")
@ -138,8 +136,6 @@ def _build_connector_doc(
"connector_type": "Dropbox",
}
fallback_summary = f"File: {file_name}\n\n{markdown[:4000]}"
return ConnectorDocument(
title=file_name,
source_markdown=markdown,
@ -148,8 +144,6 @@ def _build_connector_doc(
search_space_id=search_space_id,
connector_id=connector_id,
created_by_id=user_id,
should_summarize=enable_summary,
fallback_summary=fallback_summary,
metadata=metadata,
)
@ -161,7 +155,6 @@ async def _download_files_parallel(
connector_id: int,
search_space_id: int,
user_id: str,
enable_summary: bool,
max_concurrency: int = 3,
on_heartbeat: HeartbeatCallbackType | None = None,
vision_llm=None,
@ -191,7 +184,6 @@ async def _download_files_parallel(
connector_id=connector_id,
search_space_id=search_space_id,
user_id=user_id,
enable_summary=enable_summary,
)
async with hb_lock:
completed_count += 1
@ -223,7 +215,6 @@ async def _download_and_index(
connector_id: int,
search_space_id: int,
user_id: str,
enable_summary: bool,
on_heartbeat: HeartbeatCallbackType | None = None,
vision_llm=None,
) -> tuple[int, int]:
@ -234,7 +225,6 @@ async def _download_and_index(
connector_id=connector_id,
search_space_id=search_space_id,
user_id=user_id,
enable_summary=enable_summary,
on_heartbeat=on_heartbeat,
vision_llm=vision_llm,
)
@ -243,13 +233,8 @@ async def _download_and_index(
batch_failed = 0
if connector_docs:
pipeline = IndexingPipelineService(session)
async def _get_llm(s):
return await get_user_long_context_llm(s, user_id, search_space_id)
_, batch_indexed, batch_failed = await pipeline.index_batch_parallel(
connector_docs,
_get_llm,
max_concurrency=3,
on_heartbeat=on_heartbeat,
)
@ -289,7 +274,6 @@ async def _index_with_delta_sync(
log_entry: object,
max_files: int,
on_heartbeat_callback: HeartbeatCallbackType | None = None,
enable_summary: bool = True,
vision_llm=None,
) -> tuple[int, int, int, str]:
"""Delta sync using Dropbox cursor-based change tracking.
@ -361,7 +345,6 @@ async def _index_with_delta_sync(
connector_id=connector_id,
search_space_id=search_space_id,
user_id=user_id,
enable_summary=enable_summary,
on_heartbeat=on_heartbeat_callback,
vision_llm=vision_llm,
)
@ -388,7 +371,6 @@ async def _index_full_scan(
include_subfolders: bool = True,
incremental_sync: bool = True,
on_heartbeat_callback: HeartbeatCallbackType | None = None,
enable_summary: bool = True,
vision_llm=None,
) -> tuple[int, int, int]:
"""Full scan indexing of a folder.
@ -473,7 +455,6 @@ async def _index_full_scan(
connector_id=connector_id,
search_space_id=search_space_id,
user_id=user_id,
enable_summary=enable_summary,
on_heartbeat=on_heartbeat_callback,
vision_llm=vision_llm,
)
@ -502,7 +483,6 @@ async def _index_selected_files(
connector_id: int,
search_space_id: int,
user_id: str,
enable_summary: bool,
incremental_sync: bool = True,
on_heartbeat: HeartbeatCallbackType | None = None,
vision_llm=None,
@ -563,7 +543,6 @@ async def _index_selected_files(
connector_id=connector_id,
search_space_id=search_space_id,
user_id=user_id,
enable_summary=enable_summary,
on_heartbeat=on_heartbeat,
vision_llm=vision_llm,
)
@ -629,7 +608,6 @@ async def index_dropbox_files(
)
return 0, 0, error_msg, 0
connector_enable_summary = getattr(connector, "enable_summary", True)
connector_enable_vision_llm = getattr(connector, "enable_vision_llm", False)
vision_llm = None
if connector_enable_vision_llm:
@ -664,7 +642,6 @@ async def index_dropbox_files(
connector_id=connector_id,
search_space_id=search_space_id,
user_id=user_id,
enable_summary=connector_enable_summary,
incremental_sync=incremental_sync,
vision_llm=vision_llm,
)
@ -700,7 +677,6 @@ async def index_dropbox_files(
task_logger,
log_entry,
max_files,
enable_summary=connector_enable_summary,
vision_llm=vision_llm,
)
folder_cursors[folder_path] = new_cursor
@ -720,7 +696,6 @@ async def index_dropbox_files(
max_files,
include_subfolders,
incremental_sync=incremental_sync,
enable_summary=connector_enable_summary,
vision_llm=vision_llm,
)
total_unsupported += unsup

View file

@ -18,13 +18,11 @@ from sqlalchemy.ext.asyncio import AsyncSession
from app.connectors.github_connector import GitHubConnector
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import (
create_document_chunks,
embed_text,
generate_content_hash,
generate_document_summary,
generate_unique_identifier_hash,
)
@ -351,42 +349,14 @@ async def index_github_repos(
document.status = DocumentStatus.processing()
await session.commit()
# Heavy processing (LLM, embeddings, chunks)
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
# Heavy processing (embeddings, chunks)
summary_text = (
f"# GitHub Repository: {repo_full_name}\n\n"
f"## Summary\n{digest.summary}\n\n"
f"## File Structure\n{digest.tree}"
)
document_metadata_for_summary = {
"repository": repo_full_name,
"document_type": "GitHub Repository",
"connector_type": "GitHub",
"ingestion_method": "gitingest",
"file_tree": digest.tree[:2000]
if len(digest.tree) > 2000
else digest.tree,
"estimated_tokens": digest.estimated_tokens,
}
if user_llm and connector.enable_summary:
# Prepare content for summarization
summary_content = digest.full_digest
if len(summary_content) > MAX_DIGEST_CHARS:
summary_content = (
f"# Repository: {repo_full_name}\n\n"
f"## File Structure\n\n{digest.tree}\n\n"
f"## File Contents (truncated)\n\n{digest.content[: MAX_DIGEST_CHARS - len(digest.tree) - 200]}..."
)
summary_text, summary_embedding = await generate_document_summary(
summary_content, user_llm, document_metadata_for_summary
)
else:
summary_text = (
f"# GitHub Repository: {repo_full_name}\n\n"
f"## Summary\n{digest.summary}\n\n"
f"## File Structure\n{digest.tree}"
)
summary_embedding = embed_text(summary_text)
summary_embedding = embed_text(summary_text)
# Chunk the full digest content for granular search
try:

View file

@ -2,7 +2,7 @@
Google Calendar connector indexer.
Uses the shared IndexingPipelineService for document deduplication,
summarization, chunking, and embedding.
chunking, and embedding.
"""
from collections.abc import Awaitable, Callable
@ -21,7 +21,6 @@ from app.indexing_pipeline.indexing_pipeline_service import (
PlaceholderInfo,
)
from app.services.composio_service import ComposioService
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService
from app.utils.google_credentials import COMPOSIO_GOOGLE_CONNECTOR_TYPES
@ -53,7 +52,6 @@ def _build_connector_doc(
connector_id: int,
search_space_id: int,
user_id: str,
enable_summary: bool,
) -> ConnectorDocument:
"""Map a raw Google Calendar API event dict to a ConnectorDocument."""
event_id = event.get("id", "")
@ -78,8 +76,6 @@ def _build_connector_doc(
"connector_type": "Google Calendar",
}
fallback_summary = f"Google Calendar Event: {event_summary}\n\n{event_markdown}"
return ConnectorDocument(
title=event_summary,
source_markdown=event_markdown,
@ -88,8 +84,6 @@ def _build_connector_doc(
search_space_id=search_space_id,
connector_id=connector_id,
created_by_id=user_id,
should_summarize=enable_summary,
fallback_summary=fallback_summary,
metadata=metadata,
)
@ -420,8 +414,7 @@ async def index_google_calendar_events(
connector_id=connector_id,
search_space_id=search_space_id,
user_id=user_id,
enable_summary=connector.enable_summary,
)
)
with session.no_autoflush:
duplicate = await check_duplicate_document_by_hash(
@ -448,13 +441,8 @@ async def index_google_calendar_events(
# ── Pipeline: migrate legacy docs + parallel index ─────────────
await pipeline.migrate_legacy_docs(connector_docs)
async def _get_llm(s):
return await get_user_long_context_llm(s, user_id, search_space_id)
_, documents_indexed, documents_failed = await pipeline.index_batch_parallel(
connector_docs,
_get_llm,
max_concurrency=3,
on_heartbeat=on_heartbeat_callback,
heartbeat_interval=HEARTBEAT_INTERVAL_SECONDS,

View file

@ -40,7 +40,6 @@ from app.indexing_pipeline.indexing_pipeline_service import (
PlaceholderInfo,
)
from app.services.composio_service import ComposioService
from app.services.llm_service import get_user_long_context_llm
from app.services.page_limit_service import PageLimitService
from app.services.task_logging_service import TaskLoggingService
from app.tasks.connector_indexers.base import (
@ -381,7 +380,6 @@ def _build_connector_doc(
connector_id: int,
search_space_id: int,
user_id: str,
enable_summary: bool,
) -> ConnectorDocument:
"""Build a ConnectorDocument from Drive file metadata + extracted markdown."""
file_id = file.get("id", "")
@ -394,8 +392,6 @@ def _build_connector_doc(
"connector_type": "Google Drive",
}
fallback_summary = f"File: {file_name}\n\n{markdown[:4000]}"
return ConnectorDocument(
title=file_name,
source_markdown=markdown,
@ -404,8 +400,6 @@ def _build_connector_doc(
search_space_id=search_space_id,
connector_id=connector_id,
created_by_id=user_id,
should_summarize=enable_summary,
fallback_summary=fallback_summary,
metadata=metadata,
)
@ -461,7 +455,6 @@ async def _download_files_parallel(
connector_id: int,
search_space_id: int,
user_id: str,
enable_summary: bool,
max_concurrency: int = 3,
on_heartbeat: HeartbeatCallbackType | None = None,
vision_llm=None,
@ -494,7 +487,6 @@ async def _download_files_parallel(
connector_id=connector_id,
search_space_id=search_space_id,
user_id=user_id,
enable_summary=enable_summary,
)
async with hb_lock:
completed_count += 1
@ -525,7 +517,6 @@ async def _process_single_file(
connector_id: int,
search_space_id: int,
user_id: str,
enable_summary: bool = True,
vision_llm=None,
) -> tuple[int, int, int]:
"""Download, extract, and index a single Drive file via the pipeline.
@ -561,8 +552,7 @@ async def _process_single_file(
connector_id=connector_id,
search_space_id=search_space_id,
user_id=user_id,
enable_summary=enable_summary,
)
)
pipeline = IndexingPipelineService(session)
documents = await pipeline.prepare_for_indexing([doc])
@ -578,10 +568,7 @@ async def _process_single_file(
connector_doc = doc_map.get(document.unique_identifier_hash)
if not connector_doc:
continue
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
await pipeline.index(document, connector_doc, user_llm)
await pipeline.index(document, connector_doc)
await page_limit_service.update_page_usage(
user_id, estimated_pages, allow_exceed=True
@ -636,7 +623,6 @@ async def _download_and_index(
connector_id: int,
search_space_id: int,
user_id: str,
enable_summary: bool,
on_heartbeat: HeartbeatCallbackType | None = None,
vision_llm=None,
) -> tuple[int, int]:
@ -650,7 +636,6 @@ async def _download_and_index(
connector_id=connector_id,
search_space_id=search_space_id,
user_id=user_id,
enable_summary=enable_summary,
on_heartbeat=on_heartbeat,
vision_llm=vision_llm,
)
@ -659,13 +644,8 @@ async def _download_and_index(
batch_failed = 0
if connector_docs:
pipeline = IndexingPipelineService(session)
async def _get_llm(s):
return await get_user_long_context_llm(s, user_id, search_space_id)
_, batch_indexed, batch_failed = await pipeline.index_batch_parallel(
connector_docs,
_get_llm,
max_concurrency=3,
on_heartbeat=on_heartbeat,
)
@ -681,7 +661,6 @@ async def _index_selected_files(
connector_id: int,
search_space_id: int,
user_id: str,
enable_summary: bool,
on_heartbeat: HeartbeatCallbackType | None = None,
vision_llm=None,
) -> tuple[int, int, int, list[str]]:
@ -746,7 +725,6 @@ async def _index_selected_files(
connector_id=connector_id,
search_space_id=search_space_id,
user_id=user_id,
enable_summary=enable_summary,
on_heartbeat=on_heartbeat,
vision_llm=vision_llm,
)
@ -781,7 +759,6 @@ async def _index_full_scan(
max_files: int,
include_subfolders: bool = False,
on_heartbeat_callback: HeartbeatCallbackType | None = None,
enable_summary: bool = True,
vision_llm=None,
) -> tuple[int, int, int]:
"""Full scan indexing of a folder.
@ -911,7 +888,6 @@ async def _index_full_scan(
connector_id=connector_id,
search_space_id=search_space_id,
user_id=user_id,
enable_summary=enable_summary,
on_heartbeat=on_heartbeat_callback,
vision_llm=vision_llm,
)
@ -946,7 +922,6 @@ async def _index_with_delta_sync(
max_files: int,
include_subfolders: bool = False,
on_heartbeat_callback: HeartbeatCallbackType | None = None,
enable_summary: bool = True,
vision_llm=None,
) -> tuple[int, int, int]:
"""Delta sync using change tracking.
@ -1054,7 +1029,6 @@ async def _index_with_delta_sync(
connector_id=connector_id,
search_space_id=search_space_id,
user_id=user_id,
enable_summary=enable_summary,
on_heartbeat=on_heartbeat_callback,
vision_llm=vision_llm,
)
@ -1142,7 +1116,6 @@ async def index_google_drive_files(
)
return 0, 0, client_error, 0
connector_enable_summary = getattr(connector, "enable_summary", True)
connector_enable_vision_llm = getattr(connector, "enable_vision_llm", False)
vision_llm = None
if connector_enable_vision_llm:
@ -1189,7 +1162,6 @@ async def index_google_drive_files(
max_files,
include_subfolders,
on_heartbeat_callback,
connector_enable_summary,
vision_llm=vision_llm,
)
documents_unsupported += du
@ -1208,7 +1180,6 @@ async def index_google_drive_files(
max_files,
include_subfolders,
on_heartbeat_callback,
connector_enable_summary,
vision_llm=vision_llm,
)
documents_indexed += ri
@ -1234,7 +1205,6 @@ async def index_google_drive_files(
max_files,
include_subfolders,
on_heartbeat_callback,
connector_enable_summary,
vision_llm=vision_llm,
)
@ -1346,7 +1316,6 @@ async def index_google_drive_single_file(
)
return 0, client_error
connector_enable_summary = getattr(connector, "enable_summary", True)
connector_enable_vision_llm = getattr(connector, "enable_vision_llm", False)
vision_llm = None
if connector_enable_vision_llm:
@ -1370,7 +1339,6 @@ async def index_google_drive_single_file(
connector_id,
search_space_id,
user_id,
connector_enable_summary,
vision_llm=vision_llm,
)
await session.commit()
@ -1467,7 +1435,6 @@ async def index_google_drive_selected_files(
)
return 0, 0, [error_msg]
connector_enable_summary = getattr(connector, "enable_summary", True)
connector_enable_vision_llm = getattr(connector, "enable_vision_llm", False)
vision_llm = None
if connector_enable_vision_llm:
@ -1481,7 +1448,6 @@ async def index_google_drive_selected_files(
connector_id=connector_id,
search_space_id=search_space_id,
user_id=user_id,
enable_summary=connector_enable_summary,
on_heartbeat=on_heartbeat_callback,
vision_llm=vision_llm,
)

View file

@ -2,7 +2,7 @@
Google Gmail connector indexer.
Uses the shared IndexingPipelineService for document deduplication,
summarization, chunking, and embedding.
chunking, and embedding.
"""
from collections.abc import Awaitable, Callable
@ -21,7 +21,6 @@ from app.indexing_pipeline.indexing_pipeline_service import (
PlaceholderInfo,
)
from app.services.composio_service import ComposioService
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService
from app.utils.google_credentials import COMPOSIO_GOOGLE_CONNECTOR_TYPES
@ -105,7 +104,6 @@ def _build_connector_doc(
connector_id: int,
search_space_id: int,
user_id: str,
enable_summary: bool,
) -> ConnectorDocument:
"""Map a raw Gmail API message dict to a ConnectorDocument."""
message_id = message.get("id", "")
@ -138,12 +136,6 @@ def _build_connector_doc(
"connector_type": "Google Gmail",
}
fallback_summary = (
f"Google Gmail Message: {subject}\n\n"
f"From: {sender}\nDate: {date_str}\n\n"
f"{markdown_content}"
)
return ConnectorDocument(
title=subject,
source_markdown=markdown_content,
@ -152,8 +144,6 @@ def _build_connector_doc(
search_space_id=search_space_id,
connector_id=connector_id,
created_by_id=user_id,
should_summarize=enable_summary,
fallback_summary=fallback_summary,
metadata=metadata,
)
@ -454,8 +444,7 @@ async def index_google_gmail_messages(
connector_id=connector_id,
search_space_id=search_space_id,
user_id=user_id,
enable_summary=connector.enable_summary,
)
)
with session.no_autoflush:
duplicate = await check_duplicate_document_by_hash(
@ -483,13 +472,8 @@ async def index_google_gmail_messages(
# ── Pipeline: migrate legacy docs + parallel index ─────────────
await pipeline.migrate_legacy_docs(connector_docs)
async def _get_llm(s):
return await get_user_long_context_llm(s, user_id, search_space_id)
_, documents_indexed, documents_failed = await pipeline.index_batch_parallel(
connector_docs,
_get_llm,
max_concurrency=3,
on_heartbeat=on_heartbeat_callback,
heartbeat_interval=HEARTBEAT_INTERVAL_SECONDS,

View file

@ -2,7 +2,7 @@
Linear connector indexer.
Uses the shared IndexingPipelineService for document deduplication,
summarization, chunking, and embedding with bounded parallel indexing.
chunking, and embedding with bounded parallel indexing.
"""
from collections.abc import Awaitable, Callable
@ -18,7 +18,6 @@ from app.indexing_pipeline.indexing_pipeline_service import (
IndexingPipelineService,
PlaceholderInfo,
)
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService
from .base import (
@ -41,7 +40,6 @@ def _build_connector_doc(
connector_id: int,
search_space_id: int,
user_id: str,
enable_summary: bool,
) -> ConnectorDocument:
"""Map a raw Linear issue dict to a ConnectorDocument."""
issue_id = issue.get("id", "")
@ -63,11 +61,6 @@ def _build_connector_doc(
"connector_type": "Linear",
}
fallback_summary = (
f"Linear Issue {issue_identifier}: {issue_title}\n\n"
f"Status: {state}\n\n{issue_content}"
)
return ConnectorDocument(
title=f"{issue_identifier}: {issue_title}",
source_markdown=issue_content,
@ -76,8 +69,6 @@ def _build_connector_doc(
search_space_id=search_space_id,
connector_id=connector_id,
created_by_id=user_id,
should_summarize=enable_summary,
fallback_summary=fallback_summary,
metadata=metadata,
)
@ -277,8 +268,7 @@ async def index_linear_issues(
connector_id=connector_id,
search_space_id=search_space_id,
user_id=user_id,
enable_summary=connector.enable_summary,
)
)
with session.no_autoflush:
duplicate = await check_duplicate_document_by_hash(
@ -306,13 +296,8 @@ async def index_linear_issues(
# ── Pipeline: migrate legacy docs + parallel index ────────────
await pipeline.migrate_legacy_docs(connector_docs)
async def _get_llm(s):
return await get_user_long_context_llm(s, user_id, search_space_id)
_, documents_indexed, documents_failed = await pipeline.index_batch_parallel(
connector_docs,
_get_llm,
max_concurrency=3,
on_heartbeat=on_heartbeat_callback,
heartbeat_interval=HEARTBEAT_INTERVAL_SECONDS,

View file

@ -33,7 +33,6 @@ from app.db import (
from app.indexing_pipeline.connector_document import ConnectorDocument
from app.indexing_pipeline.document_hashing import compute_identifier_hash
from app.indexing_pipeline.indexing_pipeline_service import IndexingPipelineService
from app.services.llm_service import get_user_long_context_llm
from app.services.page_limit_service import PageLimitExceededError, PageLimitService
from app.services.task_logging_service import TaskLoggingService
from app.tasks.celery_tasks import get_celery_session_maker
@ -478,7 +477,6 @@ def _build_connector_doc(
*,
search_space_id: int,
user_id: str,
enable_summary: bool,
) -> ConnectorDocument:
"""Build a ConnectorDocument from a local file's extracted content."""
unique_id = f"{folder_name}:{relative_path}"
@ -488,7 +486,6 @@ def _build_connector_doc(
"document_type": "Local Folder File",
"connector_type": "Local Folder",
}
fallback_summary = f"File: {title}\n\n{content[:4000]}"
return ConnectorDocument(
title=title,
@ -498,8 +495,6 @@ def _build_connector_doc(
search_space_id=search_space_id,
connector_id=None,
created_by_id=user_id,
should_summarize=enable_summary,
fallback_summary=fallback_summary,
metadata=metadata,
)
@ -513,7 +508,6 @@ async def index_local_folder(
exclude_patterns: list[str] | None = None,
file_extensions: list[str] | None = None,
root_folder_id: int | None = None,
enable_summary: bool = False,
target_file_paths: list[str] | None = None,
on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, int, int | None, str | None]:
@ -574,8 +568,7 @@ async def index_local_folder(
folder_path=folder_path,
folder_name=folder_name,
target_file_path=target_file_paths[0],
enable_summary=enable_summary,
root_folder_id=root_folder_id,
root_folder_id=root_folder_id,
task_logger=task_logger,
log_entry=log_entry,
)
@ -587,8 +580,7 @@ async def index_local_folder(
folder_path=folder_path,
folder_name=folder_name,
target_file_paths=target_file_paths,
enable_summary=enable_summary,
root_folder_id=root_folder_id,
root_folder_id=root_folder_id,
on_progress_callback=on_heartbeat_callback,
)
if err:
@ -774,8 +766,7 @@ async def index_local_folder(
folder_name=folder_name,
search_space_id=search_space_id,
user_id=user_id,
enable_summary=enable_summary,
)
)
connector_docs.append(doc)
file_meta_map[unique_identifier] = {
"relative_path": relative_path,
@ -845,15 +836,13 @@ async def index_local_folder(
doc_map = {compute_unique_identifier_hash(cd): cd for cd in connector_docs}
documents = await pipeline.prepare_for_indexing(connector_docs)
llm = await get_user_long_context_llm(session, user_id, search_space_id)
for document in documents:
connector_doc = doc_map.get(document.unique_identifier_hash)
if connector_doc is None:
failed_count += 1
continue
result = await pipeline.index(document, connector_doc, llm)
result = await pipeline.index(document, connector_doc)
if DocumentStatus.is_state(result.status, DocumentStatus.READY):
indexed_count += 1
@ -960,7 +949,6 @@ async def _index_batch_files(
folder_path: str,
folder_name: str,
target_file_paths: list[str],
enable_summary: bool,
root_folder_id: int | None,
on_progress_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, int, str | None]:
@ -995,8 +983,7 @@ async def _index_batch_files(
folder_path=folder_path,
folder_name=folder_name,
target_file_path=file_path,
enable_summary=enable_summary,
root_folder_id=root_folder_id,
root_folder_id=root_folder_id,
task_logger=task_logger,
log_entry=log_entry,
)
@ -1036,7 +1023,6 @@ async def _index_single_file(
folder_path: str,
folder_name: str,
target_file_path: str,
enable_summary: bool,
root_folder_id: int | None,
task_logger,
log_entry,
@ -1125,8 +1111,7 @@ async def _index_single_file(
folder_name=folder_name,
search_space_id=search_space_id,
user_id=user_id,
enable_summary=enable_summary,
)
)
if root_folder_id:
connector_doc.folder_id = await _resolve_folder_for_file(
@ -1134,7 +1119,6 @@ async def _index_single_file(
)
pipeline = IndexingPipelineService(session)
llm = await get_user_long_context_llm(session, user_id, search_space_id)
documents = await pipeline.prepare_for_indexing([connector_doc])
if not documents:
@ -1142,7 +1126,7 @@ async def _index_single_file(
db_doc = documents[0]
await pipeline.index(db_doc, connector_doc, llm)
await pipeline.index(db_doc, connector_doc)
await session.refresh(db_doc)
doc_meta = dict(db_doc.document_metadata or {})
@ -1275,7 +1259,6 @@ async def index_uploaded_files(
user_id: str,
folder_name: str,
root_folder_id: int,
enable_summary: bool,
file_mappings: list[dict],
on_heartbeat_callback: HeartbeatCallbackType | None = None,
use_vision_llm: bool = False,
@ -1318,7 +1301,6 @@ async def index_uploaded_files(
page_limit_service = PageLimitService(session)
pipeline = IndexingPipelineService(session)
llm = await get_user_long_context_llm(session, user_id, search_space_id)
vision_llm_instance = None
if use_vision_llm:
@ -1414,8 +1396,7 @@ async def index_uploaded_files(
folder_name=folder_name,
search_space_id=search_space_id,
user_id=user_id,
enable_summary=enable_summary,
)
)
connector_doc.folder_id = await _resolve_folder_for_file(
session,
@ -1432,7 +1413,7 @@ async def index_uploaded_files(
db_doc = documents[0]
await pipeline.index(db_doc, connector_doc, llm)
await pipeline.index(db_doc, connector_doc)
await session.refresh(db_doc)
doc_meta = dict(db_doc.document_metadata or {})

View file

@ -16,13 +16,11 @@ from sqlalchemy.ext.asyncio import AsyncSession
from app.connectors.luma_connector import LumaConnector
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import (
create_document_chunks,
embed_text,
generate_content_hash,
generate_document_summary,
generate_unique_identifier_hash,
)
@ -437,38 +435,14 @@ async def index_luma_events(
document.status = DocumentStatus.processing()
await session.commit()
# Heavy processing (LLM, embeddings, chunks)
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
# Heavy processing (embeddings, chunks)
if user_llm and connector.enable_summary:
document_metadata_for_summary = {
"event_id": item["event_id"],
"event_name": item["event_name"],
"event_url": item["event_url"],
"start_at": item["start_at"],
"end_at": item["end_at"],
"timezone": item["timezone"],
"location": item["location"] or "No location",
"city": item["city"],
"hosts": item["host_names"],
"document_type": "Luma Event",
"connector_type": "Luma",
}
(
summary_content,
summary_embedding,
) = await generate_document_summary(
item["event_markdown"], user_llm, document_metadata_for_summary
)
else:
summary_content = (
f"Luma Event: {item['event_name']}\n\n{item['event_markdown']}"
)
summary_embedding = await asyncio.to_thread(
embed_text, summary_content
)
summary_content = (
f"Luma Event: {item['event_name']}\n\n{item['event_markdown']}"
)
summary_embedding = await asyncio.to_thread(
embed_text, summary_content
)
chunks = await create_document_chunks(item["event_markdown"])

View file

@ -2,7 +2,7 @@
Notion connector indexer.
Uses the shared IndexingPipelineService for document deduplication,
summarization, chunking, and embedding with bounded parallel indexing.
chunking, and embedding with bounded parallel indexing.
"""
from collections.abc import Awaitable, Callable
@ -19,7 +19,6 @@ from app.indexing_pipeline.indexing_pipeline_service import (
IndexingPipelineService,
PlaceholderInfo,
)
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService
from app.utils.notion_utils import process_blocks
@ -43,7 +42,6 @@ def _build_connector_doc(
connector_id: int,
search_space_id: int,
user_id: str,
enable_summary: bool,
) -> ConnectorDocument:
"""Map a raw Notion page dict to a ConnectorDocument."""
page_id = page.get("page_id", "")
@ -57,8 +55,6 @@ def _build_connector_doc(
"connector_type": "Notion",
}
fallback_summary = f"Notion Page: {page_title}\n\n{markdown_content}"
return ConnectorDocument(
title=page_title,
source_markdown=markdown_content,
@ -67,8 +63,6 @@ def _build_connector_doc(
search_space_id=search_space_id,
connector_id=connector_id,
created_by_id=user_id,
should_summarize=enable_summary,
fallback_summary=fallback_summary,
metadata=metadata,
)
@ -314,8 +308,7 @@ async def index_notion_pages(
connector_id=connector_id,
search_space_id=search_space_id,
user_id=user_id,
enable_summary=connector.enable_summary,
)
)
with session.no_autoflush:
duplicate = await check_duplicate_document_by_hash(
@ -343,13 +336,8 @@ async def index_notion_pages(
# ── Pipeline: migrate legacy docs + parallel index ────────────
await pipeline.migrate_legacy_docs(connector_docs)
async def _get_llm(s):
return await get_user_long_context_llm(s, user_id, search_space_id)
_, documents_indexed, documents_failed = await pipeline.index_batch_parallel(
connector_docs,
_get_llm,
max_concurrency=3,
on_heartbeat=on_heartbeat_callback,
heartbeat_interval=HEARTBEAT_INTERVAL_SECONDS,

View file

@ -27,7 +27,6 @@ from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnector
from app.indexing_pipeline.connector_document import ConnectorDocument
from app.indexing_pipeline.document_hashing import compute_identifier_hash
from app.indexing_pipeline.indexing_pipeline_service import IndexingPipelineService
from app.services.llm_service import get_user_long_context_llm
from app.services.page_limit_service import PageLimitService
from app.services.task_logging_service import TaskLoggingService
from app.tasks.connector_indexers.base import (
@ -133,7 +132,6 @@ def _build_connector_doc(
connector_id: int,
search_space_id: int,
user_id: str,
enable_summary: bool,
) -> ConnectorDocument:
file_id = file.get("id", "")
file_name = file.get("name", "Unknown")
@ -145,8 +143,6 @@ def _build_connector_doc(
"connector_type": "OneDrive",
}
fallback_summary = f"File: {file_name}\n\n{markdown[:4000]}"
return ConnectorDocument(
title=file_name,
source_markdown=markdown,
@ -155,8 +151,6 @@ def _build_connector_doc(
search_space_id=search_space_id,
connector_id=connector_id,
created_by_id=user_id,
should_summarize=enable_summary,
fallback_summary=fallback_summary,
metadata=metadata,
)
@ -168,7 +162,6 @@ async def _download_files_parallel(
connector_id: int,
search_space_id: int,
user_id: str,
enable_summary: bool,
max_concurrency: int = 3,
on_heartbeat: HeartbeatCallbackType | None = None,
vision_llm=None,
@ -198,7 +191,6 @@ async def _download_files_parallel(
connector_id=connector_id,
search_space_id=search_space_id,
user_id=user_id,
enable_summary=enable_summary,
)
async with hb_lock:
completed_count += 1
@ -230,7 +222,6 @@ async def _download_and_index(
connector_id: int,
search_space_id: int,
user_id: str,
enable_summary: bool,
on_heartbeat: HeartbeatCallbackType | None = None,
vision_llm=None,
) -> tuple[int, int]:
@ -241,7 +232,6 @@ async def _download_and_index(
connector_id=connector_id,
search_space_id=search_space_id,
user_id=user_id,
enable_summary=enable_summary,
on_heartbeat=on_heartbeat,
vision_llm=vision_llm,
)
@ -250,13 +240,8 @@ async def _download_and_index(
batch_failed = 0
if connector_docs:
pipeline = IndexingPipelineService(session)
async def _get_llm(s):
return await get_user_long_context_llm(s, user_id, search_space_id)
_, batch_indexed, batch_failed = await pipeline.index_batch_parallel(
connector_docs,
_get_llm,
max_concurrency=3,
on_heartbeat=on_heartbeat,
)
@ -294,7 +279,6 @@ async def _index_selected_files(
connector_id: int,
search_space_id: int,
user_id: str,
enable_summary: bool,
on_heartbeat: HeartbeatCallbackType | None = None,
vision_llm=None,
) -> tuple[int, int, int, list[str]]:
@ -345,7 +329,6 @@ async def _index_selected_files(
connector_id=connector_id,
search_space_id=search_space_id,
user_id=user_id,
enable_summary=enable_summary,
on_heartbeat=on_heartbeat,
vision_llm=vision_llm,
)
@ -379,7 +362,6 @@ async def _index_full_scan(
max_files: int,
include_subfolders: bool = True,
on_heartbeat_callback: HeartbeatCallbackType | None = None,
enable_summary: bool = True,
vision_llm=None,
) -> tuple[int, int, int]:
"""Full scan indexing of a folder.
@ -454,7 +436,6 @@ async def _index_full_scan(
connector_id=connector_id,
search_space_id=search_space_id,
user_id=user_id,
enable_summary=enable_summary,
on_heartbeat=on_heartbeat_callback,
vision_llm=vision_llm,
)
@ -487,7 +468,6 @@ async def _index_with_delta_sync(
log_entry: object,
max_files: int,
on_heartbeat_callback: HeartbeatCallbackType | None = None,
enable_summary: bool = True,
vision_llm=None,
) -> tuple[int, int, int, str | None]:
"""Delta sync using OneDrive change tracking.
@ -579,7 +559,6 @@ async def _index_with_delta_sync(
connector_id=connector_id,
search_space_id=search_space_id,
user_id=user_id,
enable_summary=enable_summary,
on_heartbeat=on_heartbeat_callback,
vision_llm=vision_llm,
)
@ -651,7 +630,6 @@ async def index_onedrive_files(
)
return 0, 0, error_msg, 0
connector_enable_summary = getattr(connector, "enable_summary", True)
connector_enable_vision_llm = getattr(connector, "enable_vision_llm", False)
vision_llm = None
if connector_enable_vision_llm:
@ -681,7 +659,6 @@ async def index_onedrive_files(
connector_id=connector_id,
search_space_id=search_space_id,
user_id=user_id,
enable_summary=connector_enable_summary,
vision_llm=vision_llm,
)
total_indexed += indexed
@ -711,7 +688,6 @@ async def index_onedrive_files(
task_logger,
log_entry,
max_files,
enable_summary=connector_enable_summary,
vision_llm=vision_llm,
)
total_indexed += indexed
@ -738,7 +714,6 @@ async def index_onedrive_files(
log_entry,
max_files,
include_subfolders,
enable_summary=connector_enable_summary,
vision_llm=vision_llm,
)
total_indexed += ri
@ -758,7 +733,6 @@ async def index_onedrive_files(
log_entry,
max_files,
include_subfolders,
enable_summary=connector_enable_summary,
vision_llm=vision_llm,
)
total_indexed += indexed

View file

@ -15,13 +15,11 @@ from sqlalchemy.ext.asyncio import AsyncSession
from app.connectors.webcrawler_connector import WebCrawlerConnector
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import (
create_document_chunks,
embed_text,
generate_content_hash,
generate_document_summary,
generate_unique_identifier_hash,
)
from app.utils.webcrawler_utils import parse_webcrawler_urls
@ -372,29 +370,10 @@ async def index_crawled_urls(
documents_skipped += 1
continue
# Generate summary with LLM
user_llm = await get_user_long_context_llm(
session, user_id, search_space_id
)
# Select deterministic document content
if user_llm and connector.enable_summary:
document_metadata_for_summary = {
"url": url,
"title": title,
"description": description,
"language": language,
"document_type": "Crawled URL",
"crawler_type": crawler_type,
}
(
summary_content,
summary_embedding,
) = await generate_document_summary(
structured_document, user_llm, document_metadata_for_summary
)
else:
summary_content = f"Crawled URL: {title}\n\nURL: {url}\n\n{content}"
summary_embedding = embed_text(summary_content)
summary_content = f"Crawled URL: {title}\n\nURL: {url}\n\n{content}"
summary_embedding = embed_text(summary_content)
# Process chunks
chunks = await create_document_chunks(content)

View file

@ -1,20 +1,15 @@
"""
Unified document save/update logic for file processors.
"""
"""Unified document save/update logic for file processors."""
import asyncio
import logging
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession
from app.db import Document, DocumentStatus, DocumentType
from app.services.llm_service import get_user_long_context_llm
from app.utils.document_converters import (
create_document_chunks,
embed_text,
generate_content_hash,
generate_document_summary,
)
from ._helpers import (
@ -24,59 +19,6 @@ from ._helpers import (
)
from .base import get_current_timestamp, safe_set_chunks
# ---------------------------------------------------------------------------
# Summary generation
# ---------------------------------------------------------------------------
async def _generate_summary(
markdown_content: str,
file_name: str,
etl_service: str,
user_llm,
enable_summary: bool,
) -> tuple[str, list[float]]:
"""
Generate a document summary and embedding.
Docling uses its own large-document summary strategy; other ETL services
use the standard ``generate_document_summary`` helper.
"""
if not enable_summary:
summary = f"File: {file_name}\n\n{markdown_content[:4000]}"
return summary, await asyncio.to_thread(embed_text, summary)
if etl_service == "DOCLING":
from app.services.docling_service import create_docling_service
docling_service = create_docling_service()
summary_text = await docling_service.process_large_document_summary(
content=markdown_content, llm=user_llm, document_title=file_name
)
meta = {
"file_name": file_name,
"etl_service": etl_service,
"document_type": "File Document",
}
parts = ["# DOCUMENT METADATA"]
for key, value in meta.items():
if value:
formatted_key = key.replace("_", " ").title()
parts.append(f"**{formatted_key}:** {value}")
enhanced = "\n".join(parts) + "\n\n# DOCUMENT SUMMARY\n\n" + summary_text
return enhanced, await asyncio.to_thread(embed_text, enhanced)
# Standard summary (Unstructured / LlamaCloud / others)
meta = {
"file_name": file_name,
"etl_service": etl_service,
"document_type": "File Document",
}
return await generate_document_summary(markdown_content, user_llm, meta)
# ---------------------------------------------------------------------------
# Unified save function
# ---------------------------------------------------------------------------
@ -90,7 +32,6 @@ async def save_file_document(
user_id: str,
etl_service: str,
connector: dict | None = None,
enable_summary: bool = True,
) -> Document | None:
"""
Process and store a file document with deduplication and migration support.
@ -106,7 +47,6 @@ async def save_file_document(
user_id: ID of the user
etl_service: Name of the ETL service (UNSTRUCTURED, LLAMACLOUD, DOCLING)
connector: Optional connector info for Google Drive files
enable_summary: Whether to generate an AI summary
Returns:
Document object if successful, None if duplicate detected
@ -133,24 +73,16 @@ async def save_file_document(
if should_skip:
return doc
user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
if not user_llm:
raise RuntimeError(
f"No long context LLM configured for user {user_id} "
f"in search space {search_space_id}"
)
summary_content, summary_embedding = await _generate_summary(
markdown_content, file_name, etl_service, user_llm, enable_summary
)
document_content = f"File: {file_name}\n\n{markdown_content[:4000]}"
document_embedding = embed_text(document_content)
chunks = await create_document_chunks(markdown_content)
doc_metadata = {"FILE_NAME": file_name, "ETL_SERVICE": etl_service}
if existing_document:
existing_document.title = file_name
existing_document.content = summary_content
existing_document.content = document_content
existing_document.content_hash = content_hash
existing_document.embedding = summary_embedding
existing_document.embedding = document_embedding
existing_document.document_metadata = doc_metadata
await safe_set_chunks(session, existing_document, chunks)
existing_document.source_markdown = markdown_content
@ -171,8 +103,8 @@ async def save_file_document(
title=file_name,
document_type=doc_type,
document_metadata=doc_metadata,
content=summary_content,
embedding=summary_embedding,
content=document_content,
embedding=document_embedding,
chunks=chunks,
content_hash=content_hash,
unique_identifier_hash=primary_hash,

View file

@ -25,11 +25,10 @@ from app.db import (
SearchSourceConnectorType,
SearchSpace,
)
from app.services.llm_service import get_document_summary_llm
from app.utils.document_converters import (
create_document_chunks,
embed_text,
generate_content_hash,
generate_document_summary,
generate_unique_identifier_hash,
)
@ -176,34 +175,8 @@ async def add_circleback_meeting_document(
# PHASE 3: Process the document content
# =======================================================================
# Get LLM for generating summary
llm = await get_document_summary_llm(session, search_space_id)
if not llm:
logger.warning(
f"No LLM configured for search space {search_space_id}. Using content as summary."
)
# Use first 1000 chars as summary if no LLM available
summary_content = (
markdown_content[:1000] + "..."
if len(markdown_content) > 1000
else markdown_content
)
summary_embedding = None
else:
# Generate summary with metadata
summary_metadata = {
"meeting_name": meeting_name,
"meeting_id": meeting_id,
"document_type": "Circleback Meeting",
**{
k: v
for k, v in metadata.items()
if isinstance(v, str | int | float | bool)
},
}
summary_content, summary_embedding = await generate_document_summary(
markdown_content, llm, summary_metadata
)
summary_content = markdown_content
summary_embedding = embed_text(summary_content)
# Process chunks
chunks = await create_document_chunks(markdown_content)
@ -224,8 +197,7 @@ async def add_circleback_meeting_document(
document.title = meeting_name
document.content = summary_content
document.content_hash = content_hash
if summary_embedding is not None:
document.embedding = summary_embedding
document.embedding = summary_embedding
document.document_metadata = document_metadata
await safe_set_chunks(session, document, chunks)
document.source_markdown = markdown_content

View file

@ -9,12 +9,11 @@ from sqlalchemy.ext.asyncio import AsyncSession
from app.db import Document, DocumentType
from app.schemas import ExtensionDocumentContent
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import (
create_document_chunks,
embed_text,
generate_content_hash,
generate_document_summary,
generate_unique_identifier_hash,
)
@ -123,26 +122,8 @@ async def add_extension_received_document(
f"Content changed for URL {content.metadata.VisitedWebPageURL}. Updating document."
)
# Get user's long context LLM (needed for both create and update)
user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
if not user_llm:
raise RuntimeError(
f"No long context LLM configured for user {user_id} in search space {search_space_id}"
)
# Generate summary with metadata
document_metadata = {
"session_id": content.metadata.BrowsingSessionId,
"url": content.metadata.VisitedWebPageURL,
"title": content.metadata.VisitedWebPageTitle,
"referrer": content.metadata.VisitedWebPageReffererURL,
"timestamp": content.metadata.VisitedWebPageDateWithTimeInISOString,
"duration_ms": content.metadata.VisitedWebPageVisitDurationInMilliseconds,
"document_type": "Browser Extension Capture",
}
summary_content, summary_embedding = await generate_document_summary(
combined_document_string, user_llm, document_metadata
)
summary_content = combined_document_string
summary_embedding = embed_text(summary_content)
# Process chunks
chunks = await create_document_chunks(content.pageContent)

View file

@ -10,7 +10,7 @@ from __future__ import annotations
import contextlib
import logging
import os
from dataclasses import dataclass, field
from dataclasses import dataclass
from fastapi import HTTPException
from sqlalchemy.ext.asyncio import AsyncSession
@ -48,12 +48,6 @@ class _ProcessingContext:
notification: Notification | None = None
use_vision_llm: bool = False
processing_mode: str = "basic"
enable_summary: bool = field(init=False)
def __post_init__(self) -> None:
self.enable_summary = (
self.connector.get("enable_summary", True) if self.connector else True
)
# ---------------------------------------------------------------------------
@ -261,7 +255,6 @@ async def _process_document_upload(ctx: _ProcessingContext) -> Document | None:
ctx.user_id,
etl_result.etl_service,
ctx.connector,
enable_summary=ctx.enable_summary,
)
if result:
@ -466,7 +459,6 @@ async def process_file_in_background_with_document(
log_entry: Log,
connector: dict | None = None,
notification: Notification | None = None,
should_summarize: bool = False,
use_vision_llm: bool = False,
processing_mode: str = "basic",
) -> Document | None:
@ -482,7 +474,6 @@ async def process_file_in_background_with_document(
from app.indexing_pipeline.adapters.file_upload_adapter import (
UploadDocumentAdapter,
)
from app.services.llm_service import get_user_long_context_llm
from app.utils.document_converters import generate_content_hash
from .base import check_duplicate_document
@ -522,8 +513,6 @@ async def process_file_in_background_with_document(
stage="chunking",
)
user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
adapter = UploadDocumentAdapter(session)
await adapter.index(
markdown_content=markdown_content,
@ -531,8 +520,6 @@ async def process_file_in_background_with_document(
etl_service=etl_service,
search_space_id=search_space_id,
user_id=user_id,
llm=user_llm,
should_summarize=should_summarize,
)
if billable_pages > 0:

View file

@ -8,12 +8,11 @@ from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession
from app.db import Document, DocumentStatus, DocumentType
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import (
create_document_chunks,
embed_text,
generate_content_hash,
generate_document_summary,
)
from ._helpers import (
@ -183,21 +182,8 @@ async def add_received_markdown_file_document(
return doc
# Content changed - continue to update
# Get user's long context LLM (needed for both create and update)
user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
if not user_llm:
raise RuntimeError(
f"No long context LLM configured for user {user_id} in search space {search_space_id}"
)
# Generate summary with metadata
document_metadata = {
"file_name": file_name,
"document_type": "Markdown File Document",
}
summary_content, summary_embedding = await generate_document_summary(
file_in_markdown, user_llm, document_metadata
)
summary_content = f"File: {file_name}\n\n{file_in_markdown[:4000]}"
summary_embedding = embed_text(summary_content)
# Process chunks
chunks = await create_document_chunks(file_in_markdown)

View file

@ -17,12 +17,11 @@ from sqlalchemy.ext.asyncio import AsyncSession
from youtube_transcript_api import YouTubeTranscriptApi
from app.db import Document, DocumentStatus, DocumentType
from app.services.llm_service import get_user_long_context_llm
from app.services.task_logging_service import TaskLoggingService
from app.utils.document_converters import (
create_document_chunks,
embed_text,
generate_content_hash,
generate_document_summary,
generate_unique_identifier_hash,
)
from app.utils.proxy_config import get_requests_proxies
@ -355,40 +354,8 @@ async def add_youtube_video_document(
await session.commit()
return document
# Get LLM for summary generation
await task_logger.log_task_progress(
log_entry,
f"Preparing for summary generation: {video_data.get('title', 'YouTube Video')}",
{"stage": "llm_setup"},
)
# Get user's long context LLM
user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
if not user_llm:
raise RuntimeError(
f"No long context LLM configured for user {user_id} in search space {search_space_id}"
)
# Generate summary
await task_logger.log_task_progress(
log_entry,
f"Generating summary for video: {video_data.get('title', 'YouTube Video')}",
{"stage": "summary_generation"},
)
# Generate summary with metadata
document_metadata_for_summary = {
"url": url,
"video_id": video_id,
"title": video_data.get("title", "YouTube Video"),
"author": video_data.get("author_name", "Unknown"),
"thumbnail": video_data.get("thumbnail_url", ""),
"document_type": "YouTube Video Document",
"has_transcript": "No captions available" not in transcript_text,
}
summary_content, summary_embedding = await generate_document_summary(
combined_document_string, user_llm, document_metadata_for_summary
)
summary_content = combined_document_string
summary_embedding = embed_text(summary_content)
# Process chunks
await task_logger.log_task_progress(

View file

@ -9,7 +9,6 @@ from litellm import get_model_info, token_counter
from app.config import config
from app.db import Chunk, DocumentType
from app.prompts import SUMMARY_PROMPT_TEMPLATE
logger = logging.getLogger(__name__)
@ -176,57 +175,6 @@ def optimize_content_for_context_window(
return optimized_content
async def generate_document_summary(
content: str,
user_llm,
document_metadata: dict | None = None,
) -> tuple[str, list[float]]:
"""
Generate summary and embedding for document content with metadata.
Args:
content: Document content
user_llm: User's LLM instance
document_metadata: Optional metadata dictionary to include in summary
Returns:
Tuple of (enhanced_summary_content, summary_embedding)
"""
# Get model name from user_llm for token counting
model_name = getattr(user_llm, "model", "gpt-3.5-turbo") # Fallback to default
# Optimize content to fit within context window
optimized_content = optimize_content_for_context_window(
content, document_metadata, model_name
)
summary_chain = SUMMARY_PROMPT_TEMPLATE | user_llm
content_with_metadata = f"<DOCUMENT><DOCUMENT_METADATA>\n\n{document_metadata}\n\n</DOCUMENT_METADATA>\n\n<DOCUMENT_CONTENT>\n\n{optimized_content}\n\n</DOCUMENT_CONTENT></DOCUMENT>"
summary_result = await summary_chain.ainvoke({"document": content_with_metadata})
summary_content = summary_result.content
# Combine summary with metadata if provided
if document_metadata:
metadata_parts = []
metadata_parts.append("# DOCUMENT METADATA")
for key, value in document_metadata.items():
if value: # Only include non-empty values
formatted_key = key.replace("_", " ").title()
metadata_parts.append(f"**{formatted_key}:** {value}")
metadata_section = "\n".join(metadata_parts)
enhanced_summary_content = (
f"{metadata_section}\n\n# DOCUMENT SUMMARY\n\n{summary_content}"
)
else:
enhanced_summary_content = summary_content
summary_embedding = await asyncio.to_thread(embed_text, enhanced_summary_content)
return enhanced_summary_content, summary_embedding
async def create_document_chunks(content: str) -> list[Chunk]:
"""
Create chunks from document content.

View file

@ -7,13 +7,13 @@ The production indexing pipeline summarizes documents with:
summary_content = summary_result.content
The `llm` parameter is supplied per-document by
`app.services.llm_service.get_user_long_context_llm`. We patch THAT
`app.services.llm_service.get_agent_llm`. We patch THAT
function to return a langchain-native FakeListChatModel so the rest of
the chain works unchanged. No real LLM provider package is touched.
Run-backend / run-celery use unittest.mock.patch.start() to install
this at every binding site (the source module + every consumer that
did `from app.services.llm_service import get_user_long_context_llm`
did `from app.services.llm_service import get_agent_llm`
at module load time).
"""
@ -42,7 +42,7 @@ def _make_fake_llm() -> FakeListChatModel:
return fake
async def fake_get_user_long_context_llm(*args: Any, **kwargs: Any) -> Any:
"""Drop-in replacement for app.services.llm_service.get_user_long_context_llm."""
async def fake_get_agent_llm(*args: Any, **kwargs: Any) -> Any:
"""Drop-in replacement for app.services.llm_service.get_agent_llm."""
logger.info("[fake-llm] returning FakeListChatModel for E2E indexing")
return _make_fake_llm()

View file

@ -206,23 +206,23 @@ def _patch_llm_bindings() -> None:
fake_create_chat_litellm_from_agent_config,
fake_create_chat_litellm_from_config,
)
from tests.e2e.fakes.llm import fake_get_user_long_context_llm
from tests.e2e.fakes.llm import fake_get_agent_llm
targets = [
"app.services.llm_service.get_user_long_context_llm",
"app.tasks.connector_indexers.confluence_indexer.get_user_long_context_llm",
"app.tasks.connector_indexers.google_drive_indexer.get_user_long_context_llm",
"app.tasks.connector_indexers.google_gmail_indexer.get_user_long_context_llm",
"app.tasks.connector_indexers.notion_indexer.get_user_long_context_llm",
"app.tasks.connector_indexers.onedrive_indexer.get_user_long_context_llm",
"app.tasks.connector_indexers.dropbox_indexer.get_user_long_context_llm",
"app.tasks.connector_indexers.local_folder_indexer.get_user_long_context_llm",
"app.tasks.document_processors._save.get_user_long_context_llm",
"app.tasks.document_processors.markdown_processor.get_user_long_context_llm",
"app.services.llm_service.get_agent_llm",
"app.tasks.connector_indexers.confluence_indexer.get_agent_llm",
"app.tasks.connector_indexers.google_drive_indexer.get_agent_llm",
"app.tasks.connector_indexers.google_gmail_indexer.get_agent_llm",
"app.tasks.connector_indexers.notion_indexer.get_agent_llm",
"app.tasks.connector_indexers.onedrive_indexer.get_agent_llm",
"app.tasks.connector_indexers.dropbox_indexer.get_agent_llm",
"app.tasks.connector_indexers.local_folder_indexer.get_agent_llm",
"app.tasks.document_processors._save.get_agent_llm",
"app.tasks.document_processors.markdown_processor.get_agent_llm",
]
for target in targets:
try:
p = patch(target, fake_get_user_long_context_llm)
p = patch(target, fake_get_agent_llm)
p.start()
_active_patches.append(p)
logger.info("[fake-llm] patched %s", target)

View file

@ -183,23 +183,23 @@ def _patch_llm_bindings() -> None:
fake_create_chat_litellm_from_agent_config,
fake_create_chat_litellm_from_config,
)
from tests.e2e.fakes.llm import fake_get_user_long_context_llm
from tests.e2e.fakes.llm import fake_get_agent_llm
targets = [
"app.services.llm_service.get_user_long_context_llm",
"app.tasks.connector_indexers.confluence_indexer.get_user_long_context_llm",
"app.tasks.connector_indexers.google_drive_indexer.get_user_long_context_llm",
"app.tasks.connector_indexers.google_gmail_indexer.get_user_long_context_llm",
"app.tasks.connector_indexers.notion_indexer.get_user_long_context_llm",
"app.tasks.connector_indexers.onedrive_indexer.get_user_long_context_llm",
"app.tasks.connector_indexers.dropbox_indexer.get_user_long_context_llm",
"app.tasks.connector_indexers.local_folder_indexer.get_user_long_context_llm",
"app.tasks.document_processors._save.get_user_long_context_llm",
"app.tasks.document_processors.markdown_processor.get_user_long_context_llm",
"app.services.llm_service.get_agent_llm",
"app.tasks.connector_indexers.confluence_indexer.get_agent_llm",
"app.tasks.connector_indexers.google_drive_indexer.get_agent_llm",
"app.tasks.connector_indexers.google_gmail_indexer.get_agent_llm",
"app.tasks.connector_indexers.notion_indexer.get_agent_llm",
"app.tasks.connector_indexers.onedrive_indexer.get_agent_llm",
"app.tasks.connector_indexers.dropbox_indexer.get_agent_llm",
"app.tasks.connector_indexers.local_folder_indexer.get_agent_llm",
"app.tasks.document_processors._save.get_agent_llm",
"app.tasks.document_processors.markdown_processor.get_agent_llm",
]
for target in targets:
try:
p = patch(target, fake_get_user_long_context_llm)
p = patch(target, fake_get_agent_llm)
p.start()
_active_patches.append(p)
logger.info("[fake-llm] patched %s in celery worker", target)

View file

@ -0,0 +1,279 @@
"""Integration tests for new-chat thread visibility invariants.
These tests exercise the route handlers directly with real DB-backed
users, memberships, and permissions. The important contract is that a
thread shared with a search space stays shared across normal metadata
updates until the creator explicitly makes it private again.
"""
from __future__ import annotations
import uuid
import pytest
import pytest_asyncio
from fastapi import HTTPException
from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from app.db import (
ChatVisibility,
SearchSpace,
SearchSpaceMembership,
SearchSpaceRole,
User,
)
from app.routes import new_chat_routes
from app.schemas.new_chat import (
NewChatThreadCreate,
NewChatThreadUpdate,
NewChatThreadVisibilityUpdate,
)
pytestmark = pytest.mark.integration
@pytest_asyncio.fixture
async def db_member(db_session: AsyncSession, db_search_space: SearchSpace) -> User:
    """Provide a second user who is a non-owner "Editor" of ``db_search_space``.

    The user row is added and flushed before the membership row is created;
    the membership row is flushed as well before the user is returned.
    """
    new_member = User(
        id=uuid.uuid4(),
        email="member@surfsense.net",
        hashed_password="hashed",
        is_active=True,
        is_superuser=False,
        is_verified=True,
    )
    db_session.add(new_member)
    await db_session.flush()

    # Look up the search space's "Editor" role; ``one()`` insists on exactly
    # one matching row.
    editor_role_query = select(SearchSpaceRole).where(
        SearchSpaceRole.search_space_id == db_search_space.id,
        SearchSpaceRole.name == "Editor",
    )
    role_rows = await db_session.execute(editor_role_query)
    editor_role = role_rows.scalars().one()

    membership = SearchSpaceMembership(
        user_id=new_member.id,
        search_space_id=db_search_space.id,
        role_id=editor_role.id,
        is_owner=False,
    )
    db_session.add(membership)
    await db_session.flush()
    return new_member
async def _create_thread(
    db_session: AsyncSession,
    db_user: User,
    db_search_space: SearchSpace,
    *,
    title: str = "Visibility Invariant Chat",
):
    """Create a non-archived PRIVATE thread through the ``create_thread`` route.

    Args:
        db_session: Session handed to the route handler.
        db_user: User passed to the route handler as the acting user.
        db_search_space: Search space the thread is created in.
        title: Thread title; the default contains "Visibility", which the
            tests in this module use as their search term.

    Returns:
        Whatever ``new_chat_routes.create_thread`` returns for the new thread.
    """
    payload = NewChatThreadCreate(
        title=title,
        archived=False,
        search_space_id=db_search_space.id,
        visibility=ChatVisibility.PRIVATE,
    )
    return await new_chat_routes.create_thread(
        payload,
        session=db_session,
        user=db_user,
    )
def _active_thread_ids(response) -> set[int]:
    """Collect the ``id`` of every entry in ``response.threads`` into a set."""
    collected: set[int] = set()
    for entry in response.threads:
        collected.add(entry.id)
    return collected
def _search_thread_ids(response) -> set[int]:
    """Collect the ``id`` of every item yielded by iterating ``response``."""
    collected: set[int] = set()
    for entry in response:
        collected.add(entry.id)
    return collected
async def test_private_thread_is_hidden_from_other_search_space_member(
    db_session: AsyncSession,
    db_user: User,
    db_member: User,
    db_search_space: SearchSpace,
):
    """A private thread is not listed, searchable, or readable by another member.

    The member's thread listing and title search must both omit the thread,
    and a full-thread read must raise ``HTTPException`` with status 403.
    """
    private_thread = await _create_thread(db_session, db_user, db_search_space)

    listing = await new_chat_routes.list_threads(
        search_space_id=db_search_space.id,
        session=db_session,
        user=db_member,
    )
    search_hits = await new_chat_routes.search_threads(
        search_space_id=db_search_space.id,
        title="Visibility",
        session=db_session,
        user=db_member,
    )

    assert private_thread.id not in _active_thread_ids(listing)
    assert private_thread.id not in _search_thread_ids(search_hits)

    with pytest.raises(HTTPException) as denied:
        await new_chat_routes.get_thread_full(
            thread_id=private_thread.id,
            session=db_session,
            user=db_member,
        )
    # Checked after the ``with`` block: the raising call ends the block body.
    assert denied.value.status_code == 403
async def test_creator_can_share_thread_and_member_can_list_search_read_it(
    db_session: AsyncSession,
    db_user: User,
    db_member: User,
    db_search_space: SearchSpace,
):
    """Once the creator shares a thread, another member can list, search and read it."""
    shared_thread = await _create_thread(db_session, db_user, db_search_space)

    share_request = NewChatThreadVisibilityUpdate(
        visibility=ChatVisibility.SEARCH_SPACE,
    )
    share_result = await new_chat_routes.update_thread_visibility(
        thread_id=shared_thread.id,
        visibility_update=share_request,
        session=db_session,
        user=db_user,
    )

    # Everything below is done as the other member, not the creator.
    listing = await new_chat_routes.list_threads(
        search_space_id=db_search_space.id,
        session=db_session,
        user=db_member,
    )
    search_hits = await new_chat_routes.search_threads(
        search_space_id=db_search_space.id,
        title="Visibility",
        session=db_session,
        user=db_member,
    )
    thread_detail = await new_chat_routes.get_thread_full(
        thread_id=shared_thread.id,
        session=db_session,
        user=db_member,
    )

    assert share_result.visibility == ChatVisibility.SEARCH_SPACE
    assert shared_thread.id in _active_thread_ids(listing)
    assert shared_thread.id in _search_thread_ids(search_hits)
    assert thread_detail["id"] == shared_thread.id
    assert thread_detail["visibility"] == ChatVisibility.SEARCH_SPACE
async def test_rename_and_archive_do_not_reset_shared_visibility(
    db_session: AsyncSession,
    db_user: User,
    db_search_space: SearchSpace,
):
    """Renaming or archiving a shared thread leaves it shared with the search space."""
    shared_thread = await _create_thread(db_session, db_user, db_search_space)

    share_request = NewChatThreadVisibilityUpdate(
        visibility=ChatVisibility.SEARCH_SPACE,
    )
    await new_chat_routes.update_thread_visibility(
        thread_id=shared_thread.id,
        visibility_update=share_request,
        session=db_session,
        user=db_user,
    )

    # Two separate metadata-only updates: first the title, then the archive flag.
    after_rename = await new_chat_routes.update_thread(
        thread_id=shared_thread.id,
        thread_update=NewChatThreadUpdate(title="Renamed Shared Chat"),
        session=db_session,
        user=db_user,
    )
    after_archive = await new_chat_routes.update_thread(
        thread_id=shared_thread.id,
        thread_update=NewChatThreadUpdate(archived=True),
        session=db_session,
        user=db_user,
    )

    assert after_rename.visibility == ChatVisibility.SEARCH_SPACE
    assert after_archive.visibility == ChatVisibility.SEARCH_SPACE
    assert after_archive.archived is True
async def test_non_creator_cannot_change_shared_thread_back_to_private(
    db_session: AsyncSession,
    db_user: User,
    db_member: User,
    db_search_space: SearchSpace,
):
    """A member who did not create a shared thread gets a 403 when making it private."""
    shared_thread = await _create_thread(db_session, db_user, db_search_space)

    # The creator shares the thread first.
    share_request = NewChatThreadVisibilityUpdate(
        visibility=ChatVisibility.SEARCH_SPACE,
    )
    await new_chat_routes.update_thread_visibility(
        thread_id=shared_thread.id,
        visibility_update=share_request,
        session=db_session,
        user=db_user,
    )

    # The other member then attempts to revert it to private.
    with pytest.raises(HTTPException) as denied:
        await new_chat_routes.update_thread_visibility(
            thread_id=shared_thread.id,
            visibility_update=NewChatThreadVisibilityUpdate(
                visibility=ChatVisibility.PRIVATE,
            ),
            session=db_session,
            user=db_member,
        )
    assert denied.value.status_code == 403
async def test_creator_can_make_shared_thread_private_again(
    db_session: AsyncSession,
    db_user: User,
    db_member: User,
    db_search_space: SearchSpace,
):
    """After the creator reverts a shared thread to private, the member loses access.

    The thread must disappear from the member's listing and title search,
    and a full-thread read by the member must raise a 403 ``HTTPException``.
    """
    thread = await _create_thread(db_session, db_user, db_search_space)

    # Share, then un-share, both as the creator.
    share_request = NewChatThreadVisibilityUpdate(
        visibility=ChatVisibility.SEARCH_SPACE,
    )
    await new_chat_routes.update_thread_visibility(
        thread_id=thread.id,
        visibility_update=share_request,
        session=db_session,
        user=db_user,
    )
    unshare_request = NewChatThreadVisibilityUpdate(
        visibility=ChatVisibility.PRIVATE,
    )
    reverted = await new_chat_routes.update_thread_visibility(
        thread_id=thread.id,
        visibility_update=unshare_request,
        session=db_session,
        user=db_user,
    )

    listing = await new_chat_routes.list_threads(
        search_space_id=db_search_space.id,
        session=db_session,
        user=db_member,
    )
    search_hits = await new_chat_routes.search_threads(
        search_space_id=db_search_space.id,
        title="Visibility",
        session=db_session,
        user=db_member,
    )

    assert reverted.visibility == ChatVisibility.PRIVATE
    assert thread.id not in _active_thread_ids(listing)
    assert thread.id not in _search_thread_ids(search_hits)

    with pytest.raises(HTTPException) as denied:
        await new_chat_routes.get_thread_full(
            thread_id=thread.id,
            session=db_session,
            user=db_member,
        )
    assert denied.value.status_code == 403

View file

@ -1,7 +1,7 @@
import importlib
import sys
import uuid
from unittest.mock import AsyncMock, MagicMock
from unittest.mock import MagicMock
import pytest
import pytest_asyncio
@ -123,26 +123,6 @@ async def db_search_space(db_session: AsyncSession, db_user: User) -> SearchSpac
return space
@pytest.fixture
def patched_summarize(monkeypatch) -> AsyncMock:
mock = AsyncMock(return_value="Mocked summary.")
monkeypatch.setattr(
"app.indexing_pipeline.indexing_pipeline_service.summarize_document",
mock,
)
return mock
@pytest.fixture
def patched_summarize_raises(monkeypatch) -> AsyncMock:
mock = AsyncMock(side_effect=RuntimeError("LLM unavailable"))
monkeypatch.setattr(
"app.indexing_pipeline.indexing_pipeline_service.summarize_document",
mock,
)
return mock
@pytest.fixture
def patched_embed_texts(monkeypatch) -> MagicMock:
mock = MagicMock(side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts])
@ -153,6 +133,16 @@ def patched_embed_texts(monkeypatch) -> MagicMock:
return mock
@pytest.fixture
def patched_embed_texts_raises(monkeypatch) -> MagicMock:
    """Replace the indexing pipeline's ``embed_texts`` with a mock that raises.

    Every call to the returned mock raises
    ``RuntimeError("Embedding unavailable")``.
    """
    failing_embed = MagicMock(side_effect=RuntimeError("Embedding unavailable"))
    patch_target = "app.indexing_pipeline.indexing_pipeline_service.embed_texts"
    monkeypatch.setattr(patch_target, failing_embed)
    return failing_embed
@pytest.fixture
def patched_chunk_text(monkeypatch) -> MagicMock:
mock = MagicMock(return_value=["Test chunk content."])

View file

@ -68,7 +68,6 @@ class InlineTaskDispatcher:
filename: str,
search_space_id: int,
user_id: str,
should_summarize: bool = False,
use_vision_llm: bool = False,
processing_mode: str = "basic",
) -> None:
@ -83,7 +82,6 @@ class InlineTaskDispatcher:
filename,
search_space_id,
user_id,
should_summarize=should_summarize,
use_vision_llm=use_vision_llm,
processing_mode=processing_mode,
)
@ -266,10 +264,6 @@ async def page_limits():
@pytest.fixture(autouse=True)
def _mock_external_apis(monkeypatch):
"""Mock LLM, embedding, and chunking — these are external API boundaries."""
monkeypatch.setattr(
"app.indexing_pipeline.indexing_pipeline_service.summarize_document",
AsyncMock(return_value="Mocked summary."),
)
monkeypatch.setattr(
"app.indexing_pipeline.indexing_pipeline_service.embed_texts",
MagicMock(side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts]),

View file

@ -8,7 +8,7 @@ pytestmark = pytest.mark.integration
@pytest.mark.usefixtures(
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
"patched_embed_texts", "patched_chunk_text"
)
async def test_sets_status_ready(db_session, db_search_space, db_user, mocker):
"""Document status is READY after successful indexing."""
@ -19,7 +19,6 @@ async def test_sets_status_ready(db_session, db_search_space, db_user, mocker):
etl_service="UNSTRUCTURED",
search_space_id=db_search_space.id,
user_id=str(db_user.id),
llm=mocker.Mock(),
)
result = await db_session.execute(
@ -31,10 +30,10 @@ async def test_sets_status_ready(db_session, db_search_space, db_user, mocker):
@pytest.mark.usefixtures(
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
"patched_embed_texts", "patched_chunk_text"
)
async def test_content_is_summary(db_session, db_search_space, db_user, mocker):
"""Document content is set to the LLM-generated summary."""
async def test_content_is_source_markdown(db_session, db_search_space, db_user, mocker):
"""Document content is set to the extracted source markdown."""
adapter = UploadDocumentAdapter(db_session)
await adapter.index(
markdown_content="## Hello\n\nSome content.",
@ -42,8 +41,6 @@ async def test_content_is_summary(db_session, db_search_space, db_user, mocker):
etl_service="UNSTRUCTURED",
search_space_id=db_search_space.id,
user_id=str(db_user.id),
llm=mocker.Mock(),
should_summarize=True,
)
result = await db_session.execute(
@ -51,11 +48,11 @@ async def test_content_is_summary(db_session, db_search_space, db_user, mocker):
)
document = result.scalars().first()
assert document.content == "Mocked summary."
assert document.content == "## Hello\n\nSome content."
@pytest.mark.usefixtures(
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
"patched_embed_texts", "patched_chunk_text"
)
async def test_chunks_written_to_db(db_session, db_search_space, db_user, mocker):
"""Chunks derived from the source markdown are persisted in the DB."""
@ -66,7 +63,6 @@ async def test_chunks_written_to_db(db_session, db_search_space, db_user, mocker
etl_service="UNSTRUCTURED",
search_space_id=db_search_space.id,
user_id=str(db_user.id),
llm=mocker.Mock(),
)
result = await db_session.execute(
@ -83,9 +79,7 @@ async def test_chunks_written_to_db(db_session, db_search_space, db_user, mocker
assert chunks[0].content == "Test chunk content."
@pytest.mark.usefixtures(
"patched_summarize_raises", "patched_embed_texts", "patched_chunk_text"
)
@pytest.mark.usefixtures("patched_embed_texts_raises", "patched_chunk_text")
async def test_raises_on_indexing_failure(db_session, db_search_space, db_user, mocker):
"""RuntimeError is raised when the indexing step fails so the caller can fire a failure notification."""
adapter = UploadDocumentAdapter(db_session)
@ -96,8 +90,6 @@ async def test_raises_on_indexing_failure(db_session, db_search_space, db_user,
etl_service="UNSTRUCTURED",
search_space_id=db_search_space.id,
user_id=str(db_user.id),
llm=mocker.Mock(),
should_summarize=True,
)
@ -107,10 +99,10 @@ async def test_raises_on_indexing_failure(db_session, db_search_space, db_user,
@pytest.mark.usefixtures(
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
"patched_embed_texts", "patched_chunk_text"
)
async def test_reindex_updates_content(db_session, db_search_space, db_user, mocker):
"""Document content is updated to the new summary after reindexing."""
"""Document content is updated to the new source markdown after reindexing."""
adapter = UploadDocumentAdapter(db_session)
await adapter.index(
markdown_content="## Original\n\nOriginal content.",
@ -118,7 +110,6 @@ async def test_reindex_updates_content(db_session, db_search_space, db_user, moc
etl_service="UNSTRUCTURED",
search_space_id=db_search_space.id,
user_id=str(db_user.id),
llm=mocker.Mock(),
)
result = await db_session.execute(
@ -129,14 +120,14 @@ async def test_reindex_updates_content(db_session, db_search_space, db_user, moc
document.source_markdown = "## Edited\n\nNew content after user edit."
await db_session.flush()
await adapter.reindex(document=document, llm=mocker.Mock())
await adapter.reindex(document=document)
await db_session.refresh(document)
assert document.content == "Mocked summary."
assert document.content == "## Edited\n\nNew content after user edit."
@pytest.mark.usefixtures(
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
"patched_embed_texts", "patched_chunk_text"
)
async def test_reindex_updates_content_hash(
db_session, db_search_space, db_user, mocker
@ -149,7 +140,6 @@ async def test_reindex_updates_content_hash(
etl_service="UNSTRUCTURED",
search_space_id=db_search_space.id,
user_id=str(db_user.id),
llm=mocker.Mock(),
)
result = await db_session.execute(
@ -161,14 +151,14 @@ async def test_reindex_updates_content_hash(
document.source_markdown = "## Edited\n\nNew content after user edit."
await db_session.flush()
await adapter.reindex(document=document, llm=mocker.Mock())
await adapter.reindex(document=document)
await db_session.refresh(document)
assert document.content_hash != original_hash
@pytest.mark.usefixtures(
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
"patched_embed_texts", "patched_chunk_text"
)
async def test_reindex_sets_status_ready(db_session, db_search_space, db_user, mocker):
"""Document status is READY after successful reindexing."""
@ -179,7 +169,6 @@ async def test_reindex_sets_status_ready(db_session, db_search_space, db_user, m
etl_service="UNSTRUCTURED",
search_space_id=db_search_space.id,
user_id=str(db_user.id),
llm=mocker.Mock(),
)
result = await db_session.execute(
@ -190,13 +179,13 @@ async def test_reindex_sets_status_ready(db_session, db_search_space, db_user, m
document.source_markdown = "## Edited\n\nNew content after user edit."
await db_session.flush()
await adapter.reindex(document=document, llm=mocker.Mock())
await adapter.reindex(document=document)
await db_session.refresh(document)
assert DocumentStatus.is_state(document.status, DocumentStatus.READY)
@pytest.mark.usefixtures("patched_summarize", "patched_embed_texts")
@pytest.mark.usefixtures("patched_embed_texts")
async def test_reindex_replaces_chunks(db_session, db_search_space, db_user, mocker):
"""Reindexing replaces old chunks with new content rather than appending."""
mocker.patch(
@ -211,7 +200,6 @@ async def test_reindex_replaces_chunks(db_session, db_search_space, db_user, moc
etl_service="UNSTRUCTURED",
search_space_id=db_search_space.id,
user_id=str(db_user.id),
llm=mocker.Mock(),
)
result = await db_session.execute(
@ -223,7 +211,7 @@ async def test_reindex_replaces_chunks(db_session, db_search_space, db_user, moc
document.source_markdown = "## Edited\n\nNew content after user edit."
await db_session.flush()
await adapter.reindex(document=document, llm=mocker.Mock())
await adapter.reindex(document=document)
chunks_result = await db_session.execute(
select(Chunk).filter(Chunk.document_id == document_id)
@ -235,7 +223,7 @@ async def test_reindex_replaces_chunks(db_session, db_search_space, db_user, moc
@pytest.mark.usefixtures(
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
"patched_embed_texts", "patched_chunk_text"
)
async def test_reindex_clears_reindexing_flag(
db_session, db_search_space, db_user, mocker
@ -248,7 +236,6 @@ async def test_reindex_clears_reindexing_flag(
etl_service="UNSTRUCTURED",
search_space_id=db_search_space.id,
user_id=str(db_user.id),
llm=mocker.Mock(),
)
result = await db_session.execute(
@ -260,19 +247,17 @@ async def test_reindex_clears_reindexing_flag(
document.content_needs_reindexing = True
await db_session.flush()
await adapter.reindex(document=document, llm=mocker.Mock())
await adapter.reindex(document=document)
await db_session.refresh(document)
assert document.content_needs_reindexing is False
@pytest.mark.usefixtures("patched_embed_texts", "patched_chunk_text")
async def test_reindex_raises_on_failure(db_session, db_search_space, db_user, mocker):
async def test_reindex_raises_on_failure(
db_session, db_search_space, db_user, patched_embed_texts, mocker
):
"""RuntimeError is raised when reindexing fails so the caller can handle it."""
mocker.patch(
"app.indexing_pipeline.indexing_pipeline_service.summarize_document",
return_value="Mocked summary.",
)
adapter = UploadDocumentAdapter(db_session)
await adapter.index(
@ -281,7 +266,6 @@ async def test_reindex_raises_on_failure(db_session, db_search_space, db_user, m
etl_service="UNSTRUCTURED",
search_space_id=db_search_space.id,
user_id=str(db_user.id),
llm=mocker.Mock(),
)
result = await db_session.execute(
@ -292,13 +276,10 @@ async def test_reindex_raises_on_failure(db_session, db_search_space, db_user, m
document.source_markdown = "## Edited\n\nNew content after user edit."
await db_session.flush()
mocker.patch(
"app.indexing_pipeline.indexing_pipeline_service.summarize_document",
side_effect=RuntimeError("LLM unavailable"),
)
patched_embed_texts.side_effect = RuntimeError("Embedding unavailable")
with pytest.raises(RuntimeError, match=r"Embedding failed|Reindexing failed"):
await adapter.reindex(document=document, llm=mocker.Mock())
await adapter.reindex(document=document)
async def test_reindex_raises_on_empty_source_markdown(
@ -323,4 +304,4 @@ async def test_reindex_raises_on_empty_source_markdown(
adapter = UploadDocumentAdapter(db_session)
with pytest.raises(RuntimeError, match="no source_markdown"):
await adapter.reindex(document=document, llm=mocker.Mock())
await adapter.reindex(document=document)

View file

@ -25,8 +25,6 @@ def _cal_doc(
search_space_id=search_space_id,
connector_id=connector_id,
created_by_id=user_id,
should_summarize=True,
fallback_summary=f"Calendar: Event {unique_id}",
metadata={
"event_id": unique_id,
"start_time": "2025-01-15T10:00:00",
@ -37,7 +35,7 @@ def _cal_doc(
@pytest.mark.usefixtures(
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
"patched_embed_texts", "patched_chunk_text"
)
async def test_calendar_pipeline_creates_ready_document(
db_session, db_search_space, db_connector, db_user, mocker
@ -55,7 +53,7 @@ async def test_calendar_pipeline_creates_ready_document(
prepared = await service.prepare_for_indexing([doc])
assert len(prepared) == 1
await service.index(prepared[0], doc, llm=mocker.Mock())
await service.index(prepared[0], doc)
result = await db_session.execute(
select(Document).filter(Document.search_space_id == space_id)
@ -68,7 +66,7 @@ async def test_calendar_pipeline_creates_ready_document(
@pytest.mark.usefixtures(
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
"patched_embed_texts", "patched_chunk_text"
)
async def test_calendar_legacy_doc_migrated(
db_session, db_search_space, db_connector, db_user, mocker

View file

@ -25,8 +25,6 @@ def _drive_doc(
search_space_id=search_space_id,
connector_id=connector_id,
created_by_id=user_id,
should_summarize=True,
fallback_summary=f"File: {unique_id}.pdf",
metadata={
"google_drive_file_id": unique_id,
"google_drive_file_name": f"{unique_id}.pdf",
@ -36,7 +34,7 @@ def _drive_doc(
@pytest.mark.usefixtures(
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
"patched_embed_texts", "patched_chunk_text"
)
async def test_drive_pipeline_creates_ready_document(
db_session, db_search_space, db_connector, db_user, mocker
@ -54,7 +52,7 @@ async def test_drive_pipeline_creates_ready_document(
prepared = await service.prepare_for_indexing([doc])
assert len(prepared) == 1
await service.index(prepared[0], doc, llm=mocker.Mock())
await service.index(prepared[0], doc)
result = await db_session.execute(
select(Document).filter(Document.search_space_id == space_id)
@ -67,7 +65,7 @@ async def test_drive_pipeline_creates_ready_document(
@pytest.mark.usefixtures(
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
"patched_embed_texts", "patched_chunk_text"
)
async def test_drive_legacy_doc_migrated(
db_session, db_search_space, db_connector, db_user, mocker

View file

@ -24,8 +24,6 @@ def _dropbox_doc(
search_space_id=search_space_id,
connector_id=connector_id,
created_by_id=user_id,
should_summarize=True,
fallback_summary=f"File: {unique_id}.docx",
metadata={
"dropbox_file_id": unique_id,
"dropbox_file_name": f"{unique_id}.docx",
@ -35,7 +33,7 @@ def _dropbox_doc(
@pytest.mark.usefixtures(
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
"patched_embed_texts", "patched_chunk_text"
)
async def test_dropbox_pipeline_creates_ready_document(
db_session, db_search_space, db_connector, db_user, mocker
@ -53,7 +51,7 @@ async def test_dropbox_pipeline_creates_ready_document(
prepared = await service.prepare_for_indexing([doc])
assert len(prepared) == 1
await service.index(prepared[0], doc, llm=mocker.Mock())
await service.index(prepared[0], doc)
result = await db_session.execute(
select(Document).filter(Document.search_space_id == space_id)
@ -66,7 +64,7 @@ async def test_dropbox_pipeline_creates_ready_document(
@pytest.mark.usefixtures(
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
"patched_embed_texts", "patched_chunk_text"
)
async def test_dropbox_duplicate_content_skipped(
db_session, db_search_space, db_connector, db_user, mocker
@ -86,7 +84,7 @@ async def test_dropbox_duplicate_content_skipped(
prepared = await service.prepare_for_indexing([doc])
assert len(prepared) == 1
await service.index(prepared[0], doc, llm=mocker.Mock())
await service.index(prepared[0], doc)
result = await db_session.execute(
select(Document).filter(Document.search_space_id == space_id)

View file

@ -28,8 +28,6 @@ def _gmail_doc(
search_space_id=search_space_id,
connector_id=connector_id,
created_by_id=user_id,
should_summarize=True,
fallback_summary=f"Gmail: Subject for {unique_id}",
metadata={
"message_id": unique_id,
"from": "sender@example.com",
@ -39,7 +37,7 @@ def _gmail_doc(
@pytest.mark.usefixtures(
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
"patched_embed_texts", "patched_chunk_text"
)
async def test_gmail_pipeline_creates_ready_document(
db_session, db_search_space, db_connector, db_user, mocker
@ -57,7 +55,7 @@ async def test_gmail_pipeline_creates_ready_document(
prepared = await service.prepare_for_indexing([doc])
assert len(prepared) == 1
await service.index(prepared[0], doc, llm=mocker.Mock())
await service.index(prepared[0], doc)
result = await db_session.execute(
select(Document).filter(Document.search_space_id == space_id)
@ -71,7 +69,7 @@ async def test_gmail_pipeline_creates_ready_document(
@pytest.mark.usefixtures(
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
"patched_embed_texts", "patched_chunk_text"
)
async def test_gmail_legacy_doc_migrated_then_reused(
db_session, db_search_space, db_connector, db_user, mocker

View file

@ -10,7 +10,7 @@ pytestmark = pytest.mark.integration
@pytest.mark.usefixtures(
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
"patched_embed_texts", "patched_chunk_text"
)
async def test_index_batch_creates_ready_documents(
db_session, db_search_space, make_connector_document, mocker
@ -33,7 +33,7 @@ async def test_index_batch_creates_ready_documents(
]
service = IndexingPipelineService(session=db_session)
results = await service.index_batch(docs, llm=mocker.Mock())
results = await service.index_batch(docs)
assert len(results) == 2
@ -50,10 +50,10 @@ async def test_index_batch_creates_ready_documents(
@pytest.mark.usefixtures(
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
"patched_embed_texts", "patched_chunk_text"
)
async def test_index_batch_empty_returns_empty(db_session, mocker):
"""index_batch with empty input returns an empty list."""
service = IndexingPipelineService(session=db_session)
results = await service.index_batch([], llm=mocker.Mock())
results = await service.index_batch([])
assert results == []

View file

@ -10,9 +10,7 @@ _EMBEDDING_DIM = app_config.embedding_model_instance.dimension
pytestmark = pytest.mark.integration
@pytest.mark.usefixtures(
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
)
@pytest.mark.usefixtures("patched_embed_texts", "patched_chunk_text")
async def test_sets_status_ready(
db_session,
db_search_space,
@ -27,7 +25,7 @@ async def test_sets_status_ready(
document = prepared[0]
document_id = document.id
await service.index(document, connector_doc, llm=mocker.Mock())
await service.index(document, connector_doc)
result = await db_session.execute(
select(Document).filter(Document.id == document_id)
@ -37,16 +35,14 @@ async def test_sets_status_ready(
assert DocumentStatus.is_state(reloaded.status, DocumentStatus.READY)
@pytest.mark.usefixtures(
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
)
async def test_content_is_summary_when_should_summarize_true(
@pytest.mark.usefixtures("patched_embed_texts", "patched_chunk_text")
async def test_content_is_source_markdown_by_default(
db_session,
db_search_space,
make_connector_document,
mocker,
):
"""Document content is set to the LLM-generated summary when should_summarize=True."""
"""Document content is set to source_markdown by default."""
connector_doc = make_connector_document(search_space_id=db_search_space.id)
service = IndexingPipelineService(session=db_session)
@ -54,28 +50,25 @@ async def test_content_is_summary_when_should_summarize_true(
document = prepared[0]
document_id = document.id
await service.index(document, connector_doc, llm=mocker.Mock())
await service.index(document, connector_doc)
result = await db_session.execute(
select(Document).filter(Document.id == document_id)
)
reloaded = result.scalars().first()
assert reloaded.content == "Mocked summary."
assert reloaded.content == connector_doc.source_markdown
@pytest.mark.usefixtures(
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
)
async def test_content_is_source_markdown_when_should_summarize_false(
@pytest.mark.usefixtures("patched_embed_texts", "patched_chunk_text")
async def test_content_is_source_markdown_when_custom_content(
db_session,
db_search_space,
make_connector_document,
):
"""Document content is set to source_markdown verbatim when should_summarize=False."""
"""Document content is set to source_markdown verbatim."""
connector_doc = make_connector_document(
search_space_id=db_search_space.id,
should_summarize=False,
source_markdown="## Raw content",
)
service = IndexingPipelineService(session=db_session)
@ -84,7 +77,7 @@ async def test_content_is_source_markdown_when_should_summarize_false(
document = prepared[0]
document_id = document.id
await service.index(document, connector_doc, llm=None)
await service.index(document, connector_doc)
result = await db_session.execute(
select(Document).filter(Document.id == document_id)
@ -94,9 +87,7 @@ async def test_content_is_source_markdown_when_should_summarize_false(
assert reloaded.content == "## Raw content"
@pytest.mark.usefixtures(
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
)
@pytest.mark.usefixtures("patched_embed_texts", "patched_chunk_text")
async def test_chunks_written_to_db(
db_session,
db_search_space,
@ -111,7 +102,7 @@ async def test_chunks_written_to_db(
document = prepared[0]
document_id = document.id
await service.index(document, connector_doc, llm=mocker.Mock())
await service.index(document, connector_doc)
result = await db_session.execute(
select(Chunk).filter(Chunk.document_id == document_id)
@ -122,9 +113,7 @@ async def test_chunks_written_to_db(
assert chunks[0].content == "Test chunk content."
@pytest.mark.usefixtures(
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
)
@pytest.mark.usefixtures("patched_embed_texts", "patched_chunk_text")
async def test_embedding_written_to_db(
db_session,
db_search_space,
@ -139,7 +128,7 @@ async def test_embedding_written_to_db(
document = prepared[0]
document_id = document.id
await service.index(document, connector_doc, llm=mocker.Mock())
await service.index(document, connector_doc)
result = await db_session.execute(
select(Document).filter(Document.id == document_id)
@ -150,9 +139,7 @@ async def test_embedding_written_to_db(
assert len(reloaded.embedding) == _EMBEDDING_DIM
@pytest.mark.usefixtures(
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
)
@pytest.mark.usefixtures("patched_embed_texts", "patched_chunk_text")
async def test_updated_at_advances_after_indexing(
db_session,
db_search_space,
@ -172,7 +159,7 @@ async def test_updated_at_advances_after_indexing(
)
updated_at_pending = result.scalars().first().updated_at
await service.index(document, connector_doc, llm=mocker.Mock())
await service.index(document, connector_doc)
result = await db_session.execute(
select(Document).filter(Document.id == document_id)
@ -182,18 +169,15 @@ async def test_updated_at_advances_after_indexing(
assert updated_at_ready > updated_at_pending
@pytest.mark.usefixtures(
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
)
@pytest.mark.usefixtures("patched_embed_texts", "patched_chunk_text")
async def test_no_llm_falls_back_to_source_markdown(
db_session,
db_search_space,
make_connector_document,
):
"""When llm=None and no fallback_summary, content falls back to source_markdown."""
"""Content stays deterministic source markdown without an LLM."""
connector_doc = make_connector_document(
search_space_id=db_search_space.id,
should_summarize=True,
source_markdown="## Fallback content",
)
service = IndexingPipelineService(session=db_session)
@ -202,7 +186,7 @@ async def test_no_llm_falls_back_to_source_markdown(
document = prepared[0]
document_id = document.id
await service.index(document, connector_doc, llm=None)
await service.index(document, connector_doc)
result = await db_session.execute(
select(Document).filter(Document.id == document_id)
@ -213,27 +197,23 @@ async def test_no_llm_falls_back_to_source_markdown(
assert reloaded.content == "## Fallback content"
@pytest.mark.usefixtures(
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
)
async def test_fallback_summary_used_when_llm_unavailable(
@pytest.mark.usefixtures("patched_embed_texts", "patched_chunk_text")
async def test_source_markdown_used_without_preview(
db_session,
db_search_space,
make_connector_document,
):
"""fallback_summary is used as content when llm=None and should_summarize=True."""
"""Source markdown is used without fallback preview fields."""
connector_doc = make_connector_document(
search_space_id=db_search_space.id,
should_summarize=True,
source_markdown="## Full raw content",
fallback_summary="Short pre-built summary.",
)
service = IndexingPipelineService(session=db_session)
prepared = await service.prepare_for_indexing([connector_doc])
document_id = prepared[0].id
await service.index(prepared[0], connector_doc, llm=None)
await service.index(prepared[0], connector_doc)
result = await db_session.execute(
select(Document).filter(Document.id == document_id)
@ -241,12 +221,10 @@ async def test_fallback_summary_used_when_llm_unavailable(
reloaded = result.scalars().first()
assert DocumentStatus.is_state(reloaded.status, DocumentStatus.READY)
assert reloaded.content == "Short pre-built summary."
assert reloaded.content == "## Full raw content"
@pytest.mark.usefixtures(
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
)
@pytest.mark.usefixtures("patched_embed_texts", "patched_chunk_text")
async def test_reindex_replaces_old_chunks(
db_session,
db_search_space,
@ -264,14 +242,14 @@ async def test_reindex_replaces_old_chunks(
document = prepared[0]
document_id = document.id
await service.index(document, connector_doc, llm=mocker.Mock())
await service.index(document, connector_doc)
updated_doc = make_connector_document(
search_space_id=db_search_space.id,
source_markdown="## v2",
)
re_prepared = await service.prepare_for_indexing([updated_doc])
await service.index(re_prepared[0], updated_doc, llm=mocker.Mock())
await service.index(re_prepared[0], updated_doc)
result = await db_session.execute(
select(Chunk).filter(Chunk.document_id == document_id)
@ -281,16 +259,14 @@ async def test_reindex_replaces_old_chunks(
assert len(chunks) == 1
@pytest.mark.usefixtures(
"patched_summarize_raises", "patched_embed_texts", "patched_chunk_text"
)
async def test_llm_error_sets_status_failed(
@pytest.mark.usefixtures("patched_embed_texts_raises", "patched_chunk_text")
async def test_embedding_error_sets_status_failed(
db_session,
db_search_space,
make_connector_document,
mocker,
):
"""Document status is FAILED when the LLM raises during indexing."""
"""Document status is FAILED when embedding raises during indexing."""
connector_doc = make_connector_document(search_space_id=db_search_space.id)
service = IndexingPipelineService(session=db_session)
@ -298,7 +274,7 @@ async def test_llm_error_sets_status_failed(
document = prepared[0]
document_id = document.id
await service.index(document, connector_doc, llm=mocker.Mock())
await service.index(document, connector_doc)
result = await db_session.execute(
select(Document).filter(Document.id == document_id)
@ -308,10 +284,8 @@ async def test_llm_error_sets_status_failed(
assert DocumentStatus.is_state(reloaded.status, DocumentStatus.FAILED)
@pytest.mark.usefixtures(
"patched_summarize_raises", "patched_embed_texts", "patched_chunk_text"
)
async def test_llm_error_leaves_no_partial_data(
@pytest.mark.usefixtures("patched_embed_texts_raises", "patched_chunk_text")
async def test_embedding_error_leaves_no_partial_data(
db_session,
db_search_space,
make_connector_document,
@ -325,7 +299,7 @@ async def test_llm_error_leaves_no_partial_data(
document = prepared[0]
document_id = document.id
await service.index(document, connector_doc, llm=mocker.Mock())
await service.index(document, connector_doc)
result = await db_session.execute(
select(Document).filter(Document.id == document_id)

View file

@ -21,7 +21,6 @@ from app.db import (
pytestmark = pytest.mark.integration
UNIFIED_FIXTURES = (
"patched_summarize",
"patched_embed_texts",
"patched_chunk_text",
)
@ -787,7 +786,7 @@ class TestPipelineIntegration:
assert len(prepared) == 1
db_doc = prepared[0]
result = await service.index(db_doc, doc, llm=mocker.Mock())
result = await service.index(db_doc, doc)
assert result is not None
docs = (
@ -1272,7 +1271,7 @@ class TestIndexingProgressFlag:
original_index = IndexingPipelineService.index
flag_observed = []
async def patched_index(self_pipe, document, connector_doc, llm):
async def patched_index(self_pipe, document, connector_doc):
folder = (
await db_session.execute(
select(Folder).where(
@ -1284,7 +1283,7 @@ class TestIndexingProgressFlag:
if folder:
meta = folder.folder_metadata or {}
flag_observed.append(meta.get("indexing_in_progress", False))
return await original_index(self_pipe, document, connector_doc, llm)
return await original_index(self_pipe, document, connector_doc)
IndexingPipelineService.index = patched_index
try:

View file

@ -24,8 +24,6 @@ def _onedrive_doc(
search_space_id=search_space_id,
connector_id=connector_id,
created_by_id=user_id,
should_summarize=True,
fallback_summary=f"File: {unique_id}.docx",
metadata={
"onedrive_file_id": unique_id,
"onedrive_file_name": f"{unique_id}.docx",
@ -35,7 +33,7 @@ def _onedrive_doc(
@pytest.mark.usefixtures(
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
"patched_embed_texts", "patched_chunk_text"
)
async def test_onedrive_pipeline_creates_ready_document(
db_session, db_search_space, db_connector, db_user, mocker
@ -53,7 +51,7 @@ async def test_onedrive_pipeline_creates_ready_document(
prepared = await service.prepare_for_indexing([doc])
assert len(prepared) == 1
await service.index(prepared[0], doc, llm=mocker.Mock())
await service.index(prepared[0], doc)
result = await db_session.execute(
select(Document).filter(Document.search_space_id == space_id)
@ -66,7 +64,7 @@ async def test_onedrive_pipeline_creates_ready_document(
@pytest.mark.usefixtures(
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
"patched_embed_texts", "patched_chunk_text"
)
async def test_onedrive_duplicate_content_skipped(
db_session, db_search_space, db_connector, db_user, mocker
@ -86,7 +84,7 @@ async def test_onedrive_duplicate_content_skipped(
prepared = await service.prepare_for_indexing([doc])
assert len(prepared) == 1
await service.index(prepared[0], doc, llm=mocker.Mock())
await service.index(prepared[0], doc)
result = await db_session.execute(
select(Document).filter(Document.search_space_id == space_id)

View file

@ -33,7 +33,7 @@ async def test_new_document_is_persisted_with_pending_status(
@pytest.mark.usefixtures(
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
"patched_embed_texts", "patched_chunk_text"
)
async def test_unchanged_ready_document_is_skipped(
db_session,
@ -47,7 +47,7 @@ async def test_unchanged_ready_document_is_skipped(
# Index fully so the document reaches ready state
prepared = await service.prepare_for_indexing([doc])
await service.index(prepared[0], doc, llm=mocker.Mock())
await service.index(prepared[0], doc)
# Same content on the next run — a ready document must be skipped
results = await service.prepare_for_indexing([doc])
@ -56,7 +56,7 @@ async def test_unchanged_ready_document_is_skipped(
@pytest.mark.usefixtures(
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
"patched_embed_texts", "patched_chunk_text"
)
async def test_title_only_change_updates_title_in_db(
db_session,
@ -72,7 +72,7 @@ async def test_title_only_change_updates_title_in_db(
prepared = await service.prepare_for_indexing([original])
document_id = prepared[0].id
await service.index(prepared[0], original, llm=mocker.Mock())
await service.index(prepared[0], original)
renamed = make_connector_document(
search_space_id=db_search_space.id, title="Updated Title"
@ -338,9 +338,7 @@ async def test_same_content_from_different_source_is_skipped(
assert len(result.scalars().all()) == 1
@pytest.mark.usefixtures(
"patched_summarize_raises", "patched_embed_texts", "patched_chunk_text"
)
@pytest.mark.usefixtures("patched_embed_texts_raises", "patched_chunk_text")
async def test_failed_document_with_unchanged_content_is_requeued(
db_session,
db_search_space,
@ -351,10 +349,10 @@ async def test_failed_document_with_unchanged_content_is_requeued(
doc = make_connector_document(search_space_id=db_search_space.id)
service = IndexingPipelineService(session=db_session)
# First run: document is created and indexing crashes → status = failed
# First run: document is created and indexing crashes, so status becomes failed.
prepared = await service.prepare_for_indexing([doc])
document_id = prepared[0].id
await service.index(prepared[0], doc, llm=mocker.Mock())
await service.index(prepared[0], doc)
result = await db_session.execute(
select(Document).filter(Document.id == document_id)

View file

@ -101,7 +101,7 @@ async def test_generate_resume_defaults_to_one_page_target(monkeypatch) -> None:
llm = SimpleNamespace(ainvoke=AsyncMock(side_effect=_llm_invoke))
monkeypatch.setattr(
resume_tool,
"get_document_summary_llm",
"get_agent_llm",
AsyncMock(return_value=llm),
)
monkeypatch.setattr(resume_tool, "_compile_typst", lambda _source: b"pdf")
@ -130,7 +130,7 @@ async def test_generate_resume_compresses_when_over_limit(monkeypatch) -> None:
llm = SimpleNamespace(ainvoke=AsyncMock(side_effect=responses))
monkeypatch.setattr(
resume_tool,
"get_document_summary_llm",
"get_agent_llm",
AsyncMock(return_value=llm),
)
monkeypatch.setattr(resume_tool, "_compile_typst", lambda _source: b"pdf")
@ -165,7 +165,7 @@ async def test_generate_resume_returns_ready_when_target_not_met(monkeypatch) ->
llm = SimpleNamespace(ainvoke=AsyncMock(side_effect=responses))
monkeypatch.setattr(
resume_tool,
"get_document_summary_llm",
"get_agent_llm",
AsyncMock(return_value=llm),
)
monkeypatch.setattr(resume_tool, "_compile_typst", lambda _source: b"pdf")
@ -198,7 +198,7 @@ async def test_generate_resume_fails_when_hard_limit_exceeded(monkeypatch) -> No
llm = SimpleNamespace(ainvoke=AsyncMock(side_effect=responses))
monkeypatch.setattr(
resume_tool,
"get_document_summary_llm",
"get_agent_llm",
AsyncMock(return_value=llm),
)
monkeypatch.setattr(resume_tool, "_compile_typst", lambda _source: b"pdf")

View file

@ -14,8 +14,8 @@ from typing import Any
import pytest
import app.automations.actions.agent_task.dependencies as deps_mod
from app.automations.actions.agent_task.dependencies import (
import app.automations.actions.builtin.agent_task.dependencies as deps_mod
from app.automations.actions.builtin.agent_task.dependencies import (
DependencyError,
build_dependencies,
)

View file

@ -71,7 +71,6 @@ async def test_build_connector_doc_produces_correct_fields():
connector_id=_CONNECTOR_ID,
search_space_id=_SEARCH_SPACE_ID,
user_id=_USER_ID,
enable_summary=True,
)
assert doc.title == "Engineering Handbook"
@ -81,7 +80,6 @@ async def test_build_connector_doc_produces_correct_fields():
assert doc.search_space_id == _SEARCH_SPACE_ID
assert doc.connector_id == _CONNECTOR_ID
assert doc.created_by_id == _USER_ID
assert doc.should_summarize is True
assert doc.metadata["page_id"] == "abc-123"
assert doc.metadata["page_title"] == "Engineering Handbook"
assert doc.metadata["space_id"] == "ENG"
@ -89,21 +87,6 @@ async def test_build_connector_doc_produces_correct_fields():
assert doc.metadata["connector_id"] == _CONNECTOR_ID
assert doc.metadata["document_type"] == "Confluence Page"
assert doc.metadata["connector_type"] == "Confluence"
assert doc.fallback_summary is not None
assert "Engineering Handbook" in doc.fallback_summary
assert markdown in doc.fallback_summary
async def test_build_connector_doc_summary_disabled():
doc = _build_connector_doc(
_make_page(),
_to_markdown(_make_page()),
connector_id=_CONNECTOR_ID,
search_space_id=_SEARCH_SPACE_ID,
user_id=_USER_ID,
enable_summary=False,
)
assert doc.should_summarize is False
# ---------------------------------------------------------------------------
@ -111,10 +94,9 @@ async def test_build_connector_doc_summary_disabled():
# ---------------------------------------------------------------------------
def _mock_connector(enable_summary: bool = True):
def _mock_connector():
c = MagicMock()
c.config = {"access_token": "tok"}
c.enable_summary = enable_summary
c.last_indexed_at = None
return c

View file

@ -71,7 +71,6 @@ async def test_single_file_returns_one_connector_document(
connector_id=_CONNECTOR_ID,
search_space_id=_SEARCH_SPACE_ID,
user_id=_USER_ID,
enable_summary=True,
)
assert len(docs) == 1
@ -97,7 +96,6 @@ async def test_multiple_files_all_produce_documents(
connector_id=_CONNECTOR_ID,
search_space_id=_SEARCH_SPACE_ID,
user_id=_USER_ID,
enable_summary=True,
)
assert len(docs) == 3
@ -125,7 +123,6 @@ async def test_one_download_exception_does_not_block_others(
connector_id=_CONNECTOR_ID,
search_space_id=_SEARCH_SPACE_ID,
user_id=_USER_ID,
enable_summary=True,
)
assert len(docs) == 2
@ -152,7 +149,6 @@ async def test_etl_error_counts_as_download_failure(
connector_id=_CONNECTOR_ID,
search_space_id=_SEARCH_SPACE_ID,
user_id=_USER_ID,
enable_summary=True,
)
assert len(docs) == 1
@ -191,7 +187,6 @@ async def test_concurrency_bounded_by_semaphore(
connector_id=_CONNECTOR_ID,
search_space_id=_SEARCH_SPACE_ID,
user_id=_USER_ID,
enable_summary=True,
max_concurrency=2,
)
@ -231,7 +226,6 @@ async def test_heartbeat_fires_during_parallel_downloads(
connector_id=_CONNECTOR_ID,
search_space_id=_SEARCH_SPACE_ID,
user_id=_USER_ID,
enable_summary=True,
on_heartbeat=_on_heartbeat,
)
@ -324,7 +318,6 @@ async def _run_full_scan(mocks, monkeypatch, page_files, *, max_files=500):
mocks["task_logger"],
mocks["log_entry"],
max_files,
enable_summary=True,
)
@ -434,7 +427,6 @@ async def _run_selected(mocks, file_tuples):
connector_id=_CONNECTOR_ID,
search_space_id=_SEARCH_SPACE_ID,
user_id=_USER_ID,
enable_summary=True,
)
@ -569,7 +561,6 @@ async def test_delta_sync_deletions_call_remove_document(monkeypatch):
mock_task_logger,
MagicMock(),
max_files=500,
enable_summary=True,
)
assert sorted(remove_calls) == ["id:del1", "id:del2"]
@ -608,7 +599,6 @@ async def test_delta_sync_upserts_filtered_and_downloaded(monkeypatch):
mock_task_logger,
MagicMock(),
max_files=500,
enable_summary=True,
)
assert indexed == 2
@ -670,7 +660,6 @@ async def test_delta_sync_mix_deletions_and_upserts(monkeypatch):
mock_task_logger,
MagicMock(),
max_files=500,
enable_summary=True,
)
assert sorted(remove_calls) == ["id:del1", "id:del2"]
@ -704,7 +693,6 @@ async def test_delta_sync_returns_new_cursor(monkeypatch):
mock_task_logger,
MagicMock(),
max_files=500,
enable_summary=True,
)
assert cursor == "brand-new-cursor-xyz"
@ -725,7 +713,7 @@ def orchestrator_mocks(monkeypatch):
mock_connector = MagicMock()
mock_connector.config = {"_token_encrypted": False}
mock_connector.last_indexed_at = None
mock_connector.enable_summary = True
mock_connector.enable_vision_llm = True
monkeypatch.setattr(
_mod,

View file

@ -66,7 +66,6 @@ async def test_single_file_returns_one_connector_document(
connector_id=_CONNECTOR_ID,
search_space_id=_SEARCH_SPACE_ID,
user_id=_USER_ID,
enable_summary=True,
)
assert len(docs) == 1
@ -91,7 +90,6 @@ async def test_multiple_files_all_produce_documents(
connector_id=_CONNECTOR_ID,
search_space_id=_SEARCH_SPACE_ID,
user_id=_USER_ID,
enable_summary=True,
)
assert len(docs) == 3
@ -119,7 +117,6 @@ async def test_one_download_exception_does_not_block_others(
connector_id=_CONNECTOR_ID,
search_space_id=_SEARCH_SPACE_ID,
user_id=_USER_ID,
enable_summary=True,
)
assert len(docs) == 2
@ -146,7 +143,6 @@ async def test_etl_error_counts_as_download_failure(
connector_id=_CONNECTOR_ID,
search_space_id=_SEARCH_SPACE_ID,
user_id=_USER_ID,
enable_summary=True,
)
assert len(docs) == 1
@ -186,7 +182,6 @@ async def test_concurrency_bounded_by_semaphore(
connector_id=_CONNECTOR_ID,
search_space_id=_SEARCH_SPACE_ID,
user_id=_USER_ID,
enable_summary=True,
max_concurrency=2,
)
@ -226,7 +221,6 @@ async def test_heartbeat_fires_during_parallel_downloads(
connector_id=_CONNECTOR_ID,
search_space_id=_SEARCH_SPACE_ID,
user_id=_USER_ID,
enable_summary=True,
on_heartbeat=_on_heartbeat,
)
@ -300,12 +294,6 @@ def full_scan_mocks(mock_drive_client, monkeypatch):
MagicMock(return_value=pipeline_mock),
)
monkeypatch.setattr(
_mod,
"get_user_long_context_llm",
AsyncMock(return_value=MagicMock()),
)
return {
"drive_client": mock_drive_client,
"session": mock_session,
@ -333,7 +321,6 @@ async def _run_full_scan(mocks, *, max_files=500, include_subfolders=False):
mocks["log_entry"],
max_files,
include_subfolders=include_subfolders,
enable_summary=True,
)
@ -487,12 +474,6 @@ async def test_delta_sync_removals_serial_rest_parallel(monkeypatch):
"IndexingPipelineService",
MagicMock(return_value=pipeline_mock),
)
monkeypatch.setattr(
_mod,
"get_user_long_context_llm",
AsyncMock(return_value=MagicMock()),
)
mock_session, _ = _make_page_limit_session()
mock_task_logger = MagicMock()
mock_task_logger.log_task_progress = AsyncMock()
@ -509,7 +490,6 @@ async def test_delta_sync_removals_serial_rest_parallel(monkeypatch):
mock_task_logger,
MagicMock(),
max_files=500,
enable_summary=True,
)
assert sorted(remove_calls) == ["del1", "del2", "trash1"]
@ -577,7 +557,6 @@ async def _run_selected(mocks, file_ids):
connector_id=_CONNECTOR_ID,
search_space_id=_SEARCH_SPACE_ID,
user_id=_USER_ID,
enable_summary=True,
)

View file

@ -70,7 +70,6 @@ async def test_build_connector_doc_produces_correct_fields():
connector_id=_CONNECTOR_ID,
search_space_id=_SEARCH_SPACE_ID,
user_id=_USER_ID,
enable_summary=True,
)
assert doc.title == "ENG-42: Fix login bug"
@ -80,7 +79,6 @@ async def test_build_connector_doc_produces_correct_fields():
assert doc.search_space_id == _SEARCH_SPACE_ID
assert doc.connector_id == _CONNECTOR_ID
assert doc.created_by_id == _USER_ID
assert doc.should_summarize is True
assert doc.metadata["issue_id"] == "abc-123"
assert doc.metadata["issue_identifier"] == "ENG-42"
assert doc.metadata["issue_title"] == "Fix login bug"
@ -90,24 +88,6 @@ async def test_build_connector_doc_produces_correct_fields():
assert doc.metadata["connector_id"] == _CONNECTOR_ID
assert doc.metadata["document_type"] == "Linear Issue"
assert doc.metadata["connector_type"] == "Linear"
assert doc.fallback_summary is not None
assert "ENG-42" in doc.fallback_summary
assert markdown in doc.fallback_summary
async def test_build_connector_doc_summary_disabled():
"""When enable_summary is False, should_summarize is False."""
doc = _build_connector_doc(
_make_issue(),
_make_formatted_issue(),
"# content",
connector_id=_CONNECTOR_ID,
search_space_id=_SEARCH_SPACE_ID,
user_id=_USER_ID,
enable_summary=False,
)
assert doc.should_summarize is False
# ---------------------------------------------------------------------------
@ -115,10 +95,9 @@ async def test_build_connector_doc_summary_disabled():
# ---------------------------------------------------------------------------
def _mock_connector(enable_summary: bool = True):
def _mock_connector():
c = MagicMock()
c.config = {"access_token": "tok"}
c.enable_summary = enable_summary
c.last_indexed_at = None
return c

View file

@ -41,7 +41,6 @@ async def test_build_connector_doc_produces_correct_fields():
connector_id=_CONNECTOR_ID,
search_space_id=_SEARCH_SPACE_ID,
user_id=_USER_ID,
enable_summary=True,
)
assert doc.title == "My Notion Page"
@ -51,29 +50,11 @@ async def test_build_connector_doc_produces_correct_fields():
assert doc.search_space_id == _SEARCH_SPACE_ID
assert doc.connector_id == _CONNECTOR_ID
assert doc.created_by_id == _USER_ID
assert doc.should_summarize is True
assert doc.metadata["page_title"] == "My Notion Page"
assert doc.metadata["page_id"] == "abc-123"
assert doc.metadata["connector_id"] == _CONNECTOR_ID
assert doc.metadata["document_type"] == "Notion Page"
assert doc.metadata["connector_type"] == "Notion"
assert doc.fallback_summary is not None
assert "My Notion Page" in doc.fallback_summary
assert markdown in doc.fallback_summary
async def test_build_connector_doc_summary_disabled():
"""When enable_summary is False, should_summarize is False."""
doc = _build_connector_doc(
_make_page(),
"# content",
connector_id=_CONNECTOR_ID,
search_space_id=_SEARCH_SPACE_ID,
user_id=_USER_ID,
enable_summary=False,
)
assert doc.should_summarize is False
# ---------------------------------------------------------------------------
@ -81,10 +62,9 @@ async def test_build_connector_doc_summary_disabled():
# ---------------------------------------------------------------------------
def _mock_connector(enable_summary: bool = True):
def _mock_connector():
c = MagicMock()
c.config = {"access_token": "tok"}
c.enable_summary = enable_summary
c.last_indexed_at = None
return c

View file

@ -65,7 +65,6 @@ async def test_single_file_returns_one_connector_document(
connector_id=_CONNECTOR_ID,
search_space_id=_SEARCH_SPACE_ID,
user_id=_USER_ID,
enable_summary=True,
)
assert len(docs) == 1
@ -91,7 +90,6 @@ async def test_multiple_files_all_produce_documents(
connector_id=_CONNECTOR_ID,
search_space_id=_SEARCH_SPACE_ID,
user_id=_USER_ID,
enable_summary=True,
)
assert len(docs) == 3
@ -119,7 +117,6 @@ async def test_one_download_exception_does_not_block_others(
connector_id=_CONNECTOR_ID,
search_space_id=_SEARCH_SPACE_ID,
user_id=_USER_ID,
enable_summary=True,
)
assert len(docs) == 2
@ -146,7 +143,6 @@ async def test_etl_error_counts_as_download_failure(
connector_id=_CONNECTOR_ID,
search_space_id=_SEARCH_SPACE_ID,
user_id=_USER_ID,
enable_summary=True,
)
assert len(docs) == 1
@ -185,7 +181,6 @@ async def test_concurrency_bounded_by_semaphore(
connector_id=_CONNECTOR_ID,
search_space_id=_SEARCH_SPACE_ID,
user_id=_USER_ID,
enable_summary=True,
max_concurrency=2,
)
@ -225,7 +220,6 @@ async def test_heartbeat_fires_during_parallel_downloads(
connector_id=_CONNECTOR_ID,
search_space_id=_SEARCH_SPACE_ID,
user_id=_USER_ID,
enable_summary=True,
on_heartbeat=_on_heartbeat,
)

View file

@ -180,7 +180,6 @@ async def _run_gdrive_selected(mocks, file_ids):
connector_id=_CONNECTOR_ID,
search_space_id=_SEARCH_SPACE_ID,
user_id=_USER_ID,
enable_summary=True,
)
@ -336,10 +335,6 @@ def gdrive_full_scan_mocks(monkeypatch):
monkeypatch.setattr(
_mod, "IndexingPipelineService", MagicMock(return_value=pipeline_mock)
)
monkeypatch.setattr(
_mod, "get_user_long_context_llm", AsyncMock(return_value=MagicMock())
)
return {
"mod": _mod,
"session": session,
@ -366,7 +361,6 @@ async def _run_gdrive_full_scan(mocks, max_files=500):
MagicMock(),
max_files,
include_subfolders=False,
enable_summary=True,
)
@ -454,10 +448,6 @@ async def test_gdrive_delta_sync_skips_over_quota(monkeypatch):
monkeypatch.setattr(
_mod, "IndexingPipelineService", MagicMock(return_value=pipeline_mock)
)
monkeypatch.setattr(
_mod, "get_user_long_context_llm", AsyncMock(return_value=MagicMock())
)
mock_task_logger = MagicMock()
mock_task_logger.log_task_progress = AsyncMock()
@ -473,7 +463,6 @@ async def test_gdrive_delta_sync_skips_over_quota(monkeypatch):
mock_task_logger,
MagicMock(),
max_files=500,
enable_summary=True,
)
call_files = download_mock.call_args[0][1]
@ -539,7 +528,6 @@ async def _run_onedrive_selected(mocks, file_ids):
connector_id=_CONNECTOR_ID,
search_space_id=_SEARCH_SPACE_ID,
user_id=_USER_ID,
enable_summary=True,
)
@ -641,7 +629,6 @@ async def _run_dropbox_selected(mocks, file_paths):
connector_id=_CONNECTOR_ID,
search_space_id=_SEARCH_SPACE_ID,
user_id=_USER_ID,
enable_summary=True,
)

View file

@ -69,6 +69,13 @@ def _signed_slack_request(payload: dict, *, secret: str = "signing-secret") -> R
)
def _enable_slack_gateway(monkeypatch):
monkeypatch.setattr(routes.config, "GATEWAY_SLACK_ENABLED", True)
monkeypatch.setattr(routes.config, "GATEWAY_SLACK_CLIENT_ID", "client-id")
monkeypatch.setattr(routes.config, "GATEWAY_SLACK_CLIENT_SECRET", "client-secret")
monkeypatch.setattr(routes.config, "GATEWAY_SLACK_SIGNING_SECRET", "signing-secret")
async def _call_webhook(*, request: RequestStub, account_id: int, session):
return await routes.telegram_webhook(
request=request,
@ -207,7 +214,7 @@ def test_verify_slack_signature_accepts_valid_signature():
@pytest.mark.asyncio
async def test_slack_webhook_url_verification(monkeypatch, mocker):
monkeypatch.setattr(routes.config, "GATEWAY_SLACK_SIGNING_SECRET", "signing-secret")
_enable_slack_gateway(monkeypatch)
request = _signed_slack_request({"type": "url_verification", "challenge": "abc123"})
response = await routes.slack_webhook(request=request, session=mocker.AsyncMock())
@ -218,7 +225,7 @@ async def test_slack_webhook_url_verification(monkeypatch, mocker):
@pytest.mark.asyncio
async def test_slack_webhook_persists_event(monkeypatch, mocker):
monkeypatch.setattr(routes.config, "GATEWAY_SLACK_SIGNING_SECRET", "signing-secret")
_enable_slack_gateway(monkeypatch)
session = mocker.AsyncMock()
monkeypatch.setattr(routes, "get_slack_account_by_team", mocker.AsyncMock(return_value=_slack_account()))
persist = mocker.AsyncMock(return_value=100)
@ -248,7 +255,7 @@ async def test_slack_webhook_persists_event(monkeypatch, mocker):
@pytest.mark.asyncio
async def test_slack_webhook_ignores_self_event(monkeypatch, mocker):
monkeypatch.setattr(routes.config, "GATEWAY_SLACK_SIGNING_SECRET", "signing-secret")
_enable_slack_gateway(monkeypatch)
session = mocker.AsyncMock()
monkeypatch.setattr(routes, "get_slack_account_by_team", mocker.AsyncMock(return_value=_slack_account()))
persist = mocker.AsyncMock(return_value=100)
@ -275,7 +282,7 @@ async def test_slack_webhook_ignores_self_event(monkeypatch, mocker):
@pytest.mark.asyncio
async def test_discord_gateway_install_returns_oauth_url(monkeypatch):
async def test_discord_gateway_install_returns_oauth_url(monkeypatch, mocker):
monkeypatch.setattr(routes.config, "DISCORD_CLIENT_ID", "discord-client")
monkeypatch.setattr(
routes.config,
@ -283,10 +290,12 @@ async def test_discord_gateway_install_returns_oauth_url(monkeypatch):
"http://localhost:8000/api/v1/gateway/discord/callback",
)
monkeypatch.setattr(routes.config, "SECRET_KEY", "test-secret")
monkeypatch.setattr(routes, "check_search_space_access", mocker.AsyncMock())
response = await routes.install_discord_gateway(
search_space_id=123,
user=SimpleNamespace(id="00000000-0000-0000-0000-000000000001"),
session=mocker.AsyncMock(),
)
assert response["auth_url"].startswith("https://discord.com/api/oauth2/authorize?")

View file

@ -18,7 +18,6 @@ def test_valid_document_created_with_required_fields():
connector_id=42,
created_by_id="00000000-0000-0000-0000-000000000001",
)
assert doc.should_summarize is True
assert doc.should_use_code_chunker is False
assert doc.metadata == {}
assert doc.connector_id == 42

View file

@ -1,41 +0,0 @@
from unittest.mock import MagicMock
import pytest
from app.indexing_pipeline.document_summarizer import summarize_document
pytestmark = pytest.mark.unit
@pytest.mark.usefixtures("patched_summarizer_chain")
async def test_without_metadata_returns_raw_summary():
"""Summarizer returns the LLM output directly when no metadata is provided."""
result = await summarize_document("# Content", llm=MagicMock(model="gpt-4"))
assert result == "The summary."
@pytest.mark.usefixtures("patched_summarizer_chain")
async def test_with_metadata_includes_metadata_values_in_output():
"""Non-empty metadata values are prepended to the summary output."""
result = await summarize_document(
"# Content",
llm=MagicMock(model="gpt-4"),
metadata={"author": "Alice", "source": "Notion"},
)
assert "Alice" in result
assert "Notion" in result
@pytest.mark.usefixtures("patched_summarizer_chain")
async def test_with_metadata_omits_empty_fields_from_output():
"""Empty metadata fields are omitted from the summary output."""
result = await summarize_document(
"# Content",
llm=MagicMock(model="gpt-4"),
metadata={"author": "Alice", "description": ""},
)
assert "Alice" in result
assert "description" not in result.lower()

View file

@ -37,12 +37,10 @@ async def test_calls_prepare_then_index_per_document(pipeline, make_connector_do
orm2 = MagicMock(spec=Document)
orm2.unique_identifier_hash = compute_unique_identifier_hash(doc2)
mock_llm = MagicMock()
pipeline.prepare_for_indexing = AsyncMock(return_value=[orm1, orm2])
pipeline.index = AsyncMock(side_effect=lambda doc, cdoc, llm: doc)
pipeline.index = AsyncMock(side_effect=lambda doc, cdoc: doc)
results = await pipeline.index_batch([doc1, doc2], mock_llm)
results = await pipeline.index_batch([doc1, doc2])
pipeline.prepare_for_indexing.assert_awaited_once_with([doc1, doc2])
assert pipeline.index.await_count == 2
@ -53,7 +51,7 @@ async def test_empty_input_returns_empty(pipeline):
"""Empty connector_docs list returns empty result."""
pipeline.prepare_for_indexing = AsyncMock(return_value=[])
results = await pipeline.index_batch([], MagicMock())
results = await pipeline.index_batch([])
assert results == []
@ -74,7 +72,7 @@ async def test_skips_document_without_matching_connector_doc(
pipeline.prepare_for_indexing = AsyncMock(return_value=[orphan_orm])
pipeline.index = AsyncMock()
results = await pipeline.index_batch([doc1], MagicMock())
results = await pipeline.index_batch([doc1])
pipeline.index.assert_not_awaited()
assert results == []

View file

@ -51,11 +51,6 @@ async def test_index_calls_embed_and_chunk_via_to_thread(
return await original_to_thread(func, *args, **kwargs)
monkeypatch.setattr(asyncio, "to_thread", tracking_to_thread)
monkeypatch.setattr(
"app.indexing_pipeline.indexing_pipeline_service.summarize_document",
AsyncMock(return_value="Summary."),
)
mock_chunk_hybrid = MagicMock(return_value=["chunk1"])
mock_chunk_hybrid.__name__ = "chunk_text_hybrid"
monkeypatch.setattr(
@ -85,7 +80,7 @@ async def test_index_calls_embed_and_chunk_via_to_thread(
document.id = 1
document.status = DocumentStatus.pending()
await pipeline.index(document, connector_doc, llm=MagicMock())
await pipeline.index(document, connector_doc)
# Either chunker entry point satisfies the "chunking runs off the event
# loop" contract this test guards. Routing between the two is verified
@ -104,10 +99,6 @@ async def test_non_code_documents_use_hybrid_chunker(
mid-row. Only documents flagged with ``should_use_code_chunker=True``
should take the ``chunk_text`` path.
"""
monkeypatch.setattr(
"app.indexing_pipeline.indexing_pipeline_service.summarize_document",
AsyncMock(return_value="Summary."),
)
mock_chunk_hybrid = MagicMock(return_value=["chunk1"])
mock_chunk_hybrid.__name__ = "chunk_text_hybrid"
monkeypatch.setattr(
@ -139,7 +130,7 @@ async def test_non_code_documents_use_hybrid_chunker(
document.id = 1
document.status = DocumentStatus.pending()
await pipeline.index(document, connector_doc, llm=MagicMock())
await pipeline.index(document, connector_doc)
mock_chunk_hybrid.assert_called_once()
mock_chunk_code.assert_not_called()
@ -192,19 +183,14 @@ async def test_batch_parallel_indexes_all_documents(
index_calls = []
async def fake_index(self, document, connector_doc, llm):
async def fake_index(self, document, connector_doc):
index_calls.append(document.id)
document.status = DocumentStatus.ready()
return document
monkeypatch.setattr(IndexingPipelineService, "index", fake_index)
async def mock_get_llm(session):
return MagicMock()
_, indexed, failed = await pipeline.index_batch_parallel(
docs, mock_get_llm, max_concurrency=2
)
_, indexed, failed = await pipeline.index_batch_parallel(docs, max_concurrency=2)
assert indexed == 3
assert failed == 0
@ -233,20 +219,15 @@ async def test_batch_parallel_one_failure_does_not_affect_others(
_mock_session_factory(orm_by_id),
)
async def failing_index(self, document, connector_doc, llm):
async def failing_index(self, document, connector_doc):
if document.id == 2:
raise RuntimeError("LLM exploded")
raise RuntimeError("Indexing exploded")
document.status = DocumentStatus.ready()
return document
monkeypatch.setattr(IndexingPipelineService, "index", failing_index)
async def mock_get_llm(session):
return MagicMock()
_, indexed, failed = await pipeline.index_batch_parallel(
docs, mock_get_llm, max_concurrency=4
)
_, indexed, failed = await pipeline.index_batch_parallel(docs, max_concurrency=4)
assert indexed == 2
assert failed == 1

View file

@ -246,6 +246,8 @@ def test_new_chat_runtime_context_prefers_accepted_folder_ids() -> None:
mentioned_document_ids=[1, 2],
accepted_folder_ids=[10],
mentioned_folder_ids=[20, 30],
mentioned_connector_ids=None,
mentioned_connectors=None,
request_id="req",
turn_id="t1",
)
@ -263,6 +265,8 @@ def test_new_chat_runtime_context_falls_back_to_mentioned_folder_ids() -> None:
mentioned_document_ids=None,
accepted_folder_ids=[],
mentioned_folder_ids=[20, 30],
mentioned_connector_ids=None,
mentioned_connectors=None,
request_id=None,
turn_id="t2",
)

View file

@ -137,15 +137,14 @@ Notes:
- `--skip-unanswerable` (run) — drop unanswerable questions
- `--docs <a.pdf>,<b.pdf>` (run) — scope to specific docs
## Ingestion knobs (vision LLM, processing mode, summarize)
## Ingestion knobs (vision LLM, processing mode)
The harness exposes `POST /api/v1/documents/fileupload`'s three knobs on every `ingest` subcommand:
The harness exposes `POST /api/v1/documents/fileupload`'s ingest knobs on every `ingest` subcommand:
| Flag pair | Effect |
|--------------------------------------------|-----------------------------------------------------------------------------------------|
| `--use-vision-llm` / `--no-vision-llm` | Walk every embedded image in the PDF and inline image-derived text at the image's position (see below). |
| `--processing-mode {basic,premium}` | `premium` carries a 10× page multiplier and routes to a stronger ETL (e.g. LlamaCloud). |
| `--should-summarize` / `--no-summarize` | Generate a per-document summary at ingest. |
The "Default ingest" column in the benchmarks table is what runs if you don't pass any flag. Whatever was actually used is recorded as a `__settings__` header in the doc map (`data/<suite>/maps/<benchmark>_*_map.jsonl`) and as `extra.ingest_settings` in `run_artifact.json`, then surfaced in the report — no need to hunt through CLI history.

View file

@ -110,7 +110,6 @@ class DocumentsClient:
files: Iterable[Path],
*,
search_space_id: int,
should_summarize: bool = False,
use_vision_llm: bool = False,
processing_mode: str = "basic",
) -> FileUploadResult:
@ -149,7 +148,6 @@ class DocumentsClient:
f"{self._base}/api/v1/documents/fileupload",
data={
"search_space_id": str(search_space_id),
"should_summarize": "true" if should_summarize else "false",
"use_vision_llm": "true" if use_vision_llm else "false",
"processing_mode": processing_mode,
},

View file

@ -83,7 +83,6 @@ class LlmPreferences:
"""
agent_llm_id: int | None
document_summary_llm_id: int | None
image_generation_config_id: int | None
vision_llm_config_id: int | None
agent_llm: dict[str, Any] | None
@ -93,7 +92,6 @@ class LlmPreferences:
def from_payload(cls, payload: dict[str, Any]) -> LlmPreferences:
return cls(
agent_llm_id=payload.get("agent_llm_id"),
document_summary_llm_id=payload.get("document_summary_llm_id"),
image_generation_config_id=payload.get("image_generation_config_id"),
vision_llm_config_id=payload.get("vision_llm_config_id"),
agent_llm=payload.get("agent_llm"),
@ -154,7 +152,6 @@ class SearchSpaceClient:
search_space_id: int,
*,
agent_llm_id: int | None = None,
document_summary_llm_id: int | None = None,
image_generation_config_id: int | None = None,
vision_llm_config_id: int | None = None,
) -> LlmPreferences:
@ -167,8 +164,6 @@ class SearchSpaceClient:
body: dict[str, Any] = {}
if agent_llm_id is not None:
body["agent_llm_id"] = agent_llm_id
if document_summary_llm_id is not None:
body["document_summary_llm_id"] = document_summary_llm_id
if image_generation_config_id is not None:
body["image_generation_config_id"] = image_generation_config_id
if vision_llm_config_id is not None:

View file

@ -8,15 +8,13 @@ exactly three knobs (verified at
* ``processing_mode`` ``"basic"`` (default) | ``"premium"``
* ``use_vision_llm`` ``bool`` (run vision LLM during ingest to
extract image content / captions / tables)
* ``should_summarize`` ``bool`` (generate document summary)
This module gives every benchmark a uniform way to:
1. Receive sensible per-benchmark defaults (text-only benchmarks
default vision off; image-bearing benchmarks default vision on).
2. Accept CLI overrides (``--use-vision-llm`` / ``--no-vision-llm``,
``--processing-mode {basic,premium}``,
``--should-summarize`` / ``--no-summarize``).
``--processing-mode {basic,premium}``).
3. Persist the *actual* settings used into the doc-map manifest and
the run artifact so reports can show "vision=ON, mode=premium →
65% accuracy" head-to-head with "vision=OFF, mode=basic → 52%".
@ -71,13 +69,11 @@ class IngestSettings:
use_vision_llm: bool = False
processing_mode: str = "basic"
should_summarize: bool = False
def to_dict(self) -> dict[str, Any]:
return {
"use_vision_llm": self.use_vision_llm,
"processing_mode": self.processing_mode,
"should_summarize": self.should_summarize,
}
@classmethod
@ -87,14 +83,13 @@ class IngestSettings:
``opts`` is the kwargs dict built by ``core.cli`` from the
argparse namespace (see ``_cmd_ingest`` / ``_cmd_run``). Keys
we look for: ``use_vision_llm`` (bool or None), ``processing_mode``
(str or None), ``should_summarize`` (bool or None). Anything
(str or None). Anything
else is ignored so benchmarks can pass through their own opts.
"""
return cls(
use_vision_llm=_coerce_bool(opts.get("use_vision_llm"), defaults.use_vision_llm),
processing_mode=_coerce_mode(opts.get("processing_mode"), defaults.processing_mode),
should_summarize=_coerce_bool(opts.get("should_summarize"), defaults.should_summarize),
)
def render_label(self) -> str:
@ -102,8 +97,7 @@ class IngestSettings:
return (
f"vision={'on' if self.use_vision_llm else 'off'}, "
f"mode={self.processing_mode}, "
f"summarize={'on' if self.should_summarize else 'off'}"
f"mode={self.processing_mode}"
)
@ -179,14 +173,14 @@ def add_ingest_settings_args(
*,
defaults: IngestSettings,
) -> None:
"""Attach the three ingest-settings flag pairs to ``parser``.
"""Attach ingest-settings flags to ``parser``.
Each bool exposes a mutually exclusive ``--foo`` / ``--no-foo``
pair so an operator can flip either direction without restating
every flag. Default is ``None`` so that "operator didn't pass the
flag" is distinguishable from "operator explicitly passed false" —
``IngestSettings.merge`` then folds in the benchmark default
only when the operator was silent.
The vision bool exposes a mutually exclusive ``--foo`` / ``--no-foo``
pair so an operator can flip either direction without restating every
flag. Default is ``None`` so that "operator didn't pass the flag" is
distinguishable from "operator explicitly passed false" —
``IngestSettings.merge`` then folds in the benchmark default only when
the operator was silent.
"""
settings_group = parser.add_argument_group(
@ -217,18 +211,6 @@ def add_ingest_settings_args(
f"Default for this benchmark: {defaults.processing_mode!r}."
),
)
_add_bool_pair(
settings_group,
dest="should_summarize",
on_flag="--should-summarize",
off_flag="--no-summarize",
on_help=(
"Have SurfSense generate a document summary at ingest "
f"(default for this benchmark: "
f"{'on' if defaults.should_summarize else 'off'})."
),
off_help="Skip per-document summary generation.",
)
# ---------------------------------------------------------------------------
@ -292,10 +274,9 @@ def format_ingest_settings_md(settings: Any) -> str:
return "- SurfSense ingest settings: (not recorded — re-ingest to capture)"
vision = "on" if settings.get("use_vision_llm") else "off"
mode = settings.get("processing_mode") or "basic"
summarize = "on" if settings.get("should_summarize") else "off"
return (
f"- SurfSense ingest settings: vision_llm=`{vision}`, "
f"processing_mode=`{mode}`, summarize=`{summarize}`"
f"processing_mode=`{mode}`"
)

View file

@ -160,8 +160,7 @@ async def run_ingest(
upload_result = await docs_client.upload(
files=[b.path for b in batches],
search_space_id=ctx.search_space_id,
should_summarize=settings.should_summarize,
use_vision_llm=settings.use_vision_llm,
use_vision_llm=settings.use_vision_llm,
processing_mode=settings.processing_mode,
)
new_doc_ids = list(upload_result.document_ids)

View file

@ -63,7 +63,6 @@ _DESCRIPTION = "CUREv1 retrieval (single-arm SurfSense): Recall@k / MRR / nDCG@1
_DEFAULT_INGEST_SETTINGS = IngestSettings(
use_vision_llm=False,
processing_mode="basic",
should_summarize=False,
)

View file

@ -208,7 +208,6 @@ async def _upload_pdfs(
result = await docs_client.upload(
files=batch,
search_space_id=ctx.search_space_id,
should_summarize=settings.should_summarize,
use_vision_llm=settings.use_vision_llm,
processing_mode=settings.processing_mode,
)

View file

@ -169,7 +169,6 @@ _DESCRIPTION = (
_DEFAULT_INGEST_SETTINGS = IngestSettings(
use_vision_llm=True,
processing_mode="basic",
should_summarize=False,
)

View file

@ -480,7 +480,6 @@ async def run_ingest(
upload_result = await docs_client.upload(
files=[b.path for b in batches],
search_space_id=ctx.search_space_id,
should_summarize=settings.should_summarize,
use_vision_llm=settings.use_vision_llm,
processing_mode=settings.processing_mode,
)

View file

@ -48,7 +48,6 @@ _DESCRIPTION = "MIRAGE (7,663 medical MCQs) — single-arm SurfSense per-task ac
_DEFAULT_INGEST_SETTINGS = IngestSettings(
use_vision_llm=False,
processing_mode="basic",
should_summarize=False,
)

View file

@ -225,7 +225,6 @@ async def _upload_pdfs(
result = await docs_client.upload(
files=batch,
search_space_id=ctx.search_space_id,
should_summarize=settings.should_summarize,
use_vision_llm=settings.use_vision_llm,
processing_mode=settings.processing_mode,
)

View file

@ -178,7 +178,6 @@ _TEXT_ONLY_HINTS = ("gpt-5.4-mini", "gpt-3.5", "text-only", "instruct-")
_DEFAULT_INGEST_SETTINGS = IngestSettings(
use_vision_llm=True,
processing_mode="basic",
should_summarize=False,
)

Some files were not shown because too many files have changed in this diff Show more