mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-06 20:15:17 +02:00
Merge pull request #1467 from AnishSarkar22/feat/ui-fixes
feat: improve chat navigation, automation UI, and summary-free indexing
This commit is contained in:
commit
c2b8b3ac5e
197 changed files with 3331 additions and 4051 deletions
|
|
@ -54,6 +54,17 @@ USER_COLS = [
|
||||||
"premium_credit_micros_used",
|
"premium_credit_micros_used",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
AUTOMATION_RUN_COLS = [
|
||||||
|
"id",
|
||||||
|
"automation_id",
|
||||||
|
"trigger_id",
|
||||||
|
"status",
|
||||||
|
"step_results",
|
||||||
|
"started_at",
|
||||||
|
"finished_at",
|
||||||
|
"created_at",
|
||||||
|
]
|
||||||
|
|
||||||
def _has_zero_version(conn, table: str) -> bool:
|
def _has_zero_version(conn, table: str) -> bool:
|
||||||
return (
|
return (
|
||||||
conn.execute(
|
conn.execute(
|
||||||
|
|
@ -150,7 +161,8 @@ def _build_set_table_ddl(
|
||||||
f"new_chat_messages, "
|
f"new_chat_messages, "
|
||||||
f"chat_comments, "
|
f"chat_comments, "
|
||||||
f"chat_session_state, "
|
f"chat_session_state, "
|
||||||
f'"user" ({_cols(user_cols)})'
|
f'"user" ({_cols(user_cols)}), '
|
||||||
|
f"automation_runs ({_cols(AUTOMATION_RUN_COLS)})"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -523,7 +535,7 @@ def downgrade() -> None:
|
||||||
if exists:
|
if exists:
|
||||||
documents_has_zero_ver = _has_zero_version(conn, "documents")
|
documents_has_zero_ver = _has_zero_version(conn, "documents")
|
||||||
user_has_zero_ver = _has_zero_version(conn, "user")
|
user_has_zero_ver = _has_zero_version(conn, "user")
|
||||||
# Restore the publication shape from migration 143.
|
# Restore the publication shape from migration 148.
|
||||||
doc_cols = DOCUMENT_COLS + (['"_0_version"'] if documents_has_zero_ver else [])
|
doc_cols = DOCUMENT_COLS + (['"_0_version"'] if documents_has_zero_ver else [])
|
||||||
user_cols = USER_COLS + (['"_0_version"'] if user_has_zero_ver else [])
|
user_cols = USER_COLS + (['"_0_version"'] if user_has_zero_ver else [])
|
||||||
ddl = (
|
ddl = (
|
||||||
|
|
@ -535,7 +547,8 @@ def downgrade() -> None:
|
||||||
f"new_chat_messages, "
|
f"new_chat_messages, "
|
||||||
f"chat_comments, "
|
f"chat_comments, "
|
||||||
f"chat_session_state, "
|
f"chat_session_state, "
|
||||||
f'"user" ({_cols(user_cols)})'
|
f'"user" ({_cols(user_cols)}), '
|
||||||
|
f"automation_runs ({_cols(AUTOMATION_RUN_COLS)})"
|
||||||
)
|
)
|
||||||
tx = conn.begin_nested() if conn.in_transaction() else conn.begin()
|
tx = conn.begin_nested() if conn.in_transaction() else conn.begin()
|
||||||
with tx:
|
with tx:
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,134 @@
|
||||||
|
"""remove document summary llm settings
|
||||||
|
|
||||||
|
Revision ID: 154
|
||||||
|
Revises: 153
|
||||||
|
"""
|
||||||
|
|
||||||
|
from collections.abc import Sequence
|
||||||
|
|
||||||
|
import sqlalchemy as sa
|
||||||
|
|
||||||
|
from alembic import op
|
||||||
|
|
||||||
|
revision: str = "154"
|
||||||
|
down_revision: str | None = "153"
|
||||||
|
branch_labels: str | Sequence[str] | None = None
|
||||||
|
depends_on: str | Sequence[str] | None = None
|
||||||
|
|
||||||
|
# Name of the PostgreSQL publication whose table list this migration rewrites.
PUBLICATION_NAME = "zero_publication"

# Column subset of "documents" included in the publication.
DOCUMENT_COLS = [
    "id", "title", "document_type",
    "search_space_id", "folder_id", "created_by_id",
    "status", "created_at", "updated_at",
]

# Column subset of "user" included in the publication.
USER_COLS = [
    "id",
    "pages_limit", "pages_used",
    "premium_credit_micros_limit", "premium_credit_micros_used",
]

# Column subset of "automation_runs" included in the publication.
AUTOMATION_RUN_COLS = [
    "id", "automation_id", "trigger_id",
    "status", "step_results",
    "started_at", "finished_at", "created_at",
]
|
||||||
|
|
||||||
|
|
||||||
|
def _column_exists(conn, table: str, column: str) -> bool:
    """Return True when ``information_schema.columns`` lists *column* for *table*.

    Args:
        conn: Connection used to run the catalog lookup.
        table: Table name matched against ``table_name``.
        column: Column name matched against ``column_name``.
    """
    # NOTE(review): the lookup does not filter on table_schema, so a
    # same-named table in another visible schema would also match — confirm
    # this is acceptable for the deployment's schema layout.
    lookup = sa.text(
        "SELECT 1 FROM information_schema.columns "
        "WHERE table_name = :table AND column_name = :column"
    )
    params = {"table": table, "column": column}
    row = conn.execute(lookup, params).fetchone()
    return row is not None
|
||||||
|
|
||||||
|
|
||||||
|
def _has_zero_version(conn, table: str) -> bool:
    """Return True when *table* has a ``_0_version`` column."""
    zero_version_column = "_0_version"
    return _column_exists(conn, table, zero_version_column)
|
||||||
|
|
||||||
|
|
||||||
|
def _set_table_ddl(conn) -> str:
    """Build the ``ALTER PUBLICATION ... SET TABLE`` statement for the publication.

    ``documents`` and ``"user"`` are published with explicit column lists; the
    quoted ``"_0_version"`` column is appended to a list only when the
    corresponding table actually has it.

    Args:
        conn: Connection used to probe for the ``_0_version`` columns.

    Returns:
        The complete DDL statement as a string.
    """
    # Copy the module-level lists so the constants are never mutated.
    document_columns = list(DOCUMENT_COLS)
    if _has_zero_version(conn, "documents"):
        document_columns.append('"_0_version"')

    user_columns = list(USER_COLS)
    if _has_zero_version(conn, "user"):
        user_columns.append('"_0_version"')

    published = (
        "notifications",
        "documents (" + ", ".join(document_columns) + ")",
        "folders",
        "search_source_connectors",
        "new_chat_messages",
        "chat_comments",
        "chat_session_state",
        # "user" is written double-quoted in the emitted DDL.
        '"user" (' + ", ".join(user_columns) + ")",
        "automation_runs (" + ", ".join(AUTOMATION_RUN_COLS) + ")",
    )
    return "ALTER PUBLICATION " + PUBLICATION_NAME + " SET TABLE " + ", ".join(published)
|
||||||
|
|
||||||
|
|
||||||
|
def _resync_zero_publication(tag: str) -> None:
    """Re-apply the publication's table list, bracketed by marker comments.

    Does nothing when the publication is not present in ``pg_publication``.
    Otherwise, inside one (possibly nested) transaction, it sets the
    publication comment to ``pre-<tag>``, runs the ``SET TABLE`` DDL, then sets
    the comment to ``post-<tag>``.

    Args:
        tag: Label embedded in the pre/post publication comments.
    """
    bind = op.get_bind()

    lookup = sa.text("SELECT 1 FROM pg_publication WHERE pubname = :name")
    publication_row = bind.execute(lookup, {"name": PUBLICATION_NAME}).fetchone()
    if not publication_row:
        return

    # Use a savepoint when a transaction is already open on the bind;
    # otherwise start a fresh transaction.
    if bind.in_transaction():
        tx = bind.begin_nested()
    else:
        tx = bind.begin()

    comment_prefix = f"COMMENT ON PUBLICATION {PUBLICATION_NAME} IS "
    with tx:
        # NOTE(review): the pre/post comments presumably exist so that
        # consumers of the publication observe a change — confirm intent.
        bind.execute(sa.text(comment_prefix + f"'pre-{tag}'"))
        ddl = _set_table_ddl(bind)
        bind.execute(sa.text(ddl))
        bind.execute(sa.text(comment_prefix + f"'post-{tag}'"))
|
||||||
|
|
||||||
|
|
||||||
|
def upgrade() -> None:
    """Drop the document-summary columns and resync the publication.

    Each column is dropped only if it currently exists, so re-running the
    step against a database that already lacks a column is a no-op for it.
    """
    bind = op.get_bind()

    obsolete_columns = (
        ("searchspaces", "document_summary_llm_id"),
        ("search_source_connectors", "enable_summary"),
    )
    for table_name, column_name in obsolete_columns:
        if _column_exists(bind, table_name, column_name):
            op.drop_column(table_name, column_name)

    _resync_zero_publication("154-summary-removal")
|
||||||
|
|
||||||
|
|
||||||
|
def downgrade() -> None:
    """Re-create the document-summary columns and resync the publication.

    A column is added only when it is currently missing. The restored columns
    are ``searchspaces.document_summary_llm_id`` (nullable integer, server
    default ``0``) and ``search_source_connectors.enable_summary`` (non-null
    boolean, server default ``false``).
    """
    bind = op.get_bind()

    restored_columns = (
        (
            "searchspaces",
            sa.Column(
                "document_summary_llm_id",
                sa.Integer(),
                nullable=True,
                server_default="0",
            ),
        ),
        (
            "search_source_connectors",
            sa.Column(
                "enable_summary",
                sa.Boolean(),
                nullable=False,
                server_default=sa.text("false"),
            ),
        ),
    )
    for table_name, column in restored_columns:
        if _column_exists(bind, table_name, column.name):
            continue
        op.add_column(table_name, column)

    _resync_zero_publication("154-summary-removal-downgrade")
|
||||||
|
|
@ -16,7 +16,7 @@ from app.agents.shared.receipt import make_receipt
|
||||||
from app.agents.shared.receipt_command import with_receipt
|
from app.agents.shared.receipt_command import with_receipt
|
||||||
from app.db import Report, shielded_async_session
|
from app.db import Report, shielded_async_session
|
||||||
from app.services.connector_service import ConnectorService
|
from app.services.connector_service import ConnectorService
|
||||||
from app.services.llm_service import get_document_summary_llm
|
from app.services.llm_service import get_agent_llm
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
@ -546,7 +546,7 @@ def create_generate_report_tool(
|
||||||
Factory function to create the generate_report tool with injected dependencies.
|
Factory function to create the generate_report tool with injected dependencies.
|
||||||
|
|
||||||
The tool generates a Markdown report inline using the search space's
|
The tool generates a Markdown report inline using the search space's
|
||||||
document summary LLM, saves it to the database, and returns immediately.
|
agent LLM, saves it to the database, and returns immediately.
|
||||||
|
|
||||||
Uses short-lived database sessions for each DB operation so no connection
|
Uses short-lived database sessions for each DB operation so no connection
|
||||||
is held during the long LLM API call.
|
is held during the long LLM API call.
|
||||||
|
|
@ -767,7 +767,7 @@ def create_generate_report_tool(
|
||||||
"creating standalone report"
|
"creating standalone report"
|
||||||
)
|
)
|
||||||
|
|
||||||
llm = await get_document_summary_llm(read_session, search_space_id)
|
llm = await get_agent_llm(read_session, search_space_id)
|
||||||
# read_session closed — connection returned to pool
|
# read_session closed — connection returned to pool
|
||||||
|
|
||||||
if not llm:
|
if not llm:
|
||||||
|
|
|
||||||
|
|
@ -17,7 +17,7 @@ from langgraph.types import Command
|
||||||
from app.agents.shared.receipt import make_receipt
|
from app.agents.shared.receipt import make_receipt
|
||||||
from app.agents.shared.receipt_command import with_receipt
|
from app.agents.shared.receipt_command import with_receipt
|
||||||
from app.db import Report, shielded_async_session
|
from app.db import Report, shielded_async_session
|
||||||
from app.services.llm_service import get_document_summary_llm
|
from app.services.llm_service import get_agent_llm
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
@ -578,7 +578,7 @@ def create_generate_resume_tool(
|
||||||
f"(group {report_group_id})"
|
f"(group {report_group_id})"
|
||||||
)
|
)
|
||||||
|
|
||||||
llm = await get_document_summary_llm(read_session, search_space_id)
|
llm = await get_agent_llm(read_session, search_space_id)
|
||||||
|
|
||||||
if not llm:
|
if not llm:
|
||||||
error_msg = (
|
error_msg = (
|
||||||
|
|
|
||||||
|
|
@ -35,7 +35,7 @@ from langchain_core.tools import tool
|
||||||
|
|
||||||
from app.db import Report, shielded_async_session
|
from app.db import Report, shielded_async_session
|
||||||
from app.services.connector_service import ConnectorService
|
from app.services.connector_service import ConnectorService
|
||||||
from app.services.llm_service import get_document_summary_llm
|
from app.services.llm_service import get_agent_llm
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
@ -565,7 +565,7 @@ def create_generate_report_tool(
|
||||||
Factory function to create the generate_report tool with injected dependencies.
|
Factory function to create the generate_report tool with injected dependencies.
|
||||||
|
|
||||||
The tool generates a Markdown report inline using the search space's
|
The tool generates a Markdown report inline using the search space's
|
||||||
document summary LLM, saves it to the database, and returns immediately.
|
agent LLM, saves it to the database, and returns immediately.
|
||||||
|
|
||||||
Uses short-lived database sessions for each DB operation so no connection
|
Uses short-lived database sessions for each DB operation so no connection
|
||||||
is held during the long LLM API call.
|
is held during the long LLM API call.
|
||||||
|
|
@ -768,7 +768,7 @@ def create_generate_report_tool(
|
||||||
"creating standalone report"
|
"creating standalone report"
|
||||||
)
|
)
|
||||||
|
|
||||||
llm = await get_document_summary_llm(read_session, search_space_id)
|
llm = await get_agent_llm(read_session, search_space_id)
|
||||||
# read_session closed — connection returned to pool
|
# read_session closed — connection returned to pool
|
||||||
|
|
||||||
if not llm:
|
if not llm:
|
||||||
|
|
|
||||||
|
|
@ -26,7 +26,7 @@ from langchain_core.messages import HumanMessage
|
||||||
from langchain_core.tools import tool
|
from langchain_core.tools import tool
|
||||||
|
|
||||||
from app.db import Report, shielded_async_session
|
from app.db import Report, shielded_async_session
|
||||||
from app.services.llm_service import get_document_summary_llm
|
from app.services.llm_service import get_agent_llm
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
@ -547,7 +547,7 @@ def create_generate_resume_tool(
|
||||||
f"(group {report_group_id})"
|
f"(group {report_group_id})"
|
||||||
)
|
)
|
||||||
|
|
||||||
llm = await get_document_summary_llm(read_session, search_space_id)
|
llm = await get_agent_llm(read_session, search_space_id)
|
||||||
|
|
||||||
if not llm:
|
if not llm:
|
||||||
error_msg = (
|
error_msg = (
|
||||||
|
|
|
||||||
|
|
@ -31,12 +31,10 @@ async def create_podcast_transcript(
|
||||||
search_space_id = configuration.search_space_id
|
search_space_id = configuration.search_space_id
|
||||||
user_prompt = configuration.user_prompt
|
user_prompt = configuration.user_prompt
|
||||||
|
|
||||||
# Get search space's document summary LLM
|
# Use the search space's agent LLM for podcast transcript generation.
|
||||||
llm = await get_agent_llm(state.db_session, search_space_id)
|
llm = await get_agent_llm(state.db_session, search_space_id)
|
||||||
if not llm:
|
if not llm:
|
||||||
error_message = (
|
error_message = f"No agent LLM configured for search space {search_space_id}"
|
||||||
f"No document summary LLM configured for search space {search_space_id}"
|
|
||||||
)
|
|
||||||
print(error_message)
|
print(error_message)
|
||||||
raise RuntimeError(error_message)
|
raise RuntimeError(error_message)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -103,7 +103,7 @@ def init_worker(**kwargs):
|
||||||
"""Initialize the LLM Router and Image Gen Router when a Celery worker process starts.
|
"""Initialize the LLM Router and Image Gen Router when a Celery worker process starts.
|
||||||
|
|
||||||
This ensures the Auto mode (LiteLLM Router) is available for background tasks
|
This ensures the Auto mode (LiteLLM Router) is available for background tasks
|
||||||
like document summarization and image generation.
|
like agent workflows and image generation.
|
||||||
"""
|
"""
|
||||||
from app.observability.bootstrap import init_otel
|
from app.observability.bootstrap import init_otel
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -141,7 +141,6 @@ async def download_and_process_file(
|
||||||
task_logger: TaskLoggingService,
|
task_logger: TaskLoggingService,
|
||||||
log_entry: Log,
|
log_entry: Log,
|
||||||
connector_id: int | None = None,
|
connector_id: int | None = None,
|
||||||
enable_summary: bool = True,
|
|
||||||
) -> tuple[Any, str | None, dict[str, Any] | None]:
|
) -> tuple[Any, str | None, dict[str, Any] | None]:
|
||||||
"""
|
"""
|
||||||
Download Google Drive file and process using Surfsense file processors.
|
Download Google Drive file and process using Surfsense file processors.
|
||||||
|
|
@ -215,8 +214,6 @@ async def download_and_process_file(
|
||||||
"source_connector": "google_drive",
|
"source_connector": "google_drive",
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
# Include connector_id for de-indexing support
|
|
||||||
connector_info["enable_summary"] = enable_summary
|
|
||||||
if connector_id is not None:
|
if connector_id is not None:
|
||||||
connector_info["connector_id"] = connector_id
|
connector_info["connector_id"] = connector_id
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1781,9 +1781,6 @@ class SearchSpace(BaseModel, TimestampMixin):
|
||||||
agent_llm_id = Column(
|
agent_llm_id = Column(
|
||||||
Integer, nullable=True, default=0
|
Integer, nullable=True, default=0
|
||||||
) # For agent/chat operations, defaults to Auto mode
|
) # For agent/chat operations, defaults to Auto mode
|
||||||
document_summary_llm_id = Column(
|
|
||||||
Integer, nullable=True, default=0
|
|
||||||
) # For document summarization, defaults to Auto mode
|
|
||||||
image_generation_config_id = Column(
|
image_generation_config_id = Column(
|
||||||
Integer, nullable=True, default=0
|
Integer, nullable=True, default=0
|
||||||
) # For image generation, defaults to Auto mode
|
) # For image generation, defaults to Auto mode
|
||||||
|
|
@ -1951,12 +1948,6 @@ class SearchSourceConnector(BaseModel, TimestampMixin):
|
||||||
last_indexed_at = Column(TIMESTAMP(timezone=True), nullable=True)
|
last_indexed_at = Column(TIMESTAMP(timezone=True), nullable=True)
|
||||||
config = Column(JSON, nullable=False)
|
config = Column(JSON, nullable=False)
|
||||||
|
|
||||||
# Summary generation (LLM-based) - disabled by default to save resources.
|
|
||||||
# When enabled, improves hybrid search quality at the cost of LLM calls.
|
|
||||||
enable_summary = Column(
|
|
||||||
Boolean, nullable=False, default=False, server_default="false"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Vision LLM for image files - disabled by default to save cost/time.
|
# Vision LLM for image files - disabled by default to save cost/time.
|
||||||
# When enabled, images are described via a vision language model instead
|
# When enabled, images are described via a vision language model instead
|
||||||
# of falling back to the document parser.
|
# of falling back to the document parser.
|
||||||
|
|
@ -2972,7 +2963,7 @@ async def shielded_async_session():
|
||||||
async def setup_indexes():
|
async def setup_indexes():
|
||||||
async with engine.begin() as conn:
|
async with engine.begin() as conn:
|
||||||
# Create indexes
|
# Create indexes
|
||||||
# Document Summary Indexes
|
# Document embedding indexes
|
||||||
await conn.execute(
|
await conn.execute(
|
||||||
text(
|
text(
|
||||||
"CREATE INDEX IF NOT EXISTS document_vector_index ON documents USING hnsw (embedding public.vector_cosine_ops)"
|
"CREATE INDEX IF NOT EXISTS document_vector_index ON documents USING hnsw (embedding public.vector_cosine_ops)"
|
||||||
|
|
|
||||||
|
|
@ -18,8 +18,6 @@ class UploadDocumentAdapter:
|
||||||
etl_service: str,
|
etl_service: str,
|
||||||
search_space_id: int,
|
search_space_id: int,
|
||||||
user_id: str,
|
user_id: str,
|
||||||
llm,
|
|
||||||
should_summarize: bool = False,
|
|
||||||
) -> None:
|
) -> None:
|
||||||
connector_doc = ConnectorDocument(
|
connector_doc = ConnectorDocument(
|
||||||
title=filename,
|
title=filename,
|
||||||
|
|
@ -29,9 +27,7 @@ class UploadDocumentAdapter:
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
created_by_id=user_id,
|
created_by_id=user_id,
|
||||||
connector_id=None,
|
connector_id=None,
|
||||||
should_summarize=should_summarize,
|
|
||||||
should_use_code_chunker=False,
|
should_use_code_chunker=False,
|
||||||
fallback_summary=markdown_content[:4000],
|
|
||||||
metadata={
|
metadata={
|
||||||
"FILE_NAME": filename,
|
"FILE_NAME": filename,
|
||||||
"ETL_SERVICE": etl_service,
|
"ETL_SERVICE": etl_service,
|
||||||
|
|
@ -43,7 +39,7 @@ class UploadDocumentAdapter:
|
||||||
if not documents:
|
if not documents:
|
||||||
raise RuntimeError("prepare_for_indexing returned no documents")
|
raise RuntimeError("prepare_for_indexing returned no documents")
|
||||||
|
|
||||||
indexed = await self._service.index(documents[0], connector_doc, llm)
|
indexed = await self._service.index(documents[0], connector_doc)
|
||||||
|
|
||||||
if not DocumentStatus.is_state(indexed.status, DocumentStatus.READY):
|
if not DocumentStatus.is_state(indexed.status, DocumentStatus.READY):
|
||||||
raise RuntimeError(indexed.status.get("reason", "Indexing failed"))
|
raise RuntimeError(indexed.status.get("reason", "Indexing failed"))
|
||||||
|
|
@ -51,7 +47,7 @@ class UploadDocumentAdapter:
|
||||||
indexed.content_needs_reindexing = False
|
indexed.content_needs_reindexing = False
|
||||||
await self._session.commit()
|
await self._session.commit()
|
||||||
|
|
||||||
async def reindex(self, document: Document, llm) -> None:
|
async def reindex(self, document: Document) -> None:
|
||||||
"""Re-index an existing document after its source_markdown has been updated."""
|
"""Re-index an existing document after its source_markdown has been updated."""
|
||||||
if not document.source_markdown:
|
if not document.source_markdown:
|
||||||
raise RuntimeError("Document has no source_markdown to reindex")
|
raise RuntimeError("Document has no source_markdown to reindex")
|
||||||
|
|
@ -66,15 +62,13 @@ class UploadDocumentAdapter:
|
||||||
search_space_id=document.search_space_id,
|
search_space_id=document.search_space_id,
|
||||||
created_by_id=str(document.created_by_id),
|
created_by_id=str(document.created_by_id),
|
||||||
connector_id=document.connector_id,
|
connector_id=document.connector_id,
|
||||||
should_summarize=True,
|
|
||||||
should_use_code_chunker=False,
|
should_use_code_chunker=False,
|
||||||
fallback_summary=document.source_markdown[:4000],
|
|
||||||
metadata=metadata,
|
metadata=metadata,
|
||||||
)
|
)
|
||||||
|
|
||||||
document.content_hash = compute_content_hash(connector_doc)
|
document.content_hash = compute_content_hash(connector_doc)
|
||||||
|
|
||||||
indexed = await self._service.index(document, connector_doc, llm)
|
indexed = await self._service.index(document, connector_doc)
|
||||||
|
|
||||||
if not DocumentStatus.is_state(indexed.status, DocumentStatus.READY):
|
if not DocumentStatus.is_state(indexed.status, DocumentStatus.READY):
|
||||||
raise RuntimeError(indexed.status.get("reason", "Reindexing failed"))
|
raise RuntimeError(indexed.status.get("reason", "Reindexing failed"))
|
||||||
|
|
|
||||||
|
|
@ -11,9 +11,7 @@ class ConnectorDocument(BaseModel):
|
||||||
unique_id: str
|
unique_id: str
|
||||||
document_type: DocumentType
|
document_type: DocumentType
|
||||||
search_space_id: int = Field(gt=0)
|
search_space_id: int = Field(gt=0)
|
||||||
should_summarize: bool = True
|
|
||||||
should_use_code_chunker: bool = False
|
should_use_code_chunker: bool = False
|
||||||
fallback_summary: str | None = None
|
|
||||||
metadata: dict = {}
|
metadata: dict = {}
|
||||||
connector_id: int | None = None
|
connector_id: int | None = None
|
||||||
created_by_id: str
|
created_by_id: str
|
||||||
|
|
|
||||||
|
|
@ -1,30 +0,0 @@
|
||||||
from app.prompts import SUMMARY_PROMPT_TEMPLATE
|
|
||||||
from app.utils.document_converters import optimize_content_for_context_window
|
|
||||||
|
|
||||||
|
|
||||||
async def summarize_document(
|
|
||||||
source_markdown: str, llm, metadata: dict | None = None
|
|
||||||
) -> str:
|
|
||||||
"""Generate a text summary of a document using an LLM, prefixed with metadata when provided."""
|
|
||||||
model_name = getattr(llm, "model", "gpt-3.5-turbo")
|
|
||||||
optimized_content = optimize_content_for_context_window(
|
|
||||||
source_markdown, metadata, model_name
|
|
||||||
)
|
|
||||||
|
|
||||||
summary_chain = SUMMARY_PROMPT_TEMPLATE | llm
|
|
||||||
content_with_metadata = (
|
|
||||||
f"<DOCUMENT><DOCUMENT_METADATA>\n\n{metadata}\n\n</DOCUMENT_METADATA>"
|
|
||||||
f"\n\n<DOCUMENT_CONTENT>\n\n{optimized_content}\n\n</DOCUMENT_CONTENT></DOCUMENT>"
|
|
||||||
)
|
|
||||||
summary_result = await summary_chain.ainvoke({"document": content_with_metadata})
|
|
||||||
summary_content = summary_result.content
|
|
||||||
|
|
||||||
if metadata:
|
|
||||||
metadata_parts = ["# DOCUMENT METADATA"]
|
|
||||||
for key, value in metadata.items():
|
|
||||||
if value:
|
|
||||||
metadata_parts.append(f"**{key.replace('_', ' ').title()}:** {value}")
|
|
||||||
metadata_section = "\n".join(metadata_parts)
|
|
||||||
return f"{metadata_section}\n\n# DOCUMENT SUMMARY\n\n{summary_content}"
|
|
||||||
|
|
||||||
return summary_content
|
|
||||||
|
|
@ -31,7 +31,6 @@ from app.indexing_pipeline.document_persistence import (
|
||||||
attach_chunks_to_document,
|
attach_chunks_to_document,
|
||||||
rollback_and_persist_failure,
|
rollback_and_persist_failure,
|
||||||
)
|
)
|
||||||
from app.indexing_pipeline.document_summarizer import summarize_document
|
|
||||||
from app.indexing_pipeline.exceptions import (
|
from app.indexing_pipeline.exceptions import (
|
||||||
EMBEDDING_ERRORS,
|
EMBEDDING_ERRORS,
|
||||||
PERMANENT_LLM_ERRORS,
|
PERMANENT_LLM_ERRORS,
|
||||||
|
|
@ -203,9 +202,7 @@ class IndexingPipelineService:
|
||||||
|
|
||||||
await self.session.commit()
|
await self.session.commit()
|
||||||
|
|
||||||
async def index_batch(
|
async def index_batch(self, connector_docs: list[ConnectorDocument]) -> list[Document]:
|
||||||
self, connector_docs: list[ConnectorDocument], llm
|
|
||||||
) -> list[Document]:
|
|
||||||
"""Convenience method: prepare_for_indexing then index each document.
|
"""Convenience method: prepare_for_indexing then index each document.
|
||||||
|
|
||||||
Indexers that need heartbeat callbacks or custom per-document logic
|
Indexers that need heartbeat callbacks or custom per-document logic
|
||||||
|
|
@ -218,7 +215,7 @@ class IndexingPipelineService:
|
||||||
connector_doc = doc_map.get(document.unique_identifier_hash)
|
connector_doc = doc_map.get(document.unique_identifier_hash)
|
||||||
if connector_doc is None:
|
if connector_doc is None:
|
||||||
continue
|
continue
|
||||||
result = await self.index(document, connector_doc, llm)
|
result = await self.index(document, connector_doc)
|
||||||
results.append(result)
|
results.append(result)
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|
@ -350,11 +347,9 @@ class IndexingPipelineService:
|
||||||
await self.session.rollback()
|
await self.session.rollback()
|
||||||
return []
|
return []
|
||||||
|
|
||||||
async def index(
|
async def index(self, document: Document, connector_doc: ConnectorDocument) -> Document:
|
||||||
self, document: Document, connector_doc: ConnectorDocument, llm
|
|
||||||
) -> Document:
|
|
||||||
"""
|
"""
|
||||||
Run summarization, embedding, and chunking for a document and persist the results.
|
Run deterministic content storage, embedding, and chunking for a document.
|
||||||
"""
|
"""
|
||||||
ctx = PipelineLogContext(
|
ctx = PipelineLogContext(
|
||||||
connector_id=connector_doc.connector_id,
|
connector_id=connector_doc.connector_id,
|
||||||
|
|
@ -379,20 +374,7 @@ class IndexingPipelineService:
|
||||||
document.status = DocumentStatus.processing()
|
document.status = DocumentStatus.processing()
|
||||||
await self.session.commit()
|
await self.session.commit()
|
||||||
|
|
||||||
t_step = time.perf_counter()
|
content = connector_doc.source_markdown
|
||||||
if connector_doc.should_summarize and llm is not None:
|
|
||||||
content = await summarize_document(
|
|
||||||
connector_doc.source_markdown, llm, connector_doc.metadata
|
|
||||||
)
|
|
||||||
perf.info(
|
|
||||||
"[indexing] summarize_document doc=%d in %.3fs",
|
|
||||||
document.id,
|
|
||||||
time.perf_counter() - t_step,
|
|
||||||
)
|
|
||||||
elif connector_doc.should_summarize and connector_doc.fallback_summary:
|
|
||||||
content = connector_doc.fallback_summary
|
|
||||||
else:
|
|
||||||
content = connector_doc.source_markdown
|
|
||||||
|
|
||||||
await self.session.execute(
|
await self.session.execute(
|
||||||
delete(Chunk).where(Chunk.document_id == document.id)
|
delete(Chunk).where(Chunk.document_id == document.id)
|
||||||
|
|
@ -523,7 +505,6 @@ class IndexingPipelineService:
|
||||||
async def index_batch_parallel(
|
async def index_batch_parallel(
|
||||||
self,
|
self,
|
||||||
connector_docs: list[ConnectorDocument],
|
connector_docs: list[ConnectorDocument],
|
||||||
get_llm: Callable[[AsyncSession], Awaitable],
|
|
||||||
*,
|
*,
|
||||||
max_concurrency: int = 4,
|
max_concurrency: int = 4,
|
||||||
on_heartbeat: Callable[[int], Awaitable[None]] | None = None,
|
on_heartbeat: Callable[[int], Awaitable[None]] | None = None,
|
||||||
|
|
@ -532,8 +513,8 @@ class IndexingPipelineService:
|
||||||
"""Index documents in parallel with bounded concurrency.
|
"""Index documents in parallel with bounded concurrency.
|
||||||
|
|
||||||
Phase 1 (serial): prepare_for_indexing using self.session.
|
Phase 1 (serial): prepare_for_indexing using self.session.
|
||||||
Phase 2 (parallel): index each document in an isolated session,
|
Phase 2 (parallel): index each document in an isolated session, bounded
|
||||||
bounded by a semaphore to avoid overwhelming APIs/DB.
|
by a semaphore to avoid overwhelming embedding APIs/DB.
|
||||||
"""
|
"""
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
perf = get_perf_logger()
|
perf = get_perf_logger()
|
||||||
|
|
@ -577,9 +558,8 @@ class IndexingPipelineService:
|
||||||
failed_count += 1
|
failed_count += 1
|
||||||
return document
|
return document
|
||||||
|
|
||||||
llm = await get_llm(isolated_session)
|
|
||||||
iso_pipeline = IndexingPipelineService(isolated_session)
|
iso_pipeline = IndexingPipelineService(isolated_session)
|
||||||
result = await iso_pipeline.index(refetched, connector_doc, llm)
|
result = await iso_pipeline.index(refetched, connector_doc)
|
||||||
|
|
||||||
async with lock:
|
async with lock:
|
||||||
if DocumentStatus.is_state(
|
if DocumentStatus.is_state(
|
||||||
|
|
|
||||||
|
|
@ -125,7 +125,6 @@ async def create_documents(
|
||||||
async def create_documents_file_upload(
|
async def create_documents_file_upload(
|
||||||
files: list[UploadFile],
|
files: list[UploadFile],
|
||||||
search_space_id: int = Form(...),
|
search_space_id: int = Form(...),
|
||||||
should_summarize: bool = Form(False),
|
|
||||||
use_vision_llm: bool = Form(False),
|
use_vision_llm: bool = Form(False),
|
||||||
processing_mode: str = Form("basic"),
|
processing_mode: str = Form("basic"),
|
||||||
session: AsyncSession = Depends(get_async_session),
|
session: AsyncSession = Depends(get_async_session),
|
||||||
|
|
@ -309,7 +308,6 @@ async def create_documents_file_upload(
|
||||||
filename=filename,
|
filename=filename,
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
user_id=str(user.id),
|
user_id=str(user.id),
|
||||||
should_summarize=should_summarize,
|
|
||||||
use_vision_llm=use_vision_llm,
|
use_vision_llm=use_vision_llm,
|
||||||
processing_mode=validated_mode.value,
|
processing_mode=validated_mode.value,
|
||||||
)
|
)
|
||||||
|
|
@ -1586,7 +1584,6 @@ async def folder_upload(
|
||||||
search_space_id: int = Form(...),
|
search_space_id: int = Form(...),
|
||||||
relative_paths: str = Form(...),
|
relative_paths: str = Form(...),
|
||||||
root_folder_id: int | None = Form(None),
|
root_folder_id: int | None = Form(None),
|
||||||
enable_summary: bool = Form(False),
|
|
||||||
use_vision_llm: bool = Form(False),
|
use_vision_llm: bool = Form(False),
|
||||||
processing_mode: str = Form("basic"),
|
processing_mode: str = Form("basic"),
|
||||||
session: AsyncSession = Depends(get_async_session),
|
session: AsyncSession = Depends(get_async_session),
|
||||||
|
|
@ -1719,7 +1716,6 @@ async def folder_upload(
|
||||||
user_id=str(user.id),
|
user_id=str(user.id),
|
||||||
folder_name=folder_name,
|
folder_name=folder_name,
|
||||||
root_folder_id=root_folder_id,
|
root_folder_id=root_folder_id,
|
||||||
enable_summary=enable_summary,
|
|
||||||
use_vision_llm=use_vision_llm,
|
use_vision_llm=use_vision_llm,
|
||||||
file_mappings=list(file_mappings),
|
file_mappings=list(file_mappings),
|
||||||
processing_mode=validated_mode.value,
|
processing_mode=validated_mode.value,
|
||||||
|
|
|
||||||
|
|
@ -617,9 +617,6 @@ async def get_llm_preferences(
|
||||||
|
|
||||||
# Get full config objects for each role
|
# Get full config objects for each role
|
||||||
agent_llm = await _get_llm_config_by_id(session, search_space.agent_llm_id)
|
agent_llm = await _get_llm_config_by_id(session, search_space.agent_llm_id)
|
||||||
document_summary_llm = await _get_llm_config_by_id(
|
|
||||||
session, search_space.document_summary_llm_id
|
|
||||||
)
|
|
||||||
image_generation_config = await _get_image_gen_config_by_id(
|
image_generation_config = await _get_image_gen_config_by_id(
|
||||||
session, search_space.image_generation_config_id
|
session, search_space.image_generation_config_id
|
||||||
)
|
)
|
||||||
|
|
@ -629,11 +626,9 @@ async def get_llm_preferences(
|
||||||
|
|
||||||
return LLMPreferencesRead(
|
return LLMPreferencesRead(
|
||||||
agent_llm_id=search_space.agent_llm_id,
|
agent_llm_id=search_space.agent_llm_id,
|
||||||
document_summary_llm_id=search_space.document_summary_llm_id,
|
|
||||||
image_generation_config_id=search_space.image_generation_config_id,
|
image_generation_config_id=search_space.image_generation_config_id,
|
||||||
vision_llm_config_id=search_space.vision_llm_config_id,
|
vision_llm_config_id=search_space.vision_llm_config_id,
|
||||||
agent_llm=agent_llm,
|
agent_llm=agent_llm,
|
||||||
document_summary_llm=document_summary_llm,
|
|
||||||
image_generation_config=image_generation_config,
|
image_generation_config=image_generation_config,
|
||||||
vision_llm_config=vision_llm_config,
|
vision_llm_config=vision_llm_config,
|
||||||
)
|
)
|
||||||
|
|
@ -707,9 +702,6 @@ async def update_llm_preferences(
|
||||||
|
|
||||||
# Get full config objects for response
|
# Get full config objects for response
|
||||||
agent_llm = await _get_llm_config_by_id(session, search_space.agent_llm_id)
|
agent_llm = await _get_llm_config_by_id(session, search_space.agent_llm_id)
|
||||||
document_summary_llm = await _get_llm_config_by_id(
|
|
||||||
session, search_space.document_summary_llm_id
|
|
||||||
)
|
|
||||||
image_generation_config = await _get_image_gen_config_by_id(
|
image_generation_config = await _get_image_gen_config_by_id(
|
||||||
session, search_space.image_generation_config_id
|
session, search_space.image_generation_config_id
|
||||||
)
|
)
|
||||||
|
|
@ -719,11 +711,9 @@ async def update_llm_preferences(
|
||||||
|
|
||||||
return LLMPreferencesRead(
|
return LLMPreferencesRead(
|
||||||
agent_llm_id=search_space.agent_llm_id,
|
agent_llm_id=search_space.agent_llm_id,
|
||||||
document_summary_llm_id=search_space.document_summary_llm_id,
|
|
||||||
image_generation_config_id=search_space.image_generation_config_id,
|
image_generation_config_id=search_space.image_generation_config_id,
|
||||||
vision_llm_config_id=search_space.vision_llm_config_id,
|
vision_llm_config_id=search_space.vision_llm_config_id,
|
||||||
agent_llm=agent_llm,
|
agent_llm=agent_llm,
|
||||||
document_summary_llm=document_summary_llm,
|
|
||||||
image_generation_config=image_generation_config,
|
image_generation_config=image_generation_config,
|
||||||
vision_llm_config=vision_llm_config,
|
vision_llm_config=vision_llm_config,
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -221,9 +221,6 @@ class LLMPreferencesRead(BaseModel):
|
||||||
agent_llm_id: int | None = Field(
|
agent_llm_id: int | None = Field(
|
||||||
None, description="ID of the LLM config to use for agent/chat tasks"
|
None, description="ID of the LLM config to use for agent/chat tasks"
|
||||||
)
|
)
|
||||||
document_summary_llm_id: int | None = Field(
|
|
||||||
None, description="ID of the LLM config to use for document summarization"
|
|
||||||
)
|
|
||||||
image_generation_config_id: int | None = Field(
|
image_generation_config_id: int | None = Field(
|
||||||
None, description="ID of the image generation config to use"
|
None, description="ID of the image generation config to use"
|
||||||
)
|
)
|
||||||
|
|
@ -234,9 +231,6 @@ class LLMPreferencesRead(BaseModel):
|
||||||
agent_llm: dict[str, Any] | None = Field(
|
agent_llm: dict[str, Any] | None = Field(
|
||||||
None, description="Full config for agent LLM"
|
None, description="Full config for agent LLM"
|
||||||
)
|
)
|
||||||
document_summary_llm: dict[str, Any] | None = Field(
|
|
||||||
None, description="Full config for document summary LLM"
|
|
||||||
)
|
|
||||||
image_generation_config: dict[str, Any] | None = Field(
|
image_generation_config: dict[str, Any] | None = Field(
|
||||||
None, description="Full config for image generation"
|
None, description="Full config for image generation"
|
||||||
)
|
)
|
||||||
|
|
@ -253,9 +247,6 @@ class LLMPreferencesUpdate(BaseModel):
|
||||||
agent_llm_id: int | None = Field(
|
agent_llm_id: int | None = Field(
|
||||||
None, description="ID of the LLM config to use for agent/chat tasks"
|
None, description="ID of the LLM config to use for agent/chat tasks"
|
||||||
)
|
)
|
||||||
document_summary_llm_id: int | None = Field(
|
|
||||||
None, description="ID of the LLM config to use for document summarization"
|
|
||||||
)
|
|
||||||
image_generation_config_id: int | None = Field(
|
image_generation_config_id: int | None = Field(
|
||||||
None, description="ID of the image generation config to use"
|
None, description="ID of the image generation config to use"
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -16,7 +16,6 @@ class SearchSourceConnectorBase(BaseModel):
|
||||||
is_indexable: bool
|
is_indexable: bool
|
||||||
last_indexed_at: datetime | None = None
|
last_indexed_at: datetime | None = None
|
||||||
config: dict[str, Any]
|
config: dict[str, Any]
|
||||||
enable_summary: bool = False
|
|
||||||
enable_vision_llm: bool = False
|
enable_vision_llm: bool = False
|
||||||
periodic_indexing_enabled: bool = False
|
periodic_indexing_enabled: bool = False
|
||||||
indexing_frequency_minutes: int | None = None
|
indexing_frequency_minutes: int | None = None
|
||||||
|
|
@ -67,7 +66,6 @@ class SearchSourceConnectorUpdate(BaseModel):
|
||||||
is_indexable: bool | None = None
|
is_indexable: bool | None = None
|
||||||
last_indexed_at: datetime | None = None
|
last_indexed_at: datetime | None = None
|
||||||
config: dict[str, Any] | None = None
|
config: dict[str, Any] | None = None
|
||||||
enable_summary: bool | None = None
|
|
||||||
enable_vision_llm: bool | None = None
|
enable_vision_llm: bool | None = None
|
||||||
periodic_indexing_enabled: bool | None = None
|
periodic_indexing_enabled: bool | None = None
|
||||||
indexing_frequency_minutes: int | None = None
|
indexing_frequency_minutes: int | None = None
|
||||||
|
|
|
||||||
|
|
@ -9,7 +9,6 @@ from app.utils.document_converters import (
|
||||||
create_document_chunks,
|
create_document_chunks,
|
||||||
embed_text,
|
embed_text,
|
||||||
generate_content_hash,
|
generate_content_hash,
|
||||||
generate_document_summary,
|
|
||||||
generate_unique_identifier_hash,
|
generate_unique_identifier_hash,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -65,29 +64,11 @@ class ConfluenceKBSyncService:
|
||||||
if dup:
|
if dup:
|
||||||
content_hash = unique_hash
|
content_hash = unique_hash
|
||||||
|
|
||||||
from app.services.llm_service import get_user_long_context_llm
|
|
||||||
|
|
||||||
user_llm = await get_user_long_context_llm(
|
|
||||||
self.db_session,
|
|
||||||
user_id,
|
|
||||||
search_space_id,
|
|
||||||
disable_streaming=True,
|
|
||||||
)
|
|
||||||
|
|
||||||
doc_metadata_for_summary = {
|
|
||||||
"page_title": page_title,
|
|
||||||
"space_id": space_id,
|
|
||||||
"document_type": "Confluence Page",
|
|
||||||
"connector_type": "Confluence",
|
|
||||||
}
|
|
||||||
|
|
||||||
if user_llm:
|
summary_content = f"Confluence Page: {page_title}\n\n{page_content}"
|
||||||
summary_content, summary_embedding = await generate_document_summary(
|
summary_embedding = embed_text(summary_content)
|
||||||
page_content, user_llm, doc_metadata_for_summary
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
summary_content = f"Confluence Page: {page_title}\n\n{page_content}"
|
|
||||||
summary_embedding = embed_text(summary_content)
|
|
||||||
|
|
||||||
chunks = await create_document_chunks(page_content)
|
chunks = await create_document_chunks(page_content)
|
||||||
now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||||
|
|
@ -185,25 +166,10 @@ class ConfluenceKBSyncService:
|
||||||
|
|
||||||
space_id = (document.document_metadata or {}).get("space_id", "")
|
space_id = (document.document_metadata or {}).get("space_id", "")
|
||||||
|
|
||||||
from app.services.llm_service import get_user_long_context_llm
|
|
||||||
|
|
||||||
user_llm = await get_user_long_context_llm(
|
|
||||||
self.db_session, user_id, search_space_id, disable_streaming=True
|
|
||||||
)
|
|
||||||
|
|
||||||
if user_llm:
|
summary_content = f"Confluence Page: {page_title}\n\n{page_content}"
|
||||||
doc_meta = {
|
summary_embedding = embed_text(summary_content)
|
||||||
"page_title": page_title,
|
|
||||||
"space_id": space_id,
|
|
||||||
"document_type": "Confluence Page",
|
|
||||||
"connector_type": "Confluence",
|
|
||||||
}
|
|
||||||
summary_content, summary_embedding = await generate_document_summary(
|
|
||||||
page_content, user_llm, doc_meta
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
summary_content = f"Confluence Page: {page_title}\n\n{page_content}"
|
|
||||||
summary_embedding = embed_text(summary_content)
|
|
||||||
|
|
||||||
chunks = await create_document_chunks(page_content)
|
chunks = await create_document_chunks(page_content)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -191,149 +191,6 @@ class DoclingService:
|
||||||
logger.error(f"Full traceback: {traceback.format_exc()}")
|
logger.error(f"Full traceback: {traceback.format_exc()}")
|
||||||
raise RuntimeError(f"Docling processing failed: {e}") from e
|
raise RuntimeError(f"Docling processing failed: {e}") from e
|
||||||
|
|
||||||
async def process_large_document_summary(
|
|
||||||
self, content: str, llm, document_title: str = "Document"
|
|
||||||
) -> str:
|
|
||||||
"""
|
|
||||||
Process large documents using chunked LLM summarization.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
content: The full document content
|
|
||||||
llm: The language model to use for summarization
|
|
||||||
document_title: Title of the document for context
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Final summary of the document
|
|
||||||
"""
|
|
||||||
# Large document threshold (100K characters ≈ 25K tokens)
|
|
||||||
large_document_threshold = 100_000
|
|
||||||
|
|
||||||
if len(content) <= large_document_threshold:
|
|
||||||
# For smaller documents, use direct processing
|
|
||||||
logger.info(
|
|
||||||
f"📄 Document size: {len(content)} chars - using direct processing"
|
|
||||||
)
|
|
||||||
from app.prompts import SUMMARY_PROMPT_TEMPLATE
|
|
||||||
|
|
||||||
summary_chain = SUMMARY_PROMPT_TEMPLATE | llm
|
|
||||||
result = await summary_chain.ainvoke({"document": content})
|
|
||||||
return result.content
|
|
||||||
|
|
||||||
logger.info(
|
|
||||||
f"📚 Large document detected: {len(content)} chars - using chunked processing"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Import chunker from config
|
|
||||||
# Create LLM-optimized chunks (8K tokens max for safety)
|
|
||||||
from chonkie import OverlapRefinery, RecursiveChunker
|
|
||||||
from langchain_core.prompts import PromptTemplate
|
|
||||||
|
|
||||||
llm_chunker = RecursiveChunker(
|
|
||||||
chunk_size=8000 # Conservative for most LLMs
|
|
||||||
)
|
|
||||||
|
|
||||||
# Apply overlap refinery for context preservation (10% overlap = 800 tokens)
|
|
||||||
overlap_refinery = OverlapRefinery(
|
|
||||||
context_size=0.1, # 10% overlap for context preservation
|
|
||||||
method="suffix", # Add next chunk context to current chunk
|
|
||||||
)
|
|
||||||
|
|
||||||
# First chunk the content, then apply overlap refinery
|
|
||||||
initial_chunks = llm_chunker.chunk(content)
|
|
||||||
chunks = overlap_refinery.refine(initial_chunks)
|
|
||||||
total_chunks = len(chunks)
|
|
||||||
|
|
||||||
logger.info(f"📄 Split into {total_chunks} chunks for LLM processing")
|
|
||||||
|
|
||||||
# Template for chunk processing
|
|
||||||
chunk_template = PromptTemplate(
|
|
||||||
input_variables=["chunk", "chunk_number", "total_chunks"],
|
|
||||||
template="""<INSTRUCTIONS>
|
|
||||||
You are summarizing chunk {chunk_number} of {total_chunks} from a large document.
|
|
||||||
|
|
||||||
Create a comprehensive summary of this document chunk. Focus on:
|
|
||||||
- Key concepts, facts, and information
|
|
||||||
- Important details and context
|
|
||||||
- Main topics and themes
|
|
||||||
|
|
||||||
Provide a clear, structured summary that captures the essential content.
|
|
||||||
|
|
||||||
Chunk {chunk_number}/{total_chunks}:
|
|
||||||
<document_chunk>
|
|
||||||
{chunk}
|
|
||||||
</document_chunk>
|
|
||||||
</INSTRUCTIONS>""",
|
|
||||||
)
|
|
||||||
|
|
||||||
# Process each chunk individually
|
|
||||||
chunk_summaries = []
|
|
||||||
for i, chunk in enumerate(chunks, 1):
|
|
||||||
try:
|
|
||||||
logger.info(
|
|
||||||
f"🔄 Processing chunk {i}/{total_chunks} ({len(chunk.text)} chars)"
|
|
||||||
)
|
|
||||||
|
|
||||||
chunk_chain = chunk_template | llm
|
|
||||||
chunk_result = await chunk_chain.ainvoke(
|
|
||||||
{
|
|
||||||
"chunk": chunk.text,
|
|
||||||
"chunk_number": i,
|
|
||||||
"total_chunks": total_chunks,
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
chunk_summary = chunk_result.content
|
|
||||||
chunk_summaries.append(f"=== Section {i} ===\n{chunk_summary}")
|
|
||||||
|
|
||||||
logger.info(f"✅ Completed chunk {i}/{total_chunks}")
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"❌ Failed to process chunk {i}/{total_chunks}: {e}")
|
|
||||||
chunk_summaries.append(f"=== Section {i} ===\n[Processing failed]")
|
|
||||||
|
|
||||||
# Combine summaries into final document summary
|
|
||||||
logger.info(f"🔄 Combining {len(chunk_summaries)} chunk summaries")
|
|
||||||
|
|
||||||
try:
|
|
||||||
combine_template = PromptTemplate(
|
|
||||||
input_variables=["summaries", "document_title"],
|
|
||||||
template="""<INSTRUCTIONS>
|
|
||||||
You are combining multiple section summaries into a final comprehensive document summary.
|
|
||||||
|
|
||||||
Create a unified, coherent summary from the following section summaries of "{document_title}".
|
|
||||||
Ensure:
|
|
||||||
- Logical flow and organization
|
|
||||||
- No redundancy or repetition
|
|
||||||
- Comprehensive coverage of all key points
|
|
||||||
- Professional, objective tone
|
|
||||||
|
|
||||||
<section_summaries>
|
|
||||||
{summaries}
|
|
||||||
</section_summaries>
|
|
||||||
</INSTRUCTIONS>""",
|
|
||||||
)
|
|
||||||
|
|
||||||
combined_summaries = "\n\n".join(chunk_summaries)
|
|
||||||
combine_chain = combine_template | llm
|
|
||||||
|
|
||||||
final_result = await combine_chain.ainvoke(
|
|
||||||
{"summaries": combined_summaries, "document_title": document_title}
|
|
||||||
)
|
|
||||||
|
|
||||||
final_summary = final_result.content
|
|
||||||
logger.info(
|
|
||||||
f"✅ Large document processing complete: {len(final_summary)} chars summary"
|
|
||||||
)
|
|
||||||
|
|
||||||
return final_summary
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"❌ Failed to combine summaries: {e}")
|
|
||||||
# Fallback: return concatenated chunk summaries
|
|
||||||
fallback_summary = "\n\n".join(chunk_summaries)
|
|
||||||
logger.warning("⚠️ Using fallback combined summary")
|
|
||||||
return fallback_summary
|
|
||||||
|
|
||||||
|
|
||||||
def create_docling_service() -> DoclingService:
|
def create_docling_service() -> DoclingService:
|
||||||
"""Create a Docling service instance."""
|
"""Create a Docling service instance."""
|
||||||
|
|
|
||||||
|
|
@ -9,7 +9,6 @@ from app.utils.document_converters import (
|
||||||
create_document_chunks,
|
create_document_chunks,
|
||||||
embed_text,
|
embed_text,
|
||||||
generate_content_hash,
|
generate_content_hash,
|
||||||
generate_document_summary,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
@ -72,29 +71,11 @@ class DropboxKBSyncService:
|
||||||
)
|
)
|
||||||
content_hash = unique_hash
|
content_hash = unique_hash
|
||||||
|
|
||||||
from app.services.llm_service import get_user_long_context_llm
|
|
||||||
|
|
||||||
user_llm = await get_user_long_context_llm(
|
|
||||||
self.db_session,
|
|
||||||
user_id,
|
|
||||||
search_space_id,
|
|
||||||
disable_streaming=True,
|
|
||||||
)
|
|
||||||
|
|
||||||
doc_metadata_for_summary = {
|
|
||||||
"file_name": file_name,
|
|
||||||
"document_type": "Dropbox File",
|
|
||||||
"connector_type": "Dropbox",
|
|
||||||
}
|
|
||||||
|
|
||||||
if user_llm:
|
summary_content = f"Dropbox File: {file_name}\n\n{indexable_content}"
|
||||||
summary_content, summary_embedding = await generate_document_summary(
|
summary_embedding = embed_text(summary_content)
|
||||||
indexable_content, user_llm, doc_metadata_for_summary
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
logger.warning("No LLM configured — using fallback summary")
|
|
||||||
summary_content = f"Dropbox File: {file_name}\n\n{indexable_content}"
|
|
||||||
summary_embedding = embed_text(summary_content)
|
|
||||||
|
|
||||||
chunks = await create_document_chunks(indexable_content)
|
chunks = await create_document_chunks(indexable_content)
|
||||||
now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||||
|
|
|
||||||
|
|
@ -9,7 +9,6 @@ from app.utils.document_converters import (
|
||||||
create_document_chunks,
|
create_document_chunks,
|
||||||
embed_text,
|
embed_text,
|
||||||
generate_content_hash,
|
generate_content_hash,
|
||||||
generate_document_summary,
|
|
||||||
generate_unique_identifier_hash,
|
generate_unique_identifier_hash,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -78,30 +77,11 @@ class GmailKBSyncService:
|
||||||
)
|
)
|
||||||
content_hash = unique_hash
|
content_hash = unique_hash
|
||||||
|
|
||||||
from app.services.llm_service import get_user_long_context_llm
|
|
||||||
|
|
||||||
user_llm = await get_user_long_context_llm(
|
|
||||||
self.db_session,
|
|
||||||
user_id,
|
|
||||||
search_space_id,
|
|
||||||
disable_streaming=True,
|
|
||||||
)
|
|
||||||
|
|
||||||
doc_metadata_for_summary = {
|
|
||||||
"subject": subject,
|
|
||||||
"sender": sender,
|
|
||||||
"document_type": "Gmail Message",
|
|
||||||
"connector_type": "Gmail",
|
|
||||||
}
|
|
||||||
|
|
||||||
if user_llm:
|
summary_content = f"Gmail Message: {subject}\n\n{indexable_content}"
|
||||||
summary_content, summary_embedding = await generate_document_summary(
|
summary_embedding = await asyncio.to_thread(embed_text, summary_content)
|
||||||
indexable_content, user_llm, doc_metadata_for_summary
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
logger.warning("No LLM configured -- using fallback summary")
|
|
||||||
summary_content = f"Gmail Message: {subject}\n\n{indexable_content}"
|
|
||||||
summary_embedding = await asyncio.to_thread(embed_text, summary_content)
|
|
||||||
|
|
||||||
chunks = await create_document_chunks(indexable_content)
|
chunks = await create_document_chunks(indexable_content)
|
||||||
now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||||
|
|
|
||||||
|
|
@ -19,7 +19,6 @@ from app.utils.document_converters import (
|
||||||
create_document_chunks,
|
create_document_chunks,
|
||||||
embed_text,
|
embed_text,
|
||||||
generate_content_hash,
|
generate_content_hash,
|
||||||
generate_document_summary,
|
|
||||||
generate_unique_identifier_hash,
|
generate_unique_identifier_hash,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -90,33 +89,13 @@ class GoogleCalendarKBSyncService:
|
||||||
)
|
)
|
||||||
content_hash = unique_hash
|
content_hash = unique_hash
|
||||||
|
|
||||||
from app.services.llm_service import get_user_long_context_llm
|
|
||||||
|
|
||||||
user_llm = await get_user_long_context_llm(
|
|
||||||
self.db_session,
|
|
||||||
user_id,
|
summary_content = (
|
||||||
search_space_id,
|
f"Google Calendar Event: {event_summary}\n\n{indexable_content}"
|
||||||
disable_streaming=True,
|
|
||||||
)
|
)
|
||||||
|
summary_embedding = await asyncio.to_thread(embed_text, summary_content)
|
||||||
doc_metadata_for_summary = {
|
|
||||||
"event_summary": event_summary,
|
|
||||||
"start_time": start_time,
|
|
||||||
"end_time": end_time,
|
|
||||||
"document_type": "Google Calendar Event",
|
|
||||||
"connector_type": "Google Calendar",
|
|
||||||
}
|
|
||||||
|
|
||||||
if user_llm:
|
|
||||||
summary_content, summary_embedding = await generate_document_summary(
|
|
||||||
indexable_content, user_llm, doc_metadata_for_summary
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
logger.warning("No LLM configured -- using fallback summary")
|
|
||||||
summary_content = (
|
|
||||||
f"Google Calendar Event: {event_summary}\n\n{indexable_content}"
|
|
||||||
)
|
|
||||||
summary_embedding = await asyncio.to_thread(embed_text, summary_content)
|
|
||||||
|
|
||||||
chunks = await create_document_chunks(indexable_content)
|
chunks = await create_document_chunks(indexable_content)
|
||||||
now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||||
|
|
@ -273,29 +252,13 @@ class GoogleCalendarKBSyncService:
|
||||||
if not indexable_content:
|
if not indexable_content:
|
||||||
return {"status": "error", "message": "Event produced empty content"}
|
return {"status": "error", "message": "Event produced empty content"}
|
||||||
|
|
||||||
from app.services.llm_service import get_user_long_context_llm
|
|
||||||
|
|
||||||
user_llm = await get_user_long_context_llm(
|
|
||||||
self.db_session, user_id, search_space_id, disable_streaming=True
|
|
||||||
|
summary_content = (
|
||||||
|
f"Google Calendar Event: {event_summary}\n\n{indexable_content}"
|
||||||
)
|
)
|
||||||
|
summary_embedding = await asyncio.to_thread(embed_text, summary_content)
|
||||||
doc_metadata_for_summary = {
|
|
||||||
"event_summary": event_summary,
|
|
||||||
"start_time": start_time,
|
|
||||||
"end_time": end_time,
|
|
||||||
"document_type": "Google Calendar Event",
|
|
||||||
"connector_type": "Google Calendar",
|
|
||||||
}
|
|
||||||
|
|
||||||
if user_llm:
|
|
||||||
summary_content, summary_embedding = await generate_document_summary(
|
|
||||||
indexable_content, user_llm, doc_metadata_for_summary
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
summary_content = (
|
|
||||||
f"Google Calendar Event: {event_summary}\n\n{indexable_content}"
|
|
||||||
)
|
|
||||||
summary_embedding = await asyncio.to_thread(embed_text, summary_content)
|
|
||||||
|
|
||||||
chunks = await create_document_chunks(indexable_content)
|
chunks = await create_document_chunks(indexable_content)
|
||||||
now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||||
|
|
|
||||||
|
|
@ -8,7 +8,6 @@ from app.utils.document_converters import (
|
||||||
create_document_chunks,
|
create_document_chunks,
|
||||||
embed_text,
|
embed_text,
|
||||||
generate_content_hash,
|
generate_content_hash,
|
||||||
generate_document_summary,
|
|
||||||
generate_unique_identifier_hash,
|
generate_unique_identifier_hash,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -74,32 +73,13 @@ class GoogleDriveKBSyncService:
|
||||||
)
|
)
|
||||||
content_hash = unique_hash
|
content_hash = unique_hash
|
||||||
|
|
||||||
from app.services.llm_service import get_user_long_context_llm
|
|
||||||
|
|
||||||
user_llm = await get_user_long_context_llm(
|
|
||||||
self.db_session,
|
|
||||||
user_id,
|
summary_content = (
|
||||||
search_space_id,
|
f"Google Drive File: {file_name}\n\n{indexable_content}"
|
||||||
disable_streaming=True,
|
|
||||||
)
|
)
|
||||||
|
summary_embedding = embed_text(summary_content)
|
||||||
doc_metadata_for_summary = {
|
|
||||||
"file_name": file_name,
|
|
||||||
"mime_type": mime_type,
|
|
||||||
"document_type": "Google Drive File",
|
|
||||||
"connector_type": "Google Drive",
|
|
||||||
}
|
|
||||||
|
|
||||||
if user_llm:
|
|
||||||
summary_content, summary_embedding = await generate_document_summary(
|
|
||||||
indexable_content, user_llm, doc_metadata_for_summary
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
logger.warning("No LLM configured — using fallback summary")
|
|
||||||
summary_content = (
|
|
||||||
f"Google Drive File: {file_name}\n\n{indexable_content}"
|
|
||||||
)
|
|
||||||
summary_embedding = embed_text(summary_content)
|
|
||||||
|
|
||||||
chunks = await create_document_chunks(indexable_content)
|
chunks = await create_document_chunks(indexable_content)
|
||||||
now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||||
|
|
|
||||||
|
|
@ -9,7 +9,6 @@ from app.utils.document_converters import (
|
||||||
create_document_chunks,
|
create_document_chunks,
|
||||||
embed_text,
|
embed_text,
|
||||||
generate_content_hash,
|
generate_content_hash,
|
||||||
generate_document_summary,
|
|
||||||
generate_unique_identifier_hash,
|
generate_unique_identifier_hash,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -84,32 +83,13 @@ class LinearKBSyncService:
|
||||||
)
|
)
|
||||||
content_hash = unique_hash
|
content_hash = unique_hash
|
||||||
|
|
||||||
from app.services.llm_service import get_user_long_context_llm
|
|
||||||
|
|
||||||
user_llm = await get_user_long_context_llm(
|
|
||||||
self.db_session,
|
|
||||||
user_id,
|
summary_content = (
|
||||||
search_space_id,
|
f"Linear Issue {issue_identifier}: {issue_title}\n\n{issue_content}"
|
||||||
disable_streaming=True,
|
|
||||||
)
|
)
|
||||||
|
summary_embedding = embed_text(summary_content)
|
||||||
doc_metadata_for_summary = {
|
|
||||||
"issue_id": issue_identifier,
|
|
||||||
"issue_title": issue_title,
|
|
||||||
"document_type": "Linear Issue",
|
|
||||||
"connector_type": "Linear",
|
|
||||||
}
|
|
||||||
|
|
||||||
if user_llm:
|
|
||||||
summary_content, summary_embedding = await generate_document_summary(
|
|
||||||
issue_content, user_llm, doc_metadata_for_summary
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
logger.warning("No LLM configured — using fallback summary")
|
|
||||||
summary_content = (
|
|
||||||
f"Linear Issue {issue_identifier}: {issue_title}\n\n{issue_content}"
|
|
||||||
)
|
|
||||||
summary_embedding = embed_text(summary_content)
|
|
||||||
|
|
||||||
chunks = await create_document_chunks(issue_content)
|
chunks = await create_document_chunks(issue_content)
|
||||||
now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||||
|
|
@ -227,30 +207,12 @@ class LinearKBSyncService:
|
||||||
comment_count = len(formatted_issue.get("comments", []))
|
comment_count = len(formatted_issue.get("comments", []))
|
||||||
formatted_issue.get("description", "")
|
formatted_issue.get("description", "")
|
||||||
|
|
||||||
from app.services.llm_service import get_user_long_context_llm
|
|
||||||
|
|
||||||
user_llm = await get_user_long_context_llm(
|
|
||||||
self.db_session, user_id, search_space_id, disable_streaming=True
|
summary_content = (
|
||||||
|
f"Linear Issue {issue_identifier}: {issue_title}\n\n{issue_content}"
|
||||||
)
|
)
|
||||||
|
summary_embedding = embed_text(summary_content)
|
||||||
if user_llm:
|
|
||||||
document_metadata_for_summary = {
|
|
||||||
"issue_id": issue_identifier,
|
|
||||||
"issue_title": issue_title,
|
|
||||||
"state": state,
|
|
||||||
"priority": priority,
|
|
||||||
"comment_count": comment_count,
|
|
||||||
"document_type": "Linear Issue",
|
|
||||||
"connector_type": "Linear",
|
|
||||||
}
|
|
||||||
summary_content, summary_embedding = await generate_document_summary(
|
|
||||||
issue_content, user_llm, document_metadata_for_summary
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
summary_content = (
|
|
||||||
f"Linear Issue {issue_identifier}: {issue_title}\n\n{issue_content}"
|
|
||||||
)
|
|
||||||
summary_embedding = embed_text(summary_content)
|
|
||||||
|
|
||||||
chunks = await create_document_chunks(issue_content)
|
chunks = await create_document_chunks(issue_content)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -68,7 +68,6 @@ def _is_interactive_auth_provider(
|
||||||
|
|
||||||
class LLMRole:
|
class LLMRole:
|
||||||
AGENT = "agent" # For agent/chat operations
|
AGENT = "agent" # For agent/chat operations
|
||||||
DOCUMENT_SUMMARY = "document_summary" # For document summarization
|
|
||||||
|
|
||||||
|
|
||||||
def get_global_llm_config(llm_config_id: int) -> dict | None:
|
def get_global_llm_config(llm_config_id: int) -> dict | None:
|
||||||
|
|
@ -266,7 +265,7 @@ async def get_search_space_llm_instance(
|
||||||
Args:
|
Args:
|
||||||
session: Database session
|
session: Database session
|
||||||
search_space_id: Search Space ID
|
search_space_id: Search Space ID
|
||||||
role: LLM role ('agent' or 'document_summary')
|
role: LLM role ('agent')
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
ChatLiteLLM or ChatLiteLLMRouter instance, or None if not found
|
ChatLiteLLM or ChatLiteLLMRouter instance, or None if not found
|
||||||
|
|
@ -283,11 +282,8 @@ async def get_search_space_llm_instance(
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# Get the appropriate LLM config ID based on role
|
# Get the appropriate LLM config ID based on role
|
||||||
llm_config_id = None
|
|
||||||
if role == LLMRole.AGENT:
|
if role == LLMRole.AGENT:
|
||||||
llm_config_id = search_space.agent_llm_id
|
llm_config_id = search_space.agent_llm_id
|
||||||
elif role == LLMRole.DOCUMENT_SUMMARY:
|
|
||||||
llm_config_id = search_space.document_summary_llm_id
|
|
||||||
else:
|
else:
|
||||||
logger.error(f"Invalid LLM role: {role}")
|
logger.error(f"Invalid LLM role: {role}")
|
||||||
return None
|
return None
|
||||||
|
|
@ -470,20 +466,13 @@ async def get_search_space_llm_instance(
|
||||||
|
|
||||||
|
|
||||||
async def get_agent_llm(
|
async def get_agent_llm(
|
||||||
session: AsyncSession, search_space_id: int
|
|
||||||
) -> ChatLiteLLM | ChatLiteLLMRouter | None:
|
|
||||||
"""Get the search space's agent LLM instance for chat operations."""
|
|
||||||
return await get_search_space_llm_instance(session, search_space_id, LLMRole.AGENT)
|
|
||||||
|
|
||||||
|
|
||||||
async def get_document_summary_llm(
|
|
||||||
session: AsyncSession, search_space_id: int, disable_streaming: bool = False
|
session: AsyncSession, search_space_id: int, disable_streaming: bool = False
|
||||||
) -> ChatLiteLLM | ChatLiteLLMRouter | None:
|
) -> ChatLiteLLM | ChatLiteLLMRouter | None:
|
||||||
"""Get the search space's document summary LLM instance."""
|
"""Get the search space's agent LLM instance for chat operations."""
|
||||||
return await get_search_space_llm_instance(
|
return await get_search_space_llm_instance(
|
||||||
session,
|
session,
|
||||||
search_space_id,
|
search_space_id,
|
||||||
LLMRole.DOCUMENT_SUMMARY,
|
LLMRole.AGENT,
|
||||||
disable_streaming=disable_streaming,
|
disable_streaming=disable_streaming,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -645,22 +634,6 @@ async def get_vision_llm(
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
# Backward-compatible alias (LLM preferences are now per-search-space, not per-user)
|
|
||||||
async def get_user_long_context_llm(
|
|
||||||
session: AsyncSession,
|
|
||||||
user_id: str,
|
|
||||||
search_space_id: int,
|
|
||||||
disable_streaming: bool = False,
|
|
||||||
) -> ChatLiteLLM | ChatLiteLLMRouter | None:
|
|
||||||
"""
|
|
||||||
Deprecated: Use get_document_summary_llm instead.
|
|
||||||
The user_id parameter is ignored as LLM preferences are now per-search-space.
|
|
||||||
"""
|
|
||||||
return await get_document_summary_llm(
|
|
||||||
session, search_space_id, disable_streaming=disable_streaming
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def get_planner_llm() -> ChatLiteLLM | None:
|
def get_planner_llm() -> ChatLiteLLM | None:
|
||||||
"""Return a planner LLM instance from the first global config marked
|
"""Return a planner LLM instance from the first global config marked
|
||||||
``is_planner: true``, or ``None`` if no planner config is defined.
|
``is_planner: true``, or ``None`` if no planner config is defined.
|
||||||
|
|
|
||||||
|
|
@ -8,7 +8,6 @@ from app.utils.document_converters import (
|
||||||
create_document_chunks,
|
create_document_chunks,
|
||||||
embed_text,
|
embed_text,
|
||||||
generate_content_hash,
|
generate_content_hash,
|
||||||
generate_document_summary,
|
|
||||||
generate_unique_identifier_hash,
|
generate_unique_identifier_hash,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -73,30 +72,11 @@ class NotionKBSyncService:
|
||||||
)
|
)
|
||||||
content_hash = unique_hash
|
content_hash = unique_hash
|
||||||
|
|
||||||
from app.services.llm_service import get_user_long_context_llm
|
|
||||||
|
|
||||||
user_llm = await get_user_long_context_llm(
|
|
||||||
self.db_session,
|
|
||||||
user_id,
|
|
||||||
search_space_id,
|
|
||||||
disable_streaming=True,
|
|
||||||
)
|
|
||||||
|
|
||||||
doc_metadata_for_summary = {
|
|
||||||
"page_title": page_title,
|
|
||||||
"page_id": page_id,
|
|
||||||
"document_type": "Notion Page",
|
|
||||||
"connector_type": "Notion",
|
|
||||||
}
|
|
||||||
|
|
||||||
if user_llm:
|
summary_content = f"Notion Page: {page_title}\n\n{markdown_content}"
|
||||||
summary_content, summary_embedding = await generate_document_summary(
|
summary_embedding = embed_text(summary_content)
|
||||||
markdown_content, user_llm, doc_metadata_for_summary
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
logger.warning("No LLM configured — using fallback summary")
|
|
||||||
summary_content = f"Notion Page: {page_title}\n\n{markdown_content}"
|
|
||||||
summary_embedding = embed_text(summary_content)
|
|
||||||
|
|
||||||
chunks = await create_document_chunks(markdown_content)
|
chunks = await create_document_chunks(markdown_content)
|
||||||
now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||||
|
|
@ -245,31 +225,11 @@ class NotionKBSyncService:
|
||||||
f"Final content length: {len(full_content)} chars, verified={content_verified}"
|
f"Final content length: {len(full_content)} chars, verified={content_verified}"
|
||||||
)
|
)
|
||||||
|
|
||||||
from app.services.llm_service import get_user_long_context_llm
|
|
||||||
|
|
||||||
logger.debug("Generating summary and embeddings")
|
logger.debug("Generating summary and embeddings")
|
||||||
user_llm = await get_user_long_context_llm(
|
|
||||||
self.db_session,
|
|
||||||
user_id,
|
|
||||||
search_space_id,
|
|
||||||
disable_streaming=True, # disable streaming to avoid leaking into the chat
|
|
||||||
)
|
|
||||||
|
|
||||||
if user_llm:
|
summary_content = f"Notion Page: {document.document_metadata.get('page_title')}\n\n{full_content}"
|
||||||
document_metadata_for_summary = {
|
summary_embedding = embed_text(summary_content)
|
||||||
"page_title": document.document_metadata.get("page_title"),
|
|
||||||
"page_id": document.document_metadata.get("page_id"),
|
|
||||||
"document_type": "Notion Page",
|
|
||||||
"connector_type": "Notion",
|
|
||||||
}
|
|
||||||
summary_content, summary_embedding = await generate_document_summary(
|
|
||||||
full_content, user_llm, document_metadata_for_summary
|
|
||||||
)
|
|
||||||
logger.debug(f"Generated summary length: {len(summary_content)} chars")
|
|
||||||
else:
|
|
||||||
logger.warning("No LLM configured - using fallback summary")
|
|
||||||
summary_content = f"Notion Page: {document.document_metadata.get('page_title')}\n\n{full_content}"
|
|
||||||
summary_embedding = embed_text(summary_content)
|
|
||||||
|
|
||||||
logger.debug("Creating new chunks")
|
logger.debug("Creating new chunks")
|
||||||
chunks = await create_document_chunks(full_content)
|
chunks = await create_document_chunks(full_content)
|
||||||
|
|
|
||||||
|
|
@ -233,18 +233,6 @@ async def _resolve_attachment_vision_llm(
|
||||||
return await get_vision_llm(session, search_space_id)
|
return await get_vision_llm(session, search_space_id)
|
||||||
|
|
||||||
|
|
||||||
async def _resolve_summary_llm(
|
|
||||||
session: AsyncSession, *, user_id: str, search_space_id: int, should_summarize: bool
|
|
||||||
):
|
|
||||||
"""Fetch summary LLM only when indexing summary is enabled."""
|
|
||||||
if not should_summarize:
|
|
||||||
return None
|
|
||||||
|
|
||||||
from app.services.llm_service import get_user_long_context_llm
|
|
||||||
|
|
||||||
return await get_user_long_context_llm(session, user_id, search_space_id)
|
|
||||||
|
|
||||||
|
|
||||||
def _require_extracted_attachment_content(
|
def _require_extracted_attachment_content(
|
||||||
*, content: str, etl_meta: dict[str, Any], path: str
|
*, content: str, etl_meta: dict[str, Any], path: str
|
||||||
) -> str:
|
) -> str:
|
||||||
|
|
@ -349,13 +337,6 @@ async def upsert_note(
|
||||||
path=payload.path,
|
path=payload.path,
|
||||||
)
|
)
|
||||||
|
|
||||||
llm = await _resolve_summary_llm(
|
|
||||||
session,
|
|
||||||
user_id=str(user_id),
|
|
||||||
search_space_id=search_space_id,
|
|
||||||
should_summarize=connector.enable_summary,
|
|
||||||
)
|
|
||||||
|
|
||||||
document_string = _build_document_string(
|
document_string = _build_document_string(
|
||||||
payload, vault_name, content_override=content_for_index
|
payload, vault_name, content_override=content_for_index
|
||||||
)
|
)
|
||||||
|
|
@ -374,8 +355,6 @@ async def upsert_note(
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
connector_id=connector.id,
|
connector_id=connector.id,
|
||||||
created_by_id=str(user_id),
|
created_by_id=str(user_id),
|
||||||
should_summarize=connector.enable_summary,
|
|
||||||
fallback_summary=f"Obsidian Note: {payload.name}\n\n{content_for_index}",
|
|
||||||
metadata=metadata,
|
metadata=metadata,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -388,7 +367,7 @@ async def upsert_note(
|
||||||
|
|
||||||
document = prepared[0]
|
document = prepared[0]
|
||||||
|
|
||||||
return await pipeline.index(document, connector_doc, llm)
|
return await pipeline.index(document, connector_doc)
|
||||||
|
|
||||||
|
|
||||||
async def rename_note(
|
async def rename_note(
|
||||||
|
|
|
||||||
|
|
@ -10,7 +10,6 @@ from app.utils.document_converters import (
|
||||||
create_document_chunks,
|
create_document_chunks,
|
||||||
embed_text,
|
embed_text,
|
||||||
generate_content_hash,
|
generate_content_hash,
|
||||||
generate_document_summary,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
@ -73,30 +72,11 @@ class OneDriveKBSyncService:
|
||||||
)
|
)
|
||||||
content_hash = unique_hash
|
content_hash = unique_hash
|
||||||
|
|
||||||
from app.services.llm_service import get_user_long_context_llm
|
|
||||||
|
|
||||||
user_llm = await get_user_long_context_llm(
|
|
||||||
self.db_session,
|
|
||||||
user_id,
|
|
||||||
search_space_id,
|
|
||||||
disable_streaming=True,
|
|
||||||
)
|
|
||||||
|
|
||||||
doc_metadata_for_summary = {
|
|
||||||
"file_name": file_name,
|
|
||||||
"mime_type": mime_type,
|
|
||||||
"document_type": "OneDrive File",
|
|
||||||
"connector_type": "OneDrive",
|
|
||||||
}
|
|
||||||
|
|
||||||
if user_llm:
|
summary_content = f"OneDrive File: {file_name}\n\n{indexable_content}"
|
||||||
summary_content, summary_embedding = await generate_document_summary(
|
summary_embedding = await asyncio.to_thread(embed_text, summary_content)
|
||||||
indexable_content, user_llm, doc_metadata_for_summary
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
logger.warning("No LLM configured — using fallback summary")
|
|
||||||
summary_content = f"OneDrive File: {file_name}\n\n{indexable_content}"
|
|
||||||
summary_embedding = await asyncio.to_thread(embed_text, summary_content)
|
|
||||||
|
|
||||||
chunks = await create_document_chunks(indexable_content)
|
chunks = await create_document_chunks(indexable_content)
|
||||||
now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
now_str = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
|
||||||
|
|
|
||||||
|
|
@ -18,7 +18,6 @@ class TaskDispatcher(Protocol):
|
||||||
filename: str,
|
filename: str,
|
||||||
search_space_id: int,
|
search_space_id: int,
|
||||||
user_id: str,
|
user_id: str,
|
||||||
should_summarize: bool = False,
|
|
||||||
use_vision_llm: bool = False,
|
use_vision_llm: bool = False,
|
||||||
processing_mode: str = "basic",
|
processing_mode: str = "basic",
|
||||||
) -> None: ...
|
) -> None: ...
|
||||||
|
|
@ -35,7 +34,6 @@ class CeleryTaskDispatcher:
|
||||||
filename: str,
|
filename: str,
|
||||||
search_space_id: int,
|
search_space_id: int,
|
||||||
user_id: str,
|
user_id: str,
|
||||||
should_summarize: bool = False,
|
|
||||||
use_vision_llm: bool = False,
|
use_vision_llm: bool = False,
|
||||||
processing_mode: str = "basic",
|
processing_mode: str = "basic",
|
||||||
) -> None:
|
) -> None:
|
||||||
|
|
@ -49,7 +47,6 @@ class CeleryTaskDispatcher:
|
||||||
filename=filename,
|
filename=filename,
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
should_summarize=should_summarize,
|
|
||||||
use_vision_llm=use_vision_llm,
|
use_vision_llm=use_vision_llm,
|
||||||
processing_mode=processing_mode,
|
processing_mode=processing_mode,
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -9,7 +9,6 @@ from sqlalchemy.orm import selectinload
|
||||||
from app.celery_app import celery_app
|
from app.celery_app import celery_app
|
||||||
from app.db import Document
|
from app.db import Document
|
||||||
from app.indexing_pipeline.adapters.file_upload_adapter import UploadDocumentAdapter
|
from app.indexing_pipeline.adapters.file_upload_adapter import UploadDocumentAdapter
|
||||||
from app.services.llm_service import get_user_long_context_llm
|
|
||||||
from app.services.task_logging_service import TaskLoggingService
|
from app.services.task_logging_service import TaskLoggingService
|
||||||
from app.tasks.celery_tasks import get_celery_session_maker, run_async_celery_task
|
from app.tasks.celery_tasks import get_celery_session_maker, run_async_celery_task
|
||||||
|
|
||||||
|
|
@ -68,12 +67,8 @@ async def _reindex_document(document_id: int, user_id: str):
|
||||||
|
|
||||||
logger.info(f"Reindexing document {document_id} ({document.title})")
|
logger.info(f"Reindexing document {document_id} ({document.title})")
|
||||||
|
|
||||||
user_llm = await get_user_long_context_llm(
|
|
||||||
session, user_id, document.search_space_id
|
|
||||||
)
|
|
||||||
|
|
||||||
adapter = UploadDocumentAdapter(session)
|
adapter = UploadDocumentAdapter(session)
|
||||||
await adapter.reindex(document=document, llm=user_llm)
|
await adapter.reindex(document=document)
|
||||||
|
|
||||||
await task_logger.log_task_success(
|
await task_logger.log_task_success(
|
||||||
log_entry,
|
log_entry,
|
||||||
|
|
|
||||||
|
|
@ -765,7 +765,6 @@ def process_file_upload_with_document_task(
|
||||||
filename: str,
|
filename: str,
|
||||||
search_space_id: int,
|
search_space_id: int,
|
||||||
user_id: str,
|
user_id: str,
|
||||||
should_summarize: bool = False,
|
|
||||||
use_vision_llm: bool = False,
|
use_vision_llm: bool = False,
|
||||||
processing_mode: str = "basic",
|
processing_mode: str = "basic",
|
||||||
):
|
):
|
||||||
|
|
@ -782,7 +781,6 @@ def process_file_upload_with_document_task(
|
||||||
filename: Original filename
|
filename: Original filename
|
||||||
search_space_id: ID of the search space
|
search_space_id: ID of the search space
|
||||||
user_id: ID of the user
|
user_id: ID of the user
|
||||||
should_summarize: Whether to generate an LLM summary
|
|
||||||
"""
|
"""
|
||||||
import traceback
|
import traceback
|
||||||
|
|
||||||
|
|
@ -814,7 +812,6 @@ def process_file_upload_with_document_task(
|
||||||
filename,
|
filename,
|
||||||
search_space_id,
|
search_space_id,
|
||||||
user_id,
|
user_id,
|
||||||
should_summarize=should_summarize,
|
|
||||||
use_vision_llm=use_vision_llm,
|
use_vision_llm=use_vision_llm,
|
||||||
processing_mode=processing_mode,
|
processing_mode=processing_mode,
|
||||||
)
|
)
|
||||||
|
|
@ -850,7 +847,6 @@ async def _process_file_with_document(
|
||||||
filename: str,
|
filename: str,
|
||||||
search_space_id: int,
|
search_space_id: int,
|
||||||
user_id: str,
|
user_id: str,
|
||||||
should_summarize: bool = False,
|
|
||||||
use_vision_llm: bool = False,
|
use_vision_llm: bool = False,
|
||||||
processing_mode: str = "basic",
|
processing_mode: str = "basic",
|
||||||
):
|
):
|
||||||
|
|
@ -954,7 +950,6 @@ async def _process_file_with_document(
|
||||||
task_logger=task_logger,
|
task_logger=task_logger,
|
||||||
log_entry=log_entry,
|
log_entry=log_entry,
|
||||||
notification=notification,
|
notification=notification,
|
||||||
should_summarize=should_summarize,
|
|
||||||
use_vision_llm=use_vision_llm,
|
use_vision_llm=use_vision_llm,
|
||||||
processing_mode=processing_mode,
|
processing_mode=processing_mode,
|
||||||
)
|
)
|
||||||
|
|
@ -1258,7 +1253,6 @@ def index_local_folder_task(
|
||||||
exclude_patterns: list[str] | None = None,
|
exclude_patterns: list[str] | None = None,
|
||||||
file_extensions: list[str] | None = None,
|
file_extensions: list[str] | None = None,
|
||||||
root_folder_id: int | None = None,
|
root_folder_id: int | None = None,
|
||||||
enable_summary: bool = False,
|
|
||||||
target_file_paths: list[str] | None = None,
|
target_file_paths: list[str] | None = None,
|
||||||
):
|
):
|
||||||
"""Celery task to index a local folder. Config is passed directly — no connector row."""
|
"""Celery task to index a local folder. Config is passed directly — no connector row."""
|
||||||
|
|
@ -1271,7 +1265,6 @@ def index_local_folder_task(
|
||||||
exclude_patterns=exclude_patterns,
|
exclude_patterns=exclude_patterns,
|
||||||
file_extensions=file_extensions,
|
file_extensions=file_extensions,
|
||||||
root_folder_id=root_folder_id,
|
root_folder_id=root_folder_id,
|
||||||
enable_summary=enable_summary,
|
|
||||||
target_file_paths=target_file_paths,
|
target_file_paths=target_file_paths,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
@ -1285,7 +1278,6 @@ async def _index_local_folder_async(
|
||||||
exclude_patterns: list[str] | None = None,
|
exclude_patterns: list[str] | None = None,
|
||||||
file_extensions: list[str] | None = None,
|
file_extensions: list[str] | None = None,
|
||||||
root_folder_id: int | None = None,
|
root_folder_id: int | None = None,
|
||||||
enable_summary: bool = False,
|
|
||||||
target_file_paths: list[str] | None = None,
|
target_file_paths: list[str] | None = None,
|
||||||
):
|
):
|
||||||
"""Run local folder indexing with notification + heartbeat."""
|
"""Run local folder indexing with notification + heartbeat."""
|
||||||
|
|
@ -1343,8 +1335,7 @@ async def _index_local_folder_async(
|
||||||
exclude_patterns=exclude_patterns,
|
exclude_patterns=exclude_patterns,
|
||||||
file_extensions=file_extensions,
|
file_extensions=file_extensions,
|
||||||
root_folder_id=root_folder_id,
|
root_folder_id=root_folder_id,
|
||||||
enable_summary=enable_summary,
|
target_file_paths=target_file_paths,
|
||||||
target_file_paths=target_file_paths,
|
|
||||||
on_heartbeat_callback=_heartbeat_progress
|
on_heartbeat_callback=_heartbeat_progress
|
||||||
if (is_batch or is_full_scan)
|
if (is_batch or is_full_scan)
|
||||||
else None,
|
else None,
|
||||||
|
|
@ -1400,7 +1391,6 @@ def index_uploaded_folder_files_task(
|
||||||
user_id: str,
|
user_id: str,
|
||||||
folder_name: str,
|
folder_name: str,
|
||||||
root_folder_id: int,
|
root_folder_id: int,
|
||||||
enable_summary: bool,
|
|
||||||
file_mappings: list[dict],
|
file_mappings: list[dict],
|
||||||
use_vision_llm: bool = False,
|
use_vision_llm: bool = False,
|
||||||
processing_mode: str = "basic",
|
processing_mode: str = "basic",
|
||||||
|
|
@ -1412,7 +1402,6 @@ def index_uploaded_folder_files_task(
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
folder_name=folder_name,
|
folder_name=folder_name,
|
||||||
root_folder_id=root_folder_id,
|
root_folder_id=root_folder_id,
|
||||||
enable_summary=enable_summary,
|
|
||||||
file_mappings=file_mappings,
|
file_mappings=file_mappings,
|
||||||
use_vision_llm=use_vision_llm,
|
use_vision_llm=use_vision_llm,
|
||||||
processing_mode=processing_mode,
|
processing_mode=processing_mode,
|
||||||
|
|
@ -1425,7 +1414,6 @@ async def _index_uploaded_folder_files_async(
|
||||||
user_id: str,
|
user_id: str,
|
||||||
folder_name: str,
|
folder_name: str,
|
||||||
root_folder_id: int,
|
root_folder_id: int,
|
||||||
enable_summary: bool,
|
|
||||||
file_mappings: list[dict],
|
file_mappings: list[dict],
|
||||||
use_vision_llm: bool = False,
|
use_vision_llm: bool = False,
|
||||||
processing_mode: str = "basic",
|
processing_mode: str = "basic",
|
||||||
|
|
@ -1475,8 +1463,7 @@ async def _index_uploaded_folder_files_async(
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
folder_name=folder_name,
|
folder_name=folder_name,
|
||||||
root_folder_id=root_folder_id,
|
root_folder_id=root_folder_id,
|
||||||
enable_summary=enable_summary,
|
file_mappings=file_mappings,
|
||||||
file_mappings=file_mappings,
|
|
||||||
on_heartbeat_callback=_heartbeat_progress,
|
on_heartbeat_callback=_heartbeat_progress,
|
||||||
use_vision_llm=use_vision_llm,
|
use_vision_llm=use_vision_llm,
|
||||||
processing_mode=processing_mode,
|
processing_mode=processing_mode,
|
||||||
|
|
@ -1563,12 +1550,10 @@ async def _ai_sort_search_space_async(search_space_id: int, user_id: str):
|
||||||
t_start = time.perf_counter()
|
t_start = time.perf_counter()
|
||||||
try:
|
try:
|
||||||
from app.services.ai_file_sort_service import ai_sort_all_documents
|
from app.services.ai_file_sort_service import ai_sort_all_documents
|
||||||
from app.services.llm_service import get_document_summary_llm
|
from app.services.llm_service import get_agent_llm
|
||||||
|
|
||||||
async with get_celery_session_maker()() as session:
|
async with get_celery_session_maker()() as session:
|
||||||
llm = await get_document_summary_llm(
|
llm = await get_agent_llm(session, search_space_id, disable_streaming=True)
|
||||||
session, search_space_id, disable_streaming=True
|
|
||||||
)
|
|
||||||
if llm is None:
|
if llm is None:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
"No LLM configured for search_space=%d, skipping AI sort",
|
"No LLM configured for search_space=%d, skipping AI sort",
|
||||||
|
|
@ -1604,7 +1589,7 @@ def ai_sort_document_task(self, search_space_id: int, user_id: str, document_id:
|
||||||
async def _ai_sort_document_async(search_space_id: int, user_id: str, document_id: int):
|
async def _ai_sort_document_async(search_space_id: int, user_id: str, document_id: int):
|
||||||
from app.db import Document
|
from app.db import Document
|
||||||
from app.services.ai_file_sort_service import ai_sort_document
|
from app.services.ai_file_sort_service import ai_sort_document
|
||||||
from app.services.llm_service import get_document_summary_llm
|
from app.services.llm_service import get_agent_llm
|
||||||
|
|
||||||
async with get_celery_session_maker()() as session:
|
async with get_celery_session_maker()() as session:
|
||||||
document = await session.get(Document, document_id)
|
document = await session.get(Document, document_id)
|
||||||
|
|
@ -1612,9 +1597,7 @@ async def _ai_sort_document_async(search_space_id: int, user_id: str, document_i
|
||||||
logger.warning("Document %d not found, skipping AI sort", document_id)
|
logger.warning("Document %d not found, skipping AI sort", document_id)
|
||||||
return
|
return
|
||||||
|
|
||||||
llm = await get_document_summary_llm(
|
llm = await get_agent_llm(session, search_space_id, disable_streaming=True)
|
||||||
session, search_space_id, disable_streaming=True
|
|
||||||
)
|
|
||||||
if llm is None:
|
if llm is None:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
"No LLM for search_space=%d, skipping AI sort of doc=%d",
|
"No LLM for search_space=%d, skipping AI sort of doc=%d",
|
||||||
|
|
|
||||||
|
|
@ -62,6 +62,7 @@ async def build_new_chat_input_state(
|
||||||
user_image_data_urls: list[str] | None,
|
user_image_data_urls: list[str] | None,
|
||||||
mentioned_document_ids: list[int] | None,
|
mentioned_document_ids: list[int] | None,
|
||||||
mentioned_folder_ids: list[int] | None,
|
mentioned_folder_ids: list[int] | None,
|
||||||
|
mentioned_connectors: list[dict[str, Any]] | None,
|
||||||
mentioned_documents: list[dict[str, Any]] | None,
|
mentioned_documents: list[dict[str, Any]] | None,
|
||||||
needs_history_bootstrap: bool,
|
needs_history_bootstrap: bool,
|
||||||
thread_visibility: ChatVisibility,
|
thread_visibility: ChatVisibility,
|
||||||
|
|
@ -110,6 +111,7 @@ async def build_new_chat_input_state(
|
||||||
|
|
||||||
final_query = _render_query_with_context(
|
final_query = _render_query_with_context(
|
||||||
agent_user_query=agent_user_query,
|
agent_user_query=agent_user_query,
|
||||||
|
mentioned_connectors=mentioned_connectors,
|
||||||
recent_reports=recent_reports,
|
recent_reports=recent_reports,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -196,11 +198,16 @@ async def _resolve_mentions_for_query(
|
||||||
def _render_query_with_context(
|
def _render_query_with_context(
|
||||||
*,
|
*,
|
||||||
agent_user_query: str,
|
agent_user_query: str,
|
||||||
|
mentioned_connectors: list[dict[str, Any]] | None,
|
||||||
recent_reports: list[Report],
|
recent_reports: list[Report],
|
||||||
) -> str:
|
) -> str:
|
||||||
"""Prepend recent-reports XML block to the user query."""
|
"""Prepend connector/report XML context blocks to the user query."""
|
||||||
context_parts: list[str] = []
|
context_parts: list[str] = []
|
||||||
|
|
||||||
|
connector_context = _render_mentioned_connectors(mentioned_connectors)
|
||||||
|
if connector_context:
|
||||||
|
context_parts.append(connector_context)
|
||||||
|
|
||||||
if recent_reports:
|
if recent_reports:
|
||||||
report_lines: list[str] = []
|
report_lines: list[str] = []
|
||||||
for r in recent_reports:
|
for r in recent_reports:
|
||||||
|
|
@ -225,3 +232,40 @@ def _render_query_with_context(
|
||||||
return f"{context}\n\n<user_query>{agent_user_query}</user_query>"
|
return f"{context}\n\n<user_query>{agent_user_query}</user_query>"
|
||||||
|
|
||||||
return agent_user_query
|
return agent_user_query
|
||||||
|
|
||||||
|
|
||||||
|
def _render_mentioned_connectors(
|
||||||
|
mentioned_connectors: list[dict[str, Any]] | None,
|
||||||
|
) -> str | None:
|
||||||
|
"""Render selected connector account metadata for connector-backed tools."""
|
||||||
|
if not mentioned_connectors:
|
||||||
|
return None
|
||||||
|
|
||||||
|
connector_lines: list[str] = []
|
||||||
|
for connector in mentioned_connectors:
|
||||||
|
if not isinstance(connector, dict):
|
||||||
|
continue
|
||||||
|
connector_id = connector.get("id")
|
||||||
|
connector_type = connector.get("connector_type") or connector.get(
|
||||||
|
"document_type"
|
||||||
|
)
|
||||||
|
account_name = connector.get("account_name") or connector.get("title")
|
||||||
|
if connector_id is None or connector_type is None:
|
||||||
|
continue
|
||||||
|
connector_lines.append(
|
||||||
|
f' - connector_id={connector_id}, connector_type="{connector_type}", '
|
||||||
|
f'account_name="{account_name or ""}"'
|
||||||
|
)
|
||||||
|
|
||||||
|
if not connector_lines:
|
||||||
|
return None
|
||||||
|
|
||||||
|
return (
|
||||||
|
"<mentioned_connectors>\n"
|
||||||
|
"The user selected these exact connector accounts with @. "
|
||||||
|
"These entries are selection metadata, not retrieved connector content. "
|
||||||
|
"When a connector-backed tool needs an account, use the matching "
|
||||||
|
"connector_id from this list if the tool supports connector_id:\n"
|
||||||
|
+ "\n".join(connector_lines)
|
||||||
|
+ "\n</mentioned_connectors>"
|
||||||
|
)
|
||||||
|
|
|
||||||
|
|
@ -124,6 +124,8 @@ async def stream_new_chat(
|
||||||
llm_config_id: int = -1,
|
llm_config_id: int = -1,
|
||||||
mentioned_document_ids: list[int] | None = None,
|
mentioned_document_ids: list[int] | None = None,
|
||||||
mentioned_folder_ids: list[int] | None = None,
|
mentioned_folder_ids: list[int] | None = None,
|
||||||
|
mentioned_connector_ids: list[int] | None = None,
|
||||||
|
mentioned_connectors: list[dict[str, Any]] | None = None,
|
||||||
mentioned_documents: list[dict[str, Any]] | None = None,
|
mentioned_documents: list[dict[str, Any]] | None = None,
|
||||||
checkpoint_id: str | None = None,
|
checkpoint_id: str | None = None,
|
||||||
needs_history_bootstrap: bool = False,
|
needs_history_bootstrap: bool = False,
|
||||||
|
|
@ -435,6 +437,7 @@ async def stream_new_chat(
|
||||||
user_image_data_urls=user_image_data_urls,
|
user_image_data_urls=user_image_data_urls,
|
||||||
mentioned_document_ids=mentioned_document_ids,
|
mentioned_document_ids=mentioned_document_ids,
|
||||||
mentioned_folder_ids=mentioned_folder_ids,
|
mentioned_folder_ids=mentioned_folder_ids,
|
||||||
|
mentioned_connectors=mentioned_connectors,
|
||||||
mentioned_documents=mentioned_documents,
|
mentioned_documents=mentioned_documents,
|
||||||
needs_history_bootstrap=needs_history_bootstrap,
|
needs_history_bootstrap=needs_history_bootstrap,
|
||||||
thread_visibility=visibility,
|
thread_visibility=visibility,
|
||||||
|
|
@ -588,6 +591,8 @@ async def stream_new_chat(
|
||||||
mentioned_document_ids=mentioned_document_ids,
|
mentioned_document_ids=mentioned_document_ids,
|
||||||
accepted_folder_ids=accepted_folder_ids,
|
accepted_folder_ids=accepted_folder_ids,
|
||||||
mentioned_folder_ids=mentioned_folder_ids,
|
mentioned_folder_ids=mentioned_folder_ids,
|
||||||
|
mentioned_connector_ids=mentioned_connector_ids,
|
||||||
|
mentioned_connectors=mentioned_connectors,
|
||||||
request_id=request_id,
|
request_id=request_id,
|
||||||
turn_id=stream_result.turn_id,
|
turn_id=stream_result.turn_id,
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -8,6 +8,8 @@ mention lists / request ids / turn ids without rebuilding the graph.
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
from app.agents.new_chat.context import SurfSenseContextSchema
|
from app.agents.new_chat.context import SurfSenseContextSchema
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -17,6 +19,8 @@ def build_new_chat_runtime_context(
|
||||||
mentioned_document_ids: list[int] | None,
|
mentioned_document_ids: list[int] | None,
|
||||||
accepted_folder_ids: list[int],
|
accepted_folder_ids: list[int],
|
||||||
mentioned_folder_ids: list[int] | None,
|
mentioned_folder_ids: list[int] | None,
|
||||||
|
mentioned_connector_ids: list[int] | None,
|
||||||
|
mentioned_connectors: list[dict[str, Any]] | None,
|
||||||
request_id: str | None,
|
request_id: str | None,
|
||||||
turn_id: str,
|
turn_id: str,
|
||||||
) -> SurfSenseContextSchema:
|
) -> SurfSenseContextSchema:
|
||||||
|
|
@ -31,6 +35,8 @@ def build_new_chat_runtime_context(
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
mentioned_document_ids=list(mentioned_document_ids or []),
|
mentioned_document_ids=list(mentioned_document_ids or []),
|
||||||
mentioned_folder_ids=list(accepted_folder_ids or mentioned_folder_ids or []),
|
mentioned_folder_ids=list(accepted_folder_ids or mentioned_folder_ids or []),
|
||||||
|
mentioned_connector_ids=list(mentioned_connector_ids or []),
|
||||||
|
mentioned_connectors=list(mentioned_connectors or []),
|
||||||
request_id=request_id,
|
request_id=request_id,
|
||||||
turn_id=turn_id,
|
turn_id=turn_id,
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -14,13 +14,11 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
|
|
||||||
from app.connectors.airtable_history import AirtableHistoryConnector
|
from app.connectors.airtable_history import AirtableHistoryConnector
|
||||||
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
|
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
|
||||||
from app.services.llm_service import get_user_long_context_llm
|
|
||||||
from app.services.task_logging_service import TaskLoggingService
|
from app.services.task_logging_service import TaskLoggingService
|
||||||
from app.utils.document_converters import (
|
from app.utils.document_converters import (
|
||||||
create_document_chunks,
|
create_document_chunks,
|
||||||
embed_text,
|
embed_text,
|
||||||
generate_content_hash,
|
generate_content_hash,
|
||||||
generate_document_summary,
|
|
||||||
generate_unique_identifier_hash,
|
generate_unique_identifier_hash,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -394,29 +392,10 @@ async def index_airtable_records(
|
||||||
document.status = DocumentStatus.processing()
|
document.status = DocumentStatus.processing()
|
||||||
await session.commit()
|
await session.commit()
|
||||||
|
|
||||||
# Heavy processing (LLM, embeddings, chunks)
|
# Heavy processing (embeddings, chunks)
|
||||||
user_llm = await get_user_long_context_llm(
|
|
||||||
session, user_id, search_space_id
|
|
||||||
)
|
|
||||||
|
|
||||||
if user_llm and connector.enable_summary:
|
summary_content = f"Airtable Record: {item['record_id']}\n\n{item['markdown_content']}"
|
||||||
document_metadata_for_summary = {
|
summary_embedding = embed_text(summary_content)
|
||||||
"record_id": item["record_id"],
|
|
||||||
"created_time": item["record"].get("CREATED_TIME()", ""),
|
|
||||||
"document_type": "Airtable Record",
|
|
||||||
"connector_type": "Airtable",
|
|
||||||
}
|
|
||||||
(
|
|
||||||
summary_content,
|
|
||||||
summary_embedding,
|
|
||||||
) = await generate_document_summary(
|
|
||||||
item["markdown_content"],
|
|
||||||
user_llm,
|
|
||||||
document_metadata_for_summary,
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
summary_content = f"Airtable Record: {item['record_id']}\n\n{item['markdown_content']}"
|
|
||||||
summary_embedding = embed_text(summary_content)
|
|
||||||
|
|
||||||
chunks = await create_document_chunks(item["markdown_content"])
|
chunks = await create_document_chunks(item["markdown_content"])
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -15,13 +15,11 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
|
|
||||||
from app.connectors.bookstack_connector import BookStackConnector
|
from app.connectors.bookstack_connector import BookStackConnector
|
||||||
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
|
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
|
||||||
from app.services.llm_service import get_user_long_context_llm
|
|
||||||
from app.services.task_logging_service import TaskLoggingService
|
from app.services.task_logging_service import TaskLoggingService
|
||||||
from app.utils.document_converters import (
|
from app.utils.document_converters import (
|
||||||
create_document_chunks,
|
create_document_chunks,
|
||||||
embed_text,
|
embed_text,
|
||||||
generate_content_hash,
|
generate_content_hash,
|
||||||
generate_document_summary,
|
|
||||||
generate_unique_identifier_hash,
|
generate_unique_identifier_hash,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -384,10 +382,7 @@ async def index_bookstack_pages(
|
||||||
document.status = DocumentStatus.processing()
|
document.status = DocumentStatus.processing()
|
||||||
await session.commit()
|
await session.commit()
|
||||||
|
|
||||||
# Heavy processing (LLM, embeddings, chunks)
|
# Heavy processing (embeddings, chunks)
|
||||||
user_llm = await get_user_long_context_llm(
|
|
||||||
session, user_id, search_space_id
|
|
||||||
)
|
|
||||||
|
|
||||||
# Build document metadata
|
# Build document metadata
|
||||||
doc_metadata = {
|
doc_metadata = {
|
||||||
|
|
@ -403,23 +398,8 @@ async def index_bookstack_pages(
|
||||||
"connector_id": connector_id,
|
"connector_id": connector_id,
|
||||||
}
|
}
|
||||||
|
|
||||||
if user_llm and connector.enable_summary:
|
summary_content = f"BookStack Page: {item['page_name']}\n\nBook ID: {item['book_id']}\n\n{item['full_content']}"
|
||||||
summary_metadata = {
|
summary_embedding = embed_text(summary_content)
|
||||||
"page_name": item["page_name"],
|
|
||||||
"page_id": item["page_id"],
|
|
||||||
"book_id": item["book_id"],
|
|
||||||
"document_type": "BookStack Page",
|
|
||||||
"connector_type": "BookStack",
|
|
||||||
}
|
|
||||||
(
|
|
||||||
summary_content,
|
|
||||||
summary_embedding,
|
|
||||||
) = await generate_document_summary(
|
|
||||||
item["full_content"], user_llm, summary_metadata
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
summary_content = f"BookStack Page: {item['page_name']}\n\nBook ID: {item['book_id']}\n\n{item['full_content']}"
|
|
||||||
summary_embedding = embed_text(summary_content)
|
|
||||||
|
|
||||||
# Process chunks - using the full page content
|
# Process chunks - using the full page content
|
||||||
chunks = await create_document_chunks(item["full_content"])
|
chunks = await create_document_chunks(item["full_content"])
|
||||||
|
|
|
||||||
|
|
@ -16,13 +16,11 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
|
|
||||||
from app.connectors.clickup_history import ClickUpHistoryConnector
|
from app.connectors.clickup_history import ClickUpHistoryConnector
|
||||||
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
|
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
|
||||||
from app.services.llm_service import get_user_long_context_llm
|
|
||||||
from app.services.task_logging_service import TaskLoggingService
|
from app.services.task_logging_service import TaskLoggingService
|
||||||
from app.utils.document_converters import (
|
from app.utils.document_converters import (
|
||||||
create_document_chunks,
|
create_document_chunks,
|
||||||
embed_text,
|
embed_text,
|
||||||
generate_content_hash,
|
generate_content_hash,
|
||||||
generate_document_summary,
|
|
||||||
generate_unique_identifier_hash,
|
generate_unique_identifier_hash,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -393,32 +391,10 @@ async def index_clickup_tasks(
|
||||||
document.status = DocumentStatus.processing()
|
document.status = DocumentStatus.processing()
|
||||||
await session.commit()
|
await session.commit()
|
||||||
|
|
||||||
# Heavy processing (LLM, embeddings, chunks)
|
# Heavy processing (embeddings, chunks)
|
||||||
user_llm = await get_user_long_context_llm(
|
|
||||||
session, user_id, search_space_id
|
|
||||||
)
|
|
||||||
|
|
||||||
if user_llm and connector.enable_summary:
|
summary_content = item["task_content"]
|
||||||
document_metadata_for_summary = {
|
summary_embedding = embed_text(item["task_content"])
|
||||||
"task_id": item["task_id"],
|
|
||||||
"task_name": item["task_name"],
|
|
||||||
"task_status": item["task_status"],
|
|
||||||
"task_priority": item["task_priority"],
|
|
||||||
"task_list": item["task_list_name"],
|
|
||||||
"task_space": item["task_space_name"],
|
|
||||||
"assignees": len(item["task_assignees"]),
|
|
||||||
"document_type": "ClickUp Task",
|
|
||||||
"connector_type": "ClickUp",
|
|
||||||
}
|
|
||||||
(
|
|
||||||
summary_content,
|
|
||||||
summary_embedding,
|
|
||||||
) = await generate_document_summary(
|
|
||||||
item["task_content"], user_llm, document_metadata_for_summary
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
summary_content = item["task_content"]
|
|
||||||
summary_embedding = embed_text(item["task_content"])
|
|
||||||
|
|
||||||
chunks = await create_document_chunks(item["task_content"])
|
chunks = await create_document_chunks(item["task_content"])
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -14,7 +14,6 @@ from app.indexing_pipeline.indexing_pipeline_service import (
|
||||||
IndexingPipelineService,
|
IndexingPipelineService,
|
||||||
PlaceholderInfo,
|
PlaceholderInfo,
|
||||||
)
|
)
|
||||||
from app.services.llm_service import get_user_long_context_llm
|
|
||||||
from app.services.task_logging_service import TaskLoggingService
|
from app.services.task_logging_service import TaskLoggingService
|
||||||
|
|
||||||
from .base import (
|
from .base import (
|
||||||
|
|
@ -36,7 +35,6 @@ def _build_connector_doc(
|
||||||
connector_id: int,
|
connector_id: int,
|
||||||
search_space_id: int,
|
search_space_id: int,
|
||||||
user_id: str,
|
user_id: str,
|
||||||
enable_summary: bool,
|
|
||||||
) -> ConnectorDocument:
|
) -> ConnectorDocument:
|
||||||
"""Map a raw Confluence page dict to a ConnectorDocument."""
|
"""Map a raw Confluence page dict to a ConnectorDocument."""
|
||||||
page_id = page.get("id", "")
|
page_id = page.get("id", "")
|
||||||
|
|
@ -54,10 +52,6 @@ def _build_connector_doc(
|
||||||
"connector_type": "Confluence",
|
"connector_type": "Confluence",
|
||||||
}
|
}
|
||||||
|
|
||||||
fallback_summary = (
|
|
||||||
f"Confluence Page: {page_title}\n\nSpace ID: {space_id}\n\n{full_content}"
|
|
||||||
)
|
|
||||||
|
|
||||||
return ConnectorDocument(
|
return ConnectorDocument(
|
||||||
title=page_title,
|
title=page_title,
|
||||||
source_markdown=full_content,
|
source_markdown=full_content,
|
||||||
|
|
@ -66,8 +60,6 @@ def _build_connector_doc(
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
connector_id=connector_id,
|
connector_id=connector_id,
|
||||||
created_by_id=user_id,
|
created_by_id=user_id,
|
||||||
should_summarize=enable_summary,
|
|
||||||
fallback_summary=fallback_summary,
|
|
||||||
metadata=metadata,
|
metadata=metadata,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -268,8 +260,7 @@ async def index_confluence_pages(
|
||||||
connector_id=connector_id,
|
connector_id=connector_id,
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
enable_summary=connector.enable_summary,
|
)
|
||||||
)
|
|
||||||
|
|
||||||
with session.no_autoflush:
|
with session.no_autoflush:
|
||||||
duplicate_by_content = await check_duplicate_document_by_hash(
|
duplicate_by_content = await check_duplicate_document_by_hash(
|
||||||
|
|
@ -297,12 +288,8 @@ async def index_confluence_pages(
|
||||||
|
|
||||||
await pipeline.migrate_legacy_docs(connector_docs)
|
await pipeline.migrate_legacy_docs(connector_docs)
|
||||||
|
|
||||||
async def _get_llm(s: AsyncSession):
|
|
||||||
return await get_user_long_context_llm(s, user_id, search_space_id)
|
|
||||||
|
|
||||||
_, documents_indexed, documents_failed = await pipeline.index_batch_parallel(
|
_, documents_indexed, documents_failed = await pipeline.index_batch_parallel(
|
||||||
connector_docs,
|
connector_docs,
|
||||||
_get_llm,
|
|
||||||
max_concurrency=3,
|
max_concurrency=3,
|
||||||
on_heartbeat=on_heartbeat_callback,
|
on_heartbeat=on_heartbeat_callback,
|
||||||
heartbeat_interval=HEARTBEAT_INTERVAL_SECONDS,
|
heartbeat_interval=HEARTBEAT_INTERVAL_SECONDS,
|
||||||
|
|
|
||||||
|
|
@ -27,7 +27,6 @@ from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnector
|
||||||
from app.indexing_pipeline.connector_document import ConnectorDocument
|
from app.indexing_pipeline.connector_document import ConnectorDocument
|
||||||
from app.indexing_pipeline.document_hashing import compute_identifier_hash
|
from app.indexing_pipeline.document_hashing import compute_identifier_hash
|
||||||
from app.indexing_pipeline.indexing_pipeline_service import IndexingPipelineService
|
from app.indexing_pipeline.indexing_pipeline_service import IndexingPipelineService
|
||||||
from app.services.llm_service import get_user_long_context_llm
|
|
||||||
from app.services.page_limit_service import PageLimitService
|
from app.services.page_limit_service import PageLimitService
|
||||||
from app.services.task_logging_service import TaskLoggingService
|
from app.services.task_logging_service import TaskLoggingService
|
||||||
from app.tasks.connector_indexers.base import (
|
from app.tasks.connector_indexers.base import (
|
||||||
|
|
@ -126,7 +125,6 @@ def _build_connector_doc(
|
||||||
connector_id: int,
|
connector_id: int,
|
||||||
search_space_id: int,
|
search_space_id: int,
|
||||||
user_id: str,
|
user_id: str,
|
||||||
enable_summary: bool,
|
|
||||||
) -> ConnectorDocument:
|
) -> ConnectorDocument:
|
||||||
file_id = file.get("id", "")
|
file_id = file.get("id", "")
|
||||||
file_name = file.get("name", "Unknown")
|
file_name = file.get("name", "Unknown")
|
||||||
|
|
@ -138,8 +136,6 @@ def _build_connector_doc(
|
||||||
"connector_type": "Dropbox",
|
"connector_type": "Dropbox",
|
||||||
}
|
}
|
||||||
|
|
||||||
fallback_summary = f"File: {file_name}\n\n{markdown[:4000]}"
|
|
||||||
|
|
||||||
return ConnectorDocument(
|
return ConnectorDocument(
|
||||||
title=file_name,
|
title=file_name,
|
||||||
source_markdown=markdown,
|
source_markdown=markdown,
|
||||||
|
|
@ -148,8 +144,6 @@ def _build_connector_doc(
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
connector_id=connector_id,
|
connector_id=connector_id,
|
||||||
created_by_id=user_id,
|
created_by_id=user_id,
|
||||||
should_summarize=enable_summary,
|
|
||||||
fallback_summary=fallback_summary,
|
|
||||||
metadata=metadata,
|
metadata=metadata,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -161,7 +155,6 @@ async def _download_files_parallel(
|
||||||
connector_id: int,
|
connector_id: int,
|
||||||
search_space_id: int,
|
search_space_id: int,
|
||||||
user_id: str,
|
user_id: str,
|
||||||
enable_summary: bool,
|
|
||||||
max_concurrency: int = 3,
|
max_concurrency: int = 3,
|
||||||
on_heartbeat: HeartbeatCallbackType | None = None,
|
on_heartbeat: HeartbeatCallbackType | None = None,
|
||||||
vision_llm=None,
|
vision_llm=None,
|
||||||
|
|
@ -191,7 +184,6 @@ async def _download_files_parallel(
|
||||||
connector_id=connector_id,
|
connector_id=connector_id,
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
enable_summary=enable_summary,
|
|
||||||
)
|
)
|
||||||
async with hb_lock:
|
async with hb_lock:
|
||||||
completed_count += 1
|
completed_count += 1
|
||||||
|
|
@ -223,7 +215,6 @@ async def _download_and_index(
|
||||||
connector_id: int,
|
connector_id: int,
|
||||||
search_space_id: int,
|
search_space_id: int,
|
||||||
user_id: str,
|
user_id: str,
|
||||||
enable_summary: bool,
|
|
||||||
on_heartbeat: HeartbeatCallbackType | None = None,
|
on_heartbeat: HeartbeatCallbackType | None = None,
|
||||||
vision_llm=None,
|
vision_llm=None,
|
||||||
) -> tuple[int, int]:
|
) -> tuple[int, int]:
|
||||||
|
|
@ -234,7 +225,6 @@ async def _download_and_index(
|
||||||
connector_id=connector_id,
|
connector_id=connector_id,
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
enable_summary=enable_summary,
|
|
||||||
on_heartbeat=on_heartbeat,
|
on_heartbeat=on_heartbeat,
|
||||||
vision_llm=vision_llm,
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
|
|
@ -243,13 +233,8 @@ async def _download_and_index(
|
||||||
batch_failed = 0
|
batch_failed = 0
|
||||||
if connector_docs:
|
if connector_docs:
|
||||||
pipeline = IndexingPipelineService(session)
|
pipeline = IndexingPipelineService(session)
|
||||||
|
|
||||||
async def _get_llm(s):
|
|
||||||
return await get_user_long_context_llm(s, user_id, search_space_id)
|
|
||||||
|
|
||||||
_, batch_indexed, batch_failed = await pipeline.index_batch_parallel(
|
_, batch_indexed, batch_failed = await pipeline.index_batch_parallel(
|
||||||
connector_docs,
|
connector_docs,
|
||||||
_get_llm,
|
|
||||||
max_concurrency=3,
|
max_concurrency=3,
|
||||||
on_heartbeat=on_heartbeat,
|
on_heartbeat=on_heartbeat,
|
||||||
)
|
)
|
||||||
|
|
@ -289,7 +274,6 @@ async def _index_with_delta_sync(
|
||||||
log_entry: object,
|
log_entry: object,
|
||||||
max_files: int,
|
max_files: int,
|
||||||
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
||||||
enable_summary: bool = True,
|
|
||||||
vision_llm=None,
|
vision_llm=None,
|
||||||
) -> tuple[int, int, int, str]:
|
) -> tuple[int, int, int, str]:
|
||||||
"""Delta sync using Dropbox cursor-based change tracking.
|
"""Delta sync using Dropbox cursor-based change tracking.
|
||||||
|
|
@ -361,7 +345,6 @@ async def _index_with_delta_sync(
|
||||||
connector_id=connector_id,
|
connector_id=connector_id,
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
enable_summary=enable_summary,
|
|
||||||
on_heartbeat=on_heartbeat_callback,
|
on_heartbeat=on_heartbeat_callback,
|
||||||
vision_llm=vision_llm,
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
|
|
@ -388,7 +371,6 @@ async def _index_full_scan(
|
||||||
include_subfolders: bool = True,
|
include_subfolders: bool = True,
|
||||||
incremental_sync: bool = True,
|
incremental_sync: bool = True,
|
||||||
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
||||||
enable_summary: bool = True,
|
|
||||||
vision_llm=None,
|
vision_llm=None,
|
||||||
) -> tuple[int, int, int]:
|
) -> tuple[int, int, int]:
|
||||||
"""Full scan indexing of a folder.
|
"""Full scan indexing of a folder.
|
||||||
|
|
@ -473,7 +455,6 @@ async def _index_full_scan(
|
||||||
connector_id=connector_id,
|
connector_id=connector_id,
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
enable_summary=enable_summary,
|
|
||||||
on_heartbeat=on_heartbeat_callback,
|
on_heartbeat=on_heartbeat_callback,
|
||||||
vision_llm=vision_llm,
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
|
|
@ -502,7 +483,6 @@ async def _index_selected_files(
|
||||||
connector_id: int,
|
connector_id: int,
|
||||||
search_space_id: int,
|
search_space_id: int,
|
||||||
user_id: str,
|
user_id: str,
|
||||||
enable_summary: bool,
|
|
||||||
incremental_sync: bool = True,
|
incremental_sync: bool = True,
|
||||||
on_heartbeat: HeartbeatCallbackType | None = None,
|
on_heartbeat: HeartbeatCallbackType | None = None,
|
||||||
vision_llm=None,
|
vision_llm=None,
|
||||||
|
|
@ -563,7 +543,6 @@ async def _index_selected_files(
|
||||||
connector_id=connector_id,
|
connector_id=connector_id,
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
enable_summary=enable_summary,
|
|
||||||
on_heartbeat=on_heartbeat,
|
on_heartbeat=on_heartbeat,
|
||||||
vision_llm=vision_llm,
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
|
|
@ -629,7 +608,6 @@ async def index_dropbox_files(
|
||||||
)
|
)
|
||||||
return 0, 0, error_msg, 0
|
return 0, 0, error_msg, 0
|
||||||
|
|
||||||
connector_enable_summary = getattr(connector, "enable_summary", True)
|
|
||||||
connector_enable_vision_llm = getattr(connector, "enable_vision_llm", False)
|
connector_enable_vision_llm = getattr(connector, "enable_vision_llm", False)
|
||||||
vision_llm = None
|
vision_llm = None
|
||||||
if connector_enable_vision_llm:
|
if connector_enable_vision_llm:
|
||||||
|
|
@ -664,7 +642,6 @@ async def index_dropbox_files(
|
||||||
connector_id=connector_id,
|
connector_id=connector_id,
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
enable_summary=connector_enable_summary,
|
|
||||||
incremental_sync=incremental_sync,
|
incremental_sync=incremental_sync,
|
||||||
vision_llm=vision_llm,
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
|
|
@ -700,7 +677,6 @@ async def index_dropbox_files(
|
||||||
task_logger,
|
task_logger,
|
||||||
log_entry,
|
log_entry,
|
||||||
max_files,
|
max_files,
|
||||||
enable_summary=connector_enable_summary,
|
|
||||||
vision_llm=vision_llm,
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
folder_cursors[folder_path] = new_cursor
|
folder_cursors[folder_path] = new_cursor
|
||||||
|
|
@ -720,7 +696,6 @@ async def index_dropbox_files(
|
||||||
max_files,
|
max_files,
|
||||||
include_subfolders,
|
include_subfolders,
|
||||||
incremental_sync=incremental_sync,
|
incremental_sync=incremental_sync,
|
||||||
enable_summary=connector_enable_summary,
|
|
||||||
vision_llm=vision_llm,
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
total_unsupported += unsup
|
total_unsupported += unsup
|
||||||
|
|
|
||||||
|
|
@ -18,13 +18,11 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
|
|
||||||
from app.connectors.github_connector import GitHubConnector
|
from app.connectors.github_connector import GitHubConnector
|
||||||
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
|
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
|
||||||
from app.services.llm_service import get_user_long_context_llm
|
|
||||||
from app.services.task_logging_service import TaskLoggingService
|
from app.services.task_logging_service import TaskLoggingService
|
||||||
from app.utils.document_converters import (
|
from app.utils.document_converters import (
|
||||||
create_document_chunks,
|
create_document_chunks,
|
||||||
embed_text,
|
embed_text,
|
||||||
generate_content_hash,
|
generate_content_hash,
|
||||||
generate_document_summary,
|
|
||||||
generate_unique_identifier_hash,
|
generate_unique_identifier_hash,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -351,42 +349,14 @@ async def index_github_repos(
|
||||||
document.status = DocumentStatus.processing()
|
document.status = DocumentStatus.processing()
|
||||||
await session.commit()
|
await session.commit()
|
||||||
|
|
||||||
# Heavy processing (LLM, embeddings, chunks)
|
# Heavy processing (embeddings, chunks)
|
||||||
user_llm = await get_user_long_context_llm(
|
|
||||||
session, user_id, search_space_id
|
summary_text = (
|
||||||
|
f"# GitHub Repository: {repo_full_name}\n\n"
|
||||||
|
f"## Summary\n{digest.summary}\n\n"
|
||||||
|
f"## File Structure\n{digest.tree}"
|
||||||
)
|
)
|
||||||
|
summary_embedding = embed_text(summary_text)
|
||||||
document_metadata_for_summary = {
|
|
||||||
"repository": repo_full_name,
|
|
||||||
"document_type": "GitHub Repository",
|
|
||||||
"connector_type": "GitHub",
|
|
||||||
"ingestion_method": "gitingest",
|
|
||||||
"file_tree": digest.tree[:2000]
|
|
||||||
if len(digest.tree) > 2000
|
|
||||||
else digest.tree,
|
|
||||||
"estimated_tokens": digest.estimated_tokens,
|
|
||||||
}
|
|
||||||
|
|
||||||
if user_llm and connector.enable_summary:
|
|
||||||
# Prepare content for summarization
|
|
||||||
summary_content = digest.full_digest
|
|
||||||
if len(summary_content) > MAX_DIGEST_CHARS:
|
|
||||||
summary_content = (
|
|
||||||
f"# Repository: {repo_full_name}\n\n"
|
|
||||||
f"## File Structure\n\n{digest.tree}\n\n"
|
|
||||||
f"## File Contents (truncated)\n\n{digest.content[: MAX_DIGEST_CHARS - len(digest.tree) - 200]}..."
|
|
||||||
)
|
|
||||||
|
|
||||||
summary_text, summary_embedding = await generate_document_summary(
|
|
||||||
summary_content, user_llm, document_metadata_for_summary
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
summary_text = (
|
|
||||||
f"# GitHub Repository: {repo_full_name}\n\n"
|
|
||||||
f"## Summary\n{digest.summary}\n\n"
|
|
||||||
f"## File Structure\n{digest.tree}"
|
|
||||||
)
|
|
||||||
summary_embedding = embed_text(summary_text)
|
|
||||||
|
|
||||||
# Chunk the full digest content for granular search
|
# Chunk the full digest content for granular search
|
||||||
try:
|
try:
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,7 @@
|
||||||
Google Calendar connector indexer.
|
Google Calendar connector indexer.
|
||||||
|
|
||||||
Uses the shared IndexingPipelineService for document deduplication,
|
Uses the shared IndexingPipelineService for document deduplication,
|
||||||
summarization, chunking, and embedding.
|
chunking, and embedding.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from collections.abc import Awaitable, Callable
|
from collections.abc import Awaitable, Callable
|
||||||
|
|
@ -21,7 +21,6 @@ from app.indexing_pipeline.indexing_pipeline_service import (
|
||||||
PlaceholderInfo,
|
PlaceholderInfo,
|
||||||
)
|
)
|
||||||
from app.services.composio_service import ComposioService
|
from app.services.composio_service import ComposioService
|
||||||
from app.services.llm_service import get_user_long_context_llm
|
|
||||||
from app.services.task_logging_service import TaskLoggingService
|
from app.services.task_logging_service import TaskLoggingService
|
||||||
from app.utils.google_credentials import COMPOSIO_GOOGLE_CONNECTOR_TYPES
|
from app.utils.google_credentials import COMPOSIO_GOOGLE_CONNECTOR_TYPES
|
||||||
|
|
||||||
|
|
@ -53,7 +52,6 @@ def _build_connector_doc(
|
||||||
connector_id: int,
|
connector_id: int,
|
||||||
search_space_id: int,
|
search_space_id: int,
|
||||||
user_id: str,
|
user_id: str,
|
||||||
enable_summary: bool,
|
|
||||||
) -> ConnectorDocument:
|
) -> ConnectorDocument:
|
||||||
"""Map a raw Google Calendar API event dict to a ConnectorDocument."""
|
"""Map a raw Google Calendar API event dict to a ConnectorDocument."""
|
||||||
event_id = event.get("id", "")
|
event_id = event.get("id", "")
|
||||||
|
|
@ -78,8 +76,6 @@ def _build_connector_doc(
|
||||||
"connector_type": "Google Calendar",
|
"connector_type": "Google Calendar",
|
||||||
}
|
}
|
||||||
|
|
||||||
fallback_summary = f"Google Calendar Event: {event_summary}\n\n{event_markdown}"
|
|
||||||
|
|
||||||
return ConnectorDocument(
|
return ConnectorDocument(
|
||||||
title=event_summary,
|
title=event_summary,
|
||||||
source_markdown=event_markdown,
|
source_markdown=event_markdown,
|
||||||
|
|
@ -88,8 +84,6 @@ def _build_connector_doc(
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
connector_id=connector_id,
|
connector_id=connector_id,
|
||||||
created_by_id=user_id,
|
created_by_id=user_id,
|
||||||
should_summarize=enable_summary,
|
|
||||||
fallback_summary=fallback_summary,
|
|
||||||
metadata=metadata,
|
metadata=metadata,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -420,8 +414,7 @@ async def index_google_calendar_events(
|
||||||
connector_id=connector_id,
|
connector_id=connector_id,
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
enable_summary=connector.enable_summary,
|
)
|
||||||
)
|
|
||||||
|
|
||||||
with session.no_autoflush:
|
with session.no_autoflush:
|
||||||
duplicate = await check_duplicate_document_by_hash(
|
duplicate = await check_duplicate_document_by_hash(
|
||||||
|
|
@ -448,13 +441,8 @@ async def index_google_calendar_events(
|
||||||
|
|
||||||
# ── Pipeline: migrate legacy docs + parallel index ─────────────
|
# ── Pipeline: migrate legacy docs + parallel index ─────────────
|
||||||
await pipeline.migrate_legacy_docs(connector_docs)
|
await pipeline.migrate_legacy_docs(connector_docs)
|
||||||
|
|
||||||
async def _get_llm(s):
|
|
||||||
return await get_user_long_context_llm(s, user_id, search_space_id)
|
|
||||||
|
|
||||||
_, documents_indexed, documents_failed = await pipeline.index_batch_parallel(
|
_, documents_indexed, documents_failed = await pipeline.index_batch_parallel(
|
||||||
connector_docs,
|
connector_docs,
|
||||||
_get_llm,
|
|
||||||
max_concurrency=3,
|
max_concurrency=3,
|
||||||
on_heartbeat=on_heartbeat_callback,
|
on_heartbeat=on_heartbeat_callback,
|
||||||
heartbeat_interval=HEARTBEAT_INTERVAL_SECONDS,
|
heartbeat_interval=HEARTBEAT_INTERVAL_SECONDS,
|
||||||
|
|
|
||||||
|
|
@ -40,7 +40,6 @@ from app.indexing_pipeline.indexing_pipeline_service import (
|
||||||
PlaceholderInfo,
|
PlaceholderInfo,
|
||||||
)
|
)
|
||||||
from app.services.composio_service import ComposioService
|
from app.services.composio_service import ComposioService
|
||||||
from app.services.llm_service import get_user_long_context_llm
|
|
||||||
from app.services.page_limit_service import PageLimitService
|
from app.services.page_limit_service import PageLimitService
|
||||||
from app.services.task_logging_service import TaskLoggingService
|
from app.services.task_logging_service import TaskLoggingService
|
||||||
from app.tasks.connector_indexers.base import (
|
from app.tasks.connector_indexers.base import (
|
||||||
|
|
@ -381,7 +380,6 @@ def _build_connector_doc(
|
||||||
connector_id: int,
|
connector_id: int,
|
||||||
search_space_id: int,
|
search_space_id: int,
|
||||||
user_id: str,
|
user_id: str,
|
||||||
enable_summary: bool,
|
|
||||||
) -> ConnectorDocument:
|
) -> ConnectorDocument:
|
||||||
"""Build a ConnectorDocument from Drive file metadata + extracted markdown."""
|
"""Build a ConnectorDocument from Drive file metadata + extracted markdown."""
|
||||||
file_id = file.get("id", "")
|
file_id = file.get("id", "")
|
||||||
|
|
@ -394,8 +392,6 @@ def _build_connector_doc(
|
||||||
"connector_type": "Google Drive",
|
"connector_type": "Google Drive",
|
||||||
}
|
}
|
||||||
|
|
||||||
fallback_summary = f"File: {file_name}\n\n{markdown[:4000]}"
|
|
||||||
|
|
||||||
return ConnectorDocument(
|
return ConnectorDocument(
|
||||||
title=file_name,
|
title=file_name,
|
||||||
source_markdown=markdown,
|
source_markdown=markdown,
|
||||||
|
|
@ -404,8 +400,6 @@ def _build_connector_doc(
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
connector_id=connector_id,
|
connector_id=connector_id,
|
||||||
created_by_id=user_id,
|
created_by_id=user_id,
|
||||||
should_summarize=enable_summary,
|
|
||||||
fallback_summary=fallback_summary,
|
|
||||||
metadata=metadata,
|
metadata=metadata,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -461,7 +455,6 @@ async def _download_files_parallel(
|
||||||
connector_id: int,
|
connector_id: int,
|
||||||
search_space_id: int,
|
search_space_id: int,
|
||||||
user_id: str,
|
user_id: str,
|
||||||
enable_summary: bool,
|
|
||||||
max_concurrency: int = 3,
|
max_concurrency: int = 3,
|
||||||
on_heartbeat: HeartbeatCallbackType | None = None,
|
on_heartbeat: HeartbeatCallbackType | None = None,
|
||||||
vision_llm=None,
|
vision_llm=None,
|
||||||
|
|
@ -494,7 +487,6 @@ async def _download_files_parallel(
|
||||||
connector_id=connector_id,
|
connector_id=connector_id,
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
enable_summary=enable_summary,
|
|
||||||
)
|
)
|
||||||
async with hb_lock:
|
async with hb_lock:
|
||||||
completed_count += 1
|
completed_count += 1
|
||||||
|
|
@ -525,7 +517,6 @@ async def _process_single_file(
|
||||||
connector_id: int,
|
connector_id: int,
|
||||||
search_space_id: int,
|
search_space_id: int,
|
||||||
user_id: str,
|
user_id: str,
|
||||||
enable_summary: bool = True,
|
|
||||||
vision_llm=None,
|
vision_llm=None,
|
||||||
) -> tuple[int, int, int]:
|
) -> tuple[int, int, int]:
|
||||||
"""Download, extract, and index a single Drive file via the pipeline.
|
"""Download, extract, and index a single Drive file via the pipeline.
|
||||||
|
|
@ -561,8 +552,7 @@ async def _process_single_file(
|
||||||
connector_id=connector_id,
|
connector_id=connector_id,
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
enable_summary=enable_summary,
|
)
|
||||||
)
|
|
||||||
|
|
||||||
pipeline = IndexingPipelineService(session)
|
pipeline = IndexingPipelineService(session)
|
||||||
documents = await pipeline.prepare_for_indexing([doc])
|
documents = await pipeline.prepare_for_indexing([doc])
|
||||||
|
|
@ -578,10 +568,7 @@ async def _process_single_file(
|
||||||
connector_doc = doc_map.get(document.unique_identifier_hash)
|
connector_doc = doc_map.get(document.unique_identifier_hash)
|
||||||
if not connector_doc:
|
if not connector_doc:
|
||||||
continue
|
continue
|
||||||
user_llm = await get_user_long_context_llm(
|
await pipeline.index(document, connector_doc)
|
||||||
session, user_id, search_space_id
|
|
||||||
)
|
|
||||||
await pipeline.index(document, connector_doc, user_llm)
|
|
||||||
|
|
||||||
await page_limit_service.update_page_usage(
|
await page_limit_service.update_page_usage(
|
||||||
user_id, estimated_pages, allow_exceed=True
|
user_id, estimated_pages, allow_exceed=True
|
||||||
|
|
@ -636,7 +623,6 @@ async def _download_and_index(
|
||||||
connector_id: int,
|
connector_id: int,
|
||||||
search_space_id: int,
|
search_space_id: int,
|
||||||
user_id: str,
|
user_id: str,
|
||||||
enable_summary: bool,
|
|
||||||
on_heartbeat: HeartbeatCallbackType | None = None,
|
on_heartbeat: HeartbeatCallbackType | None = None,
|
||||||
vision_llm=None,
|
vision_llm=None,
|
||||||
) -> tuple[int, int]:
|
) -> tuple[int, int]:
|
||||||
|
|
@ -650,7 +636,6 @@ async def _download_and_index(
|
||||||
connector_id=connector_id,
|
connector_id=connector_id,
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
enable_summary=enable_summary,
|
|
||||||
on_heartbeat=on_heartbeat,
|
on_heartbeat=on_heartbeat,
|
||||||
vision_llm=vision_llm,
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
|
|
@ -659,13 +644,8 @@ async def _download_and_index(
|
||||||
batch_failed = 0
|
batch_failed = 0
|
||||||
if connector_docs:
|
if connector_docs:
|
||||||
pipeline = IndexingPipelineService(session)
|
pipeline = IndexingPipelineService(session)
|
||||||
|
|
||||||
async def _get_llm(s):
|
|
||||||
return await get_user_long_context_llm(s, user_id, search_space_id)
|
|
||||||
|
|
||||||
_, batch_indexed, batch_failed = await pipeline.index_batch_parallel(
|
_, batch_indexed, batch_failed = await pipeline.index_batch_parallel(
|
||||||
connector_docs,
|
connector_docs,
|
||||||
_get_llm,
|
|
||||||
max_concurrency=3,
|
max_concurrency=3,
|
||||||
on_heartbeat=on_heartbeat,
|
on_heartbeat=on_heartbeat,
|
||||||
)
|
)
|
||||||
|
|
@ -681,7 +661,6 @@ async def _index_selected_files(
|
||||||
connector_id: int,
|
connector_id: int,
|
||||||
search_space_id: int,
|
search_space_id: int,
|
||||||
user_id: str,
|
user_id: str,
|
||||||
enable_summary: bool,
|
|
||||||
on_heartbeat: HeartbeatCallbackType | None = None,
|
on_heartbeat: HeartbeatCallbackType | None = None,
|
||||||
vision_llm=None,
|
vision_llm=None,
|
||||||
) -> tuple[int, int, int, list[str]]:
|
) -> tuple[int, int, int, list[str]]:
|
||||||
|
|
@ -746,7 +725,6 @@ async def _index_selected_files(
|
||||||
connector_id=connector_id,
|
connector_id=connector_id,
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
enable_summary=enable_summary,
|
|
||||||
on_heartbeat=on_heartbeat,
|
on_heartbeat=on_heartbeat,
|
||||||
vision_llm=vision_llm,
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
|
|
@ -781,7 +759,6 @@ async def _index_full_scan(
|
||||||
max_files: int,
|
max_files: int,
|
||||||
include_subfolders: bool = False,
|
include_subfolders: bool = False,
|
||||||
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
||||||
enable_summary: bool = True,
|
|
||||||
vision_llm=None,
|
vision_llm=None,
|
||||||
) -> tuple[int, int, int]:
|
) -> tuple[int, int, int]:
|
||||||
"""Full scan indexing of a folder.
|
"""Full scan indexing of a folder.
|
||||||
|
|
@ -911,7 +888,6 @@ async def _index_full_scan(
|
||||||
connector_id=connector_id,
|
connector_id=connector_id,
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
enable_summary=enable_summary,
|
|
||||||
on_heartbeat=on_heartbeat_callback,
|
on_heartbeat=on_heartbeat_callback,
|
||||||
vision_llm=vision_llm,
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
|
|
@ -946,7 +922,6 @@ async def _index_with_delta_sync(
|
||||||
max_files: int,
|
max_files: int,
|
||||||
include_subfolders: bool = False,
|
include_subfolders: bool = False,
|
||||||
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
||||||
enable_summary: bool = True,
|
|
||||||
vision_llm=None,
|
vision_llm=None,
|
||||||
) -> tuple[int, int, int]:
|
) -> tuple[int, int, int]:
|
||||||
"""Delta sync using change tracking.
|
"""Delta sync using change tracking.
|
||||||
|
|
@ -1054,7 +1029,6 @@ async def _index_with_delta_sync(
|
||||||
connector_id=connector_id,
|
connector_id=connector_id,
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
enable_summary=enable_summary,
|
|
||||||
on_heartbeat=on_heartbeat_callback,
|
on_heartbeat=on_heartbeat_callback,
|
||||||
vision_llm=vision_llm,
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
|
|
@ -1142,7 +1116,6 @@ async def index_google_drive_files(
|
||||||
)
|
)
|
||||||
return 0, 0, client_error, 0
|
return 0, 0, client_error, 0
|
||||||
|
|
||||||
connector_enable_summary = getattr(connector, "enable_summary", True)
|
|
||||||
connector_enable_vision_llm = getattr(connector, "enable_vision_llm", False)
|
connector_enable_vision_llm = getattr(connector, "enable_vision_llm", False)
|
||||||
vision_llm = None
|
vision_llm = None
|
||||||
if connector_enable_vision_llm:
|
if connector_enable_vision_llm:
|
||||||
|
|
@ -1189,7 +1162,6 @@ async def index_google_drive_files(
|
||||||
max_files,
|
max_files,
|
||||||
include_subfolders,
|
include_subfolders,
|
||||||
on_heartbeat_callback,
|
on_heartbeat_callback,
|
||||||
connector_enable_summary,
|
|
||||||
vision_llm=vision_llm,
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
documents_unsupported += du
|
documents_unsupported += du
|
||||||
|
|
@ -1208,7 +1180,6 @@ async def index_google_drive_files(
|
||||||
max_files,
|
max_files,
|
||||||
include_subfolders,
|
include_subfolders,
|
||||||
on_heartbeat_callback,
|
on_heartbeat_callback,
|
||||||
connector_enable_summary,
|
|
||||||
vision_llm=vision_llm,
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
documents_indexed += ri
|
documents_indexed += ri
|
||||||
|
|
@ -1234,7 +1205,6 @@ async def index_google_drive_files(
|
||||||
max_files,
|
max_files,
|
||||||
include_subfolders,
|
include_subfolders,
|
||||||
on_heartbeat_callback,
|
on_heartbeat_callback,
|
||||||
connector_enable_summary,
|
|
||||||
vision_llm=vision_llm,
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -1346,7 +1316,6 @@ async def index_google_drive_single_file(
|
||||||
)
|
)
|
||||||
return 0, client_error
|
return 0, client_error
|
||||||
|
|
||||||
connector_enable_summary = getattr(connector, "enable_summary", True)
|
|
||||||
connector_enable_vision_llm = getattr(connector, "enable_vision_llm", False)
|
connector_enable_vision_llm = getattr(connector, "enable_vision_llm", False)
|
||||||
vision_llm = None
|
vision_llm = None
|
||||||
if connector_enable_vision_llm:
|
if connector_enable_vision_llm:
|
||||||
|
|
@ -1370,7 +1339,6 @@ async def index_google_drive_single_file(
|
||||||
connector_id,
|
connector_id,
|
||||||
search_space_id,
|
search_space_id,
|
||||||
user_id,
|
user_id,
|
||||||
connector_enable_summary,
|
|
||||||
vision_llm=vision_llm,
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
await session.commit()
|
await session.commit()
|
||||||
|
|
@ -1467,7 +1435,6 @@ async def index_google_drive_selected_files(
|
||||||
)
|
)
|
||||||
return 0, 0, [error_msg]
|
return 0, 0, [error_msg]
|
||||||
|
|
||||||
connector_enable_summary = getattr(connector, "enable_summary", True)
|
|
||||||
connector_enable_vision_llm = getattr(connector, "enable_vision_llm", False)
|
connector_enable_vision_llm = getattr(connector, "enable_vision_llm", False)
|
||||||
vision_llm = None
|
vision_llm = None
|
||||||
if connector_enable_vision_llm:
|
if connector_enable_vision_llm:
|
||||||
|
|
@ -1481,7 +1448,6 @@ async def index_google_drive_selected_files(
|
||||||
connector_id=connector_id,
|
connector_id=connector_id,
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
enable_summary=connector_enable_summary,
|
|
||||||
on_heartbeat=on_heartbeat_callback,
|
on_heartbeat=on_heartbeat_callback,
|
||||||
vision_llm=vision_llm,
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,7 @@
|
||||||
Google Gmail connector indexer.
|
Google Gmail connector indexer.
|
||||||
|
|
||||||
Uses the shared IndexingPipelineService for document deduplication,
|
Uses the shared IndexingPipelineService for document deduplication,
|
||||||
summarization, chunking, and embedding.
|
chunking, and embedding.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from collections.abc import Awaitable, Callable
|
from collections.abc import Awaitable, Callable
|
||||||
|
|
@ -21,7 +21,6 @@ from app.indexing_pipeline.indexing_pipeline_service import (
|
||||||
PlaceholderInfo,
|
PlaceholderInfo,
|
||||||
)
|
)
|
||||||
from app.services.composio_service import ComposioService
|
from app.services.composio_service import ComposioService
|
||||||
from app.services.llm_service import get_user_long_context_llm
|
|
||||||
from app.services.task_logging_service import TaskLoggingService
|
from app.services.task_logging_service import TaskLoggingService
|
||||||
from app.utils.google_credentials import COMPOSIO_GOOGLE_CONNECTOR_TYPES
|
from app.utils.google_credentials import COMPOSIO_GOOGLE_CONNECTOR_TYPES
|
||||||
|
|
||||||
|
|
@ -105,7 +104,6 @@ def _build_connector_doc(
|
||||||
connector_id: int,
|
connector_id: int,
|
||||||
search_space_id: int,
|
search_space_id: int,
|
||||||
user_id: str,
|
user_id: str,
|
||||||
enable_summary: bool,
|
|
||||||
) -> ConnectorDocument:
|
) -> ConnectorDocument:
|
||||||
"""Map a raw Gmail API message dict to a ConnectorDocument."""
|
"""Map a raw Gmail API message dict to a ConnectorDocument."""
|
||||||
message_id = message.get("id", "")
|
message_id = message.get("id", "")
|
||||||
|
|
@ -138,12 +136,6 @@ def _build_connector_doc(
|
||||||
"connector_type": "Google Gmail",
|
"connector_type": "Google Gmail",
|
||||||
}
|
}
|
||||||
|
|
||||||
fallback_summary = (
|
|
||||||
f"Google Gmail Message: {subject}\n\n"
|
|
||||||
f"From: {sender}\nDate: {date_str}\n\n"
|
|
||||||
f"{markdown_content}"
|
|
||||||
)
|
|
||||||
|
|
||||||
return ConnectorDocument(
|
return ConnectorDocument(
|
||||||
title=subject,
|
title=subject,
|
||||||
source_markdown=markdown_content,
|
source_markdown=markdown_content,
|
||||||
|
|
@ -152,8 +144,6 @@ def _build_connector_doc(
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
connector_id=connector_id,
|
connector_id=connector_id,
|
||||||
created_by_id=user_id,
|
created_by_id=user_id,
|
||||||
should_summarize=enable_summary,
|
|
||||||
fallback_summary=fallback_summary,
|
|
||||||
metadata=metadata,
|
metadata=metadata,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -454,8 +444,7 @@ async def index_google_gmail_messages(
|
||||||
connector_id=connector_id,
|
connector_id=connector_id,
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
enable_summary=connector.enable_summary,
|
)
|
||||||
)
|
|
||||||
|
|
||||||
with session.no_autoflush:
|
with session.no_autoflush:
|
||||||
duplicate = await check_duplicate_document_by_hash(
|
duplicate = await check_duplicate_document_by_hash(
|
||||||
|
|
@ -483,13 +472,8 @@ async def index_google_gmail_messages(
|
||||||
|
|
||||||
# ── Pipeline: migrate legacy docs + parallel index ─────────────
|
# ── Pipeline: migrate legacy docs + parallel index ─────────────
|
||||||
await pipeline.migrate_legacy_docs(connector_docs)
|
await pipeline.migrate_legacy_docs(connector_docs)
|
||||||
|
|
||||||
async def _get_llm(s):
|
|
||||||
return await get_user_long_context_llm(s, user_id, search_space_id)
|
|
||||||
|
|
||||||
_, documents_indexed, documents_failed = await pipeline.index_batch_parallel(
|
_, documents_indexed, documents_failed = await pipeline.index_batch_parallel(
|
||||||
connector_docs,
|
connector_docs,
|
||||||
_get_llm,
|
|
||||||
max_concurrency=3,
|
max_concurrency=3,
|
||||||
on_heartbeat=on_heartbeat_callback,
|
on_heartbeat=on_heartbeat_callback,
|
||||||
heartbeat_interval=HEARTBEAT_INTERVAL_SECONDS,
|
heartbeat_interval=HEARTBEAT_INTERVAL_SECONDS,
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,7 @@
|
||||||
Linear connector indexer.
|
Linear connector indexer.
|
||||||
|
|
||||||
Uses the shared IndexingPipelineService for document deduplication,
|
Uses the shared IndexingPipelineService for document deduplication,
|
||||||
summarization, chunking, and embedding with bounded parallel indexing.
|
chunking, and embedding with bounded parallel indexing.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from collections.abc import Awaitable, Callable
|
from collections.abc import Awaitable, Callable
|
||||||
|
|
@ -18,7 +18,6 @@ from app.indexing_pipeline.indexing_pipeline_service import (
|
||||||
IndexingPipelineService,
|
IndexingPipelineService,
|
||||||
PlaceholderInfo,
|
PlaceholderInfo,
|
||||||
)
|
)
|
||||||
from app.services.llm_service import get_user_long_context_llm
|
|
||||||
from app.services.task_logging_service import TaskLoggingService
|
from app.services.task_logging_service import TaskLoggingService
|
||||||
|
|
||||||
from .base import (
|
from .base import (
|
||||||
|
|
@ -41,7 +40,6 @@ def _build_connector_doc(
|
||||||
connector_id: int,
|
connector_id: int,
|
||||||
search_space_id: int,
|
search_space_id: int,
|
||||||
user_id: str,
|
user_id: str,
|
||||||
enable_summary: bool,
|
|
||||||
) -> ConnectorDocument:
|
) -> ConnectorDocument:
|
||||||
"""Map a raw Linear issue dict to a ConnectorDocument."""
|
"""Map a raw Linear issue dict to a ConnectorDocument."""
|
||||||
issue_id = issue.get("id", "")
|
issue_id = issue.get("id", "")
|
||||||
|
|
@ -63,11 +61,6 @@ def _build_connector_doc(
|
||||||
"connector_type": "Linear",
|
"connector_type": "Linear",
|
||||||
}
|
}
|
||||||
|
|
||||||
fallback_summary = (
|
|
||||||
f"Linear Issue {issue_identifier}: {issue_title}\n\n"
|
|
||||||
f"Status: {state}\n\n{issue_content}"
|
|
||||||
)
|
|
||||||
|
|
||||||
return ConnectorDocument(
|
return ConnectorDocument(
|
||||||
title=f"{issue_identifier}: {issue_title}",
|
title=f"{issue_identifier}: {issue_title}",
|
||||||
source_markdown=issue_content,
|
source_markdown=issue_content,
|
||||||
|
|
@ -76,8 +69,6 @@ def _build_connector_doc(
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
connector_id=connector_id,
|
connector_id=connector_id,
|
||||||
created_by_id=user_id,
|
created_by_id=user_id,
|
||||||
should_summarize=enable_summary,
|
|
||||||
fallback_summary=fallback_summary,
|
|
||||||
metadata=metadata,
|
metadata=metadata,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -277,8 +268,7 @@ async def index_linear_issues(
|
||||||
connector_id=connector_id,
|
connector_id=connector_id,
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
enable_summary=connector.enable_summary,
|
)
|
||||||
)
|
|
||||||
|
|
||||||
with session.no_autoflush:
|
with session.no_autoflush:
|
||||||
duplicate = await check_duplicate_document_by_hash(
|
duplicate = await check_duplicate_document_by_hash(
|
||||||
|
|
@ -306,13 +296,8 @@ async def index_linear_issues(
|
||||||
|
|
||||||
# ── Pipeline: migrate legacy docs + parallel index ────────────
|
# ── Pipeline: migrate legacy docs + parallel index ────────────
|
||||||
await pipeline.migrate_legacy_docs(connector_docs)
|
await pipeline.migrate_legacy_docs(connector_docs)
|
||||||
|
|
||||||
async def _get_llm(s):
|
|
||||||
return await get_user_long_context_llm(s, user_id, search_space_id)
|
|
||||||
|
|
||||||
_, documents_indexed, documents_failed = await pipeline.index_batch_parallel(
|
_, documents_indexed, documents_failed = await pipeline.index_batch_parallel(
|
||||||
connector_docs,
|
connector_docs,
|
||||||
_get_llm,
|
|
||||||
max_concurrency=3,
|
max_concurrency=3,
|
||||||
on_heartbeat=on_heartbeat_callback,
|
on_heartbeat=on_heartbeat_callback,
|
||||||
heartbeat_interval=HEARTBEAT_INTERVAL_SECONDS,
|
heartbeat_interval=HEARTBEAT_INTERVAL_SECONDS,
|
||||||
|
|
|
||||||
|
|
@ -33,7 +33,6 @@ from app.db import (
|
||||||
from app.indexing_pipeline.connector_document import ConnectorDocument
|
from app.indexing_pipeline.connector_document import ConnectorDocument
|
||||||
from app.indexing_pipeline.document_hashing import compute_identifier_hash
|
from app.indexing_pipeline.document_hashing import compute_identifier_hash
|
||||||
from app.indexing_pipeline.indexing_pipeline_service import IndexingPipelineService
|
from app.indexing_pipeline.indexing_pipeline_service import IndexingPipelineService
|
||||||
from app.services.llm_service import get_user_long_context_llm
|
|
||||||
from app.services.page_limit_service import PageLimitExceededError, PageLimitService
|
from app.services.page_limit_service import PageLimitExceededError, PageLimitService
|
||||||
from app.services.task_logging_service import TaskLoggingService
|
from app.services.task_logging_service import TaskLoggingService
|
||||||
from app.tasks.celery_tasks import get_celery_session_maker
|
from app.tasks.celery_tasks import get_celery_session_maker
|
||||||
|
|
@ -478,7 +477,6 @@ def _build_connector_doc(
|
||||||
*,
|
*,
|
||||||
search_space_id: int,
|
search_space_id: int,
|
||||||
user_id: str,
|
user_id: str,
|
||||||
enable_summary: bool,
|
|
||||||
) -> ConnectorDocument:
|
) -> ConnectorDocument:
|
||||||
"""Build a ConnectorDocument from a local file's extracted content."""
|
"""Build a ConnectorDocument from a local file's extracted content."""
|
||||||
unique_id = f"{folder_name}:{relative_path}"
|
unique_id = f"{folder_name}:{relative_path}"
|
||||||
|
|
@ -488,7 +486,6 @@ def _build_connector_doc(
|
||||||
"document_type": "Local Folder File",
|
"document_type": "Local Folder File",
|
||||||
"connector_type": "Local Folder",
|
"connector_type": "Local Folder",
|
||||||
}
|
}
|
||||||
fallback_summary = f"File: {title}\n\n{content[:4000]}"
|
|
||||||
|
|
||||||
return ConnectorDocument(
|
return ConnectorDocument(
|
||||||
title=title,
|
title=title,
|
||||||
|
|
@ -498,8 +495,6 @@ def _build_connector_doc(
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
connector_id=None,
|
connector_id=None,
|
||||||
created_by_id=user_id,
|
created_by_id=user_id,
|
||||||
should_summarize=enable_summary,
|
|
||||||
fallback_summary=fallback_summary,
|
|
||||||
metadata=metadata,
|
metadata=metadata,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -513,7 +508,6 @@ async def index_local_folder(
|
||||||
exclude_patterns: list[str] | None = None,
|
exclude_patterns: list[str] | None = None,
|
||||||
file_extensions: list[str] | None = None,
|
file_extensions: list[str] | None = None,
|
||||||
root_folder_id: int | None = None,
|
root_folder_id: int | None = None,
|
||||||
enable_summary: bool = False,
|
|
||||||
target_file_paths: list[str] | None = None,
|
target_file_paths: list[str] | None = None,
|
||||||
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
||||||
) -> tuple[int, int, int | None, str | None]:
|
) -> tuple[int, int, int | None, str | None]:
|
||||||
|
|
@ -574,8 +568,7 @@ async def index_local_folder(
|
||||||
folder_path=folder_path,
|
folder_path=folder_path,
|
||||||
folder_name=folder_name,
|
folder_name=folder_name,
|
||||||
target_file_path=target_file_paths[0],
|
target_file_path=target_file_paths[0],
|
||||||
enable_summary=enable_summary,
|
root_folder_id=root_folder_id,
|
||||||
root_folder_id=root_folder_id,
|
|
||||||
task_logger=task_logger,
|
task_logger=task_logger,
|
||||||
log_entry=log_entry,
|
log_entry=log_entry,
|
||||||
)
|
)
|
||||||
|
|
@ -587,8 +580,7 @@ async def index_local_folder(
|
||||||
folder_path=folder_path,
|
folder_path=folder_path,
|
||||||
folder_name=folder_name,
|
folder_name=folder_name,
|
||||||
target_file_paths=target_file_paths,
|
target_file_paths=target_file_paths,
|
||||||
enable_summary=enable_summary,
|
root_folder_id=root_folder_id,
|
||||||
root_folder_id=root_folder_id,
|
|
||||||
on_progress_callback=on_heartbeat_callback,
|
on_progress_callback=on_heartbeat_callback,
|
||||||
)
|
)
|
||||||
if err:
|
if err:
|
||||||
|
|
@ -774,8 +766,7 @@ async def index_local_folder(
|
||||||
folder_name=folder_name,
|
folder_name=folder_name,
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
enable_summary=enable_summary,
|
)
|
||||||
)
|
|
||||||
connector_docs.append(doc)
|
connector_docs.append(doc)
|
||||||
file_meta_map[unique_identifier] = {
|
file_meta_map[unique_identifier] = {
|
||||||
"relative_path": relative_path,
|
"relative_path": relative_path,
|
||||||
|
|
@ -845,15 +836,13 @@ async def index_local_folder(
|
||||||
doc_map = {compute_unique_identifier_hash(cd): cd for cd in connector_docs}
|
doc_map = {compute_unique_identifier_hash(cd): cd for cd in connector_docs}
|
||||||
documents = await pipeline.prepare_for_indexing(connector_docs)
|
documents = await pipeline.prepare_for_indexing(connector_docs)
|
||||||
|
|
||||||
llm = await get_user_long_context_llm(session, user_id, search_space_id)
|
|
||||||
|
|
||||||
for document in documents:
|
for document in documents:
|
||||||
connector_doc = doc_map.get(document.unique_identifier_hash)
|
connector_doc = doc_map.get(document.unique_identifier_hash)
|
||||||
if connector_doc is None:
|
if connector_doc is None:
|
||||||
failed_count += 1
|
failed_count += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
result = await pipeline.index(document, connector_doc, llm)
|
result = await pipeline.index(document, connector_doc)
|
||||||
|
|
||||||
if DocumentStatus.is_state(result.status, DocumentStatus.READY):
|
if DocumentStatus.is_state(result.status, DocumentStatus.READY):
|
||||||
indexed_count += 1
|
indexed_count += 1
|
||||||
|
|
@ -960,7 +949,6 @@ async def _index_batch_files(
|
||||||
folder_path: str,
|
folder_path: str,
|
||||||
folder_name: str,
|
folder_name: str,
|
||||||
target_file_paths: list[str],
|
target_file_paths: list[str],
|
||||||
enable_summary: bool,
|
|
||||||
root_folder_id: int | None,
|
root_folder_id: int | None,
|
||||||
on_progress_callback: HeartbeatCallbackType | None = None,
|
on_progress_callback: HeartbeatCallbackType | None = None,
|
||||||
) -> tuple[int, int, str | None]:
|
) -> tuple[int, int, str | None]:
|
||||||
|
|
@ -995,8 +983,7 @@ async def _index_batch_files(
|
||||||
folder_path=folder_path,
|
folder_path=folder_path,
|
||||||
folder_name=folder_name,
|
folder_name=folder_name,
|
||||||
target_file_path=file_path,
|
target_file_path=file_path,
|
||||||
enable_summary=enable_summary,
|
root_folder_id=root_folder_id,
|
||||||
root_folder_id=root_folder_id,
|
|
||||||
task_logger=task_logger,
|
task_logger=task_logger,
|
||||||
log_entry=log_entry,
|
log_entry=log_entry,
|
||||||
)
|
)
|
||||||
|
|
@ -1036,7 +1023,6 @@ async def _index_single_file(
|
||||||
folder_path: str,
|
folder_path: str,
|
||||||
folder_name: str,
|
folder_name: str,
|
||||||
target_file_path: str,
|
target_file_path: str,
|
||||||
enable_summary: bool,
|
|
||||||
root_folder_id: int | None,
|
root_folder_id: int | None,
|
||||||
task_logger,
|
task_logger,
|
||||||
log_entry,
|
log_entry,
|
||||||
|
|
@ -1125,8 +1111,7 @@ async def _index_single_file(
|
||||||
folder_name=folder_name,
|
folder_name=folder_name,
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
enable_summary=enable_summary,
|
)
|
||||||
)
|
|
||||||
|
|
||||||
if root_folder_id:
|
if root_folder_id:
|
||||||
connector_doc.folder_id = await _resolve_folder_for_file(
|
connector_doc.folder_id = await _resolve_folder_for_file(
|
||||||
|
|
@ -1134,7 +1119,6 @@ async def _index_single_file(
|
||||||
)
|
)
|
||||||
|
|
||||||
pipeline = IndexingPipelineService(session)
|
pipeline = IndexingPipelineService(session)
|
||||||
llm = await get_user_long_context_llm(session, user_id, search_space_id)
|
|
||||||
documents = await pipeline.prepare_for_indexing([connector_doc])
|
documents = await pipeline.prepare_for_indexing([connector_doc])
|
||||||
|
|
||||||
if not documents:
|
if not documents:
|
||||||
|
|
@ -1142,7 +1126,7 @@ async def _index_single_file(
|
||||||
|
|
||||||
db_doc = documents[0]
|
db_doc = documents[0]
|
||||||
|
|
||||||
await pipeline.index(db_doc, connector_doc, llm)
|
await pipeline.index(db_doc, connector_doc)
|
||||||
|
|
||||||
await session.refresh(db_doc)
|
await session.refresh(db_doc)
|
||||||
doc_meta = dict(db_doc.document_metadata or {})
|
doc_meta = dict(db_doc.document_metadata or {})
|
||||||
|
|
@ -1275,7 +1259,6 @@ async def index_uploaded_files(
|
||||||
user_id: str,
|
user_id: str,
|
||||||
folder_name: str,
|
folder_name: str,
|
||||||
root_folder_id: int,
|
root_folder_id: int,
|
||||||
enable_summary: bool,
|
|
||||||
file_mappings: list[dict],
|
file_mappings: list[dict],
|
||||||
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
||||||
use_vision_llm: bool = False,
|
use_vision_llm: bool = False,
|
||||||
|
|
@ -1318,7 +1301,6 @@ async def index_uploaded_files(
|
||||||
|
|
||||||
page_limit_service = PageLimitService(session)
|
page_limit_service = PageLimitService(session)
|
||||||
pipeline = IndexingPipelineService(session)
|
pipeline = IndexingPipelineService(session)
|
||||||
llm = await get_user_long_context_llm(session, user_id, search_space_id)
|
|
||||||
|
|
||||||
vision_llm_instance = None
|
vision_llm_instance = None
|
||||||
if use_vision_llm:
|
if use_vision_llm:
|
||||||
|
|
@ -1414,8 +1396,7 @@ async def index_uploaded_files(
|
||||||
folder_name=folder_name,
|
folder_name=folder_name,
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
enable_summary=enable_summary,
|
)
|
||||||
)
|
|
||||||
|
|
||||||
connector_doc.folder_id = await _resolve_folder_for_file(
|
connector_doc.folder_id = await _resolve_folder_for_file(
|
||||||
session,
|
session,
|
||||||
|
|
@ -1432,7 +1413,7 @@ async def index_uploaded_files(
|
||||||
|
|
||||||
db_doc = documents[0]
|
db_doc = documents[0]
|
||||||
|
|
||||||
await pipeline.index(db_doc, connector_doc, llm)
|
await pipeline.index(db_doc, connector_doc)
|
||||||
|
|
||||||
await session.refresh(db_doc)
|
await session.refresh(db_doc)
|
||||||
doc_meta = dict(db_doc.document_metadata or {})
|
doc_meta = dict(db_doc.document_metadata or {})
|
||||||
|
|
|
||||||
|
|
@ -16,13 +16,11 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
|
|
||||||
from app.connectors.luma_connector import LumaConnector
|
from app.connectors.luma_connector import LumaConnector
|
||||||
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
|
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
|
||||||
from app.services.llm_service import get_user_long_context_llm
|
|
||||||
from app.services.task_logging_service import TaskLoggingService
|
from app.services.task_logging_service import TaskLoggingService
|
||||||
from app.utils.document_converters import (
|
from app.utils.document_converters import (
|
||||||
create_document_chunks,
|
create_document_chunks,
|
||||||
embed_text,
|
embed_text,
|
||||||
generate_content_hash,
|
generate_content_hash,
|
||||||
generate_document_summary,
|
|
||||||
generate_unique_identifier_hash,
|
generate_unique_identifier_hash,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -437,38 +435,14 @@ async def index_luma_events(
|
||||||
document.status = DocumentStatus.processing()
|
document.status = DocumentStatus.processing()
|
||||||
await session.commit()
|
await session.commit()
|
||||||
|
|
||||||
# Heavy processing (LLM, embeddings, chunks)
|
# Heavy processing (embeddings, chunks)
|
||||||
user_llm = await get_user_long_context_llm(
|
|
||||||
session, user_id, search_space_id
|
|
||||||
)
|
|
||||||
|
|
||||||
if user_llm and connector.enable_summary:
|
summary_content = (
|
||||||
document_metadata_for_summary = {
|
f"Luma Event: {item['event_name']}\n\n{item['event_markdown']}"
|
||||||
"event_id": item["event_id"],
|
)
|
||||||
"event_name": item["event_name"],
|
summary_embedding = await asyncio.to_thread(
|
||||||
"event_url": item["event_url"],
|
embed_text, summary_content
|
||||||
"start_at": item["start_at"],
|
)
|
||||||
"end_at": item["end_at"],
|
|
||||||
"timezone": item["timezone"],
|
|
||||||
"location": item["location"] or "No location",
|
|
||||||
"city": item["city"],
|
|
||||||
"hosts": item["host_names"],
|
|
||||||
"document_type": "Luma Event",
|
|
||||||
"connector_type": "Luma",
|
|
||||||
}
|
|
||||||
(
|
|
||||||
summary_content,
|
|
||||||
summary_embedding,
|
|
||||||
) = await generate_document_summary(
|
|
||||||
item["event_markdown"], user_llm, document_metadata_for_summary
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
summary_content = (
|
|
||||||
f"Luma Event: {item['event_name']}\n\n{item['event_markdown']}"
|
|
||||||
)
|
|
||||||
summary_embedding = await asyncio.to_thread(
|
|
||||||
embed_text, summary_content
|
|
||||||
)
|
|
||||||
|
|
||||||
chunks = await create_document_chunks(item["event_markdown"])
|
chunks = await create_document_chunks(item["event_markdown"])
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,7 @@
|
||||||
Notion connector indexer.
|
Notion connector indexer.
|
||||||
|
|
||||||
Uses the shared IndexingPipelineService for document deduplication,
|
Uses the shared IndexingPipelineService for document deduplication,
|
||||||
summarization, chunking, and embedding with bounded parallel indexing.
|
chunking, and embedding with bounded parallel indexing.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from collections.abc import Awaitable, Callable
|
from collections.abc import Awaitable, Callable
|
||||||
|
|
@ -19,7 +19,6 @@ from app.indexing_pipeline.indexing_pipeline_service import (
|
||||||
IndexingPipelineService,
|
IndexingPipelineService,
|
||||||
PlaceholderInfo,
|
PlaceholderInfo,
|
||||||
)
|
)
|
||||||
from app.services.llm_service import get_user_long_context_llm
|
|
||||||
from app.services.task_logging_service import TaskLoggingService
|
from app.services.task_logging_service import TaskLoggingService
|
||||||
from app.utils.notion_utils import process_blocks
|
from app.utils.notion_utils import process_blocks
|
||||||
|
|
||||||
|
|
@ -43,7 +42,6 @@ def _build_connector_doc(
|
||||||
connector_id: int,
|
connector_id: int,
|
||||||
search_space_id: int,
|
search_space_id: int,
|
||||||
user_id: str,
|
user_id: str,
|
||||||
enable_summary: bool,
|
|
||||||
) -> ConnectorDocument:
|
) -> ConnectorDocument:
|
||||||
"""Map a raw Notion page dict to a ConnectorDocument."""
|
"""Map a raw Notion page dict to a ConnectorDocument."""
|
||||||
page_id = page.get("page_id", "")
|
page_id = page.get("page_id", "")
|
||||||
|
|
@ -57,8 +55,6 @@ def _build_connector_doc(
|
||||||
"connector_type": "Notion",
|
"connector_type": "Notion",
|
||||||
}
|
}
|
||||||
|
|
||||||
fallback_summary = f"Notion Page: {page_title}\n\n{markdown_content}"
|
|
||||||
|
|
||||||
return ConnectorDocument(
|
return ConnectorDocument(
|
||||||
title=page_title,
|
title=page_title,
|
||||||
source_markdown=markdown_content,
|
source_markdown=markdown_content,
|
||||||
|
|
@ -67,8 +63,6 @@ def _build_connector_doc(
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
connector_id=connector_id,
|
connector_id=connector_id,
|
||||||
created_by_id=user_id,
|
created_by_id=user_id,
|
||||||
should_summarize=enable_summary,
|
|
||||||
fallback_summary=fallback_summary,
|
|
||||||
metadata=metadata,
|
metadata=metadata,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -314,8 +308,7 @@ async def index_notion_pages(
|
||||||
connector_id=connector_id,
|
connector_id=connector_id,
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
enable_summary=connector.enable_summary,
|
)
|
||||||
)
|
|
||||||
|
|
||||||
with session.no_autoflush:
|
with session.no_autoflush:
|
||||||
duplicate = await check_duplicate_document_by_hash(
|
duplicate = await check_duplicate_document_by_hash(
|
||||||
|
|
@ -343,13 +336,8 @@ async def index_notion_pages(
|
||||||
|
|
||||||
# ── Pipeline: migrate legacy docs + parallel index ────────────
|
# ── Pipeline: migrate legacy docs + parallel index ────────────
|
||||||
await pipeline.migrate_legacy_docs(connector_docs)
|
await pipeline.migrate_legacy_docs(connector_docs)
|
||||||
|
|
||||||
async def _get_llm(s):
|
|
||||||
return await get_user_long_context_llm(s, user_id, search_space_id)
|
|
||||||
|
|
||||||
_, documents_indexed, documents_failed = await pipeline.index_batch_parallel(
|
_, documents_indexed, documents_failed = await pipeline.index_batch_parallel(
|
||||||
connector_docs,
|
connector_docs,
|
||||||
_get_llm,
|
|
||||||
max_concurrency=3,
|
max_concurrency=3,
|
||||||
on_heartbeat=on_heartbeat_callback,
|
on_heartbeat=on_heartbeat_callback,
|
||||||
heartbeat_interval=HEARTBEAT_INTERVAL_SECONDS,
|
heartbeat_interval=HEARTBEAT_INTERVAL_SECONDS,
|
||||||
|
|
|
||||||
|
|
@ -27,7 +27,6 @@ from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnector
|
||||||
from app.indexing_pipeline.connector_document import ConnectorDocument
|
from app.indexing_pipeline.connector_document import ConnectorDocument
|
||||||
from app.indexing_pipeline.document_hashing import compute_identifier_hash
|
from app.indexing_pipeline.document_hashing import compute_identifier_hash
|
||||||
from app.indexing_pipeline.indexing_pipeline_service import IndexingPipelineService
|
from app.indexing_pipeline.indexing_pipeline_service import IndexingPipelineService
|
||||||
from app.services.llm_service import get_user_long_context_llm
|
|
||||||
from app.services.page_limit_service import PageLimitService
|
from app.services.page_limit_service import PageLimitService
|
||||||
from app.services.task_logging_service import TaskLoggingService
|
from app.services.task_logging_service import TaskLoggingService
|
||||||
from app.tasks.connector_indexers.base import (
|
from app.tasks.connector_indexers.base import (
|
||||||
|
|
@ -133,7 +132,6 @@ def _build_connector_doc(
|
||||||
connector_id: int,
|
connector_id: int,
|
||||||
search_space_id: int,
|
search_space_id: int,
|
||||||
user_id: str,
|
user_id: str,
|
||||||
enable_summary: bool,
|
|
||||||
) -> ConnectorDocument:
|
) -> ConnectorDocument:
|
||||||
file_id = file.get("id", "")
|
file_id = file.get("id", "")
|
||||||
file_name = file.get("name", "Unknown")
|
file_name = file.get("name", "Unknown")
|
||||||
|
|
@ -145,8 +143,6 @@ def _build_connector_doc(
|
||||||
"connector_type": "OneDrive",
|
"connector_type": "OneDrive",
|
||||||
}
|
}
|
||||||
|
|
||||||
fallback_summary = f"File: {file_name}\n\n{markdown[:4000]}"
|
|
||||||
|
|
||||||
return ConnectorDocument(
|
return ConnectorDocument(
|
||||||
title=file_name,
|
title=file_name,
|
||||||
source_markdown=markdown,
|
source_markdown=markdown,
|
||||||
|
|
@ -155,8 +151,6 @@ def _build_connector_doc(
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
connector_id=connector_id,
|
connector_id=connector_id,
|
||||||
created_by_id=user_id,
|
created_by_id=user_id,
|
||||||
should_summarize=enable_summary,
|
|
||||||
fallback_summary=fallback_summary,
|
|
||||||
metadata=metadata,
|
metadata=metadata,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -168,7 +162,6 @@ async def _download_files_parallel(
|
||||||
connector_id: int,
|
connector_id: int,
|
||||||
search_space_id: int,
|
search_space_id: int,
|
||||||
user_id: str,
|
user_id: str,
|
||||||
enable_summary: bool,
|
|
||||||
max_concurrency: int = 3,
|
max_concurrency: int = 3,
|
||||||
on_heartbeat: HeartbeatCallbackType | None = None,
|
on_heartbeat: HeartbeatCallbackType | None = None,
|
||||||
vision_llm=None,
|
vision_llm=None,
|
||||||
|
|
@ -198,7 +191,6 @@ async def _download_files_parallel(
|
||||||
connector_id=connector_id,
|
connector_id=connector_id,
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
enable_summary=enable_summary,
|
|
||||||
)
|
)
|
||||||
async with hb_lock:
|
async with hb_lock:
|
||||||
completed_count += 1
|
completed_count += 1
|
||||||
|
|
@ -230,7 +222,6 @@ async def _download_and_index(
|
||||||
connector_id: int,
|
connector_id: int,
|
||||||
search_space_id: int,
|
search_space_id: int,
|
||||||
user_id: str,
|
user_id: str,
|
||||||
enable_summary: bool,
|
|
||||||
on_heartbeat: HeartbeatCallbackType | None = None,
|
on_heartbeat: HeartbeatCallbackType | None = None,
|
||||||
vision_llm=None,
|
vision_llm=None,
|
||||||
) -> tuple[int, int]:
|
) -> tuple[int, int]:
|
||||||
|
|
@ -241,7 +232,6 @@ async def _download_and_index(
|
||||||
connector_id=connector_id,
|
connector_id=connector_id,
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
enable_summary=enable_summary,
|
|
||||||
on_heartbeat=on_heartbeat,
|
on_heartbeat=on_heartbeat,
|
||||||
vision_llm=vision_llm,
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
|
|
@ -250,13 +240,8 @@ async def _download_and_index(
|
||||||
batch_failed = 0
|
batch_failed = 0
|
||||||
if connector_docs:
|
if connector_docs:
|
||||||
pipeline = IndexingPipelineService(session)
|
pipeline = IndexingPipelineService(session)
|
||||||
|
|
||||||
async def _get_llm(s):
|
|
||||||
return await get_user_long_context_llm(s, user_id, search_space_id)
|
|
||||||
|
|
||||||
_, batch_indexed, batch_failed = await pipeline.index_batch_parallel(
|
_, batch_indexed, batch_failed = await pipeline.index_batch_parallel(
|
||||||
connector_docs,
|
connector_docs,
|
||||||
_get_llm,
|
|
||||||
max_concurrency=3,
|
max_concurrency=3,
|
||||||
on_heartbeat=on_heartbeat,
|
on_heartbeat=on_heartbeat,
|
||||||
)
|
)
|
||||||
|
|
@ -294,7 +279,6 @@ async def _index_selected_files(
|
||||||
connector_id: int,
|
connector_id: int,
|
||||||
search_space_id: int,
|
search_space_id: int,
|
||||||
user_id: str,
|
user_id: str,
|
||||||
enable_summary: bool,
|
|
||||||
on_heartbeat: HeartbeatCallbackType | None = None,
|
on_heartbeat: HeartbeatCallbackType | None = None,
|
||||||
vision_llm=None,
|
vision_llm=None,
|
||||||
) -> tuple[int, int, int, list[str]]:
|
) -> tuple[int, int, int, list[str]]:
|
||||||
|
|
@ -345,7 +329,6 @@ async def _index_selected_files(
|
||||||
connector_id=connector_id,
|
connector_id=connector_id,
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
enable_summary=enable_summary,
|
|
||||||
on_heartbeat=on_heartbeat,
|
on_heartbeat=on_heartbeat,
|
||||||
vision_llm=vision_llm,
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
|
|
@ -379,7 +362,6 @@ async def _index_full_scan(
|
||||||
max_files: int,
|
max_files: int,
|
||||||
include_subfolders: bool = True,
|
include_subfolders: bool = True,
|
||||||
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
||||||
enable_summary: bool = True,
|
|
||||||
vision_llm=None,
|
vision_llm=None,
|
||||||
) -> tuple[int, int, int]:
|
) -> tuple[int, int, int]:
|
||||||
"""Full scan indexing of a folder.
|
"""Full scan indexing of a folder.
|
||||||
|
|
@ -454,7 +436,6 @@ async def _index_full_scan(
|
||||||
connector_id=connector_id,
|
connector_id=connector_id,
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
enable_summary=enable_summary,
|
|
||||||
on_heartbeat=on_heartbeat_callback,
|
on_heartbeat=on_heartbeat_callback,
|
||||||
vision_llm=vision_llm,
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
|
|
@ -487,7 +468,6 @@ async def _index_with_delta_sync(
|
||||||
log_entry: object,
|
log_entry: object,
|
||||||
max_files: int,
|
max_files: int,
|
||||||
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
||||||
enable_summary: bool = True,
|
|
||||||
vision_llm=None,
|
vision_llm=None,
|
||||||
) -> tuple[int, int, int, str | None]:
|
) -> tuple[int, int, int, str | None]:
|
||||||
"""Delta sync using OneDrive change tracking.
|
"""Delta sync using OneDrive change tracking.
|
||||||
|
|
@ -579,7 +559,6 @@ async def _index_with_delta_sync(
|
||||||
connector_id=connector_id,
|
connector_id=connector_id,
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
enable_summary=enable_summary,
|
|
||||||
on_heartbeat=on_heartbeat_callback,
|
on_heartbeat=on_heartbeat_callback,
|
||||||
vision_llm=vision_llm,
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
|
|
@ -651,7 +630,6 @@ async def index_onedrive_files(
|
||||||
)
|
)
|
||||||
return 0, 0, error_msg, 0
|
return 0, 0, error_msg, 0
|
||||||
|
|
||||||
connector_enable_summary = getattr(connector, "enable_summary", True)
|
|
||||||
connector_enable_vision_llm = getattr(connector, "enable_vision_llm", False)
|
connector_enable_vision_llm = getattr(connector, "enable_vision_llm", False)
|
||||||
vision_llm = None
|
vision_llm = None
|
||||||
if connector_enable_vision_llm:
|
if connector_enable_vision_llm:
|
||||||
|
|
@ -681,7 +659,6 @@ async def index_onedrive_files(
|
||||||
connector_id=connector_id,
|
connector_id=connector_id,
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
enable_summary=connector_enable_summary,
|
|
||||||
vision_llm=vision_llm,
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
total_indexed += indexed
|
total_indexed += indexed
|
||||||
|
|
@ -711,7 +688,6 @@ async def index_onedrive_files(
|
||||||
task_logger,
|
task_logger,
|
||||||
log_entry,
|
log_entry,
|
||||||
max_files,
|
max_files,
|
||||||
enable_summary=connector_enable_summary,
|
|
||||||
vision_llm=vision_llm,
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
total_indexed += indexed
|
total_indexed += indexed
|
||||||
|
|
@ -738,7 +714,6 @@ async def index_onedrive_files(
|
||||||
log_entry,
|
log_entry,
|
||||||
max_files,
|
max_files,
|
||||||
include_subfolders,
|
include_subfolders,
|
||||||
enable_summary=connector_enable_summary,
|
|
||||||
vision_llm=vision_llm,
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
total_indexed += ri
|
total_indexed += ri
|
||||||
|
|
@ -758,7 +733,6 @@ async def index_onedrive_files(
|
||||||
log_entry,
|
log_entry,
|
||||||
max_files,
|
max_files,
|
||||||
include_subfolders,
|
include_subfolders,
|
||||||
enable_summary=connector_enable_summary,
|
|
||||||
vision_llm=vision_llm,
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
total_indexed += indexed
|
total_indexed += indexed
|
||||||
|
|
|
||||||
|
|
@ -15,13 +15,11 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
|
|
||||||
from app.connectors.webcrawler_connector import WebCrawlerConnector
|
from app.connectors.webcrawler_connector import WebCrawlerConnector
|
||||||
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
|
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
|
||||||
from app.services.llm_service import get_user_long_context_llm
|
|
||||||
from app.services.task_logging_service import TaskLoggingService
|
from app.services.task_logging_service import TaskLoggingService
|
||||||
from app.utils.document_converters import (
|
from app.utils.document_converters import (
|
||||||
create_document_chunks,
|
create_document_chunks,
|
||||||
embed_text,
|
embed_text,
|
||||||
generate_content_hash,
|
generate_content_hash,
|
||||||
generate_document_summary,
|
|
||||||
generate_unique_identifier_hash,
|
generate_unique_identifier_hash,
|
||||||
)
|
)
|
||||||
from app.utils.webcrawler_utils import parse_webcrawler_urls
|
from app.utils.webcrawler_utils import parse_webcrawler_urls
|
||||||
|
|
@ -372,29 +370,10 @@ async def index_crawled_urls(
|
||||||
documents_skipped += 1
|
documents_skipped += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Generate summary with LLM
|
# Select deterministic document content
|
||||||
user_llm = await get_user_long_context_llm(
|
|
||||||
session, user_id, search_space_id
|
|
||||||
)
|
|
||||||
|
|
||||||
if user_llm and connector.enable_summary:
|
summary_content = f"Crawled URL: {title}\n\nURL: {url}\n\n{content}"
|
||||||
document_metadata_for_summary = {
|
summary_embedding = embed_text(summary_content)
|
||||||
"url": url,
|
|
||||||
"title": title,
|
|
||||||
"description": description,
|
|
||||||
"language": language,
|
|
||||||
"document_type": "Crawled URL",
|
|
||||||
"crawler_type": crawler_type,
|
|
||||||
}
|
|
||||||
(
|
|
||||||
summary_content,
|
|
||||||
summary_embedding,
|
|
||||||
) = await generate_document_summary(
|
|
||||||
structured_document, user_llm, document_metadata_for_summary
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
summary_content = f"Crawled URL: {title}\n\nURL: {url}\n\n{content}"
|
|
||||||
summary_embedding = embed_text(summary_content)
|
|
||||||
|
|
||||||
# Process chunks
|
# Process chunks
|
||||||
chunks = await create_document_chunks(content)
|
chunks = await create_document_chunks(content)
|
||||||
|
|
|
||||||
|
|
@ -1,20 +1,15 @@
|
||||||
"""
|
"""Unified document save/update logic for file processors."""
|
||||||
Unified document save/update logic for file processors.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import asyncio
|
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
from sqlalchemy.exc import SQLAlchemyError
|
from sqlalchemy.exc import SQLAlchemyError
|
||||||
from sqlalchemy.ext.asyncio import AsyncSession
|
from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
|
|
||||||
from app.db import Document, DocumentStatus, DocumentType
|
from app.db import Document, DocumentStatus, DocumentType
|
||||||
from app.services.llm_service import get_user_long_context_llm
|
|
||||||
from app.utils.document_converters import (
|
from app.utils.document_converters import (
|
||||||
create_document_chunks,
|
create_document_chunks,
|
||||||
embed_text,
|
embed_text,
|
||||||
generate_content_hash,
|
generate_content_hash,
|
||||||
generate_document_summary,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
from ._helpers import (
|
from ._helpers import (
|
||||||
|
|
@ -24,59 +19,6 @@ from ._helpers import (
|
||||||
)
|
)
|
||||||
from .base import get_current_timestamp, safe_set_chunks
|
from .base import get_current_timestamp, safe_set_chunks
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
# Summary generation
|
|
||||||
# ---------------------------------------------------------------------------
|
|
||||||
|
|
||||||
|
|
||||||
async def _generate_summary(
|
|
||||||
markdown_content: str,
|
|
||||||
file_name: str,
|
|
||||||
etl_service: str,
|
|
||||||
user_llm,
|
|
||||||
enable_summary: bool,
|
|
||||||
) -> tuple[str, list[float]]:
|
|
||||||
"""
|
|
||||||
Generate a document summary and embedding.
|
|
||||||
|
|
||||||
Docling uses its own large-document summary strategy; other ETL services
|
|
||||||
use the standard ``generate_document_summary`` helper.
|
|
||||||
"""
|
|
||||||
if not enable_summary:
|
|
||||||
summary = f"File: {file_name}\n\n{markdown_content[:4000]}"
|
|
||||||
return summary, await asyncio.to_thread(embed_text, summary)
|
|
||||||
|
|
||||||
if etl_service == "DOCLING":
|
|
||||||
from app.services.docling_service import create_docling_service
|
|
||||||
|
|
||||||
docling_service = create_docling_service()
|
|
||||||
summary_text = await docling_service.process_large_document_summary(
|
|
||||||
content=markdown_content, llm=user_llm, document_title=file_name
|
|
||||||
)
|
|
||||||
|
|
||||||
meta = {
|
|
||||||
"file_name": file_name,
|
|
||||||
"etl_service": etl_service,
|
|
||||||
"document_type": "File Document",
|
|
||||||
}
|
|
||||||
parts = ["# DOCUMENT METADATA"]
|
|
||||||
for key, value in meta.items():
|
|
||||||
if value:
|
|
||||||
formatted_key = key.replace("_", " ").title()
|
|
||||||
parts.append(f"**{formatted_key}:** {value}")
|
|
||||||
|
|
||||||
enhanced = "\n".join(parts) + "\n\n# DOCUMENT SUMMARY\n\n" + summary_text
|
|
||||||
return enhanced, await asyncio.to_thread(embed_text, enhanced)
|
|
||||||
|
|
||||||
# Standard summary (Unstructured / LlamaCloud / others)
|
|
||||||
meta = {
|
|
||||||
"file_name": file_name,
|
|
||||||
"etl_service": etl_service,
|
|
||||||
"document_type": "File Document",
|
|
||||||
}
|
|
||||||
return await generate_document_summary(markdown_content, user_llm, meta)
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Unified save function
|
# Unified save function
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
@ -90,7 +32,6 @@ async def save_file_document(
|
||||||
user_id: str,
|
user_id: str,
|
||||||
etl_service: str,
|
etl_service: str,
|
||||||
connector: dict | None = None,
|
connector: dict | None = None,
|
||||||
enable_summary: bool = True,
|
|
||||||
) -> Document | None:
|
) -> Document | None:
|
||||||
"""
|
"""
|
||||||
Process and store a file document with deduplication and migration support.
|
Process and store a file document with deduplication and migration support.
|
||||||
|
|
@ -106,7 +47,6 @@ async def save_file_document(
|
||||||
user_id: ID of the user
|
user_id: ID of the user
|
||||||
etl_service: Name of the ETL service (UNSTRUCTURED, LLAMACLOUD, DOCLING)
|
etl_service: Name of the ETL service (UNSTRUCTURED, LLAMACLOUD, DOCLING)
|
||||||
connector: Optional connector info for Google Drive files
|
connector: Optional connector info for Google Drive files
|
||||||
enable_summary: Whether to generate an AI summary
|
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Document object if successful, None if duplicate detected
|
Document object if successful, None if duplicate detected
|
||||||
|
|
@ -133,24 +73,16 @@ async def save_file_document(
|
||||||
if should_skip:
|
if should_skip:
|
||||||
return doc
|
return doc
|
||||||
|
|
||||||
user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
|
document_content = f"File: {file_name}\n\n{markdown_content[:4000]}"
|
||||||
if not user_llm:
|
document_embedding = embed_text(document_content)
|
||||||
raise RuntimeError(
|
|
||||||
f"No long context LLM configured for user {user_id} "
|
|
||||||
f"in search space {search_space_id}"
|
|
||||||
)
|
|
||||||
|
|
||||||
summary_content, summary_embedding = await _generate_summary(
|
|
||||||
markdown_content, file_name, etl_service, user_llm, enable_summary
|
|
||||||
)
|
|
||||||
chunks = await create_document_chunks(markdown_content)
|
chunks = await create_document_chunks(markdown_content)
|
||||||
doc_metadata = {"FILE_NAME": file_name, "ETL_SERVICE": etl_service}
|
doc_metadata = {"FILE_NAME": file_name, "ETL_SERVICE": etl_service}
|
||||||
|
|
||||||
if existing_document:
|
if existing_document:
|
||||||
existing_document.title = file_name
|
existing_document.title = file_name
|
||||||
existing_document.content = summary_content
|
existing_document.content = document_content
|
||||||
existing_document.content_hash = content_hash
|
existing_document.content_hash = content_hash
|
||||||
existing_document.embedding = summary_embedding
|
existing_document.embedding = document_embedding
|
||||||
existing_document.document_metadata = doc_metadata
|
existing_document.document_metadata = doc_metadata
|
||||||
await safe_set_chunks(session, existing_document, chunks)
|
await safe_set_chunks(session, existing_document, chunks)
|
||||||
existing_document.source_markdown = markdown_content
|
existing_document.source_markdown = markdown_content
|
||||||
|
|
@ -171,8 +103,8 @@ async def save_file_document(
|
||||||
title=file_name,
|
title=file_name,
|
||||||
document_type=doc_type,
|
document_type=doc_type,
|
||||||
document_metadata=doc_metadata,
|
document_metadata=doc_metadata,
|
||||||
content=summary_content,
|
content=document_content,
|
||||||
embedding=summary_embedding,
|
embedding=document_embedding,
|
||||||
chunks=chunks,
|
chunks=chunks,
|
||||||
content_hash=content_hash,
|
content_hash=content_hash,
|
||||||
unique_identifier_hash=primary_hash,
|
unique_identifier_hash=primary_hash,
|
||||||
|
|
|
||||||
|
|
@ -25,11 +25,10 @@ from app.db import (
|
||||||
SearchSourceConnectorType,
|
SearchSourceConnectorType,
|
||||||
SearchSpace,
|
SearchSpace,
|
||||||
)
|
)
|
||||||
from app.services.llm_service import get_document_summary_llm
|
|
||||||
from app.utils.document_converters import (
|
from app.utils.document_converters import (
|
||||||
create_document_chunks,
|
create_document_chunks,
|
||||||
|
embed_text,
|
||||||
generate_content_hash,
|
generate_content_hash,
|
||||||
generate_document_summary,
|
|
||||||
generate_unique_identifier_hash,
|
generate_unique_identifier_hash,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -176,34 +175,8 @@ async def add_circleback_meeting_document(
|
||||||
# PHASE 3: Process the document content
|
# PHASE 3: Process the document content
|
||||||
# =======================================================================
|
# =======================================================================
|
||||||
|
|
||||||
# Get LLM for generating summary
|
summary_content = markdown_content
|
||||||
llm = await get_document_summary_llm(session, search_space_id)
|
summary_embedding = embed_text(summary_content)
|
||||||
if not llm:
|
|
||||||
logger.warning(
|
|
||||||
f"No LLM configured for search space {search_space_id}. Using content as summary."
|
|
||||||
)
|
|
||||||
# Use first 1000 chars as summary if no LLM available
|
|
||||||
summary_content = (
|
|
||||||
markdown_content[:1000] + "..."
|
|
||||||
if len(markdown_content) > 1000
|
|
||||||
else markdown_content
|
|
||||||
)
|
|
||||||
summary_embedding = None
|
|
||||||
else:
|
|
||||||
# Generate summary with metadata
|
|
||||||
summary_metadata = {
|
|
||||||
"meeting_name": meeting_name,
|
|
||||||
"meeting_id": meeting_id,
|
|
||||||
"document_type": "Circleback Meeting",
|
|
||||||
**{
|
|
||||||
k: v
|
|
||||||
for k, v in metadata.items()
|
|
||||||
if isinstance(v, str | int | float | bool)
|
|
||||||
},
|
|
||||||
}
|
|
||||||
summary_content, summary_embedding = await generate_document_summary(
|
|
||||||
markdown_content, llm, summary_metadata
|
|
||||||
)
|
|
||||||
|
|
||||||
# Process chunks
|
# Process chunks
|
||||||
chunks = await create_document_chunks(markdown_content)
|
chunks = await create_document_chunks(markdown_content)
|
||||||
|
|
@ -224,8 +197,7 @@ async def add_circleback_meeting_document(
|
||||||
document.title = meeting_name
|
document.title = meeting_name
|
||||||
document.content = summary_content
|
document.content = summary_content
|
||||||
document.content_hash = content_hash
|
document.content_hash = content_hash
|
||||||
if summary_embedding is not None:
|
document.embedding = summary_embedding
|
||||||
document.embedding = summary_embedding
|
|
||||||
document.document_metadata = document_metadata
|
document.document_metadata = document_metadata
|
||||||
await safe_set_chunks(session, document, chunks)
|
await safe_set_chunks(session, document, chunks)
|
||||||
document.source_markdown = markdown_content
|
document.source_markdown = markdown_content
|
||||||
|
|
|
||||||
|
|
@ -9,12 +9,11 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
|
|
||||||
from app.db import Document, DocumentType
|
from app.db import Document, DocumentType
|
||||||
from app.schemas import ExtensionDocumentContent
|
from app.schemas import ExtensionDocumentContent
|
||||||
from app.services.llm_service import get_user_long_context_llm
|
|
||||||
from app.services.task_logging_service import TaskLoggingService
|
from app.services.task_logging_service import TaskLoggingService
|
||||||
from app.utils.document_converters import (
|
from app.utils.document_converters import (
|
||||||
create_document_chunks,
|
create_document_chunks,
|
||||||
|
embed_text,
|
||||||
generate_content_hash,
|
generate_content_hash,
|
||||||
generate_document_summary,
|
|
||||||
generate_unique_identifier_hash,
|
generate_unique_identifier_hash,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -123,26 +122,8 @@ async def add_extension_received_document(
|
||||||
f"Content changed for URL {content.metadata.VisitedWebPageURL}. Updating document."
|
f"Content changed for URL {content.metadata.VisitedWebPageURL}. Updating document."
|
||||||
)
|
)
|
||||||
|
|
||||||
# Get user's long context LLM (needed for both create and update)
|
summary_content = combined_document_string
|
||||||
user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
|
summary_embedding = embed_text(summary_content)
|
||||||
if not user_llm:
|
|
||||||
raise RuntimeError(
|
|
||||||
f"No long context LLM configured for user {user_id} in search space {search_space_id}"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Generate summary with metadata
|
|
||||||
document_metadata = {
|
|
||||||
"session_id": content.metadata.BrowsingSessionId,
|
|
||||||
"url": content.metadata.VisitedWebPageURL,
|
|
||||||
"title": content.metadata.VisitedWebPageTitle,
|
|
||||||
"referrer": content.metadata.VisitedWebPageReffererURL,
|
|
||||||
"timestamp": content.metadata.VisitedWebPageDateWithTimeInISOString,
|
|
||||||
"duration_ms": content.metadata.VisitedWebPageVisitDurationInMilliseconds,
|
|
||||||
"document_type": "Browser Extension Capture",
|
|
||||||
}
|
|
||||||
summary_content, summary_embedding = await generate_document_summary(
|
|
||||||
combined_document_string, user_llm, document_metadata
|
|
||||||
)
|
|
||||||
|
|
||||||
# Process chunks
|
# Process chunks
|
||||||
chunks = await create_document_chunks(content.pageContent)
|
chunks = await create_document_chunks(content.pageContent)
|
||||||
|
|
|
||||||
|
|
@ -10,7 +10,7 @@ from __future__ import annotations
|
||||||
import contextlib
|
import contextlib
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass
|
||||||
|
|
||||||
from fastapi import HTTPException
|
from fastapi import HTTPException
|
||||||
from sqlalchemy.ext.asyncio import AsyncSession
|
from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
|
|
@ -48,12 +48,6 @@ class _ProcessingContext:
|
||||||
notification: Notification | None = None
|
notification: Notification | None = None
|
||||||
use_vision_llm: bool = False
|
use_vision_llm: bool = False
|
||||||
processing_mode: str = "basic"
|
processing_mode: str = "basic"
|
||||||
enable_summary: bool = field(init=False)
|
|
||||||
|
|
||||||
def __post_init__(self) -> None:
|
|
||||||
self.enable_summary = (
|
|
||||||
self.connector.get("enable_summary", True) if self.connector else True
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
@ -261,7 +255,6 @@ async def _process_document_upload(ctx: _ProcessingContext) -> Document | None:
|
||||||
ctx.user_id,
|
ctx.user_id,
|
||||||
etl_result.etl_service,
|
etl_result.etl_service,
|
||||||
ctx.connector,
|
ctx.connector,
|
||||||
enable_summary=ctx.enable_summary,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
if result:
|
if result:
|
||||||
|
|
@ -466,7 +459,6 @@ async def process_file_in_background_with_document(
|
||||||
log_entry: Log,
|
log_entry: Log,
|
||||||
connector: dict | None = None,
|
connector: dict | None = None,
|
||||||
notification: Notification | None = None,
|
notification: Notification | None = None,
|
||||||
should_summarize: bool = False,
|
|
||||||
use_vision_llm: bool = False,
|
use_vision_llm: bool = False,
|
||||||
processing_mode: str = "basic",
|
processing_mode: str = "basic",
|
||||||
) -> Document | None:
|
) -> Document | None:
|
||||||
|
|
@ -482,7 +474,6 @@ async def process_file_in_background_with_document(
|
||||||
from app.indexing_pipeline.adapters.file_upload_adapter import (
|
from app.indexing_pipeline.adapters.file_upload_adapter import (
|
||||||
UploadDocumentAdapter,
|
UploadDocumentAdapter,
|
||||||
)
|
)
|
||||||
from app.services.llm_service import get_user_long_context_llm
|
|
||||||
from app.utils.document_converters import generate_content_hash
|
from app.utils.document_converters import generate_content_hash
|
||||||
|
|
||||||
from .base import check_duplicate_document
|
from .base import check_duplicate_document
|
||||||
|
|
@ -522,8 +513,6 @@ async def process_file_in_background_with_document(
|
||||||
stage="chunking",
|
stage="chunking",
|
||||||
)
|
)
|
||||||
|
|
||||||
user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
|
|
||||||
|
|
||||||
adapter = UploadDocumentAdapter(session)
|
adapter = UploadDocumentAdapter(session)
|
||||||
await adapter.index(
|
await adapter.index(
|
||||||
markdown_content=markdown_content,
|
markdown_content=markdown_content,
|
||||||
|
|
@ -531,8 +520,6 @@ async def process_file_in_background_with_document(
|
||||||
etl_service=etl_service,
|
etl_service=etl_service,
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
llm=user_llm,
|
|
||||||
should_summarize=should_summarize,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
if billable_pages > 0:
|
if billable_pages > 0:
|
||||||
|
|
|
||||||
|
|
@ -8,12 +8,11 @@ from sqlalchemy.exc import SQLAlchemyError
|
||||||
from sqlalchemy.ext.asyncio import AsyncSession
|
from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
|
|
||||||
from app.db import Document, DocumentStatus, DocumentType
|
from app.db import Document, DocumentStatus, DocumentType
|
||||||
from app.services.llm_service import get_user_long_context_llm
|
|
||||||
from app.services.task_logging_service import TaskLoggingService
|
from app.services.task_logging_service import TaskLoggingService
|
||||||
from app.utils.document_converters import (
|
from app.utils.document_converters import (
|
||||||
create_document_chunks,
|
create_document_chunks,
|
||||||
|
embed_text,
|
||||||
generate_content_hash,
|
generate_content_hash,
|
||||||
generate_document_summary,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
from ._helpers import (
|
from ._helpers import (
|
||||||
|
|
@ -183,21 +182,8 @@ async def add_received_markdown_file_document(
|
||||||
return doc
|
return doc
|
||||||
# Content changed - continue to update
|
# Content changed - continue to update
|
||||||
|
|
||||||
# Get user's long context LLM (needed for both create and update)
|
summary_content = f"File: {file_name}\n\n{file_in_markdown[:4000]}"
|
||||||
user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
|
summary_embedding = embed_text(summary_content)
|
||||||
if not user_llm:
|
|
||||||
raise RuntimeError(
|
|
||||||
f"No long context LLM configured for user {user_id} in search space {search_space_id}"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Generate summary with metadata
|
|
||||||
document_metadata = {
|
|
||||||
"file_name": file_name,
|
|
||||||
"document_type": "Markdown File Document",
|
|
||||||
}
|
|
||||||
summary_content, summary_embedding = await generate_document_summary(
|
|
||||||
file_in_markdown, user_llm, document_metadata
|
|
||||||
)
|
|
||||||
|
|
||||||
# Process chunks
|
# Process chunks
|
||||||
chunks = await create_document_chunks(file_in_markdown)
|
chunks = await create_document_chunks(file_in_markdown)
|
||||||
|
|
|
||||||
|
|
@ -17,12 +17,11 @@ from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
from youtube_transcript_api import YouTubeTranscriptApi
|
from youtube_transcript_api import YouTubeTranscriptApi
|
||||||
|
|
||||||
from app.db import Document, DocumentStatus, DocumentType
|
from app.db import Document, DocumentStatus, DocumentType
|
||||||
from app.services.llm_service import get_user_long_context_llm
|
|
||||||
from app.services.task_logging_service import TaskLoggingService
|
from app.services.task_logging_service import TaskLoggingService
|
||||||
from app.utils.document_converters import (
|
from app.utils.document_converters import (
|
||||||
create_document_chunks,
|
create_document_chunks,
|
||||||
|
embed_text,
|
||||||
generate_content_hash,
|
generate_content_hash,
|
||||||
generate_document_summary,
|
|
||||||
generate_unique_identifier_hash,
|
generate_unique_identifier_hash,
|
||||||
)
|
)
|
||||||
from app.utils.proxy_config import get_requests_proxies
|
from app.utils.proxy_config import get_requests_proxies
|
||||||
|
|
@ -355,40 +354,8 @@ async def add_youtube_video_document(
|
||||||
await session.commit()
|
await session.commit()
|
||||||
return document
|
return document
|
||||||
|
|
||||||
# Get LLM for summary generation
|
summary_content = combined_document_string
|
||||||
await task_logger.log_task_progress(
|
summary_embedding = embed_text(summary_content)
|
||||||
log_entry,
|
|
||||||
f"Preparing for summary generation: {video_data.get('title', 'YouTube Video')}",
|
|
||||||
{"stage": "llm_setup"},
|
|
||||||
)
|
|
||||||
|
|
||||||
# Get user's long context LLM
|
|
||||||
user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
|
|
||||||
if not user_llm:
|
|
||||||
raise RuntimeError(
|
|
||||||
f"No long context LLM configured for user {user_id} in search space {search_space_id}"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Generate summary
|
|
||||||
await task_logger.log_task_progress(
|
|
||||||
log_entry,
|
|
||||||
f"Generating summary for video: {video_data.get('title', 'YouTube Video')}",
|
|
||||||
{"stage": "summary_generation"},
|
|
||||||
)
|
|
||||||
|
|
||||||
# Generate summary with metadata
|
|
||||||
document_metadata_for_summary = {
|
|
||||||
"url": url,
|
|
||||||
"video_id": video_id,
|
|
||||||
"title": video_data.get("title", "YouTube Video"),
|
|
||||||
"author": video_data.get("author_name", "Unknown"),
|
|
||||||
"thumbnail": video_data.get("thumbnail_url", ""),
|
|
||||||
"document_type": "YouTube Video Document",
|
|
||||||
"has_transcript": "No captions available" not in transcript_text,
|
|
||||||
}
|
|
||||||
summary_content, summary_embedding = await generate_document_summary(
|
|
||||||
combined_document_string, user_llm, document_metadata_for_summary
|
|
||||||
)
|
|
||||||
|
|
||||||
# Process chunks
|
# Process chunks
|
||||||
await task_logger.log_task_progress(
|
await task_logger.log_task_progress(
|
||||||
|
|
|
||||||
|
|
@ -9,7 +9,6 @@ from litellm import get_model_info, token_counter
|
||||||
|
|
||||||
from app.config import config
|
from app.config import config
|
||||||
from app.db import Chunk, DocumentType
|
from app.db import Chunk, DocumentType
|
||||||
from app.prompts import SUMMARY_PROMPT_TEMPLATE
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
@ -176,57 +175,6 @@ def optimize_content_for_context_window(
|
||||||
return optimized_content
|
return optimized_content
|
||||||
|
|
||||||
|
|
||||||
async def generate_document_summary(
|
|
||||||
content: str,
|
|
||||||
user_llm,
|
|
||||||
document_metadata: dict | None = None,
|
|
||||||
) -> tuple[str, list[float]]:
|
|
||||||
"""
|
|
||||||
Generate summary and embedding for document content with metadata.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
content: Document content
|
|
||||||
user_llm: User's LLM instance
|
|
||||||
document_metadata: Optional metadata dictionary to include in summary
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Tuple of (enhanced_summary_content, summary_embedding)
|
|
||||||
"""
|
|
||||||
# Get model name from user_llm for token counting
|
|
||||||
model_name = getattr(user_llm, "model", "gpt-3.5-turbo") # Fallback to default
|
|
||||||
|
|
||||||
# Optimize content to fit within context window
|
|
||||||
optimized_content = optimize_content_for_context_window(
|
|
||||||
content, document_metadata, model_name
|
|
||||||
)
|
|
||||||
|
|
||||||
summary_chain = SUMMARY_PROMPT_TEMPLATE | user_llm
|
|
||||||
content_with_metadata = f"<DOCUMENT><DOCUMENT_METADATA>\n\n{document_metadata}\n\n</DOCUMENT_METADATA>\n\n<DOCUMENT_CONTENT>\n\n{optimized_content}\n\n</DOCUMENT_CONTENT></DOCUMENT>"
|
|
||||||
summary_result = await summary_chain.ainvoke({"document": content_with_metadata})
|
|
||||||
summary_content = summary_result.content
|
|
||||||
|
|
||||||
# Combine summary with metadata if provided
|
|
||||||
if document_metadata:
|
|
||||||
metadata_parts = []
|
|
||||||
metadata_parts.append("# DOCUMENT METADATA")
|
|
||||||
|
|
||||||
for key, value in document_metadata.items():
|
|
||||||
if value: # Only include non-empty values
|
|
||||||
formatted_key = key.replace("_", " ").title()
|
|
||||||
metadata_parts.append(f"**{formatted_key}:** {value}")
|
|
||||||
|
|
||||||
metadata_section = "\n".join(metadata_parts)
|
|
||||||
enhanced_summary_content = (
|
|
||||||
f"{metadata_section}\n\n# DOCUMENT SUMMARY\n\n{summary_content}"
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
enhanced_summary_content = summary_content
|
|
||||||
|
|
||||||
summary_embedding = await asyncio.to_thread(embed_text, enhanced_summary_content)
|
|
||||||
|
|
||||||
return enhanced_summary_content, summary_embedding
|
|
||||||
|
|
||||||
|
|
||||||
async def create_document_chunks(content: str) -> list[Chunk]:
|
async def create_document_chunks(content: str) -> list[Chunk]:
|
||||||
"""
|
"""
|
||||||
Create chunks from document content.
|
Create chunks from document content.
|
||||||
|
|
|
||||||
|
|
@ -7,13 +7,13 @@ The production indexing pipeline summarizes documents with:
|
||||||
summary_content = summary_result.content
|
summary_content = summary_result.content
|
||||||
|
|
||||||
The `llm` parameter is supplied per-document by
|
The `llm` parameter is supplied per-document by
|
||||||
`app.services.llm_service.get_user_long_context_llm`. We patch THAT
|
`app.services.llm_service.get_agent_llm`. We patch THAT
|
||||||
function to return a langchain-native FakeListChatModel so the rest of
|
function to return a langchain-native FakeListChatModel so the rest of
|
||||||
the chain works unchanged. No real LLM provider package is touched.
|
the chain works unchanged. No real LLM provider package is touched.
|
||||||
|
|
||||||
Run-backend / run-celery use unittest.mock.patch.start() to install
|
Run-backend / run-celery use unittest.mock.patch.start() to install
|
||||||
this at every binding site (the source module + every consumer that
|
this at every binding site (the source module + every consumer that
|
||||||
did `from app.services.llm_service import get_user_long_context_llm`
|
did `from app.services.llm_service import get_agent_llm`
|
||||||
at module load time).
|
at module load time).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
@ -42,7 +42,7 @@ def _make_fake_llm() -> FakeListChatModel:
|
||||||
return fake
|
return fake
|
||||||
|
|
||||||
|
|
||||||
async def fake_get_user_long_context_llm(*args: Any, **kwargs: Any) -> Any:
|
async def fake_get_agent_llm(*args: Any, **kwargs: Any) -> Any:
|
||||||
"""Drop-in replacement for app.services.llm_service.get_user_long_context_llm."""
|
"""Drop-in replacement for app.services.llm_service.get_agent_llm."""
|
||||||
logger.info("[fake-llm] returning FakeListChatModel for E2E indexing")
|
logger.info("[fake-llm] returning FakeListChatModel for E2E indexing")
|
||||||
return _make_fake_llm()
|
return _make_fake_llm()
|
||||||
|
|
|
||||||
|
|
@ -206,23 +206,23 @@ def _patch_llm_bindings() -> None:
|
||||||
fake_create_chat_litellm_from_agent_config,
|
fake_create_chat_litellm_from_agent_config,
|
||||||
fake_create_chat_litellm_from_config,
|
fake_create_chat_litellm_from_config,
|
||||||
)
|
)
|
||||||
from tests.e2e.fakes.llm import fake_get_user_long_context_llm
|
from tests.e2e.fakes.llm import fake_get_agent_llm
|
||||||
|
|
||||||
targets = [
|
targets = [
|
||||||
"app.services.llm_service.get_user_long_context_llm",
|
"app.services.llm_service.get_agent_llm",
|
||||||
"app.tasks.connector_indexers.confluence_indexer.get_user_long_context_llm",
|
"app.tasks.connector_indexers.confluence_indexer.get_agent_llm",
|
||||||
"app.tasks.connector_indexers.google_drive_indexer.get_user_long_context_llm",
|
"app.tasks.connector_indexers.google_drive_indexer.get_agent_llm",
|
||||||
"app.tasks.connector_indexers.google_gmail_indexer.get_user_long_context_llm",
|
"app.tasks.connector_indexers.google_gmail_indexer.get_agent_llm",
|
||||||
"app.tasks.connector_indexers.notion_indexer.get_user_long_context_llm",
|
"app.tasks.connector_indexers.notion_indexer.get_agent_llm",
|
||||||
"app.tasks.connector_indexers.onedrive_indexer.get_user_long_context_llm",
|
"app.tasks.connector_indexers.onedrive_indexer.get_agent_llm",
|
||||||
"app.tasks.connector_indexers.dropbox_indexer.get_user_long_context_llm",
|
"app.tasks.connector_indexers.dropbox_indexer.get_agent_llm",
|
||||||
"app.tasks.connector_indexers.local_folder_indexer.get_user_long_context_llm",
|
"app.tasks.connector_indexers.local_folder_indexer.get_agent_llm",
|
||||||
"app.tasks.document_processors._save.get_user_long_context_llm",
|
"app.tasks.document_processors._save.get_agent_llm",
|
||||||
"app.tasks.document_processors.markdown_processor.get_user_long_context_llm",
|
"app.tasks.document_processors.markdown_processor.get_agent_llm",
|
||||||
]
|
]
|
||||||
for target in targets:
|
for target in targets:
|
||||||
try:
|
try:
|
||||||
p = patch(target, fake_get_user_long_context_llm)
|
p = patch(target, fake_get_agent_llm)
|
||||||
p.start()
|
p.start()
|
||||||
_active_patches.append(p)
|
_active_patches.append(p)
|
||||||
logger.info("[fake-llm] patched %s", target)
|
logger.info("[fake-llm] patched %s", target)
|
||||||
|
|
|
||||||
|
|
@ -183,23 +183,23 @@ def _patch_llm_bindings() -> None:
|
||||||
fake_create_chat_litellm_from_agent_config,
|
fake_create_chat_litellm_from_agent_config,
|
||||||
fake_create_chat_litellm_from_config,
|
fake_create_chat_litellm_from_config,
|
||||||
)
|
)
|
||||||
from tests.e2e.fakes.llm import fake_get_user_long_context_llm
|
from tests.e2e.fakes.llm import fake_get_agent_llm
|
||||||
|
|
||||||
targets = [
|
targets = [
|
||||||
"app.services.llm_service.get_user_long_context_llm",
|
"app.services.llm_service.get_agent_llm",
|
||||||
"app.tasks.connector_indexers.confluence_indexer.get_user_long_context_llm",
|
"app.tasks.connector_indexers.confluence_indexer.get_agent_llm",
|
||||||
"app.tasks.connector_indexers.google_drive_indexer.get_user_long_context_llm",
|
"app.tasks.connector_indexers.google_drive_indexer.get_agent_llm",
|
||||||
"app.tasks.connector_indexers.google_gmail_indexer.get_user_long_context_llm",
|
"app.tasks.connector_indexers.google_gmail_indexer.get_agent_llm",
|
||||||
"app.tasks.connector_indexers.notion_indexer.get_user_long_context_llm",
|
"app.tasks.connector_indexers.notion_indexer.get_agent_llm",
|
||||||
"app.tasks.connector_indexers.onedrive_indexer.get_user_long_context_llm",
|
"app.tasks.connector_indexers.onedrive_indexer.get_agent_llm",
|
||||||
"app.tasks.connector_indexers.dropbox_indexer.get_user_long_context_llm",
|
"app.tasks.connector_indexers.dropbox_indexer.get_agent_llm",
|
||||||
"app.tasks.connector_indexers.local_folder_indexer.get_user_long_context_llm",
|
"app.tasks.connector_indexers.local_folder_indexer.get_agent_llm",
|
||||||
"app.tasks.document_processors._save.get_user_long_context_llm",
|
"app.tasks.document_processors._save.get_agent_llm",
|
||||||
"app.tasks.document_processors.markdown_processor.get_user_long_context_llm",
|
"app.tasks.document_processors.markdown_processor.get_agent_llm",
|
||||||
]
|
]
|
||||||
for target in targets:
|
for target in targets:
|
||||||
try:
|
try:
|
||||||
p = patch(target, fake_get_user_long_context_llm)
|
p = patch(target, fake_get_agent_llm)
|
||||||
p.start()
|
p.start()
|
||||||
_active_patches.append(p)
|
_active_patches.append(p)
|
||||||
logger.info("[fake-llm] patched %s in celery worker", target)
|
logger.info("[fake-llm] patched %s in celery worker", target)
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,279 @@
|
||||||
|
"""Integration tests for new-chat thread visibility invariants.
|
||||||
|
|
||||||
|
These tests exercise the route handlers directly with real DB-backed
|
||||||
|
users, memberships, and permissions. The important contract is that a
|
||||||
|
thread shared with a search space stays shared across normal metadata
|
||||||
|
updates until the creator explicitly makes it private again.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import uuid
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
import pytest_asyncio
|
||||||
|
from fastapi import HTTPException
|
||||||
|
from sqlalchemy import select
|
||||||
|
from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
|
|
||||||
|
from app.db import (
|
||||||
|
ChatVisibility,
|
||||||
|
SearchSpace,
|
||||||
|
SearchSpaceMembership,
|
||||||
|
SearchSpaceRole,
|
||||||
|
User,
|
||||||
|
)
|
||||||
|
from app.routes import new_chat_routes
|
||||||
|
from app.schemas.new_chat import (
|
||||||
|
NewChatThreadCreate,
|
||||||
|
NewChatThreadUpdate,
|
||||||
|
NewChatThreadVisibilityUpdate,
|
||||||
|
)
|
||||||
|
|
||||||
|
pytestmark = pytest.mark.integration
|
||||||
|
|
||||||
|
|
||||||
|
@pytest_asyncio.fixture
|
||||||
|
async def db_member(db_session: AsyncSession, db_search_space: SearchSpace) -> User:
|
||||||
|
member = User(
|
||||||
|
id=uuid.uuid4(),
|
||||||
|
email="member@surfsense.net",
|
||||||
|
hashed_password="hashed",
|
||||||
|
is_active=True,
|
||||||
|
is_superuser=False,
|
||||||
|
is_verified=True,
|
||||||
|
)
|
||||||
|
db_session.add(member)
|
||||||
|
await db_session.flush()
|
||||||
|
|
||||||
|
role = (
|
||||||
|
(
|
||||||
|
await db_session.execute(
|
||||||
|
select(SearchSpaceRole).where(
|
||||||
|
SearchSpaceRole.search_space_id == db_search_space.id,
|
||||||
|
SearchSpaceRole.name == "Editor",
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
.scalars()
|
||||||
|
.one()
|
||||||
|
)
|
||||||
|
db_session.add(
|
||||||
|
SearchSpaceMembership(
|
||||||
|
user_id=member.id,
|
||||||
|
search_space_id=db_search_space.id,
|
||||||
|
role_id=role.id,
|
||||||
|
is_owner=False,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
await db_session.flush()
|
||||||
|
return member
|
||||||
|
|
||||||
|
|
||||||
|
async def _create_thread(
|
||||||
|
db_session: AsyncSession,
|
||||||
|
db_user: User,
|
||||||
|
db_search_space: SearchSpace,
|
||||||
|
*,
|
||||||
|
title: str = "Visibility Invariant Chat",
|
||||||
|
):
|
||||||
|
return await new_chat_routes.create_thread(
|
||||||
|
NewChatThreadCreate(
|
||||||
|
title=title,
|
||||||
|
archived=False,
|
||||||
|
search_space_id=db_search_space.id,
|
||||||
|
visibility=ChatVisibility.PRIVATE,
|
||||||
|
),
|
||||||
|
session=db_session,
|
||||||
|
user=db_user,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _active_thread_ids(response) -> set[int]:
|
||||||
|
return {thread.id for thread in response.threads}
|
||||||
|
|
||||||
|
|
||||||
|
def _search_thread_ids(response) -> set[int]:
|
||||||
|
return {thread.id for thread in response}
|
||||||
|
|
||||||
|
|
||||||
|
async def test_private_thread_is_hidden_from_other_search_space_member(
|
||||||
|
db_session: AsyncSession,
|
||||||
|
db_user: User,
|
||||||
|
db_member: User,
|
||||||
|
db_search_space: SearchSpace,
|
||||||
|
):
|
||||||
|
thread = await _create_thread(db_session, db_user, db_search_space)
|
||||||
|
|
||||||
|
member_threads = await new_chat_routes.list_threads(
|
||||||
|
search_space_id=db_search_space.id,
|
||||||
|
session=db_session,
|
||||||
|
user=db_member,
|
||||||
|
)
|
||||||
|
member_search = await new_chat_routes.search_threads(
|
||||||
|
search_space_id=db_search_space.id,
|
||||||
|
title="Visibility",
|
||||||
|
session=db_session,
|
||||||
|
user=db_member,
|
||||||
|
)
|
||||||
|
|
||||||
|
assert thread.id not in _active_thread_ids(member_threads)
|
||||||
|
assert thread.id not in _search_thread_ids(member_search)
|
||||||
|
with pytest.raises(HTTPException) as exc_info:
|
||||||
|
await new_chat_routes.get_thread_full(
|
||||||
|
thread_id=thread.id,
|
||||||
|
session=db_session,
|
||||||
|
user=db_member,
|
||||||
|
)
|
||||||
|
assert exc_info.value.status_code == 403
|
||||||
|
|
||||||
|
|
||||||
|
async def test_creator_can_share_thread_and_member_can_list_search_read_it(
|
||||||
|
db_session: AsyncSession,
|
||||||
|
db_user: User,
|
||||||
|
db_member: User,
|
||||||
|
db_search_space: SearchSpace,
|
||||||
|
):
|
||||||
|
thread = await _create_thread(db_session, db_user, db_search_space)
|
||||||
|
|
||||||
|
updated = await new_chat_routes.update_thread_visibility(
|
||||||
|
thread_id=thread.id,
|
||||||
|
visibility_update=NewChatThreadVisibilityUpdate(
|
||||||
|
visibility=ChatVisibility.SEARCH_SPACE,
|
||||||
|
),
|
||||||
|
session=db_session,
|
||||||
|
user=db_user,
|
||||||
|
)
|
||||||
|
|
||||||
|
member_threads = await new_chat_routes.list_threads(
|
||||||
|
search_space_id=db_search_space.id,
|
||||||
|
session=db_session,
|
||||||
|
user=db_member,
|
||||||
|
)
|
||||||
|
member_search = await new_chat_routes.search_threads(
|
||||||
|
search_space_id=db_search_space.id,
|
||||||
|
title="Visibility",
|
||||||
|
session=db_session,
|
||||||
|
user=db_member,
|
||||||
|
)
|
||||||
|
full_thread = await new_chat_routes.get_thread_full(
|
||||||
|
thread_id=thread.id,
|
||||||
|
session=db_session,
|
||||||
|
user=db_member,
|
||||||
|
)
|
||||||
|
|
||||||
|
assert updated.visibility == ChatVisibility.SEARCH_SPACE
|
||||||
|
assert thread.id in _active_thread_ids(member_threads)
|
||||||
|
assert thread.id in _search_thread_ids(member_search)
|
||||||
|
assert full_thread["id"] == thread.id
|
||||||
|
assert full_thread["visibility"] == ChatVisibility.SEARCH_SPACE
|
||||||
|
|
||||||
|
|
||||||
|
async def test_rename_and_archive_do_not_reset_shared_visibility(
|
||||||
|
db_session: AsyncSession,
|
||||||
|
db_user: User,
|
||||||
|
db_search_space: SearchSpace,
|
||||||
|
):
|
||||||
|
thread = await _create_thread(db_session, db_user, db_search_space)
|
||||||
|
await new_chat_routes.update_thread_visibility(
|
||||||
|
thread_id=thread.id,
|
||||||
|
visibility_update=NewChatThreadVisibilityUpdate(
|
||||||
|
visibility=ChatVisibility.SEARCH_SPACE,
|
||||||
|
),
|
||||||
|
session=db_session,
|
||||||
|
user=db_user,
|
||||||
|
)
|
||||||
|
|
||||||
|
renamed = await new_chat_routes.update_thread(
|
||||||
|
thread_id=thread.id,
|
||||||
|
thread_update=NewChatThreadUpdate(title="Renamed Shared Chat"),
|
||||||
|
session=db_session,
|
||||||
|
user=db_user,
|
||||||
|
)
|
||||||
|
archived = await new_chat_routes.update_thread(
|
||||||
|
thread_id=thread.id,
|
||||||
|
thread_update=NewChatThreadUpdate(archived=True),
|
||||||
|
session=db_session,
|
||||||
|
user=db_user,
|
||||||
|
)
|
||||||
|
|
||||||
|
assert renamed.visibility == ChatVisibility.SEARCH_SPACE
|
||||||
|
assert archived.visibility == ChatVisibility.SEARCH_SPACE
|
||||||
|
assert archived.archived is True
|
||||||
|
|
||||||
|
|
||||||
|
async def test_non_creator_cannot_change_shared_thread_back_to_private(
|
||||||
|
db_session: AsyncSession,
|
||||||
|
db_user: User,
|
||||||
|
db_member: User,
|
||||||
|
db_search_space: SearchSpace,
|
||||||
|
):
|
||||||
|
thread = await _create_thread(db_session, db_user, db_search_space)
|
||||||
|
await new_chat_routes.update_thread_visibility(
|
||||||
|
thread_id=thread.id,
|
||||||
|
visibility_update=NewChatThreadVisibilityUpdate(
|
||||||
|
visibility=ChatVisibility.SEARCH_SPACE,
|
||||||
|
),
|
||||||
|
session=db_session,
|
||||||
|
user=db_user,
|
||||||
|
)
|
||||||
|
|
||||||
|
with pytest.raises(HTTPException) as exc_info:
|
||||||
|
await new_chat_routes.update_thread_visibility(
|
||||||
|
thread_id=thread.id,
|
||||||
|
visibility_update=NewChatThreadVisibilityUpdate(
|
||||||
|
visibility=ChatVisibility.PRIVATE,
|
||||||
|
),
|
||||||
|
session=db_session,
|
||||||
|
user=db_member,
|
||||||
|
)
|
||||||
|
|
||||||
|
assert exc_info.value.status_code == 403
|
||||||
|
|
||||||
|
|
||||||
|
async def test_creator_can_make_shared_thread_private_again(
|
||||||
|
db_session: AsyncSession,
|
||||||
|
db_user: User,
|
||||||
|
db_member: User,
|
||||||
|
db_search_space: SearchSpace,
|
||||||
|
):
|
||||||
|
thread = await _create_thread(db_session, db_user, db_search_space)
|
||||||
|
await new_chat_routes.update_thread_visibility(
|
||||||
|
thread_id=thread.id,
|
||||||
|
visibility_update=NewChatThreadVisibilityUpdate(
|
||||||
|
visibility=ChatVisibility.SEARCH_SPACE,
|
||||||
|
),
|
||||||
|
session=db_session,
|
||||||
|
user=db_user,
|
||||||
|
)
|
||||||
|
|
||||||
|
private_again = await new_chat_routes.update_thread_visibility(
|
||||||
|
thread_id=thread.id,
|
||||||
|
visibility_update=NewChatThreadVisibilityUpdate(
|
||||||
|
visibility=ChatVisibility.PRIVATE,
|
||||||
|
),
|
||||||
|
session=db_session,
|
||||||
|
user=db_user,
|
||||||
|
)
|
||||||
|
member_threads = await new_chat_routes.list_threads(
|
||||||
|
search_space_id=db_search_space.id,
|
||||||
|
session=db_session,
|
||||||
|
user=db_member,
|
||||||
|
)
|
||||||
|
member_search = await new_chat_routes.search_threads(
|
||||||
|
search_space_id=db_search_space.id,
|
||||||
|
title="Visibility",
|
||||||
|
session=db_session,
|
||||||
|
user=db_member,
|
||||||
|
)
|
||||||
|
|
||||||
|
assert private_again.visibility == ChatVisibility.PRIVATE
|
||||||
|
assert thread.id not in _active_thread_ids(member_threads)
|
||||||
|
assert thread.id not in _search_thread_ids(member_search)
|
||||||
|
with pytest.raises(HTTPException) as exc_info:
|
||||||
|
await new_chat_routes.get_thread_full(
|
||||||
|
thread_id=thread.id,
|
||||||
|
session=db_session,
|
||||||
|
user=db_member,
|
||||||
|
)
|
||||||
|
assert exc_info.value.status_code == 403
|
||||||
|
|
@ -1,7 +1,7 @@
|
||||||
import importlib
|
import importlib
|
||||||
import sys
|
import sys
|
||||||
import uuid
|
import uuid
|
||||||
from unittest.mock import AsyncMock, MagicMock
|
from unittest.mock import MagicMock
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
import pytest_asyncio
|
import pytest_asyncio
|
||||||
|
|
@ -123,26 +123,6 @@ async def db_search_space(db_session: AsyncSession, db_user: User) -> SearchSpac
|
||||||
return space
|
return space
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def patched_summarize(monkeypatch) -> AsyncMock:
|
|
||||||
mock = AsyncMock(return_value="Mocked summary.")
|
|
||||||
monkeypatch.setattr(
|
|
||||||
"app.indexing_pipeline.indexing_pipeline_service.summarize_document",
|
|
||||||
mock,
|
|
||||||
)
|
|
||||||
return mock
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
|
||||||
def patched_summarize_raises(monkeypatch) -> AsyncMock:
|
|
||||||
mock = AsyncMock(side_effect=RuntimeError("LLM unavailable"))
|
|
||||||
monkeypatch.setattr(
|
|
||||||
"app.indexing_pipeline.indexing_pipeline_service.summarize_document",
|
|
||||||
mock,
|
|
||||||
)
|
|
||||||
return mock
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def patched_embed_texts(monkeypatch) -> MagicMock:
|
def patched_embed_texts(monkeypatch) -> MagicMock:
|
||||||
mock = MagicMock(side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts])
|
mock = MagicMock(side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts])
|
||||||
|
|
@ -153,6 +133,16 @@ def patched_embed_texts(monkeypatch) -> MagicMock:
|
||||||
return mock
|
return mock
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def patched_embed_texts_raises(monkeypatch) -> MagicMock:
|
||||||
|
mock = MagicMock(side_effect=RuntimeError("Embedding unavailable"))
|
||||||
|
monkeypatch.setattr(
|
||||||
|
"app.indexing_pipeline.indexing_pipeline_service.embed_texts",
|
||||||
|
mock,
|
||||||
|
)
|
||||||
|
return mock
|
||||||
|
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def patched_chunk_text(monkeypatch) -> MagicMock:
|
def patched_chunk_text(monkeypatch) -> MagicMock:
|
||||||
mock = MagicMock(return_value=["Test chunk content."])
|
mock = MagicMock(return_value=["Test chunk content."])
|
||||||
|
|
|
||||||
|
|
@ -68,7 +68,6 @@ class InlineTaskDispatcher:
|
||||||
filename: str,
|
filename: str,
|
||||||
search_space_id: int,
|
search_space_id: int,
|
||||||
user_id: str,
|
user_id: str,
|
||||||
should_summarize: bool = False,
|
|
||||||
use_vision_llm: bool = False,
|
use_vision_llm: bool = False,
|
||||||
processing_mode: str = "basic",
|
processing_mode: str = "basic",
|
||||||
) -> None:
|
) -> None:
|
||||||
|
|
@ -83,7 +82,6 @@ class InlineTaskDispatcher:
|
||||||
filename,
|
filename,
|
||||||
search_space_id,
|
search_space_id,
|
||||||
user_id,
|
user_id,
|
||||||
should_summarize=should_summarize,
|
|
||||||
use_vision_llm=use_vision_llm,
|
use_vision_llm=use_vision_llm,
|
||||||
processing_mode=processing_mode,
|
processing_mode=processing_mode,
|
||||||
)
|
)
|
||||||
|
|
@ -266,10 +264,6 @@ async def page_limits():
|
||||||
@pytest.fixture(autouse=True)
|
@pytest.fixture(autouse=True)
|
||||||
def _mock_external_apis(monkeypatch):
|
def _mock_external_apis(monkeypatch):
|
||||||
"""Mock LLM, embedding, and chunking — these are external API boundaries."""
|
"""Mock LLM, embedding, and chunking — these are external API boundaries."""
|
||||||
monkeypatch.setattr(
|
|
||||||
"app.indexing_pipeline.indexing_pipeline_service.summarize_document",
|
|
||||||
AsyncMock(return_value="Mocked summary."),
|
|
||||||
)
|
|
||||||
monkeypatch.setattr(
|
monkeypatch.setattr(
|
||||||
"app.indexing_pipeline.indexing_pipeline_service.embed_texts",
|
"app.indexing_pipeline.indexing_pipeline_service.embed_texts",
|
||||||
MagicMock(side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts]),
|
MagicMock(side_effect=lambda texts: [[0.1] * _EMBEDDING_DIM for _ in texts]),
|
||||||
|
|
|
||||||
|
|
@ -8,7 +8,7 @@ pytestmark = pytest.mark.integration
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.usefixtures(
|
@pytest.mark.usefixtures(
|
||||||
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
|
"patched_embed_texts", "patched_chunk_text"
|
||||||
)
|
)
|
||||||
async def test_sets_status_ready(db_session, db_search_space, db_user, mocker):
|
async def test_sets_status_ready(db_session, db_search_space, db_user, mocker):
|
||||||
"""Document status is READY after successful indexing."""
|
"""Document status is READY after successful indexing."""
|
||||||
|
|
@ -19,7 +19,6 @@ async def test_sets_status_ready(db_session, db_search_space, db_user, mocker):
|
||||||
etl_service="UNSTRUCTURED",
|
etl_service="UNSTRUCTURED",
|
||||||
search_space_id=db_search_space.id,
|
search_space_id=db_search_space.id,
|
||||||
user_id=str(db_user.id),
|
user_id=str(db_user.id),
|
||||||
llm=mocker.Mock(),
|
|
||||||
)
|
)
|
||||||
|
|
||||||
result = await db_session.execute(
|
result = await db_session.execute(
|
||||||
|
|
@ -31,10 +30,10 @@ async def test_sets_status_ready(db_session, db_search_space, db_user, mocker):
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.usefixtures(
|
@pytest.mark.usefixtures(
|
||||||
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
|
"patched_embed_texts", "patched_chunk_text"
|
||||||
)
|
)
|
||||||
async def test_content_is_summary(db_session, db_search_space, db_user, mocker):
|
async def test_content_is_source_markdown(db_session, db_search_space, db_user, mocker):
|
||||||
"""Document content is set to the LLM-generated summary."""
|
"""Document content is set to the extracted source markdown."""
|
||||||
adapter = UploadDocumentAdapter(db_session)
|
adapter = UploadDocumentAdapter(db_session)
|
||||||
await adapter.index(
|
await adapter.index(
|
||||||
markdown_content="## Hello\n\nSome content.",
|
markdown_content="## Hello\n\nSome content.",
|
||||||
|
|
@ -42,8 +41,6 @@ async def test_content_is_summary(db_session, db_search_space, db_user, mocker):
|
||||||
etl_service="UNSTRUCTURED",
|
etl_service="UNSTRUCTURED",
|
||||||
search_space_id=db_search_space.id,
|
search_space_id=db_search_space.id,
|
||||||
user_id=str(db_user.id),
|
user_id=str(db_user.id),
|
||||||
llm=mocker.Mock(),
|
|
||||||
should_summarize=True,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
result = await db_session.execute(
|
result = await db_session.execute(
|
||||||
|
|
@ -51,11 +48,11 @@ async def test_content_is_summary(db_session, db_search_space, db_user, mocker):
|
||||||
)
|
)
|
||||||
document = result.scalars().first()
|
document = result.scalars().first()
|
||||||
|
|
||||||
assert document.content == "Mocked summary."
|
assert document.content == "## Hello\n\nSome content."
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.usefixtures(
|
@pytest.mark.usefixtures(
|
||||||
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
|
"patched_embed_texts", "patched_chunk_text"
|
||||||
)
|
)
|
||||||
async def test_chunks_written_to_db(db_session, db_search_space, db_user, mocker):
|
async def test_chunks_written_to_db(db_session, db_search_space, db_user, mocker):
|
||||||
"""Chunks derived from the source markdown are persisted in the DB."""
|
"""Chunks derived from the source markdown are persisted in the DB."""
|
||||||
|
|
@ -66,7 +63,6 @@ async def test_chunks_written_to_db(db_session, db_search_space, db_user, mocker
|
||||||
etl_service="UNSTRUCTURED",
|
etl_service="UNSTRUCTURED",
|
||||||
search_space_id=db_search_space.id,
|
search_space_id=db_search_space.id,
|
||||||
user_id=str(db_user.id),
|
user_id=str(db_user.id),
|
||||||
llm=mocker.Mock(),
|
|
||||||
)
|
)
|
||||||
|
|
||||||
result = await db_session.execute(
|
result = await db_session.execute(
|
||||||
|
|
@ -83,9 +79,7 @@ async def test_chunks_written_to_db(db_session, db_search_space, db_user, mocker
|
||||||
assert chunks[0].content == "Test chunk content."
|
assert chunks[0].content == "Test chunk content."
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.usefixtures(
|
@pytest.mark.usefixtures("patched_embed_texts_raises", "patched_chunk_text")
|
||||||
"patched_summarize_raises", "patched_embed_texts", "patched_chunk_text"
|
|
||||||
)
|
|
||||||
async def test_raises_on_indexing_failure(db_session, db_search_space, db_user, mocker):
|
async def test_raises_on_indexing_failure(db_session, db_search_space, db_user, mocker):
|
||||||
"""RuntimeError is raised when the indexing step fails so the caller can fire a failure notification."""
|
"""RuntimeError is raised when the indexing step fails so the caller can fire a failure notification."""
|
||||||
adapter = UploadDocumentAdapter(db_session)
|
adapter = UploadDocumentAdapter(db_session)
|
||||||
|
|
@ -96,8 +90,6 @@ async def test_raises_on_indexing_failure(db_session, db_search_space, db_user,
|
||||||
etl_service="UNSTRUCTURED",
|
etl_service="UNSTRUCTURED",
|
||||||
search_space_id=db_search_space.id,
|
search_space_id=db_search_space.id,
|
||||||
user_id=str(db_user.id),
|
user_id=str(db_user.id),
|
||||||
llm=mocker.Mock(),
|
|
||||||
should_summarize=True,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -107,10 +99,10 @@ async def test_raises_on_indexing_failure(db_session, db_search_space, db_user,
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.usefixtures(
|
@pytest.mark.usefixtures(
|
||||||
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
|
"patched_embed_texts", "patched_chunk_text"
|
||||||
)
|
)
|
||||||
async def test_reindex_updates_content(db_session, db_search_space, db_user, mocker):
|
async def test_reindex_updates_content(db_session, db_search_space, db_user, mocker):
|
||||||
"""Document content is updated to the new summary after reindexing."""
|
"""Document content is updated to the new source markdown after reindexing."""
|
||||||
adapter = UploadDocumentAdapter(db_session)
|
adapter = UploadDocumentAdapter(db_session)
|
||||||
await adapter.index(
|
await adapter.index(
|
||||||
markdown_content="## Original\n\nOriginal content.",
|
markdown_content="## Original\n\nOriginal content.",
|
||||||
|
|
@ -118,7 +110,6 @@ async def test_reindex_updates_content(db_session, db_search_space, db_user, moc
|
||||||
etl_service="UNSTRUCTURED",
|
etl_service="UNSTRUCTURED",
|
||||||
search_space_id=db_search_space.id,
|
search_space_id=db_search_space.id,
|
||||||
user_id=str(db_user.id),
|
user_id=str(db_user.id),
|
||||||
llm=mocker.Mock(),
|
|
||||||
)
|
)
|
||||||
|
|
||||||
result = await db_session.execute(
|
result = await db_session.execute(
|
||||||
|
|
@ -129,14 +120,14 @@ async def test_reindex_updates_content(db_session, db_search_space, db_user, moc
|
||||||
document.source_markdown = "## Edited\n\nNew content after user edit."
|
document.source_markdown = "## Edited\n\nNew content after user edit."
|
||||||
await db_session.flush()
|
await db_session.flush()
|
||||||
|
|
||||||
await adapter.reindex(document=document, llm=mocker.Mock())
|
await adapter.reindex(document=document)
|
||||||
|
|
||||||
await db_session.refresh(document)
|
await db_session.refresh(document)
|
||||||
assert document.content == "Mocked summary."
|
assert document.content == "## Edited\n\nNew content after user edit."
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.usefixtures(
|
@pytest.mark.usefixtures(
|
||||||
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
|
"patched_embed_texts", "patched_chunk_text"
|
||||||
)
|
)
|
||||||
async def test_reindex_updates_content_hash(
|
async def test_reindex_updates_content_hash(
|
||||||
db_session, db_search_space, db_user, mocker
|
db_session, db_search_space, db_user, mocker
|
||||||
|
|
@ -149,7 +140,6 @@ async def test_reindex_updates_content_hash(
|
||||||
etl_service="UNSTRUCTURED",
|
etl_service="UNSTRUCTURED",
|
||||||
search_space_id=db_search_space.id,
|
search_space_id=db_search_space.id,
|
||||||
user_id=str(db_user.id),
|
user_id=str(db_user.id),
|
||||||
llm=mocker.Mock(),
|
|
||||||
)
|
)
|
||||||
|
|
||||||
result = await db_session.execute(
|
result = await db_session.execute(
|
||||||
|
|
@ -161,14 +151,14 @@ async def test_reindex_updates_content_hash(
|
||||||
document.source_markdown = "## Edited\n\nNew content after user edit."
|
document.source_markdown = "## Edited\n\nNew content after user edit."
|
||||||
await db_session.flush()
|
await db_session.flush()
|
||||||
|
|
||||||
await adapter.reindex(document=document, llm=mocker.Mock())
|
await adapter.reindex(document=document)
|
||||||
|
|
||||||
await db_session.refresh(document)
|
await db_session.refresh(document)
|
||||||
assert document.content_hash != original_hash
|
assert document.content_hash != original_hash
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.usefixtures(
|
@pytest.mark.usefixtures(
|
||||||
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
|
"patched_embed_texts", "patched_chunk_text"
|
||||||
)
|
)
|
||||||
async def test_reindex_sets_status_ready(db_session, db_search_space, db_user, mocker):
|
async def test_reindex_sets_status_ready(db_session, db_search_space, db_user, mocker):
|
||||||
"""Document status is READY after successful reindexing."""
|
"""Document status is READY after successful reindexing."""
|
||||||
|
|
@ -179,7 +169,6 @@ async def test_reindex_sets_status_ready(db_session, db_search_space, db_user, m
|
||||||
etl_service="UNSTRUCTURED",
|
etl_service="UNSTRUCTURED",
|
||||||
search_space_id=db_search_space.id,
|
search_space_id=db_search_space.id,
|
||||||
user_id=str(db_user.id),
|
user_id=str(db_user.id),
|
||||||
llm=mocker.Mock(),
|
|
||||||
)
|
)
|
||||||
|
|
||||||
result = await db_session.execute(
|
result = await db_session.execute(
|
||||||
|
|
@ -190,13 +179,13 @@ async def test_reindex_sets_status_ready(db_session, db_search_space, db_user, m
|
||||||
document.source_markdown = "## Edited\n\nNew content after user edit."
|
document.source_markdown = "## Edited\n\nNew content after user edit."
|
||||||
await db_session.flush()
|
await db_session.flush()
|
||||||
|
|
||||||
await adapter.reindex(document=document, llm=mocker.Mock())
|
await adapter.reindex(document=document)
|
||||||
|
|
||||||
await db_session.refresh(document)
|
await db_session.refresh(document)
|
||||||
assert DocumentStatus.is_state(document.status, DocumentStatus.READY)
|
assert DocumentStatus.is_state(document.status, DocumentStatus.READY)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.usefixtures("patched_summarize", "patched_embed_texts")
|
@pytest.mark.usefixtures("patched_embed_texts")
|
||||||
async def test_reindex_replaces_chunks(db_session, db_search_space, db_user, mocker):
|
async def test_reindex_replaces_chunks(db_session, db_search_space, db_user, mocker):
|
||||||
"""Reindexing replaces old chunks with new content rather than appending."""
|
"""Reindexing replaces old chunks with new content rather than appending."""
|
||||||
mocker.patch(
|
mocker.patch(
|
||||||
|
|
@ -211,7 +200,6 @@ async def test_reindex_replaces_chunks(db_session, db_search_space, db_user, moc
|
||||||
etl_service="UNSTRUCTURED",
|
etl_service="UNSTRUCTURED",
|
||||||
search_space_id=db_search_space.id,
|
search_space_id=db_search_space.id,
|
||||||
user_id=str(db_user.id),
|
user_id=str(db_user.id),
|
||||||
llm=mocker.Mock(),
|
|
||||||
)
|
)
|
||||||
|
|
||||||
result = await db_session.execute(
|
result = await db_session.execute(
|
||||||
|
|
@ -223,7 +211,7 @@ async def test_reindex_replaces_chunks(db_session, db_search_space, db_user, moc
|
||||||
document.source_markdown = "## Edited\n\nNew content after user edit."
|
document.source_markdown = "## Edited\n\nNew content after user edit."
|
||||||
await db_session.flush()
|
await db_session.flush()
|
||||||
|
|
||||||
await adapter.reindex(document=document, llm=mocker.Mock())
|
await adapter.reindex(document=document)
|
||||||
|
|
||||||
chunks_result = await db_session.execute(
|
chunks_result = await db_session.execute(
|
||||||
select(Chunk).filter(Chunk.document_id == document_id)
|
select(Chunk).filter(Chunk.document_id == document_id)
|
||||||
|
|
@ -235,7 +223,7 @@ async def test_reindex_replaces_chunks(db_session, db_search_space, db_user, moc
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.usefixtures(
|
@pytest.mark.usefixtures(
|
||||||
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
|
"patched_embed_texts", "patched_chunk_text"
|
||||||
)
|
)
|
||||||
async def test_reindex_clears_reindexing_flag(
|
async def test_reindex_clears_reindexing_flag(
|
||||||
db_session, db_search_space, db_user, mocker
|
db_session, db_search_space, db_user, mocker
|
||||||
|
|
@ -248,7 +236,6 @@ async def test_reindex_clears_reindexing_flag(
|
||||||
etl_service="UNSTRUCTURED",
|
etl_service="UNSTRUCTURED",
|
||||||
search_space_id=db_search_space.id,
|
search_space_id=db_search_space.id,
|
||||||
user_id=str(db_user.id),
|
user_id=str(db_user.id),
|
||||||
llm=mocker.Mock(),
|
|
||||||
)
|
)
|
||||||
|
|
||||||
result = await db_session.execute(
|
result = await db_session.execute(
|
||||||
|
|
@ -260,19 +247,17 @@ async def test_reindex_clears_reindexing_flag(
|
||||||
document.content_needs_reindexing = True
|
document.content_needs_reindexing = True
|
||||||
await db_session.flush()
|
await db_session.flush()
|
||||||
|
|
||||||
await adapter.reindex(document=document, llm=mocker.Mock())
|
await adapter.reindex(document=document)
|
||||||
|
|
||||||
await db_session.refresh(document)
|
await db_session.refresh(document)
|
||||||
assert document.content_needs_reindexing is False
|
assert document.content_needs_reindexing is False
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.usefixtures("patched_embed_texts", "patched_chunk_text")
|
@pytest.mark.usefixtures("patched_embed_texts", "patched_chunk_text")
|
||||||
async def test_reindex_raises_on_failure(db_session, db_search_space, db_user, mocker):
|
async def test_reindex_raises_on_failure(
|
||||||
|
db_session, db_search_space, db_user, patched_embed_texts, mocker
|
||||||
|
):
|
||||||
"""RuntimeError is raised when reindexing fails so the caller can handle it."""
|
"""RuntimeError is raised when reindexing fails so the caller can handle it."""
|
||||||
mocker.patch(
|
|
||||||
"app.indexing_pipeline.indexing_pipeline_service.summarize_document",
|
|
||||||
return_value="Mocked summary.",
|
|
||||||
)
|
|
||||||
|
|
||||||
adapter = UploadDocumentAdapter(db_session)
|
adapter = UploadDocumentAdapter(db_session)
|
||||||
await adapter.index(
|
await adapter.index(
|
||||||
|
|
@ -281,7 +266,6 @@ async def test_reindex_raises_on_failure(db_session, db_search_space, db_user, m
|
||||||
etl_service="UNSTRUCTURED",
|
etl_service="UNSTRUCTURED",
|
||||||
search_space_id=db_search_space.id,
|
search_space_id=db_search_space.id,
|
||||||
user_id=str(db_user.id),
|
user_id=str(db_user.id),
|
||||||
llm=mocker.Mock(),
|
|
||||||
)
|
)
|
||||||
|
|
||||||
result = await db_session.execute(
|
result = await db_session.execute(
|
||||||
|
|
@ -292,13 +276,10 @@ async def test_reindex_raises_on_failure(db_session, db_search_space, db_user, m
|
||||||
document.source_markdown = "## Edited\n\nNew content after user edit."
|
document.source_markdown = "## Edited\n\nNew content after user edit."
|
||||||
await db_session.flush()
|
await db_session.flush()
|
||||||
|
|
||||||
mocker.patch(
|
patched_embed_texts.side_effect = RuntimeError("Embedding unavailable")
|
||||||
"app.indexing_pipeline.indexing_pipeline_service.summarize_document",
|
|
||||||
side_effect=RuntimeError("LLM unavailable"),
|
|
||||||
)
|
|
||||||
|
|
||||||
with pytest.raises(RuntimeError, match=r"Embedding failed|Reindexing failed"):
|
with pytest.raises(RuntimeError, match=r"Embedding failed|Reindexing failed"):
|
||||||
await adapter.reindex(document=document, llm=mocker.Mock())
|
await adapter.reindex(document=document)
|
||||||
|
|
||||||
|
|
||||||
async def test_reindex_raises_on_empty_source_markdown(
|
async def test_reindex_raises_on_empty_source_markdown(
|
||||||
|
|
@ -323,4 +304,4 @@ async def test_reindex_raises_on_empty_source_markdown(
|
||||||
adapter = UploadDocumentAdapter(db_session)
|
adapter = UploadDocumentAdapter(db_session)
|
||||||
|
|
||||||
with pytest.raises(RuntimeError, match="no source_markdown"):
|
with pytest.raises(RuntimeError, match="no source_markdown"):
|
||||||
await adapter.reindex(document=document, llm=mocker.Mock())
|
await adapter.reindex(document=document)
|
||||||
|
|
|
||||||
|
|
@ -25,8 +25,6 @@ def _cal_doc(
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
connector_id=connector_id,
|
connector_id=connector_id,
|
||||||
created_by_id=user_id,
|
created_by_id=user_id,
|
||||||
should_summarize=True,
|
|
||||||
fallback_summary=f"Calendar: Event {unique_id}",
|
|
||||||
metadata={
|
metadata={
|
||||||
"event_id": unique_id,
|
"event_id": unique_id,
|
||||||
"start_time": "2025-01-15T10:00:00",
|
"start_time": "2025-01-15T10:00:00",
|
||||||
|
|
@ -37,7 +35,7 @@ def _cal_doc(
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.usefixtures(
|
@pytest.mark.usefixtures(
|
||||||
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
|
"patched_embed_texts", "patched_chunk_text"
|
||||||
)
|
)
|
||||||
async def test_calendar_pipeline_creates_ready_document(
|
async def test_calendar_pipeline_creates_ready_document(
|
||||||
db_session, db_search_space, db_connector, db_user, mocker
|
db_session, db_search_space, db_connector, db_user, mocker
|
||||||
|
|
@ -55,7 +53,7 @@ async def test_calendar_pipeline_creates_ready_document(
|
||||||
prepared = await service.prepare_for_indexing([doc])
|
prepared = await service.prepare_for_indexing([doc])
|
||||||
assert len(prepared) == 1
|
assert len(prepared) == 1
|
||||||
|
|
||||||
await service.index(prepared[0], doc, llm=mocker.Mock())
|
await service.index(prepared[0], doc)
|
||||||
|
|
||||||
result = await db_session.execute(
|
result = await db_session.execute(
|
||||||
select(Document).filter(Document.search_space_id == space_id)
|
select(Document).filter(Document.search_space_id == space_id)
|
||||||
|
|
@ -68,7 +66,7 @@ async def test_calendar_pipeline_creates_ready_document(
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.usefixtures(
|
@pytest.mark.usefixtures(
|
||||||
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
|
"patched_embed_texts", "patched_chunk_text"
|
||||||
)
|
)
|
||||||
async def test_calendar_legacy_doc_migrated(
|
async def test_calendar_legacy_doc_migrated(
|
||||||
db_session, db_search_space, db_connector, db_user, mocker
|
db_session, db_search_space, db_connector, db_user, mocker
|
||||||
|
|
|
||||||
|
|
@ -25,8 +25,6 @@ def _drive_doc(
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
connector_id=connector_id,
|
connector_id=connector_id,
|
||||||
created_by_id=user_id,
|
created_by_id=user_id,
|
||||||
should_summarize=True,
|
|
||||||
fallback_summary=f"File: {unique_id}.pdf",
|
|
||||||
metadata={
|
metadata={
|
||||||
"google_drive_file_id": unique_id,
|
"google_drive_file_id": unique_id,
|
||||||
"google_drive_file_name": f"{unique_id}.pdf",
|
"google_drive_file_name": f"{unique_id}.pdf",
|
||||||
|
|
@ -36,7 +34,7 @@ def _drive_doc(
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.usefixtures(
|
@pytest.mark.usefixtures(
|
||||||
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
|
"patched_embed_texts", "patched_chunk_text"
|
||||||
)
|
)
|
||||||
async def test_drive_pipeline_creates_ready_document(
|
async def test_drive_pipeline_creates_ready_document(
|
||||||
db_session, db_search_space, db_connector, db_user, mocker
|
db_session, db_search_space, db_connector, db_user, mocker
|
||||||
|
|
@ -54,7 +52,7 @@ async def test_drive_pipeline_creates_ready_document(
|
||||||
prepared = await service.prepare_for_indexing([doc])
|
prepared = await service.prepare_for_indexing([doc])
|
||||||
assert len(prepared) == 1
|
assert len(prepared) == 1
|
||||||
|
|
||||||
await service.index(prepared[0], doc, llm=mocker.Mock())
|
await service.index(prepared[0], doc)
|
||||||
|
|
||||||
result = await db_session.execute(
|
result = await db_session.execute(
|
||||||
select(Document).filter(Document.search_space_id == space_id)
|
select(Document).filter(Document.search_space_id == space_id)
|
||||||
|
|
@ -67,7 +65,7 @@ async def test_drive_pipeline_creates_ready_document(
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.usefixtures(
|
@pytest.mark.usefixtures(
|
||||||
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
|
"patched_embed_texts", "patched_chunk_text"
|
||||||
)
|
)
|
||||||
async def test_drive_legacy_doc_migrated(
|
async def test_drive_legacy_doc_migrated(
|
||||||
db_session, db_search_space, db_connector, db_user, mocker
|
db_session, db_search_space, db_connector, db_user, mocker
|
||||||
|
|
|
||||||
|
|
@ -24,8 +24,6 @@ def _dropbox_doc(
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
connector_id=connector_id,
|
connector_id=connector_id,
|
||||||
created_by_id=user_id,
|
created_by_id=user_id,
|
||||||
should_summarize=True,
|
|
||||||
fallback_summary=f"File: {unique_id}.docx",
|
|
||||||
metadata={
|
metadata={
|
||||||
"dropbox_file_id": unique_id,
|
"dropbox_file_id": unique_id,
|
||||||
"dropbox_file_name": f"{unique_id}.docx",
|
"dropbox_file_name": f"{unique_id}.docx",
|
||||||
|
|
@ -35,7 +33,7 @@ def _dropbox_doc(
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.usefixtures(
|
@pytest.mark.usefixtures(
|
||||||
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
|
"patched_embed_texts", "patched_chunk_text"
|
||||||
)
|
)
|
||||||
async def test_dropbox_pipeline_creates_ready_document(
|
async def test_dropbox_pipeline_creates_ready_document(
|
||||||
db_session, db_search_space, db_connector, db_user, mocker
|
db_session, db_search_space, db_connector, db_user, mocker
|
||||||
|
|
@ -53,7 +51,7 @@ async def test_dropbox_pipeline_creates_ready_document(
|
||||||
prepared = await service.prepare_for_indexing([doc])
|
prepared = await service.prepare_for_indexing([doc])
|
||||||
assert len(prepared) == 1
|
assert len(prepared) == 1
|
||||||
|
|
||||||
await service.index(prepared[0], doc, llm=mocker.Mock())
|
await service.index(prepared[0], doc)
|
||||||
|
|
||||||
result = await db_session.execute(
|
result = await db_session.execute(
|
||||||
select(Document).filter(Document.search_space_id == space_id)
|
select(Document).filter(Document.search_space_id == space_id)
|
||||||
|
|
@ -66,7 +64,7 @@ async def test_dropbox_pipeline_creates_ready_document(
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.usefixtures(
|
@pytest.mark.usefixtures(
|
||||||
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
|
"patched_embed_texts", "patched_chunk_text"
|
||||||
)
|
)
|
||||||
async def test_dropbox_duplicate_content_skipped(
|
async def test_dropbox_duplicate_content_skipped(
|
||||||
db_session, db_search_space, db_connector, db_user, mocker
|
db_session, db_search_space, db_connector, db_user, mocker
|
||||||
|
|
@ -86,7 +84,7 @@ async def test_dropbox_duplicate_content_skipped(
|
||||||
|
|
||||||
prepared = await service.prepare_for_indexing([doc])
|
prepared = await service.prepare_for_indexing([doc])
|
||||||
assert len(prepared) == 1
|
assert len(prepared) == 1
|
||||||
await service.index(prepared[0], doc, llm=mocker.Mock())
|
await service.index(prepared[0], doc)
|
||||||
|
|
||||||
result = await db_session.execute(
|
result = await db_session.execute(
|
||||||
select(Document).filter(Document.search_space_id == space_id)
|
select(Document).filter(Document.search_space_id == space_id)
|
||||||
|
|
|
||||||
|
|
@ -28,8 +28,6 @@ def _gmail_doc(
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
connector_id=connector_id,
|
connector_id=connector_id,
|
||||||
created_by_id=user_id,
|
created_by_id=user_id,
|
||||||
should_summarize=True,
|
|
||||||
fallback_summary=f"Gmail: Subject for {unique_id}",
|
|
||||||
metadata={
|
metadata={
|
||||||
"message_id": unique_id,
|
"message_id": unique_id,
|
||||||
"from": "sender@example.com",
|
"from": "sender@example.com",
|
||||||
|
|
@ -39,7 +37,7 @@ def _gmail_doc(
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.usefixtures(
|
@pytest.mark.usefixtures(
|
||||||
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
|
"patched_embed_texts", "patched_chunk_text"
|
||||||
)
|
)
|
||||||
async def test_gmail_pipeline_creates_ready_document(
|
async def test_gmail_pipeline_creates_ready_document(
|
||||||
db_session, db_search_space, db_connector, db_user, mocker
|
db_session, db_search_space, db_connector, db_user, mocker
|
||||||
|
|
@ -57,7 +55,7 @@ async def test_gmail_pipeline_creates_ready_document(
|
||||||
prepared = await service.prepare_for_indexing([doc])
|
prepared = await service.prepare_for_indexing([doc])
|
||||||
assert len(prepared) == 1
|
assert len(prepared) == 1
|
||||||
|
|
||||||
await service.index(prepared[0], doc, llm=mocker.Mock())
|
await service.index(prepared[0], doc)
|
||||||
|
|
||||||
result = await db_session.execute(
|
result = await db_session.execute(
|
||||||
select(Document).filter(Document.search_space_id == space_id)
|
select(Document).filter(Document.search_space_id == space_id)
|
||||||
|
|
@ -71,7 +69,7 @@ async def test_gmail_pipeline_creates_ready_document(
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.usefixtures(
|
@pytest.mark.usefixtures(
|
||||||
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
|
"patched_embed_texts", "patched_chunk_text"
|
||||||
)
|
)
|
||||||
async def test_gmail_legacy_doc_migrated_then_reused(
|
async def test_gmail_legacy_doc_migrated_then_reused(
|
||||||
db_session, db_search_space, db_connector, db_user, mocker
|
db_session, db_search_space, db_connector, db_user, mocker
|
||||||
|
|
|
||||||
|
|
@ -10,7 +10,7 @@ pytestmark = pytest.mark.integration
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.usefixtures(
|
@pytest.mark.usefixtures(
|
||||||
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
|
"patched_embed_texts", "patched_chunk_text"
|
||||||
)
|
)
|
||||||
async def test_index_batch_creates_ready_documents(
|
async def test_index_batch_creates_ready_documents(
|
||||||
db_session, db_search_space, make_connector_document, mocker
|
db_session, db_search_space, make_connector_document, mocker
|
||||||
|
|
@ -33,7 +33,7 @@ async def test_index_batch_creates_ready_documents(
|
||||||
]
|
]
|
||||||
|
|
||||||
service = IndexingPipelineService(session=db_session)
|
service = IndexingPipelineService(session=db_session)
|
||||||
results = await service.index_batch(docs, llm=mocker.Mock())
|
results = await service.index_batch(docs)
|
||||||
|
|
||||||
assert len(results) == 2
|
assert len(results) == 2
|
||||||
|
|
||||||
|
|
@ -50,10 +50,10 @@ async def test_index_batch_creates_ready_documents(
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.usefixtures(
|
@pytest.mark.usefixtures(
|
||||||
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
|
"patched_embed_texts", "patched_chunk_text"
|
||||||
)
|
)
|
||||||
async def test_index_batch_empty_returns_empty(db_session, mocker):
|
async def test_index_batch_empty_returns_empty(db_session, mocker):
|
||||||
"""index_batch with empty input returns an empty list."""
|
"""index_batch with empty input returns an empty list."""
|
||||||
service = IndexingPipelineService(session=db_session)
|
service = IndexingPipelineService(session=db_session)
|
||||||
results = await service.index_batch([], llm=mocker.Mock())
|
results = await service.index_batch([])
|
||||||
assert results == []
|
assert results == []
|
||||||
|
|
|
||||||
|
|
@ -10,9 +10,7 @@ _EMBEDDING_DIM = app_config.embedding_model_instance.dimension
|
||||||
pytestmark = pytest.mark.integration
|
pytestmark = pytest.mark.integration
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.usefixtures(
|
@pytest.mark.usefixtures("patched_embed_texts", "patched_chunk_text")
|
||||||
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
|
|
||||||
)
|
|
||||||
async def test_sets_status_ready(
|
async def test_sets_status_ready(
|
||||||
db_session,
|
db_session,
|
||||||
db_search_space,
|
db_search_space,
|
||||||
|
|
@ -27,7 +25,7 @@ async def test_sets_status_ready(
|
||||||
document = prepared[0]
|
document = prepared[0]
|
||||||
document_id = document.id
|
document_id = document.id
|
||||||
|
|
||||||
await service.index(document, connector_doc, llm=mocker.Mock())
|
await service.index(document, connector_doc)
|
||||||
|
|
||||||
result = await db_session.execute(
|
result = await db_session.execute(
|
||||||
select(Document).filter(Document.id == document_id)
|
select(Document).filter(Document.id == document_id)
|
||||||
|
|
@ -37,16 +35,14 @@ async def test_sets_status_ready(
|
||||||
assert DocumentStatus.is_state(reloaded.status, DocumentStatus.READY)
|
assert DocumentStatus.is_state(reloaded.status, DocumentStatus.READY)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.usefixtures(
|
@pytest.mark.usefixtures("patched_embed_texts", "patched_chunk_text")
|
||||||
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
|
async def test_content_is_source_markdown_by_default(
|
||||||
)
|
|
||||||
async def test_content_is_summary_when_should_summarize_true(
|
|
||||||
db_session,
|
db_session,
|
||||||
db_search_space,
|
db_search_space,
|
||||||
make_connector_document,
|
make_connector_document,
|
||||||
mocker,
|
mocker,
|
||||||
):
|
):
|
||||||
"""Document content is set to the LLM-generated summary when should_summarize=True."""
|
"""Document content is set to source_markdown by default."""
|
||||||
connector_doc = make_connector_document(search_space_id=db_search_space.id)
|
connector_doc = make_connector_document(search_space_id=db_search_space.id)
|
||||||
service = IndexingPipelineService(session=db_session)
|
service = IndexingPipelineService(session=db_session)
|
||||||
|
|
||||||
|
|
@ -54,28 +50,25 @@ async def test_content_is_summary_when_should_summarize_true(
|
||||||
document = prepared[0]
|
document = prepared[0]
|
||||||
document_id = document.id
|
document_id = document.id
|
||||||
|
|
||||||
await service.index(document, connector_doc, llm=mocker.Mock())
|
await service.index(document, connector_doc)
|
||||||
|
|
||||||
result = await db_session.execute(
|
result = await db_session.execute(
|
||||||
select(Document).filter(Document.id == document_id)
|
select(Document).filter(Document.id == document_id)
|
||||||
)
|
)
|
||||||
reloaded = result.scalars().first()
|
reloaded = result.scalars().first()
|
||||||
|
|
||||||
assert reloaded.content == "Mocked summary."
|
assert reloaded.content == connector_doc.source_markdown
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.usefixtures(
|
@pytest.mark.usefixtures("patched_embed_texts", "patched_chunk_text")
|
||||||
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
|
async def test_content_is_source_markdown_when_custom_content(
|
||||||
)
|
|
||||||
async def test_content_is_source_markdown_when_should_summarize_false(
|
|
||||||
db_session,
|
db_session,
|
||||||
db_search_space,
|
db_search_space,
|
||||||
make_connector_document,
|
make_connector_document,
|
||||||
):
|
):
|
||||||
"""Document content is set to source_markdown verbatim when should_summarize=False."""
|
"""Document content is set to source_markdown verbatim."""
|
||||||
connector_doc = make_connector_document(
|
connector_doc = make_connector_document(
|
||||||
search_space_id=db_search_space.id,
|
search_space_id=db_search_space.id,
|
||||||
should_summarize=False,
|
|
||||||
source_markdown="## Raw content",
|
source_markdown="## Raw content",
|
||||||
)
|
)
|
||||||
service = IndexingPipelineService(session=db_session)
|
service = IndexingPipelineService(session=db_session)
|
||||||
|
|
@ -84,7 +77,7 @@ async def test_content_is_source_markdown_when_should_summarize_false(
|
||||||
document = prepared[0]
|
document = prepared[0]
|
||||||
document_id = document.id
|
document_id = document.id
|
||||||
|
|
||||||
await service.index(document, connector_doc, llm=None)
|
await service.index(document, connector_doc)
|
||||||
|
|
||||||
result = await db_session.execute(
|
result = await db_session.execute(
|
||||||
select(Document).filter(Document.id == document_id)
|
select(Document).filter(Document.id == document_id)
|
||||||
|
|
@ -94,9 +87,7 @@ async def test_content_is_source_markdown_when_should_summarize_false(
|
||||||
assert reloaded.content == "## Raw content"
|
assert reloaded.content == "## Raw content"
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.usefixtures(
|
@pytest.mark.usefixtures("patched_embed_texts", "patched_chunk_text")
|
||||||
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
|
|
||||||
)
|
|
||||||
async def test_chunks_written_to_db(
|
async def test_chunks_written_to_db(
|
||||||
db_session,
|
db_session,
|
||||||
db_search_space,
|
db_search_space,
|
||||||
|
|
@ -111,7 +102,7 @@ async def test_chunks_written_to_db(
|
||||||
document = prepared[0]
|
document = prepared[0]
|
||||||
document_id = document.id
|
document_id = document.id
|
||||||
|
|
||||||
await service.index(document, connector_doc, llm=mocker.Mock())
|
await service.index(document, connector_doc)
|
||||||
|
|
||||||
result = await db_session.execute(
|
result = await db_session.execute(
|
||||||
select(Chunk).filter(Chunk.document_id == document_id)
|
select(Chunk).filter(Chunk.document_id == document_id)
|
||||||
|
|
@ -122,9 +113,7 @@ async def test_chunks_written_to_db(
|
||||||
assert chunks[0].content == "Test chunk content."
|
assert chunks[0].content == "Test chunk content."
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.usefixtures(
|
@pytest.mark.usefixtures("patched_embed_texts", "patched_chunk_text")
|
||||||
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
|
|
||||||
)
|
|
||||||
async def test_embedding_written_to_db(
|
async def test_embedding_written_to_db(
|
||||||
db_session,
|
db_session,
|
||||||
db_search_space,
|
db_search_space,
|
||||||
|
|
@ -139,7 +128,7 @@ async def test_embedding_written_to_db(
|
||||||
document = prepared[0]
|
document = prepared[0]
|
||||||
document_id = document.id
|
document_id = document.id
|
||||||
|
|
||||||
await service.index(document, connector_doc, llm=mocker.Mock())
|
await service.index(document, connector_doc)
|
||||||
|
|
||||||
result = await db_session.execute(
|
result = await db_session.execute(
|
||||||
select(Document).filter(Document.id == document_id)
|
select(Document).filter(Document.id == document_id)
|
||||||
|
|
@ -150,9 +139,7 @@ async def test_embedding_written_to_db(
|
||||||
assert len(reloaded.embedding) == _EMBEDDING_DIM
|
assert len(reloaded.embedding) == _EMBEDDING_DIM
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.usefixtures(
|
@pytest.mark.usefixtures("patched_embed_texts", "patched_chunk_text")
|
||||||
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
|
|
||||||
)
|
|
||||||
async def test_updated_at_advances_after_indexing(
|
async def test_updated_at_advances_after_indexing(
|
||||||
db_session,
|
db_session,
|
||||||
db_search_space,
|
db_search_space,
|
||||||
|
|
@ -172,7 +159,7 @@ async def test_updated_at_advances_after_indexing(
|
||||||
)
|
)
|
||||||
updated_at_pending = result.scalars().first().updated_at
|
updated_at_pending = result.scalars().first().updated_at
|
||||||
|
|
||||||
await service.index(document, connector_doc, llm=mocker.Mock())
|
await service.index(document, connector_doc)
|
||||||
|
|
||||||
result = await db_session.execute(
|
result = await db_session.execute(
|
||||||
select(Document).filter(Document.id == document_id)
|
select(Document).filter(Document.id == document_id)
|
||||||
|
|
@ -182,18 +169,15 @@ async def test_updated_at_advances_after_indexing(
|
||||||
assert updated_at_ready > updated_at_pending
|
assert updated_at_ready > updated_at_pending
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.usefixtures(
|
@pytest.mark.usefixtures("patched_embed_texts", "patched_chunk_text")
|
||||||
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
|
|
||||||
)
|
|
||||||
async def test_no_llm_falls_back_to_source_markdown(
|
async def test_no_llm_falls_back_to_source_markdown(
|
||||||
db_session,
|
db_session,
|
||||||
db_search_space,
|
db_search_space,
|
||||||
make_connector_document,
|
make_connector_document,
|
||||||
):
|
):
|
||||||
"""When llm=None and no fallback_summary, content falls back to source_markdown."""
|
"""Content stays deterministic source markdown without an LLM."""
|
||||||
connector_doc = make_connector_document(
|
connector_doc = make_connector_document(
|
||||||
search_space_id=db_search_space.id,
|
search_space_id=db_search_space.id,
|
||||||
should_summarize=True,
|
|
||||||
source_markdown="## Fallback content",
|
source_markdown="## Fallback content",
|
||||||
)
|
)
|
||||||
service = IndexingPipelineService(session=db_session)
|
service = IndexingPipelineService(session=db_session)
|
||||||
|
|
@ -202,7 +186,7 @@ async def test_no_llm_falls_back_to_source_markdown(
|
||||||
document = prepared[0]
|
document = prepared[0]
|
||||||
document_id = document.id
|
document_id = document.id
|
||||||
|
|
||||||
await service.index(document, connector_doc, llm=None)
|
await service.index(document, connector_doc)
|
||||||
|
|
||||||
result = await db_session.execute(
|
result = await db_session.execute(
|
||||||
select(Document).filter(Document.id == document_id)
|
select(Document).filter(Document.id == document_id)
|
||||||
|
|
@ -213,27 +197,23 @@ async def test_no_llm_falls_back_to_source_markdown(
|
||||||
assert reloaded.content == "## Fallback content"
|
assert reloaded.content == "## Fallback content"
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.usefixtures(
|
@pytest.mark.usefixtures("patched_embed_texts", "patched_chunk_text")
|
||||||
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
|
async def test_source_markdown_used_without_preview(
|
||||||
)
|
|
||||||
async def test_fallback_summary_used_when_llm_unavailable(
|
|
||||||
db_session,
|
db_session,
|
||||||
db_search_space,
|
db_search_space,
|
||||||
make_connector_document,
|
make_connector_document,
|
||||||
):
|
):
|
||||||
"""fallback_summary is used as content when llm=None and should_summarize=True."""
|
"""Source markdown is used without fallback preview fields."""
|
||||||
connector_doc = make_connector_document(
|
connector_doc = make_connector_document(
|
||||||
search_space_id=db_search_space.id,
|
search_space_id=db_search_space.id,
|
||||||
should_summarize=True,
|
|
||||||
source_markdown="## Full raw content",
|
source_markdown="## Full raw content",
|
||||||
fallback_summary="Short pre-built summary.",
|
|
||||||
)
|
)
|
||||||
service = IndexingPipelineService(session=db_session)
|
service = IndexingPipelineService(session=db_session)
|
||||||
|
|
||||||
prepared = await service.prepare_for_indexing([connector_doc])
|
prepared = await service.prepare_for_indexing([connector_doc])
|
||||||
document_id = prepared[0].id
|
document_id = prepared[0].id
|
||||||
|
|
||||||
await service.index(prepared[0], connector_doc, llm=None)
|
await service.index(prepared[0], connector_doc)
|
||||||
|
|
||||||
result = await db_session.execute(
|
result = await db_session.execute(
|
||||||
select(Document).filter(Document.id == document_id)
|
select(Document).filter(Document.id == document_id)
|
||||||
|
|
@ -241,12 +221,10 @@ async def test_fallback_summary_used_when_llm_unavailable(
|
||||||
reloaded = result.scalars().first()
|
reloaded = result.scalars().first()
|
||||||
|
|
||||||
assert DocumentStatus.is_state(reloaded.status, DocumentStatus.READY)
|
assert DocumentStatus.is_state(reloaded.status, DocumentStatus.READY)
|
||||||
assert reloaded.content == "Short pre-built summary."
|
assert reloaded.content == "## Full raw content"
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.usefixtures(
|
@pytest.mark.usefixtures("patched_embed_texts", "patched_chunk_text")
|
||||||
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
|
|
||||||
)
|
|
||||||
async def test_reindex_replaces_old_chunks(
|
async def test_reindex_replaces_old_chunks(
|
||||||
db_session,
|
db_session,
|
||||||
db_search_space,
|
db_search_space,
|
||||||
|
|
@ -264,14 +242,14 @@ async def test_reindex_replaces_old_chunks(
|
||||||
document = prepared[0]
|
document = prepared[0]
|
||||||
document_id = document.id
|
document_id = document.id
|
||||||
|
|
||||||
await service.index(document, connector_doc, llm=mocker.Mock())
|
await service.index(document, connector_doc)
|
||||||
|
|
||||||
updated_doc = make_connector_document(
|
updated_doc = make_connector_document(
|
||||||
search_space_id=db_search_space.id,
|
search_space_id=db_search_space.id,
|
||||||
source_markdown="## v2",
|
source_markdown="## v2",
|
||||||
)
|
)
|
||||||
re_prepared = await service.prepare_for_indexing([updated_doc])
|
re_prepared = await service.prepare_for_indexing([updated_doc])
|
||||||
await service.index(re_prepared[0], updated_doc, llm=mocker.Mock())
|
await service.index(re_prepared[0], updated_doc)
|
||||||
|
|
||||||
result = await db_session.execute(
|
result = await db_session.execute(
|
||||||
select(Chunk).filter(Chunk.document_id == document_id)
|
select(Chunk).filter(Chunk.document_id == document_id)
|
||||||
|
|
@ -281,16 +259,14 @@ async def test_reindex_replaces_old_chunks(
|
||||||
assert len(chunks) == 1
|
assert len(chunks) == 1
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.usefixtures(
|
@pytest.mark.usefixtures("patched_embed_texts_raises", "patched_chunk_text")
|
||||||
"patched_summarize_raises", "patched_embed_texts", "patched_chunk_text"
|
async def test_embedding_error_sets_status_failed(
|
||||||
)
|
|
||||||
async def test_llm_error_sets_status_failed(
|
|
||||||
db_session,
|
db_session,
|
||||||
db_search_space,
|
db_search_space,
|
||||||
make_connector_document,
|
make_connector_document,
|
||||||
mocker,
|
mocker,
|
||||||
):
|
):
|
||||||
"""Document status is FAILED when the LLM raises during indexing."""
|
"""Document status is FAILED when embedding raises during indexing."""
|
||||||
connector_doc = make_connector_document(search_space_id=db_search_space.id)
|
connector_doc = make_connector_document(search_space_id=db_search_space.id)
|
||||||
service = IndexingPipelineService(session=db_session)
|
service = IndexingPipelineService(session=db_session)
|
||||||
|
|
||||||
|
|
@ -298,7 +274,7 @@ async def test_llm_error_sets_status_failed(
|
||||||
document = prepared[0]
|
document = prepared[0]
|
||||||
document_id = document.id
|
document_id = document.id
|
||||||
|
|
||||||
await service.index(document, connector_doc, llm=mocker.Mock())
|
await service.index(document, connector_doc)
|
||||||
|
|
||||||
result = await db_session.execute(
|
result = await db_session.execute(
|
||||||
select(Document).filter(Document.id == document_id)
|
select(Document).filter(Document.id == document_id)
|
||||||
|
|
@ -308,10 +284,8 @@ async def test_llm_error_sets_status_failed(
|
||||||
assert DocumentStatus.is_state(reloaded.status, DocumentStatus.FAILED)
|
assert DocumentStatus.is_state(reloaded.status, DocumentStatus.FAILED)
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.usefixtures(
|
@pytest.mark.usefixtures("patched_embed_texts_raises", "patched_chunk_text")
|
||||||
"patched_summarize_raises", "patched_embed_texts", "patched_chunk_text"
|
async def test_embedding_error_leaves_no_partial_data(
|
||||||
)
|
|
||||||
async def test_llm_error_leaves_no_partial_data(
|
|
||||||
db_session,
|
db_session,
|
||||||
db_search_space,
|
db_search_space,
|
||||||
make_connector_document,
|
make_connector_document,
|
||||||
|
|
@ -325,7 +299,7 @@ async def test_llm_error_leaves_no_partial_data(
|
||||||
document = prepared[0]
|
document = prepared[0]
|
||||||
document_id = document.id
|
document_id = document.id
|
||||||
|
|
||||||
await service.index(document, connector_doc, llm=mocker.Mock())
|
await service.index(document, connector_doc)
|
||||||
|
|
||||||
result = await db_session.execute(
|
result = await db_session.execute(
|
||||||
select(Document).filter(Document.id == document_id)
|
select(Document).filter(Document.id == document_id)
|
||||||
|
|
|
||||||
|
|
@ -21,7 +21,6 @@ from app.db import (
|
||||||
pytestmark = pytest.mark.integration
|
pytestmark = pytest.mark.integration
|
||||||
|
|
||||||
UNIFIED_FIXTURES = (
|
UNIFIED_FIXTURES = (
|
||||||
"patched_summarize",
|
|
||||||
"patched_embed_texts",
|
"patched_embed_texts",
|
||||||
"patched_chunk_text",
|
"patched_chunk_text",
|
||||||
)
|
)
|
||||||
|
|
@ -787,7 +786,7 @@ class TestPipelineIntegration:
|
||||||
assert len(prepared) == 1
|
assert len(prepared) == 1
|
||||||
|
|
||||||
db_doc = prepared[0]
|
db_doc = prepared[0]
|
||||||
result = await service.index(db_doc, doc, llm=mocker.Mock())
|
result = await service.index(db_doc, doc)
|
||||||
assert result is not None
|
assert result is not None
|
||||||
|
|
||||||
docs = (
|
docs = (
|
||||||
|
|
@ -1272,7 +1271,7 @@ class TestIndexingProgressFlag:
|
||||||
original_index = IndexingPipelineService.index
|
original_index = IndexingPipelineService.index
|
||||||
flag_observed = []
|
flag_observed = []
|
||||||
|
|
||||||
async def patched_index(self_pipe, document, connector_doc, llm):
|
async def patched_index(self_pipe, document, connector_doc):
|
||||||
folder = (
|
folder = (
|
||||||
await db_session.execute(
|
await db_session.execute(
|
||||||
select(Folder).where(
|
select(Folder).where(
|
||||||
|
|
@ -1284,7 +1283,7 @@ class TestIndexingProgressFlag:
|
||||||
if folder:
|
if folder:
|
||||||
meta = folder.folder_metadata or {}
|
meta = folder.folder_metadata or {}
|
||||||
flag_observed.append(meta.get("indexing_in_progress", False))
|
flag_observed.append(meta.get("indexing_in_progress", False))
|
||||||
return await original_index(self_pipe, document, connector_doc, llm)
|
return await original_index(self_pipe, document, connector_doc)
|
||||||
|
|
||||||
IndexingPipelineService.index = patched_index
|
IndexingPipelineService.index = patched_index
|
||||||
try:
|
try:
|
||||||
|
|
|
||||||
|
|
@ -24,8 +24,6 @@ def _onedrive_doc(
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
connector_id=connector_id,
|
connector_id=connector_id,
|
||||||
created_by_id=user_id,
|
created_by_id=user_id,
|
||||||
should_summarize=True,
|
|
||||||
fallback_summary=f"File: {unique_id}.docx",
|
|
||||||
metadata={
|
metadata={
|
||||||
"onedrive_file_id": unique_id,
|
"onedrive_file_id": unique_id,
|
||||||
"onedrive_file_name": f"{unique_id}.docx",
|
"onedrive_file_name": f"{unique_id}.docx",
|
||||||
|
|
@ -35,7 +33,7 @@ def _onedrive_doc(
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.usefixtures(
|
@pytest.mark.usefixtures(
|
||||||
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
|
"patched_embed_texts", "patched_chunk_text"
|
||||||
)
|
)
|
||||||
async def test_onedrive_pipeline_creates_ready_document(
|
async def test_onedrive_pipeline_creates_ready_document(
|
||||||
db_session, db_search_space, db_connector, db_user, mocker
|
db_session, db_search_space, db_connector, db_user, mocker
|
||||||
|
|
@ -53,7 +51,7 @@ async def test_onedrive_pipeline_creates_ready_document(
|
||||||
prepared = await service.prepare_for_indexing([doc])
|
prepared = await service.prepare_for_indexing([doc])
|
||||||
assert len(prepared) == 1
|
assert len(prepared) == 1
|
||||||
|
|
||||||
await service.index(prepared[0], doc, llm=mocker.Mock())
|
await service.index(prepared[0], doc)
|
||||||
|
|
||||||
result = await db_session.execute(
|
result = await db_session.execute(
|
||||||
select(Document).filter(Document.search_space_id == space_id)
|
select(Document).filter(Document.search_space_id == space_id)
|
||||||
|
|
@ -66,7 +64,7 @@ async def test_onedrive_pipeline_creates_ready_document(
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.usefixtures(
|
@pytest.mark.usefixtures(
|
||||||
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
|
"patched_embed_texts", "patched_chunk_text"
|
||||||
)
|
)
|
||||||
async def test_onedrive_duplicate_content_skipped(
|
async def test_onedrive_duplicate_content_skipped(
|
||||||
db_session, db_search_space, db_connector, db_user, mocker
|
db_session, db_search_space, db_connector, db_user, mocker
|
||||||
|
|
@ -86,7 +84,7 @@ async def test_onedrive_duplicate_content_skipped(
|
||||||
|
|
||||||
prepared = await service.prepare_for_indexing([doc])
|
prepared = await service.prepare_for_indexing([doc])
|
||||||
assert len(prepared) == 1
|
assert len(prepared) == 1
|
||||||
await service.index(prepared[0], doc, llm=mocker.Mock())
|
await service.index(prepared[0], doc)
|
||||||
|
|
||||||
result = await db_session.execute(
|
result = await db_session.execute(
|
||||||
select(Document).filter(Document.search_space_id == space_id)
|
select(Document).filter(Document.search_space_id == space_id)
|
||||||
|
|
|
||||||
|
|
@ -33,7 +33,7 @@ async def test_new_document_is_persisted_with_pending_status(
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.usefixtures(
|
@pytest.mark.usefixtures(
|
||||||
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
|
"patched_embed_texts", "patched_chunk_text"
|
||||||
)
|
)
|
||||||
async def test_unchanged_ready_document_is_skipped(
|
async def test_unchanged_ready_document_is_skipped(
|
||||||
db_session,
|
db_session,
|
||||||
|
|
@ -47,7 +47,7 @@ async def test_unchanged_ready_document_is_skipped(
|
||||||
|
|
||||||
# Index fully so the document reaches ready state
|
# Index fully so the document reaches ready state
|
||||||
prepared = await service.prepare_for_indexing([doc])
|
prepared = await service.prepare_for_indexing([doc])
|
||||||
await service.index(prepared[0], doc, llm=mocker.Mock())
|
await service.index(prepared[0], doc)
|
||||||
|
|
||||||
# Same content on the next run — a ready document must be skipped
|
# Same content on the next run — a ready document must be skipped
|
||||||
results = await service.prepare_for_indexing([doc])
|
results = await service.prepare_for_indexing([doc])
|
||||||
|
|
@ -56,7 +56,7 @@ async def test_unchanged_ready_document_is_skipped(
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.usefixtures(
|
@pytest.mark.usefixtures(
|
||||||
"patched_summarize", "patched_embed_texts", "patched_chunk_text"
|
"patched_embed_texts", "patched_chunk_text"
|
||||||
)
|
)
|
||||||
async def test_title_only_change_updates_title_in_db(
|
async def test_title_only_change_updates_title_in_db(
|
||||||
db_session,
|
db_session,
|
||||||
|
|
@ -72,7 +72,7 @@ async def test_title_only_change_updates_title_in_db(
|
||||||
|
|
||||||
prepared = await service.prepare_for_indexing([original])
|
prepared = await service.prepare_for_indexing([original])
|
||||||
document_id = prepared[0].id
|
document_id = prepared[0].id
|
||||||
await service.index(prepared[0], original, llm=mocker.Mock())
|
await service.index(prepared[0], original)
|
||||||
|
|
||||||
renamed = make_connector_document(
|
renamed = make_connector_document(
|
||||||
search_space_id=db_search_space.id, title="Updated Title"
|
search_space_id=db_search_space.id, title="Updated Title"
|
||||||
|
|
@ -338,9 +338,7 @@ async def test_same_content_from_different_source_is_skipped(
|
||||||
assert len(result.scalars().all()) == 1
|
assert len(result.scalars().all()) == 1
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.usefixtures(
|
@pytest.mark.usefixtures("patched_embed_texts_raises", "patched_chunk_text")
|
||||||
"patched_summarize_raises", "patched_embed_texts", "patched_chunk_text"
|
|
||||||
)
|
|
||||||
async def test_failed_document_with_unchanged_content_is_requeued(
|
async def test_failed_document_with_unchanged_content_is_requeued(
|
||||||
db_session,
|
db_session,
|
||||||
db_search_space,
|
db_search_space,
|
||||||
|
|
@ -351,10 +349,10 @@ async def test_failed_document_with_unchanged_content_is_requeued(
|
||||||
doc = make_connector_document(search_space_id=db_search_space.id)
|
doc = make_connector_document(search_space_id=db_search_space.id)
|
||||||
service = IndexingPipelineService(session=db_session)
|
service = IndexingPipelineService(session=db_session)
|
||||||
|
|
||||||
# First run: document is created and indexing crashes → status = failed
|
# First run: document is created and indexing crashes, so status becomes failed.
|
||||||
prepared = await service.prepare_for_indexing([doc])
|
prepared = await service.prepare_for_indexing([doc])
|
||||||
document_id = prepared[0].id
|
document_id = prepared[0].id
|
||||||
await service.index(prepared[0], doc, llm=mocker.Mock())
|
await service.index(prepared[0], doc)
|
||||||
|
|
||||||
result = await db_session.execute(
|
result = await db_session.execute(
|
||||||
select(Document).filter(Document.id == document_id)
|
select(Document).filter(Document.id == document_id)
|
||||||
|
|
|
||||||
|
|
@ -101,7 +101,7 @@ async def test_generate_resume_defaults_to_one_page_target(monkeypatch) -> None:
|
||||||
llm = SimpleNamespace(ainvoke=AsyncMock(side_effect=_llm_invoke))
|
llm = SimpleNamespace(ainvoke=AsyncMock(side_effect=_llm_invoke))
|
||||||
monkeypatch.setattr(
|
monkeypatch.setattr(
|
||||||
resume_tool,
|
resume_tool,
|
||||||
"get_document_summary_llm",
|
"get_agent_llm",
|
||||||
AsyncMock(return_value=llm),
|
AsyncMock(return_value=llm),
|
||||||
)
|
)
|
||||||
monkeypatch.setattr(resume_tool, "_compile_typst", lambda _source: b"pdf")
|
monkeypatch.setattr(resume_tool, "_compile_typst", lambda _source: b"pdf")
|
||||||
|
|
@ -130,7 +130,7 @@ async def test_generate_resume_compresses_when_over_limit(monkeypatch) -> None:
|
||||||
llm = SimpleNamespace(ainvoke=AsyncMock(side_effect=responses))
|
llm = SimpleNamespace(ainvoke=AsyncMock(side_effect=responses))
|
||||||
monkeypatch.setattr(
|
monkeypatch.setattr(
|
||||||
resume_tool,
|
resume_tool,
|
||||||
"get_document_summary_llm",
|
"get_agent_llm",
|
||||||
AsyncMock(return_value=llm),
|
AsyncMock(return_value=llm),
|
||||||
)
|
)
|
||||||
monkeypatch.setattr(resume_tool, "_compile_typst", lambda _source: b"pdf")
|
monkeypatch.setattr(resume_tool, "_compile_typst", lambda _source: b"pdf")
|
||||||
|
|
@ -165,7 +165,7 @@ async def test_generate_resume_returns_ready_when_target_not_met(monkeypatch) ->
|
||||||
llm = SimpleNamespace(ainvoke=AsyncMock(side_effect=responses))
|
llm = SimpleNamespace(ainvoke=AsyncMock(side_effect=responses))
|
||||||
monkeypatch.setattr(
|
monkeypatch.setattr(
|
||||||
resume_tool,
|
resume_tool,
|
||||||
"get_document_summary_llm",
|
"get_agent_llm",
|
||||||
AsyncMock(return_value=llm),
|
AsyncMock(return_value=llm),
|
||||||
)
|
)
|
||||||
monkeypatch.setattr(resume_tool, "_compile_typst", lambda _source: b"pdf")
|
monkeypatch.setattr(resume_tool, "_compile_typst", lambda _source: b"pdf")
|
||||||
|
|
@ -198,7 +198,7 @@ async def test_generate_resume_fails_when_hard_limit_exceeded(monkeypatch) -> No
|
||||||
llm = SimpleNamespace(ainvoke=AsyncMock(side_effect=responses))
|
llm = SimpleNamespace(ainvoke=AsyncMock(side_effect=responses))
|
||||||
monkeypatch.setattr(
|
monkeypatch.setattr(
|
||||||
resume_tool,
|
resume_tool,
|
||||||
"get_document_summary_llm",
|
"get_agent_llm",
|
||||||
AsyncMock(return_value=llm),
|
AsyncMock(return_value=llm),
|
||||||
)
|
)
|
||||||
monkeypatch.setattr(resume_tool, "_compile_typst", lambda _source: b"pdf")
|
monkeypatch.setattr(resume_tool, "_compile_typst", lambda _source: b"pdf")
|
||||||
|
|
|
||||||
|
|
@ -14,8 +14,8 @@ from typing import Any
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
import app.automations.actions.agent_task.dependencies as deps_mod
|
import app.automations.actions.builtin.agent_task.dependencies as deps_mod
|
||||||
from app.automations.actions.agent_task.dependencies import (
|
from app.automations.actions.builtin.agent_task.dependencies import (
|
||||||
DependencyError,
|
DependencyError,
|
||||||
build_dependencies,
|
build_dependencies,
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -71,7 +71,6 @@ async def test_build_connector_doc_produces_correct_fields():
|
||||||
connector_id=_CONNECTOR_ID,
|
connector_id=_CONNECTOR_ID,
|
||||||
search_space_id=_SEARCH_SPACE_ID,
|
search_space_id=_SEARCH_SPACE_ID,
|
||||||
user_id=_USER_ID,
|
user_id=_USER_ID,
|
||||||
enable_summary=True,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
assert doc.title == "Engineering Handbook"
|
assert doc.title == "Engineering Handbook"
|
||||||
|
|
@ -81,7 +80,6 @@ async def test_build_connector_doc_produces_correct_fields():
|
||||||
assert doc.search_space_id == _SEARCH_SPACE_ID
|
assert doc.search_space_id == _SEARCH_SPACE_ID
|
||||||
assert doc.connector_id == _CONNECTOR_ID
|
assert doc.connector_id == _CONNECTOR_ID
|
||||||
assert doc.created_by_id == _USER_ID
|
assert doc.created_by_id == _USER_ID
|
||||||
assert doc.should_summarize is True
|
|
||||||
assert doc.metadata["page_id"] == "abc-123"
|
assert doc.metadata["page_id"] == "abc-123"
|
||||||
assert doc.metadata["page_title"] == "Engineering Handbook"
|
assert doc.metadata["page_title"] == "Engineering Handbook"
|
||||||
assert doc.metadata["space_id"] == "ENG"
|
assert doc.metadata["space_id"] == "ENG"
|
||||||
|
|
@ -89,21 +87,6 @@ async def test_build_connector_doc_produces_correct_fields():
|
||||||
assert doc.metadata["connector_id"] == _CONNECTOR_ID
|
assert doc.metadata["connector_id"] == _CONNECTOR_ID
|
||||||
assert doc.metadata["document_type"] == "Confluence Page"
|
assert doc.metadata["document_type"] == "Confluence Page"
|
||||||
assert doc.metadata["connector_type"] == "Confluence"
|
assert doc.metadata["connector_type"] == "Confluence"
|
||||||
assert doc.fallback_summary is not None
|
|
||||||
assert "Engineering Handbook" in doc.fallback_summary
|
|
||||||
assert markdown in doc.fallback_summary
|
|
||||||
|
|
||||||
|
|
||||||
async def test_build_connector_doc_summary_disabled():
|
|
||||||
doc = _build_connector_doc(
|
|
||||||
_make_page(),
|
|
||||||
_to_markdown(_make_page()),
|
|
||||||
connector_id=_CONNECTOR_ID,
|
|
||||||
search_space_id=_SEARCH_SPACE_ID,
|
|
||||||
user_id=_USER_ID,
|
|
||||||
enable_summary=False,
|
|
||||||
)
|
|
||||||
assert doc.should_summarize is False
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
@ -111,10 +94,9 @@ async def test_build_connector_doc_summary_disabled():
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
def _mock_connector(enable_summary: bool = True):
|
def _mock_connector():
|
||||||
c = MagicMock()
|
c = MagicMock()
|
||||||
c.config = {"access_token": "tok"}
|
c.config = {"access_token": "tok"}
|
||||||
c.enable_summary = enable_summary
|
|
||||||
c.last_indexed_at = None
|
c.last_indexed_at = None
|
||||||
return c
|
return c
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -71,7 +71,6 @@ async def test_single_file_returns_one_connector_document(
|
||||||
connector_id=_CONNECTOR_ID,
|
connector_id=_CONNECTOR_ID,
|
||||||
search_space_id=_SEARCH_SPACE_ID,
|
search_space_id=_SEARCH_SPACE_ID,
|
||||||
user_id=_USER_ID,
|
user_id=_USER_ID,
|
||||||
enable_summary=True,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
assert len(docs) == 1
|
assert len(docs) == 1
|
||||||
|
|
@ -97,7 +96,6 @@ async def test_multiple_files_all_produce_documents(
|
||||||
connector_id=_CONNECTOR_ID,
|
connector_id=_CONNECTOR_ID,
|
||||||
search_space_id=_SEARCH_SPACE_ID,
|
search_space_id=_SEARCH_SPACE_ID,
|
||||||
user_id=_USER_ID,
|
user_id=_USER_ID,
|
||||||
enable_summary=True,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
assert len(docs) == 3
|
assert len(docs) == 3
|
||||||
|
|
@ -125,7 +123,6 @@ async def test_one_download_exception_does_not_block_others(
|
||||||
connector_id=_CONNECTOR_ID,
|
connector_id=_CONNECTOR_ID,
|
||||||
search_space_id=_SEARCH_SPACE_ID,
|
search_space_id=_SEARCH_SPACE_ID,
|
||||||
user_id=_USER_ID,
|
user_id=_USER_ID,
|
||||||
enable_summary=True,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
assert len(docs) == 2
|
assert len(docs) == 2
|
||||||
|
|
@ -152,7 +149,6 @@ async def test_etl_error_counts_as_download_failure(
|
||||||
connector_id=_CONNECTOR_ID,
|
connector_id=_CONNECTOR_ID,
|
||||||
search_space_id=_SEARCH_SPACE_ID,
|
search_space_id=_SEARCH_SPACE_ID,
|
||||||
user_id=_USER_ID,
|
user_id=_USER_ID,
|
||||||
enable_summary=True,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
assert len(docs) == 1
|
assert len(docs) == 1
|
||||||
|
|
@ -191,7 +187,6 @@ async def test_concurrency_bounded_by_semaphore(
|
||||||
connector_id=_CONNECTOR_ID,
|
connector_id=_CONNECTOR_ID,
|
||||||
search_space_id=_SEARCH_SPACE_ID,
|
search_space_id=_SEARCH_SPACE_ID,
|
||||||
user_id=_USER_ID,
|
user_id=_USER_ID,
|
||||||
enable_summary=True,
|
|
||||||
max_concurrency=2,
|
max_concurrency=2,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -231,7 +226,6 @@ async def test_heartbeat_fires_during_parallel_downloads(
|
||||||
connector_id=_CONNECTOR_ID,
|
connector_id=_CONNECTOR_ID,
|
||||||
search_space_id=_SEARCH_SPACE_ID,
|
search_space_id=_SEARCH_SPACE_ID,
|
||||||
user_id=_USER_ID,
|
user_id=_USER_ID,
|
||||||
enable_summary=True,
|
|
||||||
on_heartbeat=_on_heartbeat,
|
on_heartbeat=_on_heartbeat,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -324,7 +318,6 @@ async def _run_full_scan(mocks, monkeypatch, page_files, *, max_files=500):
|
||||||
mocks["task_logger"],
|
mocks["task_logger"],
|
||||||
mocks["log_entry"],
|
mocks["log_entry"],
|
||||||
max_files,
|
max_files,
|
||||||
enable_summary=True,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -434,7 +427,6 @@ async def _run_selected(mocks, file_tuples):
|
||||||
connector_id=_CONNECTOR_ID,
|
connector_id=_CONNECTOR_ID,
|
||||||
search_space_id=_SEARCH_SPACE_ID,
|
search_space_id=_SEARCH_SPACE_ID,
|
||||||
user_id=_USER_ID,
|
user_id=_USER_ID,
|
||||||
enable_summary=True,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -569,7 +561,6 @@ async def test_delta_sync_deletions_call_remove_document(monkeypatch):
|
||||||
mock_task_logger,
|
mock_task_logger,
|
||||||
MagicMock(),
|
MagicMock(),
|
||||||
max_files=500,
|
max_files=500,
|
||||||
enable_summary=True,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
assert sorted(remove_calls) == ["id:del1", "id:del2"]
|
assert sorted(remove_calls) == ["id:del1", "id:del2"]
|
||||||
|
|
@ -608,7 +599,6 @@ async def test_delta_sync_upserts_filtered_and_downloaded(monkeypatch):
|
||||||
mock_task_logger,
|
mock_task_logger,
|
||||||
MagicMock(),
|
MagicMock(),
|
||||||
max_files=500,
|
max_files=500,
|
||||||
enable_summary=True,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
assert indexed == 2
|
assert indexed == 2
|
||||||
|
|
@ -670,7 +660,6 @@ async def test_delta_sync_mix_deletions_and_upserts(monkeypatch):
|
||||||
mock_task_logger,
|
mock_task_logger,
|
||||||
MagicMock(),
|
MagicMock(),
|
||||||
max_files=500,
|
max_files=500,
|
||||||
enable_summary=True,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
assert sorted(remove_calls) == ["id:del1", "id:del2"]
|
assert sorted(remove_calls) == ["id:del1", "id:del2"]
|
||||||
|
|
@ -704,7 +693,6 @@ async def test_delta_sync_returns_new_cursor(monkeypatch):
|
||||||
mock_task_logger,
|
mock_task_logger,
|
||||||
MagicMock(),
|
MagicMock(),
|
||||||
max_files=500,
|
max_files=500,
|
||||||
enable_summary=True,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
assert cursor == "brand-new-cursor-xyz"
|
assert cursor == "brand-new-cursor-xyz"
|
||||||
|
|
@ -725,7 +713,7 @@ def orchestrator_mocks(monkeypatch):
|
||||||
mock_connector = MagicMock()
|
mock_connector = MagicMock()
|
||||||
mock_connector.config = {"_token_encrypted": False}
|
mock_connector.config = {"_token_encrypted": False}
|
||||||
mock_connector.last_indexed_at = None
|
mock_connector.last_indexed_at = None
|
||||||
mock_connector.enable_summary = True
|
mock_connector.enable_vision_llm = True
|
||||||
|
|
||||||
monkeypatch.setattr(
|
monkeypatch.setattr(
|
||||||
_mod,
|
_mod,
|
||||||
|
|
|
||||||
|
|
@ -66,7 +66,6 @@ async def test_single_file_returns_one_connector_document(
|
||||||
connector_id=_CONNECTOR_ID,
|
connector_id=_CONNECTOR_ID,
|
||||||
search_space_id=_SEARCH_SPACE_ID,
|
search_space_id=_SEARCH_SPACE_ID,
|
||||||
user_id=_USER_ID,
|
user_id=_USER_ID,
|
||||||
enable_summary=True,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
assert len(docs) == 1
|
assert len(docs) == 1
|
||||||
|
|
@ -91,7 +90,6 @@ async def test_multiple_files_all_produce_documents(
|
||||||
connector_id=_CONNECTOR_ID,
|
connector_id=_CONNECTOR_ID,
|
||||||
search_space_id=_SEARCH_SPACE_ID,
|
search_space_id=_SEARCH_SPACE_ID,
|
||||||
user_id=_USER_ID,
|
user_id=_USER_ID,
|
||||||
enable_summary=True,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
assert len(docs) == 3
|
assert len(docs) == 3
|
||||||
|
|
@ -119,7 +117,6 @@ async def test_one_download_exception_does_not_block_others(
|
||||||
connector_id=_CONNECTOR_ID,
|
connector_id=_CONNECTOR_ID,
|
||||||
search_space_id=_SEARCH_SPACE_ID,
|
search_space_id=_SEARCH_SPACE_ID,
|
||||||
user_id=_USER_ID,
|
user_id=_USER_ID,
|
||||||
enable_summary=True,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
assert len(docs) == 2
|
assert len(docs) == 2
|
||||||
|
|
@ -146,7 +143,6 @@ async def test_etl_error_counts_as_download_failure(
|
||||||
connector_id=_CONNECTOR_ID,
|
connector_id=_CONNECTOR_ID,
|
||||||
search_space_id=_SEARCH_SPACE_ID,
|
search_space_id=_SEARCH_SPACE_ID,
|
||||||
user_id=_USER_ID,
|
user_id=_USER_ID,
|
||||||
enable_summary=True,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
assert len(docs) == 1
|
assert len(docs) == 1
|
||||||
|
|
@ -186,7 +182,6 @@ async def test_concurrency_bounded_by_semaphore(
|
||||||
connector_id=_CONNECTOR_ID,
|
connector_id=_CONNECTOR_ID,
|
||||||
search_space_id=_SEARCH_SPACE_ID,
|
search_space_id=_SEARCH_SPACE_ID,
|
||||||
user_id=_USER_ID,
|
user_id=_USER_ID,
|
||||||
enable_summary=True,
|
|
||||||
max_concurrency=2,
|
max_concurrency=2,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -226,7 +221,6 @@ async def test_heartbeat_fires_during_parallel_downloads(
|
||||||
connector_id=_CONNECTOR_ID,
|
connector_id=_CONNECTOR_ID,
|
||||||
search_space_id=_SEARCH_SPACE_ID,
|
search_space_id=_SEARCH_SPACE_ID,
|
||||||
user_id=_USER_ID,
|
user_id=_USER_ID,
|
||||||
enable_summary=True,
|
|
||||||
on_heartbeat=_on_heartbeat,
|
on_heartbeat=_on_heartbeat,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -300,12 +294,6 @@ def full_scan_mocks(mock_drive_client, monkeypatch):
|
||||||
MagicMock(return_value=pipeline_mock),
|
MagicMock(return_value=pipeline_mock),
|
||||||
)
|
)
|
||||||
|
|
||||||
monkeypatch.setattr(
|
|
||||||
_mod,
|
|
||||||
"get_user_long_context_llm",
|
|
||||||
AsyncMock(return_value=MagicMock()),
|
|
||||||
)
|
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"drive_client": mock_drive_client,
|
"drive_client": mock_drive_client,
|
||||||
"session": mock_session,
|
"session": mock_session,
|
||||||
|
|
@ -333,7 +321,6 @@ async def _run_full_scan(mocks, *, max_files=500, include_subfolders=False):
|
||||||
mocks["log_entry"],
|
mocks["log_entry"],
|
||||||
max_files,
|
max_files,
|
||||||
include_subfolders=include_subfolders,
|
include_subfolders=include_subfolders,
|
||||||
enable_summary=True,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -487,12 +474,6 @@ async def test_delta_sync_removals_serial_rest_parallel(monkeypatch):
|
||||||
"IndexingPipelineService",
|
"IndexingPipelineService",
|
||||||
MagicMock(return_value=pipeline_mock),
|
MagicMock(return_value=pipeline_mock),
|
||||||
)
|
)
|
||||||
monkeypatch.setattr(
|
|
||||||
_mod,
|
|
||||||
"get_user_long_context_llm",
|
|
||||||
AsyncMock(return_value=MagicMock()),
|
|
||||||
)
|
|
||||||
|
|
||||||
mock_session, _ = _make_page_limit_session()
|
mock_session, _ = _make_page_limit_session()
|
||||||
mock_task_logger = MagicMock()
|
mock_task_logger = MagicMock()
|
||||||
mock_task_logger.log_task_progress = AsyncMock()
|
mock_task_logger.log_task_progress = AsyncMock()
|
||||||
|
|
@ -509,7 +490,6 @@ async def test_delta_sync_removals_serial_rest_parallel(monkeypatch):
|
||||||
mock_task_logger,
|
mock_task_logger,
|
||||||
MagicMock(),
|
MagicMock(),
|
||||||
max_files=500,
|
max_files=500,
|
||||||
enable_summary=True,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
assert sorted(remove_calls) == ["del1", "del2", "trash1"]
|
assert sorted(remove_calls) == ["del1", "del2", "trash1"]
|
||||||
|
|
@ -577,7 +557,6 @@ async def _run_selected(mocks, file_ids):
|
||||||
connector_id=_CONNECTOR_ID,
|
connector_id=_CONNECTOR_ID,
|
||||||
search_space_id=_SEARCH_SPACE_ID,
|
search_space_id=_SEARCH_SPACE_ID,
|
||||||
user_id=_USER_ID,
|
user_id=_USER_ID,
|
||||||
enable_summary=True,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -70,7 +70,6 @@ async def test_build_connector_doc_produces_correct_fields():
|
||||||
connector_id=_CONNECTOR_ID,
|
connector_id=_CONNECTOR_ID,
|
||||||
search_space_id=_SEARCH_SPACE_ID,
|
search_space_id=_SEARCH_SPACE_ID,
|
||||||
user_id=_USER_ID,
|
user_id=_USER_ID,
|
||||||
enable_summary=True,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
assert doc.title == "ENG-42: Fix login bug"
|
assert doc.title == "ENG-42: Fix login bug"
|
||||||
|
|
@ -80,7 +79,6 @@ async def test_build_connector_doc_produces_correct_fields():
|
||||||
assert doc.search_space_id == _SEARCH_SPACE_ID
|
assert doc.search_space_id == _SEARCH_SPACE_ID
|
||||||
assert doc.connector_id == _CONNECTOR_ID
|
assert doc.connector_id == _CONNECTOR_ID
|
||||||
assert doc.created_by_id == _USER_ID
|
assert doc.created_by_id == _USER_ID
|
||||||
assert doc.should_summarize is True
|
|
||||||
assert doc.metadata["issue_id"] == "abc-123"
|
assert doc.metadata["issue_id"] == "abc-123"
|
||||||
assert doc.metadata["issue_identifier"] == "ENG-42"
|
assert doc.metadata["issue_identifier"] == "ENG-42"
|
||||||
assert doc.metadata["issue_title"] == "Fix login bug"
|
assert doc.metadata["issue_title"] == "Fix login bug"
|
||||||
|
|
@ -90,24 +88,6 @@ async def test_build_connector_doc_produces_correct_fields():
|
||||||
assert doc.metadata["connector_id"] == _CONNECTOR_ID
|
assert doc.metadata["connector_id"] == _CONNECTOR_ID
|
||||||
assert doc.metadata["document_type"] == "Linear Issue"
|
assert doc.metadata["document_type"] == "Linear Issue"
|
||||||
assert doc.metadata["connector_type"] == "Linear"
|
assert doc.metadata["connector_type"] == "Linear"
|
||||||
assert doc.fallback_summary is not None
|
|
||||||
assert "ENG-42" in doc.fallback_summary
|
|
||||||
assert markdown in doc.fallback_summary
|
|
||||||
|
|
||||||
|
|
||||||
async def test_build_connector_doc_summary_disabled():
|
|
||||||
"""When enable_summary is False, should_summarize is False."""
|
|
||||||
doc = _build_connector_doc(
|
|
||||||
_make_issue(),
|
|
||||||
_make_formatted_issue(),
|
|
||||||
"# content",
|
|
||||||
connector_id=_CONNECTOR_ID,
|
|
||||||
search_space_id=_SEARCH_SPACE_ID,
|
|
||||||
user_id=_USER_ID,
|
|
||||||
enable_summary=False,
|
|
||||||
)
|
|
||||||
|
|
||||||
assert doc.should_summarize is False
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
@ -115,10 +95,9 @@ async def test_build_connector_doc_summary_disabled():
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
def _mock_connector(enable_summary: bool = True):
|
def _mock_connector():
|
||||||
c = MagicMock()
|
c = MagicMock()
|
||||||
c.config = {"access_token": "tok"}
|
c.config = {"access_token": "tok"}
|
||||||
c.enable_summary = enable_summary
|
|
||||||
c.last_indexed_at = None
|
c.last_indexed_at = None
|
||||||
return c
|
return c
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -41,7 +41,6 @@ async def test_build_connector_doc_produces_correct_fields():
|
||||||
connector_id=_CONNECTOR_ID,
|
connector_id=_CONNECTOR_ID,
|
||||||
search_space_id=_SEARCH_SPACE_ID,
|
search_space_id=_SEARCH_SPACE_ID,
|
||||||
user_id=_USER_ID,
|
user_id=_USER_ID,
|
||||||
enable_summary=True,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
assert doc.title == "My Notion Page"
|
assert doc.title == "My Notion Page"
|
||||||
|
|
@ -51,29 +50,11 @@ async def test_build_connector_doc_produces_correct_fields():
|
||||||
assert doc.search_space_id == _SEARCH_SPACE_ID
|
assert doc.search_space_id == _SEARCH_SPACE_ID
|
||||||
assert doc.connector_id == _CONNECTOR_ID
|
assert doc.connector_id == _CONNECTOR_ID
|
||||||
assert doc.created_by_id == _USER_ID
|
assert doc.created_by_id == _USER_ID
|
||||||
assert doc.should_summarize is True
|
|
||||||
assert doc.metadata["page_title"] == "My Notion Page"
|
assert doc.metadata["page_title"] == "My Notion Page"
|
||||||
assert doc.metadata["page_id"] == "abc-123"
|
assert doc.metadata["page_id"] == "abc-123"
|
||||||
assert doc.metadata["connector_id"] == _CONNECTOR_ID
|
assert doc.metadata["connector_id"] == _CONNECTOR_ID
|
||||||
assert doc.metadata["document_type"] == "Notion Page"
|
assert doc.metadata["document_type"] == "Notion Page"
|
||||||
assert doc.metadata["connector_type"] == "Notion"
|
assert doc.metadata["connector_type"] == "Notion"
|
||||||
assert doc.fallback_summary is not None
|
|
||||||
assert "My Notion Page" in doc.fallback_summary
|
|
||||||
assert markdown in doc.fallback_summary
|
|
||||||
|
|
||||||
|
|
||||||
async def test_build_connector_doc_summary_disabled():
|
|
||||||
"""When enable_summary is False, should_summarize is False."""
|
|
||||||
doc = _build_connector_doc(
|
|
||||||
_make_page(),
|
|
||||||
"# content",
|
|
||||||
connector_id=_CONNECTOR_ID,
|
|
||||||
search_space_id=_SEARCH_SPACE_ID,
|
|
||||||
user_id=_USER_ID,
|
|
||||||
enable_summary=False,
|
|
||||||
)
|
|
||||||
|
|
||||||
assert doc.should_summarize is False
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
@ -81,10 +62,9 @@ async def test_build_connector_doc_summary_disabled():
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
def _mock_connector(enable_summary: bool = True):
|
def _mock_connector():
|
||||||
c = MagicMock()
|
c = MagicMock()
|
||||||
c.config = {"access_token": "tok"}
|
c.config = {"access_token": "tok"}
|
||||||
c.enable_summary = enable_summary
|
|
||||||
c.last_indexed_at = None
|
c.last_indexed_at = None
|
||||||
return c
|
return c
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -65,7 +65,6 @@ async def test_single_file_returns_one_connector_document(
|
||||||
connector_id=_CONNECTOR_ID,
|
connector_id=_CONNECTOR_ID,
|
||||||
search_space_id=_SEARCH_SPACE_ID,
|
search_space_id=_SEARCH_SPACE_ID,
|
||||||
user_id=_USER_ID,
|
user_id=_USER_ID,
|
||||||
enable_summary=True,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
assert len(docs) == 1
|
assert len(docs) == 1
|
||||||
|
|
@ -91,7 +90,6 @@ async def test_multiple_files_all_produce_documents(
|
||||||
connector_id=_CONNECTOR_ID,
|
connector_id=_CONNECTOR_ID,
|
||||||
search_space_id=_SEARCH_SPACE_ID,
|
search_space_id=_SEARCH_SPACE_ID,
|
||||||
user_id=_USER_ID,
|
user_id=_USER_ID,
|
||||||
enable_summary=True,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
assert len(docs) == 3
|
assert len(docs) == 3
|
||||||
|
|
@ -119,7 +117,6 @@ async def test_one_download_exception_does_not_block_others(
|
||||||
connector_id=_CONNECTOR_ID,
|
connector_id=_CONNECTOR_ID,
|
||||||
search_space_id=_SEARCH_SPACE_ID,
|
search_space_id=_SEARCH_SPACE_ID,
|
||||||
user_id=_USER_ID,
|
user_id=_USER_ID,
|
||||||
enable_summary=True,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
assert len(docs) == 2
|
assert len(docs) == 2
|
||||||
|
|
@ -146,7 +143,6 @@ async def test_etl_error_counts_as_download_failure(
|
||||||
connector_id=_CONNECTOR_ID,
|
connector_id=_CONNECTOR_ID,
|
||||||
search_space_id=_SEARCH_SPACE_ID,
|
search_space_id=_SEARCH_SPACE_ID,
|
||||||
user_id=_USER_ID,
|
user_id=_USER_ID,
|
||||||
enable_summary=True,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
assert len(docs) == 1
|
assert len(docs) == 1
|
||||||
|
|
@ -185,7 +181,6 @@ async def test_concurrency_bounded_by_semaphore(
|
||||||
connector_id=_CONNECTOR_ID,
|
connector_id=_CONNECTOR_ID,
|
||||||
search_space_id=_SEARCH_SPACE_ID,
|
search_space_id=_SEARCH_SPACE_ID,
|
||||||
user_id=_USER_ID,
|
user_id=_USER_ID,
|
||||||
enable_summary=True,
|
|
||||||
max_concurrency=2,
|
max_concurrency=2,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -225,7 +220,6 @@ async def test_heartbeat_fires_during_parallel_downloads(
|
||||||
connector_id=_CONNECTOR_ID,
|
connector_id=_CONNECTOR_ID,
|
||||||
search_space_id=_SEARCH_SPACE_ID,
|
search_space_id=_SEARCH_SPACE_ID,
|
||||||
user_id=_USER_ID,
|
user_id=_USER_ID,
|
||||||
enable_summary=True,
|
|
||||||
on_heartbeat=_on_heartbeat,
|
on_heartbeat=_on_heartbeat,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -180,7 +180,6 @@ async def _run_gdrive_selected(mocks, file_ids):
|
||||||
connector_id=_CONNECTOR_ID,
|
connector_id=_CONNECTOR_ID,
|
||||||
search_space_id=_SEARCH_SPACE_ID,
|
search_space_id=_SEARCH_SPACE_ID,
|
||||||
user_id=_USER_ID,
|
user_id=_USER_ID,
|
||||||
enable_summary=True,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -336,10 +335,6 @@ def gdrive_full_scan_mocks(monkeypatch):
|
||||||
monkeypatch.setattr(
|
monkeypatch.setattr(
|
||||||
_mod, "IndexingPipelineService", MagicMock(return_value=pipeline_mock)
|
_mod, "IndexingPipelineService", MagicMock(return_value=pipeline_mock)
|
||||||
)
|
)
|
||||||
monkeypatch.setattr(
|
|
||||||
_mod, "get_user_long_context_llm", AsyncMock(return_value=MagicMock())
|
|
||||||
)
|
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"mod": _mod,
|
"mod": _mod,
|
||||||
"session": session,
|
"session": session,
|
||||||
|
|
@ -366,7 +361,6 @@ async def _run_gdrive_full_scan(mocks, max_files=500):
|
||||||
MagicMock(),
|
MagicMock(),
|
||||||
max_files,
|
max_files,
|
||||||
include_subfolders=False,
|
include_subfolders=False,
|
||||||
enable_summary=True,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -454,10 +448,6 @@ async def test_gdrive_delta_sync_skips_over_quota(monkeypatch):
|
||||||
monkeypatch.setattr(
|
monkeypatch.setattr(
|
||||||
_mod, "IndexingPipelineService", MagicMock(return_value=pipeline_mock)
|
_mod, "IndexingPipelineService", MagicMock(return_value=pipeline_mock)
|
||||||
)
|
)
|
||||||
monkeypatch.setattr(
|
|
||||||
_mod, "get_user_long_context_llm", AsyncMock(return_value=MagicMock())
|
|
||||||
)
|
|
||||||
|
|
||||||
mock_task_logger = MagicMock()
|
mock_task_logger = MagicMock()
|
||||||
mock_task_logger.log_task_progress = AsyncMock()
|
mock_task_logger.log_task_progress = AsyncMock()
|
||||||
|
|
||||||
|
|
@ -473,7 +463,6 @@ async def test_gdrive_delta_sync_skips_over_quota(monkeypatch):
|
||||||
mock_task_logger,
|
mock_task_logger,
|
||||||
MagicMock(),
|
MagicMock(),
|
||||||
max_files=500,
|
max_files=500,
|
||||||
enable_summary=True,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
call_files = download_mock.call_args[0][1]
|
call_files = download_mock.call_args[0][1]
|
||||||
|
|
@ -539,7 +528,6 @@ async def _run_onedrive_selected(mocks, file_ids):
|
||||||
connector_id=_CONNECTOR_ID,
|
connector_id=_CONNECTOR_ID,
|
||||||
search_space_id=_SEARCH_SPACE_ID,
|
search_space_id=_SEARCH_SPACE_ID,
|
||||||
user_id=_USER_ID,
|
user_id=_USER_ID,
|
||||||
enable_summary=True,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -641,7 +629,6 @@ async def _run_dropbox_selected(mocks, file_paths):
|
||||||
connector_id=_CONNECTOR_ID,
|
connector_id=_CONNECTOR_ID,
|
||||||
search_space_id=_SEARCH_SPACE_ID,
|
search_space_id=_SEARCH_SPACE_ID,
|
||||||
user_id=_USER_ID,
|
user_id=_USER_ID,
|
||||||
enable_summary=True,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -69,6 +69,13 @@ def _signed_slack_request(payload: dict, *, secret: str = "signing-secret") -> R
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _enable_slack_gateway(monkeypatch):
|
||||||
|
monkeypatch.setattr(routes.config, "GATEWAY_SLACK_ENABLED", True)
|
||||||
|
monkeypatch.setattr(routes.config, "GATEWAY_SLACK_CLIENT_ID", "client-id")
|
||||||
|
monkeypatch.setattr(routes.config, "GATEWAY_SLACK_CLIENT_SECRET", "client-secret")
|
||||||
|
monkeypatch.setattr(routes.config, "GATEWAY_SLACK_SIGNING_SECRET", "signing-secret")
|
||||||
|
|
||||||
|
|
||||||
async def _call_webhook(*, request: RequestStub, account_id: int, session):
|
async def _call_webhook(*, request: RequestStub, account_id: int, session):
|
||||||
return await routes.telegram_webhook(
|
return await routes.telegram_webhook(
|
||||||
request=request,
|
request=request,
|
||||||
|
|
@ -207,7 +214,7 @@ def test_verify_slack_signature_accepts_valid_signature():
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_slack_webhook_url_verification(monkeypatch, mocker):
|
async def test_slack_webhook_url_verification(monkeypatch, mocker):
|
||||||
monkeypatch.setattr(routes.config, "GATEWAY_SLACK_SIGNING_SECRET", "signing-secret")
|
_enable_slack_gateway(monkeypatch)
|
||||||
request = _signed_slack_request({"type": "url_verification", "challenge": "abc123"})
|
request = _signed_slack_request({"type": "url_verification", "challenge": "abc123"})
|
||||||
|
|
||||||
response = await routes.slack_webhook(request=request, session=mocker.AsyncMock())
|
response = await routes.slack_webhook(request=request, session=mocker.AsyncMock())
|
||||||
|
|
@ -218,7 +225,7 @@ async def test_slack_webhook_url_verification(monkeypatch, mocker):
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_slack_webhook_persists_event(monkeypatch, mocker):
|
async def test_slack_webhook_persists_event(monkeypatch, mocker):
|
||||||
monkeypatch.setattr(routes.config, "GATEWAY_SLACK_SIGNING_SECRET", "signing-secret")
|
_enable_slack_gateway(monkeypatch)
|
||||||
session = mocker.AsyncMock()
|
session = mocker.AsyncMock()
|
||||||
monkeypatch.setattr(routes, "get_slack_account_by_team", mocker.AsyncMock(return_value=_slack_account()))
|
monkeypatch.setattr(routes, "get_slack_account_by_team", mocker.AsyncMock(return_value=_slack_account()))
|
||||||
persist = mocker.AsyncMock(return_value=100)
|
persist = mocker.AsyncMock(return_value=100)
|
||||||
|
|
@ -248,7 +255,7 @@ async def test_slack_webhook_persists_event(monkeypatch, mocker):
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_slack_webhook_ignores_self_event(monkeypatch, mocker):
|
async def test_slack_webhook_ignores_self_event(monkeypatch, mocker):
|
||||||
monkeypatch.setattr(routes.config, "GATEWAY_SLACK_SIGNING_SECRET", "signing-secret")
|
_enable_slack_gateway(monkeypatch)
|
||||||
session = mocker.AsyncMock()
|
session = mocker.AsyncMock()
|
||||||
monkeypatch.setattr(routes, "get_slack_account_by_team", mocker.AsyncMock(return_value=_slack_account()))
|
monkeypatch.setattr(routes, "get_slack_account_by_team", mocker.AsyncMock(return_value=_slack_account()))
|
||||||
persist = mocker.AsyncMock(return_value=100)
|
persist = mocker.AsyncMock(return_value=100)
|
||||||
|
|
@ -275,7 +282,7 @@ async def test_slack_webhook_ignores_self_event(monkeypatch, mocker):
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_discord_gateway_install_returns_oauth_url(monkeypatch):
|
async def test_discord_gateway_install_returns_oauth_url(monkeypatch, mocker):
|
||||||
monkeypatch.setattr(routes.config, "DISCORD_CLIENT_ID", "discord-client")
|
monkeypatch.setattr(routes.config, "DISCORD_CLIENT_ID", "discord-client")
|
||||||
monkeypatch.setattr(
|
monkeypatch.setattr(
|
||||||
routes.config,
|
routes.config,
|
||||||
|
|
@ -283,10 +290,12 @@ async def test_discord_gateway_install_returns_oauth_url(monkeypatch):
|
||||||
"http://localhost:8000/api/v1/gateway/discord/callback",
|
"http://localhost:8000/api/v1/gateway/discord/callback",
|
||||||
)
|
)
|
||||||
monkeypatch.setattr(routes.config, "SECRET_KEY", "test-secret")
|
monkeypatch.setattr(routes.config, "SECRET_KEY", "test-secret")
|
||||||
|
monkeypatch.setattr(routes, "check_search_space_access", mocker.AsyncMock())
|
||||||
|
|
||||||
response = await routes.install_discord_gateway(
|
response = await routes.install_discord_gateway(
|
||||||
search_space_id=123,
|
search_space_id=123,
|
||||||
user=SimpleNamespace(id="00000000-0000-0000-0000-000000000001"),
|
user=SimpleNamespace(id="00000000-0000-0000-0000-000000000001"),
|
||||||
|
session=mocker.AsyncMock(),
|
||||||
)
|
)
|
||||||
|
|
||||||
assert response["auth_url"].startswith("https://discord.com/api/oauth2/authorize?")
|
assert response["auth_url"].startswith("https://discord.com/api/oauth2/authorize?")
|
||||||
|
|
|
||||||
|
|
@ -18,7 +18,6 @@ def test_valid_document_created_with_required_fields():
|
||||||
connector_id=42,
|
connector_id=42,
|
||||||
created_by_id="00000000-0000-0000-0000-000000000001",
|
created_by_id="00000000-0000-0000-0000-000000000001",
|
||||||
)
|
)
|
||||||
assert doc.should_summarize is True
|
|
||||||
assert doc.should_use_code_chunker is False
|
assert doc.should_use_code_chunker is False
|
||||||
assert doc.metadata == {}
|
assert doc.metadata == {}
|
||||||
assert doc.connector_id == 42
|
assert doc.connector_id == 42
|
||||||
|
|
|
||||||
|
|
@ -1,41 +0,0 @@
|
||||||
from unittest.mock import MagicMock
|
|
||||||
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
from app.indexing_pipeline.document_summarizer import summarize_document
|
|
||||||
|
|
||||||
pytestmark = pytest.mark.unit
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.usefixtures("patched_summarizer_chain")
|
|
||||||
async def test_without_metadata_returns_raw_summary():
|
|
||||||
"""Summarizer returns the LLM output directly when no metadata is provided."""
|
|
||||||
result = await summarize_document("# Content", llm=MagicMock(model="gpt-4"))
|
|
||||||
|
|
||||||
assert result == "The summary."
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.usefixtures("patched_summarizer_chain")
|
|
||||||
async def test_with_metadata_includes_metadata_values_in_output():
|
|
||||||
"""Non-empty metadata values are prepended to the summary output."""
|
|
||||||
result = await summarize_document(
|
|
||||||
"# Content",
|
|
||||||
llm=MagicMock(model="gpt-4"),
|
|
||||||
metadata={"author": "Alice", "source": "Notion"},
|
|
||||||
)
|
|
||||||
|
|
||||||
assert "Alice" in result
|
|
||||||
assert "Notion" in result
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.usefixtures("patched_summarizer_chain")
|
|
||||||
async def test_with_metadata_omits_empty_fields_from_output():
|
|
||||||
"""Empty metadata fields are omitted from the summary output."""
|
|
||||||
result = await summarize_document(
|
|
||||||
"# Content",
|
|
||||||
llm=MagicMock(model="gpt-4"),
|
|
||||||
metadata={"author": "Alice", "description": ""},
|
|
||||||
)
|
|
||||||
|
|
||||||
assert "Alice" in result
|
|
||||||
assert "description" not in result.lower()
|
|
||||||
|
|
@ -37,12 +37,10 @@ async def test_calls_prepare_then_index_per_document(pipeline, make_connector_do
|
||||||
orm2 = MagicMock(spec=Document)
|
orm2 = MagicMock(spec=Document)
|
||||||
orm2.unique_identifier_hash = compute_unique_identifier_hash(doc2)
|
orm2.unique_identifier_hash = compute_unique_identifier_hash(doc2)
|
||||||
|
|
||||||
mock_llm = MagicMock()
|
|
||||||
|
|
||||||
pipeline.prepare_for_indexing = AsyncMock(return_value=[orm1, orm2])
|
pipeline.prepare_for_indexing = AsyncMock(return_value=[orm1, orm2])
|
||||||
pipeline.index = AsyncMock(side_effect=lambda doc, cdoc, llm: doc)
|
pipeline.index = AsyncMock(side_effect=lambda doc, cdoc: doc)
|
||||||
|
|
||||||
results = await pipeline.index_batch([doc1, doc2], mock_llm)
|
results = await pipeline.index_batch([doc1, doc2])
|
||||||
|
|
||||||
pipeline.prepare_for_indexing.assert_awaited_once_with([doc1, doc2])
|
pipeline.prepare_for_indexing.assert_awaited_once_with([doc1, doc2])
|
||||||
assert pipeline.index.await_count == 2
|
assert pipeline.index.await_count == 2
|
||||||
|
|
@ -53,7 +51,7 @@ async def test_empty_input_returns_empty(pipeline):
|
||||||
"""Empty connector_docs list returns empty result."""
|
"""Empty connector_docs list returns empty result."""
|
||||||
pipeline.prepare_for_indexing = AsyncMock(return_value=[])
|
pipeline.prepare_for_indexing = AsyncMock(return_value=[])
|
||||||
|
|
||||||
results = await pipeline.index_batch([], MagicMock())
|
results = await pipeline.index_batch([])
|
||||||
|
|
||||||
assert results == []
|
assert results == []
|
||||||
|
|
||||||
|
|
@ -74,7 +72,7 @@ async def test_skips_document_without_matching_connector_doc(
|
||||||
pipeline.prepare_for_indexing = AsyncMock(return_value=[orphan_orm])
|
pipeline.prepare_for_indexing = AsyncMock(return_value=[orphan_orm])
|
||||||
pipeline.index = AsyncMock()
|
pipeline.index = AsyncMock()
|
||||||
|
|
||||||
results = await pipeline.index_batch([doc1], MagicMock())
|
results = await pipeline.index_batch([doc1])
|
||||||
|
|
||||||
pipeline.index.assert_not_awaited()
|
pipeline.index.assert_not_awaited()
|
||||||
assert results == []
|
assert results == []
|
||||||
|
|
|
||||||
|
|
@ -51,11 +51,6 @@ async def test_index_calls_embed_and_chunk_via_to_thread(
|
||||||
return await original_to_thread(func, *args, **kwargs)
|
return await original_to_thread(func, *args, **kwargs)
|
||||||
|
|
||||||
monkeypatch.setattr(asyncio, "to_thread", tracking_to_thread)
|
monkeypatch.setattr(asyncio, "to_thread", tracking_to_thread)
|
||||||
|
|
||||||
monkeypatch.setattr(
|
|
||||||
"app.indexing_pipeline.indexing_pipeline_service.summarize_document",
|
|
||||||
AsyncMock(return_value="Summary."),
|
|
||||||
)
|
|
||||||
mock_chunk_hybrid = MagicMock(return_value=["chunk1"])
|
mock_chunk_hybrid = MagicMock(return_value=["chunk1"])
|
||||||
mock_chunk_hybrid.__name__ = "chunk_text_hybrid"
|
mock_chunk_hybrid.__name__ = "chunk_text_hybrid"
|
||||||
monkeypatch.setattr(
|
monkeypatch.setattr(
|
||||||
|
|
@ -85,7 +80,7 @@ async def test_index_calls_embed_and_chunk_via_to_thread(
|
||||||
document.id = 1
|
document.id = 1
|
||||||
document.status = DocumentStatus.pending()
|
document.status = DocumentStatus.pending()
|
||||||
|
|
||||||
await pipeline.index(document, connector_doc, llm=MagicMock())
|
await pipeline.index(document, connector_doc)
|
||||||
|
|
||||||
# Either chunker entry point satisfies the "chunking runs off the event
|
# Either chunker entry point satisfies the "chunking runs off the event
|
||||||
# loop" contract this test guards. Routing between the two is verified
|
# loop" contract this test guards. Routing between the two is verified
|
||||||
|
|
@ -104,10 +99,6 @@ async def test_non_code_documents_use_hybrid_chunker(
|
||||||
mid-row. Only documents flagged with ``should_use_code_chunker=True``
|
mid-row. Only documents flagged with ``should_use_code_chunker=True``
|
||||||
should take the ``chunk_text`` path.
|
should take the ``chunk_text`` path.
|
||||||
"""
|
"""
|
||||||
monkeypatch.setattr(
|
|
||||||
"app.indexing_pipeline.indexing_pipeline_service.summarize_document",
|
|
||||||
AsyncMock(return_value="Summary."),
|
|
||||||
)
|
|
||||||
mock_chunk_hybrid = MagicMock(return_value=["chunk1"])
|
mock_chunk_hybrid = MagicMock(return_value=["chunk1"])
|
||||||
mock_chunk_hybrid.__name__ = "chunk_text_hybrid"
|
mock_chunk_hybrid.__name__ = "chunk_text_hybrid"
|
||||||
monkeypatch.setattr(
|
monkeypatch.setattr(
|
||||||
|
|
@ -139,7 +130,7 @@ async def test_non_code_documents_use_hybrid_chunker(
|
||||||
document.id = 1
|
document.id = 1
|
||||||
document.status = DocumentStatus.pending()
|
document.status = DocumentStatus.pending()
|
||||||
|
|
||||||
await pipeline.index(document, connector_doc, llm=MagicMock())
|
await pipeline.index(document, connector_doc)
|
||||||
|
|
||||||
mock_chunk_hybrid.assert_called_once()
|
mock_chunk_hybrid.assert_called_once()
|
||||||
mock_chunk_code.assert_not_called()
|
mock_chunk_code.assert_not_called()
|
||||||
|
|
@ -192,19 +183,14 @@ async def test_batch_parallel_indexes_all_documents(
|
||||||
|
|
||||||
index_calls = []
|
index_calls = []
|
||||||
|
|
||||||
async def fake_index(self, document, connector_doc, llm):
|
async def fake_index(self, document, connector_doc):
|
||||||
index_calls.append(document.id)
|
index_calls.append(document.id)
|
||||||
document.status = DocumentStatus.ready()
|
document.status = DocumentStatus.ready()
|
||||||
return document
|
return document
|
||||||
|
|
||||||
monkeypatch.setattr(IndexingPipelineService, "index", fake_index)
|
monkeypatch.setattr(IndexingPipelineService, "index", fake_index)
|
||||||
|
|
||||||
async def mock_get_llm(session):
|
_, indexed, failed = await pipeline.index_batch_parallel(docs, max_concurrency=2)
|
||||||
return MagicMock()
|
|
||||||
|
|
||||||
_, indexed, failed = await pipeline.index_batch_parallel(
|
|
||||||
docs, mock_get_llm, max_concurrency=2
|
|
||||||
)
|
|
||||||
|
|
||||||
assert indexed == 3
|
assert indexed == 3
|
||||||
assert failed == 0
|
assert failed == 0
|
||||||
|
|
@ -233,20 +219,15 @@ async def test_batch_parallel_one_failure_does_not_affect_others(
|
||||||
_mock_session_factory(orm_by_id),
|
_mock_session_factory(orm_by_id),
|
||||||
)
|
)
|
||||||
|
|
||||||
async def failing_index(self, document, connector_doc, llm):
|
async def failing_index(self, document, connector_doc):
|
||||||
if document.id == 2:
|
if document.id == 2:
|
||||||
raise RuntimeError("LLM exploded")
|
raise RuntimeError("Indexing exploded")
|
||||||
document.status = DocumentStatus.ready()
|
document.status = DocumentStatus.ready()
|
||||||
return document
|
return document
|
||||||
|
|
||||||
monkeypatch.setattr(IndexingPipelineService, "index", failing_index)
|
monkeypatch.setattr(IndexingPipelineService, "index", failing_index)
|
||||||
|
|
||||||
async def mock_get_llm(session):
|
_, indexed, failed = await pipeline.index_batch_parallel(docs, max_concurrency=4)
|
||||||
return MagicMock()
|
|
||||||
|
|
||||||
_, indexed, failed = await pipeline.index_batch_parallel(
|
|
||||||
docs, mock_get_llm, max_concurrency=4
|
|
||||||
)
|
|
||||||
|
|
||||||
assert indexed == 2
|
assert indexed == 2
|
||||||
assert failed == 1
|
assert failed == 1
|
||||||
|
|
|
||||||
|
|
@ -246,6 +246,8 @@ def test_new_chat_runtime_context_prefers_accepted_folder_ids() -> None:
|
||||||
mentioned_document_ids=[1, 2],
|
mentioned_document_ids=[1, 2],
|
||||||
accepted_folder_ids=[10],
|
accepted_folder_ids=[10],
|
||||||
mentioned_folder_ids=[20, 30],
|
mentioned_folder_ids=[20, 30],
|
||||||
|
mentioned_connector_ids=None,
|
||||||
|
mentioned_connectors=None,
|
||||||
request_id="req",
|
request_id="req",
|
||||||
turn_id="t1",
|
turn_id="t1",
|
||||||
)
|
)
|
||||||
|
|
@ -263,6 +265,8 @@ def test_new_chat_runtime_context_falls_back_to_mentioned_folder_ids() -> None:
|
||||||
mentioned_document_ids=None,
|
mentioned_document_ids=None,
|
||||||
accepted_folder_ids=[],
|
accepted_folder_ids=[],
|
||||||
mentioned_folder_ids=[20, 30],
|
mentioned_folder_ids=[20, 30],
|
||||||
|
mentioned_connector_ids=None,
|
||||||
|
mentioned_connectors=None,
|
||||||
request_id=None,
|
request_id=None,
|
||||||
turn_id="t2",
|
turn_id="t2",
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -137,15 +137,14 @@ Notes:
|
||||||
- `--skip-unanswerable` (run) — drop unanswerable questions
|
- `--skip-unanswerable` (run) — drop unanswerable questions
|
||||||
- `--docs <a.pdf>,<b.pdf>` (run) — scope to specific docs
|
- `--docs <a.pdf>,<b.pdf>` (run) — scope to specific docs
|
||||||
|
|
||||||
## Ingestion knobs (vision LLM, processing mode, summarize)
|
## Ingestion knobs (vision LLM, processing mode)
|
||||||
|
|
||||||
The harness exposes `POST /api/v1/documents/fileupload`'s three knobs on every `ingest` subcommand:
|
The harness exposes `POST /api/v1/documents/fileupload`'s ingest knobs on every `ingest` subcommand:
|
||||||
|
|
||||||
| Flag pair | Effect |
|
| Flag pair | Effect |
|
||||||
|--------------------------------------------|-----------------------------------------------------------------------------------------|
|
|--------------------------------------------|-----------------------------------------------------------------------------------------|
|
||||||
| `--use-vision-llm` / `--no-vision-llm` | Walk every embedded image in the PDF and inline image-derived text at the image's position (see below). |
|
| `--use-vision-llm` / `--no-vision-llm` | Walk every embedded image in the PDF and inline image-derived text at the image's position (see below). |
|
||||||
| `--processing-mode {basic,premium}` | `premium` carries a 10× page multiplier and routes to a stronger ETL (e.g. LlamaCloud). |
|
| `--processing-mode {basic,premium}` | `premium` carries a 10× page multiplier and routes to a stronger ETL (e.g. LlamaCloud). |
|
||||||
| `--should-summarize` / `--no-summarize` | Generate a per-document summary at ingest. |
|
|
||||||
|
|
||||||
The "Default ingest" column in the benchmarks table is what runs if you don't pass any flag. Whatever was actually used is recorded as a `__settings__` header in the doc map (`data/<suite>/maps/<benchmark>_*_map.jsonl`) and as `extra.ingest_settings` in `run_artifact.json`, then surfaced in the report — no need to hunt through CLI history.
|
The "Default ingest" column in the benchmarks table is what runs if you don't pass any flag. Whatever was actually used is recorded as a `__settings__` header in the doc map (`data/<suite>/maps/<benchmark>_*_map.jsonl`) and as `extra.ingest_settings` in `run_artifact.json`, then surfaced in the report — no need to hunt through CLI history.
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -110,7 +110,6 @@ class DocumentsClient:
|
||||||
files: Iterable[Path],
|
files: Iterable[Path],
|
||||||
*,
|
*,
|
||||||
search_space_id: int,
|
search_space_id: int,
|
||||||
should_summarize: bool = False,
|
|
||||||
use_vision_llm: bool = False,
|
use_vision_llm: bool = False,
|
||||||
processing_mode: str = "basic",
|
processing_mode: str = "basic",
|
||||||
) -> FileUploadResult:
|
) -> FileUploadResult:
|
||||||
|
|
@ -149,7 +148,6 @@ class DocumentsClient:
|
||||||
f"{self._base}/api/v1/documents/fileupload",
|
f"{self._base}/api/v1/documents/fileupload",
|
||||||
data={
|
data={
|
||||||
"search_space_id": str(search_space_id),
|
"search_space_id": str(search_space_id),
|
||||||
"should_summarize": "true" if should_summarize else "false",
|
|
||||||
"use_vision_llm": "true" if use_vision_llm else "false",
|
"use_vision_llm": "true" if use_vision_llm else "false",
|
||||||
"processing_mode": processing_mode,
|
"processing_mode": processing_mode,
|
||||||
},
|
},
|
||||||
|
|
|
||||||
|
|
@ -83,7 +83,6 @@ class LlmPreferences:
|
||||||
"""
|
"""
|
||||||
|
|
||||||
agent_llm_id: int | None
|
agent_llm_id: int | None
|
||||||
document_summary_llm_id: int | None
|
|
||||||
image_generation_config_id: int | None
|
image_generation_config_id: int | None
|
||||||
vision_llm_config_id: int | None
|
vision_llm_config_id: int | None
|
||||||
agent_llm: dict[str, Any] | None
|
agent_llm: dict[str, Any] | None
|
||||||
|
|
@ -93,7 +92,6 @@ class LlmPreferences:
|
||||||
def from_payload(cls, payload: dict[str, Any]) -> LlmPreferences:
|
def from_payload(cls, payload: dict[str, Any]) -> LlmPreferences:
|
||||||
return cls(
|
return cls(
|
||||||
agent_llm_id=payload.get("agent_llm_id"),
|
agent_llm_id=payload.get("agent_llm_id"),
|
||||||
document_summary_llm_id=payload.get("document_summary_llm_id"),
|
|
||||||
image_generation_config_id=payload.get("image_generation_config_id"),
|
image_generation_config_id=payload.get("image_generation_config_id"),
|
||||||
vision_llm_config_id=payload.get("vision_llm_config_id"),
|
vision_llm_config_id=payload.get("vision_llm_config_id"),
|
||||||
agent_llm=payload.get("agent_llm"),
|
agent_llm=payload.get("agent_llm"),
|
||||||
|
|
@ -154,7 +152,6 @@ class SearchSpaceClient:
|
||||||
search_space_id: int,
|
search_space_id: int,
|
||||||
*,
|
*,
|
||||||
agent_llm_id: int | None = None,
|
agent_llm_id: int | None = None,
|
||||||
document_summary_llm_id: int | None = None,
|
|
||||||
image_generation_config_id: int | None = None,
|
image_generation_config_id: int | None = None,
|
||||||
vision_llm_config_id: int | None = None,
|
vision_llm_config_id: int | None = None,
|
||||||
) -> LlmPreferences:
|
) -> LlmPreferences:
|
||||||
|
|
@ -167,8 +164,6 @@ class SearchSpaceClient:
|
||||||
body: dict[str, Any] = {}
|
body: dict[str, Any] = {}
|
||||||
if agent_llm_id is not None:
|
if agent_llm_id is not None:
|
||||||
body["agent_llm_id"] = agent_llm_id
|
body["agent_llm_id"] = agent_llm_id
|
||||||
if document_summary_llm_id is not None:
|
|
||||||
body["document_summary_llm_id"] = document_summary_llm_id
|
|
||||||
if image_generation_config_id is not None:
|
if image_generation_config_id is not None:
|
||||||
body["image_generation_config_id"] = image_generation_config_id
|
body["image_generation_config_id"] = image_generation_config_id
|
||||||
if vision_llm_config_id is not None:
|
if vision_llm_config_id is not None:
|
||||||
|
|
|
||||||
|
|
@ -8,15 +8,13 @@ exactly three knobs (verified at
|
||||||
* ``processing_mode`` — ``"basic"`` (default) | ``"premium"``
|
* ``processing_mode`` — ``"basic"`` (default) | ``"premium"``
|
||||||
* ``use_vision_llm`` — ``bool`` (run vision LLM during ingest to
|
* ``use_vision_llm`` — ``bool`` (run vision LLM during ingest to
|
||||||
extract image content / captions / tables)
|
extract image content / captions / tables)
|
||||||
* ``should_summarize`` — ``bool`` (generate document summary)
|
|
||||||
|
|
||||||
This module gives every benchmark a uniform way to:
|
This module gives every benchmark a uniform way to:
|
||||||
|
|
||||||
1. Receive sensible per-benchmark defaults (text-only benchmarks
|
1. Receive sensible per-benchmark defaults (text-only benchmarks
|
||||||
default vision off; image-bearing benchmarks default vision on).
|
default vision off; image-bearing benchmarks default vision on).
|
||||||
2. Accept CLI overrides (``--use-vision-llm`` / ``--no-vision-llm``,
|
2. Accept CLI overrides (``--use-vision-llm`` / ``--no-vision-llm``,
|
||||||
``--processing-mode {basic,premium}``,
|
``--processing-mode {basic,premium}``).
|
||||||
``--should-summarize`` / ``--no-summarize``).
|
|
||||||
3. Persist the *actual* settings used into the doc-map manifest and
|
3. Persist the *actual* settings used into the doc-map manifest and
|
||||||
the run artifact so reports can show "vision=ON, mode=premium →
|
the run artifact so reports can show "vision=ON, mode=premium →
|
||||||
65% accuracy" head-to-head with "vision=OFF, mode=basic → 52%".
|
65% accuracy" head-to-head with "vision=OFF, mode=basic → 52%".
|
||||||
|
|
@ -71,13 +69,11 @@ class IngestSettings:
|
||||||
|
|
||||||
use_vision_llm: bool = False
|
use_vision_llm: bool = False
|
||||||
processing_mode: str = "basic"
|
processing_mode: str = "basic"
|
||||||
should_summarize: bool = False
|
|
||||||
|
|
||||||
def to_dict(self) -> dict[str, Any]:
|
def to_dict(self) -> dict[str, Any]:
|
||||||
return {
|
return {
|
||||||
"use_vision_llm": self.use_vision_llm,
|
"use_vision_llm": self.use_vision_llm,
|
||||||
"processing_mode": self.processing_mode,
|
"processing_mode": self.processing_mode,
|
||||||
"should_summarize": self.should_summarize,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
|
|
@ -87,14 +83,13 @@ class IngestSettings:
|
||||||
``opts`` is the kwargs dict built by ``core.cli`` from the
|
``opts`` is the kwargs dict built by ``core.cli`` from the
|
||||||
argparse namespace (see ``_cmd_ingest`` / ``_cmd_run``). Keys
|
argparse namespace (see ``_cmd_ingest`` / ``_cmd_run``). Keys
|
||||||
we look for: ``use_vision_llm`` (bool or None), ``processing_mode``
|
we look for: ``use_vision_llm`` (bool or None), ``processing_mode``
|
||||||
(str or None), ``should_summarize`` (bool or None). Anything
|
(str or None). Anything
|
||||||
else is ignored so benchmarks can pass through their own opts.
|
else is ignored so benchmarks can pass through their own opts.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
return cls(
|
return cls(
|
||||||
use_vision_llm=_coerce_bool(opts.get("use_vision_llm"), defaults.use_vision_llm),
|
use_vision_llm=_coerce_bool(opts.get("use_vision_llm"), defaults.use_vision_llm),
|
||||||
processing_mode=_coerce_mode(opts.get("processing_mode"), defaults.processing_mode),
|
processing_mode=_coerce_mode(opts.get("processing_mode"), defaults.processing_mode),
|
||||||
should_summarize=_coerce_bool(opts.get("should_summarize"), defaults.should_summarize),
|
|
||||||
)
|
)
|
||||||
|
|
||||||
def render_label(self) -> str:
|
def render_label(self) -> str:
|
||||||
|
|
@ -102,8 +97,7 @@ class IngestSettings:
|
||||||
|
|
||||||
return (
|
return (
|
||||||
f"vision={'on' if self.use_vision_llm else 'off'}, "
|
f"vision={'on' if self.use_vision_llm else 'off'}, "
|
||||||
f"mode={self.processing_mode}, "
|
f"mode={self.processing_mode}"
|
||||||
f"summarize={'on' if self.should_summarize else 'off'}"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -179,14 +173,14 @@ def add_ingest_settings_args(
|
||||||
*,
|
*,
|
||||||
defaults: IngestSettings,
|
defaults: IngestSettings,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Attach the three ingest-settings flag pairs to ``parser``.
|
"""Attach ingest-settings flags to ``parser``.
|
||||||
|
|
||||||
Each bool exposes a mutually exclusive ``--foo`` / ``--no-foo``
|
The vision bool exposes a mutually exclusive ``--foo`` / ``--no-foo``
|
||||||
pair so an operator can flip either direction without restating
|
pair so an operator can flip either direction without restating every
|
||||||
every flag. Default is ``None`` so that "operator didn't pass the
|
flag. Default is ``None`` so that "operator didn't pass the flag" is
|
||||||
flag" is distinguishable from "operator explicitly passed false"
|
distinguishable from "operator explicitly passed false" —
|
||||||
— ``IngestSettings.merge`` then folds in the benchmark default
|
``IngestSettings.merge`` then folds in the benchmark default only when
|
||||||
only when the operator was silent.
|
the operator was silent.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
settings_group = parser.add_argument_group(
|
settings_group = parser.add_argument_group(
|
||||||
|
|
@ -217,18 +211,6 @@ def add_ingest_settings_args(
|
||||||
f"Default for this benchmark: {defaults.processing_mode!r}."
|
f"Default for this benchmark: {defaults.processing_mode!r}."
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
_add_bool_pair(
|
|
||||||
settings_group,
|
|
||||||
dest="should_summarize",
|
|
||||||
on_flag="--should-summarize",
|
|
||||||
off_flag="--no-summarize",
|
|
||||||
on_help=(
|
|
||||||
"Have SurfSense generate a document summary at ingest "
|
|
||||||
f"(default for this benchmark: "
|
|
||||||
f"{'on' if defaults.should_summarize else 'off'})."
|
|
||||||
),
|
|
||||||
off_help="Skip per-document summary generation.",
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
@ -292,10 +274,9 @@ def format_ingest_settings_md(settings: Any) -> str:
|
||||||
return "- SurfSense ingest settings: (not recorded — re-ingest to capture)"
|
return "- SurfSense ingest settings: (not recorded — re-ingest to capture)"
|
||||||
vision = "on" if settings.get("use_vision_llm") else "off"
|
vision = "on" if settings.get("use_vision_llm") else "off"
|
||||||
mode = settings.get("processing_mode") or "basic"
|
mode = settings.get("processing_mode") or "basic"
|
||||||
summarize = "on" if settings.get("should_summarize") else "off"
|
|
||||||
return (
|
return (
|
||||||
f"- SurfSense ingest settings: vision_llm=`{vision}`, "
|
f"- SurfSense ingest settings: vision_llm=`{vision}`, "
|
||||||
f"processing_mode=`{mode}`, summarize=`{summarize}`"
|
f"processing_mode=`{mode}`"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -160,8 +160,7 @@ async def run_ingest(
|
||||||
upload_result = await docs_client.upload(
|
upload_result = await docs_client.upload(
|
||||||
files=[b.path for b in batches],
|
files=[b.path for b in batches],
|
||||||
search_space_id=ctx.search_space_id,
|
search_space_id=ctx.search_space_id,
|
||||||
should_summarize=settings.should_summarize,
|
use_vision_llm=settings.use_vision_llm,
|
||||||
use_vision_llm=settings.use_vision_llm,
|
|
||||||
processing_mode=settings.processing_mode,
|
processing_mode=settings.processing_mode,
|
||||||
)
|
)
|
||||||
new_doc_ids = list(upload_result.document_ids)
|
new_doc_ids = list(upload_result.document_ids)
|
||||||
|
|
|
||||||
|
|
@ -63,7 +63,6 @@ _DESCRIPTION = "CUREv1 retrieval (single-arm SurfSense): Recall@k / MRR / nDCG@1
|
||||||
_DEFAULT_INGEST_SETTINGS = IngestSettings(
|
_DEFAULT_INGEST_SETTINGS = IngestSettings(
|
||||||
use_vision_llm=False,
|
use_vision_llm=False,
|
||||||
processing_mode="basic",
|
processing_mode="basic",
|
||||||
should_summarize=False,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -208,7 +208,6 @@ async def _upload_pdfs(
|
||||||
result = await docs_client.upload(
|
result = await docs_client.upload(
|
||||||
files=batch,
|
files=batch,
|
||||||
search_space_id=ctx.search_space_id,
|
search_space_id=ctx.search_space_id,
|
||||||
should_summarize=settings.should_summarize,
|
|
||||||
use_vision_llm=settings.use_vision_llm,
|
use_vision_llm=settings.use_vision_llm,
|
||||||
processing_mode=settings.processing_mode,
|
processing_mode=settings.processing_mode,
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -169,7 +169,6 @@ _DESCRIPTION = (
|
||||||
_DEFAULT_INGEST_SETTINGS = IngestSettings(
|
_DEFAULT_INGEST_SETTINGS = IngestSettings(
|
||||||
use_vision_llm=True,
|
use_vision_llm=True,
|
||||||
processing_mode="basic",
|
processing_mode="basic",
|
||||||
should_summarize=False,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -480,7 +480,6 @@ async def run_ingest(
|
||||||
upload_result = await docs_client.upload(
|
upload_result = await docs_client.upload(
|
||||||
files=[b.path for b in batches],
|
files=[b.path for b in batches],
|
||||||
search_space_id=ctx.search_space_id,
|
search_space_id=ctx.search_space_id,
|
||||||
should_summarize=settings.should_summarize,
|
|
||||||
use_vision_llm=settings.use_vision_llm,
|
use_vision_llm=settings.use_vision_llm,
|
||||||
processing_mode=settings.processing_mode,
|
processing_mode=settings.processing_mode,
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -48,7 +48,6 @@ _DESCRIPTION = "MIRAGE (7,663 medical MCQs) — single-arm SurfSense per-task ac
|
||||||
_DEFAULT_INGEST_SETTINGS = IngestSettings(
|
_DEFAULT_INGEST_SETTINGS = IngestSettings(
|
||||||
use_vision_llm=False,
|
use_vision_llm=False,
|
||||||
processing_mode="basic",
|
processing_mode="basic",
|
||||||
should_summarize=False,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -225,7 +225,6 @@ async def _upload_pdfs(
|
||||||
result = await docs_client.upload(
|
result = await docs_client.upload(
|
||||||
files=batch,
|
files=batch,
|
||||||
search_space_id=ctx.search_space_id,
|
search_space_id=ctx.search_space_id,
|
||||||
should_summarize=settings.should_summarize,
|
|
||||||
use_vision_llm=settings.use_vision_llm,
|
use_vision_llm=settings.use_vision_llm,
|
||||||
processing_mode=settings.processing_mode,
|
processing_mode=settings.processing_mode,
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -178,7 +178,6 @@ _TEXT_ONLY_HINTS = ("gpt-5.4-mini", "gpt-3.5", "text-only", "instruct-")
|
||||||
_DEFAULT_INGEST_SETTINGS = IngestSettings(
|
_DEFAULT_INGEST_SETTINGS = IngestSettings(
|
||||||
use_vision_llm=True,
|
use_vision_llm=True,
|
||||||
processing_mode="basic",
|
processing_mode="basic",
|
||||||
should_summarize=False,
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
Some files were not shown because too many files have changed in this diff Show more
Loading…
Add table
Add a link
Reference in a new issue