feat: implement time-based filtering for document retrieval using 'updated_at' timestamp

This commit is contained in:
DESKTOP-RTLN3BA\$punk 2025-12-12 02:42:20 -08:00
parent d97136792c
commit 919c323ef3
6 changed files with 324 additions and 16 deletions

View file

@ -0,0 +1,40 @@
"""47_copy_created_at_to_updated_at
Revision ID: 47
Revises: 46
Create Date: 2025-12-12
Copies created_at values to updated_at for all documents where updated_at is NULL.
This ensures time-based filtering in retrievers works correctly for all documents.
"""
from collections.abc import Sequence
from alembic import op
# Revision identifiers, used by Alembic.
# Identifier of this migration.
revision: str = "47"
# Identifier of the migration this one revises.
down_revision: str | None = "46"
# No branch labels and no extra revision dependencies are declared.
branch_labels: str | Sequence[str] | None = None
depends_on: str | Sequence[str] | None = None
def upgrade() -> None:
    """Upgrade schema - backfill documents.updated_at from created_at.

    Only rows whose updated_at is NULL and whose created_at is not NULL are
    updated; rows that already have an updated_at value are left unchanged.
    """
    # The statement text is kept flush-left so it is passed on exactly as written.
    backfill_statement = """
UPDATE documents
SET updated_at = created_at
WHERE updated_at IS NULL
AND created_at IS NOT NULL
"""
    op.execute(backfill_statement)
def downgrade() -> None:
    """Downgrade schema - intentionally a no-op.

    The upgrade copies created_at into updated_at wherever updated_at was
    NULL. Afterwards those rows cannot be distinguished from rows whose
    updated_at was set by other sources, so reverting would be lossy. This
    function therefore leaves every updated_at value in place and does not
    reset any of them to NULL.
    """
    # For safety, nothing is reverted automatically.
    return None

View file

@ -1,6 +1,7 @@
import json
import logging
import traceback
from datetime import UTC, datetime, timedelta
from typing import Any
from langchain_core.messages import HumanMessage, SystemMessage
@ -21,6 +22,9 @@ from .qna_agent.graph import graph as qna_agent_graph
from .state import State
from .utils import get_connector_emoji, get_connector_friendly_name
# Time filter constants - hardcoded 2 year time range for now.
# handle_qna_workflow turns this into a window of DEFAULT_TIME_FILTER_YEARS * 365
# days ending at the current UTC time, so leap days are not accounted for.
DEFAULT_TIME_FILTER_YEARS = 2
def extract_sources_from_documents(
all_documents: list[dict[str, Any]],
@ -524,6 +528,8 @@ async def fetch_relevant_documents(
connector_service: ConnectorService = None,
search_mode: SearchMode = SearchMode.CHUNKS,
user_selected_sources: list[dict[str, Any]] | None = None,
start_date: datetime | None = None,
end_date: datetime | None = None,
) -> list[dict[str, Any]]:
"""
Fetch relevant documents for research questions using the provided connectors.
@ -542,6 +548,10 @@ async def fetch_relevant_documents(
state: The current state containing the streaming service
top_k: Number of top results to retrieve per connector per question
connector_service: An initialized connector service to use for searching
search_mode: Search mode (CHUNKS or DOCUMENTS)
user_selected_sources: Optional list of user-selected source objects
start_date: Optional start date for filtering documents by updated_at
end_date: Optional end date for filtering documents by updated_at
Returns:
List of relevant documents
@ -620,6 +630,8 @@ async def fetch_relevant_documents(
search_space_id=search_space_id,
top_k=top_k,
search_mode=search_mode,
start_date=start_date,
end_date=end_date,
)
# Add to sources and raw documents
@ -646,6 +658,8 @@ async def fetch_relevant_documents(
search_space_id=search_space_id,
top_k=top_k,
search_mode=search_mode,
start_date=start_date,
end_date=end_date,
)
# Add to sources and raw documents
@ -672,6 +686,8 @@ async def fetch_relevant_documents(
search_space_id=search_space_id,
top_k=top_k,
search_mode=search_mode,
start_date=start_date,
end_date=end_date,
)
# Add to sources and raw documents
@ -695,6 +711,8 @@ async def fetch_relevant_documents(
search_space_id=search_space_id,
top_k=top_k,
search_mode=search_mode,
start_date=start_date,
end_date=end_date,
)
# Add to sources and raw documents
@ -718,6 +736,8 @@ async def fetch_relevant_documents(
search_space_id=search_space_id,
top_k=top_k,
search_mode=search_mode,
start_date=start_date,
end_date=end_date,
)
# Add to sources and raw documents
@ -744,6 +764,8 @@ async def fetch_relevant_documents(
search_space_id=search_space_id,
top_k=top_k,
search_mode=search_mode,
start_date=start_date,
end_date=end_date,
)
# Add to sources and raw documents
@ -770,6 +792,8 @@ async def fetch_relevant_documents(
search_space_id=search_space_id,
top_k=top_k,
search_mode=search_mode,
start_date=start_date,
end_date=end_date,
)
# Add to sources and raw documents
@ -796,6 +820,8 @@ async def fetch_relevant_documents(
search_space_id=search_space_id,
top_k=top_k,
search_mode=search_mode,
start_date=start_date,
end_date=end_date,
)
# Add to sources and raw documents
@ -922,6 +948,8 @@ async def fetch_relevant_documents(
search_space_id=search_space_id,
top_k=top_k,
search_mode=search_mode,
start_date=start_date,
end_date=end_date,
)
# Add to sources and raw documents
if source_object:
@ -943,6 +971,8 @@ async def fetch_relevant_documents(
search_space_id=search_space_id,
top_k=top_k,
search_mode=search_mode,
start_date=start_date,
end_date=end_date,
)
# Add to sources and raw documents
@ -968,6 +998,8 @@ async def fetch_relevant_documents(
search_space_id=search_space_id,
top_k=top_k,
search_mode=search_mode,
start_date=start_date,
end_date=end_date,
)
# Add to sources and raw documents
@ -993,6 +1025,8 @@ async def fetch_relevant_documents(
search_space_id=search_space_id,
top_k=top_k,
search_mode=search_mode,
start_date=start_date,
end_date=end_date,
)
# Add to sources and raw documents
@ -1018,6 +1052,8 @@ async def fetch_relevant_documents(
search_space_id=search_space_id,
top_k=top_k,
search_mode=search_mode,
start_date=start_date,
end_date=end_date,
)
# Add to sources and raw documents
@ -1043,6 +1079,8 @@ async def fetch_relevant_documents(
search_space_id=search_space_id,
top_k=top_k,
search_mode=search_mode,
start_date=start_date,
end_date=end_date,
)
# Add to sources and raw documents
@ -1068,6 +1106,8 @@ async def fetch_relevant_documents(
search_space_id=search_space_id,
top_k=top_k,
search_mode=search_mode,
start_date=start_date,
end_date=end_date,
)
# Add to sources and raw documents
@ -1094,6 +1134,8 @@ async def fetch_relevant_documents(
search_space_id=search_space_id,
top_k=top_k,
search_mode=search_mode,
start_date=start_date,
end_date=end_date,
)
# Add to sources and raw documents
@ -1120,6 +1162,8 @@ async def fetch_relevant_documents(
search_space_id=search_space_id,
top_k=top_k,
search_mode=search_mode,
start_date=start_date,
end_date=end_date,
)
# Add to sources and raw documents
@ -1387,6 +1431,10 @@ async def handle_qna_workflow(
# Search with both the reformulated query and the original user query
research_questions = [reformulated_query, user_query]
# Calculate time filter: last 2 years from now (hardcoded for now)
end_date = datetime.now(UTC)
start_date = end_date - timedelta(days=DEFAULT_TIME_FILTER_YEARS * 365)
relevant_documents = await fetch_relevant_documents(
research_questions=research_questions,
search_space_id=configuration.search_space_id,
@ -1398,6 +1446,8 @@ async def handle_qna_workflow(
connector_service=connector_service,
search_mode=configuration.search_mode,
user_selected_sources=user_selected_sources,
start_date=start_date,
end_date=end_date,
)
except Exception as e:
error_message = f"Error fetching relevant documents for QNA: {e!s}"

View file

@ -25,8 +25,6 @@ from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_asyn
from sqlalchemy.orm import DeclarativeBase, Mapped, declared_attr, relationship
from app.config import config
from app.retriever.chunks_hybrid_search import ChucksHybridSearchRetriever
from app.retriever.documents_hybrid_search import DocumentHybridSearchRetriever
if config.AUTH_TYPE == "GOOGLE":
from fastapi_users.db import SQLAlchemyBaseOAuthAccountTableUUID
@ -799,18 +797,6 @@ else:
yield SQLAlchemyUserDatabase(session, User)
async def get_chucks_hybrid_search_retriever(
    session: AsyncSession = Depends(get_async_session),
):
    """Return a ChucksHybridSearchRetriever constructed with ``session``.

    Args:
        session: Session handed to the retriever; the default is the
            ``Depends(get_async_session)`` marker.

    Returns:
        A new ChucksHybridSearchRetriever instance.
    """
    chunk_retriever = ChucksHybridSearchRetriever(session)
    return chunk_retriever
async def get_documents_hybrid_search_retriever(
    session: AsyncSession = Depends(get_async_session),
):
    """Return a DocumentHybridSearchRetriever constructed with ``session``.

    Args:
        session: Session handed to the retriever; the default is the
            ``Depends(get_async_session)`` marker.

    Returns:
        A new DocumentHybridSearchRetriever instance.
    """
    document_retriever = DocumentHybridSearchRetriever(session)
    return document_retriever
def has_permission(user_permissions: list[str], required_permission: str) -> bool:
"""
Check if the user has the required permission.

View file

@ -1,3 +1,6 @@
from datetime import datetime
class ChucksHybridSearchRetriever:
def __init__(self, db_session):
"""
@ -13,6 +16,8 @@ class ChucksHybridSearchRetriever:
query_text: str,
top_k: int,
search_space_id: int,
start_date: datetime | None = None,
end_date: datetime | None = None,
) -> list:
"""
Perform vector similarity search on chunks.
@ -21,6 +26,8 @@ class ChucksHybridSearchRetriever:
query_text: The search query text
top_k: Number of results to return
search_space_id: The search space ID to search within
start_date: Optional start date for filtering documents by updated_at
end_date: Optional end date for filtering documents by updated_at
Returns:
List of chunks sorted by vector similarity
@ -43,6 +50,12 @@ class ChucksHybridSearchRetriever:
.where(Document.search_space_id == search_space_id)
)
# Add time-based filtering if provided
if start_date is not None:
query = query.where(Document.updated_at >= start_date)
if end_date is not None:
query = query.where(Document.updated_at <= end_date)
# Add vector similarity ordering
query = query.order_by(Chunk.embedding.op("<=>")(query_embedding)).limit(top_k)
@ -57,6 +70,8 @@ class ChucksHybridSearchRetriever:
query_text: str,
top_k: int,
search_space_id: int,
start_date: datetime | None = None,
end_date: datetime | None = None,
) -> list:
"""
Perform full-text keyword search on chunks.
@ -65,6 +80,8 @@ class ChucksHybridSearchRetriever:
query_text: The search query text
top_k: Number of results to return
search_space_id: The search space ID to search within
start_date: Optional start date for filtering documents by updated_at
end_date: Optional end date for filtering documents by updated_at
Returns:
List of chunks sorted by text relevance
@ -89,6 +106,12 @@ class ChucksHybridSearchRetriever:
) # Only include results that match the query
)
# Add time-based filtering if provided
if start_date is not None:
query = query.where(Document.updated_at >= start_date)
if end_date is not None:
query = query.where(Document.updated_at <= end_date)
# Add text search ranking
query = query.order_by(func.ts_rank_cd(tsvector, tsquery).desc()).limit(top_k)
@ -104,6 +127,8 @@ class ChucksHybridSearchRetriever:
top_k: int,
search_space_id: int,
document_type: str | None = None,
start_date: datetime | None = None,
end_date: datetime | None = None,
) -> list:
"""
Combine vector similarity and full-text search results using Reciprocal Rank Fusion.
@ -113,6 +138,8 @@ class ChucksHybridSearchRetriever:
top_k: Number of results to return
search_space_id: The search space ID to search within
document_type: Optional document type to filter results (e.g., "FILE", "CRAWLED_URL")
start_date: Optional start date for filtering documents by updated_at
end_date: Optional end date for filtering documents by updated_at
Returns:
List of dictionaries containing chunk data and relevance scores
@ -151,6 +178,12 @@ class ChucksHybridSearchRetriever:
else:
base_conditions.append(Document.document_type == document_type)
# Add time-based filtering if provided
if start_date is not None:
base_conditions.append(Document.updated_at >= start_date)
if end_date is not None:
base_conditions.append(Document.updated_at <= end_date)
# CTE for semantic search filtered by search space
semantic_search_cte = (
select(

View file

@ -1,3 +1,6 @@
from datetime import datetime
class DocumentHybridSearchRetriever:
def __init__(self, db_session):
"""
@ -13,6 +16,8 @@ class DocumentHybridSearchRetriever:
query_text: str,
top_k: int,
search_space_id: int,
start_date: datetime | None = None,
end_date: datetime | None = None,
) -> list:
"""
Perform vector similarity search on documents.
@ -21,6 +26,8 @@ class DocumentHybridSearchRetriever:
query_text: The search query text
top_k: Number of results to return
search_space_id: The search space ID to search within
start_date: Optional start date for filtering documents by updated_at
end_date: Optional end date for filtering documents by updated_at
Returns:
List of documents sorted by vector similarity
@ -42,6 +49,12 @@ class DocumentHybridSearchRetriever:
.where(Document.search_space_id == search_space_id)
)
# Add time-based filtering if provided
if start_date is not None:
query = query.where(Document.updated_at >= start_date)
if end_date is not None:
query = query.where(Document.updated_at <= end_date)
# Add vector similarity ordering
query = query.order_by(Document.embedding.op("<=>")(query_embedding)).limit(
top_k
@ -58,6 +71,8 @@ class DocumentHybridSearchRetriever:
query_text: str,
top_k: int,
search_space_id: int,
start_date: datetime | None = None,
end_date: datetime | None = None,
) -> list:
"""
Perform full-text keyword search on documents.
@ -66,6 +81,8 @@ class DocumentHybridSearchRetriever:
query_text: The search query text
top_k: Number of results to return
search_space_id: The search space ID to search within
start_date: Optional start date for filtering documents by updated_at
end_date: Optional end date for filtering documents by updated_at
Returns:
List of documents sorted by text relevance
@ -89,6 +106,12 @@ class DocumentHybridSearchRetriever:
) # Only include results that match the query
)
# Add time-based filtering if provided
if start_date is not None:
query = query.where(Document.updated_at >= start_date)
if end_date is not None:
query = query.where(Document.updated_at <= end_date)
# Add text search ranking
query = query.order_by(func.ts_rank_cd(tsvector, tsquery).desc()).limit(top_k)
@ -104,6 +127,8 @@ class DocumentHybridSearchRetriever:
top_k: int,
search_space_id: int,
document_type: str | None = None,
start_date: datetime | None = None,
end_date: datetime | None = None,
) -> list:
"""
Combine vector similarity and full-text search results using Reciprocal Rank Fusion.
@ -113,6 +138,8 @@ class DocumentHybridSearchRetriever:
top_k: Number of results to return
search_space_id: The search space ID to search within
document_type: Optional document type to filter results (e.g., "FILE", "CRAWLED_URL")
start_date: Optional start date for filtering documents by updated_at
end_date: Optional end date for filtering documents by updated_at
"""
from sqlalchemy import func, select, text
@ -149,6 +176,12 @@ class DocumentHybridSearchRetriever:
else:
base_conditions.append(Document.document_type == document_type)
# Add time-based filtering if provided
if start_date is not None:
base_conditions.append(Document.updated_at >= start_date)
if end_date is not None:
base_conditions.append(Document.updated_at <= end_date)
# CTE for semantic search filtered by search space
semantic_search_cte = select(
Document.id,

View file

@ -1,4 +1,5 @@
import asyncio
from datetime import datetime
from typing import Any
from urllib.parse import urljoin
@ -63,6 +64,8 @@ class ConnectorService:
search_space_id: int,
top_k: int = 20,
search_mode: SearchMode = SearchMode.CHUNKS,
start_date: datetime | None = None,
end_date: datetime | None = None,
) -> tuple:
"""
Search for crawled URLs and return both the source information and langchain documents
@ -72,6 +75,8 @@ class ConnectorService:
search_space_id: The search space ID to search in
top_k: Maximum number of results to return
search_mode: Search mode (CHUNKS or DOCUMENTS)
start_date: Optional start date for filtering documents by updated_at
end_date: Optional end date for filtering documents by updated_at
Returns:
tuple: (sources_info, langchain_documents)
@ -82,6 +87,8 @@ class ConnectorService:
top_k=top_k,
search_space_id=search_space_id,
document_type="CRAWLED_URL",
start_date=start_date,
end_date=end_date,
)
elif search_mode == SearchMode.DOCUMENTS:
crawled_urls_chunks = await self.document_retriever.hybrid_search(
@ -89,6 +96,8 @@ class ConnectorService:
top_k=top_k,
search_space_id=search_space_id,
document_type="CRAWLED_URL",
start_date=start_date,
end_date=end_date,
)
# Transform document retriever results to match expected format
crawled_urls_chunks = self._transform_document_results(crawled_urls_chunks)
@ -168,10 +177,20 @@ class ConnectorService:
search_space_id: int,
top_k: int = 20,
search_mode: SearchMode = SearchMode.CHUNKS,
start_date: datetime | None = None,
end_date: datetime | None = None,
) -> tuple:
"""
Search for files and return both the source information and langchain documents
Args:
user_query: The user's query
search_space_id: The search space ID to search in
top_k: Maximum number of results to return
search_mode: Search mode (CHUNKS or DOCUMENTS)
start_date: Optional start date for filtering documents by updated_at
end_date: Optional end date for filtering documents by updated_at
Returns:
tuple: (sources_info, langchain_documents)
"""
@ -181,6 +200,8 @@ class ConnectorService:
top_k=top_k,
search_space_id=search_space_id,
document_type="FILE",
start_date=start_date,
end_date=end_date,
)
elif search_mode == SearchMode.DOCUMENTS:
files_chunks = await self.document_retriever.hybrid_search(
@ -188,6 +209,8 @@ class ConnectorService:
top_k=top_k,
search_space_id=search_space_id,
document_type="FILE",
start_date=start_date,
end_date=end_date,
)
# Transform document retriever results to match expected format
files_chunks = self._transform_document_results(files_chunks)
@ -807,10 +830,20 @@ class ConnectorService:
search_space_id: int,
top_k: int = 20,
search_mode: SearchMode = SearchMode.CHUNKS,
start_date: datetime | None = None,
end_date: datetime | None = None,
) -> tuple:
"""
Search for slack and return both the source information and langchain documents
Args:
user_query: The user's query
search_space_id: The search space ID to search in
top_k: Maximum number of results to return
search_mode: Search mode (CHUNKS or DOCUMENTS)
start_date: Optional start date for filtering documents by updated_at
end_date: Optional end date for filtering documents by updated_at
Returns:
tuple: (sources_info, langchain_documents)
"""
@ -820,6 +853,8 @@ class ConnectorService:
top_k=top_k,
search_space_id=search_space_id,
document_type="SLACK_CONNECTOR",
start_date=start_date,
end_date=end_date,
)
elif search_mode == SearchMode.DOCUMENTS:
slack_chunks = await self.document_retriever.hybrid_search(
@ -827,6 +862,8 @@ class ConnectorService:
top_k=top_k,
search_space_id=search_space_id,
document_type="SLACK_CONNECTOR",
start_date=start_date,
end_date=end_date,
)
# Transform document retriever results to match expected format
slack_chunks = self._transform_document_results(slack_chunks)
@ -892,6 +929,8 @@ class ConnectorService:
search_space_id: int,
top_k: int = 20,
search_mode: SearchMode = SearchMode.CHUNKS,
start_date: datetime | None = None,
end_date: datetime | None = None,
) -> tuple:
"""
Search for Notion pages and return both the source information and langchain documents
@ -900,6 +939,9 @@ class ConnectorService:
user_query: The user's query
search_space_id: The search space ID to search in
top_k: Maximum number of results to return
search_mode: Search mode (CHUNKS or DOCUMENTS)
start_date: Optional start date for filtering documents by updated_at
end_date: Optional end date for filtering documents by updated_at
Returns:
tuple: (sources_info, langchain_documents)
@ -910,6 +952,8 @@ class ConnectorService:
top_k=top_k,
search_space_id=search_space_id,
document_type="NOTION_CONNECTOR",
start_date=start_date,
end_date=end_date,
)
elif search_mode == SearchMode.DOCUMENTS:
notion_chunks = await self.document_retriever.hybrid_search(
@ -917,6 +961,8 @@ class ConnectorService:
top_k=top_k,
search_space_id=search_space_id,
document_type="NOTION_CONNECTOR",
start_date=start_date,
end_date=end_date,
)
# Transform document retriever results to match expected format
notion_chunks = self._transform_document_results(notion_chunks)
@ -985,6 +1031,8 @@ class ConnectorService:
search_space_id: int,
top_k: int = 20,
search_mode: SearchMode = SearchMode.CHUNKS,
start_date: datetime | None = None,
end_date: datetime | None = None,
) -> tuple:
"""
Search for extension data and return both the source information and langchain documents
@ -993,6 +1041,9 @@ class ConnectorService:
user_query: The user's query
search_space_id: The search space ID to search in
top_k: Maximum number of results to return
search_mode: Search mode (CHUNKS or DOCUMENTS)
start_date: Optional start date for filtering documents by updated_at
end_date: Optional end date for filtering documents by updated_at
Returns:
tuple: (sources_info, langchain_documents)
@ -1003,6 +1054,8 @@ class ConnectorService:
top_k=top_k,
search_space_id=search_space_id,
document_type="EXTENSION",
start_date=start_date,
end_date=end_date,
)
elif search_mode == SearchMode.DOCUMENTS:
extension_chunks = await self.document_retriever.hybrid_search(
@ -1010,6 +1063,8 @@ class ConnectorService:
top_k=top_k,
search_space_id=search_space_id,
document_type="EXTENSION",
start_date=start_date,
end_date=end_date,
)
# Transform document retriever results to match expected format
extension_chunks = self._transform_document_results(extension_chunks)
@ -1102,6 +1157,8 @@ class ConnectorService:
search_space_id: int,
top_k: int = 20,
search_mode: SearchMode = SearchMode.CHUNKS,
start_date: datetime | None = None,
end_date: datetime | None = None,
) -> tuple:
"""
Search for YouTube videos and return both the source information and langchain documents
@ -1110,6 +1167,9 @@ class ConnectorService:
user_query: The user's query
search_space_id: The search space ID to search in
top_k: Maximum number of results to return
search_mode: Search mode (CHUNKS or DOCUMENTS)
start_date: Optional start date for filtering documents by updated_at
end_date: Optional end date for filtering documents by updated_at
Returns:
tuple: (sources_info, langchain_documents)
@ -1120,6 +1180,8 @@ class ConnectorService:
top_k=top_k,
search_space_id=search_space_id,
document_type="YOUTUBE_VIDEO",
start_date=start_date,
end_date=end_date,
)
elif search_mode == SearchMode.DOCUMENTS:
youtube_chunks = await self.document_retriever.hybrid_search(
@ -1127,6 +1189,8 @@ class ConnectorService:
top_k=top_k,
search_space_id=search_space_id,
document_type="YOUTUBE_VIDEO",
start_date=start_date,
end_date=end_date,
)
# Transform document retriever results to match expected format
youtube_chunks = self._transform_document_results(youtube_chunks)
@ -1195,10 +1259,20 @@ class ConnectorService:
search_space_id: int,
top_k: int = 20,
search_mode: SearchMode = SearchMode.CHUNKS,
start_date: datetime | None = None,
end_date: datetime | None = None,
) -> tuple:
"""
Search for GitHub documents and return both the source information and langchain documents
Args:
user_query: The user's query
search_space_id: The search space ID to search in
top_k: Maximum number of results to return
search_mode: Search mode (CHUNKS or DOCUMENTS)
start_date: Optional start date for filtering documents by updated_at
end_date: Optional end date for filtering documents by updated_at
Returns:
tuple: (sources_info, langchain_documents)
"""
@ -1208,6 +1282,8 @@ class ConnectorService:
top_k=top_k,
search_space_id=search_space_id,
document_type="GITHUB_CONNECTOR",
start_date=start_date,
end_date=end_date,
)
elif search_mode == SearchMode.DOCUMENTS:
github_chunks = await self.document_retriever.hybrid_search(
@ -1215,6 +1291,8 @@ class ConnectorService:
top_k=top_k,
search_space_id=search_space_id,
document_type="GITHUB_CONNECTOR",
start_date=start_date,
end_date=end_date,
)
# Transform document retriever results to match expected format
github_chunks = self._transform_document_results(github_chunks)
@ -1267,6 +1345,8 @@ class ConnectorService:
search_space_id: int,
top_k: int = 20,
search_mode: SearchMode = SearchMode.CHUNKS,
start_date: datetime | None = None,
end_date: datetime | None = None,
) -> tuple:
"""
Search for Linear issues and comments and return both the source information and langchain documents
@ -1275,6 +1355,9 @@ class ConnectorService:
user_query: The user's query
search_space_id: The search space ID to search in
top_k: Maximum number of results to return
search_mode: Search mode (CHUNKS or DOCUMENTS)
start_date: Optional start date for filtering documents by updated_at
end_date: Optional end date for filtering documents by updated_at
Returns:
tuple: (sources_info, langchain_documents)
@ -1285,6 +1368,8 @@ class ConnectorService:
top_k=top_k,
search_space_id=search_space_id,
document_type="LINEAR_CONNECTOR",
start_date=start_date,
end_date=end_date,
)
elif search_mode == SearchMode.DOCUMENTS:
linear_chunks = await self.document_retriever.hybrid_search(
@ -1292,6 +1377,8 @@ class ConnectorService:
top_k=top_k,
search_space_id=search_space_id,
document_type="LINEAR_CONNECTOR",
start_date=start_date,
end_date=end_date,
)
# Transform document retriever results to match expected format
linear_chunks = self._transform_document_results(linear_chunks)
@ -1372,6 +1459,8 @@ class ConnectorService:
search_space_id: int,
top_k: int = 20,
search_mode: SearchMode = SearchMode.CHUNKS,
start_date: datetime | None = None,
end_date: datetime | None = None,
) -> tuple:
"""
Search for Jira issues and comments and return both the source information and langchain documents
@ -1381,6 +1470,8 @@ class ConnectorService:
search_space_id: The search space ID to search in
top_k: Maximum number of results to return
search_mode: Search mode (CHUNKS or DOCUMENTS)
start_date: Optional start date for filtering documents by updated_at
end_date: Optional end date for filtering documents by updated_at
Returns:
tuple: (sources_info, langchain_documents)
@ -1391,6 +1482,8 @@ class ConnectorService:
top_k=top_k,
search_space_id=search_space_id,
document_type="JIRA_CONNECTOR",
start_date=start_date,
end_date=end_date,
)
elif search_mode == SearchMode.DOCUMENTS:
jira_chunks = await self.document_retriever.hybrid_search(
@ -1398,6 +1491,8 @@ class ConnectorService:
top_k=top_k,
search_space_id=search_space_id,
document_type="JIRA_CONNECTOR",
start_date=start_date,
end_date=end_date,
)
# Transform document retriever results to match expected format
jira_chunks = self._transform_document_results(jira_chunks)
@ -1489,6 +1584,8 @@ class ConnectorService:
search_space_id: int,
top_k: int = 20,
search_mode: SearchMode = SearchMode.CHUNKS,
start_date: datetime | None = None,
end_date: datetime | None = None,
) -> tuple:
"""
Search for Google Calendar events and return both the source information and langchain documents
@ -1498,6 +1595,8 @@ class ConnectorService:
search_space_id: The search space ID to search in
top_k: Maximum number of results to return
search_mode: Search mode (CHUNKS or DOCUMENTS)
start_date: Optional start date for filtering documents by updated_at
end_date: Optional end date for filtering documents by updated_at
Returns:
tuple: (sources_info, langchain_documents)
@ -1508,6 +1607,8 @@ class ConnectorService:
top_k=top_k,
search_space_id=search_space_id,
document_type="GOOGLE_CALENDAR_CONNECTOR",
start_date=start_date,
end_date=end_date,
)
elif search_mode == SearchMode.DOCUMENTS:
calendar_chunks = await self.document_retriever.hybrid_search(
@ -1515,6 +1616,8 @@ class ConnectorService:
top_k=top_k,
search_space_id=search_space_id,
document_type="GOOGLE_CALENDAR_CONNECTOR",
start_date=start_date,
end_date=end_date,
)
# Transform document retriever results to match expected format
calendar_chunks = self._transform_document_results(calendar_chunks)
@ -1618,6 +1721,8 @@ class ConnectorService:
search_space_id: int,
top_k: int = 20,
search_mode: SearchMode = SearchMode.CHUNKS,
start_date: datetime | None = None,
end_date: datetime | None = None,
) -> tuple:
"""
Search for Airtable records and return both the source information and langchain documents
@ -1627,6 +1732,8 @@ class ConnectorService:
search_space_id: The search space ID to search in
top_k: Maximum number of results to return
search_mode: Search mode (CHUNKS or DOCUMENTS)
start_date: Optional start date for filtering documents by updated_at
end_date: Optional end date for filtering documents by updated_at
Returns:
tuple: (sources_info, langchain_documents)
@ -1637,6 +1744,8 @@ class ConnectorService:
top_k=top_k,
search_space_id=search_space_id,
document_type="AIRTABLE_CONNECTOR",
start_date=start_date,
end_date=end_date,
)
elif search_mode == SearchMode.DOCUMENTS:
airtable_chunks = await self.document_retriever.hybrid_search(
@ -1644,6 +1753,8 @@ class ConnectorService:
top_k=top_k,
search_space_id=search_space_id,
document_type="AIRTABLE_CONNECTOR",
start_date=start_date,
end_date=end_date,
)
# Transform document retriever results to match expected format
airtable_chunks = self._transform_document_results(airtable_chunks)
@ -1702,6 +1813,8 @@ class ConnectorService:
search_space_id: int,
top_k: int = 20,
search_mode: SearchMode = SearchMode.CHUNKS,
start_date: datetime | None = None,
end_date: datetime | None = None,
) -> tuple:
"""
Search for Gmail messages and return both the source information and langchain documents
@ -1711,6 +1824,8 @@ class ConnectorService:
search_space_id: The search space ID to search in
top_k: Maximum number of results to return
search_mode: Search mode (CHUNKS or DOCUMENTS)
start_date: Optional start date for filtering documents by updated_at
end_date: Optional end date for filtering documents by updated_at
Returns:
tuple: (sources_info, langchain_documents)
@ -1721,6 +1836,8 @@ class ConnectorService:
top_k=top_k,
search_space_id=search_space_id,
document_type="GOOGLE_GMAIL_CONNECTOR",
start_date=start_date,
end_date=end_date,
)
elif search_mode == SearchMode.DOCUMENTS:
gmail_chunks = await self.document_retriever.hybrid_search(
@ -1728,6 +1845,8 @@ class ConnectorService:
top_k=top_k,
search_space_id=search_space_id,
document_type="GOOGLE_GMAIL_CONNECTOR",
start_date=start_date,
end_date=end_date,
)
# Transform document retriever results to match expected format
gmail_chunks = self._transform_document_results(gmail_chunks)
@ -1822,6 +1941,8 @@ class ConnectorService:
search_space_id: int,
top_k: int = 20,
search_mode: SearchMode = SearchMode.CHUNKS,
start_date: datetime | None = None,
end_date: datetime | None = None,
) -> tuple:
"""
Search for Confluence pages and return both the source information and langchain documents
@ -1831,6 +1952,8 @@ class ConnectorService:
search_space_id: The search space ID to search in
top_k: Maximum number of results to return
search_mode: Search mode (CHUNKS or DOCUMENTS)
start_date: Optional start date for filtering documents by updated_at
end_date: Optional end date for filtering documents by updated_at
Returns:
tuple: (sources_info, langchain_documents)
@ -1841,6 +1964,8 @@ class ConnectorService:
top_k=top_k,
search_space_id=search_space_id,
document_type="CONFLUENCE_CONNECTOR",
start_date=start_date,
end_date=end_date,
)
elif search_mode == SearchMode.DOCUMENTS:
confluence_chunks = await self.document_retriever.hybrid_search(
@ -1848,6 +1973,8 @@ class ConnectorService:
top_k=top_k,
search_space_id=search_space_id,
document_type="CONFLUENCE_CONNECTOR",
start_date=start_date,
end_date=end_date,
)
# Transform document retriever results to match expected format
confluence_chunks = self._transform_document_results(confluence_chunks)
@ -1913,6 +2040,8 @@ class ConnectorService:
search_space_id: int,
top_k: int = 20,
search_mode: SearchMode = SearchMode.CHUNKS,
start_date: datetime | None = None,
end_date: datetime | None = None,
) -> tuple:
"""
Search for ClickUp tasks and return both the source information and langchain documents
@ -1922,6 +2051,8 @@ class ConnectorService:
search_space_id: The search space ID to search in
top_k: Maximum number of results to return
search_mode: Search mode (CHUNKS or DOCUMENTS)
start_date: Optional start date for filtering documents by updated_at
end_date: Optional end date for filtering documents by updated_at
Returns:
tuple: (sources_info, langchain_documents)
@ -1932,6 +2063,8 @@ class ConnectorService:
top_k=top_k,
search_space_id=search_space_id,
document_type="CLICKUP_CONNECTOR",
start_date=start_date,
end_date=end_date,
)
elif search_mode == SearchMode.DOCUMENTS:
clickup_chunks = await self.document_retriever.hybrid_search(
@ -1939,6 +2072,8 @@ class ConnectorService:
top_k=top_k,
search_space_id=search_space_id,
document_type="CLICKUP_CONNECTOR",
start_date=start_date,
end_date=end_date,
)
# Transform document retriever results to match expected format
clickup_chunks = self._transform_document_results(clickup_chunks)
@ -2146,6 +2281,8 @@ class ConnectorService:
search_space_id: int,
top_k: int = 20,
search_mode: SearchMode = SearchMode.CHUNKS,
start_date: datetime | None = None,
end_date: datetime | None = None,
) -> tuple:
"""
Search for Discord messages and return both the source information and langchain documents
@ -2154,6 +2291,9 @@ class ConnectorService:
user_query: The user's query
search_space_id: The search space ID to search in
top_k: Maximum number of results to return
search_mode: Search mode (CHUNKS or DOCUMENTS)
start_date: Optional start date for filtering documents by updated_at
end_date: Optional end date for filtering documents by updated_at
Returns:
tuple: (sources_info, langchain_documents)
@ -2164,6 +2304,8 @@ class ConnectorService:
top_k=top_k,
search_space_id=search_space_id,
document_type="DISCORD_CONNECTOR",
start_date=start_date,
end_date=end_date,
)
elif search_mode == SearchMode.DOCUMENTS:
discord_chunks = await self.document_retriever.hybrid_search(
@ -2171,6 +2313,8 @@ class ConnectorService:
top_k=top_k,
search_space_id=search_space_id,
document_type="DISCORD_CONNECTOR",
start_date=start_date,
end_date=end_date,
)
# Transform document retriever results to match expected format
discord_chunks = self._transform_document_results(discord_chunks)
@ -2239,6 +2383,8 @@ class ConnectorService:
search_space_id: int,
top_k: int = 20,
search_mode: SearchMode = SearchMode.CHUNKS,
start_date: datetime | None = None,
end_date: datetime | None = None,
) -> tuple:
"""
Search for Luma events and return both the source information and langchain documents
@ -2248,6 +2394,8 @@ class ConnectorService:
search_space_id: The search space ID to search in
top_k: Maximum number of results to return
search_mode: Search mode (CHUNKS or DOCUMENTS)
start_date: Optional start date for filtering documents by updated_at
end_date: Optional end date for filtering documents by updated_at
Returns:
tuple: (sources_info, langchain_documents)
@ -2258,6 +2406,8 @@ class ConnectorService:
top_k=top_k,
search_space_id=search_space_id,
document_type="LUMA_CONNECTOR",
start_date=start_date,
end_date=end_date,
)
elif search_mode == SearchMode.DOCUMENTS:
luma_chunks = await self.document_retriever.hybrid_search(
@ -2265,6 +2415,8 @@ class ConnectorService:
top_k=top_k,
search_space_id=search_space_id,
document_type="LUMA_CONNECTOR",
start_date=start_date,
end_date=end_date,
)
# Transform document retriever results to match expected format
luma_chunks = self._transform_document_results(luma_chunks)
@ -2393,6 +2545,8 @@ class ConnectorService:
search_space_id: int,
top_k: int = 20,
search_mode: SearchMode = SearchMode.CHUNKS,
start_date: datetime | None = None,
end_date: datetime | None = None,
) -> tuple:
"""
Search for Elasticsearch documents and return both the source information and langchain documents
@ -2402,6 +2556,8 @@ class ConnectorService:
search_space_id: The search space ID to search in
top_k: Maximum number of results to return
search_mode: Search mode (CHUNKS or DOCUMENTS)
start_date: Optional start date for filtering documents by updated_at
end_date: Optional end date for filtering documents by updated_at
Returns:
tuple: (sources_info, langchain_documents)
@ -2412,6 +2568,8 @@ class ConnectorService:
top_k=top_k,
search_space_id=search_space_id,
document_type="ELASTICSEARCH_CONNECTOR",
start_date=start_date,
end_date=end_date,
)
elif search_mode == SearchMode.DOCUMENTS:
elasticsearch_chunks = await self.document_retriever.hybrid_search(
@ -2419,6 +2577,8 @@ class ConnectorService:
top_k=top_k,
search_space_id=search_space_id,
document_type="ELASTICSEARCH_CONNECTOR",
start_date=start_date,
end_date=end_date,
)
# Transform document retriever results to match expected format
elasticsearch_chunks = self._transform_document_results(
@ -2504,6 +2664,8 @@ class ConnectorService:
search_space_id: int,
top_k: int = 20,
search_mode: SearchMode = SearchMode.CHUNKS,
start_date: datetime | None = None,
end_date: datetime | None = None,
) -> tuple:
"""
Search for BookStack pages and return both the source information and langchain documents
@ -2514,6 +2676,8 @@ class ConnectorService:
search_space_id: The search space ID to search in
top_k: Maximum number of results to return
search_mode: Search mode (CHUNKS or DOCUMENTS)
start_date: Optional start date for filtering documents by updated_at
end_date: Optional end date for filtering documents by updated_at
Returns:
tuple: (sources_info, langchain_documents)
@ -2522,17 +2686,19 @@ class ConnectorService:
bookstack_chunks = await self.chunk_retriever.hybrid_search(
query_text=user_query,
top_k=top_k,
user_id=user_id,
search_space_id=search_space_id,
document_type="BOOKSTACK_CONNECTOR",
start_date=start_date,
end_date=end_date,
)
elif search_mode == SearchMode.DOCUMENTS:
bookstack_chunks = await self.document_retriever.hybrid_search(
query_text=user_query,
top_k=top_k,
user_id=user_id,
search_space_id=search_space_id,
document_type="BOOKSTACK_CONNECTOR",
start_date=start_date,
end_date=end_date,
)
# Transform document retriever results to match expected format
bookstack_chunks = self._transform_document_results(bookstack_chunks)