From 919c323ef342fb0c02649c2a1295bf3dd90882b4 Mon Sep 17 00:00:00 2001 From: "DESKTOP-RTLN3BA\\$punk" Date: Fri, 12 Dec 2025 02:42:20 -0800 Subject: [PATCH] feat: implement time-based filtering for document retrieval using 'updated_at' timestamp --- .../47_copy_created_at_to_updated_at.py | 40 +++++ .../app/agents/researcher/nodes.py | 50 ++++++ surfsense_backend/app/db.py | 14 -- .../app/retriever/chunks_hybrid_search.py | 33 ++++ .../app/retriever/documents_hybrid_search.py | 33 ++++ .../app/services/connector_service.py | 170 +++++++++++++++++- 6 files changed, 324 insertions(+), 16 deletions(-) create mode 100644 surfsense_backend/alembic/versions/47_copy_created_at_to_updated_at.py diff --git a/surfsense_backend/alembic/versions/47_copy_created_at_to_updated_at.py b/surfsense_backend/alembic/versions/47_copy_created_at_to_updated_at.py new file mode 100644 index 000000000..b90e706e7 --- /dev/null +++ b/surfsense_backend/alembic/versions/47_copy_created_at_to_updated_at.py @@ -0,0 +1,40 @@ +"""47_copy_created_at_to_updated_at + +Revision ID: 47 +Revises: 46 +Create Date: 2025-12-12 + +Copies created_at values to updated_at for all documents where updated_at is NULL. +This ensures time-based filtering in retrievers works correctly for all documents. +""" + +from collections.abc import Sequence + +from alembic import op + +# revision identifiers, used by Alembic. 
+revision: str = "47" +down_revision: str | None = "46" +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None + + +def upgrade() -> None: + """Upgrade schema - Copy created_at to updated_at where updated_at is NULL.""" + # Set updated_at to created_at for all documents that don't have an updated_at value + op.execute( + """ + UPDATE documents + SET updated_at = created_at + WHERE updated_at IS NULL + AND created_at IS NOT NULL + """ + ) + + +def downgrade() -> None: + """Downgrade schema - Intentional no-op; updated_at values copied by upgrade() are left in place.""" + # Note: This is a lossy downgrade - we cannot distinguish between documents + # that had updated_at set by this migration vs. other sources. + # For safety, we don't automatically revert these changes. + pass diff --git a/surfsense_backend/app/agents/researcher/nodes.py b/surfsense_backend/app/agents/researcher/nodes.py index 4b2f4b0e6..b8ad33c4f 100644 --- a/surfsense_backend/app/agents/researcher/nodes.py +++ b/surfsense_backend/app/agents/researcher/nodes.py @@ -1,6 +1,7 @@ import json import logging import traceback +from datetime import UTC, datetime, timedelta from typing import Any from langchain_core.messages import HumanMessage, SystemMessage @@ -21,6 +22,9 @@ from .qna_agent.graph import graph as qna_agent_graph from .state import State from .utils import get_connector_emoji, get_connector_friendly_name +# Time filter constants - hardcoded 2 year time range for now +DEFAULT_TIME_FILTER_YEARS = 2 + def extract_sources_from_documents( all_documents: list[dict[str, Any]], @@ -524,6 +528,8 @@ async def fetch_relevant_documents( connector_service: ConnectorService = None, search_mode: SearchMode = SearchMode.CHUNKS, user_selected_sources: list[dict[str, Any]] | None = None, + start_date: datetime | None = None, + end_date: datetime | None = None, ) -> list[dict[str, Any]]: """ Fetch relevant documents for research questions using the provided connectors. 
@@ -542,6 +548,10 @@ async def fetch_relevant_documents( state: The current state containing the streaming service top_k: Number of top results to retrieve per connector per question connector_service: An initialized connector service to use for searching + search_mode: Search mode (CHUNKS or DOCUMENTS) + user_selected_sources: Optional list of user-selected source objects + start_date: Optional start date for filtering documents by updated_at + end_date: Optional end date for filtering documents by updated_at Returns: List of relevant documents @@ -620,6 +630,8 @@ async def fetch_relevant_documents( search_space_id=search_space_id, top_k=top_k, search_mode=search_mode, + start_date=start_date, + end_date=end_date, ) # Add to sources and raw documents @@ -646,6 +658,8 @@ async def fetch_relevant_documents( search_space_id=search_space_id, top_k=top_k, search_mode=search_mode, + start_date=start_date, + end_date=end_date, ) # Add to sources and raw documents @@ -672,6 +686,8 @@ async def fetch_relevant_documents( search_space_id=search_space_id, top_k=top_k, search_mode=search_mode, + start_date=start_date, + end_date=end_date, ) # Add to sources and raw documents @@ -695,6 +711,8 @@ async def fetch_relevant_documents( search_space_id=search_space_id, top_k=top_k, search_mode=search_mode, + start_date=start_date, + end_date=end_date, ) # Add to sources and raw documents @@ -718,6 +736,8 @@ async def fetch_relevant_documents( search_space_id=search_space_id, top_k=top_k, search_mode=search_mode, + start_date=start_date, + end_date=end_date, ) # Add to sources and raw documents @@ -744,6 +764,8 @@ async def fetch_relevant_documents( search_space_id=search_space_id, top_k=top_k, search_mode=search_mode, + start_date=start_date, + end_date=end_date, ) # Add to sources and raw documents @@ -770,6 +792,8 @@ async def fetch_relevant_documents( search_space_id=search_space_id, top_k=top_k, search_mode=search_mode, + start_date=start_date, + end_date=end_date, ) # Add to 
sources and raw documents @@ -796,6 +820,8 @@ async def fetch_relevant_documents( search_space_id=search_space_id, top_k=top_k, search_mode=search_mode, + start_date=start_date, + end_date=end_date, ) # Add to sources and raw documents @@ -922,6 +948,8 @@ async def fetch_relevant_documents( search_space_id=search_space_id, top_k=top_k, search_mode=search_mode, + start_date=start_date, + end_date=end_date, ) # Add to sources and raw documents if source_object: @@ -943,6 +971,8 @@ async def fetch_relevant_documents( search_space_id=search_space_id, top_k=top_k, search_mode=search_mode, + start_date=start_date, + end_date=end_date, ) # Add to sources and raw documents @@ -968,6 +998,8 @@ async def fetch_relevant_documents( search_space_id=search_space_id, top_k=top_k, search_mode=search_mode, + start_date=start_date, + end_date=end_date, ) # Add to sources and raw documents @@ -993,6 +1025,8 @@ async def fetch_relevant_documents( search_space_id=search_space_id, top_k=top_k, search_mode=search_mode, + start_date=start_date, + end_date=end_date, ) # Add to sources and raw documents @@ -1018,6 +1052,8 @@ async def fetch_relevant_documents( search_space_id=search_space_id, top_k=top_k, search_mode=search_mode, + start_date=start_date, + end_date=end_date, ) # Add to sources and raw documents @@ -1043,6 +1079,8 @@ async def fetch_relevant_documents( search_space_id=search_space_id, top_k=top_k, search_mode=search_mode, + start_date=start_date, + end_date=end_date, ) # Add to sources and raw documents @@ -1068,6 +1106,8 @@ async def fetch_relevant_documents( search_space_id=search_space_id, top_k=top_k, search_mode=search_mode, + start_date=start_date, + end_date=end_date, ) # Add to sources and raw documents @@ -1094,6 +1134,8 @@ async def fetch_relevant_documents( search_space_id=search_space_id, top_k=top_k, search_mode=search_mode, + start_date=start_date, + end_date=end_date, ) # Add to sources and raw documents @@ -1120,6 +1162,8 @@ async def 
fetch_relevant_documents( search_space_id=search_space_id, top_k=top_k, search_mode=search_mode, + start_date=start_date, + end_date=end_date, ) # Add to sources and raw documents @@ -1387,6 +1431,10 @@ async def handle_qna_workflow( # Use the reformulated query as a single research question research_questions = [reformulated_query, user_query] + # Calculate time filter: last 2 years from now (hardcoded for now) + end_date = datetime.now(UTC) + start_date = end_date - timedelta(days=DEFAULT_TIME_FILTER_YEARS * 365) + relevant_documents = await fetch_relevant_documents( research_questions=research_questions, search_space_id=configuration.search_space_id, @@ -1398,6 +1446,8 @@ async def handle_qna_workflow( connector_service=connector_service, search_mode=configuration.search_mode, user_selected_sources=user_selected_sources, + start_date=start_date, + end_date=end_date, ) except Exception as e: error_message = f"Error fetching relevant documents for QNA: {e!s}" diff --git a/surfsense_backend/app/db.py b/surfsense_backend/app/db.py index b6581ce6a..cf267bd3d 100644 --- a/surfsense_backend/app/db.py +++ b/surfsense_backend/app/db.py @@ -25,8 +25,6 @@ from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_asyn from sqlalchemy.orm import DeclarativeBase, Mapped, declared_attr, relationship from app.config import config -from app.retriever.chunks_hybrid_search import ChucksHybridSearchRetriever -from app.retriever.documents_hybrid_search import DocumentHybridSearchRetriever if config.AUTH_TYPE == "GOOGLE": from fastapi_users.db import SQLAlchemyBaseOAuthAccountTableUUID @@ -799,18 +797,6 @@ else: yield SQLAlchemyUserDatabase(session, User) -async def get_chucks_hybrid_search_retriever( - session: AsyncSession = Depends(get_async_session), -): - return ChucksHybridSearchRetriever(session) - - -async def get_documents_hybrid_search_retriever( - session: AsyncSession = Depends(get_async_session), -): - return DocumentHybridSearchRetriever(session) - - 
def has_permission(user_permissions: list[str], required_permission: str) -> bool: """ Check if the user has the required permission. diff --git a/surfsense_backend/app/retriever/chunks_hybrid_search.py b/surfsense_backend/app/retriever/chunks_hybrid_search.py index 25a121ad7..017f36088 100644 --- a/surfsense_backend/app/retriever/chunks_hybrid_search.py +++ b/surfsense_backend/app/retriever/chunks_hybrid_search.py @@ -1,3 +1,6 @@ +from datetime import datetime + + class ChucksHybridSearchRetriever: def __init__(self, db_session): """ @@ -13,6 +16,8 @@ class ChucksHybridSearchRetriever: query_text: str, top_k: int, search_space_id: int, + start_date: datetime | None = None, + end_date: datetime | None = None, ) -> list: """ Perform vector similarity search on chunks. @@ -21,6 +26,8 @@ class ChucksHybridSearchRetriever: query_text: The search query text top_k: Number of results to return search_space_id: The search space ID to search within + start_date: Optional start date for filtering documents by updated_at + end_date: Optional end date for filtering documents by updated_at Returns: List of chunks sorted by vector similarity @@ -43,6 +50,12 @@ class ChucksHybridSearchRetriever: .where(Document.search_space_id == search_space_id) ) + # Add time-based filtering if provided + if start_date is not None: + query = query.where(Document.updated_at >= start_date) + if end_date is not None: + query = query.where(Document.updated_at <= end_date) + # Add vector similarity ordering query = query.order_by(Chunk.embedding.op("<=>")(query_embedding)).limit(top_k) @@ -57,6 +70,8 @@ class ChucksHybridSearchRetriever: query_text: str, top_k: int, search_space_id: int, + start_date: datetime | None = None, + end_date: datetime | None = None, ) -> list: """ Perform full-text keyword search on chunks. 
@@ -65,6 +80,8 @@ class ChucksHybridSearchRetriever: query_text: The search query text top_k: Number of results to return search_space_id: The search space ID to search within + start_date: Optional start date for filtering documents by updated_at + end_date: Optional end date for filtering documents by updated_at Returns: List of chunks sorted by text relevance @@ -89,6 +106,12 @@ class ChucksHybridSearchRetriever: ) # Only include results that match the query ) + # Add time-based filtering if provided + if start_date is not None: + query = query.where(Document.updated_at >= start_date) + if end_date is not None: + query = query.where(Document.updated_at <= end_date) + # Add text search ranking query = query.order_by(func.ts_rank_cd(tsvector, tsquery).desc()).limit(top_k) @@ -104,6 +127,8 @@ class ChucksHybridSearchRetriever: top_k: int, search_space_id: int, document_type: str | None = None, + start_date: datetime | None = None, + end_date: datetime | None = None, ) -> list: """ Combine vector similarity and full-text search results using Reciprocal Rank Fusion. 
@@ -113,6 +138,8 @@ class ChucksHybridSearchRetriever: top_k: Number of results to return search_space_id: The search space ID to search within document_type: Optional document type to filter results (e.g., "FILE", "CRAWLED_URL") + start_date: Optional start date for filtering documents by updated_at + end_date: Optional end date for filtering documents by updated_at Returns: List of dictionaries containing chunk data and relevance scores @@ -151,6 +178,12 @@ class ChucksHybridSearchRetriever: else: base_conditions.append(Document.document_type == document_type) + # Add time-based filtering if provided + if start_date is not None: + base_conditions.append(Document.updated_at >= start_date) + if end_date is not None: + base_conditions.append(Document.updated_at <= end_date) + # CTE for semantic search filtered by search space semantic_search_cte = ( select( diff --git a/surfsense_backend/app/retriever/documents_hybrid_search.py b/surfsense_backend/app/retriever/documents_hybrid_search.py index 0c08ecc05..ba3243a96 100644 --- a/surfsense_backend/app/retriever/documents_hybrid_search.py +++ b/surfsense_backend/app/retriever/documents_hybrid_search.py @@ -1,3 +1,6 @@ +from datetime import datetime + + class DocumentHybridSearchRetriever: def __init__(self, db_session): """ @@ -13,6 +16,8 @@ class DocumentHybridSearchRetriever: query_text: str, top_k: int, search_space_id: int, + start_date: datetime | None = None, + end_date: datetime | None = None, ) -> list: """ Perform vector similarity search on documents. 
@@ -21,6 +26,8 @@ class DocumentHybridSearchRetriever: query_text: The search query text top_k: Number of results to return search_space_id: The search space ID to search within + start_date: Optional start date for filtering documents by updated_at + end_date: Optional end date for filtering documents by updated_at Returns: List of documents sorted by vector similarity @@ -42,6 +49,12 @@ class DocumentHybridSearchRetriever: .where(Document.search_space_id == search_space_id) ) + # Add time-based filtering if provided + if start_date is not None: + query = query.where(Document.updated_at >= start_date) + if end_date is not None: + query = query.where(Document.updated_at <= end_date) + # Add vector similarity ordering query = query.order_by(Document.embedding.op("<=>")(query_embedding)).limit( top_k @@ -58,6 +71,8 @@ class DocumentHybridSearchRetriever: query_text: str, top_k: int, search_space_id: int, + start_date: datetime | None = None, + end_date: datetime | None = None, ) -> list: """ Perform full-text keyword search on documents. 
@@ -66,6 +81,8 @@ class DocumentHybridSearchRetriever: query_text: The search query text top_k: Number of results to return search_space_id: The search space ID to search within + start_date: Optional start date for filtering documents by updated_at + end_date: Optional end date for filtering documents by updated_at Returns: List of documents sorted by text relevance @@ -89,6 +106,12 @@ class DocumentHybridSearchRetriever: ) # Only include results that match the query ) + # Add time-based filtering if provided + if start_date is not None: + query = query.where(Document.updated_at >= start_date) + if end_date is not None: + query = query.where(Document.updated_at <= end_date) + # Add text search ranking query = query.order_by(func.ts_rank_cd(tsvector, tsquery).desc()).limit(top_k) @@ -104,6 +127,8 @@ class DocumentHybridSearchRetriever: top_k: int, search_space_id: int, document_type: str | None = None, + start_date: datetime | None = None, + end_date: datetime | None = None, ) -> list: """ Combine vector similarity and full-text search results using Reciprocal Rank Fusion. 
@@ -113,6 +138,8 @@ class DocumentHybridSearchRetriever: top_k: Number of results to return search_space_id: The search space ID to search within document_type: Optional document type to filter results (e.g., "FILE", "CRAWLED_URL") + start_date: Optional start date for filtering documents by updated_at + end_date: Optional end date for filtering documents by updated_at """ from sqlalchemy import func, select, text @@ -149,6 +176,12 @@ class DocumentHybridSearchRetriever: else: base_conditions.append(Document.document_type == document_type) + # Add time-based filtering if provided + if start_date is not None: + base_conditions.append(Document.updated_at >= start_date) + if end_date is not None: + base_conditions.append(Document.updated_at <= end_date) + # CTE for semantic search filtered by search space semantic_search_cte = select( Document.id, diff --git a/surfsense_backend/app/services/connector_service.py b/surfsense_backend/app/services/connector_service.py index 2b122eeb7..2dc8e14dc 100644 --- a/surfsense_backend/app/services/connector_service.py +++ b/surfsense_backend/app/services/connector_service.py @@ -1,4 +1,5 @@ import asyncio +from datetime import datetime from typing import Any from urllib.parse import urljoin @@ -63,6 +64,8 @@ class ConnectorService: search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS, + start_date: datetime | None = None, + end_date: datetime | None = None, ) -> tuple: """ Search for crawled URLs and return both the source information and langchain documents @@ -72,6 +75,8 @@ class ConnectorService: search_space_id: The search space ID to search in top_k: Maximum number of results to return search_mode: Search mode (CHUNKS or DOCUMENTS) + start_date: Optional start date for filtering documents by updated_at + end_date: Optional end date for filtering documents by updated_at Returns: tuple: (sources_info, langchain_documents) @@ -82,6 +87,8 @@ class ConnectorService: top_k=top_k, 
search_space_id=search_space_id, document_type="CRAWLED_URL", + start_date=start_date, + end_date=end_date, ) elif search_mode == SearchMode.DOCUMENTS: crawled_urls_chunks = await self.document_retriever.hybrid_search( @@ -89,6 +96,8 @@ class ConnectorService: top_k=top_k, search_space_id=search_space_id, document_type="CRAWLED_URL", + start_date=start_date, + end_date=end_date, ) # Transform document retriever results to match expected format crawled_urls_chunks = self._transform_document_results(crawled_urls_chunks) @@ -168,10 +177,20 @@ class ConnectorService: search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS, + start_date: datetime | None = None, + end_date: datetime | None = None, ) -> tuple: """ Search for files and return both the source information and langchain documents + Args: + user_query: The user's query + search_space_id: The search space ID to search in + top_k: Maximum number of results to return + search_mode: Search mode (CHUNKS or DOCUMENTS) + start_date: Optional start date for filtering documents by updated_at + end_date: Optional end date for filtering documents by updated_at + Returns: tuple: (sources_info, langchain_documents) """ @@ -181,6 +200,8 @@ class ConnectorService: top_k=top_k, search_space_id=search_space_id, document_type="FILE", + start_date=start_date, + end_date=end_date, ) elif search_mode == SearchMode.DOCUMENTS: files_chunks = await self.document_retriever.hybrid_search( @@ -188,6 +209,8 @@ class ConnectorService: top_k=top_k, search_space_id=search_space_id, document_type="FILE", + start_date=start_date, + end_date=end_date, ) # Transform document retriever results to match expected format files_chunks = self._transform_document_results(files_chunks) @@ -807,10 +830,20 @@ class ConnectorService: search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS, + start_date: datetime | None = None, + end_date: datetime | None = None, ) -> tuple: """ Search for slack and 
return both the source information and langchain documents + Args: + user_query: The user's query + search_space_id: The search space ID to search in + top_k: Maximum number of results to return + search_mode: Search mode (CHUNKS or DOCUMENTS) + start_date: Optional start date for filtering documents by updated_at + end_date: Optional end date for filtering documents by updated_at + Returns: tuple: (sources_info, langchain_documents) """ @@ -820,6 +853,8 @@ class ConnectorService: top_k=top_k, search_space_id=search_space_id, document_type="SLACK_CONNECTOR", + start_date=start_date, + end_date=end_date, ) elif search_mode == SearchMode.DOCUMENTS: slack_chunks = await self.document_retriever.hybrid_search( @@ -827,6 +862,8 @@ class ConnectorService: top_k=top_k, search_space_id=search_space_id, document_type="SLACK_CONNECTOR", + start_date=start_date, + end_date=end_date, ) # Transform document retriever results to match expected format slack_chunks = self._transform_document_results(slack_chunks) @@ -892,6 +929,8 @@ class ConnectorService: search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS, + start_date: datetime | None = None, + end_date: datetime | None = None, ) -> tuple: """ Search for Notion pages and return both the source information and langchain documents @@ -900,6 +939,9 @@ class ConnectorService: user_query: The user's query search_space_id: The search space ID to search in top_k: Maximum number of results to return + search_mode: Search mode (CHUNKS or DOCUMENTS) + start_date: Optional start date for filtering documents by updated_at + end_date: Optional end date for filtering documents by updated_at Returns: tuple: (sources_info, langchain_documents) @@ -910,6 +952,8 @@ class ConnectorService: top_k=top_k, search_space_id=search_space_id, document_type="NOTION_CONNECTOR", + start_date=start_date, + end_date=end_date, ) elif search_mode == SearchMode.DOCUMENTS: notion_chunks = await self.document_retriever.hybrid_search( 
@@ -917,6 +961,8 @@ class ConnectorService: top_k=top_k, search_space_id=search_space_id, document_type="NOTION_CONNECTOR", + start_date=start_date, + end_date=end_date, ) # Transform document retriever results to match expected format notion_chunks = self._transform_document_results(notion_chunks) @@ -985,6 +1031,8 @@ class ConnectorService: search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS, + start_date: datetime | None = None, + end_date: datetime | None = None, ) -> tuple: """ Search for extension data and return both the source information and langchain documents @@ -993,6 +1041,9 @@ class ConnectorService: user_query: The user's query search_space_id: The search space ID to search in top_k: Maximum number of results to return + search_mode: Search mode (CHUNKS or DOCUMENTS) + start_date: Optional start date for filtering documents by updated_at + end_date: Optional end date for filtering documents by updated_at Returns: tuple: (sources_info, langchain_documents) @@ -1003,6 +1054,8 @@ class ConnectorService: top_k=top_k, search_space_id=search_space_id, document_type="EXTENSION", + start_date=start_date, + end_date=end_date, ) elif search_mode == SearchMode.DOCUMENTS: extension_chunks = await self.document_retriever.hybrid_search( @@ -1010,6 +1063,8 @@ class ConnectorService: top_k=top_k, search_space_id=search_space_id, document_type="EXTENSION", + start_date=start_date, + end_date=end_date, ) # Transform document retriever results to match expected format extension_chunks = self._transform_document_results(extension_chunks) @@ -1102,6 +1157,8 @@ class ConnectorService: search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS, + start_date: datetime | None = None, + end_date: datetime | None = None, ) -> tuple: """ Search for YouTube videos and return both the source information and langchain documents @@ -1110,6 +1167,9 @@ class ConnectorService: user_query: The user's query search_space_id: The 
search space ID to search in top_k: Maximum number of results to return + search_mode: Search mode (CHUNKS or DOCUMENTS) + start_date: Optional start date for filtering documents by updated_at + end_date: Optional end date for filtering documents by updated_at Returns: tuple: (sources_info, langchain_documents) @@ -1120,6 +1180,8 @@ class ConnectorService: top_k=top_k, search_space_id=search_space_id, document_type="YOUTUBE_VIDEO", + start_date=start_date, + end_date=end_date, ) elif search_mode == SearchMode.DOCUMENTS: youtube_chunks = await self.document_retriever.hybrid_search( @@ -1127,6 +1189,8 @@ class ConnectorService: top_k=top_k, search_space_id=search_space_id, document_type="YOUTUBE_VIDEO", + start_date=start_date, + end_date=end_date, ) # Transform document retriever results to match expected format youtube_chunks = self._transform_document_results(youtube_chunks) @@ -1195,10 +1259,20 @@ class ConnectorService: search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS, + start_date: datetime | None = None, + end_date: datetime | None = None, ) -> tuple: """ Search for GitHub documents and return both the source information and langchain documents + Args: + user_query: The user's query + search_space_id: The search space ID to search in + top_k: Maximum number of results to return + search_mode: Search mode (CHUNKS or DOCUMENTS) + start_date: Optional start date for filtering documents by updated_at + end_date: Optional end date for filtering documents by updated_at + Returns: tuple: (sources_info, langchain_documents) """ @@ -1208,6 +1282,8 @@ class ConnectorService: top_k=top_k, search_space_id=search_space_id, document_type="GITHUB_CONNECTOR", + start_date=start_date, + end_date=end_date, ) elif search_mode == SearchMode.DOCUMENTS: github_chunks = await self.document_retriever.hybrid_search( @@ -1215,6 +1291,8 @@ class ConnectorService: top_k=top_k, search_space_id=search_space_id, document_type="GITHUB_CONNECTOR", + 
start_date=start_date, + end_date=end_date, ) # Transform document retriever results to match expected format github_chunks = self._transform_document_results(github_chunks) @@ -1267,6 +1345,8 @@ class ConnectorService: search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS, + start_date: datetime | None = None, + end_date: datetime | None = None, ) -> tuple: """ Search for Linear issues and comments and return both the source information and langchain documents @@ -1275,6 +1355,9 @@ class ConnectorService: user_query: The user's query search_space_id: The search space ID to search in top_k: Maximum number of results to return + search_mode: Search mode (CHUNKS or DOCUMENTS) + start_date: Optional start date for filtering documents by updated_at + end_date: Optional end date for filtering documents by updated_at Returns: tuple: (sources_info, langchain_documents) @@ -1285,6 +1368,8 @@ class ConnectorService: top_k=top_k, search_space_id=search_space_id, document_type="LINEAR_CONNECTOR", + start_date=start_date, + end_date=end_date, ) elif search_mode == SearchMode.DOCUMENTS: linear_chunks = await self.document_retriever.hybrid_search( @@ -1292,6 +1377,8 @@ class ConnectorService: top_k=top_k, search_space_id=search_space_id, document_type="LINEAR_CONNECTOR", + start_date=start_date, + end_date=end_date, ) # Transform document retriever results to match expected format linear_chunks = self._transform_document_results(linear_chunks) @@ -1372,6 +1459,8 @@ class ConnectorService: search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS, + start_date: datetime | None = None, + end_date: datetime | None = None, ) -> tuple: """ Search for Jira issues and comments and return both the source information and langchain documents @@ -1381,6 +1470,8 @@ class ConnectorService: search_space_id: The search space ID to search in top_k: Maximum number of results to return search_mode: Search mode (CHUNKS or DOCUMENTS) + 
start_date: Optional start date for filtering documents by updated_at + end_date: Optional end date for filtering documents by updated_at Returns: tuple: (sources_info, langchain_documents) @@ -1391,6 +1482,8 @@ class ConnectorService: top_k=top_k, search_space_id=search_space_id, document_type="JIRA_CONNECTOR", + start_date=start_date, + end_date=end_date, ) elif search_mode == SearchMode.DOCUMENTS: jira_chunks = await self.document_retriever.hybrid_search( @@ -1398,6 +1491,8 @@ class ConnectorService: top_k=top_k, search_space_id=search_space_id, document_type="JIRA_CONNECTOR", + start_date=start_date, + end_date=end_date, ) # Transform document retriever results to match expected format jira_chunks = self._transform_document_results(jira_chunks) @@ -1489,6 +1584,8 @@ class ConnectorService: search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS, + start_date: datetime | None = None, + end_date: datetime | None = None, ) -> tuple: """ Search for Google Calendar events and return both the source information and langchain documents @@ -1498,6 +1595,8 @@ class ConnectorService: search_space_id: The search space ID to search in top_k: Maximum number of results to return search_mode: Search mode (CHUNKS or DOCUMENTS) + start_date: Optional start date for filtering documents by updated_at + end_date: Optional end date for filtering documents by updated_at Returns: tuple: (sources_info, langchain_documents) @@ -1508,6 +1607,8 @@ class ConnectorService: top_k=top_k, search_space_id=search_space_id, document_type="GOOGLE_CALENDAR_CONNECTOR", + start_date=start_date, + end_date=end_date, ) elif search_mode == SearchMode.DOCUMENTS: calendar_chunks = await self.document_retriever.hybrid_search( @@ -1515,6 +1616,8 @@ class ConnectorService: top_k=top_k, search_space_id=search_space_id, document_type="GOOGLE_CALENDAR_CONNECTOR", + start_date=start_date, + end_date=end_date, ) # Transform document retriever results to match expected format 
calendar_chunks = self._transform_document_results(calendar_chunks) @@ -1618,6 +1721,8 @@ class ConnectorService: search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS, + start_date: datetime | None = None, + end_date: datetime | None = None, ) -> tuple: """ Search for Airtable records and return both the source information and langchain documents @@ -1627,6 +1732,8 @@ class ConnectorService: search_space_id: The search space ID to search in top_k: Maximum number of results to return search_mode: Search mode (CHUNKS or DOCUMENTS) + start_date: Optional start date for filtering documents by updated_at + end_date: Optional end date for filtering documents by updated_at Returns: tuple: (sources_info, langchain_documents) @@ -1637,6 +1744,8 @@ class ConnectorService: top_k=top_k, search_space_id=search_space_id, document_type="AIRTABLE_CONNECTOR", + start_date=start_date, + end_date=end_date, ) elif search_mode == SearchMode.DOCUMENTS: airtable_chunks = await self.document_retriever.hybrid_search( @@ -1644,6 +1753,8 @@ class ConnectorService: top_k=top_k, search_space_id=search_space_id, document_type="AIRTABLE_CONNECTOR", + start_date=start_date, + end_date=end_date, ) # Transform document retriever results to match expected format airtable_chunks = self._transform_document_results(airtable_chunks) @@ -1702,6 +1813,8 @@ class ConnectorService: search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS, + start_date: datetime | None = None, + end_date: datetime | None = None, ) -> tuple: """ Search for Gmail messages and return both the source information and langchain documents @@ -1711,6 +1824,8 @@ class ConnectorService: search_space_id: The search space ID to search in top_k: Maximum number of results to return search_mode: Search mode (CHUNKS or DOCUMENTS) + start_date: Optional start date for filtering documents by updated_at + end_date: Optional end date for filtering documents by updated_at Returns: tuple: 
(sources_info, langchain_documents) @@ -1721,6 +1836,8 @@ class ConnectorService: top_k=top_k, search_space_id=search_space_id, document_type="GOOGLE_GMAIL_CONNECTOR", + start_date=start_date, + end_date=end_date, ) elif search_mode == SearchMode.DOCUMENTS: gmail_chunks = await self.document_retriever.hybrid_search( @@ -1728,6 +1845,8 @@ class ConnectorService: top_k=top_k, search_space_id=search_space_id, document_type="GOOGLE_GMAIL_CONNECTOR", + start_date=start_date, + end_date=end_date, ) # Transform document retriever results to match expected format gmail_chunks = self._transform_document_results(gmail_chunks) @@ -1822,6 +1941,8 @@ class ConnectorService: search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS, + start_date: datetime | None = None, + end_date: datetime | None = None, ) -> tuple: """ Search for Confluence pages and return both the source information and langchain documents @@ -1831,6 +1952,8 @@ class ConnectorService: search_space_id: The search space ID to search in top_k: Maximum number of results to return search_mode: Search mode (CHUNKS or DOCUMENTS) + start_date: Optional start date for filtering documents by updated_at + end_date: Optional end date for filtering documents by updated_at Returns: tuple: (sources_info, langchain_documents) @@ -1841,6 +1964,8 @@ class ConnectorService: top_k=top_k, search_space_id=search_space_id, document_type="CONFLUENCE_CONNECTOR", + start_date=start_date, + end_date=end_date, ) elif search_mode == SearchMode.DOCUMENTS: confluence_chunks = await self.document_retriever.hybrid_search( @@ -1848,6 +1973,8 @@ class ConnectorService: top_k=top_k, search_space_id=search_space_id, document_type="CONFLUENCE_CONNECTOR", + start_date=start_date, + end_date=end_date, ) # Transform document retriever results to match expected format confluence_chunks = self._transform_document_results(confluence_chunks) @@ -1913,6 +2040,8 @@ class ConnectorService: search_space_id: int, top_k: int = 20, 
search_mode: SearchMode = SearchMode.CHUNKS, + start_date: datetime | None = None, + end_date: datetime | None = None, ) -> tuple: """ Search for ClickUp tasks and return both the source information and langchain documents @@ -1922,6 +2051,8 @@ class ConnectorService: search_space_id: The search space ID to search in top_k: Maximum number of results to return search_mode: Search mode (CHUNKS or DOCUMENTS) + start_date: Optional start date for filtering documents by updated_at + end_date: Optional end date for filtering documents by updated_at Returns: tuple: (sources_info, langchain_documents) @@ -1932,6 +2063,8 @@ class ConnectorService: top_k=top_k, search_space_id=search_space_id, document_type="CLICKUP_CONNECTOR", + start_date=start_date, + end_date=end_date, ) elif search_mode == SearchMode.DOCUMENTS: clickup_chunks = await self.document_retriever.hybrid_search( @@ -1939,6 +2072,8 @@ class ConnectorService: top_k=top_k, search_space_id=search_space_id, document_type="CLICKUP_CONNECTOR", + start_date=start_date, + end_date=end_date, ) # Transform document retriever results to match expected format clickup_chunks = self._transform_document_results(clickup_chunks) @@ -2146,6 +2281,8 @@ class ConnectorService: search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS, + start_date: datetime | None = None, + end_date: datetime | None = None, ) -> tuple: """ Search for Discord messages and return both the source information and langchain documents @@ -2154,6 +2291,9 @@ class ConnectorService: user_query: The user's query search_space_id: The search space ID to search in top_k: Maximum number of results to return + search_mode: Search mode (CHUNKS or DOCUMENTS) + start_date: Optional start date for filtering documents by updated_at + end_date: Optional end date for filtering documents by updated_at Returns: tuple: (sources_info, langchain_documents) @@ -2164,6 +2304,8 @@ class ConnectorService: top_k=top_k, search_space_id=search_space_id, 
document_type="DISCORD_CONNECTOR", + start_date=start_date, + end_date=end_date, ) elif search_mode == SearchMode.DOCUMENTS: discord_chunks = await self.document_retriever.hybrid_search( @@ -2171,6 +2313,8 @@ class ConnectorService: top_k=top_k, search_space_id=search_space_id, document_type="DISCORD_CONNECTOR", + start_date=start_date, + end_date=end_date, ) # Transform document retriever results to match expected format discord_chunks = self._transform_document_results(discord_chunks) @@ -2239,6 +2383,8 @@ class ConnectorService: search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS, + start_date: datetime | None = None, + end_date: datetime | None = None, ) -> tuple: """ Search for Luma events and return both the source information and langchain documents @@ -2248,6 +2394,8 @@ class ConnectorService: search_space_id: The search space ID to search in top_k: Maximum number of results to return search_mode: Search mode (CHUNKS or DOCUMENTS) + start_date: Optional start date for filtering documents by updated_at + end_date: Optional end date for filtering documents by updated_at Returns: tuple: (sources_info, langchain_documents) @@ -2258,6 +2406,8 @@ class ConnectorService: top_k=top_k, search_space_id=search_space_id, document_type="LUMA_CONNECTOR", + start_date=start_date, + end_date=end_date, ) elif search_mode == SearchMode.DOCUMENTS: luma_chunks = await self.document_retriever.hybrid_search( @@ -2265,6 +2415,8 @@ class ConnectorService: top_k=top_k, search_space_id=search_space_id, document_type="LUMA_CONNECTOR", + start_date=start_date, + end_date=end_date, ) # Transform document retriever results to match expected format luma_chunks = self._transform_document_results(luma_chunks) @@ -2393,6 +2545,8 @@ class ConnectorService: search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS, + start_date: datetime | None = None, + end_date: datetime | None = None, ) -> tuple: """ Search for Elasticsearch documents 
and return both the source information and langchain documents @@ -2402,6 +2556,8 @@ class ConnectorService: search_space_id: The search space ID to search in top_k: Maximum number of results to return search_mode: Search mode (CHUNKS or DOCUMENTS) + start_date: Optional start date for filtering documents by updated_at + end_date: Optional end date for filtering documents by updated_at Returns: tuple: (sources_info, langchain_documents) @@ -2412,6 +2568,8 @@ class ConnectorService: top_k=top_k, search_space_id=search_space_id, document_type="ELASTICSEARCH_CONNECTOR", + start_date=start_date, + end_date=end_date, ) elif search_mode == SearchMode.DOCUMENTS: elasticsearch_chunks = await self.document_retriever.hybrid_search( @@ -2419,6 +2577,8 @@ class ConnectorService: top_k=top_k, search_space_id=search_space_id, document_type="ELASTICSEARCH_CONNECTOR", + start_date=start_date, + end_date=end_date, ) # Transform document retriever results to match expected format elasticsearch_chunks = self._transform_document_results( @@ -2504,6 +2664,8 @@ class ConnectorService: search_space_id: int, top_k: int = 20, search_mode: SearchMode = SearchMode.CHUNKS, + start_date: datetime | None = None, + end_date: datetime | None = None, ) -> tuple: """ Search for BookStack pages and return both the source information and langchain documents @@ -2514,6 +2676,8 @@ class ConnectorService: search_space_id: The search space ID to search in top_k: Maximum number of results to return search_mode: Search mode (CHUNKS or DOCUMENTS) + start_date: Optional start date for filtering documents by updated_at + end_date: Optional end date for filtering documents by updated_at Returns: tuple: (sources_info, langchain_documents) @@ -2522,17 +2686,19 @@ class ConnectorService: bookstack_chunks = await self.chunk_retriever.hybrid_search( query_text=user_query, top_k=top_k, - user_id=user_id, search_space_id=search_space_id, document_type="BOOKSTACK_CONNECTOR", + start_date=start_date, + 
end_date=end_date, ) elif search_mode == SearchMode.DOCUMENTS: bookstack_chunks = await self.document_retriever.hybrid_search( query_text=user_query, top_k=top_k, - user_id=user_id, search_space_id=search_space_id, document_type="BOOKSTACK_CONNECTOR", + start_date=start_date, + end_date=end_date, ) # Transform document retriever results to match expected format bookstack_chunks = self._transform_document_results(bookstack_chunks)