refactor: optimized document handling and added token management in Q&A and sub-section writing agents

2026-04-26 17:26:23 +02:00 · 2025-06-05 20:33:09 -07:00 · 2025-06-05 20:33:09 -07:00 · a22228f36e
commit a22228f36e
parent 051580d145
3 changed files with 323 additions and 89 deletions
--- a/surfsense_backend/app/agents/researcher/utils.py
+++ b/surfsense_backend/app/agents/researcher/utils.py
@ -0,0 +1,185 @@
+from typing import List, Dict, Any, Tuple, NamedTuple
+from langchain_core.messages import BaseMessage
+from litellm import token_counter, get_model_info
+from app.config import config as app_config
+
+
+class DocumentTokenInfo(NamedTuple):
+    """Information about a document and its token cost."""
+    index: int
+    document: Dict[str, Any]
+    formatted_content: str
+    token_count: int
+
+
+def convert_langchain_messages_to_dict(messages: List[BaseMessage]) -> List[Dict[str, str]]:
+    """Convert LangChain messages to format expected by token_counter."""
+    role_mapping = {
+        'system': 'system',
+        'human': 'user',
+        'ai': 'assistant'
+    }
+
+    converted_messages = []
+    for msg in messages:
+        role = role_mapping.get(getattr(msg, 'type', None), 'user')
+        converted_messages.append({
+            "role": role,
+            "content": str(msg.content)
+        })
+
+    return converted_messages
+
+
+def format_document_for_citation(document: Dict[str, Any]) -> str:
+    """Format a single document for citation in the standard XML format."""
+    content = document.get("content", "")
+    doc_info = document.get("document", {})
+    document_id = doc_info.get("id", "")
+    document_type = doc_info.get("document_type", "CRAWLED_URL")
+
+    return f"""<document>
+    <metadata>
+        <source_id>{document_id}</source_id>
+        <source_type>{document_type}</source_type>
+    </metadata>
+    <content>
+        {content}
+    </content>
+    </document>"""
+
+
+def format_documents_section(documents: List[Dict[str, Any]], section_title: str = "Source material") -> str:
+    """Format multiple documents into a complete documents section."""
+    if not documents:
+        return ""
+
+    formatted_docs = [format_document_for_citation(doc) for doc in documents]
+
+    return f"""{section_title}:
+    <documents>
+    {chr(10).join(formatted_docs)}
+    </documents>"""
+
+
+def calculate_document_token_costs(documents: List[Dict[str, Any]], model: str) -> List[DocumentTokenInfo]:
+    """Pre-calculate token costs for each document."""
+    document_token_info = []
+
+    for i, doc in enumerate(documents):
+        formatted_doc = format_document_for_citation(doc)
+
+        # Calculate token count for this document
+        token_count = token_counter(
+            messages=[{"role": "user", "content": formatted_doc}],
+            model=model
+        )
+
+        document_token_info.append(DocumentTokenInfo(
+            index=i,
+            document=doc,
+            formatted_content=formatted_doc,
+            token_count=token_count
+        ))
+
+    return document_token_info
+
+
+def find_optimal_documents_with_binary_search(
+    document_tokens: List[DocumentTokenInfo],
+    available_tokens: int
+) -> List[DocumentTokenInfo]:
+    """Use binary search to find the maximum number of documents that fit within token limit."""
+    if not document_tokens or available_tokens <= 0:
+        return []
+
+    left, right = 0, len(document_tokens)
+    optimal_docs = []
+
+    while left <= right:
+        mid = (left + right) // 2
+        current_docs = document_tokens[:mid]
+        current_token_sum = sum(
+            doc_info.token_count for doc_info in current_docs)
+
+        if current_token_sum <= available_tokens:
+            optimal_docs = current_docs
+            left = mid + 1
+        else:
+            right = mid - 1
+
+    return optimal_docs
+
+
+def get_model_context_window(model_name: str) -> int:
+    """Get the total context window size for a model (input + output tokens)."""
+    try:
+        model_info = get_model_info(model_name)
+        context_window = model_info.get(
+            'max_input_tokens', 4096)  # Default fallback
+        return context_window
+    except Exception as e:
+        print(
+            f"Warning: Could not get model info for {model_name}, using default 4096 tokens. Error: {e}")
+        return 4096  # Conservative fallback
+
+
+def optimize_documents_for_token_limit(
+    documents: List[Dict[str, Any]],
+    base_messages: List[BaseMessage],
+    model_name: str = None
+) -> Tuple[List[Dict[str, Any]], bool]:
+    """
+    Optimize documents to fit within token limits using binary search.
+
+    Args:
+        documents: List of documents with content and metadata
+        base_messages: Base messages without documents (chat history + system + human message template)
+        model_name: Model name for token counting (defaults to app_config.FAST_LLM)
+        output_token_buffer: Number of tokens to reserve for model output
+
+    Returns:
+        Tuple of (optimized_documents, has_documents_remaining)
+    """
+    if not documents:
+        return [], False
+
+    model = model_name or app_config.FAST_LLM
+    context_window = get_model_context_window(model)
+
+    # Calculate base token cost
+    base_messages_dict = convert_langchain_messages_to_dict(base_messages)
+    base_tokens = token_counter(messages=base_messages_dict, model=model)
+    available_tokens_for_docs = context_window - base_tokens
+
+    print(
+        f"Token optimization: Context window={context_window}, Base={base_tokens}, Available for docs={available_tokens_for_docs}")
+
+    if available_tokens_for_docs <= 0:
+        print("No tokens available for documents after base content and output buffer")
+        return [], False
+
+    # Calculate token costs for all documents
+    document_token_info = calculate_document_token_costs(documents, model)
+
+    # Find optimal number of documents using binary search
+    optimal_doc_info = find_optimal_documents_with_binary_search(
+        document_token_info,
+        available_tokens_for_docs
+    )
+
+    # Extract the original document objects
+    optimized_documents = [doc_info.document for doc_info in optimal_doc_info]
+    has_documents_remaining = len(optimized_documents) > 0
+
+    print(
+        f"Token optimization result: Using {len(optimized_documents)}/{len(documents)} documents")
+
+    return optimized_documents, has_documents_remaining
+
+
+def calculate_token_count(messages: List[BaseMessage], model_name: str = None) -> int:
+    """Calculate token count for a list of LangChain messages."""
+    model = model_name or app_config.FAST_LLM
+    messages_dict = convert_langchain_messages_to_dict(messages)
+    return token_counter(messages=messages_dict, model=model)