feat: Introduce the RAPTOR Search.

This commit is contained in:
DESKTOP-RTLN3BA\$punk 2025-05-11 23:04:48 -07:00
parent d3540d8cc5
commit fbbb3294f4
11 changed files with 318 additions and 127 deletions

View file

@ -3,10 +3,16 @@
from __future__ import annotations
from dataclasses import dataclass, fields
from enum import Enum
from typing import Optional, List, Any
from langchain_core.runnables import RunnableConfig
class SearchMode(Enum):
    """Enum defining the type of search mode.

    Each member's value is the member's own name as an upper-case string.
    """

    # NOTE(review): presumably selects chunk-level retrieval — confirm
    # against the search implementation that consumes this value.
    CHUNKS = "CHUNKS"
    # NOTE(review): presumably selects whole-document retrieval — confirm
    # against the search implementation that consumes this value.
    DOCUMENTS = "DOCUMENTS"
@dataclass(kw_only=True)
class Configuration:
@ -18,6 +24,7 @@ class Configuration:
connectors_to_search: List[str]
user_id: str
search_space_id: int
search_mode: SearchMode
@classmethod

View file

@ -10,7 +10,7 @@ from langchain_core.runnables import RunnableConfig
from pydantic import BaseModel, Field
from sqlalchemy.ext.asyncio import AsyncSession
from .configuration import Configuration
from .configuration import Configuration, SearchMode
from .prompts import get_answer_outline_system_prompt
from .state import State
from .sub_section_writer.graph import graph as sub_section_writer_graph
@ -149,7 +149,8 @@ async def fetch_relevant_documents(
writer: StreamWriter = None,
state: State = None,
top_k: int = 10,
connector_service: ConnectorService = None
connector_service: ConnectorService = None,
search_mode: SearchMode = SearchMode.CHUNKS
) -> List[Dict[str, Any]]:
"""
Fetch relevant documents for research questions using the provided connectors.
@ -213,7 +214,8 @@ async def fetch_relevant_documents(
user_query=reformulated_query,
user_id=user_id,
search_space_id=search_space_id,
top_k=top_k
top_k=top_k,
search_mode=search_mode
)
# Add to sources and raw documents
@ -231,7 +233,8 @@ async def fetch_relevant_documents(
user_query=reformulated_query,
user_id=user_id,
search_space_id=search_space_id,
top_k=top_k
top_k=top_k,
search_mode=search_mode
)
# Add to sources and raw documents
@ -249,7 +252,8 @@ async def fetch_relevant_documents(
user_query=reformulated_query,
user_id=user_id,
search_space_id=search_space_id,
top_k=top_k
top_k=top_k,
search_mode=search_mode
)
# Add to sources and raw documents
@ -267,7 +271,8 @@ async def fetch_relevant_documents(
user_query=reformulated_query,
user_id=user_id,
search_space_id=search_space_id,
top_k=top_k
top_k=top_k,
search_mode=search_mode
)
# Add to sources and raw documents
@ -286,7 +291,8 @@ async def fetch_relevant_documents(
user_query=reformulated_query,
user_id=user_id,
search_space_id=search_space_id,
top_k=top_k
top_k=top_k,
search_mode=search_mode
)
# Add to sources and raw documents
@ -304,7 +310,8 @@ async def fetch_relevant_documents(
user_query=reformulated_query,
user_id=user_id,
search_space_id=search_space_id,
top_k=top_k
top_k=top_k,
search_mode=search_mode
)
# Add to sources and raw documents
@ -322,7 +329,8 @@ async def fetch_relevant_documents(
user_query=reformulated_query,
user_id=user_id,
search_space_id=search_space_id,
top_k=top_k
top_k=top_k,
search_mode=search_mode
)
# Add to sources and raw documents
@ -340,7 +348,8 @@ async def fetch_relevant_documents(
user_query=reformulated_query,
user_id=user_id,
search_space_id=search_space_id,
top_k=top_k
top_k=top_k,
search_mode=search_mode
)
# Add to sources and raw documents
@ -558,7 +567,8 @@ async def process_sections(state: State, config: RunnableConfig, writer: StreamW
writer=writer,
state=state,
top_k=TOP_K,
connector_service=connector_service
connector_service=connector_service,
search_mode=configuration.search_mode
)
except Exception as e:
error_message = f"Error fetching relevant documents: {str(e)}"

View file

@ -141,6 +141,11 @@ async def write_sub_section(state: State, config: RunnableConfig) -> Dict[str, A
# Construct a clear, structured query for the LLM
human_message_content = f"""
Source material:
<documents>
{documents_text}
</documents>
Now user's query is:
<user_query>
{user_query}
@ -158,11 +163,6 @@ async def write_sub_section(state: State, config: RunnableConfig) -> Dict[str, A
<guiding_questions>
{questions_text}
</guiding_questions>
Use the provided documents as your source material and cite them properly using the IEEE citation format [X] where X is the source_id.
<documents>
{documents_text}
</documents>
"""
# Create messages for the LLM

View file

@ -25,6 +25,8 @@ You are a research assistant tasked with analyzing documents and providing compr
16. CRITICAL: Citations must ONLY appear as [X] or [X], [Y], [Z] format - never with parentheses, hyperlinks, or other formatting.
17. CRITICAL: Never make up citation numbers. Only use source_id values that are explicitly provided in the document metadata.
18. CRITICAL: If you are unsure about a source_id, do not include a citation rather than guessing or making one up.
19. CRITICAL: Focus only on answering the user's query. Any guiding questions provided are for your thinking process only and should not be mentioned in your response.
20. CRITICAL: Ensure your response aligns with the provided sub-section title and section position.
</instructions>
<format>
@ -37,6 +39,8 @@ You are a research assistant tasked with analyzing documents and providing compr
- NEVER create your own citation numbering system - use the exact source_id values from the documents.
- NEVER format citations as clickable links or as markdown links like "([1](https://example.com))". Always use plain square brackets only.
- NEVER make up citation numbers if you are unsure about the source_id. It is better to omit the citation than to guess.
- NEVER include or mention the guiding questions in your response. They are only to help guide your thinking.
- ALWAYS focus on answering the user's query directly from the information in the documents.
</format>
<input_example>
@ -84,4 +88,21 @@ ONLY use plain square brackets [1] or multiple citations [1], [2], [3]
</incorrect_citation_formats>
Note that the citation numbers match exactly with the source_id values (1, 13, and 21) and are not renumbered sequentially. Citations follow IEEE style with square brackets and appear at the end of sentences.
<user_query_instructions>
When you see a user query like:
<user_query>
Give all linear issues.
</user_query>
Focus exclusively on answering this query using information from the provided documents.
If guiding questions are provided in a <guiding_questions> section, use them only to guide your thinking process. Do not mention or list these questions in your response.
Make sure your response:
1. Directly answers the user's query
2. Fits the provided sub-section title and section position
3. Uses proper citations for all information from documents
4. Is well-structured and professional in tone
</user_query_instructions>
"""