feat: add language support across configurations and prompts

This commit is contained in:
Tarun 2025-10-12 13:13:42 +05:30
parent 402039f02f
commit 045537aa79
16 changed files with 242 additions and 18 deletions

View file

@ -37,6 +37,8 @@ class Configuration:
search_mode: SearchMode
research_mode: ResearchMode
document_ids_to_add_in_context: list[int]
language: str | None = None
@classmethod
def from_runnable_config(

View file

@ -578,6 +578,7 @@ async def write_answer_outline(
num_sections = configuration.num_sections
user_id = configuration.user_id
search_space_id = configuration.search_space_id
language = configuration.language # Get language from configuration
writer(
{
@ -628,7 +629,7 @@ async def write_answer_outline(
# Create messages for the LLM
messages = [
SystemMessage(content=get_answer_outline_system_prompt()),
SystemMessage(content=get_answer_outline_system_prompt(language=language)),
HumanMessage(content=human_message_content),
]
@ -2000,6 +2001,7 @@ async def handle_qna_workflow(
"relevant_documents": all_documents, # Use combined documents
"user_id": configuration.user_id,
"search_space_id": configuration.search_space_id,
"language": configuration.language,
}
}

View file

@ -1,9 +1,14 @@
import datetime
def get_answer_outline_system_prompt():
def get_answer_outline_system_prompt(language: str | None = None) -> str:
language_instruction = ""
if language:
language_instruction = f"\n\nIMPORTANT: Please respond in {language} language. All your responses, explanations, and analysis should be written in {language}."
return f"""
Today's date: {datetime.datetime.now().strftime("%Y-%m-%d")}
{language_instruction}
<answer_outline_system>
You are an expert research assistant specializing in structuring information. Your task is to create a detailed and logical research outline based on the user's query. This outline will serve as the blueprint for generating a comprehensive research report.

View file

@ -20,6 +20,7 @@ class Configuration:
] # Documents provided directly to the agent for answering
user_id: str # User identifier
search_space_id: int # Search space identifier
language: str | None = None # Language for responses
@classmethod
def from_runnable_config(

View file

@ -102,7 +102,7 @@ async def answer_question(state: State, config: RunnableConfig) -> dict[str, Any
user_query = configuration.user_query
user_id = configuration.user_id
search_space_id = configuration.search_space_id
language = configuration.language
# Get user's fast LLM
llm = await get_user_fast_llm(state.db_session, user_id, search_space_id)
if not llm:
@ -127,7 +127,7 @@ async def answer_question(state: State, config: RunnableConfig) -> dict[str, Any
"""
# Use initial system prompt for token calculation
initial_system_prompt = get_qna_citation_system_prompt(chat_history_str)
initial_system_prompt = get_qna_citation_system_prompt(chat_history_str, language)
base_messages = [
SystemMessage(content=initial_system_prompt),
HumanMessage(content=base_human_message_template),
@ -146,9 +146,9 @@ async def answer_question(state: State, config: RunnableConfig) -> dict[str, Any
# Choose system prompt based on final document availability
system_prompt = (
get_qna_citation_system_prompt(chat_history_str)
get_qna_citation_system_prompt(chat_history_str, language)
if has_documents
else get_qna_no_documents_system_prompt(chat_history_str)
else get_qna_no_documents_system_prompt(chat_history_str, language)
)
# Generate documents section

View file

@ -1,7 +1,7 @@
import datetime
def get_qna_citation_system_prompt(chat_history: str | None = None):
def get_qna_citation_system_prompt(chat_history: str | None = None, language: str | None = None):
chat_history_section = (
f"""
<chat_history>
@ -15,10 +15,15 @@ NO CHAT HISTORY PROVIDED
</chat_history>
"""
)
# Add language instruction if specified
language_instruction = ""
if language:
language_instruction = f"\n\nIMPORTANT: Please respond in {language} language. All your responses, explanations, and analysis should be written in {language}."
return f"""
Today's date: {datetime.datetime.now().strftime("%Y-%m-%d")}
You are SurfSense, an advanced AI research assistant that provides detailed, well-researched answers to user questions by synthesizing information from multiple personal knowledge sources.
You are SurfSense, an advanced AI research assistant that provides detailed, well-researched answers to user questions by synthesizing information from multiple personal knowledge sources.{language_instruction}
{chat_history_section}
<knowledge_sources>
- EXTENSION: "Web content saved via SurfSense browser extension" (personal browsing history)
@ -149,7 +154,7 @@ Make sure your response:
"""
def get_qna_no_documents_system_prompt(chat_history: str | None = None):
def get_qna_no_documents_system_prompt(chat_history: str | None = None, language: str | None = None):
chat_history_section = (
f"""
<chat_history>
@ -163,10 +168,15 @@ NO CHAT HISTORY PROVIDED
</chat_history>
"""
)
# Add language instruction if specified
language_instruction = ""
if language:
language_instruction = f"\n\nIMPORTANT: Please respond in {language} language. All your responses, explanations, and analysis should be written in {language}."
return f"""
Today's date: {datetime.datetime.now().strftime("%Y-%m-%d")}
You are SurfSense, an advanced AI research assistant that provides helpful, detailed answers to user questions in a conversational manner.
You are SurfSense, an advanced AI research assistant that provides helpful, detailed answers to user questions in a conversational manner.{language_instruction}
{chat_history_section}
<context>
The user has asked a question but there are no specific documents from their personal knowledge base available to answer it. You should provide a helpful response based on:

View file

@ -1,7 +1,7 @@
import datetime
def get_citation_system_prompt(chat_history: str | None = None):
def get_citation_system_prompt(chat_history: str | None = None, language: str | None = None):
chat_history_section = (
f"""
<chat_history>
@ -15,10 +15,15 @@ NO CHAT HISTORY PROVIDED
</chat_history>
"""
)
# Add language instruction if specified
language_instruction = ""
if language:
language_instruction = f"\n\nIMPORTANT: Please respond in {language} language. All your responses, explanations, and analysis should be written in {language}."
return f"""
Today's date: {datetime.datetime.now().strftime("%Y-%m-%d")}
You are SurfSense, an advanced AI research assistant that synthesizes information from multiple knowledge sources to provide comprehensive, well-cited answers to user queries.
You are SurfSense, an advanced AI research assistant that synthesizes information from multiple knowledge sources to provide comprehensive, well-cited answers to user queries.{language_instruction}
{chat_history_section}
<knowledge_sources>
- EXTENSION: "Web content saved via SurfSense browser extension" (personal browsing history)
@ -156,7 +161,7 @@ Make sure your response:
"""
def get_no_documents_system_prompt(chat_history: str | None = None):
def get_no_documents_system_prompt(chat_history: str | None = None, language: str | None = None):
chat_history_section = (
f"""
<chat_history>
@ -170,10 +175,15 @@ NO CHAT HISTORY PROVIDED
</chat_history>
"""
)
# Add language instruction if specified
language_instruction = ""
if language:
language_instruction = f"\n\nIMPORTANT: Please respond in {language} language. All your responses, explanations, and analysis should be written in {language}."
return f"""
Today's date: {datetime.datetime.now().strftime("%Y-%m-%d")}
You are SurfSense, an advanced AI research assistant that helps users create well-structured content for their documents and research.
You are SurfSense, an advanced AI research assistant that helps users create well-structured content for their documents and research.{language_instruction}
{chat_history_section}
<context>
You are writing content for a specific sub-section of a document. No specific documents from the user's personal knowledge base are available, so you should create content based on:

View file

@ -296,6 +296,8 @@ class LLMConfig(BaseModel, TimestampMixin):
api_key = Column(String, nullable=False)
api_base = Column(String(500), nullable=True)
language = Column(String(50), nullable=True, default="English")
# For any other parameters that litellm supports
litellm_params = Column(JSON, nullable=True, default={})

View file

@ -4,8 +4,10 @@ from langchain.schema import AIMessage, HumanMessage
from sqlalchemy.exc import IntegrityError, OperationalError
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select
from sqlalchemy.orm import selectinload
from app.db import Chat, SearchSpace, User, get_async_session
from app.db import Chat, SearchSpace, User, UserSearchSpacePreference, get_async_session
from app.schemas import (
AISDKChatRequest,
ChatCreate,
@ -53,21 +55,60 @@ async def handle_chat_data(
request_data.get("document_ids_to_add_in_context")
)
search_mode_str = validate_search_mode(request_data.get("search_mode"))
# print("REQUEST DATA:", request_data)
# print("SELECTED CONNECTORS:", selected_connectors)
# Check if the search space belongs to the current user
try:
await check_ownership(session, SearchSpace, search_space_id, user)
language_result = await session.execute(
select(UserSearchSpacePreference)
.options(
selectinload(UserSearchSpacePreference.search_space).selectinload(SearchSpace.llm_configs),
selectinload(UserSearchSpacePreference.long_context_llm),
selectinload(UserSearchSpacePreference.fast_llm),
selectinload(UserSearchSpacePreference.strategic_llm)
)
.filter(
UserSearchSpacePreference.search_space_id == search_space_id,
UserSearchSpacePreference.user_id == user.id
)
)
user_preference = language_result.scalars().first()
print("UserSearchSpacePreference:", user_preference)
language = None
if user_preference and user_preference.search_space and user_preference.search_space.llm_configs:
llm_configs = user_preference.search_space.llm_configs
# print(f"Found {len(llm_configs)} LLM Configs")
# for i, config in enumerate(llm_configs):
# print(f" Config {i+1}: name={config.name}, provider={config.provider}, language={getattr(config, 'language', None)}")
for preferred_llm in [user_preference.fast_llm, user_preference.long_context_llm, user_preference.strategic_llm]:
if preferred_llm and getattr(preferred_llm, 'language', None):
language = preferred_llm.language
# print(f"Using language from preferred LLM: {preferred_llm.name} -> {language}")
break
# No preferred LLM has a language set; fall back to the first available LLM config
if not language:
first_llm_config = llm_configs[0]
language = getattr(first_llm_config, 'language', None)
# print(f"Using language from first LLM config: {first_llm_config.name} -> {language}")
except HTTPException:
raise HTTPException(
status_code=403, detail="You don't have access to this search space"
) from None
# print("Language selected:", language)
langchain_chat_history = []
for message in messages[:-1]:
if message["role"] == "user":
langchain_chat_history.append(HumanMessage(content=message["content"]))
elif message["role"] == "assistant":
langchain_chat_history.append(AIMessage(content=message["content"]))
response = StreamingResponse(
stream_connector_search_results(
@ -80,6 +121,7 @@ async def handle_chat_data(
langchain_chat_history,
search_mode_str,
document_ids_to_add_in_context,
language,
)
)

View file

@ -299,7 +299,10 @@ async def update_user_llm_preferences(
# Validate that all provided LLM config IDs belong to the search space
update_data = preferences.model_dump(exclude_unset=True)
# Store language from configs to validate consistency
languages = set()
for _key, llm_config_id in update_data.items():
if llm_config_id is not None:
# Verify the LLM config belongs to the search space
@ -315,6 +318,16 @@ async def update_user_llm_preferences(
status_code=404,
detail=f"LLM configuration {llm_config_id} not found in this search space",
)
# Collect language for consistency check
languages.add(llm_config.language)
# Check if all selected LLM configs have the same language
if len(languages) > 1:
raise HTTPException(
status_code=400,
detail="All selected LLM configurations must have the same language setting",
)
# Update user preferences
for key, value in update_data.items():

View file

@ -26,6 +26,9 @@ class LLMConfigBase(BaseModel):
litellm_params: dict[str, Any] | None = Field(
default=None, description="Additional LiteLLM parameters"
)
language: str | None = Field(
default="English", max_length=50, description="Language for the LLM"
)
class LLMConfigCreate(LLMConfigBase):
@ -49,6 +52,9 @@ class LLMConfigUpdate(BaseModel):
api_base: str | None = Field(
None, max_length=500, description="Optional API base URL"
)
language: str | None = Field(
None, max_length=50, description="Language for the LLM"
)
litellm_params: dict[str, Any] | None = Field(
None, description="Additional LiteLLM parameters"
)

View file

@ -20,6 +20,7 @@ async def stream_connector_search_results(
langchain_chat_history: list[Any],
search_mode_str: str,
document_ids_to_add_in_context: list[int],
language: str | None = None,
) -> AsyncGenerator[str, None]:
"""
Stream connector search results to the client
@ -66,8 +67,10 @@ async def stream_connector_search_results(
"search_mode": search_mode,
"research_mode": research_mode,
"document_ids_to_add_in_context": document_ids_to_add_in_context,
"language": language, # Add language to the configuration
}
}
# print(f"Researcher configuration: {config['configurable']}") # Debug print
# Initialize state with database session and streaming service
initial_state = State(
db_session=session,