feat: add language support across configurations and prompts

2026-06-24 21:38:09 +02:00 · 2025-10-12 13:13:42 +05:30 · 2025-10-12 13:13:42 +05:30 · 045537aa79
commit 045537aa79
parent 402039f02f
16 changed files with 242 additions and 18 deletions
--- a/surfsense_backend/app/agents/researcher/configuration.py
+++ b/surfsense_backend/app/agents/researcher/configuration.py
@ -37,6 +37,8 @@ class Configuration:
    search_mode: SearchMode
    research_mode: ResearchMode
    document_ids_to_add_in_context: list[int]
+    language: str | None = None  
+    

    @classmethod
    def from_runnable_config(
--- a/surfsense_backend/app/agents/researcher/nodes.py
+++ b/surfsense_backend/app/agents/researcher/nodes.py
@ -578,6 +578,7 @@ async def write_answer_outline(
    num_sections = configuration.num_sections
    user_id = configuration.user_id
    search_space_id = configuration.search_space_id
+    language = configuration.language  # Get language from configuration

    writer(
        {
@ -628,7 +629,7 @@ async def write_answer_outline(

    # Create messages for the LLM
    messages = [
-        SystemMessage(content=get_answer_outline_system_prompt()),
+        SystemMessage(content=get_answer_outline_system_prompt(language=language)),
        HumanMessage(content=human_message_content),
    ]

@ -2000,6 +2001,7 @@ async def handle_qna_workflow(
            "relevant_documents": all_documents,  # Use combined documents
            "user_id": configuration.user_id,
            "search_space_id": configuration.search_space_id,
+            "language": configuration.language,
        }
    }

--- a/surfsense_backend/app/agents/researcher/prompts.py
+++ b/surfsense_backend/app/agents/researcher/prompts.py
@ -1,9 +1,14 @@
 import datetime


-def get_answer_outline_system_prompt():
+def get_answer_outline_system_prompt(language: str | None = None) -> str:
+    language_instruction = ""
+    if language:
+        language_instruction = f"\n\nIMPORTANT: Please respond in {language} language. All your responses, explanations, and analysis should be written in {language}."
+
    return f"""
 Today's date: {datetime.datetime.now().strftime("%Y-%m-%d")}
+{language_instruction}
 <answer_outline_system>
 You are an expert research assistant specializing in structuring information. Your task is to create a detailed and logical research outline based on the user's query. This outline will serve as the blueprint for generating a comprehensive research report.

--- a/surfsense_backend/app/agents/researcher/qna_agent/configuration.py
+++ b/surfsense_backend/app/agents/researcher/qna_agent/configuration.py
@ -20,6 +20,7 @@ class Configuration:
    ]  # Documents provided directly to the agent for answering
    user_id: str  # User identifier
    search_space_id: int  # Search space identifier
+    language: str | None = None  # Language for responses

    @classmethod
    def from_runnable_config(
--- a/surfsense_backend/app/agents/researcher/qna_agent/nodes.py
+++ b/surfsense_backend/app/agents/researcher/qna_agent/nodes.py
@ -102,7 +102,7 @@ async def answer_question(state: State, config: RunnableConfig) -> dict[str, Any
    user_query = configuration.user_query
    user_id = configuration.user_id
    search_space_id = configuration.search_space_id
-
+    language = configuration.language 
    # Get user's fast LLM
    llm = await get_user_fast_llm(state.db_session, user_id, search_space_id)
    if not llm:
@ -127,7 +127,7 @@ async def answer_question(state: State, config: RunnableConfig) -> dict[str, Any
        """

        # Use initial system prompt for token calculation
-        initial_system_prompt = get_qna_citation_system_prompt(chat_history_str)
+        initial_system_prompt = get_qna_citation_system_prompt(chat_history_str, language)
        base_messages = [
            SystemMessage(content=initial_system_prompt),
            HumanMessage(content=base_human_message_template),
@ -146,9 +146,9 @@ async def answer_question(state: State, config: RunnableConfig) -> dict[str, Any

    # Choose system prompt based on final document availability
    system_prompt = (
-        get_qna_citation_system_prompt(chat_history_str)
+        get_qna_citation_system_prompt(chat_history_str, language)
        if has_documents
-        else get_qna_no_documents_system_prompt(chat_history_str)
+        else get_qna_no_documents_system_prompt(chat_history_str, language)
    )

    # Generate documents section
--- a/surfsense_backend/app/agents/researcher/qna_agent/prompts.py
+++ b/surfsense_backend/app/agents/researcher/qna_agent/prompts.py
@ -1,7 +1,7 @@
 import datetime


-def get_qna_citation_system_prompt(chat_history: str | None = None):
+def get_qna_citation_system_prompt(chat_history: str | None = None, language: str | None = None):
    chat_history_section = (
        f"""
 <chat_history>
@ -15,10 +15,15 @@ NO CHAT HISTORY PROVIDED
 </chat_history>
 """
    )
+    
+    # Add language instruction if specified
+    language_instruction = ""
+    if language:
+        language_instruction = f"\n\nIMPORTANT: Please respond in {language} language. All your responses, explanations, and analysis should be written in {language}."

    return f"""
 Today's date: {datetime.datetime.now().strftime("%Y-%m-%d")}
-You are SurfSense, an advanced AI research assistant that provides detailed, well-researched answers to user questions by synthesizing information from multiple personal knowledge sources.
+You are SurfSense, an advanced AI research assistant that provides detailed, well-researched answers to user questions by synthesizing information from multiple personal knowledge sources.{language_instruction}
 {chat_history_section}
 <knowledge_sources>
 - EXTENSION: "Web content saved via SurfSense browser extension" (personal browsing history)
@ -149,7 +154,7 @@ Make sure your response:
 """


-def get_qna_no_documents_system_prompt(chat_history: str | None = None):
+def get_qna_no_documents_system_prompt(chat_history: str | None = None, language: str | None = None):
    chat_history_section = (
        f"""
 <chat_history>
@ -163,10 +168,15 @@ NO CHAT HISTORY PROVIDED
 </chat_history>
 """
    )
+    
+    # Add language instruction if specified
+    language_instruction = ""
+    if language:
+        language_instruction = f"\n\nIMPORTANT: Please respond in {language} language. All your responses, explanations, and analysis should be written in {language}."

    return f"""
 Today's date: {datetime.datetime.now().strftime("%Y-%m-%d")}
-You are SurfSense, an advanced AI research assistant that provides helpful, detailed answers to user questions in a conversational manner.
+You are SurfSense, an advanced AI research assistant that provides helpful, detailed answers to user questions in a conversational manner.{language_instruction}
 {chat_history_section}
 <context>
 The user has asked a question but there are no specific documents from their personal knowledge base available to answer it. You should provide a helpful response based on:
--- a/surfsense_backend/app/agents/researcher/sub_section_writer/prompts.py
+++ b/surfsense_backend/app/agents/researcher/sub_section_writer/prompts.py
@ -1,7 +1,7 @@
 import datetime


-def get_citation_system_prompt(chat_history: str | None = None):
+def get_citation_system_prompt(chat_history: str | None = None, language: str | None = None):
    chat_history_section = (
        f"""
 <chat_history>
@ -15,10 +15,15 @@ NO CHAT HISTORY PROVIDED
 </chat_history>
 """
    )
+    
+    # Add language instruction if specified
+    language_instruction = ""
+    if language:
+        language_instruction = f"\n\nIMPORTANT: Please respond in {language} language. All your responses, explanations, and analysis should be written in {language}."

    return f"""
 Today's date: {datetime.datetime.now().strftime("%Y-%m-%d")}
-You are SurfSense, an advanced AI research assistant that synthesizes information from multiple knowledge sources to provide comprehensive, well-cited answers to user queries.
+You are SurfSense, an advanced AI research assistant that synthesizes information from multiple knowledge sources to provide comprehensive, well-cited answers to user queries.{language_instruction}
 {chat_history_section}
 <knowledge_sources>
 - EXTENSION: "Web content saved via SurfSense browser extension" (personal browsing history)
@ -156,7 +161,7 @@ Make sure your response:
 """


-def get_no_documents_system_prompt(chat_history: str | None = None):
+def get_no_documents_system_prompt(chat_history: str | None = None, language: str | None = None):
    chat_history_section = (
        f"""
 <chat_history>
@ -170,10 +175,15 @@ NO CHAT HISTORY PROVIDED
 </chat_history>
 """
    )
+    
+    # Add language instruction if specified
+    language_instruction = ""
+    if language:
+        language_instruction = f"\n\nIMPORTANT: Please respond in {language} language. All your responses, explanations, and analysis should be written in {language}."

    return f"""
 Today's date: {datetime.datetime.now().strftime("%Y-%m-%d")}
-You are SurfSense, an advanced AI research assistant that helps users create well-structured content for their documents and research.
+You are SurfSense, an advanced AI research assistant that helps users create well-structured content for their documents and research.{language_instruction}
 {chat_history_section}
 <context>
 You are writing content for a specific sub-section of a document. No specific documents from the user's personal knowledge base are available, so you should create content based on:
--- a/surfsense_backend/app/db.py
+++ b/surfsense_backend/app/db.py
@ -296,6 +296,8 @@ class LLMConfig(BaseModel, TimestampMixin):
    api_key = Column(String, nullable=False)
    api_base = Column(String(500), nullable=True)

+    language = Column(String(50), nullable=True, default="English")
+
    # For any other parameters that litellm supports
    litellm_params = Column(JSON, nullable=True, default={})

--- a/surfsense_backend/app/routes/chats_routes.py
+++ b/surfsense_backend/app/routes/chats_routes.py
@ -4,8 +4,10 @@ from langchain.schema import AIMessage, HumanMessage
 from sqlalchemy.exc import IntegrityError, OperationalError
 from sqlalchemy.ext.asyncio import AsyncSession
 from sqlalchemy.future import select
+from sqlalchemy.orm import selectinload

-from app.db import Chat, SearchSpace, User, get_async_session
+
+from app.db import Chat, SearchSpace, User, UserSearchSpacePreference, get_async_session
 from app.schemas import (
    AISDKChatRequest,
    ChatCreate,
@ -53,21 +55,60 @@ async def handle_chat_data(
        request_data.get("document_ids_to_add_in_context")
    )
    search_mode_str = validate_search_mode(request_data.get("search_mode"))
+    # print("REQUEST DATA:", request_data)
+    # print("SELECTED CONNECTORS:", selected_connectors)

    # Check if the search space belongs to the current user
    try:
        await check_ownership(session, SearchSpace, search_space_id, user)
+        language_result = await session.execute(
+            select(UserSearchSpacePreference)
+            .options(
+                selectinload(UserSearchSpacePreference.search_space).selectinload(SearchSpace.llm_configs),
+                selectinload(UserSearchSpacePreference.long_context_llm),
+                selectinload(UserSearchSpacePreference.fast_llm),
+                selectinload(UserSearchSpacePreference.strategic_llm)
+            )
+            .filter(
+                UserSearchSpacePreference.search_space_id == search_space_id, 
+                UserSearchSpacePreference.user_id == user.id
+            )
+        )
+        user_preference = language_result.scalars().first()
+        print("UserSearchSpacePreference:", user_preference)
+        
+        language = None
+        if user_preference and user_preference.search_space and user_preference.search_space.llm_configs:
+            llm_configs = user_preference.search_space.llm_configs
+            # print(f"Found {len(llm_configs)} LLM Configs")
+            # for i, config in enumerate(llm_configs):
+                # print(f"  Config {i+1}: name={config.name}, provider={config.provider}, language={getattr(config, 'language', None)}")
+            
+            
+            for preferred_llm in [user_preference.fast_llm, user_preference.long_context_llm, user_preference.strategic_llm]:
+                if preferred_llm and getattr(preferred_llm, 'language', None):
+                    language = preferred_llm.language
+                    # print(f"Using language from preferred LLM: {preferred_llm.name} -> {language}")
+                    break
+            
+            # If no preferred LLM has a language set, fall back to the first available LLM config
+            if not language:
+                first_llm_config = llm_configs[0]
+                language = getattr(first_llm_config, 'language', None)
+                # print(f"Using language from first LLM config: {first_llm_config.name} -> {language}")
+            
    except HTTPException:
        raise HTTPException(
            status_code=403, detail="You don't have access to this search space"
        ) from None
-
+    # print("Language selected:", language)
    langchain_chat_history = []
    for message in messages[:-1]:
        if message["role"] == "user":
            langchain_chat_history.append(HumanMessage(content=message["content"]))
        elif message["role"] == "assistant":
            langchain_chat_history.append(AIMessage(content=message["content"]))
+    

    response = StreamingResponse(
        stream_connector_search_results(
@ -80,6 +121,7 @@ async def handle_chat_data(
            langchain_chat_history,
            search_mode_str,
            document_ids_to_add_in_context,
+            language, 
        )
    )

--- a/surfsense_backend/app/routes/llm_config_routes.py
+++ b/surfsense_backend/app/routes/llm_config_routes.py
@ -299,7 +299,10 @@ async def update_user_llm_preferences(

        # Validate that all provided LLM config IDs belong to the search space
        update_data = preferences.model_dump(exclude_unset=True)
-
+        
+        # Store language from configs to validate consistency
+        languages = set()
+        
        for _key, llm_config_id in update_data.items():
            if llm_config_id is not None:
                # Verify the LLM config belongs to the search space
@ -315,6 +318,16 @@ async def update_user_llm_preferences(
                        status_code=404,
                        detail=f"LLM configuration {llm_config_id} not found in this search space",
                    )
+                
+                # Collect language for consistency check
+                languages.add(llm_config.language)
+        
+        # Check if all selected LLM configs have the same language
+        if len(languages) > 1:
+            raise HTTPException(
+                status_code=400,
+                detail="All selected LLM configurations must have the same language setting",
+            )

        # Update user preferences
        for key, value in update_data.items():
--- a/surfsense_backend/app/schemas/llm_config.py
+++ b/surfsense_backend/app/schemas/llm_config.py
@ -26,6 +26,9 @@ class LLMConfigBase(BaseModel):
    litellm_params: dict[str, Any] | None = Field(
        default=None, description="Additional LiteLLM parameters"
    )
+    language: str | None = Field(
+        default="English", max_length=50, description="Language for the LLM"
+    )


 class LLMConfigCreate(LLMConfigBase):
@ -49,6 +52,9 @@ class LLMConfigUpdate(BaseModel):
    api_base: str | None = Field(
        None, max_length=500, description="Optional API base URL"
    )
+    language: str | None = Field(
+        None, max_length=50, description="Language for the LLM"
+    )
    litellm_params: dict[str, Any] | None = Field(
        None, description="Additional LiteLLM parameters"
    )
--- a/surfsense_backend/app/tasks/stream_connector_search_results.py
+++ b/surfsense_backend/app/tasks/stream_connector_search_results.py
@ -20,6 +20,7 @@ async def stream_connector_search_results(
    langchain_chat_history: list[Any],
    search_mode_str: str,
    document_ids_to_add_in_context: list[int],
+    language: str | None = None,
 ) -> AsyncGenerator[str, None]:
    """
    Stream connector search results to the client
@ -66,8 +67,10 @@ async def stream_connector_search_results(
            "search_mode": search_mode,
            "research_mode": research_mode,
            "document_ids_to_add_in_context": document_ids_to_add_in_context,
+            "language": language,  # Add language to the configuration
        }
    }
+    # print(f"Researcher configuration: {config['configurable']}")  # Debug print
    # Initialize state with database session and streaming service
    initial_state = State(
        db_session=session,