# SurfSense/surfsense_backend/app/services/llm_service.py

import asyncio
import logging

import litellm
from langchain_core.messages import HumanMessage
from langchain_litellm import ChatLiteLLM
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select
from sqlalchemy.orm import selectinload

from app.config import config
from app.db import Model, SearchSpace
from app.services.auto_model_pin_service import (
    auto_model_candidates,
    choose_auto_model_candidate,
)
from app.services.llm_router_service import (
    AUTO_MODE_ID,
    ChatLiteLLMRouter,
    is_auto_mode,
)
from app.services.model_capabilities import has_capability
from app.services.model_resolver import native_connection_from_config, to_litellm
from app.services.token_tracking_service import token_tracker

# NOTE: everything below runs at import time and mutates process-wide
# ``litellm`` module attributes, so importing this module changes LiteLLM
# behavior for every caller in the process.

# Configure litellm to automatically drop unsupported parameters
litellm.drop_params = True

# Memory controls: prevent unbounded internal accumulation
litellm.telemetry = False
litellm.cache = None
litellm.failure_callback = []
litellm.input_callback = []

# Replace the callback list with a single entry: the project's token tracker.
litellm.callbacks = [token_tracker]

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


# Providers that require an interactive OAuth / device-flow login before
# issuing any completion. LiteLLM implements these with blocking sync polling
# (requests + time.sleep), which would freeze the FastAPI event loop if
# invoked from validation. They are never usable from a headless backend,
# so we reject them at the edge.
_INTERACTIVE_AUTH_PROVIDERS: frozenset[str] = frozenset(
    {
        "github_copilot",
        "github-copilot",
        "githubcopilot",
        "copilot",
    }
)

# Hard upper bound for a single validation call. Must exceed the ChatLiteLLM
# request timeout (30s) by a small margin so a well-behaved provider never
# trips the watchdog, while any pathological/blocking provider is killed.
_VALIDATION_TIMEOUT_SECONDS: float = 35.0


def _is_interactive_auth_provider(
    provider: str | None, custom_provider: str | None
) -> bool:
    """Return True if the given provider triggers interactive OAuth in LiteLLM."""
    for raw in (custom_provider, provider):
        if not raw:
            continue
        normalized = raw.strip().lower().replace(" ", "_")
        if normalized in _INTERACTIVE_AUTH_PROVIDERS:
            return True
    return False


def _legacy_config_connection(
    *,
    provider: str,
    model_name: str,
    api_key: str | None,
    api_base: str | None,
    custom_provider: str | None = None,
    litellm_params: dict | None = None,
    api_version: str | None = None,
) -> tuple[str, dict]:
    """Turn flat, legacy-style LLM settings into LiteLLM call inputs.

    The settings are packed into a config dict (with the provider lower-cased
    and a missing ``litellm_params`` replaced by an empty dict), converted
    with ``native_connection_from_config`` and then handed to ``to_litellm``
    together with ``model_name``.

    Args:
        provider: Provider name; lower-cased before use.
        model_name: Model identifier.
        api_key: API key, if any.
        api_base: Custom API base URL, if any.
        custom_provider: Optional custom provider string.
        litellm_params: Optional extra LiteLLM parameters.
        api_version: Optional API version string.

    Returns:
        Whatever ``to_litellm`` returns for the derived connection; callers
        in this module unpack it as ``(model_string, kwargs)``.
    """
    legacy_cfg = dict(
        provider=provider.lower(),
        model_name=model_name,
        api_key=api_key,
        api_base=api_base,
        custom_provider=custom_provider,
        api_version=api_version,
        litellm_params=litellm_params if litellm_params else {},
    )
    return to_litellm(native_connection_from_config(legacy_cfg), model_name)


class LLMRole:
    """Namespace of role identifiers understood by the LLM lookup helpers."""

    # Role used for agent/chat operations.
    AGENT = "agent"


def get_global_llm_config(llm_config_id: int) -> dict | None:
    """
    Get a global LLM configuration by ID.
    Global configs have negative IDs. Auto mode (ID 0) is resolved through the
    model-candidate pipeline, not this legacy config lookup.

    Args:
        llm_config_id: The ID of the global config (must be negative)

    Returns:
        dict: Global config dictionary or None if not found
    """
    if llm_config_id >= 0:
        return None

    for cfg in config.GLOBAL_LLM_CONFIGS:
        if cfg.get("id") == llm_config_id:
            return cfg

    return None


def get_global_model(model_id: int) -> dict | None:
    """Return the first ``config.GLOBAL_MODELS`` entry whose ``"id"`` equals
    ``model_id``, or None if there is no such entry."""
    for entry in config.GLOBAL_MODELS:
        if entry.get("id") == model_id:
            return entry
    return None


def get_global_connection(connection_id: int) -> dict | None:
    """Return the first ``config.GLOBAL_CONNECTIONS`` entry whose ``"id"``
    equals ``connection_id``, or None if there is no such entry."""
    for entry in config.GLOBAL_CONNECTIONS:
        if entry.get("id") == connection_id:
            return entry
    return None


def _has_capability(model: dict | Model, capability: str) -> bool:
    """Report whether ``model`` supports ``capability``.

    Thin module-local alias that forwards both arguments unchanged to
    ``app.services.model_capabilities.has_capability``.

    Args:
        model: A global model dict or a DB ``Model`` row.
        capability: Capability name; this module passes ``"chat"`` or
            ``"vision"``.

    Returns:
        The result of ``has_capability(model, capability)``.
    """
    return has_capability(model, capability)


def _chat_litellm_from_resolved(
    *,
    conn: dict | object,
    model_id: str,
    disable_streaming: bool = False,
) -> tuple[str, dict]:
    """Build constructor kwargs for a chat LLM from a connection and model id.

    Args:
        conn: Connection (global connection dict or DB connection object)
            passed straight to ``to_litellm``.
        model_id: Model identifier passed straight to ``to_litellm``.
        disable_streaming: When true, ``disable_streaming=True`` is added to
            the returned kwargs.

    Returns:
        ``(model_string, kwargs)`` where ``kwargs`` starts with a ``"model"``
        entry set to ``model_string`` and is then overlaid with the kwargs
        resolved by ``to_litellm`` (which therefore win on key clashes).
    """
    model_string, resolved_kwargs = to_litellm(conn, model_id)

    litellm_kwargs = {"model": model_string}
    litellm_kwargs.update(resolved_kwargs)
    if disable_streaming:
        litellm_kwargs["disable_streaming"] = True

    return model_string, litellm_kwargs


async def _get_db_model(
    session: AsyncSession,
    model_id: int,
    search_space: SearchSpace,
) -> Model | None:
    """Load an enabled DB model that is usable from ``search_space``.

    Args:
        session: Database session used for the query.
        model_id: Primary key of the ``Model`` row.
        search_space: Search space the model must be usable from.

    Returns:
        The ``Model`` (with its connection eagerly loaded), or None when:
        the model is missing or disabled; it has no connection or the
        connection is disabled; the connection is bound to a different
        search space; or the connection is bound to a user other than the
        search space's ``user_id``.
    """
    stmt = (
        select(Model)
        .options(selectinload(Model.connection))
        .where(Model.id == model_id, Model.enabled.is_(True))
    )
    db_model = (await session.execute(stmt)).scalars().first()
    if not db_model:
        return None

    connection = db_model.connection
    if not connection or not connection.enabled:
        return None

    # A connection with a falsy search_space_id / user_id is not restricted
    # on that axis; only a set, non-matching value rejects the model.
    bound_to_other_space = (
        connection.search_space_id and connection.search_space_id != search_space.id
    )
    if bound_to_other_space:
        return None

    bound_to_other_user = (
        connection.user_id and connection.user_id != search_space.user_id
    )
    if bound_to_other_user:
        return None

    return db_model


async def validate_llm_config(
    provider: str,
    model_name: str,
    api_key: str,
    api_base: str | None = None,
    custom_provider: str | None = None,
    litellm_params: dict | None = None,
) -> tuple[bool, str]:
    """
    Check an LLM configuration by sending it a one-message test prompt.

    Providers that need interactive OAuth/device-flow auth are refused
    without any network call. Otherwise a chat model is built with a 30s
    request timeout and invoked with "Hello" in a worker thread, and the
    wait for that thread is capped at ``_VALIDATION_TIMEOUT_SECONDS``.
    This function does not raise for validation failures: any exception is
    logged and reported through the returned tuple.

    Args:
        provider: LLM provider (e.g., 'OPENAI', 'ANTHROPIC')
        model_name: Model identifier
        api_key: API key for the provider
        api_base: Optional custom API base URL
        custom_provider: Optional custom provider string
        litellm_params: Optional additional litellm parameters

    Returns:
        Tuple of (is_valid, error_message)
        - is_valid: True if the model returned a non-empty reply
        - error_message: Empty string if valid, otherwise a description of
          the rejection, timeout, empty reply, or raised exception
    """
    # LiteLLM's github_copilot provider (and similar) runs a blocking sync
    # authenticator that polls for minutes and prints a device code to
    # stdout; on the FastAPI event loop that would freeze the whole backend,
    # so such providers are turned away before anything is constructed.
    if _is_interactive_auth_provider(provider, custom_provider):
        logger.warning(
            "Rejected LLM config validation for interactive-auth provider "
            "(provider=%r, custom_provider=%r)",
            provider,
            custom_provider,
        )
        return (
            False,
            "Provider requires interactive OAuth/device-flow authentication "
            "(e.g. github_copilot) and cannot be used in a hosted backend. "
            "Please choose a provider that authenticates via API key.",
        )

    try:
        model_string, resolved_kwargs = _legacy_config_connection(
            provider=provider,
            model_name=model_name,
            api_key=api_key,
            api_base=api_base,
            custom_provider=custom_provider,
            litellm_params=litellm_params,
        )
        # "timeout" comes last so it overrides any timeout in resolved_kwargs.
        constructor_kwargs = {
            "model": model_string,
            **resolved_kwargs,
            "timeout": 30,
        }

        from app.agents.chat.runtime.llm_config import (
            SanitizedChatLiteLLM,
        )

        probe_llm = SanitizedChatLiteLLM(**constructor_kwargs)
        probe_messages = [HumanMessage(content="Hello")]

        # Some LiteLLM providers have synchronous blocking code paths (e.g.
        # OAuth authenticators calling time.sleep / requests.post). The sync
        # invoke therefore runs in a worker thread, and the await is bounded
        # so the event loop stays responsive even if a provider misbehaves.
        try:
            reply = await asyncio.wait_for(
                asyncio.to_thread(probe_llm.invoke, probe_messages),
                timeout=_VALIDATION_TIMEOUT_SECONDS,
            )
        except TimeoutError:
            logger.warning(
                "LLM config validation timed out after %ss for model: %s",
                _VALIDATION_TIMEOUT_SECONDS,
                model_string,
            )
            return (
                False,
                f"Validation timed out after {int(_VALIDATION_TIMEOUT_SECONDS)}s. "
                "The provider is unreachable or requires interactive "
                "authentication that is not supported by the backend.",
            )

        # The call completed; it only counts as valid with non-empty content.
        if not (reply and reply.content):
            logger.warning(
                f"LLM config validation returned empty response for model: {model_string}"
            )
            return False, "LLM returned an empty response"

        logger.info(f"Successfully validated LLM config for model: {model_string}")
        return True, ""

    except Exception as exc:
        failure = f"Failed to validate LLM configuration: {exc!s}"
        logger.error(failure)
        return False, failure


def _instantiate_chat_llm(
    *,
    conn: dict | object,
    model_id: str,
    disable_streaming: bool,
):
    """Resolve ``conn``/``model_id`` to kwargs and build a SanitizedChatLiteLLM."""
    _, litellm_kwargs = _chat_litellm_from_resolved(
        conn=conn,
        model_id=model_id,
        disable_streaming=disable_streaming,
    )

    # Imported at the point of use, as elsewhere in this module.
    from app.agents.chat.runtime.llm_config import (
        SanitizedChatLiteLLM,
    )

    return SanitizedChatLiteLLM(**litellm_kwargs)


async def get_search_space_llm_instance(
    session: AsyncSession,
    search_space_id: int,
    role: str,
    disable_streaming: bool = False,
) -> ChatLiteLLM | ChatLiteLLMRouter | None:
    """
    Build the chat LLM configured for a search space and role.

    The model binding is read from the search space itself
    (``chat_model_id``). Resolution then proceeds as follows:

    - Auto mode: one concrete candidate is picked from the chat-capable
      models returned by ``auto_model_candidates`` and its ID is used below.
    - Negative ID: a global model plus its global connection from config.
    - Otherwise: an enabled DB model visible to this search space.

    Every failure (unknown search space, unsupported role, no binding, no
    candidates, missing/incapable model, missing connection, or any raised
    exception) is logged and reported as ``None``.

    Args:
        session: Database session
        search_space_id: Search Space ID
        role: LLM role; only ``LLMRole.AGENT`` ('agent') is accepted
        disable_streaming: When true, the instance is built with
            ``disable_streaming=True``

    Returns:
        The constructed LLM instance, or None if it could not be resolved
    """
    try:
        lookup = await session.execute(
            select(SearchSpace).where(SearchSpace.id == search_space_id)
        )
        search_space = lookup.scalars().first()
        if not search_space:
            logger.error(f"Search space {search_space_id} not found")
            return None

        # Only the agent role maps to a model binding.
        if role != LLMRole.AGENT:
            logger.error(f"Invalid LLM role: {role}")
            return None

        llm_config_id = search_space.chat_model_id
        if llm_config_id is None:
            logger.error(f"No {role} LLM configured for search space {search_space_id}")
            return None

        # Auto mode resolves to one concrete global or BYOK model from the
        # unified model-connections catalog.
        if is_auto_mode(llm_config_id):
            candidates = await auto_model_candidates(
                session,
                search_space_id=search_space_id,
                user_id=search_space.user_id,
                capability="chat",
            )
            if not candidates:
                logger.error("No chat-capable models available for Auto mode")
                return None
            chosen = choose_auto_model_candidate(candidates, search_space_id)
            llm_config_id = int(chosen["id"])

        # Negative IDs denote global virtual models defined in config.
        if llm_config_id < 0:
            global_model = get_global_model(llm_config_id)
            if not global_model or not _has_capability(global_model, "chat"):
                logger.error(f"Global chat model {llm_config_id} not found")
                return None

            connection_id = global_model["connection_id"]
            global_connection = get_global_connection(connection_id)
            if not global_connection:
                logger.error(
                    "Global connection %s not found for model %s",
                    connection_id,
                    llm_config_id,
                )
                return None

            return _instantiate_chat_llm(
                conn=global_connection,
                model_id=global_model["model_id"],
                disable_streaming=disable_streaming,
            )

        # Remaining IDs refer to rows in the Model table.
        db_model = await _get_db_model(session, llm_config_id, search_space)
        if not db_model or not _has_capability(db_model, "chat"):
            logger.error(
                f"Chat model {llm_config_id} not found in search space {search_space_id}"
            )
            return None

        return _instantiate_chat_llm(
            conn=db_model.connection,
            model_id=db_model.model_id,
            disable_streaming=disable_streaming,
        )

    except Exception as exc:
        logger.error(
            f"Error getting LLM instance for search space {search_space_id}, role {role}: {exc!s}"
        )
        return None


async def get_agent_llm(
    session: AsyncSession, search_space_id: int, disable_streaming: bool = False
) -> ChatLiteLLM | ChatLiteLLMRouter | None:
    """Return the chat model instance bound to the search space's agent role.

    Delegates to ``get_search_space_llm_instance`` with ``LLMRole.AGENT``;
    ``None`` is returned whenever that lookup fails.
    """
    agent_llm = await get_search_space_llm_instance(
        session=session,
        search_space_id=search_space_id,
        role=LLMRole.AGENT,
        disable_streaming=disable_streaming,
    )
    return agent_llm


async def get_vision_llm(
    session: AsyncSession, search_space_id: int
) -> ChatLiteLLM | ChatLiteLLMRouter | None:
    """Get the search space's vision LLM instance for screenshot analysis.

    Resolution order:

    1. The search space's selected chat model, when it is a concrete
       (non-Auto) model that has the ``"vision"`` capability.
    2. Otherwise the search space's ``vision_model_id`` binding:
       - Auto mode (ID 0): unified global/BYOK model candidate selection
       - Global (negative ID): virtual GLOBAL models from YAML
       - DB (positive ID): Model + Connection tables

    Premium global models are wrapped in :class:`QuotaCheckedVisionLLM`
    so each ``ainvoke`` debits the search-space owner's premium credit
    pool. This applies on BOTH paths above: a premium global chat model
    reused for vision is wrapped too, so it cannot bypass the quota.
    User-owned BYOK configs and free global configs are returned
    unwrapped — they don't consume premium credit (issue M).

    Args:
        session: Database session.
        search_space_id: ID of the search space whose bindings are used.

    Returns:
        The vision LLM instance, or ``None`` when the search space, binding,
        model or connection cannot be resolved, or when any exception is
        raised during resolution (failures are logged).
    """
    from app.services.quota_checked_vision_llm import QuotaCheckedVisionLLM

    def _build_global_vision_llm(
        global_model: dict, global_connection: dict, owner_user_id
    ):
        """Build a global model's LLM, quota-wrapping it when it is premium."""
        model_string, litellm_kwargs = _chat_litellm_from_resolved(
            conn=global_connection,
            model_id=global_model["model_id"],
        )

        from app.agents.chat.runtime.llm_config import (
            SanitizedChatLiteLLM,
        )

        inner_llm = SanitizedChatLiteLLM(**litellm_kwargs)

        billing_tier = str(global_model.get("billing_tier", "free")).lower()
        if billing_tier != "premium":
            return inner_llm

        # ``catalog`` may be absent or explicitly null in the config; treat
        # both as "no reserve override" instead of failing the whole lookup.
        catalog = global_model.get("catalog") or {}
        return QuotaCheckedVisionLLM(
            inner_llm,
            user_id=owner_user_id,
            search_space_id=search_space_id,
            billing_tier=billing_tier,
            base_model=model_string,
            quota_reserve_tokens=catalog.get("quota_reserve_tokens"),
        )

    def _build_db_vision_llm(db_model: Model):
        """Build the LLM for a user-owned (BYOK) DB model; never quota-wrapped."""
        _, litellm_kwargs = _chat_litellm_from_resolved(
            conn=db_model.connection,
            model_id=db_model.model_id,
        )

        from app.agents.chat.runtime.llm_config import (
            SanitizedChatLiteLLM,
        )

        return SanitizedChatLiteLLM(**litellm_kwargs)

    try:
        result = await session.execute(
            select(SearchSpace).where(SearchSpace.id == search_space_id)
        )
        search_space = result.scalars().first()
        if not search_space:
            logger.error(f"Search space {search_space_id} not found")
            return None

        owner_user_id = search_space.user_id

        # Prefer the selected chat model when it is vision-capable. If it is
        # not usable for vision, fall through to the dedicated vision binding.
        chat_model_id = search_space.chat_model_id
        if chat_model_id and chat_model_id != AUTO_MODE_ID:
            if chat_model_id < 0:
                chat_model = get_global_model(chat_model_id)
                if chat_model and _has_capability(chat_model, "vision"):
                    global_connection = get_global_connection(
                        chat_model["connection_id"]
                    )
                    if global_connection:
                        # Same premium handling as the vision binding below.
                        return _build_global_vision_llm(
                            chat_model, global_connection, owner_user_id
                        )
            else:
                chat_model = await _get_db_model(session, chat_model_id, search_space)
                if chat_model and _has_capability(chat_model, "vision"):
                    return _build_db_vision_llm(chat_model)

        config_id = search_space.vision_model_id
        if config_id is None:
            logger.error(f"No vision LLM configured for search space {search_space_id}")
            return None

        if config_id == AUTO_MODE_ID:
            candidates = await auto_model_candidates(
                session,
                search_space_id=search_space_id,
                user_id=owner_user_id,
                capability="vision",
            )
            if not candidates:
                logger.error("No vision-capable models available for Auto mode")
                return None
            config_id = int(
                choose_auto_model_candidate(candidates, search_space_id)["id"]
            )

        if config_id < 0:
            global_model = get_global_model(config_id)
            if not global_model or not _has_capability(global_model, "vision"):
                logger.error(f"Global vision model {config_id} not found")
                return None

            global_connection = get_global_connection(global_model["connection_id"])
            if not global_connection:
                logger.error(
                    "Global connection %s not found for model %s",
                    global_model["connection_id"],
                    config_id,
                )
                return None

            return _build_global_vision_llm(
                global_model, global_connection, owner_user_id
            )

        model = await _get_db_model(session, config_id, search_space)
        if not model or not _has_capability(model, "vision"):
            logger.error(
                f"Vision model {config_id} not found in search space {search_space_id}"
            )
            return None

        return _build_db_vision_llm(model)

    except Exception as e:
        logger.error(
            f"Error getting vision LLM for search space {search_space_id}: {e!s}"
        )
        return None


def get_planner_llm() -> ChatLiteLLM | None:
    """Build the planner LLM from global config, if one is declared.

    The first entry of ``config.GLOBAL_LLM_CONFIGS`` whose ``is_planner``
    value is exactly ``True`` is passed to
    ``create_chat_litellm_from_config``; when no entry qualifies the result
    is ``None``.

    The planner role is meant for short, structured internal tasks (KB
    search planning: query rewriting, date extraction, recency
    classification), which small/fast models serve well — running them on
    the user's chat LLM is unnecessarily expensive and slow.

    This helper is synchronous and touches only in-memory config (no DB
    session), so it can be called from middleware/factory code.

    Callers MUST fall back to their chat LLM when this returns ``None`` so
    deployments without a planner config keep working unchanged.
    """
    from app.agents.chat.runtime.llm_config import (
        create_chat_litellm_from_config,
    )

    for candidate in config.GLOBAL_LLM_CONFIGS:
        # Identity check: truthy non-bool values do not mark a planner.
        if candidate.get("is_planner") is True:
            return create_chat_litellm_from_config(candidate)
    return None