feat(story-3.5): add cloud-mode LLM model selection with token quota enforcement

Implement system-managed model catalog, subscription tier enforcement,
atomic token quota tracking, and frontend cloud/self-hosted conditional
rendering. Apply all 20 BMAD code review patches including security
fixes (cross-user API key hijack), race condition mitigation (atomic SQL
UPDATE), and SSE mid-stream quota error handling.

Co-Authored-By: Claude Sonnet 4 <noreply@anthropic.com>
This commit is contained in:
Vonic 2026-04-14 17:01:21 +07:00
parent e7382b26de
commit c1776b3ec8
19 changed files with 1003 additions and 34 deletions

View file

@@ -41,6 +41,7 @@ from app.agents.new_chat.memory_extraction import (
extract_and_save_memory,
extract_and_save_team_memory,
)
from app.config import config as app_config
from app.db import (
ChatVisibility,
NewChatMessage,
@@ -144,6 +145,7 @@ class StreamResult:
interrupt_value: dict[str, Any] | None = None
sandbox_files: list[str] = field(default_factory=list) # unused, kept for compat
agent_called_update_memory: bool = False
total_tokens_used: int = 0 # Accumulated across all LLM calls in the stream
async def _stream_agent_events(
@@ -1105,6 +1107,27 @@ async def _stream_agent_events(
},
)
elif event_type == "on_chat_model_end":
# Accumulate token counts for quota tracking (cloud mode).
# NOTE(review): the location of usage data varies by provider and
# LangChain version — try `usage_metadata` first, then fall back to
# the common `response_metadata` key variants.
output = event.get("data", {}).get("output")
if output is not None:
usage = None
if hasattr(output, "usage_metadata") and output.usage_metadata is not None:
usage = output.usage_metadata
elif hasattr(output, "response_metadata") and output.response_metadata is not None:
# Providers disagree on the key name; probe the known variants.
rm = output.response_metadata or {}
usage = rm.get("usage") or rm.get("token_usage") or rm.get("usage_metadata")
if isinstance(usage, dict):
# Prefer an explicit total; otherwise sum input/output pairs
# (Anthropic-style input/output, then OpenAI-style
# prompt/completion as a last resort). A falsy (0) result at
# any step falls through to the next alternative.
total = (
usage.get("total_tokens")
or (usage.get("input_tokens", 0) + usage.get("output_tokens", 0))
or (usage.get("prompt_tokens", 0) + usage.get("completion_tokens", 0))
)
result.total_tokens_used += total or 0
elif usage is not None and hasattr(usage, "total_tokens"):
# usage_metadata may be an object rather than a dict; guard the
# attribute read and treat None as zero.
result.total_tokens_used += getattr(usage, "total_tokens", 0) or 0
elif event_type in ("on_chain_end", "on_agent_end"):
if current_text_id is not None:
yield streaming_service.format_text_end(current_text_id)
@@ -1569,6 +1592,22 @@ async def stream_new_chat(
)
)
# Cloud mode: deduct consumed tokens from the user's monthly quota.
# Runs after the stream completed, so this is post-hoc accounting.
if app_config.is_cloud() and user_id and stream_result.total_tokens_used > 0:
try:
# NOTE(review): presumably shielded_async_session protects this
# write from cancellation when the client disconnects mid-stream —
# confirm against its definition.
async with shielded_async_session() as quota_session:
# Local import — presumably to avoid a circular dependency at
# module load; TODO confirm and document at the import site.
from app.services.token_quota_service import TokenQuotaService
quota_service = TokenQuotaService(quota_session)
# allow_exceed=True: the tokens were already consumed and
# streamed to the user, so the deduction must be recorded even
# if it pushes usage past the monthly limit.
await quota_service.update_token_usage(
user_id, stream_result.total_tokens_used, allow_exceed=True
)
except Exception as quota_err:
# Non-fatal — log and continue; usage was already streamed
logging.getLogger(__name__).warning(
"[stream_new_chat] Failed to record token usage: %s", quota_err
)
# Finish the step and message
yield streaming_service.format_finish_step()
yield streaming_service.format_finish()
@@ -1778,6 +1817,22 @@ async def stream_resume_chat(
yield streaming_service.format_finish()
yield streaming_service.format_done()
# Cloud mode: deduct consumed tokens from the user's monthly quota.
# Post-hoc accounting, mirroring the identical logic in stream_new_chat —
# NOTE(review): consider extracting a shared helper to avoid the duplication.
if app_config.is_cloud() and user_id and stream_result.total_tokens_used > 0:
try:
# NOTE(review): presumably shielded_async_session protects this
# write from cancellation on client disconnect — confirm.
async with shielded_async_session() as quota_session:
# Local import — presumably avoids a circular dependency at
# module load; TODO confirm.
from app.services.token_quota_service import TokenQuotaService
quota_service = TokenQuotaService(quota_session)
# allow_exceed=True: tokens were already consumed/streamed, so
# record the deduction even past the monthly limit.
await quota_service.update_token_usage(
user_id, stream_result.total_tokens_used, allow_exceed=True
)
except Exception as quota_err:
# Non-fatal — log and continue; usage was already streamed
logging.getLogger(__name__).warning(
"[stream_resume_chat] Failed to record token usage: %s", quota_err
)
except Exception as e:
import traceback