feat: implement token usage tracking for LLM calls with new accumulator and callback

This commit is contained in:
Anish Sarkar 2026-04-14 13:40:32 +05:30
parent 917f35eb33
commit 3cfe53fb7f
6 changed files with 223 additions and 4 deletions

View file

@@ -1170,6 +1170,10 @@ async def stream_new_chat(
_t_total = time.perf_counter()
log_system_snapshot("stream_new_chat_START")
from app.services.token_tracking_service import start_turn
accumulator = start_turn()
session = async_session_maker()
try:
# Mark AI as responding to this user for live collaboration
@@ -1527,6 +1531,17 @@ async def stream_new_chat(
if stream_result.is_interrupted:
if title_task is not None and not title_task.done():
title_task.cancel()
usage_summary = accumulator.per_message_summary()
if usage_summary:
yield streaming_service.format_data("token-usage", {
"usage": usage_summary,
"prompt_tokens": accumulator.total_prompt_tokens,
"completion_tokens": accumulator.total_completion_tokens,
"total_tokens": accumulator.grand_total,
"call_details": accumulator.serialized_calls(),
})
yield streaming_service.format_finish_step()
yield streaming_service.format_finish()
yield streaming_service.format_done()
@@ -1548,6 +1563,16 @@ async def stream_new_chat(
chat_id, generated_title
)
usage_summary = accumulator.per_message_summary()
if usage_summary:
yield streaming_service.format_data("token-usage", {
"usage": usage_summary,
"prompt_tokens": accumulator.total_prompt_tokens,
"completion_tokens": accumulator.total_completion_tokens,
"total_tokens": accumulator.grand_total,
"call_details": accumulator.serialized_calls(),
})
# Fire background memory extraction if the agent didn't handle it.
# Shared threads write to team memory; private threads write to user memory.
if not stream_result.agent_called_update_memory:
@@ -1646,6 +1671,10 @@ async def stream_resume_chat(
stream_result = StreamResult()
_t_total = time.perf_counter()
from app.services.token_tracking_service import start_turn
accumulator = start_turn()
session = async_session_maker()
try:
if user_id:
@@ -1769,11 +1798,31 @@ async def stream_resume_chat(
chat_id,
)
if stream_result.is_interrupted:
usage_summary = accumulator.per_message_summary()
if usage_summary:
yield streaming_service.format_data("token-usage", {
"usage": usage_summary,
"prompt_tokens": accumulator.total_prompt_tokens,
"completion_tokens": accumulator.total_completion_tokens,
"total_tokens": accumulator.grand_total,
"call_details": accumulator.serialized_calls(),
})
yield streaming_service.format_finish_step()
yield streaming_service.format_finish()
yield streaming_service.format_done()
return
usage_summary = accumulator.per_message_summary()
if usage_summary:
yield streaming_service.format_data("token-usage", {
"usage": usage_summary,
"prompt_tokens": accumulator.total_prompt_tokens,
"completion_tokens": accumulator.total_completion_tokens,
"total_tokens": accumulator.grand_total,
"call_details": accumulator.serialized_calls(),
})
yield streaming_service.format_finish_step()
yield streaming_service.format_finish()
yield streaming_service.format_done()