feat: implement token usage tracking for LLM calls with new accumulator and callback

This commit is contained in:
Anish Sarkar 2026-04-14 13:40:32 +05:30
parent 917f35eb33
commit 3cfe53fb7f
6 changed files with 223 additions and 4 deletions

View file

@@ -1170,6 +1170,10 @@ async def stream_new_chat(
_t_total = time.perf_counter()
log_system_snapshot("stream_new_chat_START")
from app.services.token_tracking_service import start_turn
accumulator = start_turn()
session = async_session_maker()
try:
# Mark AI as responding to this user for live collaboration
@@ -1527,6 +1531,17 @@ async def stream_new_chat(
if stream_result.is_interrupted:
if title_task is not None and not title_task.done():
title_task.cancel()
usage_summary = accumulator.per_message_summary()
if usage_summary:
yield streaming_service.format_data("token-usage", {
"usage": usage_summary,
"prompt_tokens": accumulator.total_prompt_tokens,
"completion_tokens": accumulator.total_completion_tokens,
"total_tokens": accumulator.grand_total,
"call_details": accumulator.serialized_calls(),
})
yield streaming_service.format_finish_step()
yield streaming_service.format_finish()
yield streaming_service.format_done()
@@ -1548,6 +1563,16 @@ async def stream_new_chat(
chat_id, generated_title
)
usage_summary = accumulator.per_message_summary()
if usage_summary:
yield streaming_service.format_data("token-usage", {
"usage": usage_summary,
"prompt_tokens": accumulator.total_prompt_tokens,
"completion_tokens": accumulator.total_completion_tokens,
"total_tokens": accumulator.grand_total,
"call_details": accumulator.serialized_calls(),
})
# Fire background memory extraction if the agent didn't handle it.
# Shared threads write to team memory; private threads write to user memory.
if not stream_result.agent_called_update_memory:
@@ -1646,6 +1671,10 @@ async def stream_resume_chat(
stream_result = StreamResult()
_t_total = time.perf_counter()
from app.services.token_tracking_service import start_turn
accumulator = start_turn()
session = async_session_maker()
try:
if user_id:
@@ -1769,11 +1798,31 @@ async def stream_resume_chat(
chat_id,
)
if stream_result.is_interrupted:
usage_summary = accumulator.per_message_summary()
if usage_summary:
yield streaming_service.format_data("token-usage", {
"usage": usage_summary,
"prompt_tokens": accumulator.total_prompt_tokens,
"completion_tokens": accumulator.total_completion_tokens,
"total_tokens": accumulator.grand_total,
"call_details": accumulator.serialized_calls(),
})
yield streaming_service.format_finish_step()
yield streaming_service.format_finish()
yield streaming_service.format_done()
return
usage_summary = accumulator.per_message_summary()
if usage_summary:
yield streaming_service.format_data("token-usage", {
"usage": usage_summary,
"prompt_tokens": accumulator.total_prompt_tokens,
"completion_tokens": accumulator.total_completion_tokens,
"total_tokens": accumulator.grand_total,
"call_details": accumulator.serialized_calls(),
})
yield streaming_service.format_finish_step()
yield streaming_service.format_finish()
yield streaming_service.format_done()