mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-25 19:15:18 +02:00
obs(tokens): log prompt-cache read/write counts and hit ratio per LLM call
This commit is contained in:
parent
0cdda14922
commit
6090980c5e
1 changed file with 31 additions and 1 deletion
|
|
@ -325,6 +325,22 @@ class TokenTrackingCallback(CustomLogger):
|
||||||
total_tokens = getattr(usage, "total_tokens", 0) or 0
|
total_tokens = getattr(usage, "total_tokens", 0) or 0
|
||||||
call_kind = "chat"
|
call_kind = "chat"
|
||||||
|
|
||||||
|
# Prompt-cache accounting. Field shapes differ by provider:
|
||||||
|
# - OpenAI / Azure: ``usage.prompt_tokens_details.cached_tokens``
|
||||||
|
# - Anthropic: ``usage.cache_read_input_tokens`` + ``usage.cache_creation_input_tokens``
|
||||||
|
# LiteLLM normalizes both; we read the OpenAI-style field first and fall back to the Anthropic-style field when it is missing or zero.
|
||||||
|
cached_tokens = 0
|
||||||
|
cache_creation_tokens = 0
|
||||||
|
if not is_image:
|
||||||
|
prompt_details = getattr(usage, "prompt_tokens_details", None)
|
||||||
|
if prompt_details is not None:
|
||||||
|
cached_tokens = getattr(prompt_details, "cached_tokens", 0) or 0
|
||||||
|
if cached_tokens == 0:
|
||||||
|
cached_tokens = getattr(usage, "cache_read_input_tokens", 0) or 0
|
||||||
|
cache_creation_tokens = (
|
||||||
|
getattr(usage, "cache_creation_input_tokens", 0) or 0
|
||||||
|
)
|
||||||
|
|
||||||
model = kwargs.get("model", "unknown")
|
model = kwargs.get("model", "unknown")
|
||||||
|
|
||||||
cost_usd = _extract_cost_usd(
|
cost_usd = _extract_cost_usd(
|
||||||
|
|
@ -367,9 +383,13 @@ class TokenTrackingCallback(CustomLogger):
|
||||||
except Exception:
|
except Exception:
|
||||||
call_latency_s = None
|
call_latency_s = None
|
||||||
|
|
||||||
|
cache_hit_ratio: float | None = None
|
||||||
|
if prompt_tokens > 0 and (cached_tokens > 0 or cache_creation_tokens > 0):
|
||||||
|
cache_hit_ratio = cached_tokens / prompt_tokens
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
"[TokenTracking] Captured: model=%s kind=%s prompt=%d completion=%d total=%d "
|
"[TokenTracking] Captured: model=%s kind=%s prompt=%d completion=%d total=%d "
|
||||||
"cost=$%.6f (%d micros) (accumulator now has %d calls)%s",
|
"cost=$%.6f (%d micros) (accumulator now has %d calls)%s%s",
|
||||||
model,
|
model,
|
||||||
call_kind,
|
call_kind,
|
||||||
prompt_tokens,
|
prompt_tokens,
|
||||||
|
|
@ -379,6 +399,16 @@ class TokenTrackingCallback(CustomLogger):
|
||||||
cost_micros,
|
cost_micros,
|
||||||
len(acc.calls),
|
len(acc.calls),
|
||||||
f" latency={call_latency_s:.3f}s" if call_latency_s is not None else "",
|
f" latency={call_latency_s:.3f}s" if call_latency_s is not None else "",
|
||||||
|
(
|
||||||
|
f" cache_read={cached_tokens} cache_write={cache_creation_tokens}"
|
||||||
|
f" hit_ratio={cache_hit_ratio:.1%}"
|
||||||
|
if cache_hit_ratio is not None
|
||||||
|
else (
|
||||||
|
f" cache_read={cached_tokens} cache_write={cache_creation_tokens}"
|
||||||
|
if (cached_tokens or cache_creation_tokens)
|
||||||
|
else ""
|
||||||
|
)
|
||||||
|
),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue