feat(agents): track subagent invocation telemetry

This commit is contained in:
Anish Sarkar 2026-05-22 13:48:57 +05:30
parent 5a6b92c2b6
commit 8bca29fe0d

View file

@ -20,6 +20,7 @@ from langchain_core.tools import StructuredTool
from langgraph.errors import GraphInterrupt from langgraph.errors import GraphInterrupt
from langgraph.types import Command, Interrupt from langgraph.types import Command, Interrupt
from app.observability import metrics as ot_metrics, otel as ot
from app.utils.perf import get_perf_logger from app.utils.perf import get_perf_logger
from .config import ( from .config import (
@ -173,6 +174,9 @@ def build_task_tool_with_parent_config(
exc_info=True, exc_info=True,
) )
invoke_path = "resume" if pending_value is not None else "fresh"
invoke_start = time.perf_counter()
invoke_outcome = "ok"
if pending_value is not None: if pending_value is not None:
resume_value = consume_surfsense_resume(runtime) resume_value = consume_surfsense_resume(runtime)
if resume_value is None: if resume_value is None:
@ -188,18 +192,94 @@ def build_task_tool_with_parent_config(
# Prevent the parent's resume payload from leaking into subagent # Prevent the parent's resume payload from leaking into subagent
# interrupts via langgraph's parent_scratchpad fallback. # interrupts via langgraph's parent_scratchpad fallback.
drain_parent_null_resume(runtime) drain_parent_null_resume(runtime)
try: with ot.subagent_invoke_span(
result = subagent.invoke( subagent_type=subagent_type, path=invoke_path
build_resume_command(resume_value, pending_id), ) as sp:
config=sub_config, try:
) result = subagent.invoke(
except GraphInterrupt as gi: build_resume_command(resume_value, pending_id),
_reraise_stamped_subagent_interrupt(gi, runtime.tool_call_id) config=sub_config,
)
sp.set_attribute("subagent.outcome", invoke_outcome)
except GraphInterrupt as gi:
invoke_outcome = "interrupted"
sp.set_attribute("subagent.outcome", invoke_outcome)
ot_metrics.record_subagent_invoke_duration(
(time.perf_counter() - invoke_start) * 1000,
subagent_type=subagent_type,
path=invoke_path,
outcome=invoke_outcome,
)
ot_metrics.record_subagent_invoke_outcome(
subagent_type=subagent_type,
path=invoke_path,
outcome=invoke_outcome,
)
_reraise_stamped_subagent_interrupt(gi, runtime.tool_call_id)
except Exception:
invoke_outcome = "error"
sp.set_attribute("subagent.outcome", invoke_outcome)
ot_metrics.record_subagent_invoke_duration(
(time.perf_counter() - invoke_start) * 1000,
subagent_type=subagent_type,
path=invoke_path,
outcome=invoke_outcome,
)
ot_metrics.record_subagent_invoke_outcome(
subagent_type=subagent_type,
path=invoke_path,
outcome=invoke_outcome,
)
raise
else: else:
try: with ot.subagent_invoke_span(
result = subagent.invoke(subagent_state, config=sub_config) subagent_type=subagent_type, path=invoke_path
except GraphInterrupt as gi: ) as sp:
_reraise_stamped_subagent_interrupt(gi, runtime.tool_call_id) try:
result = subagent.invoke(subagent_state, config=sub_config)
sp.set_attribute("subagent.outcome", invoke_outcome)
except GraphInterrupt as gi:
invoke_outcome = "interrupted"
sp.set_attribute("subagent.outcome", invoke_outcome)
ot_metrics.record_subagent_invoke_duration(
(time.perf_counter() - invoke_start) * 1000,
subagent_type=subagent_type,
path=invoke_path,
outcome=invoke_outcome,
)
ot_metrics.record_subagent_invoke_outcome(
subagent_type=subagent_type,
path=invoke_path,
outcome=invoke_outcome,
)
_reraise_stamped_subagent_interrupt(gi, runtime.tool_call_id)
except Exception:
invoke_outcome = "error"
sp.set_attribute("subagent.outcome", invoke_outcome)
ot_metrics.record_subagent_invoke_duration(
(time.perf_counter() - invoke_start) * 1000,
subagent_type=subagent_type,
path=invoke_path,
outcome=invoke_outcome,
)
ot_metrics.record_subagent_invoke_outcome(
subagent_type=subagent_type,
path=invoke_path,
outcome=invoke_outcome,
)
raise
invoke_elapsed_ms = (time.perf_counter() - invoke_start) * 1000
ot_metrics.record_subagent_invoke_duration(
invoke_elapsed_ms,
subagent_type=subagent_type,
path=invoke_path,
outcome=invoke_outcome,
)
ot_metrics.record_subagent_invoke_outcome(
subagent_type=subagent_type,
path=invoke_path,
outcome=invoke_outcome,
)
return _return_command_with_state_update(result, runtime.tool_call_id) return _return_command_with_state_update(result, runtime.tool_call_id)
async def atask( async def atask(
@ -274,40 +354,104 @@ def build_task_tool_with_parent_config(
# Prevent the parent's resume payload from leaking into subagent # Prevent the parent's resume payload from leaking into subagent
# interrupts via langgraph's parent_scratchpad fallback. # interrupts via langgraph's parent_scratchpad fallback.
drain_parent_null_resume(runtime) drain_parent_null_resume(runtime)
try: with ot.subagent_invoke_span(
result = await subagent.ainvoke( subagent_type=subagent_type, path=invoke_path
build_resume_command(resume_value, pending_id), ) as sp:
config=sub_config, try:
) result = await subagent.ainvoke(
except GraphInterrupt as gi: build_resume_command(resume_value, pending_id),
ainvoke_outcome = "interrupted" config=sub_config,
_perf_log.info( )
"[hitl_route] atask EXIT subagent_type=%r path=%s outcome=%s " sp.set_attribute("subagent.outcome", ainvoke_outcome)
"aget_state=%.3fs ainvoke=%.3fs total=%.3fs", except GraphInterrupt as gi:
subagent_type, ainvoke_outcome = "interrupted"
invoke_path, sp.set_attribute("subagent.outcome", ainvoke_outcome)
ainvoke_outcome, ot_metrics.record_subagent_invoke_duration(
aget_state_elapsed, (time.perf_counter() - ainvoke_start) * 1000,
time.perf_counter() - ainvoke_start, subagent_type=subagent_type,
time.perf_counter() - atask_start, path=invoke_path,
) outcome=ainvoke_outcome,
_reraise_stamped_subagent_interrupt(gi, runtime.tool_call_id) )
ot_metrics.record_subagent_invoke_outcome(
subagent_type=subagent_type,
path=invoke_path,
outcome=ainvoke_outcome,
)
_perf_log.info(
"[hitl_route] atask EXIT subagent_type=%r path=%s outcome=%s "
"aget_state=%.3fs ainvoke=%.3fs total=%.3fs",
subagent_type,
invoke_path,
ainvoke_outcome,
aget_state_elapsed,
time.perf_counter() - ainvoke_start,
time.perf_counter() - atask_start,
)
_reraise_stamped_subagent_interrupt(gi, runtime.tool_call_id)
except Exception:
ainvoke_outcome = "error"
sp.set_attribute("subagent.outcome", ainvoke_outcome)
ot_metrics.record_subagent_invoke_duration(
(time.perf_counter() - ainvoke_start) * 1000,
subagent_type=subagent_type,
path=invoke_path,
outcome=ainvoke_outcome,
)
ot_metrics.record_subagent_invoke_outcome(
subagent_type=subagent_type,
path=invoke_path,
outcome=ainvoke_outcome,
)
raise
else: else:
try: with ot.subagent_invoke_span(
result = await subagent.ainvoke(subagent_state, config=sub_config) subagent_type=subagent_type, path=invoke_path
except GraphInterrupt as gi: ) as sp:
ainvoke_outcome = "interrupted" try:
_perf_log.info( result = await subagent.ainvoke(
"[hitl_route] atask EXIT subagent_type=%r path=%s outcome=%s " subagent_state, config=sub_config
"aget_state=%.3fs ainvoke=%.3fs total=%.3fs", )
subagent_type, sp.set_attribute("subagent.outcome", ainvoke_outcome)
invoke_path, except GraphInterrupt as gi:
ainvoke_outcome, ainvoke_outcome = "interrupted"
aget_state_elapsed, sp.set_attribute("subagent.outcome", ainvoke_outcome)
time.perf_counter() - ainvoke_start, ot_metrics.record_subagent_invoke_duration(
time.perf_counter() - atask_start, (time.perf_counter() - ainvoke_start) * 1000,
) subagent_type=subagent_type,
_reraise_stamped_subagent_interrupt(gi, runtime.tool_call_id) path=invoke_path,
outcome=ainvoke_outcome,
)
ot_metrics.record_subagent_invoke_outcome(
subagent_type=subagent_type,
path=invoke_path,
outcome=ainvoke_outcome,
)
_perf_log.info(
"[hitl_route] atask EXIT subagent_type=%r path=%s outcome=%s "
"aget_state=%.3fs ainvoke=%.3fs total=%.3fs",
subagent_type,
invoke_path,
ainvoke_outcome,
aget_state_elapsed,
time.perf_counter() - ainvoke_start,
time.perf_counter() - atask_start,
)
_reraise_stamped_subagent_interrupt(gi, runtime.tool_call_id)
except Exception:
ainvoke_outcome = "error"
sp.set_attribute("subagent.outcome", ainvoke_outcome)
ot_metrics.record_subagent_invoke_duration(
(time.perf_counter() - ainvoke_start) * 1000,
subagent_type=subagent_type,
path=invoke_path,
outcome=ainvoke_outcome,
)
ot_metrics.record_subagent_invoke_outcome(
subagent_type=subagent_type,
path=invoke_path,
outcome=ainvoke_outcome,
)
raise
ainvoke_elapsed = time.perf_counter() - ainvoke_start ainvoke_elapsed = time.perf_counter() - ainvoke_start
except GraphInterrupt: except GraphInterrupt:
raise raise
@ -326,6 +470,17 @@ def build_task_tool_with_parent_config(
merge_elapsed, merge_elapsed,
time.perf_counter() - atask_start, time.perf_counter() - atask_start,
) )
ot_metrics.record_subagent_invoke_duration(
ainvoke_elapsed * 1000,
subagent_type=subagent_type,
path=invoke_path,
outcome=ainvoke_outcome,
)
ot_metrics.record_subagent_invoke_outcome(
subagent_type=subagent_type,
path=invoke_path,
outcome=ainvoke_outcome,
)
return cmd return cmd
return StructuredTool.from_function( return StructuredTool.from_function(