mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-25 19:15:18 +02:00
feat(agents): track subagent invocation telemetry
This commit is contained in:
parent
5a6b92c2b6
commit
8bca29fe0d
1 changed files with 199 additions and 44 deletions
|
|
@ -20,6 +20,7 @@ from langchain_core.tools import StructuredTool
|
||||||
from langgraph.errors import GraphInterrupt
|
from langgraph.errors import GraphInterrupt
|
||||||
from langgraph.types import Command, Interrupt
|
from langgraph.types import Command, Interrupt
|
||||||
|
|
||||||
|
from app.observability import metrics as ot_metrics, otel as ot
|
||||||
from app.utils.perf import get_perf_logger
|
from app.utils.perf import get_perf_logger
|
||||||
|
|
||||||
from .config import (
|
from .config import (
|
||||||
|
|
@ -173,6 +174,9 @@ def build_task_tool_with_parent_config(
|
||||||
exc_info=True,
|
exc_info=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
invoke_path = "resume" if pending_value is not None else "fresh"
|
||||||
|
invoke_start = time.perf_counter()
|
||||||
|
invoke_outcome = "ok"
|
||||||
if pending_value is not None:
|
if pending_value is not None:
|
||||||
resume_value = consume_surfsense_resume(runtime)
|
resume_value = consume_surfsense_resume(runtime)
|
||||||
if resume_value is None:
|
if resume_value is None:
|
||||||
|
|
@ -188,18 +192,94 @@ def build_task_tool_with_parent_config(
|
||||||
# Prevent the parent's resume payload from leaking into subagent
|
# Prevent the parent's resume payload from leaking into subagent
|
||||||
# interrupts via langgraph's parent_scratchpad fallback.
|
# interrupts via langgraph's parent_scratchpad fallback.
|
||||||
drain_parent_null_resume(runtime)
|
drain_parent_null_resume(runtime)
|
||||||
|
with ot.subagent_invoke_span(
|
||||||
|
subagent_type=subagent_type, path=invoke_path
|
||||||
|
) as sp:
|
||||||
try:
|
try:
|
||||||
result = subagent.invoke(
|
result = subagent.invoke(
|
||||||
build_resume_command(resume_value, pending_id),
|
build_resume_command(resume_value, pending_id),
|
||||||
config=sub_config,
|
config=sub_config,
|
||||||
)
|
)
|
||||||
|
sp.set_attribute("subagent.outcome", invoke_outcome)
|
||||||
except GraphInterrupt as gi:
|
except GraphInterrupt as gi:
|
||||||
|
invoke_outcome = "interrupted"
|
||||||
|
sp.set_attribute("subagent.outcome", invoke_outcome)
|
||||||
|
ot_metrics.record_subagent_invoke_duration(
|
||||||
|
(time.perf_counter() - invoke_start) * 1000,
|
||||||
|
subagent_type=subagent_type,
|
||||||
|
path=invoke_path,
|
||||||
|
outcome=invoke_outcome,
|
||||||
|
)
|
||||||
|
ot_metrics.record_subagent_invoke_outcome(
|
||||||
|
subagent_type=subagent_type,
|
||||||
|
path=invoke_path,
|
||||||
|
outcome=invoke_outcome,
|
||||||
|
)
|
||||||
_reraise_stamped_subagent_interrupt(gi, runtime.tool_call_id)
|
_reraise_stamped_subagent_interrupt(gi, runtime.tool_call_id)
|
||||||
|
except Exception:
|
||||||
|
invoke_outcome = "error"
|
||||||
|
sp.set_attribute("subagent.outcome", invoke_outcome)
|
||||||
|
ot_metrics.record_subagent_invoke_duration(
|
||||||
|
(time.perf_counter() - invoke_start) * 1000,
|
||||||
|
subagent_type=subagent_type,
|
||||||
|
path=invoke_path,
|
||||||
|
outcome=invoke_outcome,
|
||||||
|
)
|
||||||
|
ot_metrics.record_subagent_invoke_outcome(
|
||||||
|
subagent_type=subagent_type,
|
||||||
|
path=invoke_path,
|
||||||
|
outcome=invoke_outcome,
|
||||||
|
)
|
||||||
|
raise
|
||||||
else:
|
else:
|
||||||
|
with ot.subagent_invoke_span(
|
||||||
|
subagent_type=subagent_type, path=invoke_path
|
||||||
|
) as sp:
|
||||||
try:
|
try:
|
||||||
result = subagent.invoke(subagent_state, config=sub_config)
|
result = subagent.invoke(subagent_state, config=sub_config)
|
||||||
|
sp.set_attribute("subagent.outcome", invoke_outcome)
|
||||||
except GraphInterrupt as gi:
|
except GraphInterrupt as gi:
|
||||||
|
invoke_outcome = "interrupted"
|
||||||
|
sp.set_attribute("subagent.outcome", invoke_outcome)
|
||||||
|
ot_metrics.record_subagent_invoke_duration(
|
||||||
|
(time.perf_counter() - invoke_start) * 1000,
|
||||||
|
subagent_type=subagent_type,
|
||||||
|
path=invoke_path,
|
||||||
|
outcome=invoke_outcome,
|
||||||
|
)
|
||||||
|
ot_metrics.record_subagent_invoke_outcome(
|
||||||
|
subagent_type=subagent_type,
|
||||||
|
path=invoke_path,
|
||||||
|
outcome=invoke_outcome,
|
||||||
|
)
|
||||||
_reraise_stamped_subagent_interrupt(gi, runtime.tool_call_id)
|
_reraise_stamped_subagent_interrupt(gi, runtime.tool_call_id)
|
||||||
|
except Exception:
|
||||||
|
invoke_outcome = "error"
|
||||||
|
sp.set_attribute("subagent.outcome", invoke_outcome)
|
||||||
|
ot_metrics.record_subagent_invoke_duration(
|
||||||
|
(time.perf_counter() - invoke_start) * 1000,
|
||||||
|
subagent_type=subagent_type,
|
||||||
|
path=invoke_path,
|
||||||
|
outcome=invoke_outcome,
|
||||||
|
)
|
||||||
|
ot_metrics.record_subagent_invoke_outcome(
|
||||||
|
subagent_type=subagent_type,
|
||||||
|
path=invoke_path,
|
||||||
|
outcome=invoke_outcome,
|
||||||
|
)
|
||||||
|
raise
|
||||||
|
invoke_elapsed_ms = (time.perf_counter() - invoke_start) * 1000
|
||||||
|
ot_metrics.record_subagent_invoke_duration(
|
||||||
|
invoke_elapsed_ms,
|
||||||
|
subagent_type=subagent_type,
|
||||||
|
path=invoke_path,
|
||||||
|
outcome=invoke_outcome,
|
||||||
|
)
|
||||||
|
ot_metrics.record_subagent_invoke_outcome(
|
||||||
|
subagent_type=subagent_type,
|
||||||
|
path=invoke_path,
|
||||||
|
outcome=invoke_outcome,
|
||||||
|
)
|
||||||
return _return_command_with_state_update(result, runtime.tool_call_id)
|
return _return_command_with_state_update(result, runtime.tool_call_id)
|
||||||
|
|
||||||
async def atask(
|
async def atask(
|
||||||
|
|
@ -274,13 +354,29 @@ def build_task_tool_with_parent_config(
|
||||||
# Prevent the parent's resume payload from leaking into subagent
|
# Prevent the parent's resume payload from leaking into subagent
|
||||||
# interrupts via langgraph's parent_scratchpad fallback.
|
# interrupts via langgraph's parent_scratchpad fallback.
|
||||||
drain_parent_null_resume(runtime)
|
drain_parent_null_resume(runtime)
|
||||||
|
with ot.subagent_invoke_span(
|
||||||
|
subagent_type=subagent_type, path=invoke_path
|
||||||
|
) as sp:
|
||||||
try:
|
try:
|
||||||
result = await subagent.ainvoke(
|
result = await subagent.ainvoke(
|
||||||
build_resume_command(resume_value, pending_id),
|
build_resume_command(resume_value, pending_id),
|
||||||
config=sub_config,
|
config=sub_config,
|
||||||
)
|
)
|
||||||
|
sp.set_attribute("subagent.outcome", ainvoke_outcome)
|
||||||
except GraphInterrupt as gi:
|
except GraphInterrupt as gi:
|
||||||
ainvoke_outcome = "interrupted"
|
ainvoke_outcome = "interrupted"
|
||||||
|
sp.set_attribute("subagent.outcome", ainvoke_outcome)
|
||||||
|
ot_metrics.record_subagent_invoke_duration(
|
||||||
|
(time.perf_counter() - ainvoke_start) * 1000,
|
||||||
|
subagent_type=subagent_type,
|
||||||
|
path=invoke_path,
|
||||||
|
outcome=ainvoke_outcome,
|
||||||
|
)
|
||||||
|
ot_metrics.record_subagent_invoke_outcome(
|
||||||
|
subagent_type=subagent_type,
|
||||||
|
path=invoke_path,
|
||||||
|
outcome=ainvoke_outcome,
|
||||||
|
)
|
||||||
_perf_log.info(
|
_perf_log.info(
|
||||||
"[hitl_route] atask EXIT subagent_type=%r path=%s outcome=%s "
|
"[hitl_route] atask EXIT subagent_type=%r path=%s outcome=%s "
|
||||||
"aget_state=%.3fs ainvoke=%.3fs total=%.3fs",
|
"aget_state=%.3fs ainvoke=%.3fs total=%.3fs",
|
||||||
|
|
@ -292,11 +388,44 @@ def build_task_tool_with_parent_config(
|
||||||
time.perf_counter() - atask_start,
|
time.perf_counter() - atask_start,
|
||||||
)
|
)
|
||||||
_reraise_stamped_subagent_interrupt(gi, runtime.tool_call_id)
|
_reraise_stamped_subagent_interrupt(gi, runtime.tool_call_id)
|
||||||
|
except Exception:
|
||||||
|
ainvoke_outcome = "error"
|
||||||
|
sp.set_attribute("subagent.outcome", ainvoke_outcome)
|
||||||
|
ot_metrics.record_subagent_invoke_duration(
|
||||||
|
(time.perf_counter() - ainvoke_start) * 1000,
|
||||||
|
subagent_type=subagent_type,
|
||||||
|
path=invoke_path,
|
||||||
|
outcome=ainvoke_outcome,
|
||||||
|
)
|
||||||
|
ot_metrics.record_subagent_invoke_outcome(
|
||||||
|
subagent_type=subagent_type,
|
||||||
|
path=invoke_path,
|
||||||
|
outcome=ainvoke_outcome,
|
||||||
|
)
|
||||||
|
raise
|
||||||
else:
|
else:
|
||||||
|
with ot.subagent_invoke_span(
|
||||||
|
subagent_type=subagent_type, path=invoke_path
|
||||||
|
) as sp:
|
||||||
try:
|
try:
|
||||||
result = await subagent.ainvoke(subagent_state, config=sub_config)
|
result = await subagent.ainvoke(
|
||||||
|
subagent_state, config=sub_config
|
||||||
|
)
|
||||||
|
sp.set_attribute("subagent.outcome", ainvoke_outcome)
|
||||||
except GraphInterrupt as gi:
|
except GraphInterrupt as gi:
|
||||||
ainvoke_outcome = "interrupted"
|
ainvoke_outcome = "interrupted"
|
||||||
|
sp.set_attribute("subagent.outcome", ainvoke_outcome)
|
||||||
|
ot_metrics.record_subagent_invoke_duration(
|
||||||
|
(time.perf_counter() - ainvoke_start) * 1000,
|
||||||
|
subagent_type=subagent_type,
|
||||||
|
path=invoke_path,
|
||||||
|
outcome=ainvoke_outcome,
|
||||||
|
)
|
||||||
|
ot_metrics.record_subagent_invoke_outcome(
|
||||||
|
subagent_type=subagent_type,
|
||||||
|
path=invoke_path,
|
||||||
|
outcome=ainvoke_outcome,
|
||||||
|
)
|
||||||
_perf_log.info(
|
_perf_log.info(
|
||||||
"[hitl_route] atask EXIT subagent_type=%r path=%s outcome=%s "
|
"[hitl_route] atask EXIT subagent_type=%r path=%s outcome=%s "
|
||||||
"aget_state=%.3fs ainvoke=%.3fs total=%.3fs",
|
"aget_state=%.3fs ainvoke=%.3fs total=%.3fs",
|
||||||
|
|
@ -308,6 +437,21 @@ def build_task_tool_with_parent_config(
|
||||||
time.perf_counter() - atask_start,
|
time.perf_counter() - atask_start,
|
||||||
)
|
)
|
||||||
_reraise_stamped_subagent_interrupt(gi, runtime.tool_call_id)
|
_reraise_stamped_subagent_interrupt(gi, runtime.tool_call_id)
|
||||||
|
except Exception:
|
||||||
|
ainvoke_outcome = "error"
|
||||||
|
sp.set_attribute("subagent.outcome", ainvoke_outcome)
|
||||||
|
ot_metrics.record_subagent_invoke_duration(
|
||||||
|
(time.perf_counter() - ainvoke_start) * 1000,
|
||||||
|
subagent_type=subagent_type,
|
||||||
|
path=invoke_path,
|
||||||
|
outcome=ainvoke_outcome,
|
||||||
|
)
|
||||||
|
ot_metrics.record_subagent_invoke_outcome(
|
||||||
|
subagent_type=subagent_type,
|
||||||
|
path=invoke_path,
|
||||||
|
outcome=ainvoke_outcome,
|
||||||
|
)
|
||||||
|
raise
|
||||||
ainvoke_elapsed = time.perf_counter() - ainvoke_start
|
ainvoke_elapsed = time.perf_counter() - ainvoke_start
|
||||||
except GraphInterrupt:
|
except GraphInterrupt:
|
||||||
raise
|
raise
|
||||||
|
|
@ -326,6 +470,17 @@ def build_task_tool_with_parent_config(
|
||||||
merge_elapsed,
|
merge_elapsed,
|
||||||
time.perf_counter() - atask_start,
|
time.perf_counter() - atask_start,
|
||||||
)
|
)
|
||||||
|
ot_metrics.record_subagent_invoke_duration(
|
||||||
|
ainvoke_elapsed * 1000,
|
||||||
|
subagent_type=subagent_type,
|
||||||
|
path=invoke_path,
|
||||||
|
outcome=ainvoke_outcome,
|
||||||
|
)
|
||||||
|
ot_metrics.record_subagent_invoke_outcome(
|
||||||
|
subagent_type=subagent_type,
|
||||||
|
path=invoke_path,
|
||||||
|
outcome=ainvoke_outcome,
|
||||||
|
)
|
||||||
return cmd
|
return cmd
|
||||||
|
|
||||||
return StructuredTool.from_function(
|
return StructuredTool.from_function(
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue