mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-25 19:15:18 +02:00
Merge pull request #1427 from AnishSarkar22/feat/opentelemetry
feat: OpenTelemetry integration
This commit is contained in:
commit
da4ba09d88
45 changed files with 7554 additions and 4442 deletions
|
|
@ -7,6 +7,9 @@
|
||||||
# SurfSense version (use "latest" or a specific version like "0.0.14")
|
# SurfSense version (use "latest" or a specific version like "0.0.14")
|
||||||
SURFSENSE_VERSION=latest
|
SURFSENSE_VERSION=latest
|
||||||
|
|
||||||
|
# Deployment environment: dev or production
|
||||||
|
SURFSENSE_ENV=production
|
||||||
|
|
||||||
# ------------------------------------------------------------------------------
|
# ------------------------------------------------------------------------------
|
||||||
# Core Settings
|
# Core Settings
|
||||||
# ------------------------------------------------------------------------------
|
# ------------------------------------------------------------------------------
|
||||||
|
|
@ -304,6 +307,28 @@ STT_SERVICE=local/base
|
||||||
# LANGSMITH_API_KEY=
|
# LANGSMITH_API_KEY=
|
||||||
# LANGSMITH_PROJECT=surfsense
|
# LANGSMITH_PROJECT=surfsense
|
||||||
|
|
||||||
|
# OpenTelemetry traces and metrics.
|
||||||
|
# Enable the collector with: docker compose --profile observability up -d
|
||||||
|
# SURFSENSE_ENABLE_OTEL=true
|
||||||
|
# OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317
|
||||||
|
# OTEL_EXPORTER_OTLP_PROTOCOL=grpc
|
||||||
|
# OTEL_RESOURCE_ATTRIBUTES=service.namespace=surfsense
|
||||||
|
#
|
||||||
|
# Emergency kill switch.
|
||||||
|
# OTEL_SDK_DISABLED=true
|
||||||
|
#
|
||||||
|
# Grafana Cloud OTLP credentials. These are used only by the collector container.
|
||||||
|
# GRAFANA_CLOUD_OTLP_ENDPOINT=https://otlp-gateway-<region>.grafana.net/otlp
|
||||||
|
# GRAFANA_CLOUD_INSTANCE_ID=
|
||||||
|
# GRAFANA_CLOUD_API_KEY=
|
||||||
|
#
|
||||||
|
# Optional host port overrides for the bundled OTel Collector. Only change
|
||||||
|
# these if the host already uses 4317/4318/13133; backend containers still use
|
||||||
|
# the internal Docker endpoint above.
|
||||||
|
# OTEL_GRPC_PORT=4317
|
||||||
|
# OTEL_HTTP_PORT=4318
|
||||||
|
# OTEL_HEALTH_PORT=13133
|
||||||
|
|
||||||
# ------------------------------------------------------------------------------
|
# ------------------------------------------------------------------------------
|
||||||
# Advanced (optional)
|
# Advanced (optional)
|
||||||
# ------------------------------------------------------------------------------
|
# ------------------------------------------------------------------------------
|
||||||
|
|
|
||||||
|
|
@ -78,6 +78,15 @@ services:
|
||||||
timeout: 5s
|
timeout: 5s
|
||||||
retries: 5
|
retries: 5
|
||||||
|
|
||||||
|
otel-lgtm:
|
||||||
|
image: grafana/otel-lgtm:latest
|
||||||
|
ports:
|
||||||
|
- "${OTEL_GRPC_PORT:-4317}:4317"
|
||||||
|
- "${OTEL_HTTP_PORT:-4318}:4318"
|
||||||
|
- "${OTEL_GRAFANA_PORT:-3001}:3000"
|
||||||
|
- "${OTEL_TEMPO_PORT:-3200}:3200"
|
||||||
|
restart: unless-stopped
|
||||||
|
|
||||||
searxng:
|
searxng:
|
||||||
image: searxng/searxng:2026.3.13-3c1f68c59
|
image: searxng/searxng:2026.3.13-3c1f68c59
|
||||||
ports:
|
ports:
|
||||||
|
|
|
||||||
|
|
@ -61,6 +61,29 @@ services:
|
||||||
timeout: 5s
|
timeout: 5s
|
||||||
retries: 5
|
retries: 5
|
||||||
|
|
||||||
|
otel-collector:
|
||||||
|
image: otel/opentelemetry-collector-contrib:0.152.1
|
||||||
|
profiles:
|
||||||
|
- observability
|
||||||
|
command: ["--config=/etc/otelcol/config.yaml"]
|
||||||
|
volumes:
|
||||||
|
- ./otel-collector/config.yaml:/etc/otelcol/config.yaml:ro
|
||||||
|
environment:
|
||||||
|
GRAFANA_CLOUD_OTLP_ENDPOINT: ${GRAFANA_CLOUD_OTLP_ENDPOINT:-}
|
||||||
|
GRAFANA_CLOUD_INSTANCE_ID: ${GRAFANA_CLOUD_INSTANCE_ID:-}
|
||||||
|
GRAFANA_CLOUD_API_KEY: ${GRAFANA_CLOUD_API_KEY:-}
|
||||||
|
ports:
|
||||||
|
- "${OTEL_GRPC_PORT:-4317}:4317"
|
||||||
|
- "${OTEL_HTTP_PORT:-4318}:4318"
|
||||||
|
- "${OTEL_HEALTH_PORT:-13133}:13133"
|
||||||
|
mem_limit: 2g
|
||||||
|
restart: unless-stopped
|
||||||
|
healthcheck:
|
||||||
|
test: ["CMD", "/otelcol-contrib", "--version"]
|
||||||
|
interval: 30s
|
||||||
|
timeout: 5s
|
||||||
|
retries: 3
|
||||||
|
|
||||||
searxng:
|
searxng:
|
||||||
image: searxng/searxng:2026.3.13-3c1f68c59
|
image: searxng/searxng:2026.3.13-3c1f68c59
|
||||||
volumes:
|
volumes:
|
||||||
|
|
|
||||||
81
docker/otel-collector/config.yaml
Normal file
81
docker/otel-collector/config.yaml
Normal file
|
|
@ -0,0 +1,81 @@
|
||||||
|
extensions:
|
||||||
|
health_check:
|
||||||
|
endpoint: 0.0.0.0:13133
|
||||||
|
basicauth/grafana_cloud:
|
||||||
|
client_auth:
|
||||||
|
username: ${env:GRAFANA_CLOUD_INSTANCE_ID}
|
||||||
|
password: ${env:GRAFANA_CLOUD_API_KEY}
|
||||||
|
|
||||||
|
receivers:
|
||||||
|
otlp:
|
||||||
|
protocols:
|
||||||
|
grpc:
|
||||||
|
endpoint: 0.0.0.0:4317
|
||||||
|
http:
|
||||||
|
endpoint: 0.0.0.0:4318
|
||||||
|
|
||||||
|
processors:
|
||||||
|
# Percentage limits are calculated against the collector container memory limit.
|
||||||
|
# Keep a container memory limit set in docker-compose.yml/Coolify so these percentages resolve predictably.
|
||||||
|
memory_limiter:
|
||||||
|
check_interval: 1s
|
||||||
|
limit_percentage: 80
|
||||||
|
spike_limit_percentage: 25
|
||||||
|
|
||||||
|
attributes/scrub:
|
||||||
|
actions:
|
||||||
|
- key: http.request.header.authorization
|
||||||
|
action: delete
|
||||||
|
- key: http.request.header.cookie
|
||||||
|
action: delete
|
||||||
|
- key: db.statement
|
||||||
|
action: delete
|
||||||
|
|
||||||
|
tail_sampling:
|
||||||
|
decision_wait: 10s
|
||||||
|
num_traces: 50000
|
||||||
|
expected_new_traces_per_sec: 100
|
||||||
|
policies:
|
||||||
|
- name: errors
|
||||||
|
type: status_code
|
||||||
|
status_code:
|
||||||
|
status_codes: [ERROR]
|
||||||
|
- name: slow-requests
|
||||||
|
type: latency
|
||||||
|
latency:
|
||||||
|
threshold_ms: 500
|
||||||
|
- name: baseline
|
||||||
|
type: probabilistic
|
||||||
|
probabilistic:
|
||||||
|
sampling_percentage: 100
|
||||||
|
|
||||||
|
batch:
|
||||||
|
timeout: 5s
|
||||||
|
send_batch_size: 1024
|
||||||
|
send_batch_max_size: 2048
|
||||||
|
|
||||||
|
exporters:
|
||||||
|
otlp_http/grafana_cloud:
|
||||||
|
endpoint: ${env:GRAFANA_CLOUD_OTLP_ENDPOINT}
|
||||||
|
auth:
|
||||||
|
authenticator: basicauth/grafana_cloud
|
||||||
|
sending_queue:
|
||||||
|
enabled: true
|
||||||
|
queue_size: 10000
|
||||||
|
retry_on_failure:
|
||||||
|
enabled: true
|
||||||
|
initial_interval: 5s
|
||||||
|
max_interval: 30s
|
||||||
|
max_elapsed_time: 300s
|
||||||
|
|
||||||
|
service:
|
||||||
|
extensions: [health_check, basicauth/grafana_cloud]
|
||||||
|
pipelines:
|
||||||
|
traces:
|
||||||
|
receivers: [otlp]
|
||||||
|
processors: [memory_limiter, attributes/scrub, tail_sampling, batch]
|
||||||
|
exporters: [otlp_http/grafana_cloud]
|
||||||
|
metrics:
|
||||||
|
receivers: [otlp]
|
||||||
|
processors: [memory_limiter, batch]
|
||||||
|
exporters: [otlp_http/grafana_cloud]
|
||||||
|
|
@ -1,5 +1,8 @@
|
||||||
DATABASE_URL=postgresql+asyncpg://postgres:postgres@localhost:5432/surfsense
|
DATABASE_URL=postgresql+asyncpg://postgres:postgres@localhost:5432/surfsense
|
||||||
|
|
||||||
|
# Deployment environment: dev or production
|
||||||
|
SURFSENSE_ENV=dev
|
||||||
|
|
||||||
#Celery Config
|
#Celery Config
|
||||||
CELERY_BROKER_URL=redis://localhost:6379/0
|
CELERY_BROKER_URL=redis://localhost:6379/0
|
||||||
CELERY_RESULT_BACKEND=redis://localhost:6379/0
|
CELERY_RESULT_BACKEND=redis://localhost:6379/0
|
||||||
|
|
@ -303,8 +306,16 @@ LANGSMITH_PROJECT=surfsense
|
||||||
# SURFSENSE_ENABLE_BUSY_MUTEX=false
|
# SURFSENSE_ENABLE_BUSY_MUTEX=false
|
||||||
# SURFSENSE_ENABLE_LLM_TOOL_SELECTOR=false # adds a per-turn LLM call
|
# SURFSENSE_ENABLE_LLM_TOOL_SELECTOR=false # adds a per-turn LLM call
|
||||||
|
|
||||||
# Observability — OTel (also requires OTEL_EXPORTER_OTLP_ENDPOINT)
|
# Observability - OTel
|
||||||
# SURFSENSE_ENABLE_OTEL=false
|
# SURFSENSE_ENABLE_OTEL=false
|
||||||
|
# OpenTelemetry - setting the endpoint enables export; when it is unset, telemetry is a no-op.
|
||||||
|
# Production should point at an OTel Collector. For local docker-compose.dev.yml,
|
||||||
|
# use http://otel-lgtm:4317 instead.
|
||||||
|
# OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317
|
||||||
|
# OTEL_EXPORTER_OTLP_PROTOCOL=grpc # or http/protobuf
|
||||||
|
# OTEL_RESOURCE_ATTRIBUTES=service.namespace=surfsense
|
||||||
|
# OTEL_METRIC_EXPORT_INTERVAL=300000 # ms; 5 minutes
|
||||||
|
# OTEL_SDK_DISABLED=true # emergency kill-switch
|
||||||
|
|
||||||
# Skills + subagents
|
# Skills + subagents
|
||||||
# SURFSENSE_ENABLE_SKILLS=false
|
# SURFSENSE_ENABLE_SKILLS=false
|
||||||
|
|
|
||||||
|
|
@ -20,6 +20,7 @@ from langchain_core.tools import StructuredTool
|
||||||
from langgraph.errors import GraphInterrupt
|
from langgraph.errors import GraphInterrupt
|
||||||
from langgraph.types import Command, Interrupt
|
from langgraph.types import Command, Interrupt
|
||||||
|
|
||||||
|
from app.observability import metrics as ot_metrics, otel as ot
|
||||||
from app.utils.perf import get_perf_logger
|
from app.utils.perf import get_perf_logger
|
||||||
|
|
||||||
from .config import (
|
from .config import (
|
||||||
|
|
@ -173,6 +174,9 @@ def build_task_tool_with_parent_config(
|
||||||
exc_info=True,
|
exc_info=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
invoke_path = "resume" if pending_value is not None else "fresh"
|
||||||
|
invoke_start = time.perf_counter()
|
||||||
|
invoke_outcome = "ok"
|
||||||
if pending_value is not None:
|
if pending_value is not None:
|
||||||
resume_value = consume_surfsense_resume(runtime)
|
resume_value = consume_surfsense_resume(runtime)
|
||||||
if resume_value is None:
|
if resume_value is None:
|
||||||
|
|
@ -188,18 +192,94 @@ def build_task_tool_with_parent_config(
|
||||||
# Prevent the parent's resume payload from leaking into subagent
|
# Prevent the parent's resume payload from leaking into subagent
|
||||||
# interrupts via langgraph's parent_scratchpad fallback.
|
# interrupts via langgraph's parent_scratchpad fallback.
|
||||||
drain_parent_null_resume(runtime)
|
drain_parent_null_resume(runtime)
|
||||||
|
with ot.subagent_invoke_span(
|
||||||
|
subagent_type=subagent_type, path=invoke_path
|
||||||
|
) as sp:
|
||||||
try:
|
try:
|
||||||
result = subagent.invoke(
|
result = subagent.invoke(
|
||||||
build_resume_command(resume_value, pending_id),
|
build_resume_command(resume_value, pending_id),
|
||||||
config=sub_config,
|
config=sub_config,
|
||||||
)
|
)
|
||||||
|
sp.set_attribute("subagent.outcome", invoke_outcome)
|
||||||
except GraphInterrupt as gi:
|
except GraphInterrupt as gi:
|
||||||
|
invoke_outcome = "interrupted"
|
||||||
|
sp.set_attribute("subagent.outcome", invoke_outcome)
|
||||||
|
ot_metrics.record_subagent_invoke_duration(
|
||||||
|
(time.perf_counter() - invoke_start) * 1000,
|
||||||
|
subagent_type=subagent_type,
|
||||||
|
path=invoke_path,
|
||||||
|
outcome=invoke_outcome,
|
||||||
|
)
|
||||||
|
ot_metrics.record_subagent_invoke_outcome(
|
||||||
|
subagent_type=subagent_type,
|
||||||
|
path=invoke_path,
|
||||||
|
outcome=invoke_outcome,
|
||||||
|
)
|
||||||
_reraise_stamped_subagent_interrupt(gi, runtime.tool_call_id)
|
_reraise_stamped_subagent_interrupt(gi, runtime.tool_call_id)
|
||||||
|
except Exception:
|
||||||
|
invoke_outcome = "error"
|
||||||
|
sp.set_attribute("subagent.outcome", invoke_outcome)
|
||||||
|
ot_metrics.record_subagent_invoke_duration(
|
||||||
|
(time.perf_counter() - invoke_start) * 1000,
|
||||||
|
subagent_type=subagent_type,
|
||||||
|
path=invoke_path,
|
||||||
|
outcome=invoke_outcome,
|
||||||
|
)
|
||||||
|
ot_metrics.record_subagent_invoke_outcome(
|
||||||
|
subagent_type=subagent_type,
|
||||||
|
path=invoke_path,
|
||||||
|
outcome=invoke_outcome,
|
||||||
|
)
|
||||||
|
raise
|
||||||
else:
|
else:
|
||||||
|
with ot.subagent_invoke_span(
|
||||||
|
subagent_type=subagent_type, path=invoke_path
|
||||||
|
) as sp:
|
||||||
try:
|
try:
|
||||||
result = subagent.invoke(subagent_state, config=sub_config)
|
result = subagent.invoke(subagent_state, config=sub_config)
|
||||||
|
sp.set_attribute("subagent.outcome", invoke_outcome)
|
||||||
except GraphInterrupt as gi:
|
except GraphInterrupt as gi:
|
||||||
|
invoke_outcome = "interrupted"
|
||||||
|
sp.set_attribute("subagent.outcome", invoke_outcome)
|
||||||
|
ot_metrics.record_subagent_invoke_duration(
|
||||||
|
(time.perf_counter() - invoke_start) * 1000,
|
||||||
|
subagent_type=subagent_type,
|
||||||
|
path=invoke_path,
|
||||||
|
outcome=invoke_outcome,
|
||||||
|
)
|
||||||
|
ot_metrics.record_subagent_invoke_outcome(
|
||||||
|
subagent_type=subagent_type,
|
||||||
|
path=invoke_path,
|
||||||
|
outcome=invoke_outcome,
|
||||||
|
)
|
||||||
_reraise_stamped_subagent_interrupt(gi, runtime.tool_call_id)
|
_reraise_stamped_subagent_interrupt(gi, runtime.tool_call_id)
|
||||||
|
except Exception:
|
||||||
|
invoke_outcome = "error"
|
||||||
|
sp.set_attribute("subagent.outcome", invoke_outcome)
|
||||||
|
ot_metrics.record_subagent_invoke_duration(
|
||||||
|
(time.perf_counter() - invoke_start) * 1000,
|
||||||
|
subagent_type=subagent_type,
|
||||||
|
path=invoke_path,
|
||||||
|
outcome=invoke_outcome,
|
||||||
|
)
|
||||||
|
ot_metrics.record_subagent_invoke_outcome(
|
||||||
|
subagent_type=subagent_type,
|
||||||
|
path=invoke_path,
|
||||||
|
outcome=invoke_outcome,
|
||||||
|
)
|
||||||
|
raise
|
||||||
|
invoke_elapsed_ms = (time.perf_counter() - invoke_start) * 1000
|
||||||
|
ot_metrics.record_subagent_invoke_duration(
|
||||||
|
invoke_elapsed_ms,
|
||||||
|
subagent_type=subagent_type,
|
||||||
|
path=invoke_path,
|
||||||
|
outcome=invoke_outcome,
|
||||||
|
)
|
||||||
|
ot_metrics.record_subagent_invoke_outcome(
|
||||||
|
subagent_type=subagent_type,
|
||||||
|
path=invoke_path,
|
||||||
|
outcome=invoke_outcome,
|
||||||
|
)
|
||||||
return _return_command_with_state_update(result, runtime.tool_call_id)
|
return _return_command_with_state_update(result, runtime.tool_call_id)
|
||||||
|
|
||||||
async def atask(
|
async def atask(
|
||||||
|
|
@ -274,13 +354,29 @@ def build_task_tool_with_parent_config(
|
||||||
# Prevent the parent's resume payload from leaking into subagent
|
# Prevent the parent's resume payload from leaking into subagent
|
||||||
# interrupts via langgraph's parent_scratchpad fallback.
|
# interrupts via langgraph's parent_scratchpad fallback.
|
||||||
drain_parent_null_resume(runtime)
|
drain_parent_null_resume(runtime)
|
||||||
|
with ot.subagent_invoke_span(
|
||||||
|
subagent_type=subagent_type, path=invoke_path
|
||||||
|
) as sp:
|
||||||
try:
|
try:
|
||||||
result = await subagent.ainvoke(
|
result = await subagent.ainvoke(
|
||||||
build_resume_command(resume_value, pending_id),
|
build_resume_command(resume_value, pending_id),
|
||||||
config=sub_config,
|
config=sub_config,
|
||||||
)
|
)
|
||||||
|
sp.set_attribute("subagent.outcome", ainvoke_outcome)
|
||||||
except GraphInterrupt as gi:
|
except GraphInterrupt as gi:
|
||||||
ainvoke_outcome = "interrupted"
|
ainvoke_outcome = "interrupted"
|
||||||
|
sp.set_attribute("subagent.outcome", ainvoke_outcome)
|
||||||
|
ot_metrics.record_subagent_invoke_duration(
|
||||||
|
(time.perf_counter() - ainvoke_start) * 1000,
|
||||||
|
subagent_type=subagent_type,
|
||||||
|
path=invoke_path,
|
||||||
|
outcome=ainvoke_outcome,
|
||||||
|
)
|
||||||
|
ot_metrics.record_subagent_invoke_outcome(
|
||||||
|
subagent_type=subagent_type,
|
||||||
|
path=invoke_path,
|
||||||
|
outcome=ainvoke_outcome,
|
||||||
|
)
|
||||||
_perf_log.info(
|
_perf_log.info(
|
||||||
"[hitl_route] atask EXIT subagent_type=%r path=%s outcome=%s "
|
"[hitl_route] atask EXIT subagent_type=%r path=%s outcome=%s "
|
||||||
"aget_state=%.3fs ainvoke=%.3fs total=%.3fs",
|
"aget_state=%.3fs ainvoke=%.3fs total=%.3fs",
|
||||||
|
|
@ -292,11 +388,44 @@ def build_task_tool_with_parent_config(
|
||||||
time.perf_counter() - atask_start,
|
time.perf_counter() - atask_start,
|
||||||
)
|
)
|
||||||
_reraise_stamped_subagent_interrupt(gi, runtime.tool_call_id)
|
_reraise_stamped_subagent_interrupt(gi, runtime.tool_call_id)
|
||||||
|
except Exception:
|
||||||
|
ainvoke_outcome = "error"
|
||||||
|
sp.set_attribute("subagent.outcome", ainvoke_outcome)
|
||||||
|
ot_metrics.record_subagent_invoke_duration(
|
||||||
|
(time.perf_counter() - ainvoke_start) * 1000,
|
||||||
|
subagent_type=subagent_type,
|
||||||
|
path=invoke_path,
|
||||||
|
outcome=ainvoke_outcome,
|
||||||
|
)
|
||||||
|
ot_metrics.record_subagent_invoke_outcome(
|
||||||
|
subagent_type=subagent_type,
|
||||||
|
path=invoke_path,
|
||||||
|
outcome=ainvoke_outcome,
|
||||||
|
)
|
||||||
|
raise
|
||||||
else:
|
else:
|
||||||
|
with ot.subagent_invoke_span(
|
||||||
|
subagent_type=subagent_type, path=invoke_path
|
||||||
|
) as sp:
|
||||||
try:
|
try:
|
||||||
result = await subagent.ainvoke(subagent_state, config=sub_config)
|
result = await subagent.ainvoke(
|
||||||
|
subagent_state, config=sub_config
|
||||||
|
)
|
||||||
|
sp.set_attribute("subagent.outcome", ainvoke_outcome)
|
||||||
except GraphInterrupt as gi:
|
except GraphInterrupt as gi:
|
||||||
ainvoke_outcome = "interrupted"
|
ainvoke_outcome = "interrupted"
|
||||||
|
sp.set_attribute("subagent.outcome", ainvoke_outcome)
|
||||||
|
ot_metrics.record_subagent_invoke_duration(
|
||||||
|
(time.perf_counter() - ainvoke_start) * 1000,
|
||||||
|
subagent_type=subagent_type,
|
||||||
|
path=invoke_path,
|
||||||
|
outcome=ainvoke_outcome,
|
||||||
|
)
|
||||||
|
ot_metrics.record_subagent_invoke_outcome(
|
||||||
|
subagent_type=subagent_type,
|
||||||
|
path=invoke_path,
|
||||||
|
outcome=ainvoke_outcome,
|
||||||
|
)
|
||||||
_perf_log.info(
|
_perf_log.info(
|
||||||
"[hitl_route] atask EXIT subagent_type=%r path=%s outcome=%s "
|
"[hitl_route] atask EXIT subagent_type=%r path=%s outcome=%s "
|
||||||
"aget_state=%.3fs ainvoke=%.3fs total=%.3fs",
|
"aget_state=%.3fs ainvoke=%.3fs total=%.3fs",
|
||||||
|
|
@ -308,6 +437,21 @@ def build_task_tool_with_parent_config(
|
||||||
time.perf_counter() - atask_start,
|
time.perf_counter() - atask_start,
|
||||||
)
|
)
|
||||||
_reraise_stamped_subagent_interrupt(gi, runtime.tool_call_id)
|
_reraise_stamped_subagent_interrupt(gi, runtime.tool_call_id)
|
||||||
|
except Exception:
|
||||||
|
ainvoke_outcome = "error"
|
||||||
|
sp.set_attribute("subagent.outcome", ainvoke_outcome)
|
||||||
|
ot_metrics.record_subagent_invoke_duration(
|
||||||
|
(time.perf_counter() - ainvoke_start) * 1000,
|
||||||
|
subagent_type=subagent_type,
|
||||||
|
path=invoke_path,
|
||||||
|
outcome=ainvoke_outcome,
|
||||||
|
)
|
||||||
|
ot_metrics.record_subagent_invoke_outcome(
|
||||||
|
subagent_type=subagent_type,
|
||||||
|
path=invoke_path,
|
||||||
|
outcome=ainvoke_outcome,
|
||||||
|
)
|
||||||
|
raise
|
||||||
ainvoke_elapsed = time.perf_counter() - ainvoke_start
|
ainvoke_elapsed = time.perf_counter() - ainvoke_start
|
||||||
except GraphInterrupt:
|
except GraphInterrupt:
|
||||||
raise
|
raise
|
||||||
|
|
@ -326,6 +470,17 @@ def build_task_tool_with_parent_config(
|
||||||
merge_elapsed,
|
merge_elapsed,
|
||||||
time.perf_counter() - atask_start,
|
time.perf_counter() - atask_start,
|
||||||
)
|
)
|
||||||
|
ot_metrics.record_subagent_invoke_duration(
|
||||||
|
ainvoke_elapsed * 1000,
|
||||||
|
subagent_type=subagent_type,
|
||||||
|
path=invoke_path,
|
||||||
|
outcome=ainvoke_outcome,
|
||||||
|
)
|
||||||
|
ot_metrics.record_subagent_invoke_outcome(
|
||||||
|
subagent_type=subagent_type,
|
||||||
|
path=invoke_path,
|
||||||
|
outcome=ainvoke_outcome,
|
||||||
|
)
|
||||||
return cmd
|
return cmd
|
||||||
|
|
||||||
return StructuredTool.from_function(
|
return StructuredTool.from_function(
|
||||||
|
|
|
||||||
|
|
@ -17,6 +17,7 @@ from langchain_core.tools import BaseTool
|
||||||
from langgraph.types import interrupt
|
from langgraph.types import interrupt
|
||||||
|
|
||||||
from app.agents.new_chat.permissions import Rule
|
from app.agents.new_chat.permissions import Rule
|
||||||
|
from app.observability import metrics as ot_metrics
|
||||||
from app.observability import otel as ot
|
from app.observability import otel as ot
|
||||||
|
|
||||||
from .decision import normalize_permission_decision
|
from .decision import normalize_permission_decision
|
||||||
|
|
@ -52,6 +53,8 @@ def request_permission_decision(
|
||||||
),
|
),
|
||||||
ot.interrupt_span(interrupt_type=PERMISSION_ASK_INTERRUPT_TYPE),
|
ot.interrupt_span(interrupt_type=PERMISSION_ASK_INTERRUPT_TYPE),
|
||||||
):
|
):
|
||||||
|
ot_metrics.record_permission_ask(permission=tool_name)
|
||||||
|
ot_metrics.record_interrupt(interrupt_type=PERMISSION_ASK_INTERRUPT_TYPE)
|
||||||
decision = interrupt(payload)
|
decision = interrupt(payload)
|
||||||
return normalize_permission_decision(decision)
|
return normalize_permission_decision(decision)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -34,6 +34,7 @@ from deepagents.middleware.summarization import (
|
||||||
)
|
)
|
||||||
from langchain_core.messages import SystemMessage
|
from langchain_core.messages import SystemMessage
|
||||||
|
|
||||||
|
from app.observability import metrics as ot_metrics
|
||||||
from app.observability import otel as ot
|
from app.observability import otel as ot
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
|
|
@ -178,6 +179,7 @@ class SurfSenseCompactionMiddleware(SummarizationMiddleware):
|
||||||
messages_in=len(conversation_messages),
|
messages_in=len(conversation_messages),
|
||||||
extra={"compaction.cutoff_index": int(cutoff_index)},
|
extra={"compaction.cutoff_index": int(cutoff_index)},
|
||||||
):
|
):
|
||||||
|
ot_metrics.record_compaction_run(reason="auto")
|
||||||
messages_to_summarize, preserved_messages = super()._partition_messages(
|
messages_to_summarize, preserved_messages = super()._partition_messages(
|
||||||
conversation_messages, cutoff_index
|
conversation_messages, cutoff_index
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -47,6 +47,7 @@ from langgraph.config import get_config
|
||||||
from langgraph.runtime import Runtime
|
from langgraph.runtime import Runtime
|
||||||
from langgraph.types import interrupt
|
from langgraph.types import interrupt
|
||||||
|
|
||||||
|
from app.observability import metrics as ot_metrics
|
||||||
from app.observability import otel as ot
|
from app.observability import otel as ot
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
@ -195,6 +196,7 @@ class DoomLoopMiddleware(AgentMiddleware[AgentState[ResponseT], ContextT, Respon
|
||||||
"interrupt.tool": (action or {}).get("tool", "<unknown>"),
|
"interrupt.tool": (action or {}).get("tool", "<unknown>"),
|
||||||
},
|
},
|
||||||
):
|
):
|
||||||
|
ot_metrics.record_interrupt(interrupt_type="permission_ask")
|
||||||
decision = interrupt(
|
decision = interrupt(
|
||||||
{
|
{
|
||||||
"type": "permission_ask",
|
"type": "permission_ask",
|
||||||
|
|
|
||||||
|
|
@ -16,13 +16,14 @@ dashboards expect.
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
|
import time
|
||||||
from collections.abc import Awaitable, Callable
|
from collections.abc import Awaitable, Callable
|
||||||
from typing import TYPE_CHECKING, Any
|
from typing import TYPE_CHECKING, Any
|
||||||
|
|
||||||
from langchain.agents.middleware import AgentMiddleware
|
from langchain.agents.middleware import AgentMiddleware
|
||||||
from langchain_core.messages import AIMessage, ToolMessage
|
from langchain_core.messages import AIMessage, ToolMessage
|
||||||
|
|
||||||
from app.observability import otel as ot
|
from app.observability import metrics as ot_metrics, otel as ot
|
||||||
|
|
||||||
if TYPE_CHECKING: # pragma: no cover — type-only
|
if TYPE_CHECKING: # pragma: no cover — type-only
|
||||||
from langchain.agents.middleware.types import (
|
from langchain.agents.middleware.types import (
|
||||||
|
|
@ -62,14 +63,37 @@ class OtelSpanMiddleware(AgentMiddleware):
|
||||||
return await handler(request)
|
return await handler(request)
|
||||||
|
|
||||||
model_id, provider = _resolve_model_attrs(request)
|
model_id, provider = _resolve_model_attrs(request)
|
||||||
|
t0 = time.perf_counter()
|
||||||
with ot.model_call_span(model_id=model_id, provider=provider) as sp:
|
with ot.model_call_span(model_id=model_id, provider=provider) as sp:
|
||||||
|
_annotate_model_request(sp, model_id=model_id, provider=provider)
|
||||||
try:
|
try:
|
||||||
result = await handler(request)
|
result = await handler(request)
|
||||||
except Exception:
|
except Exception:
|
||||||
|
ot_metrics.record_model_call_duration(
|
||||||
|
(time.perf_counter() - t0) * 1000,
|
||||||
|
model=model_id,
|
||||||
|
provider=provider,
|
||||||
|
)
|
||||||
# span context manager records + re-raises
|
# span context manager records + re-raises
|
||||||
raise
|
raise
|
||||||
else:
|
else:
|
||||||
_annotate_model_response(sp, result)
|
input_tokens, output_tokens = _annotate_model_response(
|
||||||
|
sp,
|
||||||
|
result,
|
||||||
|
model_id=model_id,
|
||||||
|
provider=provider,
|
||||||
|
)
|
||||||
|
ot_metrics.record_model_call_duration(
|
||||||
|
(time.perf_counter() - t0) * 1000,
|
||||||
|
model=model_id,
|
||||||
|
provider=provider,
|
||||||
|
)
|
||||||
|
ot_metrics.record_model_token_usage(
|
||||||
|
input_tokens=input_tokens,
|
||||||
|
output_tokens=output_tokens,
|
||||||
|
model=model_id,
|
||||||
|
provider=provider,
|
||||||
|
)
|
||||||
return result
|
return result
|
||||||
|
|
||||||
# ------------------------------------------------------------------
|
# ------------------------------------------------------------------
|
||||||
|
|
@ -87,9 +111,24 @@ class OtelSpanMiddleware(AgentMiddleware):
|
||||||
tool_name = _resolve_tool_name(request)
|
tool_name = _resolve_tool_name(request)
|
||||||
input_size = _resolve_input_size(request)
|
input_size = _resolve_input_size(request)
|
||||||
|
|
||||||
|
t0 = time.perf_counter()
|
||||||
with ot.tool_call_span(tool_name, input_size=input_size) as sp:
|
with ot.tool_call_span(tool_name, input_size=input_size) as sp:
|
||||||
|
try:
|
||||||
result = await handler(request)
|
result = await handler(request)
|
||||||
_annotate_tool_result(sp, result)
|
except Exception:
|
||||||
|
ot_metrics.record_tool_call_duration(
|
||||||
|
(time.perf_counter() - t0) * 1000,
|
||||||
|
tool_name=tool_name,
|
||||||
|
)
|
||||||
|
ot_metrics.record_tool_call_error(tool_name=tool_name)
|
||||||
|
raise
|
||||||
|
errored = _annotate_tool_result(sp, result)
|
||||||
|
ot_metrics.record_tool_call_duration(
|
||||||
|
(time.perf_counter() - t0) * 1000,
|
||||||
|
tool_name=tool_name,
|
||||||
|
)
|
||||||
|
if errored:
|
||||||
|
ot_metrics.record_tool_call_error(tool_name=tool_name)
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -154,8 +193,29 @@ def _resolve_input_size(request: Any) -> int | None:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
def _annotate_model_response(span: Any, result: Any) -> None:
|
def _annotate_model_request(
|
||||||
|
span: Any, *, model_id: str | None, provider: str | None
|
||||||
|
) -> None:
|
||||||
|
try:
|
||||||
|
span.set_attribute("gen_ai.operation.name", "chat")
|
||||||
|
if model_id:
|
||||||
|
span.set_attribute("gen_ai.request.model", model_id)
|
||||||
|
if provider:
|
||||||
|
span.set_attribute("gen_ai.provider.name", provider)
|
||||||
|
except Exception: # pragma: no cover — defensive
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def _annotate_model_response(
|
||||||
|
span: Any,
|
||||||
|
result: Any,
|
||||||
|
*,
|
||||||
|
model_id: str | None = None,
|
||||||
|
provider: str | None = None,
|
||||||
|
) -> tuple[int | None, int | None]:
|
||||||
"""Best-effort: attach prompt/completion token counts when available."""
|
"""Best-effort: attach prompt/completion token counts when available."""
|
||||||
|
input_tokens: int | None = None
|
||||||
|
output_tokens: int | None = None
|
||||||
try:
|
try:
|
||||||
# ModelResponse may be a dataclass with .result containing AIMessage
|
# ModelResponse may be a dataclass with .result containing AIMessage
|
||||||
msg: Any
|
msg: Any
|
||||||
|
|
@ -165,22 +225,42 @@ def _annotate_model_response(span: Any, result: Any) -> None:
|
||||||
inner = getattr(result, "result", None)
|
inner = getattr(result, "result", None)
|
||||||
msg = inner[-1] if isinstance(inner, list) and inner else inner
|
msg = inner[-1] if isinstance(inner, list) and inner else inner
|
||||||
if msg is None:
|
if msg is None:
|
||||||
return
|
return None, None
|
||||||
|
if provider:
|
||||||
|
span.set_attribute("gen_ai.provider.name", provider)
|
||||||
|
if model_id:
|
||||||
|
span.set_attribute("gen_ai.request.model", model_id)
|
||||||
|
response_model = getattr(msg, "response_metadata", {}) or {}
|
||||||
|
if isinstance(response_model, dict):
|
||||||
|
response_model = (
|
||||||
|
response_model.get("model_name")
|
||||||
|
or response_model.get("model")
|
||||||
|
or response_model.get("model_id")
|
||||||
|
)
|
||||||
|
if not response_model:
|
||||||
|
response_model = model_id
|
||||||
|
if response_model:
|
||||||
|
span.set_attribute("gen_ai.response.model", str(response_model))
|
||||||
|
span.set_attribute("gen_ai.operation.name", "chat")
|
||||||
usage = getattr(msg, "usage_metadata", None) or {}
|
usage = getattr(msg, "usage_metadata", None) or {}
|
||||||
if isinstance(usage, dict):
|
if isinstance(usage, dict):
|
||||||
if (n := usage.get("input_tokens")) is not None:
|
if (n := usage.get("input_tokens")) is not None:
|
||||||
span.set_attribute("tokens.prompt", int(n))
|
input_tokens = int(n)
|
||||||
|
span.set_attribute("gen_ai.usage.input_tokens", input_tokens)
|
||||||
if (n := usage.get("output_tokens")) is not None:
|
if (n := usage.get("output_tokens")) is not None:
|
||||||
span.set_attribute("tokens.completion", int(n))
|
output_tokens = int(n)
|
||||||
|
span.set_attribute("gen_ai.usage.output_tokens", output_tokens)
|
||||||
if (n := usage.get("total_tokens")) is not None:
|
if (n := usage.get("total_tokens")) is not None:
|
||||||
span.set_attribute("tokens.total", int(n))
|
span.set_attribute("gen_ai.usage.total_tokens", int(n))
|
||||||
tool_calls = getattr(msg, "tool_calls", None) or []
|
tool_calls = getattr(msg, "tool_calls", None) or []
|
||||||
span.set_attribute("model.tool_calls", len(tool_calls))
|
span.set_attribute("model.tool_calls", len(tool_calls))
|
||||||
except Exception: # pragma: no cover — defensive
|
except Exception: # pragma: no cover — defensive
|
||||||
pass
|
pass
|
||||||
|
return input_tokens, output_tokens
|
||||||
|
|
||||||
|
|
||||||
def _annotate_tool_result(span: Any, result: Any) -> None:
|
def _annotate_tool_result(span: Any, result: Any) -> bool:
|
||||||
|
errored = False
|
||||||
try:
|
try:
|
||||||
if isinstance(result, ToolMessage):
|
if isinstance(result, ToolMessage):
|
||||||
content = (
|
content = (
|
||||||
|
|
@ -192,11 +272,14 @@ def _annotate_tool_result(span: Any, result: Any) -> None:
|
||||||
status = getattr(result, "status", None)
|
status = getattr(result, "status", None)
|
||||||
if isinstance(status, str):
|
if isinstance(status, str):
|
||||||
span.set_attribute("tool.status", status)
|
span.set_attribute("tool.status", status)
|
||||||
|
errored = status.lower() == "error"
|
||||||
kwargs = getattr(result, "additional_kwargs", None) or {}
|
kwargs = getattr(result, "additional_kwargs", None) or {}
|
||||||
if isinstance(kwargs, dict) and kwargs.get("error"):
|
if isinstance(kwargs, dict) and kwargs.get("error"):
|
||||||
span.set_attribute("tool.error", True)
|
span.set_attribute("tool.error", True)
|
||||||
|
errored = True
|
||||||
except Exception: # pragma: no cover — defensive
|
except Exception: # pragma: no cover — defensive
|
||||||
pass
|
pass
|
||||||
|
return errored
|
||||||
|
|
||||||
|
|
||||||
__all__ = ["OtelSpanMiddleware"]
|
__all__ = ["OtelSpanMiddleware"]
|
||||||
|
|
|
||||||
|
|
@ -61,6 +61,7 @@ from app.agents.new_chat.permissions import (
|
||||||
aggregate_action,
|
aggregate_action,
|
||||||
evaluate_many,
|
evaluate_many,
|
||||||
)
|
)
|
||||||
|
from app.observability import metrics as ot_metrics
|
||||||
from app.observability import otel as ot
|
from app.observability import otel as ot
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
@ -284,6 +285,8 @@ class PermissionMiddleware(AgentMiddleware): # type: ignore[type-arg]
|
||||||
),
|
),
|
||||||
ot.interrupt_span(interrupt_type="permission_ask"),
|
ot.interrupt_span(interrupt_type="permission_ask"),
|
||||||
):
|
):
|
||||||
|
ot_metrics.record_permission_ask(permission=tool_name)
|
||||||
|
ot_metrics.record_interrupt(interrupt_type="permission_ask")
|
||||||
decision = interrupt(payload)
|
decision = interrupt(payload)
|
||||||
return _normalize_permission_decision(decision)
|
return _normalize_permission_decision(decision)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -45,6 +45,8 @@ from langchain.agents.middleware.types import (
|
||||||
from langchain_core.callbacks import adispatch_custom_event, dispatch_custom_event
|
from langchain_core.callbacks import adispatch_custom_event, dispatch_custom_event
|
||||||
from langchain_core.messages import AIMessage
|
from langchain_core.messages import AIMessage
|
||||||
|
|
||||||
|
from app.observability import metrics as ot_metrics, otel as ot
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
# Names of exception classes for which a retry would not help — context
|
# Names of exception classes for which a retry would not help — context
|
||||||
|
|
@ -198,6 +200,15 @@ class RetryAfterMiddleware(AgentMiddleware[AgentState[ResponseT], ContextT, Resp
|
||||||
if not self._should_retry(exc) or attempt >= self.max_retries:
|
if not self._should_retry(exc) or attempt >= self.max_retries:
|
||||||
raise
|
raise
|
||||||
delay = self._delay_for_attempt(attempt, exc)
|
delay = self._delay_for_attempt(attempt, exc)
|
||||||
|
ot.add_event(
|
||||||
|
"model.retry.scheduled",
|
||||||
|
{
|
||||||
|
"retry.attempt": attempt + 1,
|
||||||
|
"retry.max": self.max_retries,
|
||||||
|
"retry.delay_ms": int(delay * 1000),
|
||||||
|
"retry.reason": ot_metrics.categorize_exception(exc),
|
||||||
|
},
|
||||||
|
)
|
||||||
try:
|
try:
|
||||||
dispatch_custom_event(
|
dispatch_custom_event(
|
||||||
"surfsense.retrying",
|
"surfsense.retrying",
|
||||||
|
|
@ -231,6 +242,15 @@ class RetryAfterMiddleware(AgentMiddleware[AgentState[ResponseT], ContextT, Resp
|
||||||
if not self._should_retry(exc) or attempt >= self.max_retries:
|
if not self._should_retry(exc) or attempt >= self.max_retries:
|
||||||
raise
|
raise
|
||||||
delay = self._delay_for_attempt(attempt, exc)
|
delay = self._delay_for_attempt(attempt, exc)
|
||||||
|
ot.add_event(
|
||||||
|
"model.retry.scheduled",
|
||||||
|
{
|
||||||
|
"retry.attempt": attempt + 1,
|
||||||
|
"retry.max": self.max_retries,
|
||||||
|
"retry.delay_ms": int(delay * 1000),
|
||||||
|
"retry.reason": ot_metrics.categorize_exception(exc),
|
||||||
|
},
|
||||||
|
)
|
||||||
try:
|
try:
|
||||||
await adispatch_custom_event(
|
await adispatch_custom_event(
|
||||||
"surfsense.retrying",
|
"surfsense.retrying",
|
||||||
|
|
|
||||||
|
|
@ -6,6 +6,8 @@ from typing import TYPE_CHECKING, Any
|
||||||
|
|
||||||
from langchain.agents.middleware import ModelFallbackMiddleware
|
from langchain.agents.middleware import ModelFallbackMiddleware
|
||||||
|
|
||||||
|
from app.observability import metrics as ot_metrics, otel as ot
|
||||||
|
|
||||||
if TYPE_CHECKING:
|
if TYPE_CHECKING:
|
||||||
from collections.abc import Awaitable, Callable
|
from collections.abc import Awaitable, Callable
|
||||||
|
|
||||||
|
|
@ -55,7 +57,16 @@ class ScopedModelFallbackMiddleware(ModelFallbackMiddleware):
|
||||||
raise
|
raise
|
||||||
last_exception = e
|
last_exception = e
|
||||||
|
|
||||||
for fallback_model in self.models:
|
for attempt, fallback_model in enumerate(self.models, start=1):
|
||||||
|
ot.add_event(
|
||||||
|
"model.fallback",
|
||||||
|
{
|
||||||
|
"fallback.attempt": attempt,
|
||||||
|
"fallback.from": attempt - 1,
|
||||||
|
"fallback.to": attempt,
|
||||||
|
"fallback.reason": ot_metrics.categorize_exception(last_exception),
|
||||||
|
},
|
||||||
|
)
|
||||||
try:
|
try:
|
||||||
return handler(request.override(model=fallback_model))
|
return handler(request.override(model=fallback_model))
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|
@ -79,7 +90,16 @@ class ScopedModelFallbackMiddleware(ModelFallbackMiddleware):
|
||||||
raise
|
raise
|
||||||
last_exception = e
|
last_exception = e
|
||||||
|
|
||||||
for fallback_model in self.models:
|
for attempt, fallback_model in enumerate(self.models, start=1):
|
||||||
|
ot.add_event(
|
||||||
|
"model.fallback",
|
||||||
|
{
|
||||||
|
"fallback.attempt": attempt,
|
||||||
|
"fallback.from": attempt - 1,
|
||||||
|
"fallback.to": attempt,
|
||||||
|
"fallback.reason": ot_metrics.categorize_exception(last_exception),
|
||||||
|
},
|
||||||
|
)
|
||||||
try:
|
try:
|
||||||
return await handler(request.override(model=fallback_model))
|
return await handler(request.override(model=fallback_model))
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,5 @@
|
||||||
import asyncio
|
import asyncio
|
||||||
|
import contextlib
|
||||||
import gc
|
import gc
|
||||||
import logging
|
import logging
|
||||||
import time
|
import time
|
||||||
|
|
@ -36,13 +37,15 @@ from app.config import (
|
||||||
)
|
)
|
||||||
from app.db import User, create_db_and_tables, get_async_session
|
from app.db import User, create_db_and_tables, get_async_session
|
||||||
from app.exceptions import GENERIC_5XX_MESSAGE, ISSUES_URL, SurfSenseError
|
from app.exceptions import GENERIC_5XX_MESSAGE, ISSUES_URL, SurfSenseError
|
||||||
|
from app.observability import metrics as ot_metrics
|
||||||
|
from app.observability.bootstrap import init_otel, shutdown_otel
|
||||||
from app.rate_limiter import get_real_client_ip, limiter
|
from app.rate_limiter import get_real_client_ip, limiter
|
||||||
from app.routes import router as crud_router
|
from app.routes import router as crud_router
|
||||||
from app.routes.auth_routes import router as auth_router
|
from app.routes.auth_routes import router as auth_router
|
||||||
from app.schemas import UserCreate, UserRead, UserUpdate
|
from app.schemas import UserCreate, UserRead, UserUpdate
|
||||||
from app.tasks.surfsense_docs_indexer import seed_surfsense_docs
|
from app.tasks.surfsense_docs_indexer import seed_surfsense_docs
|
||||||
from app.users import SECRET, auth_backend, current_active_user, fastapi_users
|
from app.users import SECRET, auth_backend, current_active_user, fastapi_users
|
||||||
from app.utils.perf import get_perf_logger, log_system_snapshot
|
from app.utils.perf import log_system_snapshot
|
||||||
|
|
||||||
_error_logger = logging.getLogger("surfsense.errors")
|
_error_logger = logging.getLogger("surfsense.errors")
|
||||||
|
|
||||||
|
|
@ -127,6 +130,8 @@ def _http_exception_handler(request: Request, exc: HTTPException) -> JSONRespons
|
||||||
logged server-side.
|
logged server-side.
|
||||||
"""
|
"""
|
||||||
rid = _get_request_id(request)
|
rid = _get_request_id(request)
|
||||||
|
if exc.status_code in {401, 403} and request.url.path.startswith("/auth"):
|
||||||
|
ot_metrics.record_auth_failure(reason=_status_to_code(exc.status_code))
|
||||||
should_sanitize = exc.status_code == 500
|
should_sanitize = exc.status_code == 500
|
||||||
|
|
||||||
# Structured dict details (e.g. {"code": "CAPTCHA_REQUIRED", "message": "..."})
|
# Structured dict details (e.g. {"code": "CAPTCHA_REQUIRED", "message": "..."})
|
||||||
|
|
@ -213,6 +218,7 @@ def _validation_error_handler(
|
||||||
def _unhandled_exception_handler(request: Request, exc: Exception) -> JSONResponse:
|
def _unhandled_exception_handler(request: Request, exc: Exception) -> JSONResponse:
|
||||||
"""Catch-all: log full traceback, return sanitized 500."""
|
"""Catch-all: log full traceback, return sanitized 500."""
|
||||||
rid = _get_request_id(request)
|
rid = _get_request_id(request)
|
||||||
|
ot_metrics.record_auth_failure(reason="unhandled_exception")
|
||||||
_error_logger.error(
|
_error_logger.error(
|
||||||
"[%s] Unhandled exception on %s %s",
|
"[%s] Unhandled exception on %s %s",
|
||||||
rid,
|
rid,
|
||||||
|
|
@ -246,6 +252,7 @@ def _status_to_code(status_code: int, detail: str = "") -> str:
|
||||||
def _rate_limit_exceeded_handler(request: Request, exc: RateLimitExceeded):
|
def _rate_limit_exceeded_handler(request: Request, exc: RateLimitExceeded):
|
||||||
"""Custom 429 handler that returns JSON matching our error envelope."""
|
"""Custom 429 handler that returns JSON matching our error envelope."""
|
||||||
rid = _get_request_id(request)
|
rid = _get_request_id(request)
|
||||||
|
ot_metrics.record_rate_limit_rejection(scope="slowapi")
|
||||||
retry_after = exc.detail.split("per")[-1].strip() if exc.detail else "60"
|
retry_after = exc.detail.split("per")[-1].strip() if exc.detail else "60"
|
||||||
return _build_error_response(
|
return _build_error_response(
|
||||||
429,
|
429,
|
||||||
|
|
@ -306,6 +313,7 @@ def _check_rate_limit_memory(
|
||||||
f"Rate limit exceeded (in-memory fallback) on {scope} for IP {client_ip} "
|
f"Rate limit exceeded (in-memory fallback) on {scope} for IP {client_ip} "
|
||||||
f"({len(timestamps)}/{max_requests} in {window_seconds}s)"
|
f"({len(timestamps)}/{max_requests} in {window_seconds}s)"
|
||||||
)
|
)
|
||||||
|
ot_metrics.record_rate_limit_rejection(scope=scope)
|
||||||
raise HTTPException(
|
raise HTTPException(
|
||||||
status_code=429,
|
status_code=429,
|
||||||
detail="RATE_LIMIT_EXCEEDED",
|
detail="RATE_LIMIT_EXCEEDED",
|
||||||
|
|
@ -349,6 +357,7 @@ def _check_rate_limit(
|
||||||
f"Rate limit exceeded on {scope} for IP {client_ip} "
|
f"Rate limit exceeded on {scope} for IP {client_ip} "
|
||||||
f"({current_count}/{max_requests} in {window_seconds}s)"
|
f"({current_count}/{max_requests} in {window_seconds}s)"
|
||||||
)
|
)
|
||||||
|
ot_metrics.record_rate_limit_rejection(scope=scope)
|
||||||
raise HTTPException(
|
raise HTTPException(
|
||||||
status_code=429,
|
status_code=429,
|
||||||
detail="RATE_LIMIT_EXCEEDED",
|
detail="RATE_LIMIT_EXCEEDED",
|
||||||
|
|
@ -558,6 +567,7 @@ async def lifespan(app: FastAPI):
|
||||||
gc.set_threshold(700, 10, 5)
|
gc.set_threshold(700, 10, 5)
|
||||||
|
|
||||||
_enable_slow_callback_logging(threshold_sec=0.5)
|
_enable_slow_callback_logging(threshold_sec=0.5)
|
||||||
|
init_otel(app)
|
||||||
await create_db_and_tables()
|
await create_db_and_tables()
|
||||||
await setup_checkpointer_tables()
|
await setup_checkpointer_tables()
|
||||||
initialize_openrouter_integration()
|
initialize_openrouter_integration()
|
||||||
|
|
@ -592,6 +602,7 @@ async def lifespan(app: FastAPI):
|
||||||
|
|
||||||
_stop_openrouter_background_refresh()
|
_stop_openrouter_background_refresh()
|
||||||
await close_checkpointer()
|
await close_checkpointer()
|
||||||
|
shutdown_otel()
|
||||||
|
|
||||||
|
|
||||||
def registration_allowed():
|
def registration_allowed():
|
||||||
|
|
@ -676,32 +687,20 @@ class RequestPerfMiddleware(BaseHTTPMiddleware):
|
||||||
async def dispatch(
|
async def dispatch(
|
||||||
self, request: StarletteRequest, call_next: RequestResponseEndpoint
|
self, request: StarletteRequest, call_next: RequestResponseEndpoint
|
||||||
) -> StarletteResponse:
|
) -> StarletteResponse:
|
||||||
perf = get_perf_logger()
|
|
||||||
t0 = time.perf_counter()
|
t0 = time.perf_counter()
|
||||||
response = await call_next(request)
|
response = await call_next(request)
|
||||||
elapsed_ms = (time.perf_counter() - t0) * 1000
|
elapsed_ms = (time.perf_counter() - t0) * 1000
|
||||||
|
|
||||||
path = request.url.path
|
path = request.url.path
|
||||||
method = request.method
|
|
||||||
status = response.status_code
|
|
||||||
|
|
||||||
perf.debug(
|
|
||||||
"[request] %s %s -> %d in %.1fms",
|
|
||||||
method,
|
|
||||||
path,
|
|
||||||
status,
|
|
||||||
elapsed_ms,
|
|
||||||
)
|
|
||||||
|
|
||||||
if elapsed_ms > _PERF_SLOW_REQUEST_THRESHOLD:
|
if elapsed_ms > _PERF_SLOW_REQUEST_THRESHOLD:
|
||||||
perf.warning(
|
with contextlib.suppress(Exception):
|
||||||
"[SLOW_REQUEST] %s %s -> %d in %.1fms (threshold=%.0fms)",
|
from opentelemetry import trace
|
||||||
method,
|
|
||||||
path,
|
span = trace.get_current_span()
|
||||||
status,
|
span.set_attribute("slow_request", True)
|
||||||
elapsed_ms,
|
span.set_attribute("surfsense.request.elapsed_ms", elapsed_ms)
|
||||||
_PERF_SLOW_REQUEST_THRESHOLD,
|
span.set_attribute("http.route", path)
|
||||||
)
|
|
||||||
log_system_snapshot("slow_request")
|
log_system_snapshot("slow_request")
|
||||||
|
|
||||||
return response
|
return response
|
||||||
|
|
|
||||||
|
|
@ -1,16 +1,103 @@
|
||||||
"""Celery application configuration and setup."""
|
"""Celery application configuration and setup."""
|
||||||
|
|
||||||
|
import contextlib
|
||||||
import os
|
import os
|
||||||
|
import time
|
||||||
|
|
||||||
from celery import Celery
|
from celery import Celery
|
||||||
from celery.schedules import crontab
|
from celery.schedules import crontab
|
||||||
from celery.signals import worker_process_init
|
from celery.signals import (
|
||||||
|
before_task_publish,
|
||||||
|
task_postrun,
|
||||||
|
task_prerun,
|
||||||
|
worker_process_init,
|
||||||
|
)
|
||||||
from dotenv import load_dotenv
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
|
try:
|
||||||
|
from opentelemetry import trace
|
||||||
|
except ImportError: # pragma: no cover - optional OTel dependency
|
||||||
|
trace = None # type: ignore[assignment]
|
||||||
|
|
||||||
# Load environment variables
|
# Load environment variables
|
||||||
load_dotenv()
|
load_dotenv()
|
||||||
|
|
||||||
|
|
||||||
|
@before_task_publish.connect
|
||||||
|
def _stamp_enqueue_time(headers=None, **_kwargs):
|
||||||
|
"""Stamp enqueue time so workers can measure queue wait."""
|
||||||
|
if headers is None:
|
||||||
|
return
|
||||||
|
with contextlib.suppress(Exception):
|
||||||
|
headers["surfsense.enqueued_at_ns"] = str(time.monotonic_ns())
|
||||||
|
|
||||||
|
|
||||||
|
@task_prerun.connect
|
||||||
|
def _record_queue_latency(task=None, **_kwargs):
|
||||||
|
"""Record queue latency and stash task metadata for span enrichment."""
|
||||||
|
if task is None:
|
||||||
|
return
|
||||||
|
try:
|
||||||
|
from app.observability import metrics as ot_metrics
|
||||||
|
|
||||||
|
task_name = getattr(task, "name", None) or "unknown"
|
||||||
|
operation = ot_metrics.parse_celery_task_label(task_name)
|
||||||
|
request = getattr(task, "request", None)
|
||||||
|
delivery_info = getattr(request, "delivery_info", None) or {}
|
||||||
|
queue = delivery_info.get("routing_key") or "unknown"
|
||||||
|
scheduled = bool(
|
||||||
|
getattr(request, "eta", None) or getattr(request, "expires", None)
|
||||||
|
)
|
||||||
|
|
||||||
|
with contextlib.suppress(Exception):
|
||||||
|
request.surfsense_operation = operation
|
||||||
|
request.surfsense_queue = queue
|
||||||
|
request.surfsense_scheduled = scheduled
|
||||||
|
|
||||||
|
headers = getattr(request, "headers", None) or {}
|
||||||
|
enqueued_ns = headers.get("surfsense.enqueued_at_ns")
|
||||||
|
if enqueued_ns is None:
|
||||||
|
return
|
||||||
|
|
||||||
|
elapsed_s = (time.monotonic_ns() - int(enqueued_ns)) / 1e9
|
||||||
|
with contextlib.suppress(Exception):
|
||||||
|
request.surfsense_queue_latency_ms = elapsed_s * 1000
|
||||||
|
|
||||||
|
ot_metrics.record_celery_queue_latency(
|
||||||
|
elapsed_s,
|
||||||
|
task_name=task_name,
|
||||||
|
queue=queue,
|
||||||
|
scheduled=scheduled,
|
||||||
|
operation=operation,
|
||||||
|
)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
@task_postrun.connect
|
||||||
|
def _set_celery_span_attributes(task=None, **_kwargs):
|
||||||
|
"""Attach derived queue metadata to the active Celery run span."""
|
||||||
|
if task is None or trace is None:
|
||||||
|
return
|
||||||
|
|
||||||
|
try:
|
||||||
|
request = getattr(task, "request", None)
|
||||||
|
if request is None:
|
||||||
|
return
|
||||||
|
|
||||||
|
span = trace.get_current_span()
|
||||||
|
|
||||||
|
operation = getattr(request, "surfsense_operation", None)
|
||||||
|
if operation:
|
||||||
|
span.set_attribute("celery.task.operation", operation)
|
||||||
|
|
||||||
|
latency_ms = getattr(request, "surfsense_queue_latency_ms", None)
|
||||||
|
if latency_ms is not None:
|
||||||
|
span.set_attribute("celery.queue.latency_ms", latency_ms)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
@worker_process_init.connect
|
@worker_process_init.connect
|
||||||
def init_worker(**kwargs):
|
def init_worker(**kwargs):
|
||||||
"""Initialize the LLM Router and Image Gen Router when a Celery worker process starts.
|
"""Initialize the LLM Router and Image Gen Router when a Celery worker process starts.
|
||||||
|
|
@ -18,6 +105,10 @@ def init_worker(**kwargs):
|
||||||
This ensures the Auto mode (LiteLLM Router) is available for background tasks
|
This ensures the Auto mode (LiteLLM Router) is available for background tasks
|
||||||
like document summarization and image generation.
|
like document summarization and image generation.
|
||||||
"""
|
"""
|
||||||
|
from app.observability.bootstrap import init_otel
|
||||||
|
|
||||||
|
init_otel(app=None, traces=True, metrics=True, logs=True)
|
||||||
|
|
||||||
from app.config import (
|
from app.config import (
|
||||||
initialize_image_gen_router,
|
initialize_image_gen_router,
|
||||||
initialize_llm_router,
|
initialize_llm_router,
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,7 @@
|
||||||
|
import contextlib
|
||||||
import logging
|
import logging
|
||||||
|
import time
|
||||||
|
from pathlib import PurePosixPath
|
||||||
|
|
||||||
from app.config import config as app_config
|
from app.config import config as app_config
|
||||||
from app.etl_pipeline.etl_document import EtlRequest, EtlResult
|
from app.etl_pipeline.etl_document import EtlRequest, EtlResult
|
||||||
|
|
@ -10,6 +13,11 @@ from app.etl_pipeline.file_classifier import FileCategory, classify_file
|
||||||
from app.etl_pipeline.parsers.audio import transcribe_audio
|
from app.etl_pipeline.parsers.audio import transcribe_audio
|
||||||
from app.etl_pipeline.parsers.direct_convert import convert_file_directly
|
from app.etl_pipeline.parsers.direct_convert import convert_file_directly
|
||||||
from app.etl_pipeline.parsers.plaintext import read_plaintext
|
from app.etl_pipeline.parsers.plaintext import read_plaintext
|
||||||
|
from app.observability import metrics as ot_metrics, otel as ot
|
||||||
|
|
||||||
|
|
||||||
|
def _file_extension(filename: str) -> str:
|
||||||
|
return PurePosixPath(filename).suffix.lower() or "none"
|
||||||
|
|
||||||
|
|
||||||
class EtlPipelineService:
|
class EtlPipelineService:
|
||||||
|
|
@ -20,7 +28,16 @@ class EtlPipelineService:
|
||||||
|
|
||||||
async def extract(self, request: EtlRequest) -> EtlResult:
|
async def extract(self, request: EtlRequest) -> EtlResult:
|
||||||
category = classify_file(request.filename)
|
category = classify_file(request.filename)
|
||||||
|
start = time.perf_counter()
|
||||||
|
status = "success"
|
||||||
|
error_category: str | None = None
|
||||||
|
result: EtlResult | None = None
|
||||||
|
with ot.etl_extract_span(
|
||||||
|
content_type=category.value,
|
||||||
|
file_extension=_file_extension(request.filename),
|
||||||
|
processing_mode=request.processing_mode.value,
|
||||||
|
) as sp:
|
||||||
|
try:
|
||||||
if category == FileCategory.UNSUPPORTED:
|
if category == FileCategory.UNSUPPORTED:
|
||||||
raise EtlUnsupportedFileError(
|
raise EtlUnsupportedFileError(
|
||||||
f"File type not supported for parsing: {request.filename}"
|
f"File type not supported for parsing: {request.filename}"
|
||||||
|
|
@ -28,41 +45,74 @@ class EtlPipelineService:
|
||||||
|
|
||||||
if category == FileCategory.PLAINTEXT:
|
if category == FileCategory.PLAINTEXT:
|
||||||
content = read_plaintext(request.file_path)
|
content = read_plaintext(request.file_path)
|
||||||
return EtlResult(
|
result = EtlResult(
|
||||||
markdown_content=content,
|
markdown_content=content,
|
||||||
etl_service="PLAINTEXT",
|
etl_service="PLAINTEXT",
|
||||||
content_type="plaintext",
|
content_type="plaintext",
|
||||||
)
|
)
|
||||||
|
return result
|
||||||
|
|
||||||
if category == FileCategory.DIRECT_CONVERT:
|
if category == FileCategory.DIRECT_CONVERT:
|
||||||
content = convert_file_directly(request.file_path, request.filename)
|
content = convert_file_directly(request.file_path, request.filename)
|
||||||
return EtlResult(
|
result = EtlResult(
|
||||||
markdown_content=content,
|
markdown_content=content,
|
||||||
etl_service="DIRECT_CONVERT",
|
etl_service="DIRECT_CONVERT",
|
||||||
content_type="direct_convert",
|
content_type="direct_convert",
|
||||||
)
|
)
|
||||||
|
return result
|
||||||
|
|
||||||
if category == FileCategory.AUDIO:
|
if category == FileCategory.AUDIO:
|
||||||
content = await transcribe_audio(request.file_path, request.filename)
|
content = await transcribe_audio(request.file_path, request.filename)
|
||||||
return EtlResult(
|
result = EtlResult(
|
||||||
markdown_content=content,
|
markdown_content=content,
|
||||||
etl_service="AUDIO",
|
etl_service="AUDIO",
|
||||||
content_type="audio",
|
content_type="audio",
|
||||||
)
|
)
|
||||||
|
return result
|
||||||
|
|
||||||
if category == FileCategory.IMAGE:
|
if category == FileCategory.IMAGE:
|
||||||
return await self._extract_image(request)
|
result = await self._extract_image(request)
|
||||||
|
return result
|
||||||
|
|
||||||
return await self._extract_document(request)
|
result = await self._extract_document(request)
|
||||||
|
return result
|
||||||
|
except Exception as exc:
|
||||||
|
status = "error"
|
||||||
|
error_category = ot_metrics.categorize_exception(exc)
|
||||||
|
raise
|
||||||
|
finally:
|
||||||
|
with contextlib.suppress(Exception):
|
||||||
|
if result is not None:
|
||||||
|
sp.set_attribute("etl.service", result.etl_service)
|
||||||
|
sp.set_attribute("content.type", result.content_type)
|
||||||
|
sp.set_attribute("etl.status", status)
|
||||||
|
ot_metrics.record_etl_extract_duration(
|
||||||
|
time.perf_counter() - start,
|
||||||
|
etl_service=result.etl_service if result else None,
|
||||||
|
content_type=result.content_type if result else category.value,
|
||||||
|
status=status,
|
||||||
|
)
|
||||||
|
ot_metrics.record_etl_extract_outcome(
|
||||||
|
etl_service=result.etl_service if result else None,
|
||||||
|
content_type=result.content_type if result else category.value,
|
||||||
|
status=status,
|
||||||
|
error_category=error_category,
|
||||||
|
)
|
||||||
|
|
||||||
async def _extract_image(self, request: EtlRequest) -> EtlResult:
|
async def _extract_image(self, request: EtlRequest) -> EtlResult:
|
||||||
if self._vision_llm:
|
if self._vision_llm:
|
||||||
try:
|
try:
|
||||||
from app.etl_pipeline.parsers.vision_llm import parse_with_vision_llm
|
from app.etl_pipeline.parsers.vision_llm import parse_with_vision_llm
|
||||||
|
|
||||||
|
with ot.etl_parse_span(
|
||||||
|
etl_service="VISION_LLM",
|
||||||
|
content_type="image",
|
||||||
|
file_extension=_file_extension(request.filename),
|
||||||
|
) as sp:
|
||||||
content = await parse_with_vision_llm(
|
content = await parse_with_vision_llm(
|
||||||
request.file_path, request.filename, self._vision_llm
|
request.file_path, request.filename, self._vision_llm
|
||||||
)
|
)
|
||||||
|
sp.set_attribute("etl.status", "success")
|
||||||
return EtlResult(
|
return EtlResult(
|
||||||
markdown_content=content,
|
markdown_content=content,
|
||||||
etl_service="VISION_LLM",
|
etl_service="VISION_LLM",
|
||||||
|
|
@ -87,13 +137,33 @@ class EtlPipelineService:
|
||||||
request.filename,
|
request.filename,
|
||||||
exc_info=True,
|
exc_info=True,
|
||||||
)
|
)
|
||||||
|
ot.add_event(
|
||||||
|
"etl.fallback",
|
||||||
|
{
|
||||||
|
"fallback.from": "vision_llm",
|
||||||
|
"fallback.to": "document_parser",
|
||||||
|
"fallback.reason": ot_metrics.categorize_exception(exc),
|
||||||
|
},
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
logging.info(
|
logging.info(
|
||||||
"No vision LLM provided, falling back to document parser for %s",
|
"No vision LLM provided, falling back to document parser for %s",
|
||||||
request.filename,
|
request.filename,
|
||||||
)
|
)
|
||||||
|
ot.add_event(
|
||||||
|
"etl.fallback",
|
||||||
|
{
|
||||||
|
"fallback.from": "vision_llm",
|
||||||
|
"fallback.to": "document_parser",
|
||||||
|
"fallback.reason": "not_configured",
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
with ot.etl_ocr_span(
|
||||||
|
etl_service=app_config.ETL_SERVICE,
|
||||||
|
file_extension=_file_extension(request.filename),
|
||||||
|
):
|
||||||
return await self._extract_document(request)
|
return await self._extract_document(request)
|
||||||
except (EtlUnsupportedFileError, EtlServiceUnavailableError):
|
except (EtlUnsupportedFileError, EtlServiceUnavailableError):
|
||||||
raise EtlUnsupportedFileError(
|
raise EtlUnsupportedFileError(
|
||||||
|
|
@ -121,18 +191,27 @@ class EtlPipelineService:
|
||||||
f"File type {ext} is not supported by {etl_service}"
|
f"File type {ext} is not supported by {etl_service}"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
with ot.etl_parse_span(
|
||||||
|
etl_service=etl_service,
|
||||||
|
content_type="document",
|
||||||
|
file_extension=ext,
|
||||||
|
processing_mode=request.processing_mode.value,
|
||||||
|
) as sp:
|
||||||
if etl_service == "DOCLING":
|
if etl_service == "DOCLING":
|
||||||
from app.etl_pipeline.parsers.docling import parse_with_docling
|
from app.etl_pipeline.parsers.docling import parse_with_docling
|
||||||
|
|
||||||
content = await parse_with_docling(request.file_path, request.filename)
|
content = await parse_with_docling(request.file_path, request.filename)
|
||||||
elif etl_service == "UNSTRUCTURED":
|
elif etl_service == "UNSTRUCTURED":
|
||||||
from app.etl_pipeline.parsers.unstructured import parse_with_unstructured
|
from app.etl_pipeline.parsers.unstructured import (
|
||||||
|
parse_with_unstructured,
|
||||||
|
)
|
||||||
|
|
||||||
content = await parse_with_unstructured(request.file_path)
|
content = await parse_with_unstructured(request.file_path)
|
||||||
elif etl_service == "LLAMACLOUD":
|
elif etl_service == "LLAMACLOUD":
|
||||||
content = await self._extract_with_llamacloud(request)
|
content = await self._extract_with_llamacloud(request)
|
||||||
else:
|
else:
|
||||||
raise EtlServiceUnavailableError(f"Unknown ETL_SERVICE: {etl_service}")
|
raise EtlServiceUnavailableError(f"Unknown ETL_SERVICE: {etl_service}")
|
||||||
|
sp.set_attribute("etl.status", "success")
|
||||||
|
|
||||||
# When the operator opts into vision-LLM at ingest, walk the
|
# When the operator opts into vision-LLM at ingest, walk the
|
||||||
# original file's embedded images and append a structured
|
# original file's embedded images and append a structured
|
||||||
|
|
@ -171,9 +250,14 @@ class EtlPipelineService:
|
||||||
async def _ocr_image(image_path: str, image_name: str) -> str:
|
async def _ocr_image(image_path: str, image_name: str) -> str:
|
||||||
try:
|
try:
|
||||||
sub = EtlPipelineService(vision_llm=None)
|
sub = EtlPipelineService(vision_llm=None)
|
||||||
|
with ot.etl_picture_ocr_span(
|
||||||
|
file_extension=_file_extension(image_name)
|
||||||
|
) as sp:
|
||||||
ocr_result = await sub.extract(
|
ocr_result = await sub.extract(
|
||||||
EtlRequest(file_path=image_path, filename=image_name)
|
EtlRequest(file_path=image_path, filename=image_name)
|
||||||
)
|
)
|
||||||
|
sp.set_attribute("etl.service", ocr_result.etl_service)
|
||||||
|
sp.set_attribute("etl.status", "success")
|
||||||
except (
|
except (
|
||||||
EtlUnsupportedFileError,
|
EtlUnsupportedFileError,
|
||||||
EtlServiceUnavailableError,
|
EtlServiceUnavailableError,
|
||||||
|
|
@ -181,20 +265,42 @@ class EtlPipelineService:
|
||||||
# Common case: the configured ETL service can't OCR
|
# Common case: the configured ETL service can't OCR
|
||||||
# this image format (or no service is configured at
|
# this image format (or no service is configured at
|
||||||
# all). Don't spam warnings -- just no OCR for it.
|
# all). Don't spam warnings -- just no OCR for it.
|
||||||
|
ot.add_event(
|
||||||
|
"etl.ocr.skipped",
|
||||||
|
{
|
||||||
|
"skip.reason": "unsupported_format",
|
||||||
|
"error.category": ot_metrics.categorize_exception(exc),
|
||||||
|
},
|
||||||
|
)
|
||||||
logging.debug("Skipping per-image OCR for %s: %s", image_name, exc)
|
logging.debug("Skipping per-image OCR for %s: %s", image_name, exc)
|
||||||
return ""
|
return ""
|
||||||
return ocr_result.markdown_content
|
return ocr_result.markdown_content
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
with ot.etl_picture_describe_span() as sp:
|
||||||
result = await describe_pictures(
|
result = await describe_pictures(
|
||||||
request.file_path,
|
request.file_path,
|
||||||
request.filename,
|
request.filename,
|
||||||
self._vision_llm,
|
self._vision_llm,
|
||||||
ocr_runner=_ocr_image,
|
ocr_runner=_ocr_image,
|
||||||
)
|
)
|
||||||
except Exception:
|
sp.set_attribute("image.described.count", len(result.descriptions))
|
||||||
|
sp.set_attribute("image.failed.count", result.failed)
|
||||||
|
sp.set_attribute("image.skipped.too_small", result.skipped_too_small)
|
||||||
|
sp.set_attribute("image.skipped.too_large", result.skipped_too_large)
|
||||||
|
sp.set_attribute("image.skipped.duplicate", result.skipped_duplicate)
|
||||||
|
sp.set_attribute("etl.status", "success")
|
||||||
|
except Exception as exc:
|
||||||
# Picture description is additive; never let it fail an
|
# Picture description is additive; never let it fail an
|
||||||
# otherwise-successful document extraction.
|
# otherwise-successful document extraction.
|
||||||
|
ot.add_event(
|
||||||
|
"etl.degraded",
|
||||||
|
{
|
||||||
|
"degraded.reason": "picture_describe_failed",
|
||||||
|
"degraded.action": "return_parser_output",
|
||||||
|
"error.category": ot_metrics.categorize_exception(exc),
|
||||||
|
},
|
||||||
|
)
|
||||||
logging.warning(
|
logging.warning(
|
||||||
"Picture description failed for %s, returning parser output unchanged",
|
"Picture description failed for %s, returning parser output unchanged",
|
||||||
request.filename,
|
request.filename,
|
||||||
|
|
@ -247,7 +353,15 @@ class EtlPipelineService:
|
||||||
return await parse_with_azure_doc_intelligence(
|
return await parse_with_azure_doc_intelligence(
|
||||||
request.file_path, processing_mode=mode_value
|
request.file_path, processing_mode=mode_value
|
||||||
)
|
)
|
||||||
except Exception:
|
except Exception as exc:
|
||||||
|
ot.add_event(
|
||||||
|
"etl.fallback",
|
||||||
|
{
|
||||||
|
"fallback.from": "azure_di",
|
||||||
|
"fallback.to": "llamacloud",
|
||||||
|
"fallback.reason": ot_metrics.categorize_exception(exc),
|
||||||
|
},
|
||||||
|
)
|
||||||
logging.warning(
|
logging.warning(
|
||||||
"Azure Document Intelligence failed for %s, "
|
"Azure Document Intelligence failed for %s, "
|
||||||
"falling back to LlamaCloud",
|
"falling back to LlamaCloud",
|
||||||
|
|
|
||||||
|
|
@ -2,6 +2,7 @@ import asyncio
|
||||||
import contextlib
|
import contextlib
|
||||||
import hashlib
|
import hashlib
|
||||||
import logging
|
import logging
|
||||||
|
import sys
|
||||||
import time
|
import time
|
||||||
from collections.abc import Awaitable, Callable
|
from collections.abc import Awaitable, Callable
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
|
|
@ -57,6 +58,7 @@ from app.indexing_pipeline.pipeline_logger import (
|
||||||
log_retryable_llm_error,
|
log_retryable_llm_error,
|
||||||
log_unexpected_error,
|
log_unexpected_error,
|
||||||
)
|
)
|
||||||
|
from app.observability import metrics as ot_metrics, otel as ot
|
||||||
from app.utils.perf import get_perf_logger
|
from app.utils.perf import get_perf_logger
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -362,6 +364,16 @@ class IndexingPipelineService:
|
||||||
)
|
)
|
||||||
perf = get_perf_logger()
|
perf = get_perf_logger()
|
||||||
t_index = time.perf_counter()
|
t_index = time.perf_counter()
|
||||||
|
document_type = (
|
||||||
|
document.document_type.value
|
||||||
|
if getattr(document, "document_type", None)
|
||||||
|
else None
|
||||||
|
)
|
||||||
|
persist_span_cm = ot.kb_persist_span(
|
||||||
|
document_type=document_type,
|
||||||
|
)
|
||||||
|
persist_span = persist_span_cm.__enter__()
|
||||||
|
outcome_status = "failed"
|
||||||
try:
|
try:
|
||||||
log_index_started(ctx)
|
log_index_started(ctx)
|
||||||
document.status = DocumentStatus.processing()
|
document.status = DocumentStatus.processing()
|
||||||
|
|
@ -429,34 +441,41 @@ class IndexingPipelineService:
|
||||||
time.perf_counter() - t_index,
|
time.perf_counter() - t_index,
|
||||||
)
|
)
|
||||||
log_index_success(ctx, chunk_count=len(chunks))
|
log_index_success(ctx, chunk_count=len(chunks))
|
||||||
|
outcome_status = "success"
|
||||||
|
|
||||||
await self._enqueue_ai_sort_if_enabled(document)
|
await self._enqueue_ai_sort_if_enabled(document)
|
||||||
|
|
||||||
except RETRYABLE_LLM_ERRORS as e:
|
except RETRYABLE_LLM_ERRORS as e:
|
||||||
|
ot.record_error(persist_span, e)
|
||||||
log_retryable_llm_error(ctx, e)
|
log_retryable_llm_error(ctx, e)
|
||||||
|
outcome_status = "requeued"
|
||||||
await rollback_and_persist_failure(
|
await rollback_and_persist_failure(
|
||||||
self.session, document, llm_retryable_message(e)
|
self.session, document, llm_retryable_message(e)
|
||||||
)
|
)
|
||||||
|
|
||||||
except PERMANENT_LLM_ERRORS as e:
|
except PERMANENT_LLM_ERRORS as e:
|
||||||
|
ot.record_error(persist_span, e)
|
||||||
log_permanent_llm_error(ctx, e)
|
log_permanent_llm_error(ctx, e)
|
||||||
await rollback_and_persist_failure(
|
await rollback_and_persist_failure(
|
||||||
self.session, document, llm_permanent_message(e)
|
self.session, document, llm_permanent_message(e)
|
||||||
)
|
)
|
||||||
|
|
||||||
except RecursionError as e:
|
except RecursionError as e:
|
||||||
|
ot.record_error(persist_span, e)
|
||||||
log_chunking_overflow(ctx, e)
|
log_chunking_overflow(ctx, e)
|
||||||
await rollback_and_persist_failure(
|
await rollback_and_persist_failure(
|
||||||
self.session, document, PipelineMessages.CHUNKING_OVERFLOW
|
self.session, document, PipelineMessages.CHUNKING_OVERFLOW
|
||||||
)
|
)
|
||||||
|
|
||||||
except EMBEDDING_ERRORS as e:
|
except EMBEDDING_ERRORS as e:
|
||||||
|
ot.record_error(persist_span, e)
|
||||||
log_embedding_error(ctx, e)
|
log_embedding_error(ctx, e)
|
||||||
await rollback_and_persist_failure(
|
await rollback_and_persist_failure(
|
||||||
self.session, document, embedding_message(e)
|
self.session, document, embedding_message(e)
|
||||||
)
|
)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
ot.record_error(persist_span, e)
|
||||||
log_unexpected_error(ctx, e)
|
log_unexpected_error(ctx, e)
|
||||||
await rollback_and_persist_failure(
|
await rollback_and_persist_failure(
|
||||||
self.session, document, safe_exception_message(e)
|
self.session, document, safe_exception_message(e)
|
||||||
|
|
@ -465,6 +484,17 @@ class IndexingPipelineService:
|
||||||
with contextlib.suppress(Exception):
|
with contextlib.suppress(Exception):
|
||||||
await self.session.refresh(document)
|
await self.session.refresh(document)
|
||||||
|
|
||||||
|
with contextlib.suppress(Exception):
|
||||||
|
persist_span.set_attribute("indexing.status", outcome_status)
|
||||||
|
ot_metrics.record_indexing_document_duration(
|
||||||
|
time.perf_counter() - t_index,
|
||||||
|
document_type=document_type,
|
||||||
|
)
|
||||||
|
ot_metrics.record_indexing_document_outcome(
|
||||||
|
document_type=document_type,
|
||||||
|
status=outcome_status,
|
||||||
|
)
|
||||||
|
persist_span_cm.__exit__(*sys.exc_info())
|
||||||
return document
|
return document
|
||||||
|
|
||||||
async def _enqueue_ai_sort_if_enabled(self, document: Document) -> None:
|
async def _enqueue_ai_sort_if_enabled(self, document: Document) -> None:
|
||||||
|
|
|
||||||
|
|
@ -5,3 +5,5 @@ small wrapper around the optional ``opentelemetry`` instrumentation. The
|
||||||
wrapper is a no-op when OTEL is not configured, so importing it from
|
wrapper is a no-op when OTEL is not configured, so importing it from
|
||||||
performance-critical paths is safe.
|
performance-critical paths is safe.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
__all__ = ["bootstrap", "metrics", "otel"]
|
||||||
|
|
|
||||||
390
surfsense_backend/app/observability/bootstrap.py
Normal file
390
surfsense_backend/app/observability/bootstrap.py
Normal file
|
|
@ -0,0 +1,390 @@
|
||||||
|
"""Programmatic OpenTelemetry bootstrap for SurfSense backend processes."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import contextlib
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import socket
|
||||||
|
from importlib import metadata
|
||||||
|
from typing import Any
|
||||||
|
from urllib.parse import urlsplit, urlunsplit
|
||||||
|
|
||||||
|
from app.observability import otel
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
_BOOL_TRUE = {"1", "true", "yes", "on"}
|
||||||
|
|
||||||
|
_TRACES_INITIALIZED = False
|
||||||
|
_METRICS_INITIALIZED = False
|
||||||
|
_LOGS_INITIALIZED = False
|
||||||
|
_FASTAPI_INSTRUMENTED = False
|
||||||
|
_SQLALCHEMY_INSTRUMENTED = False
|
||||||
|
_PSYCOPG_INSTRUMENTED = False
|
||||||
|
_REDIS_INSTRUMENTED = False
|
||||||
|
_HTTPX_INSTRUMENTED = False
|
||||||
|
_CELERY_INSTRUMENTED = False
|
||||||
|
|
||||||
|
_TRACER_PROVIDER: Any | None = None
|
||||||
|
_METER_PROVIDER: Any | None = None
|
||||||
|
|
||||||
|
|
||||||
|
def _env_truthy(name: str) -> bool:
    """Return True when environment variable ``name`` holds a truthy flag.

    The value is stripped and lower-cased before it is looked up in
    ``_BOOL_TRUE``; an unset variable is treated as the empty string.
    """
    raw_value = os.environ.get(name, "")
    normalized = raw_value.strip().lower()
    return normalized in _BOOL_TRUE
|
||||||
|
|
||||||
|
|
||||||
|
def is_otel_disabled() -> bool:
    """Report whether a kill switch is set.

    ``SURFSENSE_DISABLE_OTEL`` is checked first, then ``OTEL_SDK_DISABLED``;
    either one being truthy disables OpenTelemetry.
    """
    if _env_truthy("SURFSENSE_DISABLE_OTEL"):
        return True
    return _env_truthy("OTEL_SDK_DISABLED")
|
||||||
|
|
||||||
|
|
||||||
|
def is_otel_configured() -> bool:
    """Report whether any OTLP endpoint environment variable is non-empty.

    The generic, traces-specific and metrics-specific endpoint variables are
    all accepted; one non-empty value is enough.
    """
    endpoint_vars = (
        "OTEL_EXPORTER_OTLP_ENDPOINT",
        "OTEL_EXPORTER_OTLP_TRACES_ENDPOINT",
        "OTEL_EXPORTER_OTLP_METRICS_ENDPOINT",
    )
    return any(os.environ.get(var) for var in endpoint_vars)
|
||||||
|
|
||||||
|
|
||||||
|
def _package_version() -> str:
|
||||||
|
with contextlib.suppress(metadata.PackageNotFoundError):
|
||||||
|
return metadata.version("surf-new-backend")
|
||||||
|
return "unknown"
|
||||||
|
|
||||||
|
|
||||||
|
def _deployment_environment() -> str:
|
||||||
|
return os.environ.get("SURFSENSE_ENV", "dev")
|
||||||
|
|
||||||
|
|
||||||
|
def _build_resource():
    """Build the OpenTelemetry ``Resource`` describing this process.

    The service name comes from ``OTEL_SERVICE_NAME`` (default
    ``"surfsense-backend"``), the instance id is the host name, and the
    deployment environment is written under two attribute keys.
    """
    from opentelemetry.sdk.resources import Resource

    environment = _deployment_environment()
    attributes = {
        "service.name": os.environ.get("OTEL_SERVICE_NAME", "surfsense-backend"),
        "service.version": _package_version(),
        "service.instance.id": socket.gethostname(),
        "deployment.environment.name": environment,
        # Compatibility alias for Grafana onboarding checks that still use
        # the older semantic-convention key.
        "deployment.environment": environment,
    }
    return Resource.create(attributes)
|
||||||
|
|
||||||
|
|
||||||
|
def _otlp_protocol() -> str:
|
||||||
|
return os.environ.get("OTEL_EXPORTER_OTLP_PROTOCOL", "grpc").strip().lower()
|
||||||
|
|
||||||
|
|
||||||
|
def _trace_exporter():
    """Create the OTLP span exporter for the configured protocol.

    The HTTP exporter is used when the protocol is ``"http/protobuf"``; every
    other value selects the gRPC exporter. A non-empty
    ``OTEL_EXPORTER_OTLP_TRACES_ENDPOINT`` is passed as ``endpoint``;
    otherwise the exporter is built with no arguments.
    """
    if _otlp_protocol() == "http/protobuf":
        from opentelemetry.exporter.otlp.proto.http.trace_exporter import (
            OTLPSpanExporter,
        )
    else:
        from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import (
            OTLPSpanExporter,
        )

    traces_endpoint = os.environ.get("OTEL_EXPORTER_OTLP_TRACES_ENDPOINT")
    if not traces_endpoint:
        return OTLPSpanExporter()
    return OTLPSpanExporter(endpoint=traces_endpoint)
|
||||||
|
|
||||||
|
|
||||||
|
def _metric_exporter():
    """Create the OTLP metric exporter for the configured protocol.

    The HTTP exporter is used when the protocol is ``"http/protobuf"``; every
    other value selects the gRPC exporter. A non-empty
    ``OTEL_EXPORTER_OTLP_METRICS_ENDPOINT`` is passed as ``endpoint``;
    otherwise the exporter is built with no arguments.
    """
    if _otlp_protocol() == "http/protobuf":
        from opentelemetry.exporter.otlp.proto.http.metric_exporter import (
            OTLPMetricExporter,
        )
    else:
        from opentelemetry.exporter.otlp.proto.grpc.metric_exporter import (
            OTLPMetricExporter,
        )

    metrics_endpoint = os.environ.get("OTEL_EXPORTER_OTLP_METRICS_ENDPOINT")
    if not metrics_endpoint:
        return OTLPMetricExporter()
    return OTLPMetricExporter(endpoint=metrics_endpoint)
|
||||||
|
|
||||||
|
|
||||||
|
def _safe_instrument(name: str, instrument: Any) -> bool:
|
||||||
|
try:
|
||||||
|
instrument()
|
||||||
|
except Exception:
|
||||||
|
logger.warning("OpenTelemetry %s instrumentation failed", name, exc_info=True)
|
||||||
|
return False
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
|
def _url_without_query(raw_url: Any) -> str | None:
|
||||||
|
try:
|
||||||
|
parts = urlsplit(str(raw_url))
|
||||||
|
except Exception:
|
||||||
|
return None
|
||||||
|
if not parts.scheme or not parts.netloc:
|
||||||
|
return None
|
||||||
|
return urlunsplit((parts.scheme, parts.netloc, parts.path or "/", "", ""))
|
||||||
|
|
||||||
|
|
||||||
|
def _sanitize_http_span_url(span: Any, request: Any) -> None:
    """Overwrite the span's URL attributes with a sanitized request URL.

    Does nothing when ``request`` has no usable ``url``. Errors raised while
    setting attributes are suppressed.
    """
    safe_url = _url_without_query(getattr(request, "url", None))
    if safe_url:
        with contextlib.suppress(Exception):
            # Keep both old and current semantic-convention names safe. The
            # collector can drop one later without needing application changes.
            for attribute in ("http.url", "url.full"):
                span.set_attribute(attribute, safe_url)
|
||||||
|
|
||||||
|
|
||||||
|
def _instrument_fastapi(app: Any | None) -> None:
    """Instrument ``app`` with the FastAPI instrumentor, at most once.

    Skipped when no app is given or when a previous call already succeeded.
    The ``/health``, ``/ready`` and ``/metrics`` URLs are excluded.
    """
    global _FASTAPI_INSTRUMENTED
    if _FASTAPI_INSTRUMENTED or app is None:
        return

    def _apply() -> None:
        from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor

        FastAPIInstrumentor.instrument_app(
            app,
            excluded_urls="/health,/ready,/metrics",
        )

    succeeded = _safe_instrument("FastAPI", _apply)
    if succeeded:
        _FASTAPI_INSTRUMENTED = True
|
||||||
|
|
||||||
|
|
||||||
|
def instrument_sqlalchemy_engine(engine: Any) -> None:
    """Instrument a SQLAlchemy engine once per process.

    When ``engine`` exposes a ``sync_engine`` attribute that object is
    instrumented; otherwise ``engine`` itself is. The SQL commenter is enabled.
    """
    global _SQLALCHEMY_INSTRUMENTED
    if _SQLALCHEMY_INSTRUMENTED:
        return

    def _apply() -> None:
        from opentelemetry.instrumentation.sqlalchemy import SQLAlchemyInstrumentor

        target_engine = getattr(engine, "sync_engine", engine)
        SQLAlchemyInstrumentor().instrument(
            engine=target_engine,
            enable_commenter=True,
        )

    succeeded = _safe_instrument("SQLAlchemy", _apply)
    if succeeded:
        _SQLALCHEMY_INSTRUMENTED = True
|
||||||
|
|
||||||
|
|
||||||
|
def _instrument_sqlalchemy() -> None:
    """Instrument the application's SQLAlchemy engine if not done already.

    The engine is imported from ``app.db`` lazily; any exception raised by
    the import or the instrumentation call is suppressed.
    """
    if not _SQLALCHEMY_INSTRUMENTED:
        with contextlib.suppress(Exception):
            from app.db import engine

            instrument_sqlalchemy_engine(engine)
|
||||||
|
|
||||||
|
|
||||||
|
def _instrument_psycopg() -> None:
    """Enable the psycopg instrumentor, at most once per process."""
    global _PSYCOPG_INSTRUMENTED
    if _PSYCOPG_INSTRUMENTED:
        return

    def _apply() -> None:
        from opentelemetry.instrumentation.psycopg import PsycopgInstrumentor

        instrumentor = PsycopgInstrumentor()
        instrumentor.instrument()

    succeeded = _safe_instrument("psycopg", _apply)
    if succeeded:
        _PSYCOPG_INSTRUMENTED = True
|
||||||
|
|
||||||
|
|
||||||
|
def _instrument_redis() -> None:
    """Enable the Redis instrumentor, at most once per process."""
    global _REDIS_INSTRUMENTED
    if _REDIS_INSTRUMENTED:
        return

    def _apply() -> None:
        from opentelemetry.instrumentation.redis import RedisInstrumentor

        instrumentor = RedisInstrumentor()
        instrumentor.instrument()

    succeeded = _safe_instrument("Redis", _apply)
    if succeeded:
        _REDIS_INSTRUMENTED = True
|
||||||
|
|
||||||
|
|
||||||
|
def _instrument_httpx() -> None:
    """Enable the HTTPX client instrumentor, at most once per process.

    Both the request hook and the response hook rewrite the span's URL
    attributes through ``_sanitize_http_span_url``.
    """
    global _HTTPX_INSTRUMENTED
    if _HTTPX_INSTRUMENTED:
        return

    def _on_request(span, request):
        return _sanitize_http_span_url(span, request)

    def _on_response(span, request, _response):
        return _sanitize_http_span_url(span, request)

    def _apply() -> None:
        from opentelemetry.instrumentation.httpx import HTTPXClientInstrumentor

        HTTPXClientInstrumentor().instrument(
            request_hook=_on_request,
            response_hook=_on_response,
        )

    succeeded = _safe_instrument("HTTPX", _apply)
    if succeeded:
        _HTTPX_INSTRUMENTED = True
|
||||||
|
|
||||||
|
|
||||||
|
def instrument_celery() -> None:
    """Instrument Celery producer/consumer hooks once per process."""
    global _CELERY_INSTRUMENTED
    if _CELERY_INSTRUMENTED:
        return

    def _apply() -> None:
        from opentelemetry.instrumentation.celery import CeleryInstrumentor

        instrumentor = CeleryInstrumentor()
        instrumentor.instrument()

    succeeded = _safe_instrument("Celery", _apply)
    if succeeded:
        _CELERY_INSTRUMENTED = True
|
||||||
|
|
||||||
|
|
||||||
|
def _instrument_libraries(app: Any | None) -> None:
    """Run every library instrumentor in a fixed order.

    FastAPI goes first (it is the only one that needs ``app``), followed by
    SQLAlchemy, psycopg, Redis, HTTPX and Celery.
    """
    _instrument_fastapi(app)
    for hook_name in (
        "_instrument_sqlalchemy",
        "_instrument_psycopg",
        "_instrument_redis",
        "_instrument_httpx",
        "instrument_celery",
    ):
        # Resolve each hook at call time, exactly as a direct call would.
        globals()[hook_name]()
|
||||||
|
|
||||||
|
|
||||||
|
def init_traces(app: Any | None = None) -> None:
    """Install the tracer provider, span processor, exporter, and instrumentors.

    On repeat calls only FastAPI instrumentation is attempted, so an app
    object supplied late is still instrumented.

    The provider built here uses a parent-based always-on sampler and a batch
    span processor. If a different tracer provider is already installed
    globally, that provider is kept (and remembered for ``shutdown_otel``) and
    the unused one built here is shut down so its processor does not linger.
    """
    global _TRACER_PROVIDER, _TRACES_INITIALIZED
    if _TRACES_INITIALIZED:
        _instrument_fastapi(app)
        return

    from opentelemetry import trace
    from opentelemetry.sdk.trace import TracerProvider
    from opentelemetry.sdk.trace.export import BatchSpanProcessor
    from opentelemetry.sdk.trace.sampling import ALWAYS_ON, ParentBased

    provider = TracerProvider(
        resource=_build_resource(),
        sampler=ParentBased(ALWAYS_ON),
    )
    provider.add_span_processor(BatchSpanProcessor(_trace_exporter()))

    conflict_logged = False
    try:
        trace.set_tracer_provider(provider)
    except Exception:
        conflict_logged = True
        logger.warning(
            "OpenTelemetry tracer provider was already set; reusing existing provider",
            exc_info=True,
        )

    # set_tracer_provider() may refuse to override an existing provider by
    # logging instead of raising, so ask the API which provider is live.
    installed = trace.get_tracer_provider()
    if installed is not provider:
        if not conflict_logged:
            logger.warning(
                "OpenTelemetry tracer provider was already set; reusing existing provider"
            )
        # Release the exporter/processor of the provider that was not installed.
        with contextlib.suppress(Exception):
            provider.shutdown()
    _TRACER_PROVIDER = installed

    _TRACES_INITIALIZED = True
    otel.reload_for_tests()
    _instrument_libraries(app)
|
||||||
|
|
||||||
|
|
||||||
|
def init_metrics() -> None:
    """Install the meter provider, metric reader, exporter, and custom gauges.

    Does nothing after the first successful call. The export interval is read
    from ``OTEL_METRIC_EXPORT_INTERVAL`` (milliseconds); a value that is not a
    positive integer is replaced by the 60000 ms default, with a warning,
    instead of aborting startup. If a different meter provider is already
    installed globally, that provider is kept (and remembered for
    ``shutdown_otel``) and the unused one built here is shut down.
    """
    global _METER_PROVIDER, _METRICS_INITIALIZED
    if _METRICS_INITIALIZED:
        return

    from opentelemetry import metrics
    from opentelemetry.sdk.metrics import MeterProvider
    from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader

    default_interval_ms = 60000
    raw_interval = os.environ.get(
        "OTEL_METRIC_EXPORT_INTERVAL", str(default_interval_ms)
    )
    try:
        interval_ms = int(raw_interval)
    except ValueError:
        interval_ms = 0
    if interval_ms <= 0:
        logger.warning(
            "Invalid OTEL_METRIC_EXPORT_INTERVAL %r; using %d ms",
            raw_interval,
            default_interval_ms,
        )
        interval_ms = default_interval_ms

    reader = PeriodicExportingMetricReader(
        _metric_exporter(),
        export_interval_millis=interval_ms,
    )
    provider = MeterProvider(metric_readers=[reader], resource=_build_resource())

    conflict_logged = False
    try:
        metrics.set_meter_provider(provider)
    except Exception:
        conflict_logged = True
        logger.warning(
            "OpenTelemetry meter provider was already set; reusing existing provider",
            exc_info=True,
        )

    # set_meter_provider() may refuse to override an existing provider by
    # logging instead of raising, so ask the API which provider is live.
    installed = metrics.get_meter_provider()
    if installed is not provider:
        if not conflict_logged:
            logger.warning(
                "OpenTelemetry meter provider was already set; reusing existing provider"
            )
        # Stop the periodic reader of the provider that was not installed.
        with contextlib.suppress(Exception):
            provider.shutdown()
    _METER_PROVIDER = installed

    _METRICS_INITIALIZED = True
    from app.observability.metrics import register_runtime_observables

    register_runtime_observables()
|
||||||
|
|
||||||
|
|
||||||
|
def init_logs() -> None:
    """Enable trace/span correlation fields on stdlib LogRecords.

    Runs the logging instrumentor once; later calls return immediately after
    a successful first run.
    """
    global _LOGS_INITIALIZED
    if _LOGS_INITIALIZED:
        return

    def _apply() -> None:
        from opentelemetry.instrumentation.logging import LoggingInstrumentor

        # Required for stdlib LogRecords to receive otelTraceID/otelSpanID.
        # logging.basicConfig is already installed by main.py, so this does not
        # take over formatting in normal app startup.
        instrumentor = LoggingInstrumentor()
        instrumentor.instrument(set_logging_format=True)

    succeeded = _safe_instrument("logging", _apply)
    if succeeded:
        _LOGS_INITIALIZED = True
|
||||||
|
|
||||||
|
|
||||||
|
def init_otel(
    app: Any | None = None,
    *,
    traces: bool = True,
    metrics: bool = True,
    logs: bool = True,
) -> None:
    """Initialize OpenTelemetry for a FastAPI or Celery process.

    When OTel is disabled or no endpoint is configured, only
    ``otel.reload_for_tests()`` is called. Otherwise the requested signals are
    initialized in the order traces, metrics, logs.
    """
    should_export = not is_otel_disabled() and is_otel_configured()
    if not should_export:
        otel.reload_for_tests()
        return

    if traces:
        init_traces(app)
    if metrics:
        init_metrics()
    if logs:
        init_logs()
|
||||||
|
|
||||||
|
|
||||||
|
def shutdown_otel(timeout_millis: int = 5000) -> None:
    """Best-effort flush and shutdown for installed providers.

    The tracer provider is handled before the meter provider. Flush and
    shutdown are attempted independently, and exceptions from either are
    suppressed.
    """
    installed = [
        candidate
        for candidate in (_TRACER_PROVIDER, _METER_PROVIDER)
        if candidate is not None
    ]
    for provider in installed:
        with contextlib.suppress(Exception):
            provider.force_flush(timeout_millis=timeout_millis)
        with contextlib.suppress(Exception):
            provider.shutdown()
|
||||||
|
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"_BOOL_TRUE",
|
||||||
|
"_build_resource",
|
||||||
|
"init_logs",
|
||||||
|
"init_metrics",
|
||||||
|
"init_otel",
|
||||||
|
"init_traces",
|
||||||
|
"instrument_celery",
|
||||||
|
"instrument_sqlalchemy_engine",
|
||||||
|
"is_otel_configured",
|
||||||
|
"is_otel_disabled",
|
||||||
|
"shutdown_otel",
|
||||||
|
]
|
||||||
684
surfsense_backend/app/observability/metrics.py
Normal file
684
surfsense_backend/app/observability/metrics.py
Normal file
|
|
@ -0,0 +1,684 @@
|
||||||
|
"""Custom OpenTelemetry metrics for SurfSense.
|
||||||
|
|
||||||
|
This module owns all SurfSense-specific metric instruments. Callers use the
|
||||||
|
small helper functions below instead of constructing instruments directly so
|
||||||
|
attribute names and cardinality stay consistent across the backend.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import contextlib
|
||||||
|
import gc
|
||||||
|
import logging
|
||||||
|
from functools import lru_cache
|
||||||
|
from importlib import metadata
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from app.observability import otel
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
_INSTRUMENTATION_NAME = "surfsense.platform"
|
||||||
|
_OBSERVABLES_REGISTERED = False
|
||||||
|
_ERROR_CATEGORY_UNKNOWN = "unknown"
|
||||||
|
|
||||||
|
_ERROR_CATEGORY_HINTS: tuple[tuple[str, tuple[str, ...]], ...] = (
|
||||||
|
("rate_limited", ("ratelimit", "rate_limit", "toomanyrequests", "429")),
|
||||||
|
("auth_failed", ("authentication", "auth", "unauthorized", "forbidden")),
|
||||||
|
("quota_exhausted", ("quota", "insufficient", "credit", "billing")),
|
||||||
|
("timeout", ("timeout", "timedout", "deadline")),
|
||||||
|
("network_failed", ("connection", "connect", "network", "dns", "socket")),
|
||||||
|
("server_error", ("internalserver", "serviceunavailable", "badgateway", "gateway")),
|
||||||
|
("lock_contention", ("lock", "busy", "contention", "alreadyrunning")),
|
||||||
|
("unsupported_format", ("unsupported", "format", "filetype")),
|
||||||
|
("provider_error", ("provider", "apierror", "apistatus", "badrequest")),
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _package_version() -> str:
|
||||||
|
with contextlib.suppress(metadata.PackageNotFoundError):
|
||||||
|
return metadata.version("surf-new-backend")
|
||||||
|
return "unknown"
|
||||||
|
|
||||||
|
|
||||||
|
def _is_enabled() -> bool:
    """Return whatever ``otel.is_enabled()`` reports right now."""
    enabled = otel.is_enabled()
    return enabled
|
||||||
|
|
||||||
|
|
||||||
|
def _clean_attrs(attrs: dict[str, Any]) -> dict[str, str | int | float | bool]:
|
||||||
|
"""Drop empty values and coerce low-cardinality attrs to OTel-safe scalars."""
|
||||||
|
cleaned: dict[str, str | int | float | bool] = {}
|
||||||
|
for key, value in attrs.items():
|
||||||
|
if value is None:
|
||||||
|
continue
|
||||||
|
if isinstance(value, bool | int | float):
|
||||||
|
cleaned[key] = value
|
||||||
|
continue
|
||||||
|
text = str(value)
|
||||||
|
if text:
|
||||||
|
cleaned[key] = text
|
||||||
|
return cleaned
|
||||||
|
|
||||||
|
|
||||||
|
def _attrs_with_optional_error_category(
|
||||||
|
attrs: dict[str, Any], error_category: str | None
|
||||||
|
) -> dict[str, Any]:
|
||||||
|
if error_category:
|
||||||
|
return {**attrs, "error.category": error_category}
|
||||||
|
return attrs
|
||||||
|
|
||||||
|
|
||||||
|
def categorize_exception(exc: BaseException | None) -> str:
    """Return a low-cardinality category for an exception.

    The class names along the exception type's MRO are normalized (hyphens
    and underscores removed, lower-cased) and joined with spaces. The first
    entry of ``_ERROR_CATEGORY_HINTS`` with a hint occurring as a substring
    of that text wins; ``None`` and unmatched exceptions map to the unknown
    category.
    """
    if exc is None:
        return _ERROR_CATEGORY_UNKNOWN

    normalized_names = [
        cls.__name__.replace("-", "").replace("_", "").lower()
        for cls in type(exc).__mro__
    ]
    haystack = " ".join(normalized_names)

    return next(
        (
            category
            for category, hints in _ERROR_CATEGORY_HINTS
            if any(hint in haystack for hint in hints)
        ),
        _ERROR_CATEGORY_UNKNOWN,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def parse_celery_task_label(task_name: str | None) -> str:
    """Return the operation token from a Celery task name.

    The token is the text before the first underscore, stripped of
    surrounding whitespace. A missing name or an empty token yields
    ``"unknown"``.
    """
    if not task_name:
        return "unknown"
    head, _, _ = str(task_name).partition("_")
    token = head.strip()
    return token if token else "unknown"
|
||||||
|
|
||||||
|
|
||||||
|
def _record(callable_obj: Any, value: int | float, attrs: dict[str, Any]) -> None:
    """Record ``value`` on ``callable_obj`` with cleaned attributes.

    Skipped entirely when OTel is not enabled; any exception raised while
    cleaning attributes or recording is suppressed.
    """
    if _is_enabled():
        with contextlib.suppress(Exception):
            attributes = _clean_attrs(attrs)
            callable_obj.record(value, attributes)
|
||||||
|
|
||||||
|
|
||||||
|
def _add(callable_obj: Any, value: int, attrs: dict[str, Any]) -> None:
    """Add ``value`` to ``callable_obj`` with cleaned attributes.

    Skipped entirely when OTel is not enabled; any exception raised while
    cleaning attributes or adding is suppressed.
    """
    if _is_enabled():
        with contextlib.suppress(Exception):
            attributes = _clean_attrs(attrs)
            callable_obj.add(value, attributes)
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache(maxsize=1)
def _get_meter():
    """Return the cached SurfSense meter, creating it on the first call.

    The meter is named by ``_INSTRUMENTATION_NAME`` and versioned with the
    installed package version.
    """
    from opentelemetry import metrics as otel_metrics

    version = _package_version()
    return otel_metrics.get_meter(_INSTRUMENTATION_NAME, version)
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache(maxsize=1)
def _model_call_duration():
    """Return the cached histogram of LLM model call durations (ms)."""
    meter = _get_meter()
    return meter.create_histogram(
        "surfsense.model.call.duration",
        unit="ms",
        description="Duration of SurfSense LLM model calls.",
    )
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache(maxsize=1)
def _model_token_usage():
    """Return the cached histogram of GenAI token usage."""
    meter = _get_meter()
    return meter.create_histogram(
        "gen_ai.client.token.usage",
        unit="{token}",
        description="Token usage reported by GenAI model responses.",
    )
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache(maxsize=1)
def _tool_call_duration():
    """Return the cached histogram of agent tool call durations (ms)."""
    meter = _get_meter()
    return meter.create_histogram(
        "surfsense.tool.call.duration",
        unit="ms",
        description="Duration of SurfSense agent tool calls.",
    )
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache(maxsize=1)
def _tool_call_errors():
    """Return the cached counter of agent tool call errors."""
    meter = _get_meter()
    return meter.create_counter(
        "surfsense.tool.call.errors",
        description="Count of SurfSense agent tool call errors.",
    )
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache(maxsize=1)
def _kb_search_duration():
    """Return the cached histogram of knowledge-base search durations (ms)."""
    meter = _get_meter()
    return meter.create_histogram(
        "surfsense.kb.search.duration",
        unit="ms",
        description="Duration of SurfSense knowledge-base search calls.",
    )
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache(maxsize=1)
def _compaction_runs():
    """Return the cached counter of conversation compaction runs."""
    meter = _get_meter()
    return meter.create_counter(
        "surfsense.compaction.runs",
        description="Count of SurfSense conversation compaction runs.",
    )
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache(maxsize=1)
def _permission_asks():
    """Return the cached counter of permission asks."""
    meter = _get_meter()
    return meter.create_counter(
        "surfsense.permission.asks",
        description="Count of SurfSense permission asks.",
    )
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache(maxsize=1)
def _interrupts():
    """Return the cached counter of raised interrupts."""
    meter = _get_meter()
    return meter.create_counter(
        "surfsense.interrupt.raised",
        description="Count of SurfSense interrupts raised.",
    )
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache(maxsize=1)
def _indexing_document_duration():
    """Return the cached histogram of document indexing durations (s)."""
    meter = _get_meter()
    return meter.create_histogram(
        "surfsense.indexing.document.duration",
        unit="s",
        description="Duration of SurfSense document indexing.",
    )
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache(maxsize=1)
def _indexing_document_outcome():
    """Return the cached counter of document indexing outcomes."""
    meter = _get_meter()
    return meter.create_counter(
        "surfsense.indexing.document.outcome",
        description="Count of SurfSense document indexing outcomes.",
    )
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache(maxsize=1)
def _connector_sync_duration():
    """Return the cached histogram of connector sync task durations (s)."""
    meter = _get_meter()
    return meter.create_histogram(
        "surfsense.connector.sync.duration",
        unit="s",
        description="Duration of SurfSense connector sync tasks.",
    )
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache(maxsize=1)
def _connector_sync_outcome():
    """Return the cached counter of connector sync outcomes."""
    meter = _get_meter()
    return meter.create_counter(
        "surfsense.connector.sync.outcome",
        description="Count of SurfSense connector sync outcomes.",
    )
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache(maxsize=1)
def _auth_failures():
    """Return the cached counter of authentication failures."""
    meter = _get_meter()
    return meter.create_counter(
        "surfsense.auth.failures",
        description="Count of SurfSense authentication failures.",
    )
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache(maxsize=1)
def _rate_limit_rejections():
    """Return the cached counter of rate-limit rejections."""
    meter = _get_meter()
    return meter.create_counter(
        "surfsense.rate_limit.rejections",
        description="Count of SurfSense rate-limit rejections.",
    )
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache(maxsize=1)
def _perf_elapsed():
    """Return the cached histogram fed by the perf timers (ms)."""
    meter = _get_meter()
    return meter.create_histogram(
        "surfsense.perf.elapsed_ms",
        unit="ms",
        description="Elapsed time recorded by SurfSense perf timers.",
    )
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache(maxsize=1)
def _chat_request_duration():
    """Return the cached histogram of streamed chat request durations (ms)."""
    meter = _get_meter()
    return meter.create_histogram(
        "surfsense.chat.request.duration",
        unit="ms",
        description="Duration of SurfSense streamed chat requests.",
    )
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache(maxsize=1)
def _chat_request_outcome():
    """Return the cached counter of chat request outcomes."""
    meter = _get_meter()
    return meter.create_counter(
        "surfsense.chat.request.outcome",
        description="Count of SurfSense chat request outcomes.",
    )
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache(maxsize=1)
def _subagent_invoke_duration():
    """Return the cached histogram of subagent invocation durations (ms)."""
    meter = _get_meter()
    return meter.create_histogram(
        "surfsense.subagent.invoke.duration",
        unit="ms",
        description="Duration of SurfSense subagent invocations.",
    )
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache(maxsize=1)
def _subagent_invoke_outcome():
    """Return the cached counter of subagent invocation outcomes."""
    meter = _get_meter()
    return meter.create_counter(
        "surfsense.subagent.invoke.outcome",
        description="Count of SurfSense subagent invocation outcomes.",
    )
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache(maxsize=1)
def _etl_extract_duration():
    """Return the cached histogram of ETL extraction durations (s)."""
    meter = _get_meter()
    return meter.create_histogram(
        "surfsense.etl.extract.duration",
        unit="s",
        description="Duration of SurfSense ETL extraction.",
    )
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache(maxsize=1)
def _etl_extract_outcome():
    """Return the cached counter of ETL extraction outcomes."""
    meter = _get_meter()
    return meter.create_counter(
        "surfsense.etl.extract.outcome",
        description="Count of SurfSense ETL extraction outcomes.",
    )
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache(maxsize=1)
def _celery_heartbeat_refreshes():
    """Return the cached counter of Celery heartbeat refreshes."""
    meter = _get_meter()
    return meter.create_counter(
        "surfsense.celery.heartbeat.refreshes",
        description="Count of SurfSense Celery heartbeat refreshes.",
    )
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache(maxsize=1)
def _celery_heartbeat_failures():
    """Return the cached counter of Celery heartbeat failures."""
    meter = _get_meter()
    return meter.create_counter(
        "surfsense.celery.heartbeat.failures",
        description="Count of SurfSense Celery heartbeat failures.",
    )
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache(maxsize=1)
def _celery_queue_latency():
    """Return the cached histogram of Celery queue wait times (s)."""
    meter = _get_meter()
    return meter.create_histogram(
        "surfsense.celery.queue.latency",
        unit="s",
        description="Time SurfSense Celery tasks spend waiting in queue.",
    )
|
||||||
|
|
||||||
|
|
||||||
|
def record_model_call_duration(
    duration_ms: float, *, model: str | None, provider: str | None
) -> None:
    """Record one LLM call duration, tagged with request model and provider."""
    histogram = _model_call_duration()
    attributes = {
        "gen_ai.request.model": model,
        "gen_ai.provider.name": provider,
    }
    _record(histogram, duration_ms, attributes)
|
||||||
|
|
||||||
|
|
||||||
|
def record_model_token_usage(
    *,
    input_tokens: int | None,
    output_tokens: int | None,
    model: str | None,
    provider: str | None,
) -> None:
    """Record input and output token counts for one model response.

    Each count that is not ``None`` is recorded as its own measurement,
    distinguished by the ``gen_ai.token.type`` attribute; input is recorded
    before output.
    """
    shared = {
        "gen_ai.request.model": model,
        "gen_ai.provider.name": provider,
        "gen_ai.operation.name": "chat",
    }
    for token_type, count in (("input", input_tokens), ("output", output_tokens)):
        if count is None:
            continue
        _record(
            _model_token_usage(),
            int(count),
            {**shared, "gen_ai.token.type": token_type},
        )
|
||||||
|
|
||||||
|
|
||||||
|
def record_tool_call_duration(duration_ms: float, *, tool_name: str) -> None:
    """Record one tool-call duration sample tagged with ``tool.name``."""
    attributes = {"tool.name": tool_name}
    _record(_tool_call_duration(), duration_ms, attributes)
|
||||||
|
|
||||||
|
|
||||||
|
def record_tool_call_error(*, tool_name: str) -> None:
    """Add 1 to the tool-call error counter, tagged with ``tool.name``."""
    attributes = {"tool.name": tool_name}
    _add(_tool_call_errors(), 1, attributes)
|
||||||
|
|
||||||
|
|
||||||
|
def record_kb_search_duration(
    duration_ms: float, *, search_space_id: int | None, surface: str
) -> None:
    """Record one knowledge-base search duration sample.

    Args:
        duration_ms: Value forwarded to the KB search duration instrument.
        search_space_id: Stored under ``search_space.id`` (forwarded unchanged).
        surface: Stored under ``search.surface``.
    """
    attributes = {
        "search_space.id": search_space_id,
        "search.surface": surface,
    }
    _record(_kb_search_duration(), duration_ms, attributes)
|
||||||
|
|
||||||
|
|
||||||
|
def record_compaction_run(*, reason: str | None) -> None:
    """Add 1 to the compaction-run counter.

    A falsy ``reason`` is reported as ``"unknown"`` under ``compaction.reason``.
    """
    attributes = {"compaction.reason": reason or "unknown"}
    _add(_compaction_runs(), 1, attributes)
|
||||||
|
|
||||||
|
|
||||||
|
def record_permission_ask(*, permission: str) -> None:
    """Add 1 to the permission-ask counter, tagged with ``permission.permission``."""
    attributes = {"permission.permission": permission}
    _add(_permission_asks(), 1, attributes)
|
||||||
|
|
||||||
|
|
||||||
|
def record_interrupt(*, interrupt_type: str) -> None:
    """Add 1 to the interrupt counter, tagged with ``interrupt.type``."""
    attributes = {"interrupt.type": interrupt_type}
    _add(_interrupts(), 1, attributes)
|
||||||
|
|
||||||
|
|
||||||
|
def record_indexing_document_duration(
    duration_s: float, *, document_type: str | None
) -> None:
    """Record one document-indexing duration sample.

    A falsy ``document_type`` is reported as ``"unknown"`` under
    ``document.type``.
    """
    attributes = {"document.type": document_type or "unknown"}
    _record(_indexing_document_duration(), duration_s, attributes)
|
||||||
|
|
||||||
|
|
||||||
|
def record_indexing_document_outcome(*, document_type: str | None, status: str) -> None:
    """Add 1 to the document-indexing outcome counter.

    Args:
        document_type: Stored under ``document.type``; falsy values become
            ``"unknown"``.
        status: Stored under ``status``.
    """
    attributes = {
        "document.type": document_type or "unknown",
        "status": status,
    }
    _add(_indexing_document_outcome(), 1, attributes)
|
||||||
|
|
||||||
|
|
||||||
|
def record_connector_sync_duration(
    duration_s: float, *, connector_type: str | None
) -> None:
    """Record one connector-sync duration sample.

    A falsy ``connector_type`` is reported as ``"unknown"`` under
    ``connector.type``.
    """
    attributes = {"connector.type": connector_type or "unknown"}
    _record(_connector_sync_duration(), duration_s, attributes)
|
||||||
|
|
||||||
|
|
||||||
|
def record_connector_sync_outcome(
    *, connector_type: str | None, status: str, error_category: str | None = None
) -> None:
    """Add 1 to the connector-sync outcome counter.

    Args:
        connector_type: Stored under ``connector.type``; falsy values become
            ``"unknown"``.
        status: Stored under ``status``.
        error_category: Handed to ``_attrs_with_optional_error_category``
            together with the base attributes.
    """
    base_attrs = {
        "connector.type": connector_type or "unknown",
        "status": status,
    }
    attributes = _attrs_with_optional_error_category(base_attrs, error_category)
    _add(_connector_sync_outcome(), 1, attributes)
|
||||||
|
|
||||||
|
|
||||||
|
def record_auth_failure(*, reason: str) -> None:
    """Add 1 to the auth-failure counter, tagged with ``reason``."""
    attributes = {"reason": reason}
    _add(_auth_failures(), 1, attributes)
|
||||||
|
|
||||||
|
|
||||||
|
def record_rate_limit_rejection(*, scope: str) -> None:
    """Add 1 to the rate-limit rejection counter, tagged with ``scope``."""
    attributes = {"scope": scope}
    _add(_rate_limit_rejections(), 1, attributes)
|
||||||
|
|
||||||
|
|
||||||
|
def record_perf_elapsed(duration_ms: float, *, label: str) -> None:
    """Record one perf-elapsed sample, tagged with ``label``."""
    attributes = {"label": label}
    _record(_perf_elapsed(), duration_ms, attributes)
|
||||||
|
|
||||||
|
|
||||||
|
def record_chat_request_duration(
    duration_ms: float,
    *,
    flow: str,
    outcome: str,
    agent_mode: str | None = None,
) -> None:
    """Record one chat-request duration sample.

    Args:
        duration_ms: Value forwarded to the chat-request duration instrument.
        flow: Stored under ``chat.flow``.
        outcome: Stored under ``outcome``.
        agent_mode: Stored under ``agent.mode`` (forwarded unchanged,
            including ``None``).
    """
    attributes = {
        "chat.flow": flow,
        "outcome": outcome,
        "agent.mode": agent_mode,
    }
    _record(_chat_request_duration(), duration_ms, attributes)
|
||||||
|
|
||||||
|
|
||||||
|
def record_chat_request_outcome(
    *,
    flow: str,
    outcome: str,
    agent_mode: str | None = None,
    error_category: str | None = None,
) -> None:
    """Add 1 to the chat-request outcome counter.

    Args:
        flow: Stored under ``chat.flow``.
        outcome: Stored under ``outcome``.
        agent_mode: Stored under ``agent.mode`` (forwarded unchanged).
        error_category: Handed to ``_attrs_with_optional_error_category``
            together with the base attributes.
    """
    base_attrs = {
        "chat.flow": flow,
        "outcome": outcome,
        "agent.mode": agent_mode,
    }
    attributes = _attrs_with_optional_error_category(base_attrs, error_category)
    _add(_chat_request_outcome(), 1, attributes)
|
||||||
|
|
||||||
|
|
||||||
|
def record_subagent_invoke_duration(
    duration_ms: float,
    *,
    subagent_type: str,
    path: str | None,
    outcome: str,
) -> None:
    """Record one subagent-invocation duration sample.

    Args:
        duration_ms: Value forwarded to the subagent duration instrument.
        subagent_type: Stored under ``subagent.type``.
        path: Stored under ``subagent.path``; falsy values become ``"unknown"``.
        outcome: Stored under ``outcome``.
    """
    attributes = {
        "subagent.type": subagent_type,
        "subagent.path": path or "unknown",
        "outcome": outcome,
    }
    _record(_subagent_invoke_duration(), duration_ms, attributes)
|
||||||
|
|
||||||
|
|
||||||
|
def record_subagent_invoke_outcome(
    *,
    subagent_type: str,
    path: str | None,
    outcome: str,
) -> None:
    """Add 1 to the subagent-invocation outcome counter.

    Args:
        subagent_type: Stored under ``subagent.type``.
        path: Stored under ``subagent.path``; falsy values become ``"unknown"``.
        outcome: Stored under ``outcome``.
    """
    attributes = {
        "subagent.type": subagent_type,
        "subagent.path": path or "unknown",
        "outcome": outcome,
    }
    _add(_subagent_invoke_outcome(), 1, attributes)
|
||||||
|
|
||||||
|
|
||||||
|
def record_etl_extract_duration(
    duration_s: float,
    *,
    etl_service: str | None,
    content_type: str | None,
    status: str,
) -> None:
    """Record one ETL-extraction duration sample.

    Args:
        duration_s: Value forwarded to the ETL extract duration instrument.
        etl_service: Stored under ``etl.service``; falsy values become
            ``"unknown"``.
        content_type: Stored under ``content.type``; falsy values become
            ``"unknown"``.
        status: Stored under ``status``.
    """
    attributes = {
        "etl.service": etl_service or "unknown",
        "content.type": content_type or "unknown",
        "status": status,
    }
    _record(_etl_extract_duration(), duration_s, attributes)
|
||||||
|
|
||||||
|
|
||||||
|
def record_etl_extract_outcome(
    *,
    etl_service: str | None,
    content_type: str | None,
    status: str,
    error_category: str | None = None,
) -> None:
    """Add 1 to the ETL-extraction outcome counter.

    Args:
        etl_service: Stored under ``etl.service``; falsy values become
            ``"unknown"``.
        content_type: Stored under ``content.type``; falsy values become
            ``"unknown"``.
        status: Stored under ``status``.
        error_category: Handed to ``_attrs_with_optional_error_category``
            together with the base attributes.
    """
    base_attrs = {
        "etl.service": etl_service or "unknown",
        "content.type": content_type or "unknown",
        "status": status,
    }
    attributes = _attrs_with_optional_error_category(base_attrs, error_category)
    _add(_etl_extract_outcome(), 1, attributes)
|
||||||
|
|
||||||
|
|
||||||
|
def record_celery_heartbeat_refresh(*, heartbeat_type: str) -> None:
    """Add 1 to the heartbeat-refresh counter, tagged with ``heartbeat.type``."""
    attributes = {"heartbeat.type": heartbeat_type}
    _add(_celery_heartbeat_refreshes(), 1, attributes)
|
||||||
|
|
||||||
|
|
||||||
|
def record_celery_heartbeat_failure(*, heartbeat_type: str) -> None:
    """Add 1 to the heartbeat-failure counter, tagged with ``heartbeat.type``."""
    attributes = {"heartbeat.type": heartbeat_type}
    _add(_celery_heartbeat_failures(), 1, attributes)
|
||||||
|
|
||||||
|
|
||||||
|
def record_celery_queue_latency(
    duration_s: float,
    *,
    task_name: str | None,
    queue: str | None,
    scheduled: bool,
    operation: str | None,
) -> None:
    """Record one Celery queue-latency sample.

    Args:
        duration_s: Value forwarded to the queue-latency histogram.
        task_name: Stored under ``task.name``; falsy values become ``"unknown"``.
        queue: Stored under ``task.queue``; falsy values become ``"unknown"``.
        scheduled: Coerced with ``bool()`` and stored under ``task.scheduled``.
        operation: Stored under ``operation``; falsy values become ``"unknown"``.
    """
    attributes = {
        "task.name": task_name or "unknown",
        "task.queue": queue or "unknown",
        "task.scheduled": bool(scheduled),
        "operation": operation or "unknown",
    }
    _record(_celery_queue_latency(), duration_s, attributes)
|
||||||
|
|
||||||
|
|
||||||
|
def _runtime_snapshot_value(key: str, transform: Any = None) -> list[Any]:
    """Read one field of ``system_snapshot()`` as a list of observations.

    Args:
        key: Field to look up in the snapshot mapping.
        transform: Optional callable applied to the raw value before it is
            wrapped in an ``Observation``.

    Returns:
        A one-element list holding the ``Observation``, or an empty list when
        the field is missing, not an ``int``/``float``, or negative.
    """
    # Imported lazily, at call time rather than at module import.
    from opentelemetry.metrics import Observation

    from app.utils.perf import system_snapshot

    reading = system_snapshot().get(key)
    if not isinstance(reading, int | float):
        return []
    if reading < 0:
        return []
    observed = reading if transform is None else transform(reading)
    return [Observation(observed)]
|
||||||
|
|
||||||
|
|
||||||
|
def _observe_gc_collections(_options: Any) -> list[Any]:
    """Observable-gauge callback reporting ``gc.get_count()`` per generation.

    Emits one ``Observation`` per entry of ``gc.get_count()``, tagged with the
    entry's index (as a string) under the ``generation`` attribute. The
    ``_options`` argument is unused.
    """
    from opentelemetry.metrics import Observation

    observations = []
    for generation, count in enumerate(gc.get_count()):
        observations.append(Observation(count, {"generation": str(generation)}))
    return observations
|
||||||
|
|
||||||
|
|
||||||
|
def register_runtime_observables() -> None:
    """Register process/runtime observable gauges once per process.

    Does nothing when the gauges were already registered or when
    ``_is_enabled()`` is false. If any registration raises, a warning is
    logged and ``_OBSERVABLES_REGISTERED`` stays unset, so a later call will
    attempt the registration again.
    """
    global _OBSERVABLES_REGISTERED
    if _OBSERVABLES_REGISTERED:
        return
    if not _is_enabled():
        return

    meter = _get_meter()

    def _snapshot_callback(key: str, transform: Any = None):
        # Bind key/transform per gauge; the snapshot is read lazily whenever
        # the callback is invoked.
        return lambda _options: _runtime_snapshot_value(key, transform)

    # (name, callback, unit, description), in registration order.
    # Each callback returns the value for a single gauge except GC, whose
    # callback carries a generation attribute.
    gauges = (
        (
            "process.runtime.cpython.memory.rss",
            # Snapshot field "rss_mb" is scaled by 1024 * 1024 for unit "By".
            _snapshot_callback("rss_mb", lambda v: float(v) * 1024 * 1024),
            "By",
            "Resident set size of the SurfSense backend process.",
        ),
        (
            "process.runtime.cpython.cpu.utilization",
            # Snapshot field "cpu_percent" is divided by 100 for unit "1".
            _snapshot_callback("cpu_percent", lambda v: float(v) / 100.0),
            "1",
            "CPU utilization of the SurfSense backend process.",
        ),
        (
            "process.runtime.cpython.threads",
            _snapshot_callback("threads"),
            "{thread}",
            "Thread count of the SurfSense backend process.",
        ),
        (
            "process.runtime.cpython.open_fds",
            _snapshot_callback("open_fds"),
            "{fd}",
            "Open file descriptor count of the SurfSense backend process.",
        ),
        (
            "python.asyncio.tasks",
            _snapshot_callback("asyncio_tasks"),
            "{task}",
            "Live asyncio task count in the current event loop.",
        ),
        (
            "process.runtime.cpython.gc.collections",
            _observe_gc_collections,
            "{collection}",
            "CPython GC counters by generation.",
        ),
    )

    try:
        for gauge_name, callback, unit, description in gauges:
            meter.create_observable_gauge(
                gauge_name,
                callbacks=[callback],
                unit=unit,
                description=description,
            )
    except Exception:
        logger.warning("Failed to register OTel runtime observables", exc_info=True)
        return

    _OBSERVABLES_REGISTERED = True
|
||||||
|
|
||||||
|
|
||||||
|
# Names exported by ``from <this module> import *``; entries are listed
# alphabetically.
__all__ = [
    "categorize_exception",
    "parse_celery_task_label",
    "record_auth_failure",
    "record_celery_heartbeat_failure",
    "record_celery_heartbeat_refresh",
    "record_celery_queue_latency",
    "record_chat_request_duration",
    "record_chat_request_outcome",
    "record_compaction_run",
    "record_connector_sync_duration",
    "record_connector_sync_outcome",
    "record_etl_extract_duration",
    "record_etl_extract_outcome",
    "record_indexing_document_duration",
    "record_indexing_document_outcome",
    "record_interrupt",
    "record_kb_search_duration",
    "record_model_call_duration",
    "record_model_token_usage",
    "record_perf_elapsed",
    "record_permission_ask",
    "record_rate_limit_rejection",
    "record_subagent_invoke_duration",
    "record_subagent_invoke_outcome",
    "record_tool_call_duration",
    "record_tool_call_error",
    "register_runtime_observables",
]
|
||||||
|
|
@ -66,6 +66,8 @@ def _resolve_enabled() -> bool:
|
||||||
# Honor an explicit kill-switch first.
|
# Honor an explicit kill-switch first.
|
||||||
if os.environ.get("SURFSENSE_DISABLE_OTEL", "").lower() in {"1", "true", "yes"}:
|
if os.environ.get("SURFSENSE_DISABLE_OTEL", "").lower() in {"1", "true", "yes"}:
|
||||||
return False
|
return False
|
||||||
|
if os.environ.get("OTEL_SDK_DISABLED", "").lower() in {"1", "true", "yes", "on"}:
|
||||||
|
return False
|
||||||
# Treat a configured endpoint as the canonical "OTel is wired up" signal.
|
# Treat a configured endpoint as the canonical "OTel is wired up" signal.
|
||||||
if os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT"):
|
if os.environ.get("OTEL_EXPORTER_OTLP_ENDPOINT"):
|
||||||
return True
|
return True
|
||||||
|
|
@ -90,6 +92,48 @@ def is_enabled() -> bool:
|
||||||
return _ENABLED
|
return _ENABLED
|
||||||
|
|
||||||
|
|
||||||
|
def _clean_event_attrs(attrs: dict[str, Any]) -> dict[str, str | int | float | bool]:
|
||||||
|
"""Coerce event attributes to OTel-safe scalar values."""
|
||||||
|
cleaned: dict[str, str | int | float | bool] = {}
|
||||||
|
for key, value in attrs.items():
|
||||||
|
if value is None:
|
||||||
|
continue
|
||||||
|
if isinstance(value, bool | int | float):
|
||||||
|
cleaned[key] = value
|
||||||
|
continue
|
||||||
|
text = str(value)
|
||||||
|
if text:
|
||||||
|
cleaned[key] = text
|
||||||
|
return cleaned
|
||||||
|
|
||||||
|
|
||||||
|
def add_event(name: str, attributes: dict[str, Any] | None = None) -> None:
    """Attach an event to the current active span.

    Returns without doing anything when OTel is disabled, when the trace API
    is unavailable, or when the current span is missing or not recording. Any
    exception raised while adding the event is suppressed.

    Args:
        name: Event name.
        attributes: Optional event attributes; when non-empty they are
            sanitized with ``_clean_event_attrs`` first.
    """
    if not _ENABLED or _ot_trace is None:
        return
    with contextlib.suppress(Exception):
        current = _ot_trace.get_current_span()
        if current is not None and current.is_recording():
            payload = _clean_event_attrs(attributes) if attributes else None
            current.add_event(name, attributes=payload)
|
||||||
|
|
||||||
|
|
||||||
|
def record_error(span_obj: Any, exc: BaseException) -> None:
    """Record an exception and mark a span as errored without re-raising.

    Does nothing when OTel is disabled. Any exception raised while updating
    the span is suppressed.

    Args:
        span_obj: Span to annotate.
        exc: Exception to record; ``str(exc)`` becomes the status description.
    """
    if _ENABLED:
        with contextlib.suppress(Exception):
            span_obj.record_exception(exc)
            error_status = _OtStatus(_OtStatusCode.ERROR, str(exc))
            span_obj.set_status(error_status)
|
||||||
|
|
||||||
|
|
||||||
def _get_tracer():
|
def _get_tracer():
|
||||||
if not _OTEL_AVAILABLE:
|
if not _OTEL_AVAILABLE:
|
||||||
return None
|
return None
|
||||||
|
|
@ -198,8 +242,11 @@ def model_call_span(
|
||||||
attrs: dict[str, Any] = {}
|
attrs: dict[str, Any] = {}
|
||||||
if model_id:
|
if model_id:
|
||||||
attrs["model.id"] = model_id
|
attrs["model.id"] = model_id
|
||||||
|
attrs["gen_ai.request.model"] = model_id
|
||||||
if provider:
|
if provider:
|
||||||
attrs["model.provider"] = provider
|
attrs["model.provider"] = provider
|
||||||
|
attrs["gen_ai.provider.name"] = provider
|
||||||
|
attrs["gen_ai.operation.name"] = "chat"
|
||||||
if extra:
|
if extra:
|
||||||
attrs.update(extra)
|
attrs.update(extra)
|
||||||
return span("model.call", attributes=attrs)
|
return span("model.call", attributes=attrs)
|
||||||
|
|
@ -239,6 +286,152 @@ def kb_persist_span(
|
||||||
return span("kb.persist", attributes=attrs)
|
return span("kb.persist", attributes=attrs)
|
||||||
|
|
||||||
|
|
||||||
|
def chat_request_span(
    *,
    chat_id: int | None = None,
    search_space_id: int | None = None,
    flow: str | None = None,
    request_id: str | None = None,
    turn_id: str | None = None,
    filesystem_mode: str | None = None,
    client_platform: str | None = None,
    agent_mode: str | None = None,
    extra: dict[str, Any] | None = None,
):
    """Parent span for a single streamed chat or resume turn.

    Numeric ids are attached when not ``None`` (coerced with ``int()``); the
    string fields are attached only when truthy. Entries in ``extra`` are
    merged last and therefore override same-named attributes.

    Returns:
        Whatever ``span("chat.request", ...)`` returns.
    """
    attrs: dict[str, Any] = {}
    if chat_id is not None:
        attrs["chat.id"] = int(chat_id)
    if search_space_id is not None:
        attrs["search_space.id"] = int(search_space_id)

    optional_text = (
        ("chat.flow", flow),
        ("request.id", request_id),
        ("turn.id", turn_id),
        ("filesystem.mode", filesystem_mode),
        ("client.platform", client_platform),
        ("agent.mode", agent_mode),
    )
    for attr_key, attr_value in optional_text:
        if attr_value:
            attrs[attr_key] = attr_value

    if extra:
        attrs.update(extra)
    return span("chat.request", attributes=attrs)
|
||||||
|
|
||||||
|
|
||||||
|
def subagent_invoke_span(
    *,
    subagent_type: str,
    path: str | None = None,
    extra: dict[str, Any] | None = None,
):
    """Span around invoking a delegated subagent from the main agent.

    ``subagent.type`` is always set; ``subagent.path`` only when ``path`` is
    truthy. Entries in ``extra`` are merged last.
    """
    attrs: dict[str, Any] = {"subagent.type": subagent_type}
    for attr_key, attr_value in (("subagent.path", path),):
        if attr_value:
            attrs[attr_key] = attr_value
    if extra:
        attrs.update(extra)
    return span("subagent.invoke", attributes=attrs)
|
||||||
|
|
||||||
|
|
||||||
|
def connector_sync_span(
    *,
    connector_type: str | None,
    extra: dict[str, Any] | None = None,
):
    """Business-level span around connector indexing task execution.

    ``connector.type`` falls back to ``"unknown"`` for falsy input. Entries in
    ``extra`` are merged last.
    """
    attrs: dict[str, Any] = {}
    attrs["connector.type"] = connector_type or "unknown"
    if extra:
        attrs.update(extra)
    return span("connector.sync", attributes=attrs)
|
||||||
|
|
||||||
|
|
||||||
|
def etl_extract_span(
    *,
    content_type: str | None = None,
    file_extension: str | None = None,
    processing_mode: str | None = None,
    extra: dict[str, Any] | None = None,
):
    """Span around top-level ETL extraction for a file.

    Each of ``content.type``, ``file.extension`` and ``processing.mode`` is
    attached only when its argument is truthy. Entries in ``extra`` are merged
    last.
    """
    attrs: dict[str, Any] = {}
    optional_text = (
        ("content.type", content_type),
        ("file.extension", file_extension),
        ("processing.mode", processing_mode),
    )
    for attr_key, attr_value in optional_text:
        if attr_value:
            attrs[attr_key] = attr_value
    if extra:
        attrs.update(extra)
    return span("etl.extract", attributes=attrs)
|
||||||
|
|
||||||
|
|
||||||
|
def etl_parse_span(
    *,
    etl_service: str | None,
    content_type: str | None = None,
    file_extension: str | None = None,
    processing_mode: str | None = None,
    extra: dict[str, Any] | None = None,
):
    """Span around a concrete ETL parser/backend call.

    ``etl.service`` is always set (``"unknown"`` for falsy input); the other
    attributes are attached only when truthy. Entries in ``extra`` are merged
    last.
    """
    attrs: dict[str, Any] = {"etl.service": etl_service or "unknown"}
    optional_text = (
        ("content.type", content_type),
        ("file.extension", file_extension),
        ("processing.mode", processing_mode),
    )
    for attr_key, attr_value in optional_text:
        if attr_value:
            attrs[attr_key] = attr_value
    if extra:
        attrs.update(extra)
    return span("etl.parse", attributes=attrs)
|
||||||
|
|
||||||
|
|
||||||
|
def etl_ocr_span(
    *,
    etl_service: str | None,
    file_extension: str | None = None,
    extra: dict[str, Any] | None = None,
):
    """Span around OCR extraction from image content.

    ``etl.service`` is always set (``"unknown"`` for falsy input);
    ``file.extension`` only when truthy. Entries in ``extra`` are merged last.
    """
    attrs: dict[str, Any] = {}
    attrs["etl.service"] = etl_service or "unknown"
    if file_extension:
        attrs["file.extension"] = file_extension
    if extra:
        attrs.update(extra)
    return span("etl.ocr", attributes=attrs)
|
||||||
|
|
||||||
|
|
||||||
|
def etl_picture_describe_span(
    *,
    image_count: int | None = None,
    extra: dict[str, Any] | None = None,
):
    """Span around describing embedded images in a document.

    ``image.count`` is attached (coerced with ``int()``) whenever
    ``image_count`` is not ``None``, so a count of 0 is still reported.
    Entries in ``extra`` are merged last.
    """
    attrs: dict[str, Any] = (
        {} if image_count is None else {"image.count": int(image_count)}
    )
    if extra:
        attrs.update(extra)
    return span("etl.picture.describe", attributes=attrs)
|
||||||
|
|
||||||
|
|
||||||
|
def etl_picture_ocr_span(
    *,
    file_extension: str | None = None,
    extra: dict[str, Any] | None = None,
):
    """Span around per-image OCR during picture description.

    ``file.extension`` is attached only when truthy. Entries in ``extra`` are
    merged last.
    """
    attrs: dict[str, Any] = (
        {"file.extension": file_extension} if file_extension else {}
    )
    if extra:
        attrs.update(extra)
    return span("etl.picture.ocr", attributes=attrs)
|
||||||
|
|
||||||
|
|
||||||
def compaction_span(
|
def compaction_span(
|
||||||
*,
|
*,
|
||||||
reason: str | None = None,
|
reason: str | None = None,
|
||||||
|
|
@ -301,14 +494,24 @@ def reload_for_tests() -> bool:
|
||||||
|
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
|
"add_event",
|
||||||
|
"chat_request_span",
|
||||||
"compaction_span",
|
"compaction_span",
|
||||||
|
"connector_sync_span",
|
||||||
|
"etl_extract_span",
|
||||||
|
"etl_ocr_span",
|
||||||
|
"etl_parse_span",
|
||||||
|
"etl_picture_describe_span",
|
||||||
|
"etl_picture_ocr_span",
|
||||||
"interrupt_span",
|
"interrupt_span",
|
||||||
"is_enabled",
|
"is_enabled",
|
||||||
"kb_persist_span",
|
"kb_persist_span",
|
||||||
"kb_search_span",
|
"kb_search_span",
|
||||||
"model_call_span",
|
"model_call_span",
|
||||||
"permission_asked_span",
|
"permission_asked_span",
|
||||||
|
"record_error",
|
||||||
"reload_for_tests",
|
"reload_for_tests",
|
||||||
"span",
|
"span",
|
||||||
|
"subagent_invoke_span",
|
||||||
"tool_call_span",
|
"tool_call_span",
|
||||||
]
|
]
|
||||||
|
|
|
||||||
|
|
@ -1,13 +1,51 @@
|
||||||
import asyncio
|
import asyncio
|
||||||
import contextlib
|
import contextlib
|
||||||
|
import functools
|
||||||
import time
|
import time
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
|
from app.observability import metrics as ot_metrics, otel as ot
|
||||||
from app.utils.perf import get_perf_logger
|
from app.utils.perf import get_perf_logger
|
||||||
|
|
||||||
_MAX_FETCH_CHUNKS_PER_DOC = 20
|
_MAX_FETCH_CHUNKS_PER_DOC = 20
|
||||||
|
|
||||||
|
|
||||||
|
def _instrument_search(mode: str):
    """Build a decorator that traces and times a chunk-retriever search method.

    The decorated coroutine runs inside ``ot.kb_search_span`` (tagged with
    ``search.surface="chunks"`` and ``search.mode=mode``). Its wall-clock
    duration in milliseconds is reported through
    ``ot_metrics.record_kb_search_duration`` both on success and when the
    search raises an ``Exception``, which is then re-raised unchanged.

    Args:
        mode: Value stored under the ``search.mode`` span attribute.

    Returns:
        A decorator for async methods with the signature
        ``(self, query_text, top_k, search_space_id, *args, **kwargs)``.
    """

    def _decorator(func):
        @functools.wraps(func)
        async def _wrapper(
            self, query_text: str, top_k: int, search_space_id: int, *args, **kwargs
        ):
            t0 = time.perf_counter()

            def _record_duration() -> None:
                # Single place for the metric so both paths stay in sync.
                ot_metrics.record_kb_search_duration(
                    (time.perf_counter() - t0) * 1000,
                    search_space_id=search_space_id,
                    surface="chunks",
                )

            with ot.kb_search_span(
                search_space_id=search_space_id,
                query_chars=len(query_text),
                extra={"search.surface": "chunks", "search.mode": mode},
            ) as sp:
                try:
                    result = await func(
                        self, query_text, top_k, search_space_id, *args, **kwargs
                    )
                except Exception:
                    _record_duration()
                    raise
                # The search already succeeded: a telemetry problem (span
                # without set_attribute, unsized result) must not fail it or
                # skip the duration metric below.
                with contextlib.suppress(Exception):
                    sp.set_attribute("result.count", len(result))
                _record_duration()
                return result

        return _wrapper

    return _decorator
|
||||||
|
|
||||||
|
|
||||||
class ChucksHybridSearchRetriever:
|
class ChucksHybridSearchRetriever:
|
||||||
def __init__(self, db_session):
|
def __init__(self, db_session):
|
||||||
"""
|
"""
|
||||||
|
|
@ -18,6 +56,7 @@ class ChucksHybridSearchRetriever:
|
||||||
"""
|
"""
|
||||||
self.db_session = db_session
|
self.db_session = db_session
|
||||||
|
|
||||||
|
@_instrument_search("vector")
|
||||||
async def vector_search(
|
async def vector_search(
|
||||||
self,
|
self,
|
||||||
query_text: str,
|
query_text: str,
|
||||||
|
|
@ -88,6 +127,7 @@ class ChucksHybridSearchRetriever:
|
||||||
|
|
||||||
return chunks
|
return chunks
|
||||||
|
|
||||||
|
@_instrument_search("full_text")
|
||||||
async def full_text_search(
|
async def full_text_search(
|
||||||
self,
|
self,
|
||||||
query_text: str,
|
query_text: str,
|
||||||
|
|
@ -153,6 +193,7 @@ class ChucksHybridSearchRetriever:
|
||||||
|
|
||||||
return chunks
|
return chunks
|
||||||
|
|
||||||
|
@_instrument_search("hybrid")
|
||||||
async def hybrid_search(
|
async def hybrid_search(
|
||||||
self,
|
self,
|
||||||
query_text: str,
|
query_text: str,
|
||||||
|
|
|
||||||
|
|
@ -1,12 +1,50 @@
|
||||||
import contextlib
|
import contextlib
|
||||||
|
import functools
|
||||||
import time
|
import time
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
|
from app.observability import metrics as ot_metrics, otel as ot
|
||||||
from app.utils.perf import get_perf_logger
|
from app.utils.perf import get_perf_logger
|
||||||
|
|
||||||
_MAX_FETCH_CHUNKS_PER_DOC = 20
|
_MAX_FETCH_CHUNKS_PER_DOC = 20
|
||||||
|
|
||||||
|
|
||||||
|
def _instrument_search(mode: str):
    """Build a decorator that traces and times a document-retriever search method.

    The decorated coroutine runs inside ``ot.kb_search_span`` (tagged with
    ``search.surface="documents"`` and ``search.mode=mode``). Its wall-clock
    duration in milliseconds is reported through
    ``ot_metrics.record_kb_search_duration`` both on success and when the
    search raises an ``Exception``, which is then re-raised unchanged.

    Args:
        mode: Value stored under the ``search.mode`` span attribute.

    Returns:
        A decorator for async methods with the signature
        ``(self, query_text, top_k, search_space_id, *args, **kwargs)``.
    """

    def _decorator(func):
        @functools.wraps(func)
        async def _wrapper(
            self, query_text: str, top_k: int, search_space_id: int, *args, **kwargs
        ):
            t0 = time.perf_counter()

            def _record_duration() -> None:
                # Single place for the metric so both paths stay in sync.
                ot_metrics.record_kb_search_duration(
                    (time.perf_counter() - t0) * 1000,
                    search_space_id=search_space_id,
                    surface="documents",
                )

            with ot.kb_search_span(
                search_space_id=search_space_id,
                query_chars=len(query_text),
                extra={"search.surface": "documents", "search.mode": mode},
            ) as sp:
                try:
                    result = await func(
                        self, query_text, top_k, search_space_id, *args, **kwargs
                    )
                except Exception:
                    _record_duration()
                    raise
                # The search already succeeded: a telemetry problem (span
                # without set_attribute, unsized result) must not fail it or
                # skip the duration metric below.
                with contextlib.suppress(Exception):
                    sp.set_attribute("result.count", len(result))
                _record_duration()
                return result

        return _wrapper

    return _decorator
|
||||||
|
|
||||||
|
|
||||||
class DocumentHybridSearchRetriever:
|
class DocumentHybridSearchRetriever:
|
||||||
def __init__(self, db_session):
|
def __init__(self, db_session):
|
||||||
"""
|
"""
|
||||||
|
|
@ -17,6 +55,7 @@ class DocumentHybridSearchRetriever:
|
||||||
"""
|
"""
|
||||||
self.db_session = db_session
|
self.db_session = db_session
|
||||||
|
|
||||||
|
@_instrument_search("vector")
|
||||||
async def vector_search(
|
async def vector_search(
|
||||||
self,
|
self,
|
||||||
query_text: str,
|
query_text: str,
|
||||||
|
|
@ -81,6 +120,7 @@ class DocumentHybridSearchRetriever:
|
||||||
|
|
||||||
return documents
|
return documents
|
||||||
|
|
||||||
|
@_instrument_search("full_text")
|
||||||
async def full_text_search(
|
async def full_text_search(
|
||||||
self,
|
self,
|
||||||
query_text: str,
|
query_text: str,
|
||||||
|
|
@ -145,6 +185,7 @@ class DocumentHybridSearchRetriever:
|
||||||
|
|
||||||
return documents
|
return documents
|
||||||
|
|
||||||
|
@_instrument_search("hybrid")
|
||||||
async def hybrid_search(
|
async def hybrid_search(
|
||||||
self,
|
self,
|
||||||
query_text: str,
|
query_text: str,
|
||||||
|
|
|
||||||
|
|
@ -43,6 +43,7 @@ from app.db import (
|
||||||
async_session_maker,
|
async_session_maker,
|
||||||
get_async_session,
|
get_async_session,
|
||||||
)
|
)
|
||||||
|
from app.observability import metrics as ot_metrics, otel as ot
|
||||||
from app.schemas import (
|
from app.schemas import (
|
||||||
GoogleDriveIndexRequest,
|
GoogleDriveIndexRequest,
|
||||||
MCPConnectorCreate,
|
MCPConnectorCreate,
|
||||||
|
|
@ -104,7 +105,9 @@ async def _run_indexing_heartbeat_loop(notification_id: int) -> None:
|
||||||
await asyncio.sleep(HEARTBEAT_REFRESH_INTERVAL)
|
await asyncio.sleep(HEARTBEAT_REFRESH_INTERVAL)
|
||||||
try:
|
try:
|
||||||
get_heartbeat_redis_client().setex(key, HEARTBEAT_TTL_SECONDS, "alive")
|
get_heartbeat_redis_client().setex(key, HEARTBEAT_TTL_SECONDS, "alive")
|
||||||
|
ot_metrics.record_celery_heartbeat_refresh(heartbeat_type="connector")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
ot_metrics.record_celery_heartbeat_failure(heartbeat_type="connector")
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"Failed to refresh Redis heartbeat for notification "
|
f"Failed to refresh Redis heartbeat for notification "
|
||||||
f"{notification_id}: {e}"
|
f"{notification_id}: {e}"
|
||||||
|
|
@ -1243,6 +1246,12 @@ async def _persist_auth_expired(session: AsyncSession, connector_id: int) -> Non
|
||||||
"""Flag a connector as auth_expired so the frontend shows a re-auth prompt."""
|
"""Flag a connector as auth_expired so the frontend shows a re-auth prompt."""
|
||||||
from sqlalchemy.orm.attributes import flag_modified
|
from sqlalchemy.orm.attributes import flag_modified
|
||||||
|
|
||||||
|
ot.add_event(
|
||||||
|
"connector.auth.expired",
|
||||||
|
{
|
||||||
|
"error.category": "auth_failed",
|
||||||
|
},
|
||||||
|
)
|
||||||
try:
|
try:
|
||||||
result = await session.execute(
|
result = await session.execute(
|
||||||
select(SearchSourceConnector).where(
|
select(SearchSourceConnector).where(
|
||||||
|
|
@ -1302,6 +1311,13 @@ async def _run_indexing_with_notifications(
|
||||||
try:
|
try:
|
||||||
connector_lock_acquired = acquire_connector_indexing_lock(connector_id)
|
connector_lock_acquired = acquire_connector_indexing_lock(connector_id)
|
||||||
if not connector_lock_acquired:
|
if not connector_lock_acquired:
|
||||||
|
ot.add_event(
|
||||||
|
"connector.sync.skipped",
|
||||||
|
{
|
||||||
|
"skip.reason": "lock_contention",
|
||||||
|
"error.category": "lock_contention",
|
||||||
|
},
|
||||||
|
)
|
||||||
logger.info(
|
logger.info(
|
||||||
f"Skipping indexing for connector {connector_id} "
|
f"Skipping indexing for connector {connector_id} "
|
||||||
"(another worker already holds Redis connector lock)"
|
"(another worker already holds Redis connector lock)"
|
||||||
|
|
@ -1338,7 +1354,13 @@ async def _run_indexing_with_notifications(
|
||||||
get_heartbeat_redis_client().setex(
|
get_heartbeat_redis_client().setex(
|
||||||
heartbeat_key, HEARTBEAT_TTL_SECONDS, "0"
|
heartbeat_key, HEARTBEAT_TTL_SECONDS, "0"
|
||||||
)
|
)
|
||||||
|
ot_metrics.record_celery_heartbeat_refresh(
|
||||||
|
heartbeat_type="connector"
|
||||||
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
ot_metrics.record_celery_heartbeat_failure(
|
||||||
|
heartbeat_type="connector"
|
||||||
|
)
|
||||||
logger.warning(f"Failed to set initial Redis heartbeat: {e}")
|
logger.warning(f"Failed to set initial Redis heartbeat: {e}")
|
||||||
|
|
||||||
# Start a background coroutine that refreshes the
|
# Start a background coroutine that refreshes the
|
||||||
|
|
@ -1366,6 +1388,15 @@ async def _run_indexing_with_notifications(
|
||||||
) -> None:
|
) -> None:
|
||||||
"""Callback to update notification during API retries (rate limits, etc.)"""
|
"""Callback to update notification during API retries (rate limits, etc.)"""
|
||||||
nonlocal notification
|
nonlocal notification
|
||||||
|
ot.add_event(
|
||||||
|
"connector.retry.scheduled",
|
||||||
|
{
|
||||||
|
"retry.reason": retry_reason,
|
||||||
|
"retry.attempt": attempt,
|
||||||
|
"retry.max": max_attempts,
|
||||||
|
"retry.delay_ms": int(wait_seconds * 1000),
|
||||||
|
},
|
||||||
|
)
|
||||||
if notification:
|
if notification:
|
||||||
try:
|
try:
|
||||||
await session.refresh(notification)
|
await session.refresh(notification)
|
||||||
|
|
@ -1397,8 +1428,14 @@ async def _run_indexing_with_notifications(
|
||||||
get_heartbeat_redis_client().setex(
|
get_heartbeat_redis_client().setex(
|
||||||
heartbeat_key, HEARTBEAT_TTL_SECONDS, str(indexed_count)
|
heartbeat_key, HEARTBEAT_TTL_SECONDS, str(indexed_count)
|
||||||
)
|
)
|
||||||
|
ot_metrics.record_celery_heartbeat_refresh(
|
||||||
|
heartbeat_type="connector"
|
||||||
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# Don't let Redis errors break the indexing
|
# Don't let Redis errors break the indexing
|
||||||
|
ot_metrics.record_celery_heartbeat_failure(
|
||||||
|
heartbeat_type="connector"
|
||||||
|
)
|
||||||
logger.warning(f"Failed to set Redis heartbeat: {e}")
|
logger.warning(f"Failed to set Redis heartbeat: {e}")
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|
|
||||||
|
|
@ -37,6 +37,10 @@ def get_celery_session_maker() -> async_sessionmaker:
|
||||||
poolclass=NullPool,
|
poolclass=NullPool,
|
||||||
echo=False,
|
echo=False,
|
||||||
)
|
)
|
||||||
|
with contextlib.suppress(Exception):
|
||||||
|
from app.observability.bootstrap import instrument_sqlalchemy_engine
|
||||||
|
|
||||||
|
instrument_sqlalchemy_engine(_celery_engine)
|
||||||
_celery_session_maker = async_sessionmaker(
|
_celery_session_maker = async_sessionmaker(
|
||||||
_celery_engine, expire_on_commit=False
|
_celery_engine, expire_on_commit=False
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -1,14 +1,52 @@
|
||||||
"""Celery tasks for connector indexing."""
|
"""Celery tasks for connector indexing."""
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
|
import time
|
||||||
import traceback
|
import traceback
|
||||||
|
from collections.abc import Awaitable, Callable
|
||||||
|
|
||||||
|
from celery import current_task
|
||||||
|
|
||||||
from app.celery_app import celery_app
|
from app.celery_app import celery_app
|
||||||
from app.tasks.celery_tasks import get_celery_session_maker, run_async_celery_task
|
from app.observability import metrics as ot_metrics, otel as ot
|
||||||
|
from app.tasks.celery_tasks import (
|
||||||
|
get_celery_session_maker,
|
||||||
|
run_async_celery_task as _run_async_celery_task,
|
||||||
|
)
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def run_async_celery_task[T](coro_factory: Callable[[], Awaitable[T]]) -> T:
|
||||||
|
"""Run connector sync work and record aggregate connector metrics."""
|
||||||
|
task_name = getattr(current_task, "name", None) or "unknown"
|
||||||
|
t0 = time.perf_counter()
|
||||||
|
status = "failed"
|
||||||
|
error_category: str | None = None
|
||||||
|
try:
|
||||||
|
with ot.connector_sync_span(connector_type=task_name) as sp:
|
||||||
|
try:
|
||||||
|
result = _run_async_celery_task(coro_factory)
|
||||||
|
sp.set_attribute("connector.status", "success")
|
||||||
|
except Exception as exc:
|
||||||
|
error_category = ot_metrics.categorize_exception(exc)
|
||||||
|
sp.set_attribute("connector.error.category", error_category)
|
||||||
|
raise
|
||||||
|
status = "success"
|
||||||
|
return result
|
||||||
|
finally:
|
||||||
|
elapsed_s = time.perf_counter() - t0
|
||||||
|
ot_metrics.record_connector_sync_duration(
|
||||||
|
elapsed_s,
|
||||||
|
connector_type=task_name,
|
||||||
|
)
|
||||||
|
ot_metrics.record_connector_sync_outcome(
|
||||||
|
connector_type=task_name,
|
||||||
|
status=status,
|
||||||
|
error_category=error_category,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def _handle_greenlet_error(e: Exception, task_name: str, connector_id: int) -> None:
|
def _handle_greenlet_error(e: Exception, task_name: str, connector_id: int) -> None:
|
||||||
"""
|
"""
|
||||||
Handle greenlet_spawn errors with detailed logging for debugging.
|
Handle greenlet_spawn errors with detailed logging for debugging.
|
||||||
|
|
|
||||||
|
|
@ -9,6 +9,7 @@ from uuid import UUID
|
||||||
|
|
||||||
from app.celery_app import celery_app
|
from app.celery_app import celery_app
|
||||||
from app.config import config
|
from app.config import config
|
||||||
|
from app.observability import metrics as ot_metrics
|
||||||
from app.services.notification_service import NotificationService
|
from app.services.notification_service import NotificationService
|
||||||
from app.services.task_logging_service import TaskLoggingService
|
from app.services.task_logging_service import TaskLoggingService
|
||||||
from app.tasks.celery_tasks import get_celery_session_maker, run_async_celery_task
|
from app.tasks.celery_tasks import get_celery_session_maker, run_async_celery_task
|
||||||
|
|
@ -59,7 +60,9 @@ def _start_heartbeat(notification_id: int) -> None:
|
||||||
try:
|
try:
|
||||||
key = _get_heartbeat_key(notification_id)
|
key = _get_heartbeat_key(notification_id)
|
||||||
_get_doc_heartbeat_redis().setex(key, HEARTBEAT_TTL_SECONDS, "started")
|
_get_doc_heartbeat_redis().setex(key, HEARTBEAT_TTL_SECONDS, "started")
|
||||||
|
ot_metrics.record_celery_heartbeat_refresh(heartbeat_type="document")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
ot_metrics.record_celery_heartbeat_failure(heartbeat_type="document")
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"Failed to set initial heartbeat for notification {notification_id}: {e}"
|
f"Failed to set initial heartbeat for notification {notification_id}: {e}"
|
||||||
)
|
)
|
||||||
|
|
@ -87,7 +90,9 @@ async def _run_heartbeat_loop(notification_id: int):
|
||||||
await asyncio.sleep(HEARTBEAT_REFRESH_INTERVAL)
|
await asyncio.sleep(HEARTBEAT_REFRESH_INTERVAL)
|
||||||
try:
|
try:
|
||||||
_get_doc_heartbeat_redis().setex(key, HEARTBEAT_TTL_SECONDS, "alive")
|
_get_doc_heartbeat_redis().setex(key, HEARTBEAT_TTL_SECONDS, "alive")
|
||||||
|
ot_metrics.record_celery_heartbeat_refresh(heartbeat_type="document")
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
ot_metrics.record_celery_heartbeat_failure(heartbeat_type="document")
|
||||||
logger.warning(
|
logger.warning(
|
||||||
f"Failed to refresh heartbeat for notification {notification_id}: {e}"
|
f"Failed to refresh heartbeat for notification {notification_id}: {e}"
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -14,6 +14,7 @@ import contextlib
|
||||||
import gc
|
import gc
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
|
import sys
|
||||||
import time
|
import time
|
||||||
from collections.abc import AsyncGenerator
|
from collections.abc import AsyncGenerator
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass, field
|
||||||
|
|
@ -58,6 +59,7 @@ from app.db import (
|
||||||
async_session_maker,
|
async_session_maker,
|
||||||
shielded_async_session,
|
shielded_async_session,
|
||||||
)
|
)
|
||||||
|
from app.observability import metrics as ot_metrics, otel as ot
|
||||||
from app.prompts import TITLE_GENERATION_PROMPT
|
from app.prompts import TITLE_GENERATION_PROMPT
|
||||||
from app.services.auto_model_pin_service import (
|
from app.services.auto_model_pin_service import (
|
||||||
mark_runtime_cooldown,
|
mark_runtime_cooldown,
|
||||||
|
|
@ -883,6 +885,20 @@ async def stream_new_chat(
|
||||||
stream_result.turn_id = f"{chat_id}:{int(time.time() * 1000)}"
|
stream_result.turn_id = f"{chat_id}:{int(time.time() * 1000)}"
|
||||||
stream_result.filesystem_mode = fs_mode
|
stream_result.filesystem_mode = fs_mode
|
||||||
stream_result.client_platform = fs_platform
|
stream_result.client_platform = fs_platform
|
||||||
|
chat_agent_mode = "unknown"
|
||||||
|
chat_outcome = "success"
|
||||||
|
chat_error_category: str | None = None
|
||||||
|
chat_span_cm = ot.chat_request_span(
|
||||||
|
chat_id=chat_id,
|
||||||
|
search_space_id=search_space_id,
|
||||||
|
flow=flow,
|
||||||
|
request_id=request_id,
|
||||||
|
turn_id=stream_result.turn_id,
|
||||||
|
filesystem_mode=fs_mode,
|
||||||
|
client_platform=fs_platform,
|
||||||
|
agent_mode=chat_agent_mode,
|
||||||
|
)
|
||||||
|
chat_span = chat_span_cm.__enter__()
|
||||||
_log_file_contract("turn_start", stream_result)
|
_log_file_contract("turn_start", stream_result)
|
||||||
_perf_log.info(
|
_perf_log.info(
|
||||||
"[stream_new_chat] filesystem_mode=%s client_platform=%s",
|
"[stream_new_chat] filesystem_mode=%s client_platform=%s",
|
||||||
|
|
@ -971,6 +987,14 @@ async def stream_new_chat(
|
||||||
requires_image_input=_requires_image_input,
|
requires_image_input=_requires_image_input,
|
||||||
)
|
)
|
||||||
).resolved_llm_config_id
|
).resolved_llm_config_id
|
||||||
|
ot.add_event(
|
||||||
|
"model.pin.resolved",
|
||||||
|
{
|
||||||
|
"pin.requested_id": requested_llm_config_id,
|
||||||
|
"pin.resolved_id": llm_config_id,
|
||||||
|
"pin.requires_image_input": _requires_image_input,
|
||||||
|
},
|
||||||
|
)
|
||||||
except ValueError as pin_error:
|
except ValueError as pin_error:
|
||||||
# Auto-pin's "no vision-capable cfg" path raises a ValueError
|
# Auto-pin's "no vision-capable cfg" path raises a ValueError
|
||||||
# whose message we map to the friendly image-input SSE error
|
# whose message we map to the friendly image-input SSE error
|
||||||
|
|
@ -987,6 +1011,13 @@ async def stream_new_chat(
|
||||||
if error_code == "MODEL_DOES_NOT_SUPPORT_IMAGE_INPUT"
|
if error_code == "MODEL_DOES_NOT_SUPPORT_IMAGE_INPUT"
|
||||||
else "server_error"
|
else "server_error"
|
||||||
)
|
)
|
||||||
|
if error_code == "MODEL_DOES_NOT_SUPPORT_IMAGE_INPUT":
|
||||||
|
ot.add_event(
|
||||||
|
"quota.denied",
|
||||||
|
{
|
||||||
|
"quota.code": error_code,
|
||||||
|
},
|
||||||
|
)
|
||||||
yield _emit_stream_error(
|
yield _emit_stream_error(
|
||||||
message=str(pin_error),
|
message=str(pin_error),
|
||||||
error_kind=error_kind,
|
error_kind=error_kind,
|
||||||
|
|
@ -1041,6 +1072,12 @@ async def stream_new_chat(
|
||||||
model_label = (
|
model_label = (
|
||||||
agent_config.config_name or agent_config.model_name or "model"
|
agent_config.config_name or agent_config.model_name or "model"
|
||||||
)
|
)
|
||||||
|
ot.add_event(
|
||||||
|
"quota.denied",
|
||||||
|
{
|
||||||
|
"quota.code": "MODEL_DOES_NOT_SUPPORT_IMAGE_INPUT",
|
||||||
|
},
|
||||||
|
)
|
||||||
yield _emit_stream_error(
|
yield _emit_stream_error(
|
||||||
message=(
|
message=(
|
||||||
f"The selected model ({model_label}) does not support "
|
f"The selected model ({model_label}) does not support "
|
||||||
|
|
@ -1084,6 +1121,12 @@ async def stream_new_chat(
|
||||||
)
|
)
|
||||||
_premium_reserved_micros = reserve_amount_micros
|
_premium_reserved_micros = reserve_amount_micros
|
||||||
if not quota_result.allowed:
|
if not quota_result.allowed:
|
||||||
|
ot.add_event(
|
||||||
|
"quota.denied",
|
||||||
|
{
|
||||||
|
"quota.code": "PREMIUM_QUOTA_EXHAUSTED",
|
||||||
|
},
|
||||||
|
)
|
||||||
if requested_llm_config_id == 0:
|
if requested_llm_config_id == 0:
|
||||||
try:
|
try:
|
||||||
llm_config_id = (
|
llm_config_id = (
|
||||||
|
|
@ -1097,6 +1140,13 @@ async def stream_new_chat(
|
||||||
requires_image_input=_requires_image_input,
|
requires_image_input=_requires_image_input,
|
||||||
)
|
)
|
||||||
).resolved_llm_config_id
|
).resolved_llm_config_id
|
||||||
|
ot.add_event(
|
||||||
|
"model.repin",
|
||||||
|
{
|
||||||
|
"repin.reason": "premium_quota_exhausted",
|
||||||
|
"repin.to_config_id": llm_config_id,
|
||||||
|
},
|
||||||
|
)
|
||||||
except ValueError as pin_error:
|
except ValueError as pin_error:
|
||||||
yield _emit_stream_error(
|
yield _emit_stream_error(
|
||||||
message=str(pin_error),
|
message=str(pin_error),
|
||||||
|
|
@ -1189,6 +1239,9 @@ async def stream_new_chat(
|
||||||
from app.config import config as _app_config
|
from app.config import config as _app_config
|
||||||
|
|
||||||
use_multi_agent = bool(_app_config.MULTI_AGENT_CHAT_ENABLED)
|
use_multi_agent = bool(_app_config.MULTI_AGENT_CHAT_ENABLED)
|
||||||
|
chat_agent_mode = "multi" if use_multi_agent else "single"
|
||||||
|
with contextlib.suppress(Exception):
|
||||||
|
chat_span.set_attribute("agent.mode", chat_agent_mode)
|
||||||
|
|
||||||
_t0 = time.perf_counter()
|
_t0 = time.perf_counter()
|
||||||
agent_factory = (
|
agent_factory = (
|
||||||
|
|
@ -1863,6 +1916,14 @@ async def stream_new_chat(
|
||||||
llm_config_id,
|
llm_config_id,
|
||||||
time.perf_counter() - _t0,
|
time.perf_counter() - _t0,
|
||||||
)
|
)
|
||||||
|
ot.add_event(
|
||||||
|
"chat.rate_limit.recovered",
|
||||||
|
{
|
||||||
|
"recovery.reason": "provider_rate_limited",
|
||||||
|
"recovery.previous_config_id": previous_config_id,
|
||||||
|
"recovery.fallback_config_id": llm_config_id,
|
||||||
|
},
|
||||||
|
)
|
||||||
_log_chat_stream_error(
|
_log_chat_stream_error(
|
||||||
flow=flow,
|
flow=flow,
|
||||||
error_kind="rate_limited",
|
error_kind="rate_limited",
|
||||||
|
|
@ -1893,6 +1954,12 @@ async def stream_new_chat(
|
||||||
log_system_snapshot("stream_new_chat_END")
|
log_system_snapshot("stream_new_chat_END")
|
||||||
|
|
||||||
if stream_result.is_interrupted:
|
if stream_result.is_interrupted:
|
||||||
|
ot.add_event(
|
||||||
|
"chat.interrupted",
|
||||||
|
{
|
||||||
|
"chat.flow": flow,
|
||||||
|
},
|
||||||
|
)
|
||||||
if title_task is not None and not title_task.done():
|
if title_task is not None and not title_task.done():
|
||||||
title_task.cancel()
|
title_task.cancel()
|
||||||
|
|
||||||
|
|
@ -2011,6 +2078,12 @@ async def stream_new_chat(
|
||||||
user_message,
|
user_message,
|
||||||
error_extra,
|
error_extra,
|
||||||
) = _classify_stream_exception(e, flow_label="chat")
|
) = _classify_stream_exception(e, flow_label="chat")
|
||||||
|
chat_outcome = error_code or error_kind or "error"
|
||||||
|
chat_error_category = ot_metrics.categorize_exception(e)
|
||||||
|
with contextlib.suppress(Exception):
|
||||||
|
chat_span.set_attribute("chat.outcome", chat_outcome)
|
||||||
|
chat_span.set_attribute("error.category", chat_error_category)
|
||||||
|
ot.record_error(chat_span, e)
|
||||||
error_message = f"Error during chat: {e!s}"
|
error_message = f"Error during chat: {e!s}"
|
||||||
print(f"[stream_new_chat] {error_message}")
|
print(f"[stream_new_chat] {error_message}")
|
||||||
print(f"[stream_new_chat] Exception type: {type(e).__name__}")
|
print(f"[stream_new_chat] Exception type: {type(e).__name__}")
|
||||||
|
|
@ -2201,6 +2274,21 @@ async def stream_new_chat(
|
||||||
)
|
)
|
||||||
trim_native_heap()
|
trim_native_heap()
|
||||||
log_system_snapshot("stream_new_chat_END")
|
log_system_snapshot("stream_new_chat_END")
|
||||||
|
with contextlib.suppress(Exception):
|
||||||
|
chat_span.set_attribute("chat.outcome", chat_outcome)
|
||||||
|
ot_metrics.record_chat_request_duration(
|
||||||
|
(time.perf_counter() - _t_total) * 1000,
|
||||||
|
flow=flow,
|
||||||
|
outcome=chat_outcome,
|
||||||
|
agent_mode=chat_agent_mode,
|
||||||
|
)
|
||||||
|
ot_metrics.record_chat_request_outcome(
|
||||||
|
flow=flow,
|
||||||
|
outcome=chat_outcome,
|
||||||
|
agent_mode=chat_agent_mode,
|
||||||
|
error_category=chat_error_category,
|
||||||
|
)
|
||||||
|
chat_span_cm.__exit__(*sys.exc_info())
|
||||||
|
|
||||||
|
|
||||||
async def stream_resume_chat(
|
async def stream_resume_chat(
|
||||||
|
|
@ -2225,6 +2313,20 @@ async def stream_resume_chat(
|
||||||
stream_result.turn_id = f"{chat_id}:{int(time.time() * 1000)}"
|
stream_result.turn_id = f"{chat_id}:{int(time.time() * 1000)}"
|
||||||
stream_result.filesystem_mode = fs_mode
|
stream_result.filesystem_mode = fs_mode
|
||||||
stream_result.client_platform = fs_platform
|
stream_result.client_platform = fs_platform
|
||||||
|
chat_agent_mode = "unknown"
|
||||||
|
chat_outcome = "success"
|
||||||
|
chat_error_category: str | None = None
|
||||||
|
chat_span_cm = ot.chat_request_span(
|
||||||
|
chat_id=chat_id,
|
||||||
|
search_space_id=search_space_id,
|
||||||
|
flow="resume",
|
||||||
|
request_id=request_id,
|
||||||
|
turn_id=stream_result.turn_id,
|
||||||
|
filesystem_mode=fs_mode,
|
||||||
|
client_platform=fs_platform,
|
||||||
|
agent_mode=chat_agent_mode,
|
||||||
|
)
|
||||||
|
chat_span = chat_span_cm.__enter__()
|
||||||
_log_file_contract("turn_start", stream_result)
|
_log_file_contract("turn_start", stream_result)
|
||||||
_perf_log.info(
|
_perf_log.info(
|
||||||
"[stream_resume] filesystem_mode=%s client_platform=%s",
|
"[stream_resume] filesystem_mode=%s client_platform=%s",
|
||||||
|
|
@ -2297,6 +2399,14 @@ async def stream_resume_chat(
|
||||||
selected_llm_config_id=llm_config_id,
|
selected_llm_config_id=llm_config_id,
|
||||||
)
|
)
|
||||||
).resolved_llm_config_id
|
).resolved_llm_config_id
|
||||||
|
ot.add_event(
|
||||||
|
"model.pin.resolved",
|
||||||
|
{
|
||||||
|
"pin.requested_id": requested_llm_config_id,
|
||||||
|
"pin.resolved_id": llm_config_id,
|
||||||
|
"pin.requires_image_input": False,
|
||||||
|
},
|
||||||
|
)
|
||||||
except ValueError as pin_error:
|
except ValueError as pin_error:
|
||||||
yield _emit_stream_error(
|
yield _emit_stream_error(
|
||||||
message=str(pin_error),
|
message=str(pin_error),
|
||||||
|
|
@ -2353,6 +2463,12 @@ async def stream_resume_chat(
|
||||||
)
|
)
|
||||||
_resume_premium_reserved_micros = reserve_amount_micros
|
_resume_premium_reserved_micros = reserve_amount_micros
|
||||||
if not quota_result.allowed:
|
if not quota_result.allowed:
|
||||||
|
ot.add_event(
|
||||||
|
"quota.denied",
|
||||||
|
{
|
||||||
|
"quota.code": "PREMIUM_QUOTA_EXHAUSTED",
|
||||||
|
},
|
||||||
|
)
|
||||||
if requested_llm_config_id == 0:
|
if requested_llm_config_id == 0:
|
||||||
try:
|
try:
|
||||||
llm_config_id = (
|
llm_config_id = (
|
||||||
|
|
@ -2365,6 +2481,13 @@ async def stream_resume_chat(
|
||||||
force_repin_free=True,
|
force_repin_free=True,
|
||||||
)
|
)
|
||||||
).resolved_llm_config_id
|
).resolved_llm_config_id
|
||||||
|
ot.add_event(
|
||||||
|
"model.repin",
|
||||||
|
{
|
||||||
|
"repin.reason": "premium_quota_exhausted",
|
||||||
|
"repin.to_config_id": llm_config_id,
|
||||||
|
},
|
||||||
|
)
|
||||||
except ValueError as pin_error:
|
except ValueError as pin_error:
|
||||||
yield _emit_stream_error(
|
yield _emit_stream_error(
|
||||||
message=str(pin_error),
|
message=str(pin_error),
|
||||||
|
|
@ -2454,6 +2577,11 @@ async def stream_resume_chat(
|
||||||
visibility = thread_visibility or ChatVisibility.PRIVATE
|
visibility = thread_visibility or ChatVisibility.PRIVATE
|
||||||
from app.config import config as _app_config
|
from app.config import config as _app_config
|
||||||
|
|
||||||
|
chat_agent_mode = (
|
||||||
|
"multi" if _app_config.MULTI_AGENT_CHAT_ENABLED else "single"
|
||||||
|
)
|
||||||
|
with contextlib.suppress(Exception):
|
||||||
|
chat_span.set_attribute("agent.mode", chat_agent_mode)
|
||||||
_t0 = time.perf_counter()
|
_t0 = time.perf_counter()
|
||||||
agent_factory = (
|
agent_factory = (
|
||||||
create_multi_agent_chat_deep_agent
|
create_multi_agent_chat_deep_agent
|
||||||
|
|
@ -2695,6 +2823,14 @@ async def stream_resume_chat(
|
||||||
llm_config_id,
|
llm_config_id,
|
||||||
time.perf_counter() - _t0,
|
time.perf_counter() - _t0,
|
||||||
)
|
)
|
||||||
|
ot.add_event(
|
||||||
|
"chat.rate_limit.recovered",
|
||||||
|
{
|
||||||
|
"recovery.reason": "provider_rate_limited",
|
||||||
|
"recovery.previous_config_id": previous_config_id,
|
||||||
|
"recovery.fallback_config_id": llm_config_id,
|
||||||
|
},
|
||||||
|
)
|
||||||
_log_chat_stream_error(
|
_log_chat_stream_error(
|
||||||
flow="resume",
|
flow="resume",
|
||||||
error_kind="rate_limited",
|
error_kind="rate_limited",
|
||||||
|
|
@ -2722,6 +2858,12 @@ async def stream_resume_chat(
|
||||||
chat_id,
|
chat_id,
|
||||||
)
|
)
|
||||||
if stream_result.is_interrupted:
|
if stream_result.is_interrupted:
|
||||||
|
ot.add_event(
|
||||||
|
"chat.interrupted",
|
||||||
|
{
|
||||||
|
"chat.flow": "resume",
|
||||||
|
},
|
||||||
|
)
|
||||||
usage_summary = accumulator.per_message_summary()
|
usage_summary = accumulator.per_message_summary()
|
||||||
_perf_log.info(
|
_perf_log.info(
|
||||||
"[token_usage] interrupted resume_chat: calls=%d total=%d cost_micros=%d summary=%s",
|
"[token_usage] interrupted resume_chat: calls=%d total=%d cost_micros=%d summary=%s",
|
||||||
|
|
@ -2815,6 +2957,12 @@ async def stream_resume_chat(
|
||||||
user_message,
|
user_message,
|
||||||
error_extra,
|
error_extra,
|
||||||
) = _classify_stream_exception(e, flow_label="resume")
|
) = _classify_stream_exception(e, flow_label="resume")
|
||||||
|
chat_outcome = error_code or error_kind or "error"
|
||||||
|
chat_error_category = ot_metrics.categorize_exception(e)
|
||||||
|
with contextlib.suppress(Exception):
|
||||||
|
chat_span.set_attribute("chat.outcome", chat_outcome)
|
||||||
|
chat_span.set_attribute("error.category", chat_error_category)
|
||||||
|
ot.record_error(chat_span, e)
|
||||||
error_message = f"Error during resume: {e!s}"
|
error_message = f"Error during resume: {e!s}"
|
||||||
print(f"[stream_resume_chat] {error_message}")
|
print(f"[stream_resume_chat] {error_message}")
|
||||||
print(f"[stream_resume_chat] Traceback:\n{traceback.format_exc()}")
|
print(f"[stream_resume_chat] Traceback:\n{traceback.format_exc()}")
|
||||||
|
|
@ -2964,3 +3112,18 @@ async def stream_resume_chat(
|
||||||
)
|
)
|
||||||
trim_native_heap()
|
trim_native_heap()
|
||||||
log_system_snapshot("stream_resume_chat_END")
|
log_system_snapshot("stream_resume_chat_END")
|
||||||
|
with contextlib.suppress(Exception):
|
||||||
|
chat_span.set_attribute("chat.outcome", chat_outcome)
|
||||||
|
ot_metrics.record_chat_request_duration(
|
||||||
|
(time.perf_counter() - _t_total) * 1000,
|
||||||
|
flow="resume",
|
||||||
|
outcome=chat_outcome,
|
||||||
|
agent_mode=chat_agent_mode,
|
||||||
|
)
|
||||||
|
ot_metrics.record_chat_request_outcome(
|
||||||
|
flow="resume",
|
||||||
|
outcome=chat_outcome,
|
||||||
|
agent_mode=chat_agent_mode,
|
||||||
|
error_category=chat_error_category,
|
||||||
|
)
|
||||||
|
chat_span_cm.__exit__(*sys.exc_info())
|
||||||
|
|
|
||||||
|
|
@ -16,6 +16,8 @@ import time
|
||||||
from contextlib import asynccontextmanager, contextmanager
|
from contextlib import asynccontextmanager, contextmanager
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
|
from app.observability import metrics as ot_metrics
|
||||||
|
|
||||||
_perf_log: logging.Logger | None = None
|
_perf_log: logging.Logger | None = None
|
||||||
_last_rss_mb: float = 0.0
|
_last_rss_mb: float = 0.0
|
||||||
|
|
||||||
|
|
@ -50,6 +52,7 @@ def perf_timer(label: str, *, extra: dict[str, Any] | None = None):
|
||||||
if extra:
|
if extra:
|
||||||
suffix = " " + " ".join(f"{k}={v}" for k, v in extra.items())
|
suffix = " " + " ".join(f"{k}={v}" for k, v in extra.items())
|
||||||
log.info("%s in %.3fs%s", label, elapsed, suffix)
|
log.info("%s in %.3fs%s", label, elapsed, suffix)
|
||||||
|
ot_metrics.record_perf_elapsed(elapsed * 1000, label=label)
|
||||||
|
|
||||||
|
|
||||||
@asynccontextmanager
|
@asynccontextmanager
|
||||||
|
|
@ -68,6 +71,7 @@ async def perf_async_timer(label: str, *, extra: dict[str, Any] | None = None):
|
||||||
if extra:
|
if extra:
|
||||||
suffix = " " + " ".join(f"{k}={v}" for k, v in extra.items())
|
suffix = " " + " ".join(f"{k}={v}" for k, v in extra.items())
|
||||||
log.info("%s in %.3fs%s", label, elapsed, suffix)
|
log.info("%s in %.3fs%s", label, elapsed, suffix)
|
||||||
|
ot_metrics.record_perf_elapsed(elapsed * 1000, label=label)
|
||||||
|
|
||||||
|
|
||||||
def system_snapshot() -> dict[str, Any]:
|
def system_snapshot() -> dict[str, Any]:
|
||||||
|
|
|
||||||
|
|
@ -12,9 +12,26 @@ if sys.platform == "win32":
|
||||||
|
|
||||||
from app.config.uvicorn import load_uvicorn_config
|
from app.config.uvicorn import load_uvicorn_config
|
||||||
|
|
||||||
|
_old_log_record_factory = logging.getLogRecordFactory()
|
||||||
|
|
||||||
|
|
||||||
|
def _otel_safe_log_record_factory(*args, **kwargs):
|
||||||
|
record = _old_log_record_factory(*args, **kwargs)
|
||||||
|
if not hasattr(record, "otelTraceID"):
|
||||||
|
record.otelTraceID = "0"
|
||||||
|
if not hasattr(record, "otelSpanID"):
|
||||||
|
record.otelSpanID = "0"
|
||||||
|
return record
|
||||||
|
|
||||||
|
|
||||||
|
logging.setLogRecordFactory(_otel_safe_log_record_factory)
|
||||||
|
|
||||||
logging.basicConfig(
|
logging.basicConfig(
|
||||||
level=logging.INFO,
|
level=logging.INFO,
|
||||||
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
|
format=(
|
||||||
|
"%(asctime)s - %(name)s - %(levelname)s - "
|
||||||
|
"[trace_id=%(otelTraceID)s span_id=%(otelSpanID)s] %(message)s"
|
||||||
|
),
|
||||||
datefmt="%Y-%m-%d %H:%M:%S",
|
datefmt="%Y-%m-%d %H:%M:%S",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -76,6 +76,17 @@ dependencies = [
|
||||||
"litellm>=1.83.7",
|
"litellm>=1.83.7",
|
||||||
"langchain-litellm>=0.6.4",
|
"langchain-litellm>=0.6.4",
|
||||||
"deepagents>=0.4.12,<0.5",
|
"deepagents>=0.4.12,<0.5",
|
||||||
|
"opentelemetry-api>=1.40.0",
|
||||||
|
"opentelemetry-sdk>=1.40.0",
|
||||||
|
"opentelemetry-exporter-otlp>=1.40.0",
|
||||||
|
"opentelemetry-semantic-conventions>=0.61b0",
|
||||||
|
"opentelemetry-instrumentation-fastapi>=0.61b0",
|
||||||
|
"opentelemetry-instrumentation-sqlalchemy>=0.61b0",
|
||||||
|
"opentelemetry-instrumentation-psycopg>=0.61b0",
|
||||||
|
"opentelemetry-instrumentation-redis>=0.61b0",
|
||||||
|
"opentelemetry-instrumentation-httpx>=0.61b0",
|
||||||
|
"opentelemetry-instrumentation-celery>=0.61b0",
|
||||||
|
"opentelemetry-instrumentation-logging>=0.61b0",
|
||||||
]
|
]
|
||||||
|
|
||||||
[dependency-groups]
|
[dependency-groups]
|
||||||
|
|
|
||||||
|
|
@ -23,6 +23,7 @@ pytestmark = pytest.mark.unit
|
||||||
@pytest.fixture(autouse=True)
|
@pytest.fixture(autouse=True)
|
||||||
def _disable_otel(monkeypatch: pytest.MonkeyPatch):
|
def _disable_otel(monkeypatch: pytest.MonkeyPatch):
|
||||||
monkeypatch.delenv("OTEL_EXPORTER_OTLP_ENDPOINT", raising=False)
|
monkeypatch.delenv("OTEL_EXPORTER_OTLP_ENDPOINT", raising=False)
|
||||||
|
monkeypatch.delenv("OTEL_SDK_DISABLED", raising=False)
|
||||||
monkeypatch.setenv("SURFSENSE_DISABLE_OTEL", "true")
|
monkeypatch.setenv("SURFSENSE_DISABLE_OTEL", "true")
|
||||||
from app.observability import otel as ot
|
from app.observability import otel as ot
|
||||||
|
|
||||||
|
|
@ -99,16 +100,17 @@ class TestAnnotateModelResponse:
|
||||||
"total_tokens": 150,
|
"total_tokens": 150,
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
_annotate_model_response(sp, msg)
|
assert _annotate_model_response(sp, msg) == (100, 50)
|
||||||
sp.set_attribute.assert_any_call("tokens.prompt", 100)
|
sp.set_attribute.assert_any_call("gen_ai.usage.input_tokens", 100)
|
||||||
sp.set_attribute.assert_any_call("tokens.completion", 50)
|
sp.set_attribute.assert_any_call("gen_ai.usage.output_tokens", 50)
|
||||||
sp.set_attribute.assert_any_call("tokens.total", 150)
|
sp.set_attribute.assert_any_call("gen_ai.usage.total_tokens", 150)
|
||||||
|
sp.set_attribute.assert_any_call("gen_ai.operation.name", "chat")
|
||||||
|
|
||||||
def test_handles_response_with_no_metadata(self) -> None:
|
def test_handles_response_with_no_metadata(self) -> None:
|
||||||
sp = MagicMock()
|
sp = MagicMock()
|
||||||
msg = AIMessage(content="hello")
|
msg = AIMessage(content="hello")
|
||||||
# Should not raise even when usage_metadata is missing
|
# Should not raise even when usage_metadata is missing
|
||||||
_annotate_model_response(sp, msg)
|
assert _annotate_model_response(sp, msg) == (None, None)
|
||||||
|
|
||||||
|
|
||||||
class TestAnnotateToolResult:
|
class TestAnnotateToolResult:
|
||||||
|
|
@ -119,7 +121,7 @@ class TestAnnotateToolResult:
|
||||||
tool_call_id="abc",
|
tool_call_id="abc",
|
||||||
status="success",
|
status="success",
|
||||||
)
|
)
|
||||||
_annotate_tool_result(sp, result)
|
assert _annotate_tool_result(sp, result) is False
|
||||||
sp.set_attribute.assert_any_call("tool.output.size", len("result text"))
|
sp.set_attribute.assert_any_call("tool.output.size", len("result text"))
|
||||||
sp.set_attribute.assert_any_call("tool.status", "success")
|
sp.set_attribute.assert_any_call("tool.status", "success")
|
||||||
|
|
||||||
|
|
@ -130,7 +132,7 @@ class TestAnnotateToolResult:
|
||||||
tool_call_id="abc",
|
tool_call_id="abc",
|
||||||
additional_kwargs={"error": {"code": "x"}},
|
additional_kwargs={"error": {"code": "x"}},
|
||||||
)
|
)
|
||||||
_annotate_tool_result(sp, result)
|
assert _annotate_tool_result(sp, result) is True
|
||||||
sp.set_attribute.assert_any_call("tool.error", True)
|
sp.set_attribute.assert_any_call("tool.error", True)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -193,3 +195,91 @@ class TestMiddlewareIntegration:
|
||||||
assert result.content == "enabled"
|
assert result.content == "enabled"
|
||||||
finally:
|
finally:
|
||||||
ot.reload_for_tests()
|
ot.reload_for_tests()
|
||||||
|
|
||||||
|
async def test_enabled_model_call_records_metrics(
|
||||||
|
self, monkeypatch: pytest.MonkeyPatch
|
||||||
|
) -> None:
|
||||||
|
monkeypatch.delenv("SURFSENSE_DISABLE_OTEL", raising=False)
|
||||||
|
monkeypatch.setenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4317")
|
||||||
|
from app.observability import otel as ot
|
||||||
|
|
||||||
|
duration_calls: list[dict[str, Any]] = []
|
||||||
|
token_calls: list[dict[str, Any]] = []
|
||||||
|
monkeypatch.setattr(
|
||||||
|
"app.agents.new_chat.middleware.otel_span.ot_metrics.record_model_call_duration",
|
||||||
|
lambda duration_ms, **attrs: duration_calls.append(
|
||||||
|
{"duration_ms": duration_ms, **attrs}
|
||||||
|
),
|
||||||
|
)
|
||||||
|
monkeypatch.setattr(
|
||||||
|
"app.agents.new_chat.middleware.otel_span.ot_metrics.record_model_token_usage",
|
||||||
|
lambda **attrs: token_calls.append(attrs),
|
||||||
|
)
|
||||||
|
|
||||||
|
ot.reload_for_tests()
|
||||||
|
try:
|
||||||
|
mw = OtelSpanMiddleware()
|
||||||
|
|
||||||
|
async def handler(req):
|
||||||
|
return AIMessage(
|
||||||
|
content="enabled",
|
||||||
|
usage_metadata={
|
||||||
|
"input_tokens": 3,
|
||||||
|
"output_tokens": 5,
|
||||||
|
"total_tokens": 8,
|
||||||
|
},
|
||||||
|
)
|
||||||
|
|
||||||
|
request = MagicMock()
|
||||||
|
request.model = MagicMock()
|
||||||
|
request.model.model_name = "gpt-4o"
|
||||||
|
request.model.provider = "openai"
|
||||||
|
await mw.awrap_model_call(request, handler)
|
||||||
|
|
||||||
|
assert duration_calls
|
||||||
|
assert token_calls == [
|
||||||
|
{
|
||||||
|
"input_tokens": 3,
|
||||||
|
"output_tokens": 5,
|
||||||
|
"model": "gpt-4o",
|
||||||
|
"provider": "openai",
|
||||||
|
}
|
||||||
|
]
|
||||||
|
finally:
|
||||||
|
ot.reload_for_tests()
|
||||||
|
|
||||||
|
async def test_enabled_tool_call_records_error_metric(
|
||||||
|
self, monkeypatch: pytest.MonkeyPatch
|
||||||
|
) -> None:
|
||||||
|
monkeypatch.delenv("SURFSENSE_DISABLE_OTEL", raising=False)
|
||||||
|
monkeypatch.setenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4317")
|
||||||
|
from app.observability import otel as ot
|
||||||
|
|
||||||
|
errors: list[str] = []
|
||||||
|
monkeypatch.setattr(
|
||||||
|
"app.agents.new_chat.middleware.otel_span.ot_metrics.record_tool_call_error",
|
||||||
|
lambda *, tool_name: errors.append(tool_name),
|
||||||
|
)
|
||||||
|
monkeypatch.setattr(
|
||||||
|
"app.agents.new_chat.middleware.otel_span.ot_metrics.record_tool_call_duration",
|
||||||
|
lambda *args, **kwargs: None,
|
||||||
|
)
|
||||||
|
|
||||||
|
ot.reload_for_tests()
|
||||||
|
try:
|
||||||
|
mw = OtelSpanMiddleware()
|
||||||
|
|
||||||
|
async def handler(req):
|
||||||
|
return ToolMessage(
|
||||||
|
content="failed",
|
||||||
|
tool_call_id="abc",
|
||||||
|
status="error",
|
||||||
|
)
|
||||||
|
|
||||||
|
request = MagicMock()
|
||||||
|
request.tool = MagicMock()
|
||||||
|
request.tool.name = "web_search"
|
||||||
|
await mw.awrap_tool_call(request, handler)
|
||||||
|
assert errors == ["web_search"]
|
||||||
|
finally:
|
||||||
|
ot.reload_for_tests()
|
||||||
|
|
|
||||||
101
surfsense_backend/tests/unit/observability/test_helpers.py
Normal file
101
surfsense_backend/tests/unit/observability/test_helpers.py
Normal file
|
|
@ -0,0 +1,101 @@
|
||||||
|
"""Tests for pure observability helper functions."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from app.observability import metrics as ot_metrics, otel as ot
|
||||||
|
|
||||||
|
pytestmark = pytest.mark.unit
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture(autouse=True)
|
||||||
|
def _disable_otel(monkeypatch: pytest.MonkeyPatch):
|
||||||
|
monkeypatch.delenv("OTEL_EXPORTER_OTLP_ENDPOINT", raising=False)
|
||||||
|
monkeypatch.setenv("SURFSENSE_DISABLE_OTEL", "true")
|
||||||
|
ot.reload_for_tests()
|
||||||
|
yield
|
||||||
|
ot.reload_for_tests()
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    ("task_name", "expected"),
    [
        ("reindex_document", "reindex"),
        ("delete_document_background", "delete"),
        ("delete_folder_documents_background", "delete"),
        ("delete_search_space_background", "delete"),
        ("process_extension_document", "process"),
        ("process_youtube_video", "process"),
        ("process_file_upload", "process"),
        ("process_file_upload_with_document", "process"),
        ("process_circleback_meeting", "process"),
        ("generate_video_presentation", "generate"),
        ("generate_content_podcast", "generate"),
        ("cleanup_stale_indexing_notifications", "cleanup"),
        ("reconcile_pending_stripe_page_purchases", "reconcile"),
        ("reconcile_pending_stripe_token_purchases", "reconcile"),
        ("check_periodic_schedules", "check"),
        ("ai_sort_search_space", "ai"),
        ("index_notion_pages", "index"),
        ("index_github_repos", "index"),
        ("index_google_drive_files", "index"),
        ("index_composio_connector", "index"),
        ("index_obsidian_attachment", "index"),
        ("index_local_folder", "index"),
        ("index_uploaded_folder_files", "index"),
        # Edge cases: no underscore at all, and the empty string.
        ("noseparator", "noseparator"),
        ("", "unknown"),
    ],
)
def test_parse_celery_task_label(task_name: str, expected: str) -> None:
    """``parse_celery_task_label`` returns the expected label for each task name.

    In every underscore-separated case listed, the expected label is the text
    before the first underscore. A name without an underscore is expected back
    unchanged, and the empty string is expected to map to ``"unknown"``.
    """
    assert ot_metrics.parse_celery_task_label(task_name) == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_celery_task_label_handles_none() -> None:
    """``parse_celery_task_label(None)`` returns ``"unknown"`` instead of raising."""
    assert ot_metrics.parse_celery_task_label(None) == "unknown"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
    ("exc", "expected"),
    [
        # Throwaway exception classes are built with ``type()`` so that only
        # the class *name* differs between the cases.
        (type("RateLimitError", (Exception,), {})(), "rate_limited"),
        (type("AuthenticationError", (Exception,), {})(), "auth_failed"),
        (type("QuotaInsufficientError", (Exception,), {})(), "quota_exhausted"),
        (TimeoutError(), "timeout"),
        (type("APIConnectionError", (Exception,), {})(), "network_failed"),
        (type("ServiceUnavailableError", (Exception,), {})(), "server_error"),
        (type("LockContentionError", (Exception,), {})(), "lock_contention"),
        (type("UnsupportedFormatError", (Exception,), {})(), "unsupported_format"),
        (type("ProviderError", (Exception,), {})(), "provider_error"),
        (RuntimeError("plain"), "unknown"),
    ],
)
def test_categorize_exception(exc: BaseException, expected: str) -> None:
    """``categorize_exception`` maps each exception instance to its category string.

    A plain ``RuntimeError`` is expected to fall through to ``"unknown"``.
    """
    assert ot_metrics.categorize_exception(exc) == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_record_celery_queue_latency_noops_when_disabled() -> None:
    """Recording queue latency with OTel disabled must not raise.

    There is no assertion on purpose: the test passes as long as the call
    returns normally under the module's autouse ``_disable_otel`` fixture.
    """
    ot_metrics.record_celery_queue_latency(
        0.5,
        task_name="index_notion_pages",
        queue="surfsense.connectors",
        scheduled=False,
        operation="index",
    )
|
||||||
|
|
||||||
|
|
||||||
|
def test_add_event_noops_when_disabled() -> None:
    """``ot.add_event`` must return without raising while OTel is disabled."""
    ot.add_event("test.event", {"value": 1})
|
||||||
|
|
||||||
|
|
||||||
|
def test_add_event_noops_without_current_span(monkeypatch: pytest.MonkeyPatch) -> None:
    """``ot.add_event`` must not raise when the tracer reports no current span.

    The module is forced into its enabled state and its trace API is replaced
    with a stub whose ``get_current_span()`` returns ``None``.
    """

    class FakeTrace:
        """Stand-in for the trace API that never has an active span."""

        @staticmethod
        def get_current_span():
            return None

    monkeypatch.setattr(ot, "_ENABLED", True)
    monkeypatch.setattr(ot, "_ot_trace", FakeTrace())

    ot.add_event("test.event", {"value": 1})
|
||||||
|
|
@ -4,7 +4,7 @@ from __future__ import annotations
|
||||||
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from app.observability import otel
|
from app.observability import bootstrap, metrics, otel
|
||||||
|
|
||||||
pytestmark = pytest.mark.unit
|
pytestmark = pytest.mark.unit
|
||||||
|
|
||||||
|
|
@ -12,7 +12,14 @@ pytestmark = pytest.mark.unit
|
||||||
@pytest.fixture(autouse=True)
|
@pytest.fixture(autouse=True)
|
||||||
def _reset_otel_state(monkeypatch: pytest.MonkeyPatch):
|
def _reset_otel_state(monkeypatch: pytest.MonkeyPatch):
|
||||||
"""Force a clean OTel disabled state per test, then restore after."""
|
"""Force a clean OTel disabled state per test, then restore after."""
|
||||||
for env in ("OTEL_EXPORTER_OTLP_ENDPOINT", "SURFSENSE_DISABLE_OTEL"):
|
for env in (
|
||||||
|
"OTEL_EXPORTER_OTLP_ENDPOINT",
|
||||||
|
"OTEL_EXPORTER_OTLP_PROTOCOL",
|
||||||
|
"OTEL_EXPORTER_OTLP_TRACES_ENDPOINT",
|
||||||
|
"OTEL_EXPORTER_OTLP_METRICS_ENDPOINT",
|
||||||
|
"SURFSENSE_DISABLE_OTEL",
|
||||||
|
"OTEL_SDK_DISABLED",
|
||||||
|
):
|
||||||
monkeypatch.delenv(env, raising=False)
|
monkeypatch.delenv(env, raising=False)
|
||||||
monkeypatch.setenv("SURFSENSE_DISABLE_OTEL", "true")
|
monkeypatch.setenv("SURFSENSE_DISABLE_OTEL", "true")
|
||||||
otel.reload_for_tests()
|
otel.reload_for_tests()
|
||||||
|
|
@ -36,6 +43,195 @@ def test_kill_switch_overrides_endpoint(monkeypatch: pytest.MonkeyPatch) -> None
|
||||||
assert otel.reload_for_tests() is False
|
assert otel.reload_for_tests() is False
|
||||||
|
|
||||||
|
|
||||||
|
def test_spec_kill_switch_overrides_endpoint(monkeypatch: pytest.MonkeyPatch) -> None:
    """``OTEL_SDK_DISABLED=true`` keeps OTel off even when an endpoint is set.

    ``SURFSENSE_DISABLE_OTEL`` is removed first so that only the
    ``OTEL_SDK_DISABLED`` variable can be responsible for the disabled result.
    """
    monkeypatch.delenv("SURFSENSE_DISABLE_OTEL", raising=False)
    monkeypatch.setenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4317")
    monkeypatch.setenv("OTEL_SDK_DISABLED", "true")
    assert otel.reload_for_tests() is False
|
||||||
|
|
||||||
|
|
||||||
|
class TestBootstrapConfig:
    """Tests for the ``bootstrap`` module: env-driven checks, init and shutdown."""

    def test_disabled_checks_both_kill_switches(
        self, monkeypatch: pytest.MonkeyPatch
    ) -> None:
        """With neither variable set OTel is not disabled; ``OTEL_SDK_DISABLED=on`` disables it."""
        monkeypatch.delenv("SURFSENSE_DISABLE_OTEL", raising=False)
        monkeypatch.delenv("OTEL_SDK_DISABLED", raising=False)
        assert bootstrap.is_otel_disabled() is False

        monkeypatch.setenv("OTEL_SDK_DISABLED", "on")
        assert bootstrap.is_otel_disabled() is True

    def test_configured_by_shared_or_signal_endpoint(
        self, monkeypatch: pytest.MonkeyPatch
    ) -> None:
        """A traces-specific endpoint alone is enough for ``is_otel_configured()``."""
        monkeypatch.delenv("SURFSENSE_DISABLE_OTEL", raising=False)
        assert bootstrap.is_otel_configured() is False

        monkeypatch.setenv(
            "OTEL_EXPORTER_OTLP_TRACES_ENDPOINT", "http://localhost:4317"
        )
        assert bootstrap.is_otel_configured() is True

    def test_init_otel_noops_when_disabled(
        self, monkeypatch: pytest.MonkeyPatch
    ) -> None:
        """``init_otel()`` must not call ``init_traces`` while the kill switch is set."""
        called = {"traces": False}

        def fake_init_traces(app=None):
            del app
            called["traces"] = True

        monkeypatch.setenv("SURFSENSE_DISABLE_OTEL", "true")
        monkeypatch.setenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4317")
        monkeypatch.setattr(bootstrap, "init_traces", fake_init_traces)

        bootstrap.init_otel()
        assert called["traces"] is False

    def test_init_otel_dispatches_enabled_signals(
        self, monkeypatch: pytest.MonkeyPatch
    ) -> None:
        """``init_otel()`` calls the traces, metrics and logs initialisers in that order."""
        called: list[str] = []

        monkeypatch.delenv("SURFSENSE_DISABLE_OTEL", raising=False)
        monkeypatch.setenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4317")
        monkeypatch.setattr(
            bootstrap, "init_traces", lambda app=None: called.append("traces")
        )
        monkeypatch.setattr(bootstrap, "init_metrics", lambda: called.append("metrics"))
        monkeypatch.setattr(bootstrap, "init_logs", lambda: called.append("logs"))

        bootstrap.init_otel()
        assert called == ["traces", "metrics", "logs"]

    def test_resource_defaults_include_service_metadata(
        self, monkeypatch: pytest.MonkeyPatch
    ) -> None:
        """The built resource carries the service name, both environment keys and an instance id."""
        monkeypatch.setenv("OTEL_SERVICE_NAME", "custom-backend")
        monkeypatch.setenv("SURFSENSE_ENV", "test")

        resource = bootstrap._build_resource()
        attrs = dict(resource.attributes)
        assert attrs["service.name"] == "custom-backend"
        assert attrs["deployment.environment.name"] == "test"
        assert attrs["deployment.environment"] == "test"
        # Only presence/truthiness is checked; the exact id is not pinned.
        assert attrs["service.instance.id"]

    def test_deployment_environment_uses_surfsense_env_only(
        self, monkeypatch: pytest.MonkeyPatch
    ) -> None:
        """``_deployment_environment()`` is ``"dev"`` without ``SURFSENSE_ENV`` and echoes it when set."""
        monkeypatch.delenv("SURFSENSE_ENV", raising=False)

        assert bootstrap._deployment_environment() == "dev"

        monkeypatch.setenv("SURFSENSE_ENV", "production")

        assert bootstrap._deployment_environment() == "production"

    def test_shutdown_is_safe_without_providers(self) -> None:
        """``shutdown_otel()`` must not raise when called in this disabled test state."""
        bootstrap.shutdown_otel()

    def test_init_logs_enables_log_correlation(
        self, monkeypatch: pytest.MonkeyPatch
    ) -> None:
        """``init_logs()`` instruments logging with ``set_logging_format=True`` exactly once."""
        calls: list[dict[str, object]] = []

        class FakeLoggingInstrumentor:
            """Records the keyword arguments passed to ``instrument``."""

            def instrument(self, **kwargs: object) -> None:
                calls.append(kwargs)

        def fake_safe_instrument(name: str, callback):
            # Swap in the fake instrumentor just before running the callback
            # that ``init_logs`` hands to ``_safe_instrument``.
            assert name == "logging"
            monkeypatch.setattr(
                "opentelemetry.instrumentation.logging.LoggingInstrumentor",
                FakeLoggingInstrumentor,
            )
            callback()
            return True

        monkeypatch.setattr(bootstrap, "_LOGS_INITIALIZED", False)
        monkeypatch.setattr(bootstrap, "_safe_instrument", fake_safe_instrument)

        bootstrap.init_logs()

        assert calls == [{"set_logging_format": True}]
|
||||||
|
|
||||||
|
|
||||||
|
class TestMetricHelpers:
    """Tests for the ``metrics`` recording helpers and observable registration."""

    def test_all_metric_helpers_noop_safely_when_disabled(self) -> None:
        """Every ``record_*`` helper must return without raising while OTel is disabled.

        There are no assertions: the test passes if none of the calls raise.
        """
        metrics.record_model_call_duration(12.5, model="gpt-4o", provider="openai")
        metrics.record_model_token_usage(
            input_tokens=10,
            output_tokens=5,
            model="gpt-4o",
            provider="openai",
        )
        metrics.record_tool_call_duration(3.0, tool_name="web_search")
        metrics.record_tool_call_error(tool_name="web_search")
        metrics.record_kb_search_duration(
            4.0,
            search_space_id=1,
            surface="documents",
        )
        metrics.record_compaction_run(reason="auto")
        metrics.record_permission_ask(permission="write_file")
        metrics.record_interrupt(interrupt_type="permission_ask")
        metrics.record_indexing_document_duration(1.2, document_type="FILE")
        metrics.record_indexing_document_outcome(document_type="FILE", status="success")
        metrics.record_connector_sync_duration(
            2.3,
            connector_type="index_notion_pages",
        )
        metrics.record_connector_sync_outcome(
            connector_type="index_notion_pages",
            status="success",
        )
        metrics.record_auth_failure(reason="UNAUTHORIZED")
        metrics.record_rate_limit_rejection(scope="login")
        metrics.record_perf_elapsed(7.0, label="[test]")

    def test_runtime_observables_register_once(
        self, monkeypatch: pytest.MonkeyPatch
    ) -> None:
        """Calling ``register_runtime_observables()`` twice creates each gauge only once."""

        class FakeMeter:
            """Meter stub that records the name of every observable gauge created."""

            def __init__(self) -> None:
                self.names: list[str] = []

            def create_observable_gauge(self, name: str, **kwargs) -> None:
                del kwargs
                self.names.append(name)

        fake_meter = FakeMeter()
        monkeypatch.setattr(metrics, "_OBSERVABLES_REGISTERED", False)
        monkeypatch.setattr(metrics, "_is_enabled", lambda: True)
        monkeypatch.setattr(metrics, "_get_meter", lambda: fake_meter)

        metrics.register_runtime_observables()
        metrics.register_runtime_observables()

        assert len(fake_meter.names) == 6
        assert fake_meter.names.count("python.asyncio.tasks") == 1
        # Reset the registration flag explicitly for the remainder of the test.
        monkeypatch.setattr(metrics, "_OBSERVABLES_REGISTERED", False)
|
||||||
|
|
||||||
|
|
||||||
|
def test_log_record_factory_provides_zero_otel_fields() -> None:
    """Log records built by the active factory carry ``"0"`` trace and span ids.

    ``main`` is imported only for its import-time side effects.
    NOTE(review): presumably ``main`` installs the log-record factory that adds
    ``otelTraceID`` / ``otelSpanID`` -- confirm against ``main``.
    """
    import logging

    import main  # noqa: F401

    record = logging.getLogRecordFactory()(
        "test",
        logging.INFO,
        __file__,
        1,
        "hello",
        (),
        None,
    )
    assert record.otelTraceID == "0"
    assert record.otelSpanID == "0"
|
||||||
|
|
||||||
|
|
||||||
class TestNoopSpansWhenDisabled:
|
class TestNoopSpansWhenDisabled:
|
||||||
def test_generic_span_yields_noop(self) -> None:
|
def test_generic_span_yields_noop(self) -> None:
|
||||||
with otel.span("any.thing", attributes={"x": 1}) as sp:
|
with otel.span("any.thing", attributes={"x": 1}) as sp:
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,61 @@
|
||||||
|
"""Tests for retriever OTel wrappers."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from contextlib import contextmanager
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from app.retriever.documents_hybrid_search import _instrument_search
|
||||||
|
|
||||||
|
pytestmark = pytest.mark.unit
|
||||||
|
|
||||||
|
|
||||||
|
class _Span:
|
||||||
|
def __init__(self) -> None:
|
||||||
|
self.attrs: dict[str, Any] = {}
|
||||||
|
|
||||||
|
def set_attribute(self, key: str, value: Any) -> None:
|
||||||
|
self.attrs[key] = value
|
||||||
|
|
||||||
|
|
||||||
|
@contextmanager
|
||||||
|
def _fake_span(**kwargs):
|
||||||
|
span = _Span()
|
||||||
|
span.attrs.update(kwargs)
|
||||||
|
yield span
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_retriever_wrapper_records_one_span_and_metric(monkeypatch) -> None:
    """A method wrapped by ``_instrument_search`` returns its result and records one metric.

    The span factory and the duration recorder used by the retriever module are
    replaced with local fakes. The test then checks that the wrapped coroutine's
    return value is passed through unchanged and that exactly one duration
    record was emitted, tagged with the search space id and ``"documents"``.
    """
    calls: list[dict[str, Any]] = []

    monkeypatch.setattr(
        "app.retriever.documents_hybrid_search.ot.kb_search_span",
        lambda **kwargs: _fake_span(**kwargs),
    )
    monkeypatch.setattr(
        "app.retriever.documents_hybrid_search.ot_metrics.record_kb_search_duration",
        lambda duration_ms, **attrs: calls.append(
            {"duration_ms": duration_ms, **attrs}
        ),
    )

    class Retriever:
        """Tiny retriever whose ``search`` ignores its inputs and returns two ids."""

        @_instrument_search("hybrid")
        async def search(
            self,
            query_text: str,
            top_k: int,
            search_space_id: int,
        ) -> list[str]:
            del query_text, top_k, search_space_id
            return ["doc-1", "doc-2"]

    result = await Retriever().search("hello", 3, 42)

    assert result == ["doc-1", "doc-2"]
    assert len(calls) == 1
    assert calls[0]["search_space_id"] == 42
    assert calls[0]["surface"] == "documents"
|
||||||
8632
surfsense_backend/uv.lock
generated
8632
surfsense_backend/uv.lock
generated
File diff suppressed because it is too large
Load diff
|
|
@ -3,9 +3,9 @@ import { useTranslations } from "next-intl";
|
||||||
import { useState } from "react";
|
import { useState } from "react";
|
||||||
import { Logo } from "@/components/Logo";
|
import { Logo } from "@/components/Logo";
|
||||||
import { Button } from "@/components/ui/button";
|
import { Button } from "@/components/ui/button";
|
||||||
|
import { BACKEND_URL } from "@/lib/env-config";
|
||||||
import { trackLoginAttempt } from "@/lib/posthog/events";
|
import { trackLoginAttempt } from "@/lib/posthog/events";
|
||||||
import { AmbientBackground } from "./AmbientBackground";
|
import { AmbientBackground } from "./AmbientBackground";
|
||||||
import { BACKEND_URL } from "@/lib/env-config";
|
|
||||||
|
|
||||||
function GoogleGLogo({ className }: { className?: string }) {
|
function GoogleGLogo({ className }: { className?: string }) {
|
||||||
return (
|
return (
|
||||||
|
|
|
||||||
|
|
@ -3,16 +3,19 @@
|
||||||
import { ThreadPrimitive } from "@assistant-ui/react";
|
import { ThreadPrimitive } from "@assistant-ui/react";
|
||||||
import { ArrowDownIcon } from "lucide-react";
|
import { ArrowDownIcon } from "lucide-react";
|
||||||
import type { FC, ReactNode } from "react";
|
import type { FC, ReactNode } from "react";
|
||||||
import { TooltipIconButton } from "@/components/assistant-ui/tooltip-icon-button";
|
import { Button } from "@/components/ui/button";
|
||||||
|
|
||||||
/**
 * Floating "scroll to bottom" control for the chat thread.
 *
 * Renders a round ghost icon button as the child of
 * `ThreadPrimitive.ScrollToBottom`. The accessible name is supplied through
 * `aria-label`, and the `disabled:invisible` class hides the button whenever
 * it is disabled.
 */
const ChatScrollToBottom: FC = () => (
	<ThreadPrimitive.ScrollToBottom asChild>
		<Button
			type="button"
			variant="ghost"
			size="icon"
			aria-label="Scroll to bottom"
			className="aui-thread-scroll-to-bottom -top-12 absolute z-10 size-10 self-center rounded-full border border-input bg-muted p-0 text-foreground shadow-sm shadow-black/5 hover:bg-accent hover:text-accent-foreground disabled:invisible dark:shadow-black/10"
		>
			<ArrowDownIcon />
		</Button>
	</ThreadPrimitive.ScrollToBottom>
);
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,7 @@
|
||||||
"use client";
|
"use client";
|
||||||
|
|
||||||
import { useAtomValue } from "jotai";
|
import { useAtomValue } from "jotai";
|
||||||
import { AlertTriangle, Settings } from "lucide-react";
|
import { AlertTriangle } from "lucide-react";
|
||||||
import { useRouter } from "next/navigation";
|
import { useRouter } from "next/navigation";
|
||||||
import { forwardRef, useEffect, useImperativeHandle, useMemo, useState } from "react";
|
import { forwardRef, useEffect, useImperativeHandle, useMemo, useState } from "react";
|
||||||
import { createPortal } from "react-dom";
|
import { createPortal } from "react-dom";
|
||||||
|
|
@ -381,7 +381,7 @@ export const ConnectorIndicator = forwardRef<ConnectorIndicatorHandle, Connector
|
||||||
{/* LLM Configuration Warning */}
|
{/* LLM Configuration Warning */}
|
||||||
{!llmConfigLoading && !hasDocumentSummaryLLM && (
|
{!llmConfigLoading && !hasDocumentSummaryLLM && (
|
||||||
<div className="mb-6">
|
<div className="mb-6">
|
||||||
<Alert variant="destructive">
|
<Alert variant="warning">
|
||||||
<AlertTriangle />
|
<AlertTriangle />
|
||||||
<AlertTitle>LLM Configuration Required</AlertTitle>
|
<AlertTitle>LLM Configuration Required</AlertTitle>
|
||||||
<AlertDescription>
|
<AlertDescription>
|
||||||
|
|
@ -392,7 +392,7 @@ export const ConnectorIndicator = forwardRef<ConnectorIndicatorHandle, Connector
|
||||||
</p>
|
</p>
|
||||||
<Button
|
<Button
|
||||||
size="sm"
|
size="sm"
|
||||||
variant="outline"
|
variant="secondary"
|
||||||
onClick={() => {
|
onClick={() => {
|
||||||
handleOpenChange(false);
|
handleOpenChange(false);
|
||||||
router.push(
|
router.push(
|
||||||
|
|
@ -400,7 +400,6 @@ export const ConnectorIndicator = forwardRef<ConnectorIndicatorHandle, Connector
|
||||||
);
|
);
|
||||||
}}
|
}}
|
||||||
>
|
>
|
||||||
<Settings className="mr-2 h-4 w-4" />
|
|
||||||
Go to Settings
|
Go to Settings
|
||||||
</Button>
|
</Button>
|
||||||
</AlertDescription>
|
</AlertDescription>
|
||||||
|
|
|
||||||
|
|
@ -1,7 +1,7 @@
|
||||||
"use client";
|
"use client";
|
||||||
|
|
||||||
import { useAtomValue } from "jotai";
|
import { useAtomValue } from "jotai";
|
||||||
import { AlertTriangle, Settings } from "lucide-react";
|
import { AlertTriangle } from "lucide-react";
|
||||||
import { useRouter } from "next/navigation";
|
import { useRouter } from "next/navigation";
|
||||||
import {
|
import {
|
||||||
createContext,
|
createContext,
|
||||||
|
|
@ -148,7 +148,7 @@ const DocumentUploadPopupContent: FC<{
|
||||||
<div className="px-4 sm:px-6 pb-4 sm:pb-6">
|
<div className="px-4 sm:px-6 pb-4 sm:pb-6">
|
||||||
{!isLoading && !hasDocumentSummaryLLM ? (
|
{!isLoading && !hasDocumentSummaryLLM ? (
|
||||||
<div className="mb-4">
|
<div className="mb-4">
|
||||||
<Alert variant="destructive">
|
<Alert variant="warning">
|
||||||
<AlertTriangle />
|
<AlertTriangle />
|
||||||
<AlertTitle>LLM Configuration Required</AlertTitle>
|
<AlertTitle>LLM Configuration Required</AlertTitle>
|
||||||
<AlertDescription>
|
<AlertDescription>
|
||||||
|
|
@ -159,13 +159,12 @@ const DocumentUploadPopupContent: FC<{
|
||||||
</p>
|
</p>
|
||||||
<Button
|
<Button
|
||||||
size="sm"
|
size="sm"
|
||||||
variant="outline"
|
variant="secondary"
|
||||||
onClick={() => {
|
onClick={() => {
|
||||||
onOpenChange(false);
|
onOpenChange(false);
|
||||||
router.push(`/dashboard/${searchSpaceId}/search-space-settings/models`);
|
router.push(`/dashboard/${searchSpaceId}/search-space-settings/models`);
|
||||||
}}
|
}}
|
||||||
>
|
>
|
||||||
<Settings className="mr-2 h-4 w-4" />
|
|
||||||
Go to Settings
|
Go to Settings
|
||||||
</Button>
|
</Button>
|
||||||
</AlertDescription>
|
</AlertDescription>
|
||||||
|
|
|
||||||
|
|
@ -2,14 +2,9 @@
|
||||||
|
|
||||||
import { Settings, Trash2, Users } from "lucide-react";
|
import { Settings, Trash2, Users } from "lucide-react";
|
||||||
import { useTranslations } from "next-intl";
|
import { useTranslations } from "next-intl";
|
||||||
|
import type { MouseEvent } from "react";
|
||||||
import { useCallback, useRef, useState } from "react";
|
import { useCallback, useRef, useState } from "react";
|
||||||
import { Button } from "@/components/ui/button";
|
import { Button } from "@/components/ui/button";
|
||||||
import {
|
|
||||||
ContextMenu,
|
|
||||||
ContextMenuContent,
|
|
||||||
ContextMenuItem,
|
|
||||||
ContextMenuTrigger,
|
|
||||||
} from "@/components/ui/context-menu";
|
|
||||||
import {
|
import {
|
||||||
DropdownMenu,
|
DropdownMenu,
|
||||||
DropdownMenuContent,
|
DropdownMenuContent,
|
||||||
|
|
@ -80,19 +75,28 @@ export function SearchSpaceAvatar({
|
||||||
const initials = getInitials(name);
|
const initials = getInitials(name);
|
||||||
const sizeClasses = size === "sm" ? "h-8 w-8 text-xs" : "h-10 w-10 text-sm";
|
const sizeClasses = size === "sm" ? "h-8 w-8 text-xs" : "h-10 w-10 text-sm";
|
||||||
|
|
||||||
// Long-press state for mobile
|
const [menuOpen, setMenuOpen] = useState(false);
|
||||||
const [longPressMenuOpen, setLongPressMenuOpen] = useState(false);
|
|
||||||
const longPressTimer = useRef<ReturnType<typeof setTimeout> | null>(null);
|
const longPressTimer = useRef<ReturnType<typeof setTimeout> | null>(null);
|
||||||
const touchMoved = useRef(false);
|
const touchMoved = useRef(false);
|
||||||
|
|
||||||
|
const openMenu = useCallback(() => {
|
||||||
|
setMenuOpen(true);
|
||||||
|
}, []);
|
||||||
|
|
||||||
|
const handleContextMenu = useCallback((event: MouseEvent<HTMLButtonElement>) => {
|
||||||
|
event.preventDefault();
|
||||||
|
event.stopPropagation();
|
||||||
|
setMenuOpen(true);
|
||||||
|
}, []);
|
||||||
|
|
||||||
const handleTouchStart = useCallback(() => {
|
const handleTouchStart = useCallback(() => {
|
||||||
touchMoved.current = false;
|
touchMoved.current = false;
|
||||||
longPressTimer.current = setTimeout(() => {
|
longPressTimer.current = setTimeout(() => {
|
||||||
if (!touchMoved.current) {
|
if (!touchMoved.current) {
|
||||||
setLongPressMenuOpen(true);
|
openMenu();
|
||||||
}
|
}
|
||||||
}, 500);
|
}, 500);
|
||||||
}, []);
|
}, [openMenu]);
|
||||||
|
|
||||||
const handleTouchMove = useCallback(() => {
|
const handleTouchMove = useCallback(() => {
|
||||||
touchMoved.current = true;
|
touchMoved.current = true;
|
||||||
|
|
@ -120,12 +124,26 @@ export function SearchSpaceAvatar({
|
||||||
</div>
|
</div>
|
||||||
);
|
);
|
||||||
|
|
||||||
const avatarButton = (
|
const avatarButton = (withMenuHandlers = false) => (
|
||||||
<Button
|
<Button
|
||||||
type="button"
|
type="button"
|
||||||
variant="ghost"
|
variant="ghost"
|
||||||
size="icon"
|
size="icon"
|
||||||
onClick={onClick}
|
onClick={onClick}
|
||||||
|
onPointerDown={
|
||||||
|
withMenuHandlers
|
||||||
|
? (event) => {
|
||||||
|
if (event.button === 0) {
|
||||||
|
event.preventDefault();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
: undefined
|
||||||
|
}
|
||||||
|
onContextMenu={withMenuHandlers ? handleContextMenu : undefined}
|
||||||
|
onTouchStart={withMenuHandlers ? handleTouchStart : undefined}
|
||||||
|
onTouchMove={withMenuHandlers ? handleTouchMove : undefined}
|
||||||
|
onTouchEnd={withMenuHandlers ? handleTouchEnd : undefined}
|
||||||
|
onTouchCancel={withMenuHandlers ? handleTouchEnd : undefined}
|
||||||
className={cn(
|
className={cn(
|
||||||
"relative rounded-lg font-semibold text-white transition-all select-none",
|
"relative rounded-lg font-semibold text-white transition-all select-none",
|
||||||
"hover:text-white hover:opacity-90 focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-ring",
|
"hover:text-white hover:opacity-90 focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-ring",
|
||||||
|
|
@ -173,73 +191,41 @@ export function SearchSpaceAvatar({
|
||||||
</>
|
</>
|
||||||
);
|
);
|
||||||
|
|
||||||
// If delete or settings handlers are provided, wrap with context menu
|
// If delete or settings handlers are provided, expose them through a dropdown menu.
|
||||||
if (onDelete || onSettings) {
|
if (onDelete || onSettings) {
|
||||||
// Mobile: use long-press triggered DropdownMenu
|
const trigger = (
|
||||||
if (disableTooltip) {
|
|
||||||
return (
|
|
||||||
<DropdownMenu open={longPressMenuOpen} onOpenChange={setLongPressMenuOpen}>
|
|
||||||
<DropdownMenuTrigger asChild>
|
<DropdownMenuTrigger asChild>
|
||||||
<div
|
{avatarButton(true)}
|
||||||
className="inline-block"
|
|
||||||
onTouchStart={handleTouchStart}
|
|
||||||
onTouchMove={handleTouchMove}
|
|
||||||
onTouchEnd={handleTouchEnd}
|
|
||||||
onTouchCancel={handleTouchEnd}
|
|
||||||
>
|
|
||||||
{avatarButton}
|
|
||||||
</div>
|
|
||||||
</DropdownMenuTrigger>
|
</DropdownMenuTrigger>
|
||||||
<DropdownMenuContent>{menuItems}</DropdownMenuContent>
|
|
||||||
</DropdownMenu>
|
|
||||||
);
|
);
|
||||||
}
|
|
||||||
|
|
||||||
// Desktop: use right-click ContextMenu + Tooltip
|
|
||||||
return (
|
return (
|
||||||
<ContextMenu>
|
<DropdownMenu open={menuOpen} onOpenChange={setMenuOpen}>
|
||||||
|
{disableTooltip ? (
|
||||||
|
trigger
|
||||||
|
) : (
|
||||||
<Tooltip>
|
<Tooltip>
|
||||||
<TooltipTrigger asChild>
|
<TooltipTrigger asChild>{trigger}</TooltipTrigger>
|
||||||
<ContextMenuTrigger asChild>
|
|
||||||
<div className="inline-block">{avatarButton}</div>
|
|
||||||
</ContextMenuTrigger>
|
|
||||||
</TooltipTrigger>
|
|
||||||
<TooltipContent side="right" sideOffset={8}>
|
<TooltipContent side="right" sideOffset={8}>
|
||||||
{tooltipContent}
|
{tooltipContent}
|
||||||
</TooltipContent>
|
</TooltipContent>
|
||||||
</Tooltip>
|
</Tooltip>
|
||||||
<ContextMenuContent>
|
|
||||||
{onSettings && (
|
|
||||||
<ContextMenuItem onClick={onSettings}>
|
|
||||||
<Settings className="mr-2 h-4 w-4" />
|
|
||||||
{tCommon("settings")}
|
|
||||||
</ContextMenuItem>
|
|
||||||
)}
|
)}
|
||||||
{onDelete && isOwner && (
|
<DropdownMenuContent side="right" align="start">
|
||||||
<ContextMenuItem onClick={onDelete}>
|
{menuItems}
|
||||||
<Trash2 className="mr-2 h-4 w-4" />
|
</DropdownMenuContent>
|
||||||
{tCommon("delete")}
|
</DropdownMenu>
|
||||||
</ContextMenuItem>
|
|
||||||
)}
|
|
||||||
{onDelete && !isOwner && (
|
|
||||||
<ContextMenuItem onClick={onDelete}>
|
|
||||||
<Trash2 className="mr-2 h-4 w-4" />
|
|
||||||
{t("leave")}
|
|
||||||
</ContextMenuItem>
|
|
||||||
)}
|
|
||||||
</ContextMenuContent>
|
|
||||||
</ContextMenu>
|
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
// No context menu needed
|
// No context menu needed
|
||||||
if (disableTooltip) {
|
if (disableTooltip) {
|
||||||
return avatarButton;
|
return avatarButton();
|
||||||
}
|
}
|
||||||
|
|
||||||
return (
|
return (
|
||||||
<Tooltip>
|
<Tooltip>
|
||||||
<TooltipTrigger asChild>{avatarButton}</TooltipTrigger>
|
<TooltipTrigger asChild>{avatarButton()}</TooltipTrigger>
|
||||||
<TooltipContent side="right" sideOffset={8}>
|
<TooltipContent side="right" sideOffset={8}>
|
||||||
{tooltipContent}
|
{tooltipContent}
|
||||||
</TooltipContent>
|
</TooltipContent>
|
||||||
|
|
|
||||||
|
|
@ -14,7 +14,7 @@ const buttonVariants = cva(
|
||||||
"bg-destructive text-white shadow-xs hover:bg-destructive/90 focus-visible:ring-destructive/20 dark:focus-visible:ring-destructive/40",
|
"bg-destructive text-white shadow-xs hover:bg-destructive/90 focus-visible:ring-destructive/20 dark:focus-visible:ring-destructive/40",
|
||||||
outline:
|
outline:
|
||||||
"border border-input bg-background shadow-xs hover:bg-accent hover:text-accent-foreground",
|
"border border-input bg-background shadow-xs hover:bg-accent hover:text-accent-foreground",
|
||||||
secondary: "bg-secondary text-secondary-foreground shadow-xs hover:bg-secondary/80",
|
secondary: "bg-accent text-accent-foreground shadow-xs hover:bg-accent/80",
|
||||||
ghost:
|
ghost:
|
||||||
"hover:bg-accent hover:text-accent-foreground focus-visible:ring-0 focus-visible:ring-offset-0",
|
"hover:bg-accent hover:text-accent-foreground focus-visible:ring-0 focus-visible:ring-offset-0",
|
||||||
link: "text-primary underline-offset-4 hover:underline",
|
link: "text-primary underline-offset-4 hover:underline",
|
||||||
|
|
|
||||||
|
|
@ -13,6 +13,7 @@
|
||||||
"how-to",
|
"how-to",
|
||||||
"---Developers---",
|
"---Developers---",
|
||||||
"testing",
|
"testing",
|
||||||
|
"observability",
|
||||||
"code-of-conduct"
|
"code-of-conduct"
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
|
||||||
167
surfsense_web/content/docs/observability.mdx
Normal file
167
surfsense_web/content/docs/observability.mdx
Normal file
|
|
@ -0,0 +1,167 @@
|
||||||
|
---
|
||||||
|
title: Observability
|
||||||
|
description: Configure backend traces and metrics for SurfSense
|
||||||
|
icon: Radar
|
||||||
|
---
|
||||||
|
|
||||||
|
SurfSense uses OpenTelemetry for backend traces and metrics. Application logs
|
||||||
|
include trace and span IDs so you can correlate logs with traces, but logs stay
|
||||||
|
on the normal container stderr path.
|
||||||
|
|
||||||
|
## Enable Locally
|
||||||
|
|
||||||
|
The development compose file reads backend settings from
|
||||||
|
`surfsense_backend/.env`. Add these values there:
|
||||||
|
|
||||||
|
```dotenv
|
||||||
|
SURFSENSE_ENABLE_OTEL=true
|
||||||
|
SURFSENSE_ENV=dev
|
||||||
|
OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-lgtm:4317
|
||||||
|
OTEL_EXPORTER_OTLP_PROTOCOL=grpc
|
||||||
|
OTEL_RESOURCE_ATTRIBUTES=service.namespace=surfsense
|
||||||
|
OTEL_METRIC_EXPORT_INTERVAL=300000
|
||||||
|
```
|
||||||
|
|
||||||
|
Then start the development stack with the local LGTM backend:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker compose -f docker/docker-compose.dev.yml up --build
|
||||||
|
```
|
||||||
|
|
||||||
|
Grafana is exposed on `http://localhost:3001` by default.
|
||||||
|
|
||||||
|
## Enable in Production Docker Compose
|
||||||
|
|
||||||
|
Production Docker Compose reads backend and collector settings from
|
||||||
|
`docker/.env`. The API and Celery worker export telemetry to the bundled
|
||||||
|
collector at `otel-collector:4317`; the collector is the only service that uses
|
||||||
|
the Grafana Cloud credentials.
|
||||||
|
|
||||||
|
Add these values to `docker/.env`:
|
||||||
|
|
||||||
|
```dotenv
|
||||||
|
SURFSENSE_ENV=production
|
||||||
|
SURFSENSE_ENABLE_OTEL=true
|
||||||
|
OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4317
|
||||||
|
OTEL_EXPORTER_OTLP_PROTOCOL=grpc
|
||||||
|
OTEL_RESOURCE_ATTRIBUTES=service.namespace=surfsense
|
||||||
|
OTEL_METRIC_EXPORT_INTERVAL=300000
|
||||||
|
|
||||||
|
GRAFANA_CLOUD_OTLP_ENDPOINT=https://otlp-gateway-<region>.grafana.net/otlp
|
||||||
|
GRAFANA_CLOUD_INSTANCE_ID=<stack instance id>
|
||||||
|
GRAFANA_CLOUD_API_KEY=<cloud access policy token>
|
||||||
|
```
|
||||||
|
|
||||||
|
Then start the stack:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker compose -f docker/docker-compose.yml --profile observability up -d
|
||||||
|
```
|
||||||
|
|
||||||
|
The collector receives OTLP on `otel-collector:4317`, scrubs sensitive span
|
||||||
|
attributes, applies the configured tail-sampling policy, batches exports,
|
||||||
|
retries failures, and forwards traces and metrics to Grafana Cloud over OTLP
|
||||||
|
HTTP.
|
||||||
|
|
||||||
|
When deploying `surfsense_backend/Dockerfile` directly instead of production
|
||||||
|
compose, use the same split: SurfSense containers export to a collector, and
|
||||||
|
the collector owns the Grafana Cloud credentials.
|
||||||
|
|
||||||
|
## Automatic Traces
|
||||||
|
|
||||||
|
When OpenTelemetry is enabled, the backend instruments:
|
||||||
|
|
||||||
|
- FastAPI inbound requests.
|
||||||
|
- SQLAlchemy queries from the main async engine and Celery task engine.
|
||||||
|
- Raw psycopg calls used by the LangGraph checkpointer.
|
||||||
|
- Redis commands.
|
||||||
|
- HTTPX outbound requests.
|
||||||
|
- Celery producer and worker execution.
|
||||||
|
|
||||||
|
## Manual Spans
|
||||||
|
|
||||||
|
SurfSense keeps project-specific spans behind `app.observability.otel`:
|
||||||
|
|
||||||
|
- `model.call`
|
||||||
|
- `tool.call`
|
||||||
|
- `chat.request`
|
||||||
|
- `kb.search`
|
||||||
|
- `kb.persist`
|
||||||
|
- `connector.sync`
|
||||||
|
- `subagent.invoke`
|
||||||
|
- `etl.extract`
|
||||||
|
- `etl.parse`
|
||||||
|
- `etl.ocr`
|
||||||
|
- `etl.picture.describe`
|
||||||
|
- `etl.picture.ocr`
|
||||||
|
- `compaction.run`
|
||||||
|
- `permission.asked`
|
||||||
|
- `interrupt.raised`
|
||||||
|
|
||||||
|
Keep span names and attributes low-cardinality. Do not attach user content,
|
||||||
|
prompts, document titles, file paths, user-specific URLs, secrets, or raw
|
||||||
|
queries as span attributes.
|
||||||
|
|
||||||
|
## Metrics
|
||||||
|
|
||||||
|
The OpenTelemetry instrumentors provide HTTP, HTTPX, and Celery runtime
|
||||||
|
metrics. SurfSense adds these project metrics from `app.observability.metrics`:
|
||||||
|
|
||||||
|
- `surfsense.model.call.duration`
|
||||||
|
- `gen_ai.client.token.usage`
|
||||||
|
- `surfsense.tool.call.duration`
|
||||||
|
- `surfsense.tool.call.errors`
|
||||||
|
- `surfsense.chat.request.duration`
|
||||||
|
- `surfsense.chat.request.outcome`
|
||||||
|
- `surfsense.kb.search.duration`
|
||||||
|
- `surfsense.compaction.runs`
|
||||||
|
- `surfsense.permission.asks`
|
||||||
|
- `surfsense.interrupt.raised`
|
||||||
|
- `surfsense.indexing.document.duration`
|
||||||
|
- `surfsense.indexing.document.outcome`
|
||||||
|
- `surfsense.connector.sync.duration`
|
||||||
|
- `surfsense.connector.sync.outcome`
|
||||||
|
- `surfsense.subagent.invoke.duration`
|
||||||
|
- `surfsense.subagent.invoke.outcome`
|
||||||
|
- `surfsense.etl.extract.duration`
|
||||||
|
- `surfsense.etl.extract.outcome`
|
||||||
|
- `surfsense.celery.heartbeat.refreshes`
|
||||||
|
- `surfsense.celery.heartbeat.failures`
|
||||||
|
- `surfsense.celery.queue.latency`
|
||||||
|
- `surfsense.auth.failures`
|
||||||
|
- `surfsense.rate_limit.rejections`
|
||||||
|
- `surfsense.perf.elapsed_ms`
|
||||||
|
|
||||||
|
Runtime gauges include process RSS, CPU utilization, threads, open file
|
||||||
|
descriptors, asyncio tasks, and CPython GC counters.
|
||||||
|
|
||||||
|
## Logs
|
||||||
|
|
||||||
|
`LoggingInstrumentor().instrument()` injects `otelTraceID` and `otelSpanID` into
|
||||||
|
standard Python `LogRecord`s. The root log format writes them as
|
||||||
|
`trace_id=... span_id=...`.
|
||||||
|
|
||||||
|
SurfSense intentionally does not create an OpenTelemetry `LoggerProvider`,
|
||||||
|
`LoggingHandler`, or `OTLPLogExporter`. Container stderr remains the log
|
||||||
|
transport.
|
||||||
|
|
||||||
|
## Verification
|
||||||
|
|
||||||
|
1. Hit a FastAPI endpoint and confirm an inbound server span appears in Grafana.
|
||||||
|
2. Run a chat request and confirm `model.call` and `tool.call` child spans.
|
||||||
|
3. Run a knowledge-base search and confirm `kb.search` spans and SQL child spans.
|
||||||
|
4. Run connector indexing and confirm Celery producer/worker spans share a trace
|
||||||
|
ID and connector sync metrics increment.
|
||||||
|
5. Confirm `gen_ai.client.token.usage`, model/tool durations, request duration,
|
||||||
|
Celery runtime, and runtime gauges appear within one export interval (`OTEL_METRIC_EXPORT_INTERVAL`, in milliseconds; 300000 ms = 5 minutes in the examples above).
|
||||||
|
6. Confirm logs emitted inside a traced request show non-zero trace and span IDs.
|
||||||
|
|
||||||
|
## Out Of Scope
|
||||||
|
|
||||||
|
- Frontend/browser OpenTelemetry.
|
||||||
|
- OpenTelemetry log export.
|
||||||
|
- Profiling.
|
||||||
|
- Production backend selection.
|
||||||
|
- Tail-sampling collector configuration.
|
||||||
|
- Replacing LangSmith.
|
||||||
|
- Vendor SDKs.
|
||||||
|
|
@ -7,6 +7,7 @@ import {
|
||||||
Download,
|
Download,
|
||||||
FlaskConical,
|
FlaskConical,
|
||||||
Heart,
|
Heart,
|
||||||
|
Radar,
|
||||||
Unplug,
|
Unplug,
|
||||||
Wrench,
|
Wrench,
|
||||||
} from "lucide-react";
|
} from "lucide-react";
|
||||||
|
|
@ -26,6 +27,7 @@ const DOCS_ICONS: Record<string, React.ComponentType> = {
|
||||||
Download,
|
Download,
|
||||||
FlaskConical,
|
FlaskConical,
|
||||||
Heart,
|
Heart,
|
||||||
|
Radar,
|
||||||
Unplug,
|
Unplug,
|
||||||
Wrench,
|
Wrench,
|
||||||
};
|
};
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue