From b9d76f006dff959879b190af3c50d3925466d351 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Thu, 21 May 2026 23:03:31 +0530 Subject: [PATCH] feat(retriever): instrument knowledge base search --- .../app/retriever/chunks_hybrid_search.py | 41 +++++++++++++ .../app/retriever/documents_hybrid_search.py | 41 +++++++++++++ .../unit/observability/test_retriever_otel.py | 61 +++++++++++++++++++ 3 files changed, 143 insertions(+) create mode 100644 surfsense_backend/tests/unit/observability/test_retriever_otel.py diff --git a/surfsense_backend/app/retriever/chunks_hybrid_search.py b/surfsense_backend/app/retriever/chunks_hybrid_search.py index e32c6c43d..47f7fe6b1 100644 --- a/surfsense_backend/app/retriever/chunks_hybrid_search.py +++ b/surfsense_backend/app/retriever/chunks_hybrid_search.py @@ -1,13 +1,51 @@ import asyncio import contextlib +import functools import time from datetime import datetime +from app.observability import metrics as ot_metrics, otel as ot from app.utils.perf import get_perf_logger _MAX_FETCH_CHUNKS_PER_DOC = 20 +def _instrument_search(mode: str): + def _decorator(func): + @functools.wraps(func) + async def _wrapper( + self, query_text: str, top_k: int, search_space_id: int, *args, **kwargs + ): + t0 = time.perf_counter() + with ot.kb_search_span( + search_space_id=search_space_id, + query_chars=len(query_text), + extra={"search.surface": "chunks", "search.mode": mode}, + ) as sp: + try: + result = await func( + self, query_text, top_k, search_space_id, *args, **kwargs + ) + except Exception: + ot_metrics.record_kb_search_duration( + (time.perf_counter() - t0) * 1000, + search_space_id=search_space_id, + surface="chunks", + ) + raise + sp.set_attribute("result.count", len(result)) + ot_metrics.record_kb_search_duration( + (time.perf_counter() - t0) * 1000, + search_space_id=search_space_id, + surface="chunks", + ) + return result + + return _wrapper + + return _decorator + + class ChucksHybridSearchRetriever: def __init__(self, db_session): """ @@ -18,6 +56,7 @@ class ChucksHybridSearchRetriever: """ self.db_session = db_session + @_instrument_search("vector") async def vector_search( self, query_text: str, @@ -88,6 +127,7 @@ class ChucksHybridSearchRetriever: return chunks + @_instrument_search("full_text") async def full_text_search( self, query_text: str, @@ -153,6 +193,7 @@ class ChucksHybridSearchRetriever: return chunks + @_instrument_search("hybrid") async def hybrid_search( self, query_text: str, diff --git a/surfsense_backend/app/retriever/documents_hybrid_search.py b/surfsense_backend/app/retriever/documents_hybrid_search.py index 3eabdb004..9ce86d404 100644 --- a/surfsense_backend/app/retriever/documents_hybrid_search.py +++ b/surfsense_backend/app/retriever/documents_hybrid_search.py @@ -1,12 +1,50 @@ import contextlib +import functools import time from datetime import datetime +from app.observability import metrics as ot_metrics, otel as ot from app.utils.perf import get_perf_logger _MAX_FETCH_CHUNKS_PER_DOC = 20 +def _instrument_search(mode: str): + def _decorator(func): + @functools.wraps(func) + async def _wrapper( + self, query_text: str, top_k: int, search_space_id: int, *args, **kwargs + ): + t0 = time.perf_counter() + with ot.kb_search_span( + search_space_id=search_space_id, + query_chars=len(query_text), + extra={"search.surface": "documents", "search.mode": mode}, + ) as sp: + try: + result = await func( + self, query_text, top_k, search_space_id, *args, **kwargs + ) + except Exception: + ot_metrics.record_kb_search_duration( + (time.perf_counter() - t0) * 1000, + search_space_id=search_space_id, + surface="documents", + ) + raise + sp.set_attribute("result.count", len(result)) + ot_metrics.record_kb_search_duration( + (time.perf_counter() - t0) * 1000, + search_space_id=search_space_id, + surface="documents", + ) + return result + + return _wrapper + + return _decorator + + class DocumentHybridSearchRetriever: def __init__(self, db_session): """ @@ -17,6 +55,7 @@ class DocumentHybridSearchRetriever: """ self.db_session = db_session + @_instrument_search("vector") async def vector_search( self, query_text: str, @@ -81,6 +120,7 @@ class DocumentHybridSearchRetriever: return documents + @_instrument_search("full_text") async def full_text_search( self, query_text: str, @@ -145,6 +185,7 @@ class DocumentHybridSearchRetriever: return documents + @_instrument_search("hybrid") async def hybrid_search( self, query_text: str, diff --git a/surfsense_backend/tests/unit/observability/test_retriever_otel.py b/surfsense_backend/tests/unit/observability/test_retriever_otel.py new file mode 100644 index 000000000..9712a3150 --- /dev/null +++ b/surfsense_backend/tests/unit/observability/test_retriever_otel.py @@ -0,0 +1,61 @@ +"""Tests for retriever OTel wrappers.""" + +from __future__ import annotations + +from contextlib import contextmanager +from typing import Any + +import pytest + +from app.retriever.documents_hybrid_search import _instrument_search + +pytestmark = pytest.mark.unit + + +class _Span: + def __init__(self) -> None: + self.attrs: dict[str, Any] = {} + + def set_attribute(self, key: str, value: Any) -> None: + self.attrs[key] = value + + +@contextmanager +def _fake_span(**kwargs): + span = _Span() + span.attrs.update(kwargs) + yield span + + +@pytest.mark.asyncio +async def test_retriever_wrapper_records_one_span_and_metric(monkeypatch) -> None: + calls: list[dict[str, Any]] = [] + + monkeypatch.setattr( + "app.retriever.documents_hybrid_search.ot.kb_search_span", + lambda **kwargs: _fake_span(**kwargs), + ) + monkeypatch.setattr( + "app.retriever.documents_hybrid_search.ot_metrics.record_kb_search_duration", + lambda duration_ms, **attrs: calls.append( + {"duration_ms": duration_ms, **attrs} + ), + ) + + class Retriever: + @_instrument_search("hybrid") + async def search( + self, + query_text: str, + top_k: int, + search_space_id: int, + ) -> list[str]: + del query_text, top_k, search_space_id + return ["doc-1", "doc-2"] + + result = await Retriever().search("hello", 3, 42) + + assert result == ["doc-1", "doc-2"] + assert len(calls) == 1 + assert calls[0]["search_space_id"] == 42 + assert calls[0]["surface"] == "documents"