From bd6e335cb3d0f227bc59e70877297b1ea8221a10 Mon Sep 17 00:00:00 2001 From: Anish Sarkar <104695310+AnishSarkar22@users.noreply.github.com> Date: Thu, 26 Mar 2026 23:10:49 +0530 Subject: [PATCH] feat: enhance performance logging in indexing pipeline - Added performance logging to the `index_batch_parallel` method, capturing metrics for document indexing duration and concurrency. - Introduced timing measurements for both the overall indexing process and the parallel document gathering phase, improving observability of the indexing workflow. - Updated logging statements to provide detailed insights into the number of documents processed, indexed, and failed during the indexing operation. --- .../indexing_pipeline_service.py | 21 +++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py b/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py index bd6086892..9a945dd25 100644 --- a/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py +++ b/surfsense_backend/app/indexing_pipeline/indexing_pipeline_service.py @@ -346,6 +346,8 @@ class IndexingPipelineService: bounded by a semaphore to avoid overwhelming APIs/DB. """ logger = logging.getLogger(__name__) + perf = get_perf_logger() + t_total = time.perf_counter() doc_map = { compute_unique_identifier_hash(cd): cd for cd in connector_docs @@ -422,7 +424,17 @@ class IndexingPipelineService: return exc tasks = [_index_one(doc) for doc in documents] + t_parallel = time.perf_counter() outcomes = await asyncio.gather(*tasks, return_exceptions=True) + perf.info( + "[indexing] index_batch_parallel gather docs=%d concurrency=%d " + "indexed=%d failed=%d in %.3fs", + len(documents), + max_concurrency, + indexed_count, + failed_count, + time.perf_counter() - t_parallel, + ) for outcome in outcomes: if isinstance(outcome, Document): @@ -430,4 +442,13 @@ class IndexingPipelineService: elif isinstance(outcome, Exception): pass + perf.info( + "[indexing] index_batch_parallel TOTAL input=%d prepared=%d " + "indexed=%d failed=%d in %.3fs", + len(connector_docs), + len(documents), + indexed_count, + failed_count, + time.perf_counter() - t_total, + ) return results, indexed_count, failed_count