feat(etl): instrument extraction spans and outcomes

This commit is contained in:
Anish Sarkar 2026-05-22 13:49:42 +05:30
parent 8bca29fe0d
commit 4e3a6dff46

View file

@ -1,4 +1,7 @@
import contextlib
import logging import logging
import time
from pathlib import PurePosixPath
from app.config import config as app_config from app.config import config as app_config
from app.etl_pipeline.etl_document import EtlRequest, EtlResult from app.etl_pipeline.etl_document import EtlRequest, EtlResult
@ -10,6 +13,11 @@ from app.etl_pipeline.file_classifier import FileCategory, classify_file
from app.etl_pipeline.parsers.audio import transcribe_audio from app.etl_pipeline.parsers.audio import transcribe_audio
from app.etl_pipeline.parsers.direct_convert import convert_file_directly from app.etl_pipeline.parsers.direct_convert import convert_file_directly
from app.etl_pipeline.parsers.plaintext import read_plaintext from app.etl_pipeline.parsers.plaintext import read_plaintext
from app.observability import metrics as ot_metrics, otel as ot
def _file_extension(filename: str) -> str:
return PurePosixPath(filename).suffix.lower() or "none"
class EtlPipelineService: class EtlPipelineService:
@ -20,49 +28,88 @@ class EtlPipelineService:
async def extract(self, request: EtlRequest) -> EtlResult:
    """Extract markdown content from ``request`` and record telemetry.

    The file is classified by name and routed to the matching parser:
    plaintext reading, direct conversion, audio transcription, image
    extraction, or document extraction for every remaining category.
    The whole call runs inside an ETL extract span, and a duration metric
    plus an outcome metric are recorded on every exit path.

    Args:
        request: The extraction request (file path, filename and
            processing mode are read here).

    Returns:
        The ``EtlResult`` produced by the selected parser.

    Raises:
        EtlUnsupportedFileError: If the file is classified as unsupported.
        BaseException: Anything raised by the selected parser is re-raised
            unchanged after the outcome has been marked as an error.
    """
    category = classify_file(request.filename)
    start = time.perf_counter()
    status = "success"
    result: EtlResult | None = None

    with ot.etl_extract_span(
        content_type=category.value,
        file_extension=_file_extension(request.filename),
        processing_mode=request.processing_mode.value,
    ) as sp:
        try:
            if category == FileCategory.UNSUPPORTED:
                raise EtlUnsupportedFileError(
                    f"File type not supported for parsing: {request.filename}"
                )

            if category == FileCategory.PLAINTEXT:
                content = read_plaintext(request.file_path)
                result = EtlResult(
                    markdown_content=content,
                    etl_service="PLAINTEXT",
                    content_type="plaintext",
                )
            elif category == FileCategory.DIRECT_CONVERT:
                content = convert_file_directly(
                    request.file_path, request.filename
                )
                result = EtlResult(
                    markdown_content=content,
                    etl_service="DIRECT_CONVERT",
                    content_type="direct_convert",
                )
            elif category == FileCategory.AUDIO:
                content = await transcribe_audio(
                    request.file_path, request.filename
                )
                result = EtlResult(
                    markdown_content=content,
                    etl_service="AUDIO",
                    content_type="audio",
                )
            elif category == FileCategory.IMAGE:
                result = await self._extract_image(request)
            else:
                result = await self._extract_document(request)

            return result
        except BaseException:
            # BaseException rather than Exception: asyncio.CancelledError
            # does not derive from Exception, so a cancelled extraction
            # would otherwise be reported with status "success".
            status = "error"
            raise
        finally:
            # Telemetry is best-effort; it must never mask the real result
            # or the real exception of the extraction itself.
            with contextlib.suppress(Exception):
                has_result = result is not None
                etl_service = result.etl_service if has_result else None
                content_type = (
                    result.content_type if has_result else category.value
                )
                if has_result:
                    sp.set_attribute("etl.service", result.etl_service)
                    sp.set_attribute("content.type", result.content_type)
                sp.set_attribute("etl.status", status)
                ot_metrics.record_etl_extract_duration(
                    time.perf_counter() - start,
                    etl_service=etl_service,
                    content_type=content_type,
                    status=status,
                )
                ot_metrics.record_etl_extract_outcome(
                    etl_service=etl_service,
                    content_type=content_type,
                    status=status,
                )
async def _extract_image(self, request: EtlRequest) -> EtlResult: async def _extract_image(self, request: EtlRequest) -> EtlResult:
if self._vision_llm: if self._vision_llm:
try: try:
from app.etl_pipeline.parsers.vision_llm import parse_with_vision_llm from app.etl_pipeline.parsers.vision_llm import parse_with_vision_llm
content = await parse_with_vision_llm( with ot.etl_parse_span(
request.file_path, request.filename, self._vision_llm etl_service="VISION_LLM",
) content_type="image",
file_extension=_file_extension(request.filename),
) as sp:
content = await parse_with_vision_llm(
request.file_path, request.filename, self._vision_llm
)
sp.set_attribute("etl.status", "success")
return EtlResult( return EtlResult(
markdown_content=content, markdown_content=content,
etl_service="VISION_LLM", etl_service="VISION_LLM",
@ -94,7 +141,11 @@ class EtlPipelineService:
) )
try: try:
return await self._extract_document(request) with ot.etl_ocr_span(
etl_service=app_config.ETL_SERVICE,
file_extension=_file_extension(request.filename),
):
return await self._extract_document(request)
except (EtlUnsupportedFileError, EtlServiceUnavailableError): except (EtlUnsupportedFileError, EtlServiceUnavailableError):
raise EtlUnsupportedFileError( raise EtlUnsupportedFileError(
f"Cannot process image {request.filename}: vision LLM " f"Cannot process image {request.filename}: vision LLM "
@ -121,18 +172,27 @@ class EtlPipelineService:
f"File type {ext} is not supported by {etl_service}" f"File type {ext} is not supported by {etl_service}"
) )
if etl_service == "DOCLING": with ot.etl_parse_span(
from app.etl_pipeline.parsers.docling import parse_with_docling etl_service=etl_service,
content_type="document",
file_extension=ext,
processing_mode=request.processing_mode.value,
) as sp:
if etl_service == "DOCLING":
from app.etl_pipeline.parsers.docling import parse_with_docling
content = await parse_with_docling(request.file_path, request.filename) content = await parse_with_docling(request.file_path, request.filename)
elif etl_service == "UNSTRUCTURED": elif etl_service == "UNSTRUCTURED":
from app.etl_pipeline.parsers.unstructured import parse_with_unstructured from app.etl_pipeline.parsers.unstructured import (
parse_with_unstructured,
)
content = await parse_with_unstructured(request.file_path) content = await parse_with_unstructured(request.file_path)
elif etl_service == "LLAMACLOUD": elif etl_service == "LLAMACLOUD":
content = await self._extract_with_llamacloud(request) content = await self._extract_with_llamacloud(request)
else: else:
raise EtlServiceUnavailableError(f"Unknown ETL_SERVICE: {etl_service}") raise EtlServiceUnavailableError(f"Unknown ETL_SERVICE: {etl_service}")
sp.set_attribute("etl.status", "success")
# When the operator opts into vision-LLM at ingest, walk the # When the operator opts into vision-LLM at ingest, walk the
# original file's embedded images and append a structured # original file's embedded images and append a structured
@ -171,9 +231,14 @@ class EtlPipelineService:
async def _ocr_image(image_path: str, image_name: str) -> str: async def _ocr_image(image_path: str, image_name: str) -> str:
try: try:
sub = EtlPipelineService(vision_llm=None) sub = EtlPipelineService(vision_llm=None)
ocr_result = await sub.extract( with ot.etl_picture_ocr_span(
EtlRequest(file_path=image_path, filename=image_name) file_extension=_file_extension(image_name)
) ) as sp:
ocr_result = await sub.extract(
EtlRequest(file_path=image_path, filename=image_name)
)
sp.set_attribute("etl.service", ocr_result.etl_service)
sp.set_attribute("etl.status", "success")
except ( except (
EtlUnsupportedFileError, EtlUnsupportedFileError,
EtlServiceUnavailableError, EtlServiceUnavailableError,
@ -186,12 +251,19 @@ class EtlPipelineService:
return ocr_result.markdown_content return ocr_result.markdown_content
try: try:
result = await describe_pictures( with ot.etl_picture_describe_span() as sp:
request.file_path, result = await describe_pictures(
request.filename, request.file_path,
self._vision_llm, request.filename,
ocr_runner=_ocr_image, self._vision_llm,
) ocr_runner=_ocr_image,
)
sp.set_attribute("image.described.count", len(result.descriptions))
sp.set_attribute("image.failed.count", result.failed)
sp.set_attribute("image.skipped.too_small", result.skipped_too_small)
sp.set_attribute("image.skipped.too_large", result.skipped_too_large)
sp.set_attribute("image.skipped.duplicate", result.skipped_duplicate)
sp.set_attribute("etl.status", "success")
except Exception: except Exception:
# Picture description is additive; never let it fail an # Picture description is additive; never let it fail an
# otherwise-successful document extraction. # otherwise-successful document extraction.