2026-04-05 17:25:25 +05:30
|
|
|
from app.config import config as app_config
|
|
|
|
|
from app.etl_pipeline.etl_document import EtlRequest, EtlResult
|
2026-04-07 05:55:39 +05:30
|
|
|
from app.etl_pipeline.exceptions import (
|
|
|
|
|
EtlServiceUnavailableError,
|
|
|
|
|
EtlUnsupportedFileError,
|
|
|
|
|
)
|
2026-04-05 17:25:25 +05:30
|
|
|
from app.etl_pipeline.file_classifier import FileCategory, classify_file
|
|
|
|
|
from app.etl_pipeline.parsers.audio import transcribe_audio
|
|
|
|
|
from app.etl_pipeline.parsers.direct_convert import convert_file_directly
|
|
|
|
|
from app.etl_pipeline.parsers.plaintext import read_plaintext
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class EtlPipelineService:
    """Extract markdown from a file by routing it to the matching parser.

    ``extract`` is the entry point: it classifies the file by name and
    dispatches to the plaintext, direct-convert, or audio parser, handing
    every other supported category to the configured document ETL service.
    """

    async def extract(self, request: EtlRequest) -> EtlResult:
        """Classify ``request.filename`` and run the parser for its category.

        Args:
            request: Supplies the ``filename`` used for classification and
                the ``file_path`` passed to the selected parser.

        Returns:
            An ``EtlResult`` holding the extracted content together with the
            ``etl_service`` and ``content_type`` labels of the parser used.

        Raises:
            EtlUnsupportedFileError: If the file is classified as
                ``FileCategory.UNSUPPORTED``.

        Errors raised by ``_extract_document`` propagate unchanged.
        """
        file_kind = classify_file(request.filename)

        if file_kind == FileCategory.UNSUPPORTED:
            raise EtlUnsupportedFileError(
                f"File type not supported for parsing: {request.filename}"
            )

        if file_kind == FileCategory.PLAINTEXT:
            service_label, content_label = "PLAINTEXT", "plaintext"
            markdown = read_plaintext(request.file_path)
        elif file_kind == FileCategory.DIRECT_CONVERT:
            service_label, content_label = "DIRECT_CONVERT", "direct_convert"
            markdown = convert_file_directly(request.file_path, request.filename)
        elif file_kind == FileCategory.AUDIO:
            service_label, content_label = "AUDIO", "audio"
            markdown = await transcribe_audio(request.file_path, request.filename)
        else:
            # Every remaining category goes through the configured document
            # ETL service.
            return await self._extract_document(request)

        return EtlResult(
            markdown_content=markdown,
            etl_service=service_label,
            content_type=content_label,
        )

    async def _extract_document(self, request: EtlRequest) -> EtlResult:
        """Parse a document with the service named by ``app_config.ETL_SERVICE``.

        Args:
            request: Supplies ``filename`` (for the extension check),
                ``file_path``, and ``estimated_pages`` (LLAMACLOUD only).

        Returns:
            An ``EtlResult`` whose ``etl_service`` is the configured service
            value and whose ``content_type`` is ``"document"``.

        Raises:
            EtlServiceUnavailableError: If ``ETL_SERVICE`` is falsy, or is not
                one of DOCLING, UNSTRUCTURED, LLAMACLOUD, or AZURE_DI.
            EtlUnsupportedFileError: If the lower-cased file extension is not
                among those reported for the configured service.
        """
        from pathlib import PurePosixPath

        from app.utils.file_extensions import get_document_extensions_for_service

        service_name = app_config.ETL_SERVICE
        if not service_name:
            raise EtlServiceUnavailableError(
                "No ETL_SERVICE configured. "
                "Set ETL_SERVICE to UNSTRUCTURED, LLAMACLOUD, DOCLING, or AZURE_DI in your .env"
            )

        # Extension check runs before any parser module is imported.
        suffix = PurePosixPath(request.filename).suffix.lower()
        if suffix not in get_document_extensions_for_service(service_name):
            raise EtlUnsupportedFileError(
                f"File type {suffix} is not supported by {service_name}"
            )

        # Each parser is imported inside its own branch, so only the module
        # for the selected service is imported.
        if service_name == "DOCLING":
            from app.etl_pipeline.parsers.docling import parse_with_docling

            markdown = await parse_with_docling(request.file_path, request.filename)
        elif service_name == "UNSTRUCTURED":
            from app.etl_pipeline.parsers.unstructured import parse_with_unstructured

            markdown = await parse_with_unstructured(request.file_path)
        elif service_name == "LLAMACLOUD":
            from app.etl_pipeline.parsers.llamacloud import parse_with_llamacloud

            markdown = await parse_with_llamacloud(
                request.file_path,
                request.estimated_pages,
            )
        elif service_name == "AZURE_DI":
            from app.etl_pipeline.parsers.azure_doc_intelligence import (
                parse_with_azure_doc_intelligence,
            )

            markdown = await parse_with_azure_doc_intelligence(request.file_path)
        else:
            raise EtlServiceUnavailableError(f"Unknown ETL_SERVICE: {service_name}")

        return EtlResult(
            markdown_content=markdown,
            etl_service=service_name,
            content_type="document",
        )
|