feat: add parsers for Docling, LlamaCloud, and Unstructured to ETL pipeline

This commit is contained in:
Anish Sarkar 2026-04-05 17:27:24 +05:30
parent 2824410be2
commit f40de6b695
3 changed files with 169 additions and 0 deletions

View file

@@ -0,0 +1,26 @@
import warnings
from logging import ERROR, getLogger
async def parse_with_docling(file_path: str, filename: str) -> str:
    """Run a document through the Docling service and return its content.

    While the service is processing, pdfminer-related warnings are filtered
    out and the ``pdfminer`` logger is set to ``ERROR``. The logger's previous
    level is put back afterwards, even when processing raises.

    Args:
        file_path: Forwarded unchanged to the service's ``process_document``.
        filename: Forwarded unchanged to the service's ``process_document``.

    Returns:
        The ``"content"`` entry of the result produced by ``process_document``.
    """
    # Resolved at call time rather than at module import time.
    from app.services.docling_service import create_docling_service

    service = create_docling_service()

    pdf_log = getLogger("pdfminer")
    previous_level = pdf_log.level

    # Message patterns of warnings that are silenced for the duration of the
    # parse; registered in this order after the module-based filter.
    ignored_messages = (
        ".*Cannot set gray non-stroke color.*",
        ".*invalid float value.*",
    )

    # catch_warnings() restores the warning filters when the block exits.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=UserWarning, module="pdfminer")
        for pattern in ignored_messages:
            warnings.filterwarnings("ignore", message=pattern)

        pdf_log.setLevel(ERROR)
        try:
            parsed = await service.process_document(file_path, filename)
        finally:
            # Always undo the temporary log-level change.
            pdf_log.setLevel(previous_level)

    return parsed["content"]