feat: add parsers for Docling, LlamaCloud, and Unstructured to ETL pipeline

This commit is contained in:
Anish Sarkar 2026-04-05 17:27:24 +05:30
parent 2824410be2
commit f40de6b695
3 changed files with 169 additions and 0 deletions

View file

@ -0,0 +1,14 @@
async def parse_with_unstructured(file_path: str) -> str:
    """Extract the text of a file using ``UnstructuredLoader``.

    The file is loaded asynchronously in ``"elements"`` mode and the
    ``page_content`` of every returned document is collected. Documents
    whose ``page_content`` is empty (falsy) are skipped; the remaining
    contents are joined with a blank line between them.

    Args:
        file_path: Path handed to ``UnstructuredLoader`` as its first
            positional argument.

    Returns:
        The non-empty page contents separated by two newlines, or an
        empty string when no document yields any content.
    """
    # Imported at call time rather than at module level, as in the original
    # code. NOTE(review): presumably so the dependency is only loaded when
    # this parser is actually used -- confirm.
    from langchain_unstructured import UnstructuredLoader

    loader_options = {
        "mode": "elements",
        "post_processors": [],
        "languages": ["eng"],
        "include_orig_elements": False,
        "include_metadata": False,
        "strategy": "auto",
    }
    unstructured_loader = UnstructuredLoader(file_path, **loader_options)
    elements = await unstructured_loader.aload()

    # filter(None, ...) drops falsy contents, so empty elements do not
    # produce stray separators in the joined text.
    contents = [element.page_content for element in elements]
    return "\n\n".join(filter(None, contents))