mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-08 23:32:40 +02:00
feat: add parsers for Docling, LlamaCloud, and Unstructured to ETL pipeline
This commit is contained in:
parent
2824410be2
commit
f40de6b695
3 changed files with 169 additions and 0 deletions
14
surfsense_backend/app/etl_pipeline/parsers/unstructured.py
Normal file
14
surfsense_backend/app/etl_pipeline/parsers/unstructured.py
Normal file
|
|
@ -0,0 +1,14 @@
|
|||
async def parse_with_unstructured(file_path: str) -> str:
|
||||
from langchain_unstructured import UnstructuredLoader
|
||||
|
||||
loader = UnstructuredLoader(
|
||||
file_path,
|
||||
mode="elements",
|
||||
post_processors=[],
|
||||
languages=["eng"],
|
||||
include_orig_elements=False,
|
||||
include_metadata=False,
|
||||
strategy="auto",
|
||||
)
|
||||
docs = await loader.aload()
|
||||
return "\n\n".join(doc.page_content for doc in docs if doc.page_content)
|
||||
Loading…
Add table
Add a link
Reference in a new issue