feat: Add Docling support as ETL_SERVICE option

- Added DOCLING as a third ETL_SERVICE option (alongside UNSTRUCTURED and LLAMACLOUD)
- Implemented add_received_file_document_using_docling function
- Added Docling processing logic in documents_routes.py
- Enhanced chunking with configurable overlap support
- Added comprehensive document processing service
- Supports both CPU and GPU processing, with the mode selected by the user

Addresses #161 - Add Docling Support as an ETL_SERVICE
Follows same pattern as LlamaCloud integration (PR #123)
This commit is contained in:
Abdullah 3li 2025-07-20 11:42:55 +03:00
parent f852bcb188
commit aa00822169
14 changed files with 3125 additions and 2090 deletions

View file

@ -2,7 +2,7 @@ version: '3.8'
services:
frontend:
image: ghcr.io/modsetter/surfsense_ui:latest
build: ./surfsense_web
ports:
- "${FRONTEND_PORT:-3000}:3000"
volumes:
@ -14,7 +14,7 @@ services:
- NEXT_PUBLIC_API_URL=${NEXT_PUBLIC_API_URL:-http://backend:8000}
backend:
image: ghcr.io/modsetter/surfsense_backend:latest
build: ./surfsense_backend
ports:
- "${BACKEND_PORT:-8000}:8000"
volumes:
@ -28,3 +28,15 @@ services:
- PYTHONPATH=/app
- UVICORN_LOOP=asyncio
- UNSTRUCTURED_HAS_PATCHED_LOOP=1
- NVIDIA_VISIBLE_DEVICES=all
- NVIDIA_DRIVER_CAPABILITIES=compute,utility
- LANGCHAIN_TRACING_V2=false
- LANGSMITH_TRACING=false
- TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata/
# GPU device reservation: asks for all devices (count: all) from the nvidia
# driver that expose the "gpu" capability.
# NOTE(review): indentation is not visible in this view; presumably this block
# nests under the backend service defined above — confirm against the real file.
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: all
capabilities: [gpu]