feat: Add Docling support as ETL_SERVICE option

- Added DOCLING as third ETL_SERVICE option (alongside UNSTRUCTURED/LLAMACLOUD)
- Implemented add_received_file_document_using_docling function
- Added Docling processing logic in documents_routes.py
- Enhanced chunking with configurable overlap support
- Added comprehensive document processing service
- Supports both CPU and GPU processing with user selection

Addresses #161 - Add Docling Support as an ETL_SERVICE
Follows same pattern as LlamaCloud integration (PR #123)
2026-07-02 22:01:05 +02:00 · 2025-07-20 11:42:55 +03:00 · 2025-07-20 11:42:55 +03:00 · aa00822169
commit aa00822169
parent f852bcb188
14 changed files with 3125 additions and 2090 deletions
--- a/docker-compose.override.yml
+++ b/docker-compose.override.yml
@@ -2,7 +2,7 @@ version: '3.8'

 services:
  frontend:
-    image: ghcr.io/modsetter/surfsense_ui:latest
+    build: ./surfsense_web
    ports:
      - "${FRONTEND_PORT:-3000}:3000"
    volumes:
@@ -14,7 +14,7 @@ services:
      - NEXT_PUBLIC_API_URL=${NEXT_PUBLIC_API_URL:-http://backend:8000}

  backend:
-    image: ghcr.io/modsetter/surfsense_backend:latest
+    build: ./surfsense_backend
    ports:
      - "${BACKEND_PORT:-8000}:8000"
    volumes:
@@ -28,3 +28,15 @@ services:
      - PYTHONPATH=/app
      - UVICORN_LOOP=asyncio
      - UNSTRUCTURED_HAS_PATCHED_LOOP=1
+      - NVIDIA_VISIBLE_DEVICES=all
+      - NVIDIA_DRIVER_CAPABILITIES=compute,utility
+      - LANGCHAIN_TRACING_V2=false
+      - LANGSMITH_TRACING=false
+      - TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata/
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]