mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-06-06 20:15:17 +02:00
197 lines
8.2 KiB
Python
197 lines
8.2 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Docling Document Processing Service for SurfSense
|
|
SSL-safe implementation with pre-downloaded models
|
|
"""
|
|
|
|
import logging
|
|
import os
|
|
import ssl
|
|
from typing import Any
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class DoclingService:
|
|
"""Docling service for enhanced document processing with SSL fixes."""
|
|
|
|
def __init__(self):
|
|
"""Initialize Docling service with SSL, model fixes, and GPU acceleration."""
|
|
self.converter = None
|
|
self.use_gpu = False
|
|
self._configure_ssl_environment()
|
|
self._check_wsl2_gpu_support()
|
|
self._initialize_docling()
|
|
|
|
def _configure_ssl_environment(self):
|
|
"""Configure SSL environment for secure model downloads."""
|
|
try:
|
|
# Set SSL context for downloads
|
|
ssl._create_default_https_context = ssl._create_unverified_context
|
|
|
|
# Set SSL environment variables if not already set
|
|
if not os.environ.get("SSL_CERT_FILE"):
|
|
try:
|
|
import certifi
|
|
|
|
os.environ["SSL_CERT_FILE"] = certifi.where()
|
|
os.environ["REQUESTS_CA_BUNDLE"] = certifi.where()
|
|
except ImportError:
|
|
pass
|
|
|
|
logger.info("🔐 SSL environment configured for model downloads")
|
|
except Exception as e:
|
|
logger.warning(f"⚠️ SSL configuration warning: {e}")
|
|
|
|
def _check_wsl2_gpu_support(self):
|
|
"""Check and configure GPU support for WSL2 environment."""
|
|
try:
|
|
import torch
|
|
|
|
if torch.cuda.is_available():
|
|
gpu_count = torch.cuda.device_count()
|
|
gpu_name = torch.cuda.get_device_name(0) if gpu_count > 0 else "Unknown"
|
|
logger.info(f"✅ WSL2 GPU detected: {gpu_name} ({gpu_count} devices)")
|
|
logger.info(f"🚀 CUDA Version: {torch.version.cuda}")
|
|
self.use_gpu = True
|
|
else:
|
|
logger.info("⚠️ CUDA not available in WSL2, falling back to CPU")
|
|
self.use_gpu = False
|
|
except ImportError:
|
|
logger.info("⚠️ PyTorch not found, falling back to CPU")
|
|
self.use_gpu = False
|
|
except Exception as e:
|
|
logger.warning(f"⚠️ GPU detection failed: {e}, falling back to CPU")
|
|
self.use_gpu = False
|
|
|
|
def _initialize_docling(self):
|
|
"""Initialize Docling with version-safe configuration."""
|
|
try:
|
|
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
|
from docling.datamodel.base_models import InputFormat
|
|
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
|
from docling.document_converter import DocumentConverter, PdfFormatOption
|
|
|
|
logger.info("🔧 Initializing Docling with version-safe configuration...")
|
|
|
|
# Create pipeline options with version-safe attribute checking
|
|
pipeline_options = PdfPipelineOptions()
|
|
|
|
# Enable OCR so text-in-image (chart axes, ECG annotations,
|
|
# lab tables embedded as images, scanned pages, etc.) is
|
|
# lifted into the main markdown stream. This pairs with the
|
|
# vision-LLM picture-description pass downstream — OCR
|
|
# captures literal text; vision LLM captures the visual
|
|
# content. Together they give a faithful representation of
|
|
# PDFs that mix text and images.
|
|
if hasattr(pipeline_options, "do_ocr"):
|
|
pipeline_options.do_ocr = True
|
|
logger.info("✅ OCR enabled for embedded text-in-image extraction")
|
|
else:
|
|
logger.warning("⚠️ OCR attribute not available in this Docling version")
|
|
|
|
# Enable table structure if available
|
|
if hasattr(pipeline_options, "do_table_structure"):
|
|
pipeline_options.do_table_structure = True
|
|
logger.info("✅ Table structure detection enabled")
|
|
|
|
# Configure GPU acceleration for WSL2 if available
|
|
if hasattr(pipeline_options, "accelerator_device"):
|
|
if self.use_gpu:
|
|
try:
|
|
pipeline_options.accelerator_device = "cuda"
|
|
logger.info("🚀 GPU acceleration enabled (CUDA)")
|
|
except Exception as e:
|
|
logger.warning(f"⚠️ GPU acceleration failed, using CPU: {e}")
|
|
pipeline_options.accelerator_device = "cpu"
|
|
else:
|
|
pipeline_options.accelerator_device = "cpu"
|
|
logger.info("🖥️ Using CPU acceleration")
|
|
else:
|
|
logger.info(
|
|
"⚠️ Accelerator device attribute not available in this Docling version"
|
|
)
|
|
|
|
# Create PDF format option with backend
|
|
pdf_format_option = PdfFormatOption(
|
|
pipeline_options=pipeline_options, backend=PyPdfiumDocumentBackend
|
|
)
|
|
|
|
self.converter = DocumentConverter(
|
|
format_options={InputFormat.PDF: pdf_format_option},
|
|
)
|
|
|
|
acceleration_type = "GPU (WSL2)" if self.use_gpu else "CPU"
|
|
logger.info(
|
|
f"✅ Docling initialized successfully with {acceleration_type} acceleration"
|
|
)
|
|
|
|
except ImportError as e:
|
|
logger.error(f"❌ Docling not installed: {e}")
|
|
raise RuntimeError(f"Docling not available: {e}") from e
|
|
except Exception as e:
|
|
logger.error(f"❌ Docling initialization failed: {e}")
|
|
raise RuntimeError(f"Docling initialization failed: {e}") from e
|
|
|
|
async def process_document(
|
|
self, file_path: str, filename: str | None = None
|
|
) -> dict[str, Any]:
|
|
"""Process document with Docling using pre-downloaded models."""
|
|
|
|
if self.converter is None:
|
|
raise RuntimeError("Docling converter not initialized")
|
|
|
|
try:
|
|
logger.info(
|
|
f"🔄 Processing {filename} with Docling (using local models)..."
|
|
)
|
|
|
|
# Process document with local models
|
|
result = self.converter.convert(file_path)
|
|
|
|
# Extract content using version-safe methods
|
|
content = None
|
|
if hasattr(result, "document") and result.document:
|
|
# Try different export methods (version compatibility)
|
|
if hasattr(result.document, "export_to_markdown"):
|
|
content = result.document.export_to_markdown()
|
|
logger.info("📄 Used export_to_markdown method")
|
|
elif hasattr(result.document, "to_markdown"):
|
|
content = result.document.to_markdown()
|
|
logger.info("📄 Used to_markdown method")
|
|
elif hasattr(result.document, "text"):
|
|
content = result.document.text
|
|
logger.info("📄 Used text property")
|
|
elif hasattr(result.document, "__str__"):
|
|
content = str(result.document)
|
|
logger.info("📄 Used string conversion")
|
|
|
|
if content:
|
|
logger.info(
|
|
f"✅ Docling SUCCESS - {filename}: {len(content)} chars (local models)"
|
|
)
|
|
|
|
return {
|
|
"content": content,
|
|
"full_text": content,
|
|
"service_used": "docling",
|
|
"status": "success",
|
|
"processing_notes": "Processed with Docling using pre-downloaded models",
|
|
}
|
|
else:
|
|
raise ValueError("No content could be extracted from document")
|
|
else:
|
|
raise ValueError("No document object returned by Docling")
|
|
|
|
except Exception as e:
|
|
logger.error(f"❌ Docling processing failed for {filename}: {e}")
|
|
# Log the full error for debugging
|
|
import traceback
|
|
|
|
logger.error(f"Full traceback: {traceback.format_exc()}")
|
|
raise RuntimeError(f"Docling processing failed: {e}") from e
|
|
|
|
|
|
def create_docling_service() -> DoclingService:
|
|
"""Create a Docling service instance."""
|
|
return DoclingService()
|