Merge pull request #213 from MODSetter/dev

Refactor: Cleanup DOCLING PR
This commit is contained in:
Rohan Verma 2025-07-21 18:54:13 +05:30 committed by GitHub
commit dd7768206c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
10 changed files with 2338 additions and 2333 deletions

2
.gitignore vendored
View file

@ -1,5 +1,3 @@
.flashrank_cache*
podcasts/
reports/
SURFSENSE_CRITICAL_FIXES_REPORT.md
.env

View file

@ -32,7 +32,6 @@ services:
- NVIDIA_DRIVER_CAPABILITIES=compute,utility
- LANGCHAIN_TRACING_V2=false
- LANGSMITH_TRACING=false
- TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata/
deploy:
resources:
reservations:

View file

@ -11,10 +11,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
wget \
unzip \
gnupg2 \
tesseract-ocr \
tesseract-ocr-eng \
libtesseract-dev \
libleptonica-dev \
&& rm -rf /var/lib/apt/lists/*
# Update certificates and install SSL tools
@ -60,9 +56,6 @@ COPY . .
ENV PYTHONPATH=/app
ENV UVICORN_LOOP=asyncio
# Set Tesseract data path
ENV TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata/
# Run
EXPOSE 8000
CMD ["python", "main.py"]

View file

@ -3,7 +3,7 @@ from fastapi import APIRouter, Depends, BackgroundTasks, UploadFile, Form, HTTPE
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select
from typing import List
from app.db import get_async_session, User, SearchSpace, Document, DocumentType
from app.db import Log, get_async_session, User, SearchSpace, Document, DocumentType
from app.schemas import DocumentsCreate, DocumentUpdate, DocumentRead
from app.users import current_active_user
from app.utils.check_ownership import check_ownership
@ -11,6 +11,8 @@ from app.tasks.background_tasks import add_received_markdown_file_document, add_
from app.config import config as app_config
# Force asyncio to use standard event loop before unstructured imports
import asyncio
from app.services.task_logging_service import TaskLoggingService
try:
asyncio.set_event_loop_policy(asyncio.DefaultEventLoopPolicy())
except RuntimeError:
@ -136,8 +138,8 @@ async def process_file_in_background(
search_space_id: int,
user_id: str,
session: AsyncSession,
task_logger: 'TaskLoggingService',
log_entry: 'Log'
task_logger: TaskLoggingService,
log_entry: Log
):
try:
# Check if the file is a markdown or text file
@ -383,7 +385,7 @@ async def process_file_in_background(
)
# Use Docling service for document processing
from app.services.document_processing.docling_service import create_docling_service
from app.services.docling_service import create_docling_service
# Create Docling service
docling_service = create_docling_service()

View file

@ -1 +0,0 @@
# Document processing services for SurfSense

View file

@ -682,7 +682,7 @@ async def add_received_file_document_using_docling(
raise RuntimeError(f"No long context LLM configured for user {user_id}")
# Generate summary using chunked processing for large documents
from app.services.document_processing.docling_service import create_docling_service
from app.services.docling_service import create_docling_service
docling_service = create_docling_service()
summary_content = await docling_service.process_large_document_summary(

View file

@ -30,7 +30,6 @@ dependencies = [
"slack-sdk>=3.34.0",
"static-ffmpeg>=2.13",
"tavily-python>=0.3.2",
"tesserocr>=2.8.0",
"unstructured-client>=0.30.0",
"unstructured[all-docs]>=0.16.25",
"uvicorn[standard]>=0.34.0",

4480
surfsense_backend/uv.lock generated

File diff suppressed because it is too large Load diff

View file

@ -53,83 +53,98 @@ export default function FileUploader() {
};
// Conditionally set accepted file types based on ETL service
const acceptedFileTypes = process.env.NEXT_PUBLIC_ETL_SERVICE === 'LLAMACLOUD'
? {
// LlamaCloud supported file types
'application/pdf': ['.pdf'],
'application/msword': ['.doc'],
'application/vnd.openxmlformats-officedocument.wordprocessingml.document': ['.docx'],
'application/vnd.ms-word.document.macroEnabled.12': ['.docm'],
'application/msword-template': ['.dot'],
'application/vnd.ms-word.template.macroEnabled.12': ['.dotm'],
'application/vnd.ms-powerpoint': ['.ppt'],
'application/vnd.ms-powerpoint.template.macroEnabled.12': ['.pptm'],
'application/vnd.openxmlformats-officedocument.presentationml.presentation': ['.pptx'],
'application/vnd.ms-powerpoint.template': ['.pot'],
'application/vnd.openxmlformats-officedocument.presentationml.template': ['.potx'],
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': ['.xlsx'],
'application/vnd.ms-excel': ['.xls'],
'application/vnd.ms-excel.sheet.macroEnabled.12': ['.xlsm'],
'application/vnd.ms-excel.sheet.binary.macroEnabled.12': ['.xlsb'],
'application/vnd.ms-excel.workspace': ['.xlw'],
'application/rtf': ['.rtf'],
'application/xml': ['.xml'],
'application/epub+zip': ['.epub'],
'application/vnd.apple.keynote': ['.key'],
'application/vnd.apple.pages': ['.pages'],
'application/vnd.apple.numbers': ['.numbers'],
'application/vnd.wordperfect': ['.wpd'],
'application/vnd.oasis.opendocument.text': ['.odt'],
'application/vnd.oasis.opendocument.presentation': ['.odp'],
'application/vnd.oasis.opendocument.graphics': ['.odg'],
'application/vnd.oasis.opendocument.spreadsheet': ['.ods'],
'application/vnd.oasis.opendocument.formula': ['.fods'],
'text/csv': ['.csv'],
'text/tab-separated-values': ['.tsv'],
'text/html': ['.html', '.htm', '.web'],
'image/jpeg': ['.jpg', '.jpeg'],
'image/png': ['.png'],
'image/gif': ['.gif'],
'image/bmp': ['.bmp'],
'image/svg+xml': ['.svg'],
'image/tiff': ['.tiff'],
'image/webp': ['.webp'],
'application/dbase': ['.dbf'],
'application/vnd.lotus-1-2-3': ['.123'],
'text/x-web-markdown': ['.602', '.abw', '.cgm', '.cwk', '.hwp', '.lwp', '.mw', '.mcw', '.pbd', '.sda', '.sdd', '.sdp', '.sdw', '.sgl', '.sti', '.sxi', '.sxw', '.stw', '.sxg', '.uof', '.uop', '.uot', '.vor', '.wps', '.zabw'],
'text/x-spreadsheet': ['.dif', '.sylk', '.slk', '.prn', '.et', '.uos1', '.uos2', '.wk1', '.wk2', '.wk3', '.wk4', '.wks', '.wq1', '.wq2', '.wb1', '.wb2', '.wb3', '.qpw', '.xlr', '.eth'],
// Audio files (always supported)
...audioFileTypes,
/**
 * Builds the map of accepted file types for the configured ETL service.
 *
 * Reads `process.env.NEXT_PUBLIC_ETL_SERVICE` and returns an object whose keys
 * are MIME type strings and whose values are arrays of file extensions:
 *   - 'LLAMACLOUD' -> the LlamaCloud list
 *   - 'DOCLING'    -> PDF only
 *   - any other value (including unset) -> the Unstructured list
 *
 * Every branch spreads `audioFileTypes` last, so its entries are always
 * included and win over an earlier entry with the same key.
 * `audioFileTypes` is defined outside the lines visible here.
 *
 * @returns {Object} Map of MIME type to accepted extensions.
 */
const getAcceptedFileTypes = () => {
const etlService = process.env.NEXT_PUBLIC_ETL_SERVICE;
if (etlService === 'LLAMACLOUD') {
return {
// LlamaCloud supported file types
'application/pdf': ['.pdf'],
'application/msword': ['.doc'],
'application/vnd.openxmlformats-officedocument.wordprocessingml.document': ['.docx'],
'application/vnd.ms-word.document.macroEnabled.12': ['.docm'],
'application/msword-template': ['.dot'],
'application/vnd.ms-word.template.macroEnabled.12': ['.dotm'],
'application/vnd.ms-powerpoint': ['.ppt'],
'application/vnd.ms-powerpoint.template.macroEnabled.12': ['.pptm'],
'application/vnd.openxmlformats-officedocument.presentationml.presentation': ['.pptx'],
'application/vnd.ms-powerpoint.template': ['.pot'],
'application/vnd.openxmlformats-officedocument.presentationml.template': ['.potx'],
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': ['.xlsx'],
'application/vnd.ms-excel': ['.xls'],
'application/vnd.ms-excel.sheet.macroEnabled.12': ['.xlsm'],
'application/vnd.ms-excel.sheet.binary.macroEnabled.12': ['.xlsb'],
'application/vnd.ms-excel.workspace': ['.xlw'],
'application/rtf': ['.rtf'],
'application/xml': ['.xml'],
'application/epub+zip': ['.epub'],
'application/vnd.apple.keynote': ['.key'],
'application/vnd.apple.pages': ['.pages'],
'application/vnd.apple.numbers': ['.numbers'],
'application/vnd.wordperfect': ['.wpd'],
'application/vnd.oasis.opendocument.text': ['.odt'],
'application/vnd.oasis.opendocument.presentation': ['.odp'],
'application/vnd.oasis.opendocument.graphics': ['.odg'],
'application/vnd.oasis.opendocument.spreadsheet': ['.ods'],
'application/vnd.oasis.opendocument.formula': ['.fods'],
'text/csv': ['.csv'],
'text/tab-separated-values': ['.tsv'],
'text/html': ['.html', '.htm', '.web'],
'image/jpeg': ['.jpg', '.jpeg'],
'image/png': ['.png'],
'image/gif': ['.gif'],
'image/bmp': ['.bmp'],
'image/svg+xml': ['.svg'],
'image/tiff': ['.tiff'],
'image/webp': ['.webp'],
'application/dbase': ['.dbf'],
'application/vnd.lotus-1-2-3': ['.123'],
// NOTE(review): the next two keys act as catch-all buckets for legacy office
// extensions; the MIME names do not look like registered types for these
// formats — confirm how the consumer of this map treats the keys.
'text/x-web-markdown': ['.602', '.abw', '.cgm', '.cwk', '.hwp', '.lwp', '.mw', '.mcw', '.pbd', '.sda', '.sdd', '.sdp', '.sdw', '.sgl', '.sti', '.sxi', '.sxw', '.stw', '.sxg', '.uof', '.uop', '.uot', '.vor', '.wps', '.zabw'],
'text/x-spreadsheet': ['.dif', '.sylk', '.slk', '.prn', '.et', '.uos1', '.uos2', '.wk1', '.wk2', '.wk3', '.wk4', '.wks', '.wq1', '.wq2', '.wb1', '.wb2', '.wb3', '.qpw', '.xlr', '.eth'],
// Audio files (always supported)
...audioFileTypes,
};
} else if (etlService === 'DOCLING') {
return {
// Docling supported file types (currently only PDF)
'application/pdf': ['.pdf'],
// Audio files (always supported)
...audioFileTypes,
};
} else {
// Fallback branch: taken for every value other than 'LLAMACLOUD' or 'DOCLING',
// including when the environment variable is not set.
return {
// Unstructured supported file types
'image/bmp': ['.bmp'],
'text/csv': ['.csv'],
'application/msword': ['.doc'],
'application/vnd.openxmlformats-officedocument.wordprocessingml.document': ['.docx'],
'message/rfc822': ['.eml'],
'application/epub+zip': ['.epub'],
'image/heic': ['.heic'],
'text/html': ['.html'],
'image/jpeg': ['.jpeg', '.jpg'],
'image/png': ['.png'],
'application/vnd.ms-outlook': ['.msg'],
'application/vnd.oasis.opendocument.text': ['.odt'],
'text/x-org': ['.org'],
'application/pkcs7-signature': ['.p7s'],
'application/pdf': ['.pdf'],
'application/vnd.ms-powerpoint': ['.ppt'],
'application/vnd.openxmlformats-officedocument.presentationml.presentation': ['.pptx'],
'text/x-rst': ['.rst'],
'application/rtf': ['.rtf'],
'image/tiff': ['.tiff'],
'text/tab-separated-values': ['.tsv'],
'application/vnd.ms-excel': ['.xls'],
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': ['.xlsx'],
'application/xml': ['.xml'],
// Audio files (always supported)
...audioFileTypes,
};
}
// NOTE(review): from the `: {` below through its matching `};`, the lines
// appear to be the removed else-branch of the earlier ternary form of
// `acceptedFileTypes`, left interleaved by the diff view — they do not look
// like part of this function. Confirm against the actual file.
: {
// Unstructured supported file types
'image/bmp': ['.bmp'],
'text/csv': ['.csv'],
'application/msword': ['.doc'],
'application/vnd.openxmlformats-officedocument.wordprocessingml.document': ['.docx'],
'message/rfc822': ['.eml'],
'application/epub+zip': ['.epub'],
'image/heic': ['.heic'],
'text/html': ['.html'],
'image/jpeg': ['.jpeg', '.jpg'],
'image/png': ['.png'],
'application/vnd.ms-outlook': ['.msg'],
'application/vnd.oasis.opendocument.text': ['.odt'],
'text/x-org': ['.org'],
'application/pkcs7-signature': ['.p7s'],
'application/pdf': ['.pdf'],
'application/vnd.ms-powerpoint': ['.ppt'],
'application/vnd.openxmlformats-officedocument.presentationml.presentation': ['.pptx'],
'text/x-rst': ['.rst'],
'application/rtf': ['.rtf'],
'image/tiff': ['.tiff'],
'text/tab-separated-values': ['.tsv'],
'application/vnd.ms-excel': ['.xls'],
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': ['.xlsx'],
'application/xml': ['.xml'],
// Audio files (always supported)
...audioFileTypes,
};
};
// Resolve the accept map once from the current environment value.
const acceptedFileTypes = getAcceptedFileTypes();
// Flatten every extension array, de-duplicate via a Set, and sort with the
// default (string) comparison to get the list of supported extensions.
const supportedExtensions = Array.from(new Set(Object.values(acceptedFileTypes).flat())).sort()