Merge pull request #213 from MODSetter/dev

Refactor: Cleanup DOCLING PR
This commit is contained in:
Rohan Verma 2025-07-21 18:54:13 +05:30 committed by GitHub
commit dd7768206c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
10 changed files with 2338 additions and 2333 deletions

2
.gitignore vendored
View file

@ -1,5 +1,3 @@
.flashrank_cache* .flashrank_cache*
podcasts/ podcasts/
reports/
SURFSENSE_CRITICAL_FIXES_REPORT.md
.env .env

View file

@ -32,7 +32,6 @@ services:
- NVIDIA_DRIVER_CAPABILITIES=compute,utility - NVIDIA_DRIVER_CAPABILITIES=compute,utility
- LANGCHAIN_TRACING_V2=false - LANGCHAIN_TRACING_V2=false
- LANGSMITH_TRACING=false - LANGSMITH_TRACING=false
- TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata/
deploy: deploy:
resources: resources:
reservations: reservations:

View file

@ -11,10 +11,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
wget \ wget \
unzip \ unzip \
gnupg2 \ gnupg2 \
tesseract-ocr \
tesseract-ocr-eng \
libtesseract-dev \
libleptonica-dev \
&& rm -rf /var/lib/apt/lists/* && rm -rf /var/lib/apt/lists/*
# Update certificates and install SSL tools # Update certificates and install SSL tools
@ -60,9 +56,6 @@ COPY . .
ENV PYTHONPATH=/app ENV PYTHONPATH=/app
ENV UVICORN_LOOP=asyncio ENV UVICORN_LOOP=asyncio
# Set Tesseract data path
ENV TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata/
# Run # Run
EXPOSE 8000 EXPOSE 8000
CMD ["python", "main.py"] CMD ["python", "main.py"]

View file

@ -3,7 +3,7 @@ from fastapi import APIRouter, Depends, BackgroundTasks, UploadFile, Form, HTTPE
from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select from sqlalchemy.future import select
from typing import List from typing import List
from app.db import get_async_session, User, SearchSpace, Document, DocumentType from app.db import Log, get_async_session, User, SearchSpace, Document, DocumentType
from app.schemas import DocumentsCreate, DocumentUpdate, DocumentRead from app.schemas import DocumentsCreate, DocumentUpdate, DocumentRead
from app.users import current_active_user from app.users import current_active_user
from app.utils.check_ownership import check_ownership from app.utils.check_ownership import check_ownership
@ -11,6 +11,8 @@ from app.tasks.background_tasks import add_received_markdown_file_document, add_
from app.config import config as app_config from app.config import config as app_config
# Force asyncio to use standard event loop before unstructured imports # Force asyncio to use standard event loop before unstructured imports
import asyncio import asyncio
from app.services.task_logging_service import TaskLoggingService
try: try:
asyncio.set_event_loop_policy(asyncio.DefaultEventLoopPolicy()) asyncio.set_event_loop_policy(asyncio.DefaultEventLoopPolicy())
except RuntimeError: except RuntimeError:
@ -136,8 +138,8 @@ async def process_file_in_background(
search_space_id: int, search_space_id: int,
user_id: str, user_id: str,
session: AsyncSession, session: AsyncSession,
task_logger: 'TaskLoggingService', task_logger: TaskLoggingService,
log_entry: 'Log' log_entry: Log
): ):
try: try:
# Check if the file is a markdown or text file # Check if the file is a markdown or text file
@ -383,7 +385,7 @@ async def process_file_in_background(
) )
# Use Docling service for document processing # Use Docling service for document processing
from app.services.document_processing.docling_service import create_docling_service from app.services.docling_service import create_docling_service
# Create Docling service # Create Docling service
docling_service = create_docling_service() docling_service = create_docling_service()

View file

@ -1 +0,0 @@
# Document processing services for SurfSense

View file

@ -682,7 +682,7 @@ async def add_received_file_document_using_docling(
raise RuntimeError(f"No long context LLM configured for user {user_id}") raise RuntimeError(f"No long context LLM configured for user {user_id}")
# Generate summary using chunked processing for large documents # Generate summary using chunked processing for large documents
from app.services.document_processing.docling_service import create_docling_service from app.services.docling_service import create_docling_service
docling_service = create_docling_service() docling_service = create_docling_service()
summary_content = await docling_service.process_large_document_summary( summary_content = await docling_service.process_large_document_summary(

View file

@ -30,7 +30,6 @@ dependencies = [
"slack-sdk>=3.34.0", "slack-sdk>=3.34.0",
"static-ffmpeg>=2.13", "static-ffmpeg>=2.13",
"tavily-python>=0.3.2", "tavily-python>=0.3.2",
"tesserocr>=2.8.0",
"unstructured-client>=0.30.0", "unstructured-client>=0.30.0",
"unstructured[all-docs]>=0.16.25", "unstructured[all-docs]>=0.16.25",
"uvicorn[standard]>=0.34.0", "uvicorn[standard]>=0.34.0",

4480
surfsense_backend/uv.lock generated

File diff suppressed because it is too large Load diff

View file

@ -53,83 +53,98 @@ export default function FileUploader() {
}; };
// Conditionally set accepted file types based on ETL service // Conditionally set accepted file types based on ETL service
const acceptedFileTypes = process.env.NEXT_PUBLIC_ETL_SERVICE === 'LLAMACLOUD' const getAcceptedFileTypes = () => {
? { const etlService = process.env.NEXT_PUBLIC_ETL_SERVICE;
// LlamaCloud supported file types
'application/pdf': ['.pdf'], if (etlService === 'LLAMACLOUD') {
'application/msword': ['.doc'], return {
'application/vnd.openxmlformats-officedocument.wordprocessingml.document': ['.docx'], // LlamaCloud supported file types
'application/vnd.ms-word.document.macroEnabled.12': ['.docm'], 'application/pdf': ['.pdf'],
'application/msword-template': ['.dot'], 'application/msword': ['.doc'],
'application/vnd.ms-word.template.macroEnabled.12': ['.dotm'], 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': ['.docx'],
'application/vnd.ms-powerpoint': ['.ppt'], 'application/vnd.ms-word.document.macroEnabled.12': ['.docm'],
'application/vnd.ms-powerpoint.template.macroEnabled.12': ['.pptm'], 'application/msword-template': ['.dot'],
'application/vnd.openxmlformats-officedocument.presentationml.presentation': ['.pptx'], 'application/vnd.ms-word.template.macroEnabled.12': ['.dotm'],
'application/vnd.ms-powerpoint.template': ['.pot'], 'application/vnd.ms-powerpoint': ['.ppt'],
'application/vnd.openxmlformats-officedocument.presentationml.template': ['.potx'], 'application/vnd.ms-powerpoint.template.macroEnabled.12': ['.pptm'],
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': ['.xlsx'], 'application/vnd.openxmlformats-officedocument.presentationml.presentation': ['.pptx'],
'application/vnd.ms-excel': ['.xls'], 'application/vnd.ms-powerpoint.template': ['.pot'],
'application/vnd.ms-excel.sheet.macroEnabled.12': ['.xlsm'], 'application/vnd.openxmlformats-officedocument.presentationml.template': ['.potx'],
'application/vnd.ms-excel.sheet.binary.macroEnabled.12': ['.xlsb'], 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': ['.xlsx'],
'application/vnd.ms-excel.workspace': ['.xlw'], 'application/vnd.ms-excel': ['.xls'],
'application/rtf': ['.rtf'], 'application/vnd.ms-excel.sheet.macroEnabled.12': ['.xlsm'],
'application/xml': ['.xml'], 'application/vnd.ms-excel.sheet.binary.macroEnabled.12': ['.xlsb'],
'application/epub+zip': ['.epub'], 'application/vnd.ms-excel.workspace': ['.xlw'],
'application/vnd.apple.keynote': ['.key'], 'application/rtf': ['.rtf'],
'application/vnd.apple.pages': ['.pages'], 'application/xml': ['.xml'],
'application/vnd.apple.numbers': ['.numbers'], 'application/epub+zip': ['.epub'],
'application/vnd.wordperfect': ['.wpd'], 'application/vnd.apple.keynote': ['.key'],
'application/vnd.oasis.opendocument.text': ['.odt'], 'application/vnd.apple.pages': ['.pages'],
'application/vnd.oasis.opendocument.presentation': ['.odp'], 'application/vnd.apple.numbers': ['.numbers'],
'application/vnd.oasis.opendocument.graphics': ['.odg'], 'application/vnd.wordperfect': ['.wpd'],
'application/vnd.oasis.opendocument.spreadsheet': ['.ods'], 'application/vnd.oasis.opendocument.text': ['.odt'],
'application/vnd.oasis.opendocument.formula': ['.fods'], 'application/vnd.oasis.opendocument.presentation': ['.odp'],
'text/csv': ['.csv'], 'application/vnd.oasis.opendocument.graphics': ['.odg'],
'text/tab-separated-values': ['.tsv'], 'application/vnd.oasis.opendocument.spreadsheet': ['.ods'],
'text/html': ['.html', '.htm', '.web'], 'application/vnd.oasis.opendocument.formula': ['.fods'],
'image/jpeg': ['.jpg', '.jpeg'], 'text/csv': ['.csv'],
'image/png': ['.png'], 'text/tab-separated-values': ['.tsv'],
'image/gif': ['.gif'], 'text/html': ['.html', '.htm', '.web'],
'image/bmp': ['.bmp'], 'image/jpeg': ['.jpg', '.jpeg'],
'image/svg+xml': ['.svg'], 'image/png': ['.png'],
'image/tiff': ['.tiff'], 'image/gif': ['.gif'],
'image/webp': ['.webp'], 'image/bmp': ['.bmp'],
'application/dbase': ['.dbf'], 'image/svg+xml': ['.svg'],
'application/vnd.lotus-1-2-3': ['.123'], 'image/tiff': ['.tiff'],
'text/x-web-markdown': ['.602', '.abw', '.cgm', '.cwk', '.hwp', '.lwp', '.mw', '.mcw', '.pbd', '.sda', '.sdd', '.sdp', '.sdw', '.sgl', '.sti', '.sxi', '.sxw', '.stw', '.sxg', '.uof', '.uop', '.uot', '.vor', '.wps', '.zabw'], 'image/webp': ['.webp'],
'text/x-spreadsheet': ['.dif', '.sylk', '.slk', '.prn', '.et', '.uos1', '.uos2', '.wk1', '.wk2', '.wk3', '.wk4', '.wks', '.wq1', '.wq2', '.wb1', '.wb2', '.wb3', '.qpw', '.xlr', '.eth'], 'application/dbase': ['.dbf'],
// Audio files (always supported) 'application/vnd.lotus-1-2-3': ['.123'],
...audioFileTypes, 'text/x-web-markdown': ['.602', '.abw', '.cgm', '.cwk', '.hwp', '.lwp', '.mw', '.mcw', '.pbd', '.sda', '.sdd', '.sdp', '.sdw', '.sgl', '.sti', '.sxi', '.sxw', '.stw', '.sxg', '.uof', '.uop', '.uot', '.vor', '.wps', '.zabw'],
'text/x-spreadsheet': ['.dif', '.sylk', '.slk', '.prn', '.et', '.uos1', '.uos2', '.wk1', '.wk2', '.wk3', '.wk4', '.wks', '.wq1', '.wq2', '.wb1', '.wb2', '.wb3', '.qpw', '.xlr', '.eth'],
// Audio files (always supported)
...audioFileTypes,
};
} else if (etlService === 'DOCLING') {
return {
// Docling supported file types (currently only PDF)
'application/pdf': ['.pdf'],
// Audio files (always supported)
...audioFileTypes,
};
} else {
return {
// Unstructured supported file types
'image/bmp': ['.bmp'],
'text/csv': ['.csv'],
'application/msword': ['.doc'],
'application/vnd.openxmlformats-officedocument.wordprocessingml.document': ['.docx'],
'message/rfc822': ['.eml'],
'application/epub+zip': ['.epub'],
'image/heic': ['.heic'],
'text/html': ['.html'],
'image/jpeg': ['.jpeg', '.jpg'],
'image/png': ['.png'],
'application/vnd.ms-outlook': ['.msg'],
'application/vnd.oasis.opendocument.text': ['.odt'],
'text/x-org': ['.org'],
'application/pkcs7-signature': ['.p7s'],
'application/pdf': ['.pdf'],
'application/vnd.ms-powerpoint': ['.ppt'],
'application/vnd.openxmlformats-officedocument.presentationml.presentation': ['.pptx'],
'text/x-rst': ['.rst'],
'application/rtf': ['.rtf'],
'image/tiff': ['.tiff'],
'text/tab-separated-values': ['.tsv'],
'application/vnd.ms-excel': ['.xls'],
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': ['.xlsx'],
'application/xml': ['.xml'],
// Audio files (always supported)
...audioFileTypes,
};
} }
: { };
// Unstructured supported file types
'image/bmp': ['.bmp'], const acceptedFileTypes = getAcceptedFileTypes();
'text/csv': ['.csv'],
'application/msword': ['.doc'],
'application/vnd.openxmlformats-officedocument.wordprocessingml.document': ['.docx'],
'message/rfc822': ['.eml'],
'application/epub+zip': ['.epub'],
'image/heic': ['.heic'],
'text/html': ['.html'],
'image/jpeg': ['.jpeg', '.jpg'],
'image/png': ['.png'],
'application/vnd.ms-outlook': ['.msg'],
'application/vnd.oasis.opendocument.text': ['.odt'],
'text/x-org': ['.org'],
'application/pkcs7-signature': ['.p7s'],
'application/pdf': ['.pdf'],
'application/vnd.ms-powerpoint': ['.ppt'],
'application/vnd.openxmlformats-officedocument.presentationml.presentation': ['.pptx'],
'text/x-rst': ['.rst'],
'application/rtf': ['.rtf'],
'image/tiff': ['.tiff'],
'text/tab-separated-values': ['.tsv'],
'application/vnd.ms-excel': ['.xls'],
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': ['.xlsx'],
'application/xml': ['.xml'],
// Audio files (always supported)
...audioFileTypes,
};
const supportedExtensions = Array.from(new Set(Object.values(acceptedFileTypes).flat())).sort() const supportedExtensions = Array.from(new Set(Object.values(acceptedFileTypes).flat())).sort()