diff --git a/README.es.md b/README.es.md
index d61504cd5..b62d2cece 100644
--- a/README.es.md
+++ b/README.es.md
@@ -21,9 +21,28 @@
# SurfSense
-Conecta cualquier LLM a tus fuentes de conocimiento internas y chatea con él en tiempo real junto a tu equipo. Alternativa de código abierto a NotebookLM, Perplexity y Glean.
-SurfSense es un agente de investigación de IA altamente personalizable, conectado a fuentes externas como motores de búsqueda (SearxNG, Tavily, LinkUp), Google Drive, OneDrive, Dropbox, Slack, Microsoft Teams, Linear, Jira, ClickUp, Confluence, BookStack, Gmail, Notion, YouTube, GitHub, Discord, Airtable, Google Calendar, Luma, Circleback, Elasticsearch, Obsidian y más por venir.
+NotebookLM es una de las mejores y más útiles plataformas de IA que existen, pero una vez que comienzas a usarla con regularidad, sus limitaciones empiezan a dejar algo que desear.
+
+1. Hay límites en el número de fuentes que puedes agregar a un notebook.
+2. Hay límites en el número de notebooks que puedes tener.
+3. No puedes tener fuentes que excedan las 500,000 palabras o los 200MB.
+4. Estás atado a los servicios de Google (LLMs, modelos de uso, etc.) sin opción de configurarlos.
+5. Fuentes de datos externas e integraciones de servicios limitadas.
+6. El agente de NotebookLM está optimizado específicamente para estudiar e investigar, pero puedes hacer mucho más con los datos de origen.
+7. Falta de soporte multijugador.
+
+...y más.
+
+**SurfSense está específicamente hecho para resolver estos problemas.** SurfSense te permite:
+
+- **Controla Tu Flujo de Datos** - Mantén tus datos privados y seguros.
+- **Sin Límites de Datos** - Agrega una cantidad ilimitada de fuentes y notebooks.
+- **Sin Dependencia de Proveedores** - Configura cualquier modelo LLM, de imagen, TTS y STT.
+- **25+ Fuentes de Datos Externas** - Agrega tus fuentes desde Google Drive, OneDrive, Dropbox, Notion y muchos otros servicios externos.
+- **Soporte Multijugador en Tiempo Real** - Trabaja fácilmente con los miembros de tu equipo en un notebook compartido.
+
+...y más por venir.
@@ -34,7 +53,7 @@ https://github.com/user-attachments/assets/cc0c84d3-1f2f-4f7a-b519-2ecce22310b1
## Ejemplo de Agente de Video
-https://github.com/user-attachments/assets/cc977e6d-8292-4ffe-abb8-3b0560ef5562
+https://github.com/user-attachments/assets/012a7ffa-6f76-4f06-9dda-7632b470057a
@@ -133,24 +152,29 @@ Para Docker Compose, instalación manual y otras opciones de despliegue, consult
-## Funcionalidades Principales
+## SurfSense vs Google NotebookLM
-| Funcionalidad | Descripción |
-|----------------|-------------|
-| Alternativa OSS | Reemplazo directo de NotebookLM, Perplexity y Glean con colaboración en equipo en tiempo real |
-| 50+ Formatos de Archivo | Sube documentos, imágenes, videos vía LlamaCloud, Unstructured o Docling (local) |
-| Búsqueda Híbrida | Semántica + Texto completo con Índices Jerárquicos y Reciprocal Rank Fusion |
-| Respuestas con Citas | Chatea con tu base de conocimiento y obtén respuestas citadas al estilo Perplexity |
-| Arquitectura de Agentes Profundos | Impulsado por [LangChain Deep Agents](https://docs.langchain.com/oss/python/deepagents/overview) con planificación, subagentes y acceso al sistema de archivos |
-| Soporte Universal de LLM | 100+ LLMs, 6000+ modelos de embeddings, todos los principales rerankers vía OpenAI spec y LiteLLM |
-| Privacidad Primero | Soporte completo de LLM local (vLLM, Ollama) tus datos son tuyos |
-| Colaboración en Equipo | RBAC con roles de Propietario / Admin / Editor / Visor, chat en tiempo real e hilos de comentarios |
-| Generación de Videos | Genera videos con narración y visuales |
-| Generación de Presentaciones | Crea presentaciones editables basadas en diapositivas |
-| Generación de Podcasts | Podcast de 3 min en menos de 20 segundos; múltiples proveedores TTS (OpenAI, Azure, Kokoro) |
-| Extensión de Navegador | Extensión multi-navegador para guardar cualquier página web, incluyendo páginas protegidas por autenticación |
-| 27+ Conectores | Motores de búsqueda, Google Drive, OneDrive, Dropbox, Slack, Teams, Jira, Notion, GitHub, Discord y [más](#fuentes-externas) |
-| Auto-Hospedable | Código abierto, Docker en un solo comando o Docker Compose completo para producción |
+| Característica | Google NotebookLM | SurfSense |
+|---------|-------------------|-----------|
+| **Fuentes por Notebook** | 50 (Gratis) a 600 (Ultra, $249.99/mes) | Ilimitadas |
+| **Número de Notebooks** | 100 (Gratis) a 500 (planes de pago) | Ilimitados |
+| **Límite de Tamaño de Fuente** | 500,000 palabras / 200MB por fuente | Sin límite |
+| **Precios** | Nivel gratuito disponible; Pro $19.99/mes, Ultra $249.99/mes | Gratuito y de código abierto, auto-hospedable en tu propia infra |
+| **Soporte de LLM** | Solo Google Gemini | 100+ LLMs vía OpenAI spec y LiteLLM |
+| **Modelos de Embeddings** | Solo Google | 6,000+ modelos de embeddings, todos los principales rerankers |
+| **LLMs Locales / Privados** | No disponible | Soporte completo (vLLM, Ollama) - tus datos son tuyos |
+| **Auto-Hospedable** | No | Sí - Docker en un solo comando o Docker Compose completo |
+| **Código Abierto** | No | Sí |
+| **Conectores Externos** | Google Drive, YouTube, sitios web | 27+ conectores - Motores de búsqueda, Google Drive, OneDrive, Dropbox, Slack, Teams, Jira, Notion, GitHub, Discord y [más](#fuentes-externas) |
+| **Soporte de Formatos de Archivo** | PDFs, Docs, Slides, Sheets, CSV, Word, EPUB, imágenes, URLs web, YouTube | 50+ formatos - documentos, imágenes, videos vía LlamaCloud, Unstructured o Docling (local) |
+| **Búsqueda** | Búsqueda semántica | Búsqueda Híbrida - Semántica + Texto completo con Índices Jerárquicos y Reciprocal Rank Fusion |
+| **Respuestas con Citas** | Sí | Sí - Respuestas citadas al estilo Perplexity |
+| **Arquitectura de Agentes** | No | Sí - impulsado por [LangChain Deep Agents](https://docs.langchain.com/oss/python/deepagents/overview) con planificación, subagentes y acceso al sistema de archivos |
+| **Multijugador en Tiempo Real** | Notebooks compartidos con roles de Visor/Editor (sin chat en tiempo real) | RBAC con roles de Propietario / Admin / Editor / Visor, chat en tiempo real e hilos de comentarios |
+| **Generación de Videos** | Resúmenes en video cinemáticos vía Veo 3 (solo Ultra) | Disponible (NotebookLM es mejor aquí, mejorando activamente) |
+| **Generación de Presentaciones** | Diapositivas más atractivas pero no editables | Crea presentaciones editables basadas en diapositivas |
+| **Generación de Podcasts** | Resúmenes de audio con hosts e idiomas personalizables | Disponible con múltiples proveedores TTS (NotebookLM es mejor aquí, mejorando activamente) |
+| **Extensión de Navegador** | No | Extensión multi-navegador para guardar cualquier página web, incluyendo páginas protegidas por autenticación |
Lista completa de Fuentes Externas
diff --git a/README.hi.md b/README.hi.md
index 011dbf5db..b49bddc72 100644
--- a/README.hi.md
+++ b/README.hi.md
@@ -21,9 +21,28 @@
# SurfSense
-किसी भी LLM को अपने आंतरिक ज्ञान स्रोतों से जोड़ें और अपनी टीम के साथ रीयल-टाइम में चैट करें। NotebookLM, Perplexity और Glean का ओपन सोर्स विकल्प।
-SurfSense एक अत्यधिक अनुकूलन योग्य AI शोध एजेंट है, जो बाहरी स्रोतों से जुड़ा है जैसे सर्च इंजन (SearxNG, Tavily, LinkUp), Google Drive, OneDrive, Dropbox, Slack, Microsoft Teams, Linear, Jira, ClickUp, Confluence, BookStack, Gmail, Notion, YouTube, GitHub, Discord, Airtable, Google Calendar, Luma, Circleback, Elasticsearch, Obsidian और भी बहुत कुछ आने वाला है।
+NotebookLM उपलब्ध सबसे अच्छे और सबसे उपयोगी AI प्लेटफ़ॉर्म में से एक है, लेकिन जब आप इसे नियमित रूप से उपयोग करने लगते हैं, तो इसकी सीमाएं खलने लगती हैं।
+
+1. एक notebook में जोड़े जा सकने वाले स्रोतों की संख्या पर सीमाएं हैं।
+2. आपके पास कितने notebooks हो सकते हैं, इस पर सीमाएं हैं।
+3. आपके पास ऐसे स्रोत नहीं हो सकते जो 500,000 शब्दों या 200MB से अधिक हों।
+4. आप Google सेवाओं (LLMs, उपयोग मॉडल, आदि) से बंधे हैं और उन्हें कॉन्फ़िगर करने का कोई विकल्प नहीं है।
+5. सीमित बाहरी डेटा स्रोत और सेवा एकीकरण।
+6. NotebookLM एजेंट विशेष रूप से केवल अध्ययन और शोध के लिए अनुकूलित है, लेकिन आप स्रोत डेटा के साथ और भी बहुत कुछ कर सकते हैं।
+7. मल्टीप्लेयर सपोर्ट की कमी।
+
+...और भी बहुत कुछ।
+
+**SurfSense विशेष रूप से इन समस्याओं को हल करने के लिए बनाया गया है।** SurfSense आपको सक्षम बनाता है:
+
+- **अपने डेटा प्रवाह को नियंत्रित करें** - अपने डेटा को निजी और सुरक्षित रखें।
+- **कोई डेटा सीमा नहीं** - असीमित मात्रा में स्रोत और notebooks जोड़ें।
+- **कोई विक्रेता लॉक-इन नहीं** - किसी भी LLM, इमेज, TTS और STT मॉडल को कॉन्फ़िगर करें।
+- **25+ बाहरी डेटा स्रोत** - Google Drive, OneDrive, Dropbox, Notion और कई अन्य बाहरी सेवाओं से अपने स्रोत जोड़ें।
+- **रीयल-टाइम मल्टीप्लेयर सपोर्ट** - एक साझा notebook में अपनी टीम के सदस्यों के साथ आसानी से काम करें।
+
+...और भी बहुत कुछ आने वाला है।
@@ -34,7 +53,7 @@ https://github.com/user-attachments/assets/cc0c84d3-1f2f-4f7a-b519-2ecce22310b1
## वीडियो एजेंट नमूना
-https://github.com/user-attachments/assets/cc977e6d-8292-4ffe-abb8-3b0560ef5562
+https://github.com/user-attachments/assets/012a7ffa-6f76-4f06-9dda-7632b470057a
@@ -133,24 +152,29 @@ Docker Compose, मैनुअल इंस्टॉलेशन और अन
-## प्रमुख विशेषताएं
+## SurfSense vs Google NotebookLM
-| विशेषता | विवरण |
-|----------|--------|
-| OSS विकल्प | रीयल-टाइम टीम सहयोग के साथ NotebookLM, Perplexity और Glean का सीधा प्रतिस्थापन |
-| 50+ फ़ाइल फ़ॉर्मेट | LlamaCloud, Unstructured या Docling (लोकल) के माध्यम से दस्तावेज़, चित्र, वीडियो अपलोड करें |
-| हाइब्रिड सर्च | हायरार्किकल इंडाइसेस और Reciprocal Rank Fusion के साथ सिमैंटिक + फुल टेक्स्ट सर्च |
-| उद्धृत उत्तर | अपने ज्ञान आधार के साथ चैट करें और Perplexity शैली के उद्धृत उत्तर पाएं |
-| डीप एजेंट आर्किटेक्चर | [LangChain Deep Agents](https://docs.langchain.com/oss/python/deepagents/overview) द्वारा संचालित, योजना, सब-एजेंट और फ़ाइल सिस्टम एक्सेस |
-| यूनिवर्सल LLM सपोर्ट | 100+ LLMs, 6000+ एम्बेडिंग मॉडल, सभी प्रमुख रीरैंकर्स OpenAI spec और LiteLLM के माध्यम से |
-| प्राइवेसी फर्स्ट | पूर्ण लोकल LLM सपोर्ट (vLLM, Ollama) आपका डेटा आपका रहता है |
-| टीम सहयोग | मालिक / एडमिन / संपादक / दर्शक भूमिकाओं के साथ RBAC, रीयल-टाइम चैट और कमेंट थ्रेड |
-| वीडियो जनरेशन | नैरेशन और विज़ुअल के साथ वीडियो बनाएं |
-| प्रेजेंटेशन जनरेशन | संपादन योग्य, स्लाइड आधारित प्रेजेंटेशन बनाएं |
-| पॉडकास्ट जनरेशन | 20 सेकंड से कम में 3 मिनट का पॉडकास्ट; कई TTS प्रदाता (OpenAI, Azure, Kokoro) |
-| ब्राउज़र एक्सटेंशन | किसी भी वेबपेज को सहेजने के लिए क्रॉस-ब्राउज़र एक्सटेंशन, प्रमाणीकरण सुरक्षित पेज सहित |
-| 27+ कनेक्टर्स | सर्च इंजन, Google Drive, OneDrive, Dropbox, Slack, Teams, Jira, Notion, GitHub, Discord और [अधिक](#बाहरी-स्रोत) |
-| सेल्फ-होस्ट करने योग्य | ओपन सोर्स, Docker एक कमांड या प्रोडक्शन के लिए पूर्ण Docker Compose |
+| विशेषता | Google NotebookLM | SurfSense |
+|---------|-------------------|-----------|
+| **प्रति Notebook स्रोत** | 50 (मुफ़्त) से 600 (Ultra, $249.99/माह) | असीमित |
+| **Notebooks की संख्या** | 100 (मुफ़्त) से 500 (सशुल्क योजनाएं) | असीमित |
+| **स्रोत आकार सीमा** | 500,000 शब्द / 200MB प्रति स्रोत | कोई सीमा नहीं |
+| **मूल्य निर्धारण** | मुफ़्त स्तर उपलब्ध; Pro $19.99/माह, Ultra $249.99/माह | मुफ़्त और ओपन सोर्स, अपनी इंफ्रा पर सेल्फ-होस्ट करें |
+| **LLM सपोर्ट** | केवल Google Gemini | 100+ LLMs OpenAI spec और LiteLLM के माध्यम से |
+| **एम्बेडिंग मॉडल** | केवल Google | 6,000+ एम्बेडिंग मॉडल, सभी प्रमुख रीरैंकर्स |
+| **लोकल / प्राइवेट LLMs** | उपलब्ध नहीं | पूर्ण सपोर्ट (vLLM, Ollama) - आपका डेटा आपका रहता है |
+| **सेल्फ-होस्ट करने योग्य** | नहीं | हाँ - Docker एक कमांड या पूर्ण Docker Compose |
+| **ओपन सोर्स** | नहीं | हाँ |
+| **बाहरी कनेक्टर्स** | Google Drive, YouTube, वेबसाइटें | 27+ कनेक्टर्स - सर्च इंजन, Google Drive, OneDrive, Dropbox, Slack, Teams, Jira, Notion, GitHub, Discord और [अधिक](#बाहरी-स्रोत) |
+| **फ़ाइल फ़ॉर्मेट सपोर्ट** | PDFs, Docs, Slides, Sheets, CSV, Word, EPUB, इमेज, वेब URLs, YouTube | 50+ फ़ॉर्मेट - दस्तावेज़, इमेज, वीडियो LlamaCloud, Unstructured या Docling (लोकल) के माध्यम से |
+| **सर्च** | सिमैंटिक सर्च | हाइब्रिड सर्च - हायरार्किकल इंडाइसेस और Reciprocal Rank Fusion के साथ सिमैंटिक + फुल टेक्स्ट |
+| **उद्धृत उत्तर** | हाँ | हाँ - Perplexity शैली के उद्धृत उत्तर |
+| **एजेंट आर्किटेक्चर** | नहीं | हाँ - [LangChain Deep Agents](https://docs.langchain.com/oss/python/deepagents/overview) द्वारा संचालित, योजना, सब-एजेंट और फ़ाइल सिस्टम एक्सेस |
+| **रीयल-टाइम मल्टीप्लेयर** | दर्शक/संपादक भूमिकाओं के साथ साझा notebooks (कोई रीयल-टाइम चैट नहीं) | मालिक / एडमिन / संपादक / दर्शक भूमिकाओं के साथ RBAC, रीयल-टाइम चैट और कमेंट थ्रेड |
+| **वीडियो जनरेशन** | Veo 3 के माध्यम से सिनेमैटिक वीडियो ओवरव्यू (केवल Ultra) | उपलब्ध (NotebookLM यहाँ बेहतर है, सक्रिय रूप से सुधार हो रहा है) |
+| **प्रेजेंटेशन जनरेशन** | बेहतर दिखने वाली स्लाइड्स लेकिन संपादन योग्य नहीं | संपादन योग्य, स्लाइड आधारित प्रेजेंटेशन बनाएं |
+| **पॉडकास्ट जनरेशन** | कस्टमाइज़ेबल होस्ट और भाषाओं के साथ ऑडियो ओवरव्यू | कई TTS प्रदाताओं के साथ उपलब्ध (NotebookLM यहाँ बेहतर है, सक्रिय रूप से सुधार हो रहा है) |
+| **ब्राउज़र एक्सटेंशन** | नहीं | किसी भी वेबपेज को सहेजने के लिए क्रॉस-ब्राउज़र एक्सटेंशन, प्रमाणीकरण सुरक्षित पेज सहित |
बाहरी स्रोतों की पूरी सूची
diff --git a/README.md b/README.md
index f007fd43c..12ea4912a 100644
--- a/README.md
+++ b/README.md
@@ -21,9 +21,28 @@
# SurfSense
-Connect any LLM to your internal knowledge sources and chat with it in real time alongside your team. OSS alternative to NotebookLM, Perplexity, and Glean.
-SurfSense is a highly customizable AI research agent, connected to external sources such as Search Engines (SearxNG, Tavily, LinkUp), Google Drive, OneDrive, Dropbox, Slack, Microsoft Teams, Linear, Jira, ClickUp, Confluence, BookStack, Gmail, Notion, YouTube, GitHub, Discord, Airtable, Google Calendar, Luma, Circleback, Elasticsearch, Obsidian and more to come.
+NotebookLM is one of the best and most useful AI platforms out there, but once you start using it regularly, its limitations begin to leave something to be desired.
+
+1. There are limits on the number of sources you can add to a notebook.
+2. There are limits on the number of notebooks you can have.
+3. You cannot have sources that exceed 500,000 words or are larger than 200MB.
+4. You are vendor-locked into Google services (LLMs, usage models, etc.) with no option to configure them.
+5. Limited external data sources and service integrations.
+6. The NotebookLM agent is optimized specifically for studying and research, but you can do so much more with the source data.
+7. Lack of multiplayer support.
+
+...and more.
+
+**SurfSense is specifically made to solve these problems.** SurfSense empowers you to:
+
+- **Control Your Data Flow** - Keep your data private and secure.
+- **No Data Limits** - Add an unlimited number of sources and notebooks.
+- **No Vendor Lock-in** - Configure any LLM, image, TTS, and STT model.
+- **25+ External Data Sources** - Add your sources from Google Drive, OneDrive, Dropbox, Notion, and many other external services.
+- **Real-Time Multiplayer Support** - Work easily with your team members in a shared notebook.
+
+...and more to come.
@@ -134,24 +153,29 @@ For Docker Compose, manual installation, and other deployment options, see the [
-## Key Features
+## SurfSense vs Google NotebookLM
-| Feature | Description |
-|---------|-------------|
-| OSS Alternative | Drop in replacement for NotebookLM, Perplexity, and Glean with real time team collaboration |
-| 50+ File Formats | Upload documents, images, videos via LlamaCloud, Unstructured, or Docling (local) |
-| Hybrid Search | Semantic + Full Text Search with Hierarchical Indices and Reciprocal Rank Fusion |
-| Cited Answers | Chat with your knowledge base and get Perplexity style cited responses |
-| Deep Agent Architecture | Powered by [LangChain Deep Agents](https://docs.langchain.com/oss/python/deepagents/overview) planning, subagents, and file system access |
-| Universal LLM Support | 100+ LLMs, 6000+ embedding models, all major rerankers via OpenAI spec & LiteLLM |
-| Privacy First | Full local LLM support (vLLM, Ollama) your data stays yours |
-| Team Collaboration | RBAC with Owner / Admin / Editor / Viewer roles, real time chat & comment threads |
-| Video Generation | Generate videos with narration and visuals |
-| Presentation Generation | Create editable, slide based presentations |
-| Podcast Generation | 3 min podcast in under 20 seconds; multiple TTS providers (OpenAI, Azure, Kokoro) |
-| Browser Extension | Cross browser extension to save any webpage, including auth protected pages |
-| 27+ Connectors | Search Engines, Google Drive, OneDrive, Dropbox, Slack, Teams, Jira, Notion, GitHub, Discord & [more](#external-sources) |
-| Self Hostable | Open source, Docker one liner or full Docker Compose for production |
+| Feature | Google NotebookLM | SurfSense |
+|---------|-------------------|-----------|
+| **Sources per Notebook** | 50 (Free) to 600 (Ultra, $249.99/mo) | Unlimited |
+| **Number of Notebooks** | 100 (Free) to 500 (paid tiers) | Unlimited |
+| **Source Size Limit** | 500,000 words / 200MB per source | No limit |
+| **Pricing** | Free tier available; Pro $19.99/mo, Ultra $249.99/mo | Free and open source, self-host on your own infra |
+| **LLM Support** | Google Gemini only | 100+ LLMs via OpenAI spec & LiteLLM |
+| **Embedding Models** | Google only | 6,000+ embedding models, all major rerankers |
+| **Local / Private LLMs** | Not available | Full support (vLLM, Ollama) - your data stays yours |
+| **Self Hostable** | No | Yes - Docker one-liner or full Docker Compose |
+| **Open Source** | No | Yes |
+| **External Connectors** | Google Drive, YouTube, websites | 27+ connectors - Search Engines, Google Drive, OneDrive, Dropbox, Slack, Teams, Jira, Notion, GitHub, Discord & [more](#external-sources) |
+| **File Format Support** | PDFs, Docs, Slides, Sheets, CSV, Word, EPUB, images, web URLs, YouTube | 50+ formats - documents, images, videos via LlamaCloud, Unstructured, or Docling (local) |
+| **Search** | Semantic search | Hybrid Search - Semantic + Full Text with Hierarchical Indices & Reciprocal Rank Fusion |
+| **Cited Answers** | Yes | Yes - Perplexity-style cited responses |
+| **Agentic Architecture** | No | Yes - powered by [LangChain Deep Agents](https://docs.langchain.com/oss/python/deepagents/overview) with planning, subagents, and file system access |
+| **Real-Time Multiplayer** | Shared notebooks with Viewer/Editor roles (no real-time chat) | RBAC with Owner / Admin / Editor / Viewer roles, real-time chat & comment threads |
+| **Video Generation** | Cinematic Video Overviews via Veo 3 (Ultra only) | Available (NotebookLM is better here, actively improving) |
+| **Presentation Generation** | Better looking slides but not editable | Create editable, slide-based presentations |
+| **Podcast Generation** | Audio Overviews with customizable hosts and languages | Available with multiple TTS providers (NotebookLM is better here, actively improving) |
+| **Browser Extension** | No | Cross-browser extension to save any webpage, including auth-protected pages |
Full list of External Sources
diff --git a/README.pt-BR.md b/README.pt-BR.md
index 4306b0767..50a8b739e 100644
--- a/README.pt-BR.md
+++ b/README.pt-BR.md
@@ -21,9 +21,28 @@
# SurfSense
-Conecte qualquer LLM às suas fontes de conhecimento internas e converse com ele em tempo real junto com sua equipe. Alternativa de código aberto ao NotebookLM, Perplexity e Glean.
-SurfSense é um agente de pesquisa de IA altamente personalizável, conectado a fontes externas como mecanismos de busca (SearxNG, Tavily, LinkUp), Google Drive, OneDrive, Dropbox, Slack, Microsoft Teams, Linear, Jira, ClickUp, Confluence, BookStack, Gmail, Notion, YouTube, GitHub, Discord, Airtable, Google Calendar, Luma, Circleback, Elasticsearch, Obsidian e mais por vir.
+O NotebookLM é uma das melhores e mais úteis plataformas de IA disponíveis, mas, quando você começa a usá-lo regularmente, suas limitações começam a deixar a desejar.
+
+1. Há limites no número de fontes que você pode adicionar a um notebook.
+2. Há limites no número de notebooks que você pode ter.
+3. Você não pode ter fontes que excedam 500.000 palavras ou 200MB.
+4. Você fica preso aos serviços do Google (LLMs, modelos de uso, etc.) sem opção de configurá-los.
+5. Fontes de dados externas e integrações de serviços limitadas.
+6. O agente do NotebookLM é especificamente otimizado apenas para estudar e pesquisar, mas você pode fazer muito mais com os dados de origem.
+7. Falta de suporte multiplayer.
+
+...e mais.
+
+**O SurfSense foi feito especificamente para resolver esses problemas.** O SurfSense permite que você:
+
+- **Controle Seu Fluxo de Dados** - Mantenha seus dados privados e seguros.
+- **Sem Limites de Dados** - Adicione uma quantidade ilimitada de fontes e notebooks.
+- **Sem Dependência de Fornecedor** - Configure qualquer modelo LLM, de imagem, TTS e STT.
+- **25+ Fontes de Dados Externas** - Adicione suas fontes do Google Drive, OneDrive, Dropbox, Notion e muitos outros serviços externos.
+- **Suporte Multiplayer em Tempo Real** - Trabalhe facilmente com os membros da sua equipe em um notebook compartilhado.
+
+...e mais por vir.
@@ -34,7 +53,7 @@ https://github.com/user-attachments/assets/cc0c84d3-1f2f-4f7a-b519-2ecce22310b1
## Exemplo de Agente de Vídeo
-https://github.com/user-attachments/assets/cc977e6d-8292-4ffe-abb8-3b0560ef5562
+https://github.com/user-attachments/assets/012a7ffa-6f76-4f06-9dda-7632b470057a
@@ -133,24 +152,29 @@ Para Docker Compose, instalação manual e outras opções de implantação, con
-## Funcionalidades Principais
+## SurfSense vs Google NotebookLM
-| Funcionalidade | Descrição |
-|----------------|-----------|
-| Alternativa OSS | Substituto direto do NotebookLM, Perplexity e Glean com colaboração em equipe em tempo real |
-| 50+ Formatos de Arquivo | Faça upload de documentos, imagens, vídeos via LlamaCloud, Unstructured ou Docling (local) |
-| Busca Híbrida | Semântica + Texto completo com Índices Hierárquicos e Reciprocal Rank Fusion |
-| Respostas com Citações | Converse com sua base de conhecimento e obtenha respostas citadas no estilo Perplexity |
-| Arquitetura de Agentes Profundos | Alimentado por [LangChain Deep Agents](https://docs.langchain.com/oss/python/deepagents/overview) com planejamento, subagentes e acesso ao sistema de arquivos |
-| Suporte Universal de LLM | 100+ LLMs, 6000+ modelos de embeddings, todos os principais rerankers via OpenAI spec e LiteLLM |
-| Privacidade em Primeiro Lugar | Suporte completo a LLM local (vLLM, Ollama) seus dados ficam com você |
-| Colaboração em Equipe | RBAC com papéis de Proprietário / Admin / Editor / Visualizador, chat em tempo real e threads de comentários |
-| Geração de Vídeos | Gera vídeos com narração e visuais |
-| Geração de Apresentações | Cria apresentações editáveis baseadas em slides |
-| Geração de Podcasts | Podcast de 3 min em menos de 20 segundos; múltiplos provedores TTS (OpenAI, Azure, Kokoro) |
-| Extensão de Navegador | Extensão multi-navegador para salvar qualquer página web, incluindo páginas protegidas por autenticação |
-| 27+ Conectores | Mecanismos de busca, Google Drive, OneDrive, Dropbox, Slack, Teams, Jira, Notion, GitHub, Discord e [mais](#fontes-externas) |
-| Auto-Hospedável | Código aberto, Docker em um único comando ou Docker Compose completo para produção |
+| Recurso | Google NotebookLM | SurfSense |
+|---------|-------------------|-----------|
+| **Fontes por Notebook** | 50 (Grátis) a 600 (Ultra, $249.99/mês) | Ilimitadas |
+| **Número de Notebooks** | 100 (Grátis) a 500 (planos pagos) | Ilimitados |
+| **Limite de Tamanho da Fonte** | 500.000 palavras / 200MB por fonte | Sem limite |
+| **Preços** | Nível gratuito disponível; Pro $19.99/mês, Ultra $249.99/mês | Gratuito e de código aberto, auto-hospedável na sua própria infra |
+| **Suporte a LLM** | Apenas Google Gemini | 100+ LLMs via OpenAI spec e LiteLLM |
+| **Modelos de Embeddings** | Apenas Google | 6.000+ modelos de embeddings, todos os principais rerankers |
+| **LLMs Locais / Privados** | Não disponível | Suporte completo (vLLM, Ollama) - seus dados ficam com você |
+| **Auto-Hospedável** | Não | Sim - Docker em um único comando ou Docker Compose completo |
+| **Código Aberto** | Não | Sim |
+| **Conectores Externos** | Google Drive, YouTube, sites | 27+ conectores - Mecanismos de busca, Google Drive, OneDrive, Dropbox, Slack, Teams, Jira, Notion, GitHub, Discord e [mais](#fontes-externas) |
+| **Suporte a Formatos de Arquivo** | PDFs, Docs, Slides, Sheets, CSV, Word, EPUB, imagens, URLs web, YouTube | 50+ formatos - documentos, imagens, vídeos via LlamaCloud, Unstructured ou Docling (local) |
+| **Busca** | Busca semântica | Busca Híbrida - Semântica + Texto completo com Índices Hierárquicos e Reciprocal Rank Fusion |
+| **Respostas com Citações** | Sim | Sim - Respostas citadas no estilo Perplexity |
+| **Arquitetura de Agentes** | Não | Sim - alimentado por [LangChain Deep Agents](https://docs.langchain.com/oss/python/deepagents/overview) com planejamento, subagentes e acesso ao sistema de arquivos |
+| **Multiplayer em Tempo Real** | Notebooks compartilhados com papéis de Visualizador/Editor (sem chat em tempo real) | RBAC com papéis de Proprietário / Admin / Editor / Visualizador, chat em tempo real e threads de comentários |
+| **Geração de Vídeos** | Visões gerais cinemáticas via Veo 3 (apenas Ultra) | Disponível (NotebookLM é melhor aqui, melhorando ativamente) |
+| **Geração de Apresentações** | Slides mais bonitos mas não editáveis | Cria apresentações editáveis baseadas em slides |
+| **Geração de Podcasts** | Visões gerais em áudio com hosts e idiomas personalizáveis | Disponível com múltiplos provedores TTS (NotebookLM é melhor aqui, melhorando ativamente) |
+| **Extensão de Navegador** | Não | Extensão multi-navegador para salvar qualquer página web, incluindo páginas protegidas por autenticação |
Lista completa de Fontes Externas
diff --git a/README.zh-CN.md b/README.zh-CN.md
index 96ebb25ad..419a831ae 100644
--- a/README.zh-CN.md
+++ b/README.zh-CN.md
@@ -21,9 +21,28 @@
# SurfSense
-将任何 LLM 连接到您的内部知识源,并与团队成员实时聊天。NotebookLM、Perplexity 和 Glean 的开源替代方案。
-SurfSense 是一个高度可定制的 AI 研究助手,可以连接外部数据源,如搜索引擎(SearxNG、Tavily、LinkUp)、Google Drive、OneDrive、Dropbox、Slack、Microsoft Teams、Linear、Jira、ClickUp、Confluence、BookStack、Gmail、Notion、YouTube、GitHub、Discord、Airtable、Google Calendar、Luma、Circleback、Elasticsearch、Obsidian 等,未来还会支持更多。
+NotebookLM 是目前最好、最实用的 AI 平台之一,但当你开始经常使用它时,它的局限性就会逐渐显现出来。
+
+1. 一个笔记本中可以添加的来源数量有限制。
+2. 可以拥有的笔记本数量有限制。
+3. 来源不能超过 500,000 个单词或 200MB。
+4. 你被锁定在 Google 服务中(LLM、使用模型等),没有配置选项。
+5. 有限的外部数据源和服务集成。
+6. NotebookLM 代理专门针对学习和研究进行了优化,但你可以用源数据做更多事情。
+7. 缺乏多人协作支持。
+
+...还有更多。
+
+**SurfSense 正是为了解决这些问题而生。** SurfSense 让你能够:
+
+- **控制你的数据流** - 保持数据私密和安全。
+- **无数据限制** - 添加无限数量的来源和笔记本。
+- **无供应商锁定** - 配置任何 LLM、图像、TTS 和 STT 模型。
+- **25+ 外部数据源** - 从 Google Drive、OneDrive、Dropbox、Notion 和许多其他外部服务添加你的来源。
+- **实时多人协作支持** - 在共享笔记本中轻松与团队成员协作。
+
+...更多功能即将推出。
@@ -34,7 +53,7 @@ https://github.com/user-attachments/assets/cc0c84d3-1f2f-4f7a-b519-2ecce22310b1
## 视频代理示例
-https://github.com/user-attachments/assets/cc977e6d-8292-4ffe-abb8-3b0560ef5562
+https://github.com/user-attachments/assets/012a7ffa-6f76-4f06-9dda-7632b470057a
@@ -133,24 +152,29 @@ irm https://raw.githubusercontent.com/MODSetter/SurfSense/main/docker/scripts/in
-## 核心功能
+## SurfSense vs Google NotebookLM
-| 功能 | 描述 |
-|------|------|
-| 开源替代方案 | 支持实时团队协作的 NotebookLM、Perplexity 和 Glean 替代品 |
-| 50+ 文件格式 | 通过 LlamaCloud、Unstructured 或 Docling(本地)上传文档、图像、视频 |
-| 混合搜索 | 语义搜索 + 全文搜索,结合层次化索引和倒数排名融合 |
-| 引用回答 | 与知识库对话,获得 Perplexity 风格的引用回答 |
-| 深度代理架构 | 基于 [LangChain Deep Agents](https://docs.langchain.com/oss/python/deepagents/overview) 构建,支持规划、子代理和文件系统访问 |
-| 通用 LLM 支持 | 100+ LLM、6000+ 嵌入模型、所有主流重排序器,通过 OpenAI spec 和 LiteLLM |
-| 隐私优先 | 完整本地 LLM 支持(vLLM、Ollama),您的数据由您掌控 |
-| 团队协作 | RBAC 角色控制(所有者/管理员/编辑者/查看者),实时聊天和评论线程 |
-| 视频生成 | 生成带有旁白和视觉效果的视频 |
-| 演示文稿生成 | 创建可编辑的幻灯片式演示文稿 |
-| 播客生成 | 20 秒内生成 3 分钟播客;多种 TTS 提供商(OpenAI、Azure、Kokoro) |
-| 浏览器扩展 | 跨浏览器扩展,保存任何网页,包括需要身份验证的页面 |
-| 27+ 连接器 | 搜索引擎、Google Drive、OneDrive、Dropbox、Slack、Teams、Jira、Notion、GitHub、Discord 等[更多](#外部数据源) |
-| 可自托管 | 开源,Docker 一行命令或完整 Docker Compose 用于生产环境 |
+| 功能 | Google NotebookLM | SurfSense |
+|---------|-------------------|-----------|
+| **每个笔记本的来源数** | 50(免费)到 600(Ultra,$249.99/月) | 无限制 |
+| **笔记本数量** | 100(免费)到 500(付费方案) | 无限制 |
+| **来源大小限制** | 500,000 词 / 200MB 每个来源 | 无限制 |
+| **定价** | 免费版可用;Pro $19.99/月,Ultra $249.99/月 | 免费开源,在自己的基础设施上自托管 |
+| **LLM 支持** | 仅 Google Gemini | 100+ LLM,通过 OpenAI spec 和 LiteLLM |
+| **嵌入模型** | 仅 Google | 6,000+ 嵌入模型,所有主流重排序器 |
+| **本地 / 私有 LLM** | 不可用 | 完整支持(vLLM、Ollama)- 您的数据由您掌控 |
+| **可自托管** | 否 | 是 - Docker 一行命令或完整 Docker Compose |
+| **开源** | 否 | 是 |
+| **外部连接器** | Google Drive、YouTube、网站 | 27+ 连接器 - 搜索引擎、Google Drive、OneDrive、Dropbox、Slack、Teams、Jira、Notion、GitHub、Discord 等[更多](#外部数据源) |
+| **文件格式支持** | PDF、Docs、Slides、Sheets、CSV、Word、EPUB、图像、网页 URL、YouTube | 50+ 格式 - 文档、图像、视频,通过 LlamaCloud、Unstructured 或 Docling(本地) |
+| **搜索** | 语义搜索 | 混合搜索 - 语义 + 全文搜索,结合层次化索引和倒数排名融合 |
+| **引用回答** | 是 | 是 - Perplexity 风格的引用回答 |
+| **代理架构** | 否 | 是 - 基于 [LangChain Deep Agents](https://docs.langchain.com/oss/python/deepagents/overview) 构建,支持规划、子代理和文件系统访问 |
+| **实时多人协作** | 共享笔记本,支持查看者/编辑者角色(无实时聊天) | RBAC 角色控制(所有者/管理员/编辑者/查看者),实时聊天和评论线程 |
+| **视频生成** | 通过 Veo 3 的电影级视频概览(仅 Ultra) | 可用(NotebookLM 在此方面更好,正在积极改进) |
+| **演示文稿生成** | 更美观的幻灯片但不可编辑 | 创建可编辑的幻灯片式演示文稿 |
+| **播客生成** | 可自定义主持人和语言的音频概览 | 可用,支持多种 TTS 提供商(NotebookLM 在此方面更好,正在积极改进) |
+| **浏览器扩展** | 否 | 跨浏览器扩展,保存任何网页,包括需要身份验证的页面 |
外部数据源完整列表
diff --git a/surfsense_backend/alembic/versions/116_create_zero_publication.py b/surfsense_backend/alembic/versions/116_create_zero_publication.py
index 8f0d7b5d3..ff74952a9 100644
--- a/surfsense_backend/alembic/versions/116_create_zero_publication.py
+++ b/surfsense_backend/alembic/versions/116_create_zero_publication.py
@@ -42,9 +42,7 @@ def upgrade() -> None:
if not exists:
table_list = ", ".join(TABLES)
conn.execute(
- sa.text(
- f"CREATE PUBLICATION {PUBLICATION_NAME} FOR TABLE {table_list}"
- )
+ sa.text(f"CREATE PUBLICATION {PUBLICATION_NAME} FOR TABLE {table_list}")
)
diff --git a/surfsense_backend/alembic/versions/117_optimize_zero_publication_column_lists.py b/surfsense_backend/alembic/versions/117_optimize_zero_publication_column_lists.py
new file mode 100644
index 000000000..78a26a381
--- /dev/null
+++ b/surfsense_backend/alembic/versions/117_optimize_zero_publication_column_lists.py
@@ -0,0 +1,123 @@
+"""optimize zero_publication with column lists
+
+Recreates the zero_publication using column lists for the documents
+table so that large text columns (content, source_markdown,
+blocknote_document, etc.) are excluded from WAL replication.
+This prevents RangeError: Invalid string length in zero-cache's
+change-streamer when documents have very large content.
+
+Also resets REPLICA IDENTITY to DEFAULT on tables that had it set
+to FULL for the old Electric SQL setup (migration 66/75/76).
+With DEFAULT (primary-key) identity, column-list publications
+only need to include the PK — not every column.
+
+IMPORTANT: apply this migration using the following procedure:
+ 1. Stop zero-cache (it holds replication locks that will deadlock DDL)
+ 2. Run: alembic upgrade head
+ 3. Delete / reset the zero-cache data volume
+ 4. Restart zero-cache (it will do a fresh initial sync)
+
+Revision ID: 117
+Revises: 116
+"""
+
+from collections.abc import Sequence
+
+import sqlalchemy as sa
+
+from alembic import op
+
+revision: str = "117"
+down_revision: str | None = "116"
+branch_labels: str | Sequence[str] | None = None
+depends_on: str | Sequence[str] | None = None
+
+PUBLICATION_NAME = "zero_publication"
+
+TABLES_WITH_FULL_IDENTITY = [
+ "documents",
+ "notifications",
+ "search_source_connectors",
+ "new_chat_messages",
+ "chat_comments",
+ "chat_session_state",
+]
+
+DOCUMENT_COLS = [
+ "id",
+ "title",
+ "document_type",
+ "search_space_id",
+ "folder_id",
+ "created_by_id",
+ "status",
+ "created_at",
+ "updated_at",
+]
+
+PUBLICATION_DDL_FULL = f"""\
+CREATE PUBLICATION {PUBLICATION_NAME} FOR TABLE
+ notifications, documents, folders,
+ search_source_connectors, new_chat_messages,
+ chat_comments, chat_session_state
+"""
+
+
+def _terminate_blocked_pids(conn, table: str) -> None:
+ """Kill backends whose locks on *table* would block our AccessExclusiveLock."""
+ conn.execute(
+ sa.text(
+ "SELECT pg_terminate_backend(l.pid) "
+ "FROM pg_locks l "
+ "JOIN pg_class c ON c.oid = l.relation "
+ "WHERE c.relname = :tbl "
+ " AND l.pid != pg_backend_pid()"
+ ),
+ {"tbl": table},
+ )
+
+
+def upgrade() -> None:
+ conn = op.get_bind()
+
+ conn.execute(sa.text("SET lock_timeout = '10s'"))
+
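+    # Lock tables in a deterministic (sorted) order so two concurrent
+    # migration runs cannot deadlock with each other while acquiring
+    # ACCESS EXCLUSIVE locks.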
+ for tbl in sorted(TABLES_WITH_FULL_IDENTITY):
+ _terminate_blocked_pids(conn, tbl)
+ conn.execute(sa.text(f'LOCK TABLE "{tbl}" IN ACCESS EXCLUSIVE MODE'))
+
+ for tbl in TABLES_WITH_FULL_IDENTITY:
+ conn.execute(sa.text(f'ALTER TABLE "{tbl}" REPLICA IDENTITY DEFAULT'))
+
+ conn.execute(sa.text(f"DROP PUBLICATION IF EXISTS {PUBLICATION_NAME}"))
+
+ has_zero_ver = conn.execute(
+ sa.text(
+ "SELECT 1 FROM information_schema.columns "
+ "WHERE table_name = 'documents' AND column_name = '_0_version'"
+ )
+ ).fetchone()
+
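+    # `_0_version` is zero-cache's own bookkeeping column; when it exists,
+    # keep it in the column list so it continues to replicate alongside the
+    # document metadata (an assumption about zero-cache's requirements).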
+ cols = DOCUMENT_COLS + (['"_0_version"'] if has_zero_ver else [])
+ col_list = ", ".join(cols)
+
+ conn.execute(
+ sa.text(
+ f"CREATE PUBLICATION {PUBLICATION_NAME} FOR TABLE "
+ f"notifications, "
+ f"documents ({col_list}), "
+ f"folders, "
+ f"search_source_connectors, "
+ f"new_chat_messages, "
+ f"chat_comments, "
+ f"chat_session_state"
+ )
+ )
+
+
+def downgrade() -> None:
+ conn = op.get_bind()
+ conn.execute(sa.text(f"DROP PUBLICATION IF EXISTS {PUBLICATION_NAME}"))
+ conn.execute(sa.text(PUBLICATION_DDL_FULL))
+ for tbl in TABLES_WITH_FULL_IDENTITY:
+ conn.execute(sa.text(f'ALTER TABLE "{tbl}" REPLICA IDENTITY FULL'))
diff --git a/surfsense_backend/app/agents/new_chat/chat_deepagent.py b/surfsense_backend/app/agents/new_chat/chat_deepagent.py
index ccc06f272..fc1e80d28 100644
--- a/surfsense_backend/app/agents/new_chat/chat_deepagent.py
+++ b/surfsense_backend/app/agents/new_chat/chat_deepagent.py
@@ -159,6 +159,7 @@ async def create_surfsense_deep_agent(
additional_tools: Sequence[BaseTool] | None = None,
firecrawl_api_key: str | None = None,
thread_visibility: ChatVisibility | None = None,
+ mentioned_document_ids: list[int] | None = None,
):
"""
Create a SurfSense deep agent with configurable tools and prompts.
@@ -451,6 +452,7 @@ async def create_surfsense_deep_agent(
search_space_id=search_space_id,
available_connectors=available_connectors,
available_document_types=available_document_types,
+ mentioned_document_ids=mentioned_document_ids,
),
SurfSenseFilesystemMiddleware(
search_space_id=search_space_id,
diff --git a/surfsense_backend/app/agents/new_chat/middleware/filesystem.py b/surfsense_backend/app/agents/new_chat/middleware/filesystem.py
index 41b24f88b..d7697ef15 100644
--- a/surfsense_backend/app/agents/new_chat/middleware/filesystem.py
+++ b/surfsense_backend/app/agents/new_chat/middleware/filesystem.py
@@ -66,6 +66,16 @@ the ``, identify chunks marked `matched="true"`, then use
those sections instead of reading the entire file sequentially.
Use `` values as citation IDs in your answers.
+
+## User-Mentioned Documents
+
+When the `ls` output lists paths under the header
+`USER-MENTIONED documents (read these thoroughly before answering):`,
+the user **explicitly selected** those documents. These files are your
+highest-priority sources:
+1. **Always read them thoroughly** — scan the full ``, then read
+ all major sections, not just matched chunks.
+2. **Prefer their content** over other search results when answering.
+3. **Cite from them first** whenever applicable.
"""
# =============================================================================
diff --git a/surfsense_backend/app/agents/new_chat/middleware/knowledge_search.py b/surfsense_backend/app/agents/new_chat/middleware/knowledge_search.py
index 3728f229c..7b0dd2f71 100644
--- a/surfsense_backend/app/agents/new_chat/middleware/knowledge_search.py
+++ b/surfsense_backend/app/agents/new_chat/middleware/knowledge_search.py
@@ -28,7 +28,13 @@ from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession
from app.agents.new_chat.utils import parse_date_or_datetime, resolve_date_range
-from app.db import NATIVE_TO_LEGACY_DOCTYPE, Document, Folder, shielded_async_session
+from app.db import (
+ NATIVE_TO_LEGACY_DOCTYPE,
+ Chunk,
+ Document,
+ Folder,
+ shielded_async_session,
+)
from app.retriever.chunks_hybrid_search import ChucksHybridSearchRetriever
from app.utils.document_converters import embed_texts
from app.utils.perf import get_perf_logger
@@ -430,21 +436,36 @@ async def _get_folder_paths(
def _build_synthetic_ls(
existing_files: dict[str, Any] | None,
new_files: dict[str, Any],
+ *,
+ mentioned_paths: set[str] | None = None,
) -> tuple[AIMessage, ToolMessage]:
"""Build a synthetic ls("/documents") tool-call + result for the LLM context.
- Paths are listed with *new* (rank-ordered) files first, then existing files
- that were already in state from prior turns.
+ Mentioned files are listed first. A separate header tells the LLM which
+ files the user explicitly selected; the path list itself stays clean so
+ paths can be passed directly to ``read_file`` without stripping tags.
"""
+ _mentioned = mentioned_paths or set()
merged: dict[str, Any] = {**(existing_files or {}), **new_files}
doc_paths = [
p for p, v in merged.items() if p.startswith("/documents/") and v is not None
]
new_set = set(new_files)
- new_paths = [p for p in doc_paths if p in new_set]
+ mentioned_list = [p for p in doc_paths if p in _mentioned]
+ new_non_mentioned = [p for p in doc_paths if p in new_set and p not in _mentioned]
old_paths = [p for p in doc_paths if p not in new_set]
- ordered = new_paths + old_paths
+ ordered = mentioned_list + new_non_mentioned + old_paths
+
+ parts: list[str] = []
+ if mentioned_list:
+ parts.append(
+ "USER-MENTIONED documents (read these thoroughly before answering):"
+ )
+ for p in mentioned_list:
+ parts.append(f" {p}")
+ parts.append("")
+ parts.append(str(ordered) if ordered else "No documents found.")
tool_call_id = f"auto_ls_{uuid.uuid4().hex[:12]}"
ai_msg = AIMessage(
@@ -452,7 +473,7 @@ def _build_synthetic_ls(
tool_calls=[{"name": "ls", "args": {"path": "/documents"}, "id": tool_call_id}],
)
tool_msg = ToolMessage(
- content=str(ordered) if ordered else "No documents found.",
+ content="\n".join(parts),
tool_call_id=tool_call_id,
)
return ai_msg, tool_msg
@@ -524,12 +545,92 @@ async def search_knowledge_base(
return results[:top_k]
+async def fetch_mentioned_documents(
+ *,
+ document_ids: list[int],
+ search_space_id: int,
+) -> list[dict[str, Any]]:
+ """Fetch explicitly mentioned documents with *all* their chunks.
+
+ Returns the same dict structure as ``search_knowledge_base`` so results
+ can be merged directly into ``build_scoped_filesystem``. Unlike search
+ results, every chunk is included (no top-K limiting) and none are marked
+ as ``matched`` since the entire document is relevant by virtue of the
+ user's explicit mention.
+ """
+ if not document_ids:
+ return []
+
+ async with shielded_async_session() as session:
+ doc_result = await session.execute(
+ select(Document).where(
+ Document.id.in_(document_ids),
+ Document.search_space_id == search_space_id,
+ )
+ )
+ docs = {doc.id: doc for doc in doc_result.scalars().all()}
+
+ if not docs:
+ return []
+
+ chunk_result = await session.execute(
+ select(Chunk.id, Chunk.content, Chunk.document_id)
+ .where(Chunk.document_id.in_(list(docs.keys())))
+ .order_by(Chunk.document_id, Chunk.id)
+ )
+ chunks_by_doc: dict[int, list[dict[str, Any]]] = {doc_id: [] for doc_id in docs}
+ for row in chunk_result.all():
+ if row.document_id in chunks_by_doc:
+ chunks_by_doc[row.document_id].append(
+ {"chunk_id": row.id, "content": row.content}
+ )
+
+ results: list[dict[str, Any]] = []
+ for doc_id in document_ids:
+ doc = docs.get(doc_id)
+ if doc is None:
+ continue
+ metadata = doc.document_metadata or {}
+ results.append(
+ {
+ "document_id": doc.id,
+ "content": "",
+ "score": 1.0,
+ "chunks": chunks_by_doc.get(doc.id, []),
+ "matched_chunk_ids": [],
+ "document": {
+ "id": doc.id,
+ "title": doc.title,
+ "document_type": (
+ doc.document_type.value
+ if getattr(doc, "document_type", None)
+ else None
+ ),
+ "metadata": metadata,
+ },
+ "source": (
+ doc.document_type.value
+ if getattr(doc, "document_type", None)
+ else None
+ ),
+ "_user_mentioned": True,
+ }
+ )
+ return results
+
+
async def build_scoped_filesystem(
*,
documents: Sequence[dict[str, Any]],
search_space_id: int,
-) -> dict[str, dict[str, str]]:
- """Build a StateBackend-compatible files dict from search results."""
+) -> tuple[dict[str, dict[str, str]], dict[int, str]]:
+ """Build a StateBackend-compatible files dict from search results.
+
+ Returns ``(files, doc_id_to_path)`` so callers can reliably map a
+ document id back to its filesystem path without guessing by title.
+ Paths are collision-proof: when two documents resolve to the same
+ path the doc-id is appended to disambiguate.
+ """
async with shielded_async_session() as session:
folder_paths = await _get_folder_paths(session, search_space_id)
doc_ids = [
@@ -551,6 +652,7 @@ async def build_scoped_filesystem(
}
files: dict[str, dict[str, str]] = {}
+ doc_id_to_path: dict[int, str] = {}
for document in documents:
doc_meta = document.get("document") or {}
title = str(doc_meta.get("title") or "untitled")
@@ -559,6 +661,9 @@ async def build_scoped_filesystem(
base_folder = folder_paths.get(folder_id, "/documents")
file_name = _safe_filename(title)
path = f"{base_folder}/{file_name}"
+ if path in files:
+ stem = file_name.removesuffix(".xml")
+ path = f"{base_folder}/{stem} ({doc_id}).xml"
matched_ids = set(document.get("matched_chunk_ids") or [])
xml_content = _build_document_xml(document, matched_chunk_ids=matched_ids)
files[path] = {
@@ -567,7 +672,9 @@ async def build_scoped_filesystem(
"created_at": "",
"modified_at": "",
}
- return files
+ if isinstance(doc_id, int):
+ doc_id_to_path[doc_id] = path
+ return files, doc_id_to_path
class KnowledgeBaseSearchMiddleware(AgentMiddleware): # type: ignore[type-arg]
@@ -583,12 +690,14 @@ class KnowledgeBaseSearchMiddleware(AgentMiddleware): # type: ignore[type-arg]
available_connectors: list[str] | None = None,
available_document_types: list[str] | None = None,
top_k: int = 10,
+ mentioned_document_ids: list[int] | None = None,
) -> None:
self.llm = llm
self.search_space_id = search_space_id
self.available_connectors = available_connectors
self.available_document_types = available_document_types
self.top_k = top_k
+ self.mentioned_document_ids = mentioned_document_ids or []
async def _plan_search_inputs(
self,
@@ -680,6 +789,18 @@ class KnowledgeBaseSearchMiddleware(AgentMiddleware): # type: ignore[type-arg]
user_text=user_text,
)
+ # --- 1. Fetch mentioned documents (user-selected, all chunks) ---
+ mentioned_results: list[dict[str, Any]] = []
+ if self.mentioned_document_ids:
+ mentioned_results = await fetch_mentioned_documents(
+ document_ids=self.mentioned_document_ids,
+ search_space_id=self.search_space_id,
+ )
+ # Clear after first turn so they are not re-fetched on subsequent
+ # messages within the same agent instance.
+ self.mentioned_document_ids = []
+
+ # --- 2. Run KB hybrid search ---
search_results = await search_knowledge_base(
query=planned_query,
search_space_id=self.search_space_id,
@@ -689,19 +810,50 @@ class KnowledgeBaseSearchMiddleware(AgentMiddleware): # type: ignore[type-arg]
start_date=start_date,
end_date=end_date,
)
- new_files = await build_scoped_filesystem(
- documents=search_results,
+
+ # --- 3. Merge: mentioned first, then search (dedup by doc id) ---
+ seen_doc_ids: set[int] = set()
+ merged: list[dict[str, Any]] = []
+ for doc in mentioned_results:
+ doc_id = (doc.get("document") or {}).get("id")
+ if doc_id is not None:
+ seen_doc_ids.add(doc_id)
+ merged.append(doc)
+ for doc in search_results:
+ doc_id = (doc.get("document") or {}).get("id")
+ if doc_id is not None and doc_id in seen_doc_ids:
+ continue
+ merged.append(doc)
+
+ # --- 4. Build scoped filesystem ---
+ new_files, doc_id_to_path = await build_scoped_filesystem(
+ documents=merged,
search_space_id=self.search_space_id,
)
- ai_msg, tool_msg = _build_synthetic_ls(existing_files, new_files)
+ # Identify which paths belong to user-mentioned documents using
+ # the authoritative doc_id -> path mapping (no title guessing).
+ mentioned_doc_ids = {
+ (d.get("document") or {}).get("id") for d in mentioned_results
+ }
+ mentioned_paths = {
+ doc_id_to_path[did] for did in mentioned_doc_ids if did in doc_id_to_path
+ }
+
+ ai_msg, tool_msg = _build_synthetic_ls(
+ existing_files,
+ new_files,
+ mentioned_paths=mentioned_paths,
+ )
if t0 is not None:
_perf_log.info(
- "[kb_fs_middleware] completed in %.3fs query=%r optimized=%r new_files=%d total=%d",
+ "[kb_fs_middleware] completed in %.3fs query=%r optimized=%r "
+ "mentioned=%d new_files=%d total=%d",
asyncio.get_event_loop().time() - t0,
user_text[:80],
planned_query[:120],
+ len(mentioned_results),
len(new_files),
len(new_files) + len(existing_files or {}),
)
diff --git a/surfsense_backend/app/routes/documents_routes.py b/surfsense_backend/app/routes/documents_routes.py
index 6e69218f1..f53c81bb6 100644
--- a/surfsense_backend/app/routes/documents_routes.py
+++ b/surfsense_backend/app/routes/documents_routes.py
@@ -1,7 +1,7 @@
# Force asyncio to use standard event loop before unstructured imports
import asyncio
-from fastapi import APIRouter, Depends, Form, HTTPException, UploadFile
+from fastapi import APIRouter, Depends, Form, HTTPException, Query, UploadFile
from sqlalchemy.ext.asyncio import AsyncSession
from sqlalchemy.future import select
from sqlalchemy.orm import selectinload
@@ -17,6 +17,7 @@ from app.db import (
get_async_session,
)
from app.schemas import (
+ ChunkRead,
DocumentRead,
DocumentsCreate,
DocumentStatusBatchResponse,
@@ -45,9 +46,7 @@ os.environ["UNSTRUCTURED_HAS_PATCHED_LOOP"] = "1"
router = APIRouter()
-MAX_FILES_PER_UPLOAD = 10
-MAX_FILE_SIZE_BYTES = 50 * 1024 * 1024 # 50 MB per file
-MAX_TOTAL_SIZE_BYTES = 200 * 1024 * 1024 # 200 MB total
+MAX_FILE_SIZE_BYTES = 500 * 1024 * 1024 # 500 MB per file
@router.post("/documents")
@@ -156,13 +155,6 @@ async def create_documents_file_upload(
if not files:
raise HTTPException(status_code=400, detail="No files provided")
- if len(files) > MAX_FILES_PER_UPLOAD:
- raise HTTPException(
- status_code=413,
- detail=f"Too many files. Maximum {MAX_FILES_PER_UPLOAD} files per upload.",
- )
-
- total_size = 0
for file in files:
file_size = file.size or 0
if file_size > MAX_FILE_SIZE_BYTES:
@@ -171,14 +163,6 @@ async def create_documents_file_upload(
detail=f"File '{file.filename}' ({file_size / (1024 * 1024):.1f} MB) "
f"exceeds the {MAX_FILE_SIZE_BYTES // (1024 * 1024)} MB per-file limit.",
)
- total_size += file_size
-
- if total_size > MAX_TOTAL_SIZE_BYTES:
- raise HTTPException(
- status_code=413,
- detail=f"Total upload size ({total_size / (1024 * 1024):.1f} MB) "
- f"exceeds the {MAX_TOTAL_SIZE_BYTES // (1024 * 1024)} MB limit.",
- )
# ===== Read all files concurrently to avoid blocking the event loop =====
async def _read_and_save(file: UploadFile) -> tuple[str, str, int]:
@@ -206,16 +190,6 @@ async def create_documents_file_upload(
saved_files = await asyncio.gather(*(_read_and_save(f) for f in files))
- actual_total_size = sum(size for _, _, size in saved_files)
- if actual_total_size > MAX_TOTAL_SIZE_BYTES:
- for temp_path, _, _ in saved_files:
- os.unlink(temp_path)
- raise HTTPException(
- status_code=413,
- detail=f"Total upload size ({actual_total_size / (1024 * 1024):.1f} MB) "
- f"exceeds the {MAX_TOTAL_SIZE_BYTES // (1024 * 1024)} MB limit.",
- )
-
# ===== PHASE 1: Create pending documents for all files =====
created_documents: list[Document] = []
files_to_process: list[tuple[Document, str, str]] = []
@@ -451,13 +425,15 @@ async def read_documents(
reason=doc.status.get("reason"),
)
+ raw_content = doc.content or ""
api_documents.append(
DocumentRead(
id=doc.id,
title=doc.title,
document_type=doc.document_type,
document_metadata=doc.document_metadata,
- content=doc.content,
+ content="",
+ content_preview=raw_content[:300],
content_hash=doc.content_hash,
unique_identifier_hash=doc.unique_identifier_hash,
created_at=doc.created_at,
@@ -609,13 +585,15 @@ async def search_documents(
reason=doc.status.get("reason"),
)
+ raw_content = doc.content or ""
api_documents.append(
DocumentRead(
id=doc.id,
title=doc.title,
document_type=doc.document_type,
document_metadata=doc.document_metadata,
- content=doc.content,
+ content="",
+ content_preview=raw_content[:300],
content_hash=doc.content_hash,
unique_identifier_hash=doc.unique_identifier_hash,
created_at=doc.created_at,
@@ -884,16 +862,19 @@ async def get_document_type_counts(
@router.get("/documents/by-chunk/{chunk_id}", response_model=DocumentWithChunksRead)
async def get_document_by_chunk_id(
chunk_id: int,
+ chunk_window: int = Query(
+ 5, ge=0, description="Number of chunks before/after the cited chunk to include"
+ ),
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
"""
- Retrieves a document based on a chunk ID, including all its chunks ordered by creation time.
- Requires DOCUMENTS_READ permission for the search space.
- The document's embedding and chunk embeddings are excluded from the response.
+ Retrieves a document based on a chunk ID, including a window of chunks around the cited one.
+ Uses SQL-level pagination to avoid loading all chunks into memory.
"""
try:
- # First, get the chunk and verify it exists
+ from sqlalchemy import and_, func, or_
+
chunk_result = await session.execute(select(Chunk).filter(Chunk.id == chunk_id))
chunk = chunk_result.scalars().first()
@@ -902,11 +883,8 @@ async def get_document_by_chunk_id(
status_code=404, detail=f"Chunk with id {chunk_id} not found"
)
- # Get the associated document
document_result = await session.execute(
- select(Document)
- .options(selectinload(Document.chunks))
- .filter(Document.id == chunk.document_id)
+ select(Document).filter(Document.id == chunk.document_id)
)
document = document_result.scalars().first()
@@ -916,7 +894,6 @@ async def get_document_by_chunk_id(
detail="Document not found",
)
- # Check permission for the search space
await check_permission(
session,
user,
@@ -925,10 +902,38 @@ async def get_document_by_chunk_id(
"You don't have permission to read documents in this search space",
)
- # Sort chunks by creation time
- sorted_chunks = sorted(document.chunks, key=lambda x: x.created_at)
+ total_result = await session.execute(
+ select(func.count())
+ .select_from(Chunk)
+ .filter(Chunk.document_id == document.id)
+ )
+ total_chunks = total_result.scalar() or 0
+
+ cited_idx_result = await session.execute(
+ select(func.count())
+ .select_from(Chunk)
+ .filter(
+ Chunk.document_id == document.id,
+ or_(
+ Chunk.created_at < chunk.created_at,
+ and_(Chunk.created_at == chunk.created_at, Chunk.id < chunk.id),
+ ),
+ )
+ )
+ cited_idx = cited_idx_result.scalar() or 0
+
+ start = max(0, cited_idx - chunk_window)
+ end = min(total_chunks, cited_idx + chunk_window + 1)
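+        # Window arithmetic example: chunk_window=5 with cited_idx=2 gives
+        # start=0 and end=8; the window is clamped at the document start
+        # while still keeping the five chunks after the cited one.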
+
+ windowed_result = await session.execute(
+ select(Chunk)
+ .filter(Chunk.document_id == document.id)
+ .order_by(Chunk.created_at, Chunk.id)
+ .offset(start)
+ .limit(end - start)
+ )
+ windowed_chunks = windowed_result.scalars().all()
- # Return the document with its chunks
return DocumentWithChunksRead(
id=document.id,
title=document.title,
@@ -940,7 +945,9 @@ async def get_document_by_chunk_id(
created_at=document.created_at,
updated_at=document.updated_at,
search_space_id=document.search_space_id,
- chunks=sorted_chunks,
+ chunks=windowed_chunks,
+ total_chunks=total_chunks,
+ chunk_start_index=start,
)
except HTTPException:
raise
@@ -950,6 +957,75 @@ async def get_document_by_chunk_id(
) from e
+@router.get(
+ "/documents/{document_id}/chunks",
+ response_model=PaginatedResponse[ChunkRead],
+)
+async def get_document_chunks_paginated(
+ document_id: int,
+ page: int = Query(0, ge=0),
+ page_size: int = Query(20, ge=1, le=100),
+ start_offset: int | None = Query(
+ None, ge=0, description="Direct offset; overrides page * page_size"
+ ),
+ session: AsyncSession = Depends(get_async_session),
+ user: User = Depends(current_active_user),
+):
+ """
+ Paginated chunk loading for a document.
+ Supports both page-based and offset-based access.
+ """
+ try:
+ from sqlalchemy import func
+
+ doc_result = await session.execute(
+ select(Document).filter(Document.id == document_id)
+ )
+ document = doc_result.scalars().first()
+
+ if not document:
+ raise HTTPException(status_code=404, detail="Document not found")
+
+ await check_permission(
+ session,
+ user,
+ document.search_space_id,
+ Permission.DOCUMENTS_READ.value,
+ "You don't have permission to read documents in this search space",
+ )
+
+ total_result = await session.execute(
+ select(func.count())
+ .select_from(Chunk)
+ .filter(Chunk.document_id == document_id)
+ )
+ total = total_result.scalar() or 0
+
+ offset = start_offset if start_offset is not None else page * page_size
+ chunks_result = await session.execute(
+ select(Chunk)
+ .filter(Chunk.document_id == document_id)
+ .order_by(Chunk.created_at, Chunk.id)
+ .offset(offset)
+ .limit(page_size)
+ )
+ chunks = chunks_result.scalars().all()
+
+ return PaginatedResponse(
+ items=chunks,
+ total=total,
+ page=offset // page_size if page_size else page,
+ page_size=page_size,
+ has_more=(offset + len(chunks)) < total,
+ )
+ except HTTPException:
+ raise
+ except Exception as e:
+ raise HTTPException(
+ status_code=500, detail=f"Failed to fetch chunks: {e!s}"
+ ) from e
+
+
@router.get("/documents/{document_id}", response_model=DocumentRead)
async def read_document(
document_id: int,
@@ -980,13 +1056,14 @@ async def read_document(
"You don't have permission to read documents in this search space",
)
- # Convert database object to API-friendly format
+ raw_content = document.content or ""
return DocumentRead(
id=document.id,
title=document.title,
document_type=document.document_type,
document_metadata=document.document_metadata,
- content=document.content,
+ content=raw_content,
+ content_preview=raw_content[:300],
content_hash=document.content_hash,
unique_identifier_hash=document.unique_identifier_hash,
created_at=document.created_at,
diff --git a/surfsense_backend/app/routes/editor_routes.py b/surfsense_backend/app/routes/editor_routes.py
index f54f18def..09a35c619 100644
--- a/surfsense_backend/app/routes/editor_routes.py
+++ b/surfsense_backend/app/routes/editor_routes.py
@@ -15,11 +15,10 @@ import pypandoc
import typst
from fastapi import APIRouter, Depends, HTTPException, Query
from fastapi.responses import StreamingResponse
-from sqlalchemy import select
+from sqlalchemy import func, select
from sqlalchemy.ext.asyncio import AsyncSession
-from sqlalchemy.orm import selectinload
-from app.db import Document, DocumentType, Permission, User, get_async_session
+from app.db import Chunk, Document, DocumentType, Permission, User, get_async_session
from app.routes.reports_routes import (
_FILE_EXTENSIONS,
_MEDIA_TYPES,
@@ -44,6 +43,9 @@ router = APIRouter()
async def get_editor_content(
search_space_id: int,
document_id: int,
+ max_length: int | None = Query(
+ None, description="Truncate source_markdown to this many characters"
+ ),
session: AsyncSession = Depends(get_async_session),
user: User = Depends(current_active_user),
):
@@ -65,9 +67,7 @@ async def get_editor_content(
)
result = await session.execute(
- select(Document)
- .options(selectinload(Document.chunks))
- .filter(
+ select(Document).filter(
Document.id == document_id,
Document.search_space_id == search_space_id,
)
@@ -77,62 +77,63 @@ async def get_editor_content(
if not document:
raise HTTPException(status_code=404, detail="Document not found")
- # Priority 1: Return source_markdown if it exists (check `is not None` to allow empty strings)
- if document.source_markdown is not None:
+ count_result = await session.execute(
+ select(func.count()).select_from(Chunk).filter(Chunk.document_id == document_id)
+ )
+ chunk_count = count_result.scalar() or 0
+
+ def _build_response(md: str) -> dict:
+ size_bytes = len(md.encode("utf-8"))
+ truncated = False
+ output_md = md
+        # max_length is documented in characters, so compare and slice by
+        # character count (size_bytes is still reported for the client).
+        if max_length is not None and len(md) > max_length:
+ output_md = md[:max_length]
+ truncated = True
return {
"document_id": document.id,
"title": document.title,
"document_type": document.document_type.value,
- "source_markdown": document.source_markdown,
+ "source_markdown": output_md,
+ "content_size_bytes": size_bytes,
+ "chunk_count": chunk_count,
+ "truncated": truncated,
"updated_at": document.updated_at.isoformat()
if document.updated_at
else None,
}
- # Priority 2: Lazy-migrate from blocknote_document (pure Python, no external deps)
+ if document.source_markdown is not None:
+ return _build_response(document.source_markdown)
+
if document.blocknote_document:
from app.utils.blocknote_to_markdown import blocknote_to_markdown
markdown = blocknote_to_markdown(document.blocknote_document)
if markdown:
- # Persist the migration so we don't repeat it
document.source_markdown = markdown
await session.commit()
- return {
- "document_id": document.id,
- "title": document.title,
- "document_type": document.document_type.value,
- "source_markdown": markdown,
- "updated_at": document.updated_at.isoformat()
- if document.updated_at
- else None,
- }
+ return _build_response(markdown)
- # Priority 3: For NOTE type with no content, return empty markdown
if document.document_type == DocumentType.NOTE:
empty_markdown = ""
document.source_markdown = empty_markdown
await session.commit()
- return {
- "document_id": document.id,
- "title": document.title,
- "document_type": document.document_type.value,
- "source_markdown": empty_markdown,
- "updated_at": document.updated_at.isoformat()
- if document.updated_at
- else None,
- }
+ return _build_response(empty_markdown)
- # Priority 4: Reconstruct from chunks
- chunks = sorted(document.chunks, key=lambda c: c.id)
+ chunk_contents_result = await session.execute(
+ select(Chunk.content)
+ .filter(Chunk.document_id == document_id)
+ .order_by(Chunk.id)
+ )
+ chunk_contents = chunk_contents_result.scalars().all()
- if not chunks:
+ if not chunk_contents:
raise HTTPException(
status_code=400,
detail="This document has no content and cannot be edited. Please re-upload to enable editing.",
)
- markdown_content = "\n\n".join(chunk.content for chunk in chunks)
+ markdown_content = "\n\n".join(chunk_contents)
if not markdown_content.strip():
raise HTTPException(
@@ -140,17 +141,77 @@ async def get_editor_content(
detail="This document has empty content and cannot be edited.",
)
- # Persist the lazy migration
document.source_markdown = markdown_content
await session.commit()
- return {
- "document_id": document.id,
- "title": document.title,
- "document_type": document.document_type.value,
- "source_markdown": markdown_content,
- "updated_at": document.updated_at.isoformat() if document.updated_at else None,
- }
+ return _build_response(markdown_content)
+
+
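
A sketch of the client flow the new max_length/truncated fields enable: request a bounded payload first, then fall back to the full download endpoint defined below only when needed. The URL prefix and the editor-content route path are assumptions.

    import httpx

    API = "http://localhost:8000/api/v1"  # assumed prefix

    def load_markdown(space_id: int, doc_id: int, budget: int = 20_000) -> str:
        base = f"{API}/search-spaces/{space_id}/documents/{doc_id}"
        resp = httpx.get(f"{base}/editor-content", params={"max_length": budget})
        resp.raise_for_status()
        body = resp.json()
        if not body["truncated"]:
            return body["source_markdown"]
        # Over budget: content_size_bytes reports the full size,
        # and download-markdown returns the whole document.
        full = httpx.get(f"{base}/download-markdown")
        full.raise_for_status()
        return full.text
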
+@router.get(
+ "/search-spaces/{search_space_id}/documents/{document_id}/download-markdown"
+)
+async def download_document_markdown(
+ search_space_id: int,
+ document_id: int,
+ session: AsyncSession = Depends(get_async_session),
+ user: User = Depends(current_active_user),
+):
+ """
+ Download the full document content as a .md file.
+ Resolves markdown from source_markdown, blocknote content, or chunks.
+ """
+ await check_permission(
+ session,
+ user,
+ search_space_id,
+ Permission.DOCUMENTS_READ.value,
+ "You don't have permission to read documents in this search space",
+ )
+
+ result = await session.execute(
+ select(Document).filter(
+ Document.id == document_id,
+ Document.search_space_id == search_space_id,
+ )
+ )
+ document = result.scalars().first()
+
+ if not document:
+ raise HTTPException(status_code=404, detail="Document not found")
+
+ markdown: str | None = document.source_markdown
+ if markdown is None and document.blocknote_document:
+ from app.utils.blocknote_to_markdown import blocknote_to_markdown
+
+ markdown = blocknote_to_markdown(document.blocknote_document)
+ if markdown is None:
+ chunk_contents_result = await session.execute(
+ select(Chunk.content)
+ .filter(Chunk.document_id == document_id)
+ .order_by(Chunk.id)
+ )
+ chunk_contents = chunk_contents_result.scalars().all()
+ if chunk_contents:
+ markdown = "\n\n".join(chunk_contents)
+
+ if not markdown or not markdown.strip():
+ raise HTTPException(
+ status_code=400, detail="Document has no content to download"
+ )
+
+ safe_title = (
+ "".join(
+ c if c.isalnum() or c in " -_" else "_"
+ for c in (document.title or "document")
+ ).strip()[:80]
+ or "document"
+ )
+
+ return StreamingResponse(
+ io.BytesIO(markdown.encode("utf-8")),
+ media_type="text/markdown; charset=utf-8",
+ headers={"Content-Disposition": f'attachment; filename="{safe_title}.md"'},
+ )
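
The filename sanitisation above keeps alphanumerics, spaces, hyphens and underscores, replaces everything else with `_`, trims to 80 characters, and falls back to "document". A few spot checks of that exact expression:

    def sanitize_title(title: str | None) -> str:
        # Mirrors the safe_title expression in download_document_markdown.
        return (
            "".join(
                c if c.isalnum() or c in " -_" else "_" for c in (title or "document")
            ).strip()[:80]
            or "document"
        )

    assert sanitize_title('Q3 "Roadmap" (final).md') == "Q3 _Roadmap_ _final__md"
    assert sanitize_title(None) == "document"
    assert sanitize_title("***") == "___"
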
@router.post("/search-spaces/{search_space_id}/documents/{document_id}/save")
@@ -258,9 +319,7 @@ async def export_document(
)
result = await session.execute(
- select(Document)
- .options(selectinload(Document.chunks))
- .filter(
+ select(Document).filter(
Document.id == document_id,
Document.search_space_id == search_space_id,
)
@@ -269,16 +328,20 @@ async def export_document(
if not document:
raise HTTPException(status_code=404, detail="Document not found")
- # Resolve markdown content (same priority as editor-content endpoint)
markdown_content: str | None = document.source_markdown
if markdown_content is None and document.blocknote_document:
from app.utils.blocknote_to_markdown import blocknote_to_markdown
markdown_content = blocknote_to_markdown(document.blocknote_document)
if markdown_content is None:
- chunks = sorted(document.chunks, key=lambda c: c.id)
- if chunks:
- markdown_content = "\n\n".join(chunk.content for chunk in chunks)
+ chunk_contents_result = await session.execute(
+ select(Chunk.content)
+ .filter(Chunk.document_id == document_id)
+ .order_by(Chunk.id)
+ )
+ chunk_contents = chunk_contents_result.scalars().all()
+ if chunk_contents:
+ markdown_content = "\n\n".join(chunk_contents)
if not markdown_content or not markdown_content.strip():
raise HTTPException(status_code=400, detail="Document has no content to export")
diff --git a/surfsense_backend/app/schemas/documents.py b/surfsense_backend/app/schemas/documents.py
index c022a09d2..49d2836b2 100644
--- a/surfsense_backend/app/schemas/documents.py
+++ b/surfsense_backend/app/schemas/documents.py
@@ -53,25 +53,26 @@ class DocumentRead(BaseModel):
title: str
document_type: DocumentType
document_metadata: dict
- content: str # Changed to string to match frontend
+ content: str = ""
+ content_preview: str = ""
content_hash: str
unique_identifier_hash: str | None
created_at: datetime
updated_at: datetime | None
search_space_id: int
folder_id: int | None = None
- created_by_id: UUID | None = None # User who created/uploaded this document
+ created_by_id: UUID | None = None
created_by_name: str | None = None
created_by_email: str | None = None
- status: DocumentStatusSchema | None = (
- None # Processing status (ready, processing, failed)
- )
+ status: DocumentStatusSchema | None = None
model_config = ConfigDict(from_attributes=True)
class DocumentWithChunksRead(DocumentRead):
chunks: list[ChunkRead] = []
+ total_chunks: int = 0
+ chunk_start_index: int = 0
model_config = ConfigDict(from_attributes=True)
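
With total_chunks and chunk_start_index on the model, incremental chunk loading becomes stateless on the client side. A small helper sketch (the function name is illustrative, not part of this diff):

    def next_chunk_offset(page: "DocumentWithChunksRead") -> int | None:
        """Offset of the next chunk page, or None once everything is loaded."""
        end = page.chunk_start_index + len(page.chunks)
        return end if end < page.total_chunks else None
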
diff --git a/surfsense_backend/app/tasks/chat/stream_new_chat.py b/surfsense_backend/app/tasks/chat/stream_new_chat.py
index 7c1e3b7ea..5ff907459 100644
--- a/surfsense_backend/app/tasks/chat/stream_new_chat.py
+++ b/surfsense_backend/app/tasks/chat/stream_new_chat.py
@@ -39,7 +39,6 @@ from app.agents.new_chat.llm_config import (
)
from app.db import (
ChatVisibility,
- Document,
NewChatMessage,
NewChatThread,
Report,
@@ -63,74 +62,6 @@ _perf_log = get_perf_logger()
_background_tasks: set[asyncio.Task] = set()
-def format_mentioned_documents_as_context(documents: list[Document]) -> str:
- """
- Format mentioned documents as context for the agent.
-
- Uses the same XML structure as knowledge_base.format_documents_for_context
- to ensure citations work properly with chunk IDs.
- """
- if not documents:
- return ""
-
- context_parts = ["<mentioned_documents>"]
- context_parts.append(
- "The user has explicitly mentioned the following documents from their knowledge base. "
- "These documents are directly relevant to the query and should be prioritized as primary sources. "
- "Use [citation:CHUNK_ID] format for citations (e.g., [citation:123])."
- )
- context_parts.append("")
-
- for doc in documents:
- # Build metadata JSON
- metadata = doc.document_metadata or {}
- metadata_json = json.dumps(metadata, ensure_ascii=False)
-
- # Get URL from metadata
- url = (
- metadata.get("url")
- or metadata.get("source")
- or metadata.get("page_url")
- or ""
- )
-
- context_parts.append("<document>")
- context_parts.append("<metadata>")
- context_parts.append(f" <id>{doc.id}</id>")
- context_parts.append(
- f" <document_type>{doc.document_type.value}</document_type>"
- )
- context_parts.append(f" <title>{doc.title}</title>")
- context_parts.append(f" <url>{url}</url>")
- context_parts.append(
- f" <document_metadata>{metadata_json}</document_metadata>"
- )
- context_parts.append("</metadata>")
- context_parts.append("")
- context_parts.append("<content>")
-
- # Use chunks if available (preferred for proper citations)
- if hasattr(doc, "chunks") and doc.chunks:
- for chunk in doc.chunks:
- context_parts.append(
- f' <chunk id="{chunk.id}">{chunk.content}</chunk>'
- )
- else:
- # Fallback to document content if chunks not loaded
- # Use document ID as chunk ID prefix for consistency
- context_parts.append(
- f' <chunk id="{doc.id}">{doc.content}</chunk>'
- )
-
- context_parts.append("</content>")
- context_parts.append("</document>")
- context_parts.append("")
-
- context_parts.append("</mentioned_documents>")
-
- return "\n".join(context_parts)
-
-
def format_mentioned_surfsense_docs_as_context(
documents: list[SurfsenseDocsDocument],
) -> str:
@@ -1317,6 +1248,7 @@ async def stream_new_chat(
firecrawl_api_key=firecrawl_api_key,
thread_visibility=visibility,
disabled_tools=disabled_tools,
+ mentioned_document_ids=mentioned_document_ids,
)
_perf_log.info(
"[stream_new_chat] Agent created in %.3fs", time.perf_counter() - _t0
@@ -1340,18 +1272,9 @@ async def stream_new_chat(
thread.needs_history_bootstrap = False
await session.commit()
- # Fetch mentioned documents if any (with chunks for proper citations)
- mentioned_documents: list[Document] = []
- if mentioned_document_ids:
- result = await session.execute(
- select(Document)
- .options(selectinload(Document.chunks))
- .filter(
- Document.id.in_(mentioned_document_ids),
- Document.search_space_id == search_space_id,
- )
- )
- mentioned_documents = list(result.scalars().all())
+ # Mentioned KB documents are now handled by KnowledgeBaseSearchMiddleware
+ # which merges them into the scoped filesystem with full document
+ # structure. Only SurfSense docs and report context are inlined here.
# Fetch mentioned SurfSense docs if any
mentioned_surfsense_docs: list[SurfsenseDocsDocument] = []
@@ -1379,15 +1302,10 @@ async def stream_new_chat(
)
recent_reports = list(recent_reports_result.scalars().all())
- # Format the user query with context (mentioned documents + SurfSense docs)
+ # Format the user query with context (SurfSense docs + reports only)
final_query = user_query
context_parts = []
- if mentioned_documents:
- context_parts.append(
- format_mentioned_documents_as_context(mentioned_documents)
- )
-
if mentioned_surfsense_docs:
context_parts.append(
format_mentioned_surfsense_docs_as_context(mentioned_surfsense_docs)
@@ -1479,7 +1397,7 @@ async def stream_new_chat(
yield streaming_service.format_start_step()
# Initial thinking step - analyzing the request
- if mentioned_documents or mentioned_surfsense_docs:
+ if mentioned_surfsense_docs:
initial_title = "Analyzing referenced content"
action_verb = "Analyzing"
else:
@@ -1490,18 +1408,6 @@ async def stream_new_chat(
query_text = user_query[:80] + ("..." if len(user_query) > 80 else "")
processing_parts.append(query_text)
- if mentioned_documents:
- doc_names = []
- for doc in mentioned_documents:
- title = doc.title
- if len(title) > 30:
- title = title[:27] + "..."
- doc_names.append(title)
- if len(doc_names) == 1:
- processing_parts.append(f"[{doc_names[0]}]")
- else:
- processing_parts.append(f"[{len(doc_names)} documents]")
-
if mentioned_surfsense_docs:
doc_names = []
for doc in mentioned_surfsense_docs:
@@ -1527,7 +1433,7 @@ async def stream_new_chat(
# These ORM objects (with eagerly-loaded chunks) can be very large.
# They're only needed to build context strings already copied into
# final_query / langchain_messages — release them before streaming.
- del mentioned_documents, mentioned_surfsense_docs, recent_reports
+ del mentioned_surfsense_docs, recent_reports
del langchain_messages, final_query
# Check if this is the first assistant response so we can generate
diff --git a/surfsense_backend/app/tasks/document_processors/__init__.py b/surfsense_backend/app/tasks/document_processors/__init__.py
index e70c41cb4..2b5690d02 100644
--- a/surfsense_backend/app/tasks/document_processors/__init__.py
+++ b/surfsense_backend/app/tasks/document_processors/__init__.py
@@ -12,16 +12,14 @@ Available processors:
- YouTube processor: Process YouTube videos and extract transcripts
"""
-# URL crawler
# Extension processor
-from .extension_processor import add_extension_received_document
-
-# File processors
-from .file_processors import (
+# File processors (backward-compatible re-exports from _save)
+from ._save import (
add_received_file_document_using_docling,
add_received_file_document_using_llamacloud,
add_received_file_document_using_unstructured,
)
+from .extension_processor import add_extension_received_document
# Markdown processor
from .markdown_processor import add_received_markdown_file_document
@@ -32,9 +30,9 @@ from .youtube_processor import add_youtube_video_document
__all__ = [
# Extension processing
"add_extension_received_document",
+ # File processing with different ETL services
"add_received_file_document_using_docling",
"add_received_file_document_using_llamacloud",
- # File processing with different ETL services
"add_received_file_document_using_unstructured",
# Markdown file processing
"add_received_markdown_file_document",
diff --git a/surfsense_backend/app/tasks/document_processors/_constants.py b/surfsense_backend/app/tasks/document_processors/_constants.py
new file mode 100644
index 000000000..f74d7acce
--- /dev/null
+++ b/surfsense_backend/app/tasks/document_processors/_constants.py
@@ -0,0 +1,74 @@
+"""
+Constants for file document processing.
+
+Centralizes file type classification, LlamaCloud retry configuration,
+and timeout calculation parameters.
+"""
+
+import ssl
+from enum import Enum
+
+import httpx
+
+# ---------------------------------------------------------------------------
+# File type classification
+# ---------------------------------------------------------------------------
+
+MARKDOWN_EXTENSIONS = (".md", ".markdown", ".txt")
+AUDIO_EXTENSIONS = (".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm")
+DIRECT_CONVERT_EXTENSIONS = (".csv", ".tsv", ".html", ".htm")
+
+
+class FileCategory(Enum):
+ MARKDOWN = "markdown"
+ AUDIO = "audio"
+ DIRECT_CONVERT = "direct_convert"
+ DOCUMENT = "document"
+
+
+def classify_file(filename: str) -> FileCategory:
+ """Classify a file by its extension into a processing category."""
+ lower = filename.lower()
+ if lower.endswith(MARKDOWN_EXTENSIONS):
+ return FileCategory.MARKDOWN
+ if lower.endswith(AUDIO_EXTENSIONS):
+ return FileCategory.AUDIO
+ if lower.endswith(DIRECT_CONVERT_EXTENSIONS):
+ return FileCategory.DIRECT_CONVERT
+ return FileCategory.DOCUMENT
+
+
+# ---------------------------------------------------------------------------
+# LlamaCloud retry configuration
+# ---------------------------------------------------------------------------
+
+LLAMACLOUD_MAX_RETRIES = 5
+LLAMACLOUD_BASE_DELAY = 10 # seconds (exponential backoff base)
+LLAMACLOUD_MAX_DELAY = 120 # max delay between retries (2 minutes)
+LLAMACLOUD_RETRYABLE_EXCEPTIONS = (
+ ssl.SSLError,
+ httpx.ConnectError,
+ httpx.ConnectTimeout,
+ httpx.ReadError,
+ httpx.ReadTimeout,
+ httpx.WriteError,
+ httpx.WriteTimeout,
+ httpx.RemoteProtocolError,
+ httpx.LocalProtocolError,
+ ConnectionError,
+ ConnectionResetError,
+ TimeoutError,
+ OSError,
+)
+
+# ---------------------------------------------------------------------------
+# Timeout calculation constants
+# ---------------------------------------------------------------------------
+
+UPLOAD_BYTES_PER_SECOND_SLOW = (
+ 100 * 1024
+) # 100 KB/s (conservative for slow connections)
+MIN_UPLOAD_TIMEOUT = 120 # Minimum 2 minutes for any file
+MAX_UPLOAD_TIMEOUT = 1800 # Maximum 30 minutes for very large files
+BASE_JOB_TIMEOUT = 600 # 10 minutes base for job processing
+PER_PAGE_JOB_TIMEOUT = 60 # 1 minute per page for processing
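
Spot checks for classify_file as defined above; the import path assumes the module layout this diff introduces:

    from app.tasks.document_processors._constants import FileCategory, classify_file

    assert classify_file("notes.MD") is FileCategory.MARKDOWN        # case-insensitive
    assert classify_file("standup.m4a") is FileCategory.AUDIO
    assert classify_file("export.tsv") is FileCategory.DIRECT_CONVERT
    assert classify_file("slides.pdf") is FileCategory.DOCUMENT      # ETL fallback
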
diff --git a/surfsense_backend/app/tasks/document_processors/_direct_converters.py b/surfsense_backend/app/tasks/document_processors/_direct_converters.py
new file mode 100644
index 000000000..b1a69ef4f
--- /dev/null
+++ b/surfsense_backend/app/tasks/document_processors/_direct_converters.py
@@ -0,0 +1,90 @@
+"""
+Lossless file-to-markdown converters for text-based formats.
+
+These converters handle file types that can be faithfully represented as
+markdown without any external ETL/OCR service:
+
+- CSV / TSV → markdown table (stdlib ``csv``)
+- HTML / HTM → markdown (``markdownify``)
+"""
+
+from __future__ import annotations
+
+import csv
+from collections.abc import Callable
+from pathlib import Path
+
+from markdownify import markdownify
+
+# The stdlib csv module defaults to a 128 KB field-size limit which is too
+# small for real-world exports (e.g. chat logs, CRM dumps). We raise it once
+# at import time so every csv.reader call in this module can handle large fields.
+csv.field_size_limit(2**31 - 1)
+
+
+def _escape_pipe(cell: str) -> str:
+ """Escape literal pipe characters inside a markdown table cell."""
+ return cell.replace("|", "\\|")
+
+
+def csv_to_markdown(file_path: str, *, delimiter: str = ",") -> str:
+ """Convert a CSV (or TSV) file to a markdown table.
+
+ The first row is treated as the header. An empty file returns an
+ empty string so the caller can decide how to handle it.
+ """
+ with open(file_path, encoding="utf-8", newline="") as fh:
+ reader = csv.reader(fh, delimiter=delimiter)
+ rows = list(reader)
+
+ if not rows:
+ return ""
+
+ header, *body = rows
+ col_count = len(header)
+
+ lines: list[str] = []
+
+ header_cells = [_escape_pipe(c.strip()) for c in header]
+ lines.append("| " + " | ".join(header_cells) + " |")
+ lines.append("| " + " | ".join(["---"] * col_count) + " |")
+
+ for row in body:
+ padded = row + [""] * (col_count - len(row))
+ cells = [_escape_pipe(c.strip()) for c in padded[:col_count]]
+ lines.append("| " + " | ".join(cells) + " |")
+
+ return "\n".join(lines) + "\n"
+
+
+def tsv_to_markdown(file_path: str) -> str:
+ """Convert a TSV file to a markdown table."""
+ return csv_to_markdown(file_path, delimiter="\t")
+
+
+def html_to_markdown(file_path: str) -> str:
+ """Convert an HTML file to markdown via ``markdownify``."""
+ html = Path(file_path).read_text(encoding="utf-8")
+ return markdownify(html).strip()
+
+
+_CONVERTER_MAP: dict[str, Callable[..., str]] = {
+ ".csv": csv_to_markdown,
+ ".tsv": tsv_to_markdown,
+ ".html": html_to_markdown,
+ ".htm": html_to_markdown,
+}
+
+
+def convert_file_directly(file_path: str, filename: str) -> str:
+ """Dispatch to the appropriate lossless converter based on file extension.
+
+ Raises ``ValueError`` if the extension is not supported.
+ """
+ suffix = Path(filename).suffix.lower()
+ converter = _CONVERTER_MAP.get(suffix)
+ if converter is None:
+ raise ValueError(
+ f"No direct converter for extension '{suffix}' (file: {filename})"
+ )
+ return converter(file_path)
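
A quick demonstration of the CSV converter, including the pipe-escaping that keeps table cells intact (import path assumed as above):

    import tempfile

    from app.tasks.document_processors._direct_converters import csv_to_markdown

    with tempfile.NamedTemporaryFile(
        "w", suffix=".csv", delete=False, encoding="utf-8"
    ) as fh:
        fh.write('name,motto\nAda,"divide | conquer"\n')
        path = fh.name

    print(csv_to_markdown(path))
    # | name | motto |
    # | --- | --- |
    # | Ada | divide \| conquer |
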
diff --git a/surfsense_backend/app/tasks/document_processors/_etl.py b/surfsense_backend/app/tasks/document_processors/_etl.py
new file mode 100644
index 000000000..cc3a8b1ac
--- /dev/null
+++ b/surfsense_backend/app/tasks/document_processors/_etl.py
@@ -0,0 +1,209 @@
+"""
+ETL parsing strategies for different document processing services.
+
+Provides parse functions for Unstructured, LlamaCloud, and Docling, along with
+LlamaCloud retry logic and dynamic timeout calculations.
+"""
+
+import asyncio
+import logging
+import os
+import random
+import warnings
+from logging import ERROR, getLogger
+
+import httpx
+
+from app.config import config as app_config
+from app.db import Log
+from app.services.task_logging_service import TaskLoggingService
+
+from ._constants import (
+ LLAMACLOUD_BASE_DELAY,
+ LLAMACLOUD_MAX_DELAY,
+ LLAMACLOUD_MAX_RETRIES,
+ LLAMACLOUD_RETRYABLE_EXCEPTIONS,
+ PER_PAGE_JOB_TIMEOUT,
+)
+from ._helpers import calculate_job_timeout, calculate_upload_timeout
+
+# ---------------------------------------------------------------------------
+# LlamaCloud parsing with retry
+# ---------------------------------------------------------------------------
+
+
+async def parse_with_llamacloud_retry(
+ file_path: str,
+ estimated_pages: int,
+ task_logger: TaskLoggingService | None = None,
+ log_entry: Log | None = None,
+):
+ """
+ Parse a file with LlamaCloud, retrying transient SSL/connection errors.
+
+ Uses dynamic timeout calculations based on file size and page count to handle
+ very large files reliably.
+
+ Returns:
+ LlamaParse result object
+
+ Raises:
+ Exception: If all retries fail
+ """
+ from llama_cloud_services import LlamaParse
+ from llama_cloud_services.parse.utils import ResultType
+
+ file_size_bytes = os.path.getsize(file_path)
+ file_size_mb = file_size_bytes / (1024 * 1024)
+
+ upload_timeout = calculate_upload_timeout(file_size_bytes)
+ job_timeout = calculate_job_timeout(estimated_pages, file_size_bytes)
+
+ custom_timeout = httpx.Timeout(
+ connect=120.0,
+ read=upload_timeout,
+ write=upload_timeout,
+ pool=120.0,
+ )
+
+ logging.info(
+ f"LlamaCloud upload configured: file_size={file_size_mb:.1f}MB, "
+ f"pages={estimated_pages}, upload_timeout={upload_timeout:.0f}s, "
+ f"job_timeout={job_timeout:.0f}s"
+ )
+
+ last_exception = None
+ attempt_errors: list[str] = []
+
+ for attempt in range(1, LLAMACLOUD_MAX_RETRIES + 1):
+ try:
+ async with httpx.AsyncClient(timeout=custom_timeout) as custom_client:
+ parser = LlamaParse(
+ api_key=app_config.LLAMA_CLOUD_API_KEY,
+ num_workers=1,
+ verbose=True,
+ language="en",
+ result_type=ResultType.MD,
+ max_timeout=int(max(2000, job_timeout + upload_timeout)),
+ job_timeout_in_seconds=job_timeout,
+ job_timeout_extra_time_per_page_in_seconds=PER_PAGE_JOB_TIMEOUT,
+ custom_client=custom_client,
+ )
+ result = await parser.aparse(file_path)
+
+ if attempt > 1:
+ logging.info(
+ f"LlamaCloud upload succeeded on attempt {attempt} after "
+ f"{len(attempt_errors)} failures"
+ )
+ return result
+
+ except LLAMACLOUD_RETRYABLE_EXCEPTIONS as e:
+ last_exception = e
+ error_type = type(e).__name__
+ error_msg = str(e)[:200]
+ attempt_errors.append(f"Attempt {attempt}: {error_type} - {error_msg}")
+
+ if attempt < LLAMACLOUD_MAX_RETRIES:
+ base_delay = min(
+ LLAMACLOUD_BASE_DELAY * (2 ** (attempt - 1)),
+ LLAMACLOUD_MAX_DELAY,
+ )
+ jitter = base_delay * 0.25 * (2 * random.random() - 1)
+ delay = base_delay + jitter
+
+ if task_logger and log_entry:
+ await task_logger.log_task_progress(
+ log_entry,
+ f"LlamaCloud upload failed "
+ f"(attempt {attempt}/{LLAMACLOUD_MAX_RETRIES}), "
+ f"retrying in {delay:.0f}s",
+ {
+ "error_type": error_type,
+ "error_message": error_msg,
+ "attempt": attempt,
+ "retry_delay": delay,
+ "file_size_mb": round(file_size_mb, 1),
+ "upload_timeout": upload_timeout,
+ },
+ )
+ else:
+ logging.warning(
+ f"LlamaCloud upload failed "
+ f"(attempt {attempt}/{LLAMACLOUD_MAX_RETRIES}): "
+ f"{error_type}. File: {file_size_mb:.1f}MB. "
+ f"Retrying in {delay:.0f}s..."
+ )
+
+ await asyncio.sleep(delay)
+ else:
+ logging.error(
+ f"LlamaCloud upload failed after {LLAMACLOUD_MAX_RETRIES} "
+ f"attempts. File size: {file_size_mb:.1f}MB, "
+ f"Pages: {estimated_pages}. "
+ f"Errors: {'; '.join(attempt_errors)}"
+ )
+
+ except Exception:
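+ # Non-retryable exception: surface it immediately instead of retrying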
+ raise
+
+ raise last_exception or RuntimeError(
+ f"LlamaCloud parsing failed after {LLAMACLOUD_MAX_RETRIES} retries. "
+ f"File size: {file_size_mb:.1f}MB"
+ )
+
+
+# ---------------------------------------------------------------------------
+# Per-service parse functions
+# ---------------------------------------------------------------------------
+
+
+async def parse_with_unstructured(file_path: str):
+ """
+ Parse a file using the Unstructured ETL service.
+
+ Returns:
+ List of LangChain Document elements.
+ """
+ from langchain_unstructured import UnstructuredLoader
+
+ loader = UnstructuredLoader(
+ file_path,
+ mode="elements",
+ post_processors=[],
+ languages=["eng"],
+ include_orig_elements=False,
+ include_metadata=False,
+ strategy="auto",
+ )
+ return await loader.aload()
+
+
+async def parse_with_docling(file_path: str, filename: str) -> str:
+ """
+ Parse a file using the Docling ETL service (via the Docling service wrapper).
+
+ Returns:
+ Markdown content string.
+ """
+ from app.services.docling_service import create_docling_service
+
+ docling_service = create_docling_service()
+
+ pdfminer_logger = getLogger("pdfminer")
+ original_level = pdfminer_logger.level
+
+ with warnings.catch_warnings():
+ warnings.filterwarnings("ignore", category=UserWarning, module="pdfminer")
+ warnings.filterwarnings(
+ "ignore", message=".*Cannot set gray non-stroke color.*"
+ )
+ warnings.filterwarnings("ignore", message=".*invalid float value.*")
+ pdfminer_logger.setLevel(ERROR)
+
+ try:
+ result = await docling_service.process_document(file_path, filename)
+ finally:
+ pdfminer_logger.setLevel(original_level)
+
+ return result["content"]
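
The retry loop backs off exponentially from LLAMACLOUD_BASE_DELAY, caps at LLAMACLOUD_MAX_DELAY, and adds up to ±25% jitter. A standalone sketch of the resulting schedule, with the constants inlined:

    import random

    BASE, CAP, RETRIES = 10, 120, 5  # mirrors the _constants values

    for attempt in range(1, RETRIES):  # no sleep after the final attempt
        base_delay = min(BASE * (2 ** (attempt - 1)), CAP)
        jitter = base_delay * 0.25 * (2 * random.random() - 1)
        print(f"attempt {attempt}: base {base_delay}s, with jitter {base_delay + jitter:.1f}s")
    # base delays: 10s, 20s, 40s, 80s (a further attempt would cap at 120s)
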
diff --git a/surfsense_backend/app/tasks/document_processors/_helpers.py b/surfsense_backend/app/tasks/document_processors/_helpers.py
new file mode 100644
index 000000000..7ac05932c
--- /dev/null
+++ b/surfsense_backend/app/tasks/document_processors/_helpers.py
@@ -0,0 +1,218 @@
+"""
+Document helper functions for deduplication, migration, and connector updates.
+
+Provides reusable logic shared across file processors and ETL strategies.
+"""
+
+import logging
+
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.db import Document, DocumentStatus, DocumentType
+from app.utils.document_converters import generate_unique_identifier_hash
+
+from ._constants import (
+ BASE_JOB_TIMEOUT,
+ MAX_UPLOAD_TIMEOUT,
+ MIN_UPLOAD_TIMEOUT,
+ PER_PAGE_JOB_TIMEOUT,
+ UPLOAD_BYTES_PER_SECOND_SLOW,
+)
+from .base import (
+ check_document_by_unique_identifier,
+ check_duplicate_document,
+)
+
+# ---------------------------------------------------------------------------
+# Unique identifier helpers
+# ---------------------------------------------------------------------------
+
+
+def get_google_drive_unique_identifier(
+ connector: dict | None,
+ filename: str,
+ search_space_id: int,
+) -> tuple[str, str | None]:
+ """
+ Get unique identifier hash, using file_id for Google Drive (stable across renames).
+
+ Returns:
+ Tuple of (primary_hash, legacy_hash or None).
+ For Google Drive: (file_id-based hash, filename-based hash for migration).
+ For other sources: (filename-based hash, None).
+ """
+ if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
+ metadata = connector.get("metadata", {})
+ file_id = metadata.get("google_drive_file_id")
+
+ if file_id:
+ primary_hash = generate_unique_identifier_hash(
+ DocumentType.GOOGLE_DRIVE_FILE, file_id, search_space_id
+ )
+ legacy_hash = generate_unique_identifier_hash(
+ DocumentType.GOOGLE_DRIVE_FILE, filename, search_space_id
+ )
+ return primary_hash, legacy_hash
+
+ primary_hash = generate_unique_identifier_hash(
+ DocumentType.FILE, filename, search_space_id
+ )
+ return primary_hash, None
+
+
+# ---------------------------------------------------------------------------
+# Document deduplication and migration
+# ---------------------------------------------------------------------------
+
+
+async def handle_existing_document_update(
+ session: AsyncSession,
+ existing_document: Document,
+ content_hash: str,
+ connector: dict | None,
+ filename: str,
+ primary_hash: str,
+) -> tuple[bool, Document | None]:
+ """
+ Handle update logic for an existing document.
+
+ Returns:
+ Tuple of (should_skip_processing, document_to_return):
+ - (True, document): Content unchanged, return existing document
+ - (False, None): Content changed, needs re-processing
+ """
+ if existing_document.unique_identifier_hash != primary_hash:
+ existing_document.unique_identifier_hash = primary_hash
+ logging.info(f"Migrated document to file_id-based identifier: {filename}")
+
+ if existing_document.content_hash == content_hash:
+ if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
+ connector_metadata = connector.get("metadata", {})
+ new_name = connector_metadata.get("google_drive_file_name")
+ doc_metadata = existing_document.document_metadata or {}
+ old_name = doc_metadata.get("FILE_NAME") or doc_metadata.get(
+ "google_drive_file_name"
+ )
+
+ if new_name and old_name and old_name != new_name:
+ from sqlalchemy.orm.attributes import flag_modified
+
+ existing_document.title = new_name
+ if not existing_document.document_metadata:
+ existing_document.document_metadata = {}
+ existing_document.document_metadata["FILE_NAME"] = new_name
+ existing_document.document_metadata["google_drive_file_name"] = new_name
+ flag_modified(existing_document, "document_metadata")
+ await session.commit()
+ logging.info(
+ f"File renamed in Google Drive: '{old_name}' → '{new_name}' "
+ f"(no re-processing needed)"
+ )
+
+ logging.info(f"Document for file {filename} unchanged. Skipping.")
+ return True, existing_document
+
+ # Content has changed — guard against content_hash collision before
+ # expensive ETL processing.
+ collision_doc = await check_duplicate_document(session, content_hash)
+ if collision_doc and collision_doc.id != existing_document.id:
+ logging.warning(
+ "Content-hash collision for %s: identical content exists in "
+ "document #%s (%s). Skipping re-processing.",
+ filename,
+ collision_doc.id,
+ collision_doc.document_type,
+ )
+ if DocumentStatus.is_state(
+ existing_document.status, DocumentStatus.PENDING
+ ) or DocumentStatus.is_state(
+ existing_document.status, DocumentStatus.PROCESSING
+ ):
+ await session.delete(existing_document)
+ await session.commit()
+ return True, None
+
+ return True, existing_document
+
+ logging.info(f"Content changed for file {filename}. Updating document.")
+ return False, None
+
+
+async def find_existing_document_with_migration(
+ session: AsyncSession,
+ primary_hash: str,
+ legacy_hash: str | None,
+ content_hash: str | None = None,
+) -> Document | None:
+ """
+ Find existing document, checking primary hash, legacy hash, and content_hash.
+
+ Supports migration from filename-based to file_id-based hashing for
+ Google Drive files, with content_hash fallback for cross-source dedup.
+ """
+ existing_document = await check_document_by_unique_identifier(session, primary_hash)
+
+ if not existing_document and legacy_hash:
+ existing_document = await check_document_by_unique_identifier(
+ session, legacy_hash
+ )
+ if existing_document:
+ logging.info(
+ "Found legacy document (filename-based hash), "
+ "will migrate to file_id-based hash"
+ )
+
+ if not existing_document and content_hash:
+ existing_document = await check_duplicate_document(session, content_hash)
+ if existing_document:
+ logging.info(
+ f"Found duplicate content from different source (content_hash match). "
+ f"Original document ID: {existing_document.id}, "
+ f"type: {existing_document.document_type}"
+ )
+
+ return existing_document
+
+
+# ---------------------------------------------------------------------------
+# Connector helpers
+# ---------------------------------------------------------------------------
+
+
+async def update_document_from_connector(
+ document: Document | None,
+ connector: dict | None,
+ session: AsyncSession,
+) -> None:
+ """Update document type, metadata, and connector_id from connector info."""
+ if not document or not connector:
+ return
+ if "type" in connector:
+ document.document_type = connector["type"]
+ if "metadata" in connector:
+ if not document.document_metadata:
+ document.document_metadata = connector["metadata"]
+ else:
+ merged = {**document.document_metadata, **connector["metadata"]}
+ document.document_metadata = merged
+ if "connector_id" in connector:
+ document.connector_id = connector["connector_id"]
+ await session.commit()
+
+
+# ---------------------------------------------------------------------------
+# Timeout calculations
+# ---------------------------------------------------------------------------
+
+
+def calculate_upload_timeout(file_size_bytes: int) -> float:
+ """Calculate upload timeout based on file size (conservative for slow connections)."""
+ estimated_time = (file_size_bytes / UPLOAD_BYTES_PER_SECOND_SLOW) * 1.5
+ return max(MIN_UPLOAD_TIMEOUT, min(estimated_time, MAX_UPLOAD_TIMEOUT))
+
+
+def calculate_job_timeout(estimated_pages: int, file_size_bytes: int) -> float:
+ """Calculate job processing timeout based on page count and file size."""
+ page_based_timeout = BASE_JOB_TIMEOUT + (estimated_pages * PER_PAGE_JOB_TIMEOUT)
+ size_based_timeout = BASE_JOB_TIMEOUT + (file_size_bytes / (10 * 1024 * 1024)) * 60
+ return max(page_based_timeout, size_based_timeout)
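
Plugging the constants into the two helpers above, a 50 MB, 100-page upload works out as follows (import path assumed):

    from app.tasks.document_processors._helpers import (
        calculate_job_timeout,
        calculate_upload_timeout,
    )

    fifty_mb = 50 * 1024 * 1024
    print(calculate_upload_timeout(fifty_mb))    # 768.0: 512 s at 100 KB/s, times 1.5
    print(calculate_job_timeout(100, fifty_mb))  # 6600.0: the page-based estimate wins
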
diff --git a/surfsense_backend/app/tasks/document_processors/_save.py b/surfsense_backend/app/tasks/document_processors/_save.py
new file mode 100644
index 000000000..5088ad004
--- /dev/null
+++ b/surfsense_backend/app/tasks/document_processors/_save.py
@@ -0,0 +1,285 @@
+"""
+Unified document save/update logic for file processors.
+
+Replaces the three nearly-identical ``add_received_file_document_using_*``
+functions with a single ``save_file_document`` function plus thin wrappers
+for backward compatibility.
+"""
+
+import logging
+
+from langchain_core.documents import Document as LangChainDocument
+from sqlalchemy.exc import SQLAlchemyError
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from app.db import Document, DocumentStatus, DocumentType
+from app.services.llm_service import get_user_long_context_llm
+from app.utils.document_converters import (
+ create_document_chunks,
+ embed_text,
+ generate_content_hash,
+ generate_document_summary,
+)
+
+from ._helpers import (
+ find_existing_document_with_migration,
+ get_google_drive_unique_identifier,
+ handle_existing_document_update,
+)
+from .base import get_current_timestamp, safe_set_chunks
+
+# ---------------------------------------------------------------------------
+# Summary generation
+# ---------------------------------------------------------------------------
+
+
+async def _generate_summary(
+ markdown_content: str,
+ file_name: str,
+ etl_service: str,
+ user_llm,
+ enable_summary: bool,
+) -> tuple[str, list[float]]:
+ """
+ Generate a document summary and embedding.
+
+ Docling uses its own large-document summary strategy; other ETL services
+ use the standard ``generate_document_summary`` helper.
+ """
+ if not enable_summary:
+ summary = f"File: {file_name}\n\n{markdown_content[:4000]}"
+ return summary, embed_text(summary)
+
+ if etl_service == "DOCLING":
+ from app.services.docling_service import create_docling_service
+
+ docling_service = create_docling_service()
+ summary_text = await docling_service.process_large_document_summary(
+ content=markdown_content, llm=user_llm, document_title=file_name
+ )
+
+ meta = {
+ "file_name": file_name,
+ "etl_service": etl_service,
+ "document_type": "File Document",
+ }
+ parts = ["# DOCUMENT METADATA"]
+ for key, value in meta.items():
+ if value:
+ formatted_key = key.replace("_", " ").title()
+ parts.append(f"**{formatted_key}:** {value}")
+
+ enhanced = "\n".join(parts) + "\n\n# DOCUMENT SUMMARY\n\n" + summary_text
+ return enhanced, embed_text(enhanced)
+
+ # Standard summary (Unstructured / LlamaCloud / others)
+ meta = {
+ "file_name": file_name,
+ "etl_service": etl_service,
+ "document_type": "File Document",
+ }
+ return await generate_document_summary(markdown_content, user_llm, meta)
+
+
+# ---------------------------------------------------------------------------
+# Unified save function
+# ---------------------------------------------------------------------------
+
+
+async def save_file_document(
+ session: AsyncSession,
+ file_name: str,
+ markdown_content: str,
+ search_space_id: int,
+ user_id: str,
+ etl_service: str,
+ connector: dict | None = None,
+ enable_summary: bool = True,
+) -> Document | None:
+ """
+ Process and store a file document with deduplication and migration support.
+
+ Handles both creating new documents and updating existing ones. This is
+ the single implementation behind the per-ETL-service wrapper functions.
+
+ Args:
+ session: Database session
+ file_name: Name of the processed file
+ markdown_content: Markdown content to store
+ search_space_id: ID of the search space
+ user_id: ID of the user
+ etl_service: Name of the ETL service (UNSTRUCTURED, LLAMACLOUD, DOCLING)
+ connector: Optional connector info for Google Drive files
+ enable_summary: Whether to generate an AI summary
+
+ Returns:
+ Document object on success (created, updated, or unchanged), or None
+ when a duplicate/content-hash collision is detected and skipped
+ """
+ try:
+ primary_hash, legacy_hash = get_google_drive_unique_identifier(
+ connector, file_name, search_space_id
+ )
+ content_hash = generate_content_hash(markdown_content, search_space_id)
+
+ existing_document = await find_existing_document_with_migration(
+ session, primary_hash, legacy_hash, content_hash
+ )
+
+ if existing_document:
+ should_skip, doc = await handle_existing_document_update(
+ session,
+ existing_document,
+ content_hash,
+ connector,
+ file_name,
+ primary_hash,
+ )
+ if should_skip:
+ return doc
+
+ user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
+ if not user_llm:
+ raise RuntimeError(
+ f"No long context LLM configured for user {user_id} "
+ f"in search space {search_space_id}"
+ )
+
+ summary_content, summary_embedding = await _generate_summary(
+ markdown_content, file_name, etl_service, user_llm, enable_summary
+ )
+ chunks = await create_document_chunks(markdown_content)
+ doc_metadata = {"FILE_NAME": file_name, "ETL_SERVICE": etl_service}
+
+ if existing_document:
+ existing_document.title = file_name
+ existing_document.content = summary_content
+ existing_document.content_hash = content_hash
+ existing_document.embedding = summary_embedding
+ existing_document.document_metadata = doc_metadata
+ await safe_set_chunks(session, existing_document, chunks)
+ existing_document.source_markdown = markdown_content
+ existing_document.content_needs_reindexing = False
+ existing_document.updated_at = get_current_timestamp()
+ existing_document.status = DocumentStatus.ready()
+
+ await session.commit()
+ await session.refresh(existing_document)
+ return existing_document
+
+ doc_type = DocumentType.FILE
+ if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
+ doc_type = DocumentType.GOOGLE_DRIVE_FILE
+
+ document = Document(
+ search_space_id=search_space_id,
+ title=file_name,
+ document_type=doc_type,
+ document_metadata=doc_metadata,
+ content=summary_content,
+ embedding=summary_embedding,
+ chunks=chunks,
+ content_hash=content_hash,
+ unique_identifier_hash=primary_hash,
+ source_markdown=markdown_content,
+ content_needs_reindexing=False,
+ updated_at=get_current_timestamp(),
+ created_by_id=user_id,
+ connector_id=connector.get("connector_id") if connector else None,
+ status=DocumentStatus.ready(),
+ )
+ session.add(document)
+ await session.commit()
+ await session.refresh(document)
+ return document
+
+ except SQLAlchemyError as db_error:
+ await session.rollback()
+ if "ix_documents_content_hash" in str(db_error):
+ logging.warning(
+ "content_hash collision during commit for %s (%s). Skipping.",
+ file_name,
+ etl_service,
+ )
+ return None
+ raise db_error
+ except Exception as e:
+ await session.rollback()
+ raise RuntimeError(
+ f"Failed to process file document using {etl_service}: {e!s}"
+ ) from e
+
+
+# ---------------------------------------------------------------------------
+# Backward-compatible wrapper functions
+# ---------------------------------------------------------------------------
+
+
+async def add_received_file_document_using_unstructured(
+ session: AsyncSession,
+ file_name: str,
+ unstructured_processed_elements: list[LangChainDocument],
+ search_space_id: int,
+ user_id: str,
+ connector: dict | None = None,
+ enable_summary: bool = True,
+) -> Document | None:
+ """Process and store a file document using the Unstructured service."""
+ from app.utils.document_converters import convert_document_to_markdown
+
+ markdown_content = await convert_document_to_markdown(
+ unstructured_processed_elements
+ )
+ return await save_file_document(
+ session,
+ file_name,
+ markdown_content,
+ search_space_id,
+ user_id,
+ "UNSTRUCTURED",
+ connector,
+ enable_summary,
+ )
+
+
+async def add_received_file_document_using_llamacloud(
+ session: AsyncSession,
+ file_name: str,
+ llamacloud_markdown_document: str,
+ search_space_id: int,
+ user_id: str,
+ connector: dict | None = None,
+ enable_summary: bool = True,
+) -> Document | None:
+ """Process and store document content parsed by LlamaCloud."""
+ return await save_file_document(
+ session,
+ file_name,
+ llamacloud_markdown_document,
+ search_space_id,
+ user_id,
+ "LLAMACLOUD",
+ connector,
+ enable_summary,
+ )
+
+
+async def add_received_file_document_using_docling(
+ session: AsyncSession,
+ file_name: str,
+ docling_markdown_document: str,
+ search_space_id: int,
+ user_id: str,
+ connector: dict | None = None,
+ enable_summary: bool = True,
+) -> Document | None:
+ """Process and store document content parsed by Docling."""
+ return await save_file_document(
+ session,
+ file_name,
+ docling_markdown_document,
+ search_space_id,
+ user_id,
+ "DOCLING",
+ connector,
+ enable_summary,
+ )
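
Since all three wrappers delegate to save_file_document, new call sites can invoke it directly. A hedged sketch (the session comes from the app's async session factory; file name and IDs are illustrative placeholders):

    from sqlalchemy.ext.asyncio import AsyncSession

    from app.tasks.document_processors._save import save_file_document

    async def ingest_markdown(session: AsyncSession, markdown: str) -> None:
        doc = await save_file_document(
            session=session,
            file_name="report.pdf",       # illustrative
            markdown_content=markdown,    # output of any ETL parse step
            search_space_id=1,
            user_id="<uploader-uuid>",    # illustrative placeholder
            etl_service="LLAMACLOUD",
        )
        if doc is None:
            print("duplicate content detected; nothing saved")
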
diff --git a/surfsense_backend/app/tasks/document_processors/file_processors.py b/surfsense_backend/app/tasks/document_processors/file_processors.py
index 6c0ae1870..0c1cad52d 100644
--- a/surfsense_backend/app/tasks/document_processors/file_processors.py
+++ b/surfsense_backend/app/tasks/document_processors/file_processors.py
@@ -1,905 +1,685 @@
"""
-File document processors for different ETL services (Unstructured, LlamaCloud, Docling).
+File document processors orchestrating content extraction and indexing.
+
+This module is the public entry point for file processing. It delegates to
+specialised sub-modules that each own a single concern:
+
+- ``_constants`` — file type classification and configuration constants
+- ``_helpers`` — document deduplication, migration, connector helpers
+- ``_direct_converters`` — lossless file-to-markdown for csv/tsv/html
+- ``_etl`` — ETL parsing strategies (Unstructured, LlamaCloud, Docling)
+- ``_save`` — unified document creation / update logic
"""
-import asyncio
+from __future__ import annotations
+
import contextlib
import logging
-import ssl
-import warnings
+import os
+from dataclasses import dataclass, field
from logging import ERROR, getLogger
-import httpx
from fastapi import HTTPException
-from langchain_core.documents import Document as LangChainDocument
-from litellm import atranscription
-from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession
from app.config import config as app_config
-from app.db import Document, DocumentStatus, DocumentType, Log, Notification
-from app.indexing_pipeline.adapters.file_upload_adapter import UploadDocumentAdapter
-from app.services.llm_service import get_user_long_context_llm
+from app.db import Document, Log, Notification
from app.services.notification_service import NotificationService
from app.services.task_logging_service import TaskLoggingService
-from app.utils.document_converters import (
- convert_document_to_markdown,
- create_document_chunks,
- embed_text,
- generate_content_hash,
- generate_document_summary,
- generate_unique_identifier_hash,
-)
-from .base import (
- check_document_by_unique_identifier,
- check_duplicate_document,
- get_current_timestamp,
- safe_set_chunks,
+from ._constants import FileCategory, classify_file
+from ._direct_converters import convert_file_directly
+from ._etl import (
+ parse_with_docling,
+ parse_with_llamacloud_retry,
+ parse_with_unstructured,
+)
+from ._helpers import update_document_from_connector
+from ._save import (
+ add_received_file_document_using_docling,
+ add_received_file_document_using_llamacloud,
+ add_received_file_document_using_unstructured,
+ save_file_document,
)
from .markdown_processor import add_received_markdown_file_document
-# Constants for LlamaCloud retry configuration
-LLAMACLOUD_MAX_RETRIES = 5 # Increased from 3 for large file resilience
-LLAMACLOUD_BASE_DELAY = 10 # Base delay in seconds for exponential backoff
-LLAMACLOUD_MAX_DELAY = 120 # Maximum delay between retries (2 minutes)
-LLAMACLOUD_RETRYABLE_EXCEPTIONS = (
- ssl.SSLError,
- httpx.ConnectError,
- httpx.ConnectTimeout,
- httpx.ReadTimeout,
- httpx.WriteTimeout,
- httpx.RemoteProtocolError,
- httpx.LocalProtocolError,
- ConnectionError,
- ConnectionResetError,
- TimeoutError,
- OSError, # Catches various network-level errors
-)
-
-# Timeout calculation constants
-UPLOAD_BYTES_PER_SECOND_SLOW = (
- 100 * 1024
-) # 100 KB/s (conservative for slow connections)
-MIN_UPLOAD_TIMEOUT = 120 # Minimum 2 minutes for any file
-MAX_UPLOAD_TIMEOUT = 1800 # Maximum 30 minutes for very large files
-BASE_JOB_TIMEOUT = 600 # 10 minutes base for job processing
-PER_PAGE_JOB_TIMEOUT = 60 # 1 minute per page for processing
+# Re-export public API so existing ``from file_processors import …`` keeps working.
+__all__ = [
+ "add_received_file_document_using_docling",
+ "add_received_file_document_using_llamacloud",
+ "add_received_file_document_using_unstructured",
+ "parse_with_llamacloud_retry",
+ "process_file_in_background",
+ "process_file_in_background_with_document",
+ "save_file_document",
+]
-def get_google_drive_unique_identifier(
- connector: dict | None,
- filename: str,
- search_space_id: int,
-) -> tuple[str, str | None]:
- """
- Get unique identifier hash for a file, with special handling for Google Drive.
-
- For Google Drive files, uses file_id as the unique identifier (doesn't change on rename).
- For other files, uses filename.
-
- Args:
- connector: Optional connector info dict with type and metadata
- filename: The filename (used for non-Google Drive files or as fallback)
- search_space_id: The search space ID
-
- Returns:
- Tuple of (primary_hash, legacy_hash or None)
- - For Google Drive: (file_id_based_hash, filename_based_hash for migration)
- - For other sources: (filename_based_hash, None)
- """
- if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
- metadata = connector.get("metadata", {})
- file_id = metadata.get("google_drive_file_id")
-
- if file_id:
- # New method: use file_id as unique identifier (doesn't change on rename)
- primary_hash = generate_unique_identifier_hash(
- DocumentType.GOOGLE_DRIVE_FILE, file_id, search_space_id
- )
- # Legacy method: for backward compatibility with existing documents
- # that were indexed with filename-based hash
- legacy_hash = generate_unique_identifier_hash(
- DocumentType.GOOGLE_DRIVE_FILE, filename, search_space_id
- )
- return primary_hash, legacy_hash
-
- # For non-Google Drive files, use filename as before
- primary_hash = generate_unique_identifier_hash(
- DocumentType.FILE, filename, search_space_id
- )
- return primary_hash, None
+# ---------------------------------------------------------------------------
+# Processing context (bundles parameters shared across handler functions)
+# ---------------------------------------------------------------------------
-async def handle_existing_document_update(
- session: AsyncSession,
- existing_document: Document,
- content_hash: str,
- connector: dict | None,
- filename: str,
- primary_hash: str,
-) -> tuple[bool, Document | None]:
- """
- Handle update logic for an existing document.
+@dataclass
+class _ProcessingContext:
+ session: AsyncSession
+ file_path: str
+ filename: str
+ search_space_id: int
+ user_id: str
+ task_logger: TaskLoggingService
+ log_entry: Log
+ connector: dict | None = None
+ notification: Notification | None = None
+ enable_summary: bool = field(init=False)
- Args:
- session: Database session
- existing_document: The existing document found in database
- content_hash: Hash of the new content
- connector: Optional connector info
- filename: Current filename
- primary_hash: The primary hash (file_id based for Google Drive)
-
- Returns:
- Tuple of (should_skip_processing, document_to_return)
- - (True, document): Content unchanged, just return existing document
- - (False, None): Content changed, need to re-process
- """
- # Check if this document needs hash migration (found via legacy hash)
- if existing_document.unique_identifier_hash != primary_hash:
- existing_document.unique_identifier_hash = primary_hash
- logging.info(f"Migrated document to file_id-based identifier: {filename}")
-
- # Check if content has changed
- if existing_document.content_hash == content_hash:
- # Content unchanged - check if we need to update metadata (e.g., filename changed)
- if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
- connector_metadata = connector.get("metadata", {})
- new_name = connector_metadata.get("google_drive_file_name")
- # Check both possible keys for old name (FILE_NAME is used in stored documents)
- doc_metadata = existing_document.document_metadata or {}
- old_name = doc_metadata.get("FILE_NAME") or doc_metadata.get(
- "google_drive_file_name"
- )
-
- if new_name and old_name and old_name != new_name:
- # File was renamed - update title and metadata, skip expensive processing
- from sqlalchemy.orm.attributes import flag_modified
-
- existing_document.title = new_name
- if not existing_document.document_metadata:
- existing_document.document_metadata = {}
- existing_document.document_metadata["FILE_NAME"] = new_name
- existing_document.document_metadata["google_drive_file_name"] = new_name
- flag_modified(existing_document, "document_metadata")
- await session.commit()
- logging.info(
- f"File renamed in Google Drive: '{old_name}' → '{new_name}' (no re-processing needed)"
- )
-
- logging.info(f"Document for file {filename} unchanged. Skipping.")
- return True, existing_document
- else:
- # Content has changed — guard against content_hash collision before
- # expensive ETL processing. A collision means the exact same content
- # already lives in a *different* document (e.g. a manual upload of the
- # same file). Proceeding would trigger a unique-constraint violation
- # on ix_documents_content_hash.
- collision_doc = await check_duplicate_document(session, content_hash)
- if collision_doc and collision_doc.id != existing_document.id:
- logging.warning(
- "Content-hash collision for %s: identical content exists in "
- "document #%s (%s). Skipping re-processing.",
- filename,
- collision_doc.id,
- collision_doc.document_type,
- )
- if DocumentStatus.is_state(
- existing_document.status, DocumentStatus.PENDING
- ) or DocumentStatus.is_state(
- existing_document.status, DocumentStatus.PROCESSING
- ):
- # Pending/processing doc has no real content yet — remove it
- # so the UI doesn't show a contentless entry.
- await session.delete(existing_document)
- await session.commit()
- return True, None
-
- # Document already has valid content — keep it as-is.
- return True, existing_document
-
- logging.info(f"Content changed for file {filename}. Updating document.")
- return False, None
-
-
-async def find_existing_document_with_migration(
- session: AsyncSession,
- primary_hash: str,
- legacy_hash: str | None,
- content_hash: str | None = None,
-) -> Document | None:
- """
- Find existing document, checking both new hash and legacy hash for migration,
- with fallback to content_hash for cross-source deduplication.
-
- Args:
- session: Database session
- primary_hash: The primary hash (file_id based for Google Drive)
- legacy_hash: The legacy hash (filename based) for migration, or None
- content_hash: The content hash for fallback deduplication, or None
-
- Returns:
- Existing document if found, None otherwise
- """
- # First check with primary hash (new method)
- existing_document = await check_document_by_unique_identifier(session, primary_hash)
-
- # If not found and we have a legacy hash, check with that (migration path)
- if not existing_document and legacy_hash:
- existing_document = await check_document_by_unique_identifier(
- session, legacy_hash
- )
- if existing_document:
- logging.info(
- "Found legacy document (filename-based hash), will migrate to file_id-based hash"
- )
-
- # Fallback: check by content_hash to catch duplicates from different sources
- # This prevents unique constraint violations when the same content exists
- # under a different unique_identifier (e.g., manual upload vs Google Drive)
- if not existing_document and content_hash:
- existing_document = await check_duplicate_document(session, content_hash)
- if existing_document:
- logging.info(
- f"Found duplicate content from different source (content_hash match). "
- f"Original document ID: {existing_document.id}, type: {existing_document.document_type}"
- )
-
- return existing_document
-
-
-def calculate_upload_timeout(file_size_bytes: int) -> float:
- """
- Calculate appropriate upload timeout based on file size.
-
- Assumes a conservative slow connection speed to handle worst-case scenarios.
-
- Args:
- file_size_bytes: Size of the file in bytes
-
- Returns:
- Timeout in seconds
- """
- # Calculate time needed at slow connection speed
- # Add 50% buffer for network variability and SSL overhead
- estimated_time = (file_size_bytes / UPLOAD_BYTES_PER_SECOND_SLOW) * 1.5
-
- # Clamp to reasonable bounds
- return max(MIN_UPLOAD_TIMEOUT, min(estimated_time, MAX_UPLOAD_TIMEOUT))
-
-
-def calculate_job_timeout(estimated_pages: int, file_size_bytes: int) -> float:
- """
- Calculate job processing timeout based on page count and file size.
-
- Args:
- estimated_pages: Estimated number of pages
- file_size_bytes: Size of the file in bytes
-
- Returns:
- Timeout in seconds
- """
- # Base timeout + time per page
- page_based_timeout = BASE_JOB_TIMEOUT + (estimated_pages * PER_PAGE_JOB_TIMEOUT)
-
- # Also consider file size (large images take longer to process)
- # ~1 minute per 10MB of file size
- size_based_timeout = BASE_JOB_TIMEOUT + (file_size_bytes / (10 * 1024 * 1024)) * 60
-
- # Use the larger of the two estimates
- return max(page_based_timeout, size_based_timeout)
-
-
-async def parse_with_llamacloud_retry(
- file_path: str,
- estimated_pages: int,
- task_logger: TaskLoggingService | None = None,
- log_entry: Log | None = None,
-):
- """
- Parse a file with LlamaCloud with retry logic for transient SSL/connection errors.
-
- Uses dynamic timeout calculations based on file size and page count to handle
- very large files reliably.
-
- Args:
- file_path: Path to the file to parse
- estimated_pages: Estimated number of pages for timeout calculation
- task_logger: Optional task logger for progress updates
- log_entry: Optional log entry for progress updates
-
- Returns:
- LlamaParse result object
-
- Raises:
- Exception: If all retries fail
- """
- import os
- import random
-
- from llama_cloud_services import LlamaParse
- from llama_cloud_services.parse.utils import ResultType
-
- # Get file size for timeout calculations
- file_size_bytes = os.path.getsize(file_path)
- file_size_mb = file_size_bytes / (1024 * 1024)
-
- # Calculate dynamic timeouts based on file size and page count
- upload_timeout = calculate_upload_timeout(file_size_bytes)
- job_timeout = calculate_job_timeout(estimated_pages, file_size_bytes)
-
- # HTTP client timeouts - scaled based on file size
- # Write timeout is critical for large file uploads
- custom_timeout = httpx.Timeout(
- connect=120.0, # 2 minutes to establish connection (handles slow DNS, etc.)
- read=upload_timeout, # Dynamic based on file size
- write=upload_timeout, # Dynamic based on file size (upload time)
- pool=120.0, # 2 minutes to acquire connection from pool
- )
-
- logging.info(
- f"LlamaCloud upload configured: file_size={file_size_mb:.1f}MB, "
- f"pages={estimated_pages}, upload_timeout={upload_timeout:.0f}s, "
- f"job_timeout={job_timeout:.0f}s"
- )
-
- last_exception = None
- attempt_errors = []
-
- for attempt in range(1, LLAMACLOUD_MAX_RETRIES + 1):
- try:
- # Create a fresh httpx client for each attempt
- async with httpx.AsyncClient(timeout=custom_timeout) as custom_client:
- # Create LlamaParse parser instance with optimized settings
- parser = LlamaParse(
- api_key=app_config.LLAMA_CLOUD_API_KEY,
- num_workers=1, # Use single worker for file processing
- verbose=True,
- language="en",
- result_type=ResultType.MD,
- # Timeout settings for large files
- max_timeout=int(max(2000, job_timeout + upload_timeout)),
- job_timeout_in_seconds=job_timeout,
- job_timeout_extra_time_per_page_in_seconds=PER_PAGE_JOB_TIMEOUT,
- # Use our custom client with larger timeouts
- custom_client=custom_client,
- )
-
- # Parse the file asynchronously
- result = await parser.aparse(file_path)
-
- # Success - log if we had previous failures
- if attempt > 1:
- logging.info(
- f"LlamaCloud upload succeeded on attempt {attempt} after "
- f"{len(attempt_errors)} failures"
- )
-
- return result
-
- except LLAMACLOUD_RETRYABLE_EXCEPTIONS as e:
- last_exception = e
- error_type = type(e).__name__
- error_msg = str(e)[:200]
- attempt_errors.append(f"Attempt {attempt}: {error_type} - {error_msg}")
-
- if attempt < LLAMACLOUD_MAX_RETRIES:
- # Calculate exponential backoff with jitter
- # Base delay doubles each attempt, capped at max delay
- base_delay = min(
- LLAMACLOUD_BASE_DELAY * (2 ** (attempt - 1)), LLAMACLOUD_MAX_DELAY
- )
- # Add random jitter (±25%) to prevent thundering herd
- jitter = base_delay * 0.25 * (2 * random.random() - 1)
- delay = base_delay + jitter
-
- if task_logger and log_entry:
- await task_logger.log_task_progress(
- log_entry,
- f"LlamaCloud upload failed (attempt {attempt}/{LLAMACLOUD_MAX_RETRIES}), retrying in {delay:.0f}s",
- {
- "error_type": error_type,
- "error_message": error_msg,
- "attempt": attempt,
- "retry_delay": delay,
- "file_size_mb": round(file_size_mb, 1),
- "upload_timeout": upload_timeout,
- },
- )
- else:
- logging.warning(
- f"LlamaCloud upload failed (attempt {attempt}/{LLAMACLOUD_MAX_RETRIES}): "
- f"{error_type}. File: {file_size_mb:.1f}MB. Retrying in {delay:.0f}s..."
- )
-
- await asyncio.sleep(delay)
- else:
- logging.error(
- f"LlamaCloud upload failed after {LLAMACLOUD_MAX_RETRIES} attempts. "
- f"File size: {file_size_mb:.1f}MB, Pages: {estimated_pages}. "
- f"Errors: {'; '.join(attempt_errors)}"
- )
-
- except Exception:
- # Non-retryable exception, raise immediately
- raise
-
- # All retries exhausted
- raise last_exception or RuntimeError(
- f"LlamaCloud parsing failed after {LLAMACLOUD_MAX_RETRIES} retries. "
- f"File size: {file_size_mb:.1f}MB"
- )
-
-
-async def add_received_file_document_using_unstructured(
- session: AsyncSession,
- file_name: str,
- unstructured_processed_elements: list[LangChainDocument],
- search_space_id: int,
- user_id: str,
- connector: dict | None = None,
- enable_summary: bool = True,
-) -> Document | None:
- """
- Process and store a file document using Unstructured service.
-
- Args:
- session: Database session
- file_name: Name of the processed file
- unstructured_processed_elements: Processed elements from Unstructured
- search_space_id: ID of the search space
- user_id: ID of the user
- connector: Optional connector info for Google Drive files
-
- Returns:
- Document object if successful, None if failed
- """
- try:
- file_in_markdown = await convert_document_to_markdown(
- unstructured_processed_elements
+ def __post_init__(self) -> None:
+ self.enable_summary = (
+ self.connector.get("enable_summary", True) if self.connector else True
)
- # Generate unique identifier hash (uses file_id for Google Drive, filename for others)
- primary_hash, legacy_hash = get_google_drive_unique_identifier(
- connector, file_name, search_space_id
- )
- # Generate content hash
- content_hash = generate_content_hash(file_in_markdown, search_space_id)
-
- # Check if document exists (with migration support for Google Drive and content_hash fallback)
- existing_document = await find_existing_document_with_migration(
- session, primary_hash, legacy_hash, content_hash
- )
-
- if existing_document:
- # Handle existing document (rename detection, content change check)
- should_skip, doc = await handle_existing_document_update(
- session,
- existing_document,
- content_hash,
- connector,
- file_name,
- primary_hash,
- )
- if should_skip:
- return doc
- # Content changed - continue to update
-
- # Get user's long context LLM (needed for both create and update)
- user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
- if not user_llm:
- raise RuntimeError(
- f"No long context LLM configured for user {user_id} in search space {search_space_id}"
- )
-
- # Generate summary with metadata
- document_metadata = {
- "file_name": file_name,
- "etl_service": "UNSTRUCTURED",
- "document_type": "File Document",
- }
- if enable_summary:
- summary_content, summary_embedding = await generate_document_summary(
- file_in_markdown, user_llm, document_metadata
- )
- else:
- summary_content = f"File: {file_name}\n\n{file_in_markdown[:4000]}"
- summary_embedding = embed_text(summary_content)
-
- # Process chunks
- chunks = await create_document_chunks(file_in_markdown)
-
- # Update or create document
- if existing_document:
- # Update existing document
- existing_document.title = file_name
- existing_document.content = summary_content
- existing_document.content_hash = content_hash
- existing_document.embedding = summary_embedding
- existing_document.document_metadata = {
- "FILE_NAME": file_name,
- "ETL_SERVICE": "UNSTRUCTURED",
- }
- await safe_set_chunks(session, existing_document, chunks)
- existing_document.source_markdown = file_in_markdown
- existing_document.content_needs_reindexing = False
- existing_document.updated_at = get_current_timestamp()
- existing_document.status = DocumentStatus.ready()
-
- await session.commit()
- await session.refresh(existing_document)
- document = existing_document
- else:
- # Create new document
- doc_type = DocumentType.FILE
- if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
- doc_type = DocumentType.GOOGLE_DRIVE_FILE
-
- document = Document(
- search_space_id=search_space_id,
- title=file_name,
- document_type=doc_type,
- document_metadata={
- "FILE_NAME": file_name,
- "ETL_SERVICE": "UNSTRUCTURED",
- },
- content=summary_content,
- embedding=summary_embedding,
- chunks=chunks,
- content_hash=content_hash,
- unique_identifier_hash=primary_hash,
- source_markdown=file_in_markdown,
- content_needs_reindexing=False,
- updated_at=get_current_timestamp(),
- created_by_id=user_id,
- connector_id=connector.get("connector_id") if connector else None,
- status=DocumentStatus.ready(),
- )
-
- session.add(document)
- await session.commit()
- await session.refresh(document)
-
- return document
- except SQLAlchemyError as db_error:
- await session.rollback()
- if "ix_documents_content_hash" in str(db_error):
- logging.warning(
- "content_hash collision during commit for %s (Unstructured). Skipping.",
- file_name,
- )
- return None
- raise db_error
- except Exception as e:
- await session.rollback()
- raise RuntimeError(f"Failed to process file document: {e!s}") from e
+# ---------------------------------------------------------------------------
+# Notification helper
+# ---------------------------------------------------------------------------
-async def add_received_file_document_using_llamacloud(
- session: AsyncSession,
- file_name: str,
- llamacloud_markdown_document: str,
- search_space_id: int,
- user_id: str,
- connector: dict | None = None,
- enable_summary: bool = True,
-) -> Document | None:
- """
- Process and store document content parsed by LlamaCloud.
-
- Args:
- session: Database session
- file_name: Name of the processed file
- llamacloud_markdown_document: Markdown content from LlamaCloud parsing
- search_space_id: ID of the search space
- user_id: ID of the user
- connector: Optional connector info for Google Drive files
-
- Returns:
- Document object if successful, None if failed
- """
- try:
- # Combine all markdown documents into one
- file_in_markdown = llamacloud_markdown_document
-
- # Generate unique identifier hash (uses file_id for Google Drive, filename for others)
- primary_hash, legacy_hash = get_google_drive_unique_identifier(
- connector, file_name, search_space_id
- )
-
- # Generate content hash
- content_hash = generate_content_hash(file_in_markdown, search_space_id)
-
- # Check if document exists (with migration support for Google Drive and content_hash fallback)
- existing_document = await find_existing_document_with_migration(
- session, primary_hash, legacy_hash, content_hash
- )
-
- if existing_document:
- # Handle existing document (rename detection, content change check)
- should_skip, doc = await handle_existing_document_update(
- session,
- existing_document,
- content_hash,
- connector,
- file_name,
- primary_hash,
- )
- if should_skip:
- return doc
- # Content changed - continue to update
-
- # Get user's long context LLM (needed for both create and update)
- user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
- if not user_llm:
- raise RuntimeError(
- f"No long context LLM configured for user {user_id} in search space {search_space_id}"
- )
-
- # Generate summary with metadata
- document_metadata = {
- "file_name": file_name,
- "etl_service": "LLAMACLOUD",
- "document_type": "File Document",
- }
- if enable_summary:
- summary_content, summary_embedding = await generate_document_summary(
- file_in_markdown, user_llm, document_metadata
- )
- else:
- summary_content = f"File: {file_name}\n\n{file_in_markdown[:4000]}"
- summary_embedding = embed_text(summary_content)
-
- # Process chunks
- chunks = await create_document_chunks(file_in_markdown)
-
- # Update or create document
- if existing_document:
- existing_document.title = file_name
- existing_document.content = summary_content
- existing_document.content_hash = content_hash
- existing_document.embedding = summary_embedding
- existing_document.document_metadata = {
- "FILE_NAME": file_name,
- "ETL_SERVICE": "LLAMACLOUD",
- }
- await safe_set_chunks(session, existing_document, chunks)
- existing_document.source_markdown = file_in_markdown
- existing_document.content_needs_reindexing = False
- existing_document.updated_at = get_current_timestamp()
- existing_document.status = DocumentStatus.ready()
-
- await session.commit()
- await session.refresh(existing_document)
- document = existing_document
- else:
- doc_type = DocumentType.FILE
- if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
- doc_type = DocumentType.GOOGLE_DRIVE_FILE
-
- document = Document(
- search_space_id=search_space_id,
- title=file_name,
- document_type=doc_type,
- document_metadata={
- "FILE_NAME": file_name,
- "ETL_SERVICE": "LLAMACLOUD",
- },
- content=summary_content,
- embedding=summary_embedding,
- chunks=chunks,
- content_hash=content_hash,
- unique_identifier_hash=primary_hash,
- source_markdown=file_in_markdown,
- content_needs_reindexing=False,
- updated_at=get_current_timestamp(),
- created_by_id=user_id,
- connector_id=connector.get("connector_id") if connector else None,
- status=DocumentStatus.ready(),
- )
-
- session.add(document)
- await session.commit()
- await session.refresh(document)
-
- return document
- except SQLAlchemyError as db_error:
- await session.rollback()
- if "ix_documents_content_hash" in str(db_error):
- logging.warning(
- "content_hash collision during commit for %s (LlamaCloud). Skipping.",
- file_name,
- )
- return None
- raise db_error
- except Exception as e:
- await session.rollback()
- raise RuntimeError(
- f"Failed to process file document using LlamaCloud: {e!s}"
- ) from e
-
-
-async def add_received_file_document_using_docling(
- session: AsyncSession,
- file_name: str,
- docling_markdown_document: str,
- search_space_id: int,
- user_id: str,
- connector: dict | None = None,
- enable_summary: bool = True,
-) -> Document | None:
- """
- Process and store document content parsed by Docling.
-
- Args:
- session: Database session
- file_name: Name of the processed file
- docling_markdown_document: Markdown content from Docling parsing
- search_space_id: ID of the search space
- user_id: ID of the user
- connector: Optional connector info for Google Drive files
-
- Returns:
- Document object if successful, None if failed
- """
- try:
- file_in_markdown = docling_markdown_document
-
- # Generate unique identifier hash (uses file_id for Google Drive, filename for others)
- primary_hash, legacy_hash = get_google_drive_unique_identifier(
- connector, file_name, search_space_id
- )
-
- # Generate content hash
- content_hash = generate_content_hash(file_in_markdown, search_space_id)
-
- # Check if document exists (with migration support for Google Drive and content_hash fallback)
- existing_document = await find_existing_document_with_migration(
- session, primary_hash, legacy_hash, content_hash
- )
-
- if existing_document:
- # Handle existing document (rename detection, content change check)
- should_skip, doc = await handle_existing_document_update(
- session,
- existing_document,
- content_hash,
- connector,
- file_name,
- primary_hash,
- )
- if should_skip:
- return doc
- # Content changed - continue to update
-
- # Get user's long context LLM (needed for both create and update)
- user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
- if not user_llm:
- raise RuntimeError(
- f"No long context LLM configured for user {user_id} in search_space {search_space_id}"
- )
-
- if enable_summary:
- from app.services.docling_service import create_docling_service
-
- docling_service = create_docling_service()
-
- summary_content = await docling_service.process_large_document_summary(
- content=file_in_markdown, llm=user_llm, document_title=file_name
- )
-
- document_metadata = {
- "file_name": file_name,
- "etl_service": "DOCLING",
- "document_type": "File Document",
- }
- metadata_parts = ["# DOCUMENT METADATA"]
- for key, value in document_metadata.items():
- if value:
- formatted_key = key.replace("_", " ").title()
- metadata_parts.append(f"**{formatted_key}:** {value}")
-
- metadata_section = "\n".join(metadata_parts)
- enhanced_summary_content = (
- f"{metadata_section}\n\n# DOCUMENT SUMMARY\n\n{summary_content}"
- )
- else:
- enhanced_summary_content = f"File: {file_name}\n\n{file_in_markdown[:4000]}"
-
- summary_embedding = embed_text(enhanced_summary_content)
-
- # Process chunks
- chunks = await create_document_chunks(file_in_markdown)
-
- # Update or create document
- if existing_document:
- # Update existing document
- existing_document.title = file_name
- existing_document.content = enhanced_summary_content
- existing_document.content_hash = content_hash
- existing_document.embedding = summary_embedding
- existing_document.document_metadata = {
- "FILE_NAME": file_name,
- "ETL_SERVICE": "DOCLING",
- }
- await safe_set_chunks(session, existing_document, chunks)
- existing_document.source_markdown = file_in_markdown
- existing_document.content_needs_reindexing = False
- existing_document.updated_at = get_current_timestamp()
- existing_document.status = DocumentStatus.ready() # Mark as ready
-
- await session.commit()
- await session.refresh(existing_document)
- document = existing_document
- else:
- # Create new document
- # Determine document type based on connector
- doc_type = DocumentType.FILE
- if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
- doc_type = DocumentType.GOOGLE_DRIVE_FILE
-
- document = Document(
- search_space_id=search_space_id,
- title=file_name,
- document_type=doc_type,
- document_metadata={
- "FILE_NAME": file_name,
- "ETL_SERVICE": "DOCLING",
- },
- content=enhanced_summary_content,
- embedding=summary_embedding,
- chunks=chunks,
- content_hash=content_hash,
- unique_identifier_hash=primary_hash,
- source_markdown=file_in_markdown,
- content_needs_reindexing=False,
- updated_at=get_current_timestamp(),
- created_by_id=user_id,
- connector_id=connector.get("connector_id") if connector else None,
- status=DocumentStatus.ready(), # Mark as ready
- )
-
- session.add(document)
- await session.commit()
- await session.refresh(document)
-
- return document
- except SQLAlchemyError as db_error:
- await session.rollback()
- if "ix_documents_content_hash" in str(db_error):
- logging.warning(
- "content_hash collision during commit for %s (Docling). Skipping.",
- file_name,
- )
- return None
- raise db_error
- except Exception as e:
- await session.rollback()
- raise RuntimeError(
- f"Failed to process file document using Docling: {e!s}"
- ) from e
-
-
-async def _update_document_from_connector(
- document: Document | None, connector: dict | None, session: AsyncSession
+async def _notify(
+ ctx: _ProcessingContext,
+ stage: str,
+ stage_message: str | None = None,
+ **kwargs,
) -> None:
- """Helper to update document type, metadata, and connector_id from connector info."""
- if document and connector:
- if "type" in connector:
- document.document_type = connector["type"]
- if "metadata" in connector:
- # Merge with existing document_metadata (the actual column name)
- if not document.document_metadata:
- document.document_metadata = connector["metadata"]
- else:
- # Expand existing metadata with connector metadata
- merged = {**document.document_metadata, **connector["metadata"]}
- document.document_metadata = merged
- # Set connector_id if provided for de-indexing support
- if "connector_id" in connector:
- document.connector_id = connector["connector_id"]
- await session.commit()
+ """Send a processing-progress notification if one is attached."""
+ if not ctx.notification:
+ return
+ await NotificationService.document_processing.notify_processing_progress(
+ ctx.session,
+ ctx.notification,
+ stage=stage,
+ stage_message=stage_message,
+ **kwargs,
+ )
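+
+# Call sites pass just a stage, e.g. `await _notify(ctx, "chunking")`, or a
+# stage plus a human-readable message, e.g.
+# `await _notify(ctx, "parsing", "Extracting content")`; extra keyword
+# arguments such as `chunks_count` are forwarded to the notification service
+# unchanged.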
+
+
+# ---------------------------------------------------------------------------
+# Page-limit helpers
+# ---------------------------------------------------------------------------
+
+
+def _estimate_pages_safe(page_limit_service, file_path: str) -> int:
+ """Estimate page count with a file-size fallback."""
+ try:
+ return page_limit_service.estimate_pages_before_processing(file_path)
+    except Exception:
+        # Fall back to a conservative ~80KB-per-page estimate from the file
+        # size (e.g. an 800KB file comes out to 10 pages).
+        file_size = os.path.getsize(file_path)
+        return max(1, file_size // (80 * 1024))
+
+
+async def _log_page_divergence(
+ task_logger: TaskLoggingService,
+ log_entry: Log,
+ filename: str,
+ estimated: int,
+ actual: int,
+ final: int,
+) -> None:
+ """Log a warning when the actual page count far exceeds the pre-estimate."""
+ if actual > estimated * 1.5:
+ await task_logger.log_task_progress(
+ log_entry,
+ f"Actual page count higher than estimate: {filename}",
+ {
+ "estimated_before": estimated,
+ "actual_pages": actual,
+ "using_count": final,
+ },
+ )
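+
+# Worked example of the 1.5x threshold: with estimated=10 the warning fires
+# only when the parser reports more than 15 actual pages; callers then record
+# final = max(estimated, actual) in either case.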
+
+
+# ===================================================================
+# Handlers for process_file_in_background (legacy / connector path)
+# ===================================================================
+
+
+async def _process_markdown_upload(ctx: _ProcessingContext) -> Document | None:
+ """Read a markdown / text file and create or update a document."""
+ await _notify(ctx, "parsing", "Reading file")
+ await ctx.task_logger.log_task_progress(
+ ctx.log_entry,
+ f"Processing markdown/text file: {ctx.filename}",
+ {"file_type": "markdown", "processing_stage": "reading_file"},
+ )
+
+ with open(ctx.file_path, encoding="utf-8") as f:
+ markdown_content = f.read()
+
+ with contextlib.suppress(Exception):
+ os.unlink(ctx.file_path)
+
+ await _notify(ctx, "chunking")
+ await ctx.task_logger.log_task_progress(
+ ctx.log_entry,
+ f"Creating document from markdown content: {ctx.filename}",
+ {
+ "processing_stage": "creating_document",
+ "content_length": len(markdown_content),
+ },
+ )
+
+ result = await add_received_markdown_file_document(
+ ctx.session,
+ ctx.filename,
+ markdown_content,
+ ctx.search_space_id,
+ ctx.user_id,
+ ctx.connector,
+ )
+ if ctx.connector:
+ await update_document_from_connector(result, ctx.connector, ctx.session)
+
+ if result:
+ await ctx.task_logger.log_task_success(
+ ctx.log_entry,
+ f"Successfully processed markdown file: {ctx.filename}",
+ {
+ "document_id": result.id,
+ "content_hash": result.content_hash,
+ "file_type": "markdown",
+ },
+ )
+ else:
+ await ctx.task_logger.log_task_success(
+ ctx.log_entry,
+ f"Markdown file already exists (duplicate): {ctx.filename}",
+ {"duplicate_detected": True, "file_type": "markdown"},
+ )
+ return result
+
+
+async def _process_direct_convert_upload(ctx: _ProcessingContext) -> Document | None:
+ """Convert a text-based file (csv/tsv/html) to markdown without ETL."""
+ await _notify(ctx, "parsing", "Converting file")
+ await ctx.task_logger.log_task_progress(
+ ctx.log_entry,
+ f"Direct-converting file to markdown: {ctx.filename}",
+ {"file_type": "direct_convert", "processing_stage": "converting"},
+ )
+
+ markdown_content = convert_file_directly(ctx.file_path, ctx.filename)
+
+ with contextlib.suppress(Exception):
+ os.unlink(ctx.file_path)
+
+ await _notify(ctx, "chunking")
+ await ctx.task_logger.log_task_progress(
+ ctx.log_entry,
+ f"Creating document from converted content: {ctx.filename}",
+ {
+ "processing_stage": "creating_document",
+ "content_length": len(markdown_content),
+ },
+ )
+
+ result = await add_received_markdown_file_document(
+ ctx.session,
+ ctx.filename,
+ markdown_content,
+ ctx.search_space_id,
+ ctx.user_id,
+ ctx.connector,
+ )
+ if ctx.connector:
+ await update_document_from_connector(result, ctx.connector, ctx.session)
+
+ if result:
+ await ctx.task_logger.log_task_success(
+ ctx.log_entry,
+ f"Successfully direct-converted file: {ctx.filename}",
+ {
+ "document_id": result.id,
+ "content_hash": result.content_hash,
+ "file_type": "direct_convert",
+ },
+ )
+ else:
+ await ctx.task_logger.log_task_success(
+ ctx.log_entry,
+ f"Direct-converted file already exists (duplicate): {ctx.filename}",
+ {"duplicate_detected": True, "file_type": "direct_convert"},
+ )
+ return result
+
+
+async def _process_audio_upload(ctx: _ProcessingContext) -> Document | None:
+ """Transcribe an audio file and create or update a document."""
+ await _notify(ctx, "parsing", "Transcribing audio")
+ await ctx.task_logger.log_task_progress(
+ ctx.log_entry,
+ f"Processing audio file for transcription: {ctx.filename}",
+ {"file_type": "audio", "processing_stage": "starting_transcription"},
+ )
+
+ stt_service_type = (
+ "local"
+ if app_config.STT_SERVICE and app_config.STT_SERVICE.startswith("local/")
+ else "external"
+ )
+
+ if stt_service_type == "local":
+ from app.services.stt_service import stt_service
+
+ try:
+ stt_result = stt_service.transcribe_file(ctx.file_path)
+ transcribed_text = stt_result.get("text", "")
+ if not transcribed_text:
+ raise ValueError("Transcription returned empty text")
+ transcribed_text = (
+ f"# Transcription of {ctx.filename}\n\n{transcribed_text}"
+ )
+ except Exception as e:
+ raise HTTPException(
+ status_code=422,
+ detail=f"Failed to transcribe audio file {ctx.filename}: {e!s}",
+ ) from e
+
+ await ctx.task_logger.log_task_progress(
+ ctx.log_entry,
+ f"Local STT transcription completed: {ctx.filename}",
+ {
+ "processing_stage": "local_transcription_complete",
+ "language": stt_result.get("language"),
+ "confidence": stt_result.get("language_probability"),
+ "duration": stt_result.get("duration"),
+ },
+ )
+ else:
+ from litellm import atranscription
+
+ with open(ctx.file_path, "rb") as audio_file:
+ transcription_kwargs: dict = {
+ "model": app_config.STT_SERVICE,
+ "file": audio_file,
+ "api_key": app_config.STT_SERVICE_API_KEY,
+ }
+ if app_config.STT_SERVICE_API_BASE:
+ transcription_kwargs["api_base"] = app_config.STT_SERVICE_API_BASE
+
+ transcription_response = await atranscription(**transcription_kwargs)
+ transcribed_text = transcription_response.get("text", "")
+ if not transcribed_text:
+ raise ValueError("Transcription returned empty text")
+
+ transcribed_text = f"# Transcription of {ctx.filename}\n\n{transcribed_text}"
+
+ await ctx.task_logger.log_task_progress(
+ ctx.log_entry,
+ f"Transcription completed, creating document: {ctx.filename}",
+ {
+ "processing_stage": "transcription_complete",
+ "transcript_length": len(transcribed_text),
+ },
+ )
+
+ await _notify(ctx, "chunking")
+
+ with contextlib.suppress(Exception):
+ os.unlink(ctx.file_path)
+
+ result = await add_received_markdown_file_document(
+ ctx.session,
+ ctx.filename,
+ transcribed_text,
+ ctx.search_space_id,
+ ctx.user_id,
+ ctx.connector,
+ )
+ if ctx.connector:
+ await update_document_from_connector(result, ctx.connector, ctx.session)
+
+ if result:
+ await ctx.task_logger.log_task_success(
+ ctx.log_entry,
+ f"Successfully transcribed and processed audio file: {ctx.filename}",
+ {
+ "document_id": result.id,
+ "content_hash": result.content_hash,
+ "file_type": "audio",
+ "transcript_length": len(transcribed_text),
+ "stt_service": stt_service_type,
+ },
+ )
+ else:
+ await ctx.task_logger.log_task_success(
+ ctx.log_entry,
+ f"Audio file transcript already exists (duplicate): {ctx.filename}",
+ {"duplicate_detected": True, "file_type": "audio"},
+ )
+ return result
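+
+# Each handler above returns the created or updated Document, or None when
+# the add_* helper detected a duplicate of existing content.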
+
+
+# ---------------------------------------------------------------------------
+# Document file processing (ETL service dispatch)
+# ---------------------------------------------------------------------------
+
+
+async def _etl_unstructured(
+ ctx: _ProcessingContext,
+ page_limit_service,
+ estimated_pages: int,
+) -> Document | None:
+ """Parse and save via the Unstructured ETL service."""
+ await _notify(ctx, "parsing", "Extracting content")
+ await ctx.task_logger.log_task_progress(
+ ctx.log_entry,
+ f"Processing file with Unstructured ETL: {ctx.filename}",
+ {
+ "file_type": "document",
+ "etl_service": "UNSTRUCTURED",
+ "processing_stage": "loading",
+ },
+ )
+
+ docs = await parse_with_unstructured(ctx.file_path)
+
+ await _notify(ctx, "chunking", chunks_count=len(docs))
+ await ctx.task_logger.log_task_progress(
+ ctx.log_entry,
+ f"Unstructured ETL completed, creating document: {ctx.filename}",
+ {"processing_stage": "etl_complete", "elements_count": len(docs)},
+ )
+
+ actual_pages = page_limit_service.estimate_pages_from_elements(docs)
+ final_pages = max(estimated_pages, actual_pages)
+ await _log_page_divergence(
+ ctx.task_logger,
+ ctx.log_entry,
+ ctx.filename,
+ estimated_pages,
+ actual_pages,
+ final_pages,
+ )
+
+ with contextlib.suppress(Exception):
+ os.unlink(ctx.file_path)
+
+ result = await add_received_file_document_using_unstructured(
+ ctx.session,
+ ctx.filename,
+ docs,
+ ctx.search_space_id,
+ ctx.user_id,
+ ctx.connector,
+ enable_summary=ctx.enable_summary,
+ )
+ if ctx.connector:
+ await update_document_from_connector(result, ctx.connector, ctx.session)
+
+ if result:
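+        # allow_exceed=True: the document was already created after passing
+        # the initial page-limit check, so the usage is recorded even if it
+        # now pushes the user over the limit.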
+ await page_limit_service.update_page_usage(
+ ctx.user_id, final_pages, allow_exceed=True
+ )
+ await ctx.task_logger.log_task_success(
+ ctx.log_entry,
+ f"Successfully processed file with Unstructured: {ctx.filename}",
+ {
+ "document_id": result.id,
+ "content_hash": result.content_hash,
+ "file_type": "document",
+ "etl_service": "UNSTRUCTURED",
+ "pages_processed": final_pages,
+ },
+ )
+ else:
+ await ctx.task_logger.log_task_success(
+ ctx.log_entry,
+ f"Document already exists (duplicate): {ctx.filename}",
+ {
+ "duplicate_detected": True,
+ "file_type": "document",
+ "etl_service": "UNSTRUCTURED",
+ },
+ )
+ return result
+
+
+async def _etl_llamacloud(
+ ctx: _ProcessingContext,
+ page_limit_service,
+ estimated_pages: int,
+) -> Document | None:
+ """Parse and save via the LlamaCloud ETL service."""
+ await _notify(ctx, "parsing", "Extracting content")
+ await ctx.task_logger.log_task_progress(
+ ctx.log_entry,
+ f"Processing file with LlamaCloud ETL: {ctx.filename}",
+ {
+ "file_type": "document",
+ "etl_service": "LLAMACLOUD",
+ "processing_stage": "parsing",
+ "estimated_pages": estimated_pages,
+ },
+ )
+
+ raw_result = await parse_with_llamacloud_retry(
+ file_path=ctx.file_path,
+ estimated_pages=estimated_pages,
+ task_logger=ctx.task_logger,
+ log_entry=ctx.log_entry,
+ )
+
+ with contextlib.suppress(Exception):
+ os.unlink(ctx.file_path)
+
+ markdown_documents = await raw_result.aget_markdown_documents(split_by_page=False)
+
+ await _notify(ctx, "chunking", chunks_count=len(markdown_documents))
+ await ctx.task_logger.log_task_progress(
+ ctx.log_entry,
+ f"LlamaCloud parsing completed, creating documents: {ctx.filename}",
+ {
+ "processing_stage": "parsing_complete",
+ "documents_count": len(markdown_documents),
+ },
+ )
+
+ if not markdown_documents:
+ await ctx.task_logger.log_task_failure(
+ ctx.log_entry,
+ f"LlamaCloud parsing returned no documents: {ctx.filename}",
+ "ETL service returned empty document list",
+ {"error_type": "EmptyDocumentList", "etl_service": "LLAMACLOUD"},
+ )
+ raise ValueError(f"LlamaCloud parsing returned no documents for {ctx.filename}")
+
+ actual_pages = page_limit_service.estimate_pages_from_markdown(markdown_documents)
+ final_pages = max(estimated_pages, actual_pages)
+ await _log_page_divergence(
+ ctx.task_logger,
+ ctx.log_entry,
+ ctx.filename,
+ estimated_pages,
+ actual_pages,
+ final_pages,
+ )
+
+ any_created = False
+ last_doc: Document | None = None
+
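+    # Persist each parsed part; the add_* helper returns None for duplicates,
+    # so any_created stays False only when every part already existed.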
+ for doc in markdown_documents:
+ doc_result = await add_received_file_document_using_llamacloud(
+ ctx.session,
+ ctx.filename,
+ llamacloud_markdown_document=doc.text,
+ search_space_id=ctx.search_space_id,
+ user_id=ctx.user_id,
+ connector=ctx.connector,
+ enable_summary=ctx.enable_summary,
+ )
+ if doc_result:
+ any_created = True
+ last_doc = doc_result
+
+ if any_created:
+ await page_limit_service.update_page_usage(
+ ctx.user_id, final_pages, allow_exceed=True
+ )
+ if ctx.connector:
+ await update_document_from_connector(last_doc, ctx.connector, ctx.session)
+ await ctx.task_logger.log_task_success(
+ ctx.log_entry,
+ f"Successfully processed file with LlamaCloud: {ctx.filename}",
+ {
+ "document_id": last_doc.id,
+ "content_hash": last_doc.content_hash,
+ "file_type": "document",
+ "etl_service": "LLAMACLOUD",
+ "pages_processed": final_pages,
+ "documents_count": len(markdown_documents),
+ },
+ )
+ return last_doc
+
+ await ctx.task_logger.log_task_success(
+ ctx.log_entry,
+ f"Document already exists (duplicate): {ctx.filename}",
+ {
+ "duplicate_detected": True,
+ "file_type": "document",
+ "etl_service": "LLAMACLOUD",
+ "documents_count": len(markdown_documents),
+ },
+ )
+ return None
+
+
+async def _etl_docling(
+ ctx: _ProcessingContext,
+ page_limit_service,
+ estimated_pages: int,
+) -> Document | None:
+ """Parse and save via the Docling ETL service."""
+ await _notify(ctx, "parsing", "Extracting content")
+ await ctx.task_logger.log_task_progress(
+ ctx.log_entry,
+ f"Processing file with Docling ETL: {ctx.filename}",
+ {
+ "file_type": "document",
+ "etl_service": "DOCLING",
+ "processing_stage": "parsing",
+ },
+ )
+
+ content = await parse_with_docling(ctx.file_path, ctx.filename)
+
+ with contextlib.suppress(Exception):
+ os.unlink(ctx.file_path)
+
+ await ctx.task_logger.log_task_progress(
+ ctx.log_entry,
+ f"Docling parsing completed, creating document: {ctx.filename}",
+ {"processing_stage": "parsing_complete", "content_length": len(content)},
+ )
+
+ actual_pages = page_limit_service.estimate_pages_from_content_length(len(content))
+ final_pages = max(estimated_pages, actual_pages)
+ await _log_page_divergence(
+ ctx.task_logger,
+ ctx.log_entry,
+ ctx.filename,
+ estimated_pages,
+ actual_pages,
+ final_pages,
+ )
+
+ await _notify(ctx, "chunking")
+
+ result = await add_received_file_document_using_docling(
+ ctx.session,
+ ctx.filename,
+ docling_markdown_document=content,
+ search_space_id=ctx.search_space_id,
+ user_id=ctx.user_id,
+ connector=ctx.connector,
+ enable_summary=ctx.enable_summary,
+ )
+
+ if result:
+ await page_limit_service.update_page_usage(
+ ctx.user_id, final_pages, allow_exceed=True
+ )
+ if ctx.connector:
+ await update_document_from_connector(result, ctx.connector, ctx.session)
+ await ctx.task_logger.log_task_success(
+ ctx.log_entry,
+ f"Successfully processed file with Docling: {ctx.filename}",
+ {
+ "document_id": result.id,
+ "content_hash": result.content_hash,
+ "file_type": "document",
+ "etl_service": "DOCLING",
+ "pages_processed": final_pages,
+ },
+ )
+ else:
+ await ctx.task_logger.log_task_success(
+ ctx.log_entry,
+ f"Document already exists (duplicate): {ctx.filename}",
+ {
+ "duplicate_detected": True,
+ "file_type": "document",
+ "etl_service": "DOCLING",
+ },
+ )
+ return result
+
+
+async def _process_document_upload(ctx: _ProcessingContext) -> Document | None:
+ """Route a document file to the configured ETL service."""
+ from app.services.page_limit_service import PageLimitExceededError, PageLimitService
+
+ page_limit_service = PageLimitService(ctx.session)
+ estimated_pages = _estimate_pages_safe(page_limit_service, ctx.file_path)
+
+ await ctx.task_logger.log_task_progress(
+ ctx.log_entry,
+ f"Estimated {estimated_pages} pages for file: {ctx.filename}",
+ {"estimated_pages": estimated_pages, "file_type": "document"},
+ )
+
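+    # Check the page limit BEFORE calling the ETL service, so users never
+    # incur parsing costs on a file that would exceed their quota.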
+ try:
+ await page_limit_service.check_page_limit(ctx.user_id, estimated_pages)
+ except PageLimitExceededError as e:
+ await ctx.task_logger.log_task_failure(
+ ctx.log_entry,
+ f"Page limit exceeded before processing: {ctx.filename}",
+ str(e),
+ {
+ "error_type": "PageLimitExceeded",
+ "pages_used": e.pages_used,
+ "pages_limit": e.pages_limit,
+ "estimated_pages": estimated_pages,
+ },
+ )
+ with contextlib.suppress(Exception):
+ os.unlink(ctx.file_path)
+ raise HTTPException(status_code=403, detail=str(e)) from e
+
+ etl_dispatch = {
+ "UNSTRUCTURED": _etl_unstructured,
+ "LLAMACLOUD": _etl_llamacloud,
+ "DOCLING": _etl_docling,
+ }
+ handler = etl_dispatch.get(app_config.ETL_SERVICE)
+ if handler is None:
+ raise RuntimeError(f"Unknown ETL_SERVICE: {app_config.ETL_SERVICE}")
+
+ return await handler(ctx, page_limit_service, estimated_pages)
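+
+
+# Adding another ETL backend only requires a handler with the same
+# (ctx, page_limit_service, estimated_pages) signature and a row in
+# etl_dispatch, e.g. (hypothetical name):
+#     etl_dispatch["MYPARSER"] = _etl_myparser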
+
+
+# ===================================================================
+# Public orchestrators
+# ===================================================================
async def process_file_in_background(
@@ -910,726 +690,35 @@ async def process_file_in_background(
session: AsyncSession,
task_logger: TaskLoggingService,
log_entry: Log,
- connector: dict
- | None = None, # Optional: {"type": "GOOGLE_DRIVE_FILE", "metadata": {...}}
- notification: Notification
- | None = None, # Optional notification for progress updates
+ connector: dict | None = None,
+ notification: Notification | None = None,
) -> Document | None:
+ ctx = _ProcessingContext(
+ session=session,
+ file_path=file_path,
+ filename=filename,
+ search_space_id=search_space_id,
+ user_id=user_id,
+ task_logger=task_logger,
+ log_entry=log_entry,
+ connector=connector,
+ notification=notification,
+ )
+
try:
- # Check if the file is a markdown or text file
- if filename.lower().endswith((".md", ".markdown", ".txt")):
- # Update notification: parsing stage
- if notification:
- await (
- NotificationService.document_processing.notify_processing_progress(
- session,
- notification,
- stage="parsing",
- stage_message="Reading file",
- )
- )
+ category = classify_file(filename)
- await task_logger.log_task_progress(
- log_entry,
- f"Processing markdown/text file: {filename}",
- {"file_type": "markdown", "processing_stage": "reading_file"},
- )
+ if category == FileCategory.MARKDOWN:
+ return await _process_markdown_upload(ctx)
+ if category == FileCategory.DIRECT_CONVERT:
+ return await _process_direct_convert_upload(ctx)
+ if category == FileCategory.AUDIO:
+ return await _process_audio_upload(ctx)
+ return await _process_document_upload(ctx)
- # For markdown files, read the content directly
- with open(file_path, encoding="utf-8") as f:
- markdown_content = f.read()
-
- # Clean up the temp file
- import os
-
- try:
- os.unlink(file_path)
- except Exception as e:
- print("Error deleting temp file", e)
- pass
-
- # Update notification: chunking stage
- if notification:
- await (
- NotificationService.document_processing.notify_processing_progress(
- session, notification, stage="chunking"
- )
- )
-
- await task_logger.log_task_progress(
- log_entry,
- f"Creating document from markdown content: {filename}",
- {
- "processing_stage": "creating_document",
- "content_length": len(markdown_content),
- },
- )
-
- # Process markdown directly through specialized function
- result = await add_received_markdown_file_document(
- session, filename, markdown_content, search_space_id, user_id, connector
- )
-
- if connector:
- await _update_document_from_connector(result, connector, session)
-
- if result:
- await task_logger.log_task_success(
- log_entry,
- f"Successfully processed markdown file: {filename}",
- {
- "document_id": result.id,
- "content_hash": result.content_hash,
- "file_type": "markdown",
- },
- )
- return result
- else:
- await task_logger.log_task_success(
- log_entry,
- f"Markdown file already exists (duplicate): {filename}",
- {"duplicate_detected": True, "file_type": "markdown"},
- )
- return None
-
- # Check if the file is an audio file
- elif filename.lower().endswith(
- (".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm")
- ):
- # Update notification: parsing stage (transcription)
- if notification:
- await (
- NotificationService.document_processing.notify_processing_progress(
- session,
- notification,
- stage="parsing",
- stage_message="Transcribing audio",
- )
- )
-
- await task_logger.log_task_progress(
- log_entry,
- f"Processing audio file for transcription: {filename}",
- {"file_type": "audio", "processing_stage": "starting_transcription"},
- )
-
- # Determine STT service type
- stt_service_type = (
- "local"
- if app_config.STT_SERVICE
- and app_config.STT_SERVICE.startswith("local/")
- else "external"
- )
-
- # Check if using local STT service
- if stt_service_type == "local":
- # Use local Faster-Whisper for transcription
- from app.services.stt_service import stt_service
-
- try:
- result = stt_service.transcribe_file(file_path)
- transcribed_text = result.get("text", "")
-
- if not transcribed_text:
- raise ValueError("Transcription returned empty text")
-
- # Add metadata about the transcription
- transcribed_text = (
- f"# Transcription of {filename}\n\n{transcribed_text}"
- )
- except Exception as e:
- raise HTTPException(
- status_code=422,
- detail=f"Failed to transcribe audio file {filename}: {e!s}",
- ) from e
-
- await task_logger.log_task_progress(
- log_entry,
- f"Local STT transcription completed: {filename}",
- {
- "processing_stage": "local_transcription_complete",
- "language": result.get("language"),
- "confidence": result.get("language_probability"),
- "duration": result.get("duration"),
- },
- )
- else:
- # Use LiteLLM for audio transcription
- with open(file_path, "rb") as audio_file:
- transcription_kwargs = {
- "model": app_config.STT_SERVICE,
- "file": audio_file,
- "api_key": app_config.STT_SERVICE_API_KEY,
- }
- if app_config.STT_SERVICE_API_BASE:
- transcription_kwargs["api_base"] = (
- app_config.STT_SERVICE_API_BASE
- )
-
- transcription_response = await atranscription(
- **transcription_kwargs
- )
-
- # Extract the transcribed text
- transcribed_text = transcription_response.get("text", "")
-
- if not transcribed_text:
- raise ValueError("Transcription returned empty text")
-
- # Add metadata about the transcription
- transcribed_text = (
- f"# Transcription of {filename}\n\n{transcribed_text}"
- )
-
- await task_logger.log_task_progress(
- log_entry,
- f"Transcription completed, creating document: {filename}",
- {
- "processing_stage": "transcription_complete",
- "transcript_length": len(transcribed_text),
- },
- )
-
- # Update notification: chunking stage
- if notification:
- await (
- NotificationService.document_processing.notify_processing_progress(
- session, notification, stage="chunking"
- )
- )
-
- # Clean up the temp file
- try:
- os.unlink(file_path)
- except Exception as e:
- print("Error deleting temp file", e)
- pass
-
- # Process transcription as markdown document
- result = await add_received_markdown_file_document(
- session, filename, transcribed_text, search_space_id, user_id, connector
- )
-
- if connector:
- await _update_document_from_connector(result, connector, session)
-
- if result:
- await task_logger.log_task_success(
- log_entry,
- f"Successfully transcribed and processed audio file: {filename}",
- {
- "document_id": result.id,
- "content_hash": result.content_hash,
- "file_type": "audio",
- "transcript_length": len(transcribed_text),
- "stt_service": stt_service_type,
- },
- )
- return result
- else:
- await task_logger.log_task_success(
- log_entry,
- f"Audio file transcript already exists (duplicate): {filename}",
- {"duplicate_detected": True, "file_type": "audio"},
- )
- return None
-
- else:
- # Import page limit service
- from app.services.page_limit_service import (
- PageLimitExceededError,
- PageLimitService,
- )
-
- # Initialize page limit service
- page_limit_service = PageLimitService(session)
-
- # CRITICAL: Estimate page count BEFORE making expensive ETL API calls
- # This prevents users from incurring costs on files that would exceed their limit
- try:
- estimated_pages_before = (
- page_limit_service.estimate_pages_before_processing(file_path)
- )
- except Exception:
- # If estimation fails, use a conservative estimate based on file size
- import os
-
- file_size = os.path.getsize(file_path)
- estimated_pages_before = max(
- 1, file_size // (80 * 1024)
- ) # ~80KB per page
-
- await task_logger.log_task_progress(
- log_entry,
- f"Estimated {estimated_pages_before} pages for file: {filename}",
- {
- "estimated_pages": estimated_pages_before,
- "file_type": "document",
- },
- )
-
- # Check page limit BEFORE calling ETL service to avoid unnecessary costs
- try:
- await page_limit_service.check_page_limit(
- user_id, estimated_pages_before
- )
- except PageLimitExceededError as e:
- await task_logger.log_task_failure(
- log_entry,
- f"Page limit exceeded before processing: {filename}",
- str(e),
- {
- "error_type": "PageLimitExceeded",
- "pages_used": e.pages_used,
- "pages_limit": e.pages_limit,
- "estimated_pages": estimated_pages_before,
- },
- )
- # Clean up the temp file
- import os
-
- with contextlib.suppress(Exception):
- os.unlink(file_path)
-
- raise HTTPException(
- status_code=403,
- detail=str(e),
- ) from e
-
- if app_config.ETL_SERVICE == "UNSTRUCTURED":
- # Update notification: parsing stage
- if notification:
- await NotificationService.document_processing.notify_processing_progress(
- session,
- notification,
- stage="parsing",
- stage_message="Extracting content",
- )
-
- await task_logger.log_task_progress(
- log_entry,
- f"Processing file with Unstructured ETL: {filename}",
- {
- "file_type": "document",
- "etl_service": "UNSTRUCTURED",
- "processing_stage": "loading",
- },
- )
-
- from langchain_unstructured import UnstructuredLoader
-
- # Process the file
- loader = UnstructuredLoader(
- file_path,
- mode="elements",
- post_processors=[],
- languages=["eng"],
- include_orig_elements=False,
- include_metadata=False,
- strategy="auto",
- )
-
- docs = await loader.aload()
-
- # Update notification: chunking stage
- if notification:
- await NotificationService.document_processing.notify_processing_progress(
- session, notification, stage="chunking", chunks_count=len(docs)
- )
-
- await task_logger.log_task_progress(
- log_entry,
- f"Unstructured ETL completed, creating document: {filename}",
- {"processing_stage": "etl_complete", "elements_count": len(docs)},
- )
-
- # Verify actual page count from parsed documents
- actual_pages = page_limit_service.estimate_pages_from_elements(docs)
-
- # Use the higher of the two estimates for safety (in case pre-estimate was too low)
- final_page_count = max(estimated_pages_before, actual_pages)
-
- # If actual is significantly higher than estimate, log a warning
- if actual_pages > estimated_pages_before * 1.5:
- await task_logger.log_task_progress(
- log_entry,
- f"Actual page count higher than estimate: {filename}",
- {
- "estimated_before": estimated_pages_before,
- "actual_pages": actual_pages,
- "using_count": final_page_count,
- },
- )
-
- # Clean up the temp file
- import os
-
- try:
- os.unlink(file_path)
- except Exception as e:
- print("Error deleting temp file", e)
- pass
-
- enable_summary = (
- connector.get("enable_summary", True) if connector else True
- )
- result = await add_received_file_document_using_unstructured(
- session,
- filename,
- docs,
- search_space_id,
- user_id,
- connector,
- enable_summary=enable_summary,
- )
-
- if connector:
- await _update_document_from_connector(result, connector, session)
-
- if result:
- # Update page usage after successful processing
- # allow_exceed=True because document was already created after passing initial check
- await page_limit_service.update_page_usage(
- user_id, final_page_count, allow_exceed=True
- )
-
- await task_logger.log_task_success(
- log_entry,
- f"Successfully processed file with Unstructured: {filename}",
- {
- "document_id": result.id,
- "content_hash": result.content_hash,
- "file_type": "document",
- "etl_service": "UNSTRUCTURED",
- "pages_processed": final_page_count,
- },
- )
- return result
- else:
- await task_logger.log_task_success(
- log_entry,
- f"Document already exists (duplicate): {filename}",
- {
- "duplicate_detected": True,
- "file_type": "document",
- "etl_service": "UNSTRUCTURED",
- },
- )
- return None
-
- elif app_config.ETL_SERVICE == "LLAMACLOUD":
- # Update notification: parsing stage
- if notification:
- await NotificationService.document_processing.notify_processing_progress(
- session,
- notification,
- stage="parsing",
- stage_message="Extracting content",
- )
-
- await task_logger.log_task_progress(
- log_entry,
- f"Processing file with LlamaCloud ETL: {filename}",
- {
- "file_type": "document",
- "etl_service": "LLAMACLOUD",
- "processing_stage": "parsing",
- "estimated_pages": estimated_pages_before,
- },
- )
-
- # Parse file with retry logic for SSL/connection errors (common with large files)
- result = await parse_with_llamacloud_retry(
- file_path=file_path,
- estimated_pages=estimated_pages_before,
- task_logger=task_logger,
- log_entry=log_entry,
- )
-
- # Clean up the temp file
- import os
-
- try:
- os.unlink(file_path)
- except Exception as e:
- print("Error deleting temp file", e)
- pass
-
- # Get markdown documents from the result
- markdown_documents = await result.aget_markdown_documents(
- split_by_page=False
- )
-
- # Update notification: chunking stage
- if notification:
- await NotificationService.document_processing.notify_processing_progress(
- session,
- notification,
- stage="chunking",
- chunks_count=len(markdown_documents),
- )
-
- await task_logger.log_task_progress(
- log_entry,
- f"LlamaCloud parsing completed, creating documents: {filename}",
- {
- "processing_stage": "parsing_complete",
- "documents_count": len(markdown_documents),
- },
- )
-
- # Check if LlamaCloud returned any documents
- if not markdown_documents or len(markdown_documents) == 0:
- await task_logger.log_task_failure(
- log_entry,
- f"LlamaCloud parsing returned no documents: {filename}",
- "ETL service returned empty document list",
- {
- "error_type": "EmptyDocumentList",
- "etl_service": "LLAMACLOUD",
- },
- )
- raise ValueError(
- f"LlamaCloud parsing returned no documents for {filename}"
- )
-
- # Verify actual page count from parsed markdown documents
- actual_pages = page_limit_service.estimate_pages_from_markdown(
- markdown_documents
- )
-
- # Use the higher of the two estimates for safety (in case pre-estimate was too low)
- final_page_count = max(estimated_pages_before, actual_pages)
-
- # If actual is significantly higher than estimate, log a warning
- if actual_pages > estimated_pages_before * 1.5:
- await task_logger.log_task_progress(
- log_entry,
- f"Actual page count higher than estimate: {filename}",
- {
- "estimated_before": estimated_pages_before,
- "actual_pages": actual_pages,
- "using_count": final_page_count,
- },
- )
-
- # Track if any document was successfully created (not a duplicate)
- any_doc_created = False
- last_created_doc = None
-
- for doc in markdown_documents:
- # Extract text content from the markdown documents
- markdown_content = doc.text
-
- enable_summary = (
- connector.get("enable_summary", True) if connector else True
- )
- doc_result = await add_received_file_document_using_llamacloud(
- session,
- filename,
- llamacloud_markdown_document=markdown_content,
- search_space_id=search_space_id,
- user_id=user_id,
- connector=connector,
- enable_summary=enable_summary,
- )
-
- # Track if this document was successfully created
- if doc_result:
- any_doc_created = True
- last_created_doc = doc_result
-
- # Update page usage once after processing all documents
- # Only update if at least one document was created (not all duplicates)
- if any_doc_created:
- # Update page usage after successful processing
- # allow_exceed=True because document was already created after passing initial check
- await page_limit_service.update_page_usage(
- user_id, final_page_count, allow_exceed=True
- )
-
- if connector:
- await _update_document_from_connector(
- last_created_doc, connector, session
- )
-
- await task_logger.log_task_success(
- log_entry,
- f"Successfully processed file with LlamaCloud: {filename}",
- {
- "document_id": last_created_doc.id,
- "content_hash": last_created_doc.content_hash,
- "file_type": "document",
- "etl_service": "LLAMACLOUD",
- "pages_processed": final_page_count,
- "documents_count": len(markdown_documents),
- },
- )
- return last_created_doc
- else:
- # All documents were duplicates (markdown_documents was not empty, but all returned None)
- await task_logger.log_task_success(
- log_entry,
- f"Document already exists (duplicate): {filename}",
- {
- "duplicate_detected": True,
- "file_type": "document",
- "etl_service": "LLAMACLOUD",
- "documents_count": len(markdown_documents),
- },
- )
- return None
-
- elif app_config.ETL_SERVICE == "DOCLING":
- # Update notification: parsing stage
- if notification:
- await NotificationService.document_processing.notify_processing_progress(
- session,
- notification,
- stage="parsing",
- stage_message="Extracting content",
- )
-
- await task_logger.log_task_progress(
- log_entry,
- f"Processing file with Docling ETL: {filename}",
- {
- "file_type": "document",
- "etl_service": "DOCLING",
- "processing_stage": "parsing",
- },
- )
-
- # Use Docling service for document processing
- from app.services.docling_service import create_docling_service
-
- # Create Docling service
- docling_service = create_docling_service()
-
- # Suppress pdfminer warnings that can cause processing to hang
- # These warnings are harmless but can spam logs and potentially halt processing
- # Suppress both Python warnings and logging warnings from pdfminer
- pdfminer_logger = getLogger("pdfminer")
- original_level = pdfminer_logger.level
-
- with warnings.catch_warnings():
- warnings.filterwarnings(
- "ignore", category=UserWarning, module="pdfminer"
- )
- warnings.filterwarnings(
- "ignore",
- message=".*Cannot set gray non-stroke color.*",
- )
- warnings.filterwarnings("ignore", message=".*invalid float value.*")
-
- # Temporarily suppress pdfminer logging warnings
- pdfminer_logger.setLevel(ERROR)
-
- try:
- # Process the document
- result = await docling_service.process_document(
- file_path, filename
- )
- finally:
- # Restore original logging level
- pdfminer_logger.setLevel(original_level)
-
- # Clean up the temp file
- import os
-
- try:
- os.unlink(file_path)
- except Exception as e:
- print("Error deleting temp file", e)
- pass
-
- await task_logger.log_task_progress(
- log_entry,
- f"Docling parsing completed, creating document: {filename}",
- {
- "processing_stage": "parsing_complete",
- "content_length": len(result["content"]),
- },
- )
-
- # Verify actual page count from content length
- actual_pages = page_limit_service.estimate_pages_from_content_length(
- len(result["content"])
- )
-
- # Use the higher of the two estimates for safety (in case pre-estimate was too low)
- final_page_count = max(estimated_pages_before, actual_pages)
-
- # If actual is significantly higher than estimate, log a warning
- if actual_pages > estimated_pages_before * 1.5:
- await task_logger.log_task_progress(
- log_entry,
- f"Actual page count higher than estimate: {filename}",
- {
- "estimated_before": estimated_pages_before,
- "actual_pages": actual_pages,
- "using_count": final_page_count,
- },
- )
-
- # Update notification: chunking stage
- if notification:
- await NotificationService.document_processing.notify_processing_progress(
- session, notification, stage="chunking"
- )
-
- enable_summary = (
- connector.get("enable_summary", True) if connector else True
- )
- doc_result = await add_received_file_document_using_docling(
- session,
- filename,
- docling_markdown_document=result["content"],
- search_space_id=search_space_id,
- user_id=user_id,
- connector=connector,
- enable_summary=enable_summary,
- )
-
- if doc_result:
- # Update page usage after successful processing
- # allow_exceed=True because document was already created after passing initial check
- await page_limit_service.update_page_usage(
- user_id, final_page_count, allow_exceed=True
- )
-
- if connector:
- await _update_document_from_connector(
- doc_result, connector, session
- )
-
- await task_logger.log_task_success(
- log_entry,
- f"Successfully processed file with Docling: {filename}",
- {
- "document_id": doc_result.id,
- "content_hash": doc_result.content_hash,
- "file_type": "document",
- "etl_service": "DOCLING",
- "pages_processed": final_page_count,
- },
- )
- return doc_result
- else:
- await task_logger.log_task_success(
- log_entry,
- f"Document already exists (duplicate): {filename}",
- {
- "duplicate_detected": True,
- "file_type": "document",
- "etl_service": "DOCLING",
- },
- )
- return None
except Exception as e:
await session.rollback()
- # For page limit errors, use the detailed message from the exception
from app.services.page_limit_service import PageLimitExceededError
if isinstance(e, PageLimitExceededError):
@@ -1645,10 +734,225 @@ async def process_file_in_background(
str(e),
{"error_type": type(e).__name__, "filename": filename},
)
- import logging
-
logging.error(f"Error processing file in background: {error_message}")
- raise # Re-raise so the wrapper can also handle it
+ raise
+
+
+# ===================================================================
+# 2-phase handler (process_file_in_background_with_document)
+# ===================================================================
+
+
+async def _extract_file_content(
+ file_path: str,
+ filename: str,
+ session: AsyncSession,
+ user_id: str,
+ task_logger: TaskLoggingService,
+ log_entry: Log,
+ notification: Notification | None,
+) -> tuple[str, str]:
+ """
+ Extract markdown content from a file regardless of type.
+
+ Returns:
+ Tuple of (markdown_content, etl_service_name).
+ """
+ category = classify_file(filename)
+
+ if category == FileCategory.MARKDOWN:
+ if notification:
+ await NotificationService.document_processing.notify_processing_progress(
+ session,
+ notification,
+ stage="parsing",
+ stage_message="Reading file",
+ )
+ await task_logger.log_task_progress(
+ log_entry,
+ f"Processing markdown/text file: {filename}",
+ {"file_type": "markdown", "processing_stage": "reading_file"},
+ )
+ with open(file_path, encoding="utf-8") as f:
+ content = f.read()
+ with contextlib.suppress(Exception):
+ os.unlink(file_path)
+ return content, "MARKDOWN"
+
+ if category == FileCategory.DIRECT_CONVERT:
+ if notification:
+ await NotificationService.document_processing.notify_processing_progress(
+ session,
+ notification,
+ stage="parsing",
+ stage_message="Converting file",
+ )
+ await task_logger.log_task_progress(
+ log_entry,
+ f"Direct-converting file to markdown: {filename}",
+ {"file_type": "direct_convert", "processing_stage": "converting"},
+ )
+ content = convert_file_directly(file_path, filename)
+ with contextlib.suppress(Exception):
+ os.unlink(file_path)
+ return content, "DIRECT_CONVERT"
+
+ if category == FileCategory.AUDIO:
+ if notification:
+ await NotificationService.document_processing.notify_processing_progress(
+ session,
+ notification,
+ stage="parsing",
+ stage_message="Transcribing audio",
+ )
+ await task_logger.log_task_progress(
+ log_entry,
+ f"Processing audio file for transcription: {filename}",
+ {"file_type": "audio", "processing_stage": "starting_transcription"},
+ )
+ transcribed_text = await _transcribe_audio(file_path, filename)
+ with contextlib.suppress(Exception):
+ os.unlink(file_path)
+ return transcribed_text, "AUDIO_TRANSCRIPTION"
+
+ # Document file — use ETL service
+ return await _extract_document_content(
+ file_path,
+ filename,
+ session,
+ user_id,
+ task_logger,
+ log_entry,
+ notification,
+ )
+
+
+async def _transcribe_audio(file_path: str, filename: str) -> str:
+ """Transcribe an audio file and return formatted markdown text."""
+ stt_service_type = (
+ "local"
+ if app_config.STT_SERVICE and app_config.STT_SERVICE.startswith("local/")
+ else "external"
+ )
+
+ if stt_service_type == "local":
+ from app.services.stt_service import stt_service
+
+ result = stt_service.transcribe_file(file_path)
+ text = result.get("text", "")
+ if not text:
+ raise ValueError("Transcription returned empty text")
+ else:
+ from litellm import atranscription
+
+ with open(file_path, "rb") as audio_file:
+ kwargs: dict = {
+ "model": app_config.STT_SERVICE,
+ "file": audio_file,
+ "api_key": app_config.STT_SERVICE_API_KEY,
+ }
+ if app_config.STT_SERVICE_API_BASE:
+ kwargs["api_base"] = app_config.STT_SERVICE_API_BASE
+ response = await atranscription(**kwargs)
+ text = response.get("text", "")
+ if not text:
+ raise ValueError("Transcription returned empty text")
+
+ return f"# Transcription of {filename}\n\n{text}"
+
+
+async def _extract_document_content(
+ file_path: str,
+ filename: str,
+ session: AsyncSession,
+ user_id: str,
+ task_logger: TaskLoggingService,
+ log_entry: Log,
+ notification: Notification | None,
+) -> tuple[str, str]:
+ """
+ Parse a document file via the configured ETL service.
+
+ Returns:
+ Tuple of (markdown_content, etl_service_name).
+ """
+ from app.services.page_limit_service import PageLimitService
+
+ page_limit_service = PageLimitService(session)
+
+ try:
+ estimated_pages = page_limit_service.estimate_pages_before_processing(file_path)
+ except Exception:
+ file_size = os.path.getsize(file_path)
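+        # Rough fallback heuristic: assume about one page per 80 KB of file size.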
+ estimated_pages = max(1, file_size // (80 * 1024))
+
+ await page_limit_service.check_page_limit(user_id, estimated_pages)
+
+ etl_service = app_config.ETL_SERVICE
+ markdown_content: str | None = None
+
+ if notification:
+ await NotificationService.document_processing.notify_processing_progress(
+ session,
+ notification,
+ stage="parsing",
+ stage_message="Extracting content",
+ )
+
+ if etl_service == "UNSTRUCTURED":
+ from app.utils.document_converters import convert_document_to_markdown
+
+ docs = await parse_with_unstructured(file_path)
+ markdown_content = await convert_document_to_markdown(docs)
+ actual_pages = page_limit_service.estimate_pages_from_elements(docs)
+ final_pages = max(estimated_pages, actual_pages)
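+        # The hard limit was already checked against the pre-parse estimate,
+        # so record the final count even if it exceeds the quota.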
+ await page_limit_service.update_page_usage(
+ user_id, final_pages, allow_exceed=True
+ )
+
+ elif etl_service == "LLAMACLOUD":
+ raw_result = await parse_with_llamacloud_retry(
+ file_path=file_path,
+ estimated_pages=estimated_pages,
+ task_logger=task_logger,
+ log_entry=log_entry,
+ )
+ markdown_documents = await raw_result.aget_markdown_documents(
+ split_by_page=False
+ )
+ if not markdown_documents:
+ raise RuntimeError(f"LlamaCloud parsing returned no documents: {filename}")
+ markdown_content = markdown_documents[0].text
+ await page_limit_service.update_page_usage(
+ user_id, estimated_pages, allow_exceed=True
+ )
+
+ elif etl_service == "DOCLING":
+ getLogger("docling.pipeline.base_pipeline").setLevel(ERROR)
+ getLogger("docling.document_converter").setLevel(ERROR)
+ getLogger("docling_core.transforms.chunker.hierarchical_chunker").setLevel(
+ ERROR
+ )
+
+ from docling.document_converter import DocumentConverter
+
+ converter = DocumentConverter()
+ result = converter.convert(file_path)
+ markdown_content = result.document.export_to_markdown()
+ await page_limit_service.update_page_usage(
+ user_id, estimated_pages, allow_exceed=True
+ )
+
+ else:
+ raise RuntimeError(f"Unknown ETL_SERVICE: {etl_service}")
+
+ with contextlib.suppress(Exception):
+ os.unlink(file_path)
+
+ if not markdown_content:
+ raise RuntimeError(f"Failed to extract content from file: {filename}")
+
+ return markdown_content, etl_service
async def process_file_in_background_with_document(
@@ -1667,272 +971,50 @@ async def process_file_in_background_with_document(
"""
Process file and update existing pending document (2-phase pattern).
- This function is Phase 2 of the real-time document status updates:
- - Phase 1 (API): Created document with pending status
- - Phase 2 (this): Process file and update document to ready/failed
-
- The document already exists with pending status. This function:
- 1. Parses the file content (markdown, audio, or ETL services)
- 2. Updates the document with content, embeddings, and chunks
- 3. Sets status to 'ready' on success
-
- Args:
- document: Existing document with pending status
- file_path: Path to the uploaded file
- filename: Original filename
- search_space_id: ID of the search space
- user_id: ID of the user
- session: Database session
- task_logger: Task logging service
- log_entry: Log entry for this task
- connector: Optional connector info for Google Drive files
- notification: Optional notification for progress updates
-
- Returns:
- Updated Document object if successful, None if duplicate content detected
+    Phase 1 (API layer): the document was created with 'pending' status.
+    Phase 2 (this function): parse the file, update the document, and set its
+    status to 'ready' on success or 'failed' on error.
"""
- import os
-
- from app.config import config as app_config
+ from app.indexing_pipeline.adapters.file_upload_adapter import (
+ UploadDocumentAdapter,
+ )
from app.services.llm_service import get_user_long_context_llm
+ from app.utils.document_converters import generate_content_hash
+
+ from .base import check_duplicate_document
doc_id = document.id
try:
- markdown_content = None
- etl_service = None
-
- # ===== STEP 1: Parse file content based on type =====
-
- # Check if the file is a markdown or text file
- if filename.lower().endswith((".md", ".markdown", ".txt")):
- # Update notification: parsing stage
- if notification:
- await (
- NotificationService.document_processing.notify_processing_progress(
- session,
- notification,
- stage="parsing",
- stage_message="Reading file",
- )
- )
-
- await task_logger.log_task_progress(
- log_entry,
- f"Processing markdown/text file: {filename}",
- {"file_type": "markdown", "processing_stage": "reading_file"},
- )
-
- # Read markdown content directly
- with open(file_path, encoding="utf-8") as f:
- markdown_content = f.read()
- etl_service = "MARKDOWN"
-
- # Clean up temp file
- with contextlib.suppress(Exception):
- os.unlink(file_path)
-
- # Check if the file is an audio file
- elif filename.lower().endswith(
- (".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm")
- ):
- # Update notification: parsing stage (transcription)
- if notification:
- await (
- NotificationService.document_processing.notify_processing_progress(
- session,
- notification,
- stage="parsing",
- stage_message="Transcribing audio",
- )
- )
-
- await task_logger.log_task_progress(
- log_entry,
- f"Processing audio file for transcription: {filename}",
- {"file_type": "audio", "processing_stage": "starting_transcription"},
- )
-
- # Transcribe audio
- stt_service_type = (
- "local"
- if app_config.STT_SERVICE
- and app_config.STT_SERVICE.startswith("local/")
- else "external"
- )
-
- if stt_service_type == "local":
- from app.services.stt_service import stt_service
-
- result = stt_service.transcribe_file(file_path)
- transcribed_text = result.get("text", "")
- if not transcribed_text:
- raise ValueError("Transcription returned empty text")
- markdown_content = (
- f"# Transcription of {filename}\n\n{transcribed_text}"
- )
- else:
- with open(file_path, "rb") as audio_file:
- transcription_kwargs = {
- "model": app_config.STT_SERVICE,
- "file": audio_file,
- "api_key": app_config.STT_SERVICE_API_KEY,
- }
- if app_config.STT_SERVICE_API_BASE:
- transcription_kwargs["api_base"] = (
- app_config.STT_SERVICE_API_BASE
- )
- transcription_response = await atranscription(
- **transcription_kwargs
- )
- transcribed_text = transcription_response.get("text", "")
- if not transcribed_text:
- raise ValueError("Transcription returned empty text")
- markdown_content = (
- f"# Transcription of {filename}\n\n{transcribed_text}"
- )
-
- etl_service = "AUDIO_TRANSCRIPTION"
- # Clean up temp file
- with contextlib.suppress(Exception):
- os.unlink(file_path)
-
- else:
- # Document files - use ETL service
- from app.services.page_limit_service import (
- PageLimitExceededError,
- PageLimitService,
- )
-
- page_limit_service = PageLimitService(session)
-
- # Estimate page count
- try:
- estimated_pages = page_limit_service.estimate_pages_before_processing(
- file_path
- )
- except Exception:
- file_size = os.path.getsize(file_path)
- estimated_pages = max(1, file_size // (80 * 1024))
-
- # Check page limit
- await page_limit_service.check_page_limit(user_id, estimated_pages)
-
- if app_config.ETL_SERVICE == "UNSTRUCTURED":
- if notification:
- await NotificationService.document_processing.notify_processing_progress(
- session,
- notification,
- stage="parsing",
- stage_message="Extracting content",
- )
-
- from langchain_unstructured import UnstructuredLoader
-
- loader = UnstructuredLoader(
- file_path,
- mode="elements",
- post_processors=[],
- languages=["eng"],
- include_orig_elements=False,
- include_metadata=False,
- strategy="auto",
- )
- docs = await loader.aload()
- markdown_content = await convert_document_to_markdown(docs)
- actual_pages = page_limit_service.estimate_pages_from_elements(docs)
- final_page_count = max(estimated_pages, actual_pages)
- etl_service = "UNSTRUCTURED"
-
- # Update page usage
- await page_limit_service.update_page_usage(
- user_id, final_page_count, allow_exceed=True
- )
-
- elif app_config.ETL_SERVICE == "LLAMACLOUD":
- if notification:
- await NotificationService.document_processing.notify_processing_progress(
- session,
- notification,
- stage="parsing",
- stage_message="Extracting content",
- )
-
- result = await parse_with_llamacloud_retry(
- file_path=file_path,
- estimated_pages=estimated_pages,
- task_logger=task_logger,
- log_entry=log_entry,
- )
- markdown_documents = await result.aget_markdown_documents(
- split_by_page=False
- )
- if not markdown_documents:
- raise RuntimeError(
- f"LlamaCloud parsing returned no documents: {filename}"
- )
- markdown_content = markdown_documents[0].text
- etl_service = "LLAMACLOUD"
-
- # Update page usage
- await page_limit_service.update_page_usage(
- user_id, estimated_pages, allow_exceed=True
- )
-
- elif app_config.ETL_SERVICE == "DOCLING":
- if notification:
- await NotificationService.document_processing.notify_processing_progress(
- session,
- notification,
- stage="parsing",
- stage_message="Extracting content",
- )
-
- # Suppress logging during Docling import
- getLogger("docling.pipeline.base_pipeline").setLevel(ERROR)
- getLogger("docling.document_converter").setLevel(ERROR)
- getLogger(
- "docling_core.transforms.chunker.hierarchical_chunker"
- ).setLevel(ERROR)
-
- from docling.document_converter import DocumentConverter
-
- converter = DocumentConverter()
- result = converter.convert(file_path)
- markdown_content = result.document.export_to_markdown()
- etl_service = "DOCLING"
-
- # Update page usage
- await page_limit_service.update_page_usage(
- user_id, estimated_pages, allow_exceed=True
- )
-
- else:
- raise RuntimeError(f"Unknown ETL_SERVICE: {app_config.ETL_SERVICE}")
-
- # Clean up temp file
- with contextlib.suppress(Exception):
- os.unlink(file_path)
+ # Step 1: extract content
+ markdown_content, etl_service = await _extract_file_content(
+ file_path,
+ filename,
+ session,
+ user_id,
+ task_logger,
+ log_entry,
+ notification,
+ )
if not markdown_content:
raise RuntimeError(f"Failed to extract content from file: {filename}")
- # ===== STEP 2: Check for duplicate content =====
+ # Step 2: duplicate check
content_hash = generate_content_hash(markdown_content, search_space_id)
-
existing_by_content = await check_duplicate_document(session, content_hash)
if existing_by_content and existing_by_content.id != doc_id:
- # Duplicate content found - mark this document as failed
logging.info(
f"Duplicate content detected for {filename}, "
f"matches document {existing_by_content.id}"
)
return None
- # ===== STEP 3+4: Index via pipeline =====
+ # Step 3: index via pipeline
if notification:
await NotificationService.document_processing.notify_processing_progress(
- session, notification, stage="chunking"
+ session,
+ notification,
+ stage="chunking",
)
user_llm = await get_user_long_context_llm(session, user_id, search_space_id)
@@ -1957,7 +1039,6 @@ async def process_file_in_background_with_document(
"file_type": etl_service,
},
)
-
return document
except Exception as e:
diff --git a/surfsense_backend/app/tasks/document_processors/markdown_processor.py b/surfsense_backend/app/tasks/document_processors/markdown_processor.py
index 2fb711bf8..0ff340c0e 100644
--- a/surfsense_backend/app/tasks/document_processors/markdown_processor.py
+++ b/surfsense_backend/app/tasks/document_processors/markdown_processor.py
@@ -14,88 +14,19 @@ from app.utils.document_converters import (
create_document_chunks,
generate_content_hash,
generate_document_summary,
- generate_unique_identifier_hash,
)
+from ._helpers import (
+ find_existing_document_with_migration,
+ get_google_drive_unique_identifier,
+)
from .base import (
- check_document_by_unique_identifier,
check_duplicate_document,
get_current_timestamp,
safe_set_chunks,
)
-def _get_google_drive_unique_identifier(
- connector: dict | None,
- filename: str,
- search_space_id: int,
-) -> tuple[str, str | None]:
- """
- Get unique identifier hash for a file, with special handling for Google Drive.
-
- For Google Drive files, uses file_id as the unique identifier (doesn't change on rename).
- For other files, uses filename.
-
- Args:
- connector: Optional connector info dict with type and metadata
- filename: The filename (used for non-Google Drive files or as fallback)
- search_space_id: The search space ID
-
- Returns:
- Tuple of (primary_hash, legacy_hash or None)
- """
- if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE:
- metadata = connector.get("metadata", {})
- file_id = metadata.get("google_drive_file_id")
-
- if file_id:
- primary_hash = generate_unique_identifier_hash(
- DocumentType.GOOGLE_DRIVE_FILE, file_id, search_space_id
- )
- legacy_hash = generate_unique_identifier_hash(
- DocumentType.GOOGLE_DRIVE_FILE, filename, search_space_id
- )
- return primary_hash, legacy_hash
-
- primary_hash = generate_unique_identifier_hash(
- DocumentType.FILE, filename, search_space_id
- )
- return primary_hash, None
-
-
-async def _find_existing_document_with_migration(
- session: AsyncSession,
- primary_hash: str,
- legacy_hash: str | None,
- content_hash: str | None = None,
-) -> Document | None:
- """
- Find existing document, checking both new hash and legacy hash for migration,
- with fallback to content_hash for cross-source deduplication.
- """
- existing_document = await check_document_by_unique_identifier(session, primary_hash)
-
- if not existing_document and legacy_hash:
- existing_document = await check_document_by_unique_identifier(
- session, legacy_hash
- )
- if existing_document:
- logging.info(
- "Found legacy document (filename-based hash), will migrate to file_id-based hash"
- )
-
- # Fallback: check by content_hash to catch duplicates from different sources
- if not existing_document and content_hash:
- existing_document = await check_duplicate_document(session, content_hash)
- if existing_document:
- logging.info(
- f"Found duplicate content from different source (content_hash match). "
- f"Original document ID: {existing_document.id}, type: {existing_document.document_type}"
- )
-
- return existing_document
-
-
async def _handle_existing_document_update(
session: AsyncSession,
existing_document: Document,
@@ -224,7 +155,7 @@ async def add_received_markdown_file_document(
try:
# Generate unique identifier hash (uses file_id for Google Drive, filename for others)
- primary_hash, legacy_hash = _get_google_drive_unique_identifier(
+ primary_hash, legacy_hash = get_google_drive_unique_identifier(
connector, file_name, search_space_id
)
@@ -232,7 +163,7 @@ async def add_received_markdown_file_document(
content_hash = generate_content_hash(file_in_markdown, search_space_id)
# Check if document exists (with migration support for Google Drive and content_hash fallback)
- existing_document = await _find_existing_document_with_migration(
+ existing_document = await find_existing_document_with_migration(
session, primary_hash, legacy_hash, content_hash
)
diff --git a/surfsense_backend/tests/integration/document_upload/test_upload_limits.py b/surfsense_backend/tests/integration/document_upload/test_upload_limits.py
index a8dab43f0..a56398baa 100644
--- a/surfsense_backend/tests/integration/document_upload/test_upload_limits.py
+++ b/surfsense_backend/tests/integration/document_upload/test_upload_limits.py
@@ -2,12 +2,11 @@
Integration tests for backend file upload limit enforcement.
These tests verify that the API rejects uploads that exceed:
- - Max files per upload (10)
- - Max per-file size (50 MB)
- - Max total upload size (200 MB)
+ - Max per-file size (500 MB)
-The limits mirror the frontend's DocumentUploadTab.tsx constants and are
-enforced server-side to protect against direct API calls.
+No file count or total upload size limits are enforced: the frontend
+batches uploads in groups of 5, and there is no cap on how many files a
+user can upload in a single session.
Prerequisites:
- PostgreSQL + pgvector
@@ -24,60 +23,12 @@ pytestmark = pytest.mark.integration
# ---------------------------------------------------------------------------
-# Test A: File count limit
-# ---------------------------------------------------------------------------
-
-
-class TestFileCountLimit:
- """Uploading more than 10 files in a single request should be rejected."""
-
- async def test_11_files_returns_413(
- self,
- client: httpx.AsyncClient,
- headers: dict[str, str],
- search_space_id: int,
- ):
- files = [
- ("files", (f"file_{i}.txt", io.BytesIO(b"test content"), "text/plain"))
- for i in range(11)
- ]
- resp = await client.post(
- "/api/v1/documents/fileupload",
- headers=headers,
- files=files,
- data={"search_space_id": str(search_space_id)},
- )
- assert resp.status_code == 413
- assert "too many files" in resp.json()["detail"].lower()
-
- async def test_10_files_accepted(
- self,
- client: httpx.AsyncClient,
- headers: dict[str, str],
- search_space_id: int,
- cleanup_doc_ids: list[int],
- ):
- files = [
- ("files", (f"file_{i}.txt", io.BytesIO(b"test content"), "text/plain"))
- for i in range(10)
- ]
- resp = await client.post(
- "/api/v1/documents/fileupload",
- headers=headers,
- files=files,
- data={"search_space_id": str(search_space_id)},
- )
- assert resp.status_code == 200
- cleanup_doc_ids.extend(resp.json().get("document_ids", []))
-
-
-# ---------------------------------------------------------------------------
-# Test B: Per-file size limit
+# Test: Per-file size limit (500 MB)
# ---------------------------------------------------------------------------
class TestPerFileSizeLimit:
- """A single file exceeding 50 MB should be rejected."""
+ """A single file exceeding 500 MB should be rejected."""
async def test_oversized_file_returns_413(
self,
@@ -85,7 +36,7 @@ class TestPerFileSizeLimit:
headers: dict[str, str],
search_space_id: int,
):
- oversized = io.BytesIO(b"\x00" * (50 * 1024 * 1024 + 1))
+ oversized = io.BytesIO(b"\x00" * (500 * 1024 * 1024 + 1))
resp = await client.post(
"/api/v1/documents/fileupload",
headers=headers,
@@ -102,11 +53,11 @@ class TestPerFileSizeLimit:
search_space_id: int,
cleanup_doc_ids: list[int],
):
- at_limit = io.BytesIO(b"\x00" * (50 * 1024 * 1024))
+ at_limit = io.BytesIO(b"\x00" * (500 * 1024 * 1024))
resp = await client.post(
"/api/v1/documents/fileupload",
headers=headers,
- files=[("files", ("exact50mb.txt", at_limit, "text/plain"))],
+ files=[("files", ("exact500mb.txt", at_limit, "text/plain"))],
data={"search_space_id": str(search_space_id)},
)
assert resp.status_code == 200
@@ -114,26 +65,23 @@ class TestPerFileSizeLimit:
# ---------------------------------------------------------------------------
-# Test C: Total upload size limit
+# Test: Multiple files accepted without count limit
# ---------------------------------------------------------------------------
-class TestTotalSizeLimit:
- """Multiple files whose combined size exceeds 200 MB should be rejected."""
+class TestNoFileCountLimit:
+ """Many files in a single request should be accepted."""
- async def test_total_size_over_200mb_returns_413(
+ async def test_many_files_accepted(
self,
client: httpx.AsyncClient,
headers: dict[str, str],
search_space_id: int,
+ cleanup_doc_ids: list[int],
):
- chunk_size = 45 * 1024 * 1024 # 45 MB each
files = [
- (
- "files",
- (f"chunk_{i}.txt", io.BytesIO(b"\x00" * chunk_size), "text/plain"),
- )
- for i in range(5) # 5 x 45 MB = 225 MB > 200 MB
+ ("files", (f"file_{i}.txt", io.BytesIO(b"test content"), "text/plain"))
+ for i in range(20)
]
resp = await client.post(
"/api/v1/documents/fileupload",
@@ -141,5 +89,5 @@ class TestTotalSizeLimit:
files=files,
data={"search_space_id": str(search_space_id)},
)
- assert resp.status_code == 413
- assert "total upload size" in resp.json()["detail"].lower()
+ assert resp.status_code == 200
+ cleanup_doc_ids.extend(resp.json().get("document_ids", []))
diff --git a/surfsense_backend/tests/unit/middleware/test_knowledge_search.py b/surfsense_backend/tests/unit/middleware/test_knowledge_search.py
index 163dd0d1d..a8cf5c93b 100644
--- a/surfsense_backend/tests/unit/middleware/test_knowledge_search.py
+++ b/surfsense_backend/tests/unit/middleware/test_knowledge_search.py
@@ -248,7 +248,7 @@ class TestKnowledgeBaseSearchMiddlewarePlanner:
return []
async def fake_build_scoped_filesystem(**kwargs):
- return {}
+ return {}, {}
monkeypatch.setattr(
"app.agents.new_chat.middleware.knowledge_search.search_knowledge_base",
@@ -298,7 +298,7 @@ class TestKnowledgeBaseSearchMiddlewarePlanner:
return []
async def fake_build_scoped_filesystem(**kwargs):
- return {}
+ return {}, {}
monkeypatch.setattr(
"app.agents.new_chat.middleware.knowledge_search.search_knowledge_base",
@@ -334,7 +334,7 @@ class TestKnowledgeBaseSearchMiddlewarePlanner:
return []
async def fake_build_scoped_filesystem(**kwargs):
- return {}
+ return {}, {}
monkeypatch.setattr(
"app.agents.new_chat.middleware.knowledge_search.search_knowledge_base",
diff --git a/surfsense_web/app/(home)/changelog/page.tsx b/surfsense_web/app/(home)/changelog/page.tsx
index 8d38cb687..c14218dab 100644
--- a/surfsense_web/app/(home)/changelog/page.tsx
+++ b/surfsense_web/app/(home)/changelog/page.tsx
@@ -29,7 +29,7 @@ interface ChangelogPageItem {
export default async function ChangelogPage() {
const allPages = source.getPages() as ChangelogPageItem[];
- const sortedChangelogs = allPages.sort((a, b) => {
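+	// toSorted sorts a copy, leaving the array returned by source.getPages() untouched.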
+ const sortedChangelogs = allPages.toSorted((a, b) => {
const dateA = new Date(a.data.date).getTime();
const dateB = new Date(b.data.date).getTime();
return dateB - dateA;
diff --git a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx
index 4e0c36267..ceef9f2e1 100644
--- a/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx
+++ b/surfsense_web/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentsTableShell.tsx
@@ -329,14 +329,15 @@ export function DocumentsTableShell({
const handleViewDocument = useCallback(async (doc: Document) => {
setViewingDoc(doc);
- if (doc.content) {
- setViewingContent(doc.content);
+ const preview = doc.content_preview || doc.content;
+ if (preview) {
+ setViewingContent(preview);
return;
}
setViewingLoading(true);
try {
const fullDoc = await documentsApiService.getDocument({ id: doc.id });
- setViewingContent(fullDoc.content);
+ setViewingContent(fullDoc.content_preview || fullDoc.content);
} catch (err) {
console.error("[DocumentsTableShell] Failed to fetch document content:", err);
setViewingContent("Failed to load document content.");
@@ -951,7 +952,30 @@ export function DocumentsTableShell({
) : (
-
+ <>
+
+ {viewingDoc && (
+