diff --git a/.github/workflows/desktop-release.yml b/.github/workflows/desktop-release.yml index 491df0992..784dffb32 100644 --- a/.github/workflows/desktop-release.yml +++ b/.github/workflows/desktop-release.yml @@ -5,6 +5,20 @@ on: tags: - 'v*' - 'beta-v*' + workflow_dispatch: + inputs: + version: + description: 'Version number (e.g. 0.0.15) — used for dry-run testing without a tag' + required: true + default: '0.0.0-test' + publish: + description: 'Publish to GitHub Releases' + required: true + type: choice + options: + - never + - always + default: 'never' permissions: contents: write @@ -25,24 +39,28 @@ jobs: steps: - name: Checkout - uses: actions/checkout@v4 + uses: actions/checkout@v5 - - name: Extract version from tag + - name: Extract version id: version shell: bash run: | - TAG=${GITHUB_REF#refs/tags/} - VERSION=${TAG#beta-} - VERSION=${VERSION#v} + if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then + VERSION="${{ inputs.version }}" + else + TAG=${GITHUB_REF#refs/tags/} + VERSION=${TAG#beta-} + VERSION=${VERSION#v} + fi echo "VERSION=$VERSION" >> "$GITHUB_OUTPUT" - name: Setup pnpm - uses: pnpm/action-setup@v4 + uses: pnpm/action-setup@v5 - name: Setup Node.js - uses: actions/setup-node@v4 + uses: actions/setup-node@v5 with: - node-version: 20 + node-version: 22 cache: 'pnpm' cache-dependency-path: | surfsense_web/pnpm-lock.yaml @@ -60,6 +78,7 @@ jobs: NEXT_PUBLIC_ZERO_CACHE_URL: ${{ vars.NEXT_PUBLIC_ZERO_CACHE_URL }} NEXT_PUBLIC_DEPLOYMENT_MODE: ${{ vars.NEXT_PUBLIC_DEPLOYMENT_MODE }} NEXT_PUBLIC_FASTAPI_BACKEND_AUTH_TYPE: ${{ vars.NEXT_PUBLIC_FASTAPI_BACKEND_AUTH_TYPE }} + NEXT_PUBLIC_POSTHOG_KEY: ${{ secrets.NEXT_PUBLIC_POSTHOG_KEY }} - name: Install desktop dependencies run: pnpm install @@ -70,9 +89,12 @@ jobs: working-directory: surfsense_desktop env: HOSTED_FRONTEND_URL: ${{ vars.HOSTED_FRONTEND_URL }} + POSTHOG_KEY: ${{ secrets.POSTHOG_KEY }} + POSTHOG_HOST: ${{ vars.POSTHOG_HOST }} - name: Package & Publish - run: pnpm exec 
electron-builder ${{ matrix.platform }} --config electron-builder.yml --publish always -c.extraMetadata.version=${{ steps.version.outputs.VERSION }} + shell: bash + run: pnpm exec electron-builder ${{ matrix.platform }} --config electron-builder.yml --publish ${{ inputs.publish || 'always' }} -c.extraMetadata.version=${{ steps.version.outputs.VERSION }} working-directory: surfsense_desktop env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/README.es.md b/README.es.md index d61504cd5..b62d2cece 100644 --- a/README.es.md +++ b/README.es.md @@ -21,9 +21,28 @@ # SurfSense -Conecta cualquier LLM a tus fuentes de conocimiento internas y chatea con él en tiempo real junto a tu equipo. Alternativa de código abierto a NotebookLM, Perplexity y Glean. -SurfSense es un agente de investigación de IA altamente personalizable, conectado a fuentes externas como motores de búsqueda (SearxNG, Tavily, LinkUp), Google Drive, OneDrive, Dropbox, Slack, Microsoft Teams, Linear, Jira, ClickUp, Confluence, BookStack, Gmail, Notion, YouTube, GitHub, Discord, Airtable, Google Calendar, Luma, Circleback, Elasticsearch, Obsidian y más por venir. +NotebookLM es una de las mejores y más útiles plataformas de IA que existen, pero una vez que comienzas a usarla regularmente también sientes sus limitaciones dejando algo que desear. + +1. Hay límites en la cantidad de fuentes que puedes agregar en un notebook. +2. Hay límites en la cantidad de notebooks que puedes tener. +3. No puedes tener fuentes que excedan 500,000 palabras y más de 200MB. +4. Estás bloqueado con los servicios de Google (LLMs, modelos de uso, etc.) sin opción de configurarlos. +5. Fuentes de datos externas e integraciones de servicios limitadas. +6. El agente de NotebookLM está específicamente optimizado solo para estudiar e investigar, pero puedes hacer mucho más con los datos de origen. +7. Falta de soporte multijugador. + +...y más. 
+ +**SurfSense está específicamente hecho para resolver estos problemas.** SurfSense te permite: + +- **Controla Tu Flujo de Datos** - Mantén tus datos privados y seguros. +- **Sin Límites de Datos** - Agrega una cantidad ilimitada de fuentes y notebooks. +- **Sin Dependencia de Proveedores** - Configura cualquier modelo LLM, de imagen, TTS y STT. +- **25+ Fuentes de Datos Externas** - Agrega tus fuentes desde Google Drive, OneDrive, Dropbox, Notion y muchos otros servicios externos. +- **Soporte Multijugador en Tiempo Real** - Trabaja fácilmente con los miembros de tu equipo en un notebook compartido. + +...y más por venir. @@ -34,7 +53,7 @@ https://github.com/user-attachments/assets/cc0c84d3-1f2f-4f7a-b519-2ecce22310b1 ## Ejemplo de Agente de Video -https://github.com/user-attachments/assets/cc977e6d-8292-4ffe-abb8-3b0560ef5562 +https://github.com/user-attachments/assets/012a7ffa-6f76-4f06-9dda-7632b470057a @@ -133,24 +152,29 @@ Para Docker Compose, instalación manual y otras opciones de despliegue, consult

Comentarios en Tiempo Real

-## Funcionalidades Principales +## SurfSense vs Google NotebookLM -| Funcionalidad | Descripción | -|----------------|-------------| -| Alternativa OSS | Reemplazo directo de NotebookLM, Perplexity y Glean con colaboración en equipo en tiempo real | -| 50+ Formatos de Archivo | Sube documentos, imágenes, videos vía LlamaCloud, Unstructured o Docling (local) | -| Búsqueda Híbrida | Semántica + Texto completo con Índices Jerárquicos y Reciprocal Rank Fusion | -| Respuestas con Citas | Chatea con tu base de conocimiento y obtén respuestas citadas al estilo Perplexity | -| Arquitectura de Agentes Profundos | Impulsado por [LangChain Deep Agents](https://docs.langchain.com/oss/python/deepagents/overview) con planificación, subagentes y acceso al sistema de archivos | -| Soporte Universal de LLM | 100+ LLMs, 6000+ modelos de embeddings, todos los principales rerankers vía OpenAI spec y LiteLLM | -| Privacidad Primero | Soporte completo de LLM local (vLLM, Ollama) tus datos son tuyos | -| Colaboración en Equipo | RBAC con roles de Propietario / Admin / Editor / Visor, chat en tiempo real e hilos de comentarios | -| Generación de Videos | Genera videos con narración y visuales | -| Generación de Presentaciones | Crea presentaciones editables basadas en diapositivas | -| Generación de Podcasts | Podcast de 3 min en menos de 20 segundos; múltiples proveedores TTS (OpenAI, Azure, Kokoro) | -| Extensión de Navegador | Extensión multi-navegador para guardar cualquier página web, incluyendo páginas protegidas por autenticación | -| 27+ Conectores | Motores de búsqueda, Google Drive, OneDrive, Dropbox, Slack, Teams, Jira, Notion, GitHub, Discord y [más](#fuentes-externas) | -| Auto-Hospedable | Código abierto, Docker en un solo comando o Docker Compose completo para producción | +| Característica | Google NotebookLM | SurfSense | +|---------|-------------------|-----------| +| **Fuentes por Notebook** | 50 (Gratis) a 600 (Ultra, $249.99/mes) | Ilimitadas | +| **Número de 
Notebooks** | 100 (Gratis) a 500 (planes de pago) | Ilimitados | +| **Límite de Tamaño de Fuente** | 500,000 palabras / 200MB por fuente | Sin límite | +| **Precios** | Nivel gratuito disponible; Pro $19.99/mes, Ultra $249.99/mes | Gratuito y de código abierto, auto-hospedable en tu propia infra | +| **Soporte de LLM** | Solo Google Gemini | 100+ LLMs vía OpenAI spec y LiteLLM | +| **Modelos de Embeddings** | Solo Google | 6,000+ modelos de embeddings, todos los principales rerankers | +| **LLMs Locales / Privados** | No disponible | Soporte completo (vLLM, Ollama) - tus datos son tuyos | +| **Auto-Hospedable** | No | Sí - Docker en un solo comando o Docker Compose completo | +| **Código Abierto** | No | Sí | +| **Conectores Externos** | Google Drive, YouTube, sitios web | 27+ conectores - Motores de búsqueda, Google Drive, OneDrive, Dropbox, Slack, Teams, Jira, Notion, GitHub, Discord y [más](#fuentes-externas) | +| **Soporte de Formatos de Archivo** | PDFs, Docs, Slides, Sheets, CSV, Word, EPUB, imágenes, URLs web, YouTube | 50+ formatos - documentos, imágenes, videos vía LlamaCloud, Unstructured o Docling (local) | +| **Búsqueda** | Búsqueda semántica | Búsqueda Híbrida - Semántica + Texto completo con Índices Jerárquicos y Reciprocal Rank Fusion | +| **Respuestas con Citas** | Sí | Sí - Respuestas citadas al estilo Perplexity | +| **Arquitectura de Agentes** | No | Sí - impulsado por [LangChain Deep Agents](https://docs.langchain.com/oss/python/deepagents/overview) con planificación, subagentes y acceso al sistema de archivos | +| **Multijugador en Tiempo Real** | Notebooks compartidos con roles de Visor/Editor (sin chat en tiempo real) | RBAC con roles de Propietario / Admin / Editor / Visor, chat en tiempo real e hilos de comentarios | +| **Generación de Videos** | Resúmenes en video cinemáticos vía Veo 3 (solo Ultra) | Disponible (NotebookLM es mejor aquí, mejorando activamente) | +| **Generación de Presentaciones** | Diapositivas más atractivas pero no 
editables | Crea presentaciones editables basadas en diapositivas | +| **Generación de Podcasts** | Resúmenes de audio con hosts e idiomas personalizables | Disponible con múltiples proveedores TTS (NotebookLM es mejor aquí, mejorando activamente) | +| **Extensión de Navegador** | No | Extensión multi-navegador para guardar cualquier página web, incluyendo páginas protegidas por autenticación |
Lista completa de Fuentes Externas diff --git a/README.hi.md b/README.hi.md index 011dbf5db..b49bddc72 100644 --- a/README.hi.md +++ b/README.hi.md @@ -21,9 +21,28 @@ # SurfSense -किसी भी LLM को अपने आंतरिक ज्ञान स्रोतों से जोड़ें और अपनी टीम के साथ रीयल-टाइम में चैट करें। NotebookLM, Perplexity और Glean का ओपन सोर्स विकल्प। -SurfSense एक अत्यधिक अनुकूलन योग्य AI शोध एजेंट है, जो बाहरी स्रोतों से जुड़ा है जैसे सर्च इंजन (SearxNG, Tavily, LinkUp), Google Drive, OneDrive, Dropbox, Slack, Microsoft Teams, Linear, Jira, ClickUp, Confluence, BookStack, Gmail, Notion, YouTube, GitHub, Discord, Airtable, Google Calendar, Luma, Circleback, Elasticsearch, Obsidian और भी बहुत कुछ आने वाला है। +NotebookLM वहाँ उपलब्ध सबसे अच्छे और सबसे उपयोगी AI प्लेटफ़ॉर्म में से एक है, लेकिन जब आप इसे नियमित रूप से उपयोग करना शुरू करते हैं तो आप इसकी सीमाओं को भी महसूस करते हैं जो कुछ और की चाह छोड़ती हैं। + +1. एक notebook में जोड़े जा सकने वाले स्रोतों की मात्रा पर सीमाएं हैं। +2. आपके पास कितने notebooks हो सकते हैं इस पर सीमाएं हैं। +3. आपके पास ऐसे स्रोत नहीं हो सकते जो 500,000 शब्दों और 200MB से अधिक हों। +4. आप Google सेवाओं (LLMs, उपयोग मॉडल, आदि) में बंद हैं और उन्हें कॉन्फ़िगर करने का कोई विकल्प नहीं है। +5. सीमित बाहरी डेटा स्रोत और सेवा एकीकरण। +6. NotebookLM एजेंट विशेष रूप से केवल अध्ययन और शोध के लिए अनुकूलित है, लेकिन आप स्रोत डेटा के साथ और भी बहुत कुछ कर सकते हैं। +7. 
मल्टीप्लेयर सपोर्ट की कमी। + +...और भी बहुत कुछ। + +**SurfSense विशेष रूप से इन समस्याओं को हल करने के लिए बनाया गया है।** SurfSense आपको सक्षम बनाता है: + +- **अपने डेटा प्रवाह को नियंत्रित करें** - अपने डेटा को निजी और सुरक्षित रखें। +- **कोई डेटा सीमा नहीं** - असीमित मात्रा में स्रोत और notebooks जोड़ें। +- **कोई विक्रेता लॉक-इन नहीं** - किसी भी LLM, इमेज, TTS और STT मॉडल को कॉन्फ़िगर करें। +- **25+ बाहरी डेटा स्रोत** - Google Drive, OneDrive, Dropbox, Notion और कई अन्य बाहरी सेवाओं से अपने स्रोत जोड़ें। +- **रीयल-टाइम मल्टीप्लेयर सपोर्ट** - एक साझा notebook में अपनी टीम के सदस्यों के साथ आसानी से काम करें। + +...और भी बहुत कुछ आने वाला है। @@ -34,7 +53,7 @@ https://github.com/user-attachments/assets/cc0c84d3-1f2f-4f7a-b519-2ecce22310b1 ## वीडियो एजेंट नमूना -https://github.com/user-attachments/assets/cc977e6d-8292-4ffe-abb8-3b0560ef5562 +https://github.com/user-attachments/assets/012a7ffa-6f76-4f06-9dda-7632b470057a @@ -133,24 +152,29 @@ Docker Compose, मैनुअल इंस्टॉलेशन और अन

रीयल-टाइम कमेंट्स

-## प्रमुख विशेषताएं +## SurfSense vs Google NotebookLM -| विशेषता | विवरण | -|----------|--------| -| OSS विकल्प | रीयल-टाइम टीम सहयोग के साथ NotebookLM, Perplexity और Glean का सीधा प्रतिस्थापन | -| 50+ फ़ाइल फ़ॉर्मेट | LlamaCloud, Unstructured या Docling (लोकल) के माध्यम से दस्तावेज़, चित्र, वीडियो अपलोड करें | -| हाइब्रिड सर्च | हायरार्किकल इंडाइसेस और Reciprocal Rank Fusion के साथ सिमैंटिक + फुल टेक्स्ट सर्च | -| उद्धृत उत्तर | अपने ज्ञान आधार के साथ चैट करें और Perplexity शैली के उद्धृत उत्तर पाएं | -| डीप एजेंट आर्किटेक्चर | [LangChain Deep Agents](https://docs.langchain.com/oss/python/deepagents/overview) द्वारा संचालित, योजना, सब-एजेंट और फ़ाइल सिस्टम एक्सेस | -| यूनिवर्सल LLM सपोर्ट | 100+ LLMs, 6000+ एम्बेडिंग मॉडल, सभी प्रमुख रीरैंकर्स OpenAI spec और LiteLLM के माध्यम से | -| प्राइवेसी फर्स्ट | पूर्ण लोकल LLM सपोर्ट (vLLM, Ollama) आपका डेटा आपका रहता है | -| टीम सहयोग | मालिक / एडमिन / संपादक / दर्शक भूमिकाओं के साथ RBAC, रीयल-टाइम चैट और कमेंट थ्रेड | -| वीडियो जनरेशन | नैरेशन और विज़ुअल के साथ वीडियो बनाएं | -| प्रेजेंटेशन जनरेशन | संपादन योग्य, स्लाइड आधारित प्रेजेंटेशन बनाएं | -| पॉडकास्ट जनरेशन | 20 सेकंड से कम में 3 मिनट का पॉडकास्ट; कई TTS प्रदाता (OpenAI, Azure, Kokoro) | -| ब्राउज़र एक्सटेंशन | किसी भी वेबपेज को सहेजने के लिए क्रॉस-ब्राउज़र एक्सटेंशन, प्रमाणीकरण सुरक्षित पेज सहित | -| 27+ कनेक्टर्स | सर्च इंजन, Google Drive, OneDrive, Dropbox, Slack, Teams, Jira, Notion, GitHub, Discord और [अधिक](#बाहरी-स्रोत) | -| सेल्फ-होस्ट करने योग्य | ओपन सोर्स, Docker एक कमांड या प्रोडक्शन के लिए पूर्ण Docker Compose | +| विशेषता | Google NotebookLM | SurfSense | +|---------|-------------------|-----------| +| **प्रति Notebook स्रोत** | 50 (मुफ़्त) से 600 (Ultra, $249.99/माह) | असीमित | +| **Notebooks की संख्या** | 100 (मुफ़्त) से 500 (सशुल्क योजनाएं) | असीमित | +| **स्रोत आकार सीमा** | 500,000 शब्द / 200MB प्रति स्रोत | कोई सीमा नहीं | +| **मूल्य निर्धारण** | मुफ़्त स्तर उपलब्ध; Pro $19.99/माह, Ultra $249.99/माह | मुफ़्त और ओपन सोर्स, अपनी इंफ्रा पर 
सेल्फ-होस्ट करें | +| **LLM सपोर्ट** | केवल Google Gemini | 100+ LLMs OpenAI spec और LiteLLM के माध्यम से | +| **एम्बेडिंग मॉडल** | केवल Google | 6,000+ एम्बेडिंग मॉडल, सभी प्रमुख रीरैंकर्स | +| **लोकल / प्राइवेट LLMs** | उपलब्ध नहीं | पूर्ण सपोर्ट (vLLM, Ollama) - आपका डेटा आपका रहता है | +| **सेल्फ-होस्ट करने योग्य** | नहीं | हाँ - Docker एक कमांड या पूर्ण Docker Compose | +| **ओपन सोर्स** | नहीं | हाँ | +| **बाहरी कनेक्टर्स** | Google Drive, YouTube, वेबसाइटें | 27+ कनेक्टर्स - सर्च इंजन, Google Drive, OneDrive, Dropbox, Slack, Teams, Jira, Notion, GitHub, Discord और [अधिक](#बाहरी-स्रोत) | +| **फ़ाइल फ़ॉर्मेट सपोर्ट** | PDFs, Docs, Slides, Sheets, CSV, Word, EPUB, इमेज, वेब URLs, YouTube | 50+ फ़ॉर्मेट - दस्तावेज़, इमेज, वीडियो LlamaCloud, Unstructured या Docling (लोकल) के माध्यम से | +| **सर्च** | सिमैंटिक सर्च | हाइब्रिड सर्च - हायरार्किकल इंडाइसेस और Reciprocal Rank Fusion के साथ सिमैंटिक + फुल टेक्स्ट | +| **उद्धृत उत्तर** | हाँ | हाँ - Perplexity शैली के उद्धृत उत्तर | +| **एजेंट आर्किटेक्चर** | नहीं | हाँ - [LangChain Deep Agents](https://docs.langchain.com/oss/python/deepagents/overview) द्वारा संचालित, योजना, सब-एजेंट और फ़ाइल सिस्टम एक्सेस | +| **रीयल-टाइम मल्टीप्लेयर** | दर्शक/संपादक भूमिकाओं के साथ साझा notebooks (कोई रीयल-टाइम चैट नहीं) | मालिक / एडमिन / संपादक / दर्शक भूमिकाओं के साथ RBAC, रीयल-टाइम चैट और कमेंट थ्रेड | +| **वीडियो जनरेशन** | Veo 3 के माध्यम से सिनेमैटिक वीडियो ओवरव्यू (केवल Ultra) | उपलब्ध (NotebookLM यहाँ बेहतर है, सक्रिय रूप से सुधार हो रहा है) | +| **प्रेजेंटेशन जनरेशन** | बेहतर दिखने वाली स्लाइड्स लेकिन संपादन योग्य नहीं | संपादन योग्य, स्लाइड आधारित प्रेजेंटेशन बनाएं | +| **पॉडकास्ट जनरेशन** | कस्टमाइज़ेबल होस्ट और भाषाओं के साथ ऑडियो ओवरव्यू | कई TTS प्रदाताओं के साथ उपलब्ध (NotebookLM यहाँ बेहतर है, सक्रिय रूप से सुधार हो रहा है) | +| **ब्राउज़र एक्सटेंशन** | नहीं | किसी भी वेबपेज को सहेजने के लिए क्रॉस-ब्राउज़र एक्सटेंशन, प्रमाणीकरण सुरक्षित पेज सहित |
बाहरी स्रोतों की पूरी सूची diff --git a/README.md b/README.md index f007fd43c..12ea4912a 100644 --- a/README.md +++ b/README.md @@ -21,9 +21,28 @@ # SurfSense -Connect any LLM to your internal knowledge sources and chat with it in real time alongside your team. OSS alternative to NotebookLM, Perplexity, and Glean. -SurfSense is a highly customizable AI research agent, connected to external sources such as Search Engines (SearxNG, Tavily, LinkUp), Google Drive, OneDrive, Dropbox, Slack, Microsoft Teams, Linear, Jira, ClickUp, Confluence, BookStack, Gmail, Notion, YouTube, GitHub, Discord, Airtable, Google Calendar, Luma, Circleback, Elasticsearch, Obsidian and more to come. +NotebookLM is one of the best and most useful AI platforms out there, but once you start using it regularly you also start to feel its limitations, which leave something to be desired. + +1. There are limits on the number of sources you can add in a notebook. +2. There are limits on the number of notebooks you can have. +3. You cannot have sources that exceed 500,000 words or 200MB. +4. You are locked in to Google services (LLMs, usage models, etc.) with no option to configure them. +5. Limited external data sources and service integrations. +6. NotebookLM Agent is specifically optimised for just studying and researching, but you can do so much more with the source data. +7. Lack of multiplayer support. + +...and more. + +**SurfSense is specifically made to solve these problems.** SurfSense empowers you to: + +- **Control Your Data Flow** - Keep your data private and secure. +- **No Data Limits** - Add an unlimited number of sources and notebooks. +- **No Vendor Lock-in** - Configure any LLM, image, TTS, and STT model. +- **25+ External Data Sources** - Add your sources from Google Drive, OneDrive, Dropbox, Notion, and many other external services. +- **Real-Time Multiplayer Support** - Work easily with your team members in a shared notebook. + +...and more to come. 
@@ -134,24 +153,29 @@ For Docker Compose, manual installation, and other deployment options, see the [

Realtime Comments

-## Key Features +## SurfSense vs Google NotebookLM -| Feature | Description | -|---------|-------------| -| OSS Alternative | Drop in replacement for NotebookLM, Perplexity, and Glean with real time team collaboration | -| 50+ File Formats | Upload documents, images, videos via LlamaCloud, Unstructured, or Docling (local) | -| Hybrid Search | Semantic + Full Text Search with Hierarchical Indices and Reciprocal Rank Fusion | -| Cited Answers | Chat with your knowledge base and get Perplexity style cited responses | -| Deep Agent Architecture | Powered by [LangChain Deep Agents](https://docs.langchain.com/oss/python/deepagents/overview) planning, subagents, and file system access | -| Universal LLM Support | 100+ LLMs, 6000+ embedding models, all major rerankers via OpenAI spec & LiteLLM | -| Privacy First | Full local LLM support (vLLM, Ollama) your data stays yours | -| Team Collaboration | RBAC with Owner / Admin / Editor / Viewer roles, real time chat & comment threads | -| Video Generation | Generate videos with narration and visuals | -| Presentation Generation | Create editable, slide based presentations | -| Podcast Generation | 3 min podcast in under 20 seconds; multiple TTS providers (OpenAI, Azure, Kokoro) | -| Browser Extension | Cross browser extension to save any webpage, including auth protected pages | -| 27+ Connectors | Search Engines, Google Drive, OneDrive, Dropbox, Slack, Teams, Jira, Notion, GitHub, Discord & [more](#external-sources) | -| Self Hostable | Open source, Docker one liner or full Docker Compose for production | +| Feature | Google NotebookLM | SurfSense | +|---------|-------------------|-----------| +| **Sources per Notebook** | 50 (Free) to 600 (Ultra, $249.99/mo) | Unlimited | +| **Number of Notebooks** | 100 (Free) to 500 (paid tiers) | Unlimited | +| **Source Size Limit** | 500,000 words / 200MB per source | No limit | +| **Pricing** | Free tier available; Pro $19.99/mo, Ultra $249.99/mo | Free and open source, self-host on 
your own infra | +| **LLM Support** | Google Gemini only | 100+ LLMs via OpenAI spec & LiteLLM | +| **Embedding Models** | Google only | 6,000+ embedding models, all major rerankers | +| **Local / Private LLMs** | Not available | Full support (vLLM, Ollama) - your data stays yours | +| **Self Hostable** | No | Yes - Docker one-liner or full Docker Compose | +| **Open Source** | No | Yes | +| **External Connectors** | Google Drive, YouTube, websites | 27+ connectors - Search Engines, Google Drive, OneDrive, Dropbox, Slack, Teams, Jira, Notion, GitHub, Discord & [more](#external-sources) | +| **File Format Support** | PDFs, Docs, Slides, Sheets, CSV, Word, EPUB, images, web URLs, YouTube | 50+ formats - documents, images, videos via LlamaCloud, Unstructured, or Docling (local) | +| **Search** | Semantic search | Hybrid Search - Semantic + Full Text with Hierarchical Indices & Reciprocal Rank Fusion | +| **Cited Answers** | Yes | Yes - Perplexity-style cited responses | +| **Agentic Architecture** | No | Yes - powered by [LangChain Deep Agents](https://docs.langchain.com/oss/python/deepagents/overview) with planning, subagents, and file system access | +| **Real-Time Multiplayer** | Shared notebooks with Viewer/Editor roles (no real-time chat) | RBAC with Owner / Admin / Editor / Viewer roles, real-time chat & comment threads | +| **Video Generation** | Cinematic Video Overviews via Veo 3 (Ultra only) | Available (NotebookLM is better here, actively improving) | +| **Presentation Generation** | Better looking slides but not editable | Create editable, slide-based presentations | +| **Podcast Generation** | Audio Overviews with customizable hosts and languages | Available with multiple TTS providers (NotebookLM is better here, actively improving) | +| **Browser Extension** | No | Cross-browser extension to save any webpage, including auth-protected pages |
Full list of External Sources diff --git a/README.pt-BR.md b/README.pt-BR.md index 4306b0767..50a8b739e 100644 --- a/README.pt-BR.md +++ b/README.pt-BR.md @@ -21,9 +21,28 @@ # SurfSense -Conecte qualquer LLM às suas fontes de conhecimento internas e converse com ele em tempo real junto com sua equipe. Alternativa de código aberto ao NotebookLM, Perplexity e Glean. -SurfSense é um agente de pesquisa de IA altamente personalizável, conectado a fontes externas como mecanismos de busca (SearxNG, Tavily, LinkUp), Google Drive, OneDrive, Dropbox, Slack, Microsoft Teams, Linear, Jira, ClickUp, Confluence, BookStack, Gmail, Notion, YouTube, GitHub, Discord, Airtable, Google Calendar, Luma, Circleback, Elasticsearch, Obsidian e mais por vir. +O NotebookLM é uma das melhores e mais úteis plataformas de IA disponíveis, mas quando você começa a usá-lo regularmente também sente suas limitações deixando algo a desejar. + +1. Há limites na quantidade de fontes que você pode adicionar em um notebook. +2. Há limites no número de notebooks que você pode ter. +3. Você não pode ter fontes que excedam 500.000 palavras e mais de 200MB. +4. Você fica preso aos serviços do Google (LLMs, modelos de uso, etc.) sem opção de configurá-los. +5. Fontes de dados externas e integrações de serviços limitadas. +6. O agente do NotebookLM é especificamente otimizado apenas para estudar e pesquisar, mas você pode fazer muito mais com os dados de origem. +7. Falta de suporte multiplayer. + +...e mais. + +**O SurfSense foi feito especificamente para resolver esses problemas.** O SurfSense permite que você: + +- **Controle Seu Fluxo de Dados** - Mantenha seus dados privados e seguros. +- **Sem Limites de Dados** - Adicione uma quantidade ilimitada de fontes e notebooks. +- **Sem Dependência de Fornecedor** - Configure qualquer modelo LLM, de imagem, TTS e STT. +- **25+ Fontes de Dados Externas** - Adicione suas fontes do Google Drive, OneDrive, Dropbox, Notion e muitos outros serviços externos. 
+- **Suporte Multiplayer em Tempo Real** - Trabalhe facilmente com os membros da sua equipe em um notebook compartilhado. + +...e mais por vir. @@ -34,7 +53,7 @@ https://github.com/user-attachments/assets/cc0c84d3-1f2f-4f7a-b519-2ecce22310b1 ## Exemplo de Agente de Vídeo -https://github.com/user-attachments/assets/cc977e6d-8292-4ffe-abb8-3b0560ef5562 +https://github.com/user-attachments/assets/012a7ffa-6f76-4f06-9dda-7632b470057a @@ -133,24 +152,29 @@ Para Docker Compose, instalação manual e outras opções de implantação, con

Comentários em Tempo Real

-## Funcionalidades Principais +## SurfSense vs Google NotebookLM -| Funcionalidade | Descrição | -|----------------|-----------| -| Alternativa OSS | Substituto direto do NotebookLM, Perplexity e Glean com colaboração em equipe em tempo real | -| 50+ Formatos de Arquivo | Faça upload de documentos, imagens, vídeos via LlamaCloud, Unstructured ou Docling (local) | -| Busca Híbrida | Semântica + Texto completo com Índices Hierárquicos e Reciprocal Rank Fusion | -| Respostas com Citações | Converse com sua base de conhecimento e obtenha respostas citadas no estilo Perplexity | -| Arquitetura de Agentes Profundos | Alimentado por [LangChain Deep Agents](https://docs.langchain.com/oss/python/deepagents/overview) com planejamento, subagentes e acesso ao sistema de arquivos | -| Suporte Universal de LLM | 100+ LLMs, 6000+ modelos de embeddings, todos os principais rerankers via OpenAI spec e LiteLLM | -| Privacidade em Primeiro Lugar | Suporte completo a LLM local (vLLM, Ollama) seus dados ficam com você | -| Colaboração em Equipe | RBAC com papéis de Proprietário / Admin / Editor / Visualizador, chat em tempo real e threads de comentários | -| Geração de Vídeos | Gera vídeos com narração e visuais | -| Geração de Apresentações | Cria apresentações editáveis baseadas em slides | -| Geração de Podcasts | Podcast de 3 min em menos de 20 segundos; múltiplos provedores TTS (OpenAI, Azure, Kokoro) | -| Extensão de Navegador | Extensão multi-navegador para salvar qualquer página web, incluindo páginas protegidas por autenticação | -| 27+ Conectores | Mecanismos de busca, Google Drive, OneDrive, Dropbox, Slack, Teams, Jira, Notion, GitHub, Discord e [mais](#fontes-externas) | -| Auto-Hospedável | Código aberto, Docker em um único comando ou Docker Compose completo para produção | +| Recurso | Google NotebookLM | SurfSense | +|---------|-------------------|-----------| +| **Fontes por Notebook** | 50 (Grátis) a 600 (Ultra, $249.99/mês) | Ilimitadas | +| **Número de Notebooks** | 
100 (Grátis) a 500 (planos pagos) | Ilimitados | +| **Limite de Tamanho da Fonte** | 500.000 palavras / 200MB por fonte | Sem limite | +| **Preços** | Nível gratuito disponível; Pro $19.99/mês, Ultra $249.99/mês | Gratuito e de código aberto, auto-hospedável na sua própria infra | +| **Suporte a LLM** | Apenas Google Gemini | 100+ LLMs via OpenAI spec e LiteLLM | +| **Modelos de Embeddings** | Apenas Google | 6.000+ modelos de embeddings, todos os principais rerankers | +| **LLMs Locais / Privados** | Não disponível | Suporte completo (vLLM, Ollama) - seus dados ficam com você | +| **Auto-Hospedável** | Não | Sim - Docker em um único comando ou Docker Compose completo | +| **Código Aberto** | Não | Sim | +| **Conectores Externos** | Google Drive, YouTube, sites | 27+ conectores - Mecanismos de busca, Google Drive, OneDrive, Dropbox, Slack, Teams, Jira, Notion, GitHub, Discord e [mais](#fontes-externas) | +| **Suporte a Formatos de Arquivo** | PDFs, Docs, Slides, Sheets, CSV, Word, EPUB, imagens, URLs web, YouTube | 50+ formatos - documentos, imagens, vídeos via LlamaCloud, Unstructured ou Docling (local) | +| **Busca** | Busca semântica | Busca Híbrida - Semântica + Texto completo com Índices Hierárquicos e Reciprocal Rank Fusion | +| **Respostas com Citações** | Sim | Sim - Respostas citadas no estilo Perplexity | +| **Arquitetura de Agentes** | Não | Sim - alimentado por [LangChain Deep Agents](https://docs.langchain.com/oss/python/deepagents/overview) com planejamento, subagentes e acesso ao sistema de arquivos | +| **Multiplayer em Tempo Real** | Notebooks compartilhados com papéis de Visualizador/Editor (sem chat em tempo real) | RBAC com papéis de Proprietário / Admin / Editor / Visualizador, chat em tempo real e threads de comentários | +| **Geração de Vídeos** | Visões gerais cinemáticas via Veo 3 (apenas Ultra) | Disponível (NotebookLM é melhor aqui, melhorando ativamente) | +| **Geração de Apresentações** | Slides mais bonitos mas não editáveis | Cria 
apresentações editáveis baseadas em slides | +| **Geração de Podcasts** | Visões gerais em áudio com hosts e idiomas personalizáveis | Disponível com múltiplos provedores TTS (NotebookLM é melhor aqui, melhorando ativamente) | +| **Extensão de Navegador** | Não | Extensão multi-navegador para salvar qualquer página web, incluindo páginas protegidas por autenticação |
Lista completa de Fontes Externas diff --git a/README.zh-CN.md b/README.zh-CN.md index 96ebb25ad..419a831ae 100644 --- a/README.zh-CN.md +++ b/README.zh-CN.md @@ -21,9 +21,28 @@ # SurfSense -将任何 LLM 连接到您的内部知识源,并与团队成员实时聊天。NotebookLM、Perplexity 和 Glean 的开源替代方案。 -SurfSense 是一个高度可定制的 AI 研究助手,可以连接外部数据源,如搜索引擎(SearxNG、Tavily、LinkUp)、Google Drive、OneDrive、Dropbox、Slack、Microsoft Teams、Linear、Jira、ClickUp、Confluence、BookStack、Gmail、Notion、YouTube、GitHub、Discord、Airtable、Google Calendar、Luma、Circleback、Elasticsearch、Obsidian 等,未来还会支持更多。 +NotebookLM 是目前最好、最实用的 AI 平台之一,但当你开始经常使用它时,你也会感受到它的局限性,总觉得还有不足之处。 + +1. 一个笔记本中可以添加的来源数量有限制。 +2. 可以拥有的笔记本数量有限制。 +3. 来源不能超过 500,000 个单词和 200MB。 +4. 你被锁定在 Google 服务中(LLM、使用模型等),没有配置选项。 +5. 有限的外部数据源和服务集成。 +6. NotebookLM 代理专门针对学习和研究进行了优化,但你可以用源数据做更多事情。 +7. 缺乏多人协作支持。 + +...还有更多。 + +**SurfSense 正是为了解决这些问题而生。** SurfSense 赋予你: + +- **控制你的数据流** - 保持数据私密和安全。 +- **无数据限制** - 添加无限数量的来源和笔记本。 +- **无供应商锁定** - 配置任何 LLM、图像、TTS 和 STT 模型。 +- **25+ 外部数据源** - 从 Google Drive、OneDrive、Dropbox、Notion 和许多其他外部服务添加你的来源。 +- **实时多人协作支持** - 在共享笔记本中轻松与团队成员协作。 + +...更多功能即将推出。 @@ -34,7 +53,7 @@ https://github.com/user-attachments/assets/cc0c84d3-1f2f-4f7a-b519-2ecce22310b1 ## 视频代理示例 -https://github.com/user-attachments/assets/cc977e6d-8292-4ffe-abb8-3b0560ef5562 +https://github.com/user-attachments/assets/012a7ffa-6f76-4f06-9dda-7632b470057a @@ -133,24 +152,29 @@ irm https://raw.githubusercontent.com/MODSetter/SurfSense/main/docker/scripts/in

实时评论

-## 核心功能 +## SurfSense vs Google NotebookLM -| 功能 | 描述 | -|------|------| -| 开源替代方案 | 支持实时团队协作的 NotebookLM、Perplexity 和 Glean 替代品 | -| 50+ 文件格式 | 通过 LlamaCloud、Unstructured 或 Docling(本地)上传文档、图像、视频 | -| 混合搜索 | 语义搜索 + 全文搜索,结合层次化索引和倒数排名融合 | -| 引用回答 | 与知识库对话,获得 Perplexity 风格的引用回答 | -| 深度代理架构 | 基于 [LangChain Deep Agents](https://docs.langchain.com/oss/python/deepagents/overview) 构建,支持规划、子代理和文件系统访问 | -| 通用 LLM 支持 | 100+ LLM、6000+ 嵌入模型、所有主流重排序器,通过 OpenAI spec 和 LiteLLM | -| 隐私优先 | 完整本地 LLM 支持(vLLM、Ollama),您的数据由您掌控 | -| 团队协作 | RBAC 角色控制(所有者/管理员/编辑者/查看者),实时聊天和评论线程 | -| 视频生成 | 生成带有旁白和视觉效果的视频 | -| 演示文稿生成 | 创建可编辑的幻灯片式演示文稿 | -| 播客生成 | 20 秒内生成 3 分钟播客;多种 TTS 提供商(OpenAI、Azure、Kokoro) | -| 浏览器扩展 | 跨浏览器扩展,保存任何网页,包括需要身份验证的页面 | -| 27+ 连接器 | 搜索引擎、Google Drive、OneDrive、Dropbox、Slack、Teams、Jira、Notion、GitHub、Discord 等[更多](#外部数据源) | -| 可自托管 | 开源,Docker 一行命令或完整 Docker Compose 用于生产环境 | +| 功能 | Google NotebookLM | SurfSense | +|---------|-------------------|-----------| +| **每个笔记本的来源数** | 50(免费)到 600(Ultra,$249.99/月) | 无限制 | +| **笔记本数量** | 100(免费)到 500(付费方案) | 无限制 | +| **来源大小限制** | 500,000 词 / 200MB 每个来源 | 无限制 | +| **定价** | 免费版可用;Pro $19.99/月,Ultra $249.99/月 | 免费开源,在自己的基础设施上自托管 | +| **LLM 支持** | 仅 Google Gemini | 100+ LLM,通过 OpenAI spec 和 LiteLLM | +| **嵌入模型** | 仅 Google | 6,000+ 嵌入模型,所有主流重排序器 | +| **本地 / 私有 LLM** | 不可用 | 完整支持(vLLM、Ollama)- 您的数据由您掌控 | +| **可自托管** | 否 | 是 - Docker 一行命令或完整 Docker Compose | +| **开源** | 否 | 是 | +| **外部连接器** | Google Drive、YouTube、网站 | 27+ 连接器 - 搜索引擎、Google Drive、OneDrive、Dropbox、Slack、Teams、Jira、Notion、GitHub、Discord 等[更多](#外部数据源) | +| **文件格式支持** | PDF、Docs、Slides、Sheets、CSV、Word、EPUB、图像、网页 URL、YouTube | 50+ 格式 - 文档、图像、视频,通过 LlamaCloud、Unstructured 或 Docling(本地) | +| **搜索** | 语义搜索 | 混合搜索 - 语义 + 全文搜索,结合层次化索引和倒数排名融合 | +| **引用回答** | 是 | 是 - Perplexity 风格的引用回答 | +| **代理架构** | 否 | 是 - 基于 [LangChain Deep Agents](https://docs.langchain.com/oss/python/deepagents/overview) 构建,支持规划、子代理和文件系统访问 | +| **实时多人协作** | 共享笔记本,支持查看者/编辑者角色(无实时聊天) | RBAC 
角色控制(所有者/管理员/编辑者/查看者),实时聊天和评论线程 | +| **视频生成** | 通过 Veo 3 的电影级视频概览(仅 Ultra) | 可用(NotebookLM 在此方面更好,正在积极改进) | +| **演示文稿生成** | 更美观的幻灯片但不可编辑 | 创建可编辑的幻灯片式演示文稿 | +| **播客生成** | 可自定义主持人和语言的音频概览 | 可用,支持多种 TTS 提供商(NotebookLM 在此方面更好,正在积极改进) | +| **浏览器扩展** | 否 | 跨浏览器扩展,保存任何网页,包括需要身份验证的页面 |
外部数据源完整列表 diff --git a/docs/chinese-llm-setup.md b/docs/chinese-llm-setup.md index 1fb0ce2a1..6638dbba1 100644 --- a/docs/chinese-llm-setup.md +++ b/docs/chinese-llm-setup.md @@ -24,7 +24,7 @@ SurfSense 现已支持以下国产 LLM: 1. 登录 SurfSense Dashboard 2. 进入 **Settings** → **API Keys** (或 **LLM Configurations**) -3. 点击 **Add LLM Model** +3. 点击 **Add Model** 4. 从 **Provider** 下拉菜单中选择你的国产 LLM 提供商 5. 填写必填字段(见下方各提供商详细配置) 6. 点击 **Save** diff --git a/package-lock.json b/package-lock.json new file mode 100644 index 000000000..9703ac09f --- /dev/null +++ b/package-lock.json @@ -0,0 +1,6 @@ +{ + "name": "SurfSense", + "lockfileVersion": 3, + "requires": true, + "packages": {} +} diff --git a/package.json b/package.json new file mode 100644 index 000000000..8a1a6add8 --- /dev/null +++ b/package.json @@ -0,0 +1,5 @@ +{ + "name": "surfsense", + "private": true, + "packageManager": "pnpm@10.24.0" +} diff --git a/surfsense_backend/alembic/versions/116_create_zero_publication.py b/surfsense_backend/alembic/versions/116_create_zero_publication.py index 8f0d7b5d3..ff74952a9 100644 --- a/surfsense_backend/alembic/versions/116_create_zero_publication.py +++ b/surfsense_backend/alembic/versions/116_create_zero_publication.py @@ -42,9 +42,7 @@ def upgrade() -> None: if not exists: table_list = ", ".join(TABLES) conn.execute( - sa.text( - f"CREATE PUBLICATION {PUBLICATION_NAME} FOR TABLE {table_list}" - ) + sa.text(f"CREATE PUBLICATION {PUBLICATION_NAME} FOR TABLE {table_list}") ) diff --git a/surfsense_backend/alembic/versions/117_optimize_zero_publication_column_lists.py b/surfsense_backend/alembic/versions/117_optimize_zero_publication_column_lists.py new file mode 100644 index 000000000..78a26a381 --- /dev/null +++ b/surfsense_backend/alembic/versions/117_optimize_zero_publication_column_lists.py @@ -0,0 +1,123 @@ +"""optimize zero_publication with column lists + +Recreates the zero_publication using column lists for the documents +table so that large text columns (content, source_markdown, 
+blocknote_document, etc.) are excluded from WAL replication. +This prevents RangeError: Invalid string length in zero-cache's +change-streamer when documents have very large content. + +Also resets REPLICA IDENTITY to DEFAULT on tables that had it set +to FULL for the old Electric SQL setup (migration 66/75/76). +With DEFAULT (primary-key) identity, column-list publications +only need to include the PK — not every column. + +IMPORTANT — before AND after running this migration: + 1. Stop zero-cache (it holds replication locks that will deadlock DDL) + 2. Run: alembic upgrade head + 3. Delete / reset the zero-cache data volume + 4. Restart zero-cache (it will do a fresh initial sync) + +Revision ID: 117 +Revises: 116 +""" + +from collections.abc import Sequence + +import sqlalchemy as sa + +from alembic import op + +revision: str = "117" +down_revision: str | None = "116" +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None + +PUBLICATION_NAME = "zero_publication" + +TABLES_WITH_FULL_IDENTITY = [ + "documents", + "notifications", + "search_source_connectors", + "new_chat_messages", + "chat_comments", + "chat_session_state", +] + +DOCUMENT_COLS = [ + "id", + "title", + "document_type", + "search_space_id", + "folder_id", + "created_by_id", + "status", + "created_at", + "updated_at", +] + +PUBLICATION_DDL_FULL = f"""\ +CREATE PUBLICATION {PUBLICATION_NAME} FOR TABLE + notifications, documents, folders, + search_source_connectors, new_chat_messages, + chat_comments, chat_session_state +""" + + +def _terminate_blocked_pids(conn, table: str) -> None: + """Kill backends whose locks on *table* would block our AccessExclusiveLock.""" + conn.execute( + sa.text( + "SELECT pg_terminate_backend(l.pid) " + "FROM pg_locks l " + "JOIN pg_class c ON c.oid = l.relation " + "WHERE c.relname = :tbl " + " AND l.pid != pg_backend_pid()" + ), + {"tbl": table}, + ) + + +def upgrade() -> None: + conn = op.get_bind() + + conn.execute(sa.text("SET 
lock_timeout = '10s'")) + + for tbl in sorted(TABLES_WITH_FULL_IDENTITY): + _terminate_blocked_pids(conn, tbl) + conn.execute(sa.text(f'LOCK TABLE "{tbl}" IN ACCESS EXCLUSIVE MODE')) + + for tbl in TABLES_WITH_FULL_IDENTITY: + conn.execute(sa.text(f'ALTER TABLE "{tbl}" REPLICA IDENTITY DEFAULT')) + + conn.execute(sa.text(f"DROP PUBLICATION IF EXISTS {PUBLICATION_NAME}")) + + has_zero_ver = conn.execute( + sa.text( + "SELECT 1 FROM information_schema.columns " + "WHERE table_name = 'documents' AND column_name = '_0_version'" + ) + ).fetchone() + + cols = DOCUMENT_COLS + (['"_0_version"'] if has_zero_ver else []) + col_list = ", ".join(cols) + + conn.execute( + sa.text( + f"CREATE PUBLICATION {PUBLICATION_NAME} FOR TABLE " + f"notifications, " + f"documents ({col_list}), " + f"folders, " + f"search_source_connectors, " + f"new_chat_messages, " + f"chat_comments, " + f"chat_session_state" + ) + ) + + +def downgrade() -> None: + conn = op.get_bind() + conn.execute(sa.text(f"DROP PUBLICATION IF EXISTS {PUBLICATION_NAME}")) + conn.execute(sa.text(PUBLICATION_DDL_FULL)) + for tbl in TABLES_WITH_FULL_IDENTITY: + conn.execute(sa.text(f'ALTER TABLE "{tbl}" REPLICA IDENTITY FULL')) diff --git a/surfsense_backend/alembic/versions/118_add_local_folder_sync_and_versioning.py b/surfsense_backend/alembic/versions/118_add_local_folder_sync_and_versioning.py new file mode 100644 index 000000000..1fef9fbcb --- /dev/null +++ b/surfsense_backend/alembic/versions/118_add_local_folder_sync_and_versioning.py @@ -0,0 +1,149 @@ +"""Add LOCAL_FOLDER_FILE document type, folder metadata, and document_versions table + +Revision ID: 118 +Revises: 117 +""" + +from collections.abc import Sequence + +import sqlalchemy as sa + +from alembic import op + +revision: str = "118" +down_revision: str | None = "117" +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None + +PUBLICATION_NAME = "zero_publication" + + +def upgrade() -> None: + conn = op.get_bind() + + 
# Add LOCAL_FOLDER_FILE to documenttype enum + op.execute( + """ + DO $$ + BEGIN + IF NOT EXISTS ( + SELECT 1 FROM pg_type t + JOIN pg_enum e ON t.oid = e.enumtypid + WHERE t.typname = 'documenttype' AND e.enumlabel = 'LOCAL_FOLDER_FILE' + ) THEN + ALTER TYPE documenttype ADD VALUE 'LOCAL_FOLDER_FILE'; + END IF; + END + $$; + """ + ) + + # Add JSONB metadata column to folders table + col_exists = conn.execute( + sa.text( + "SELECT 1 FROM information_schema.columns " + "WHERE table_name = 'folders' AND column_name = 'metadata'" + ) + ).fetchone() + if not col_exists: + op.add_column( + "folders", + sa.Column("metadata", sa.dialects.postgresql.JSONB, nullable=True), + ) + + # Create document_versions table + table_exists = conn.execute( + sa.text( + "SELECT 1 FROM information_schema.tables WHERE table_name = 'document_versions'" + ) + ).fetchone() + if not table_exists: + op.create_table( + "document_versions", + sa.Column("id", sa.Integer(), nullable=False, autoincrement=True), + sa.Column("document_id", sa.Integer(), nullable=False), + sa.Column("version_number", sa.Integer(), nullable=False), + sa.Column("source_markdown", sa.Text(), nullable=True), + sa.Column("content_hash", sa.String(), nullable=False), + sa.Column("title", sa.String(), nullable=True), + sa.Column( + "created_at", + sa.TIMESTAMP(timezone=True), + server_default=sa.text("now()"), + nullable=False, + ), + sa.ForeignKeyConstraint( + ["document_id"], + ["documents.id"], + ondelete="CASCADE", + ), + sa.PrimaryKeyConstraint("id"), + sa.UniqueConstraint( + "document_id", + "version_number", + name="uq_document_version", + ), + ) + + op.execute( + "CREATE INDEX IF NOT EXISTS ix_document_versions_document_id " + "ON document_versions (document_id)" + ) + op.execute( + "CREATE INDEX IF NOT EXISTS ix_document_versions_created_at " + "ON document_versions (created_at)" + ) + + # Add document_versions to Zero publication + pub_exists = conn.execute( + sa.text("SELECT 1 FROM pg_publication WHERE pubname = 
:name"), + {"name": PUBLICATION_NAME}, + ).fetchone() + if pub_exists: + already_in_pub = conn.execute( + sa.text( + "SELECT 1 FROM pg_publication_tables " + "WHERE pubname = :name AND tablename = 'document_versions'" + ), + {"name": PUBLICATION_NAME}, + ).fetchone() + if not already_in_pub: + op.execute( + f"ALTER PUBLICATION {PUBLICATION_NAME} ADD TABLE document_versions" + ) + + +def downgrade() -> None: + conn = op.get_bind() + + # Remove from publication + pub_exists = conn.execute( + sa.text("SELECT 1 FROM pg_publication WHERE pubname = :name"), + {"name": PUBLICATION_NAME}, + ).fetchone() + if pub_exists: + already_in_pub = conn.execute( + sa.text( + "SELECT 1 FROM pg_publication_tables " + "WHERE pubname = :name AND tablename = 'document_versions'" + ), + {"name": PUBLICATION_NAME}, + ).fetchone() + if already_in_pub: + op.execute( + f"ALTER PUBLICATION {PUBLICATION_NAME} DROP TABLE document_versions" + ) + + op.execute("DROP INDEX IF EXISTS ix_document_versions_created_at") + op.execute("DROP INDEX IF EXISTS ix_document_versions_document_id") + op.execute("DROP TABLE IF EXISTS document_versions") + + # Drop metadata column from folders + col_exists = conn.execute( + sa.text( + "SELECT 1 FROM information_schema.columns " + "WHERE table_name = 'folders' AND column_name = 'metadata'" + ) + ).fetchone() + if col_exists: + op.drop_column("folders", "metadata") diff --git a/surfsense_backend/alembic/versions/119_add_vision_llm_id_to_search_spaces.py b/surfsense_backend/alembic/versions/119_add_vision_llm_id_to_search_spaces.py new file mode 100644 index 000000000..8e41d5e67 --- /dev/null +++ b/surfsense_backend/alembic/versions/119_add_vision_llm_id_to_search_spaces.py @@ -0,0 +1,39 @@ +"""119_add_vision_llm_id_to_search_spaces + +Revision ID: 119 +Revises: 118 + +Adds vision_llm_id column to search_spaces for vision/screenshot analysis +LLM role assignment. Defaults to 0 (Auto mode), same convention as +agent_llm_id and document_summary_llm_id. 
+""" + +from __future__ import annotations + +from collections.abc import Sequence + +import sqlalchemy as sa + +from alembic import op + +revision: str = "119" +down_revision: str | None = "118" +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None + + +def upgrade() -> None: + conn = op.get_bind() + existing_columns = [ + col["name"] for col in sa.inspect(conn).get_columns("searchspaces") + ] + + if "vision_llm_id" not in existing_columns: + op.add_column( + "searchspaces", + sa.Column("vision_llm_id", sa.Integer(), nullable=True, server_default="0"), + ) + + +def downgrade() -> None: + op.drop_column("searchspaces", "vision_llm_id") diff --git a/surfsense_backend/alembic/versions/120_add_vision_llm_configs_table.py b/surfsense_backend/alembic/versions/120_add_vision_llm_configs_table.py new file mode 100644 index 000000000..c0c915388 --- /dev/null +++ b/surfsense_backend/alembic/versions/120_add_vision_llm_configs_table.py @@ -0,0 +1,190 @@ +"""Add vision LLM configs table and rename preference column + +Revision ID: 120 +Revises: 119 + +Changes: +1. Create visionprovider enum type +2. Create vision_llm_configs table +3. Rename vision_llm_id -> vision_llm_config_id on searchspaces +4. Add vision config permissions to existing system roles +""" + +from __future__ import annotations + +from collections.abc import Sequence + +import sqlalchemy as sa +from sqlalchemy.dialects.postgresql import ENUM as PG_ENUM, UUID + +from alembic import op + +revision: str = "120" +down_revision: str | None = "119" +branch_labels: str | Sequence[str] | None = None +depends_on: str | Sequence[str] | None = None + +VISION_PROVIDER_VALUES = ( + "OPENAI", + "ANTHROPIC", + "GOOGLE", + "AZURE_OPENAI", + "VERTEX_AI", + "BEDROCK", + "XAI", + "OPENROUTER", + "OLLAMA", + "GROQ", + "TOGETHER_AI", + "FIREWORKS_AI", + "DEEPSEEK", + "MISTRAL", + "CUSTOM", +) + + +def upgrade() -> None: + connection = op.get_bind() + + # 1. 
Create visionprovider enum + connection.execute( + sa.text( + """ + DO $$ + BEGIN + IF NOT EXISTS (SELECT 1 FROM pg_type WHERE typname = 'visionprovider') THEN + CREATE TYPE visionprovider AS ENUM ( + 'OPENAI', 'ANTHROPIC', 'GOOGLE', 'AZURE_OPENAI', 'VERTEX_AI', + 'BEDROCK', 'XAI', 'OPENROUTER', 'OLLAMA', 'GROQ', + 'TOGETHER_AI', 'FIREWORKS_AI', 'DEEPSEEK', 'MISTRAL', 'CUSTOM' + ); + END IF; + END + $$; + """ + ) + ) + + # 2. Create vision_llm_configs table + result = connection.execute( + sa.text( + "SELECT EXISTS (SELECT FROM information_schema.tables WHERE table_name = 'vision_llm_configs')" + ) + ) + if not result.scalar(): + op.create_table( + "vision_llm_configs", + sa.Column("id", sa.Integer(), autoincrement=True, nullable=False), + sa.Column("name", sa.String(100), nullable=False), + sa.Column("description", sa.String(500), nullable=True), + sa.Column( + "provider", + PG_ENUM(*VISION_PROVIDER_VALUES, name="visionprovider", create_type=False), + nullable=False, + ), + sa.Column("custom_provider", sa.String(100), nullable=True), + sa.Column("model_name", sa.String(100), nullable=False), + sa.Column("api_key", sa.String(), nullable=False), + sa.Column("api_base", sa.String(500), nullable=True), + sa.Column("api_version", sa.String(50), nullable=True), + sa.Column("litellm_params", sa.JSON(), nullable=True), + sa.Column("search_space_id", sa.Integer(), nullable=False), + sa.Column("user_id", UUID(as_uuid=True), nullable=False), + sa.Column( + "created_at", + sa.TIMESTAMP(timezone=True), + server_default=sa.text("now()"), + nullable=False, + ), + sa.PrimaryKeyConstraint("id"), + sa.ForeignKeyConstraint( + ["search_space_id"], ["searchspaces.id"], ondelete="CASCADE" + ), + sa.ForeignKeyConstraint( + ["user_id"], ["user.id"], ondelete="CASCADE" + ), + ) + op.execute( + "CREATE INDEX IF NOT EXISTS ix_vision_llm_configs_name " + "ON vision_llm_configs (name)" + ) + op.execute( + "CREATE INDEX IF NOT EXISTS ix_vision_llm_configs_search_space_id " + "ON 
vision_llm_configs (search_space_id)" + ) + + # 3. Rename vision_llm_id -> vision_llm_config_id on searchspaces + existing_columns = [ + col["name"] for col in sa.inspect(connection).get_columns("searchspaces") + ] + if "vision_llm_id" in existing_columns and "vision_llm_config_id" not in existing_columns: + op.alter_column("searchspaces", "vision_llm_id", new_column_name="vision_llm_config_id") + elif "vision_llm_config_id" not in existing_columns: + op.add_column( + "searchspaces", + sa.Column("vision_llm_config_id", sa.Integer(), nullable=True, server_default="0"), + ) + + # 4. Add vision config permissions to existing system roles + connection.execute( + sa.text( + """ + UPDATE search_space_roles + SET permissions = array_cat( + permissions, + ARRAY['vision_configs:create', 'vision_configs:read'] + ) + WHERE is_system_role = true + AND name = 'Editor' + AND NOT ('vision_configs:create' = ANY(permissions)) + """ + ) + ) + connection.execute( + sa.text( + """ + UPDATE search_space_roles + SET permissions = array_cat( + permissions, + ARRAY['vision_configs:read'] + ) + WHERE is_system_role = true + AND name = 'Viewer' + AND NOT ('vision_configs:read' = ANY(permissions)) + """ + ) + ) + + +def downgrade() -> None: + connection = op.get_bind() + + # Remove permissions + connection.execute( + sa.text( + """ + UPDATE search_space_roles + SET permissions = array_remove( + array_remove( + array_remove(permissions, 'vision_configs:create'), + 'vision_configs:read' + ), + 'vision_configs:delete' + ) + WHERE is_system_role = true + """ + ) + ) + + # Rename column back + existing_columns = [ + col["name"] for col in sa.inspect(connection).get_columns("searchspaces") + ] + if "vision_llm_config_id" in existing_columns: + op.alter_column("searchspaces", "vision_llm_config_id", new_column_name="vision_llm_id") + + # Drop table and enum + op.execute("DROP INDEX IF EXISTS ix_vision_llm_configs_search_space_id") + op.execute("DROP INDEX IF EXISTS ix_vision_llm_configs_name") + 
op.execute("DROP TABLE IF EXISTS vision_llm_configs") + op.execute("DROP TYPE IF EXISTS visionprovider") diff --git a/surfsense_backend/alembic/versions/51_add_new_llm_config_table.py b/surfsense_backend/alembic/versions/51_add_new_llm_config_table.py index 89a5c1246..7d90f4b13 100644 --- a/surfsense_backend/alembic/versions/51_add_new_llm_config_table.py +++ b/surfsense_backend/alembic/versions/51_add_new_llm_config_table.py @@ -17,10 +17,10 @@ depends_on: str | Sequence[str] | None = None def upgrade() -> None: """ - Add the new_llm_configs table that combines LLM model settings with prompt configuration. + Add the new_llm_configs table that combines model settings with prompt configuration. This table includes: - - LLM model configuration (provider, model_name, api_key, etc.) + - Model configuration (provider, model_name, api_key, etc.) - Configurable system instructions - Citation toggle """ @@ -41,7 +41,7 @@ def upgrade() -> None: name VARCHAR(100) NOT NULL, description VARCHAR(500), - -- LLM Model Configuration (same as llm_configs, excluding language) + -- Model Configuration (same as llm_configs, excluding language) provider litellmprovider NOT NULL, custom_provider VARCHAR(100), model_name VARCHAR(100) NOT NULL, diff --git a/surfsense_backend/app/agents/autocomplete/__init__.py b/surfsense_backend/app/agents/autocomplete/__init__.py new file mode 100644 index 000000000..55d7a692d --- /dev/null +++ b/surfsense_backend/app/agents/autocomplete/__init__.py @@ -0,0 +1,11 @@ +"""Agent-based vision autocomplete with scoped filesystem exploration.""" + +from app.agents.autocomplete.autocomplete_agent import ( + create_autocomplete_agent, + stream_autocomplete_agent, +) + +__all__ = [ + "create_autocomplete_agent", + "stream_autocomplete_agent", +] diff --git a/surfsense_backend/app/agents/autocomplete/autocomplete_agent.py b/surfsense_backend/app/agents/autocomplete/autocomplete_agent.py new file mode 100644 index 000000000..77c0af5bb --- /dev/null +++ 
b/surfsense_backend/app/agents/autocomplete/autocomplete_agent.py @@ -0,0 +1,497 @@ +"""Vision autocomplete agent with scoped filesystem exploration. + +Converts the stateless single-shot vision autocomplete into an agent that +seeds a virtual filesystem from KB search results and lets the vision LLM +explore documents via ``ls``, ``read_file``, ``glob``, ``grep``, etc. +before generating the final completion. + +Performance: KB search and agent graph compilation run in parallel so +the only sequential latency is KB-search (or agent compile, whichever is +slower) + the agent's LLM turns. There is no separate "query extraction" +LLM call — the window title is used directly as the KB search query. +""" + +from __future__ import annotations + +import asyncio +import json +import logging +import re +import uuid +from collections.abc import AsyncGenerator +from typing import Any + +from deepagents.graph import BASE_AGENT_PROMPT +from deepagents.middleware.patch_tool_calls import PatchToolCallsMiddleware +from langchain.agents import create_agent +from langchain_anthropic.middleware import AnthropicPromptCachingMiddleware +from langchain_core.language_models import BaseChatModel +from langchain_core.messages import AIMessage, ToolMessage + +from app.agents.new_chat.middleware.filesystem import SurfSenseFilesystemMiddleware +from app.agents.new_chat.middleware.knowledge_search import ( + build_scoped_filesystem, + search_knowledge_base, +) +from app.services.new_streaming_service import VercelStreamingService + +logger = logging.getLogger(__name__) + +KB_TOP_K = 10 + +# --------------------------------------------------------------------------- +# System prompt +# --------------------------------------------------------------------------- + +AUTOCOMPLETE_SYSTEM_PROMPT = """You are a smart writing assistant that analyzes the user's screen to draft or complete text. + +You will receive a screenshot of the user's screen. 
Your PRIMARY source of truth is the screenshot itself — the visual context determines what to write. + +Your job: +1. Analyze the ENTIRE screenshot to understand what the user is working on (email thread, chat conversation, document, code editor, form, etc.). +2. Identify the text area where the user will type. +3. Generate the text the user most likely wants to write based on the visual context. + +You also have access to the user's knowledge base documents via filesystem tools. However: +- ONLY consult the knowledge base if the screenshot clearly involves a topic where your KB documents are DIRECTLY relevant (e.g., the user is writing about a specific project/topic that matches a document title). +- Do NOT explore documents just because they exist. Most autocomplete requests can be answered purely from the screenshot. +- If you do read a document, only incorporate information that is 100% relevant to what the user is typing RIGHT NOW. Do not add extra details, background, or tangential information from the KB. +- Keep your output SHORT — autocomplete should feel like a natural continuation, not an essay. + +Key behavior: +- If the text area is EMPTY, draft a concise response or message based on what you see on screen (e.g., reply to an email, respond to a chat message, continue a document). +- If the text area already has text, continue it naturally — typically just a sentence or two. + +Rules: +- Be CONCISE. Prefer a single paragraph or a few sentences. Autocomplete is a quick assist, not a full draft. +- Match the tone and formality of the surrounding context. +- If the screen shows code, write code. If it shows a casual chat, be casual. If it shows a formal email, be formal. +- Do NOT describe the screenshot or explain your reasoning. +- Do NOT cite or reference documents explicitly — just let the knowledge inform your writing naturally. 
+- If you cannot determine what to write, output an empty JSON array: [] + +## Output Format + +You MUST provide exactly 3 different suggestion options. Each should be a distinct, plausible completion — vary the tone, detail level, or angle. + +Return your suggestions as a JSON array of exactly 3 strings. Output ONLY the JSON array, nothing else — no markdown fences, no explanation, no commentary. + +Example format: +["First suggestion text here.", "Second suggestion — a different take.", "Third option with another approach."] + +## Filesystem Tools `ls`, `read_file`, `write_file`, `edit_file`, `glob`, `grep` + +All file paths must start with a `/`. +- ls: list files and directories at a given path. +- read_file: read a file from the filesystem. +- write_file: create a temporary file in the session (not persisted). +- edit_file: edit a file in the session (not persisted for /documents/ files). +- glob: find files matching a pattern (e.g., "**/*.xml"). +- grep: search for text within files. + +## When to Use Filesystem Tools + +BEFORE reaching for any tool, ask yourself: "Can I write a good completion purely from the screenshot?" If yes, just write it — do NOT explore the KB. + +Only use tools when: +- The user is clearly writing about a specific topic that likely has detailed information in their KB. +- You need a specific fact, name, number, or reference that the screenshot doesn't provide. + +When you do use tools, be surgical: +- Check the `ls` output first. If no document title looks relevant, stop — do not read files just to see what's there. +- If a title looks relevant, read only the `` (first ~20 lines) and jump to matched chunks. Do not read entire documents. +- Extract only the specific information you need and move on to generating the completion. + +## Reading Documents Efficiently + +Documents are formatted as XML. Each document contains: +- `` — title, type, URL, etc. 
+- `` — a table of every chunk with its **line range** and a + `matched="true"` flag for chunks that matched the search query. +- `` — the actual chunks in original document order. + +**Workflow**: read the first ~20 lines to see the ``, identify +chunks marked `matched="true"`, then use `read_file(path, offset=, +limit=)` to jump directly to those sections.""" + +APP_CONTEXT_BLOCK = """ + +The user is currently working in "{app_name}" (window: "{window_title}"). Use this to understand the type of application and adapt your tone and format accordingly.""" + + +def _build_autocomplete_system_prompt(app_name: str, window_title: str) -> str: + prompt = AUTOCOMPLETE_SYSTEM_PROMPT + if app_name: + prompt += APP_CONTEXT_BLOCK.format(app_name=app_name, window_title=window_title) + return prompt + + +# --------------------------------------------------------------------------- +# Pre-compute KB filesystem (runs in parallel with agent compilation) +# --------------------------------------------------------------------------- + + +class _KBResult: + """Container for pre-computed KB filesystem results.""" + + __slots__ = ("files", "ls_ai_msg", "ls_tool_msg") + + def __init__( + self, + files: dict[str, Any] | None = None, + ls_ai_msg: AIMessage | None = None, + ls_tool_msg: ToolMessage | None = None, + ) -> None: + self.files = files + self.ls_ai_msg = ls_ai_msg + self.ls_tool_msg = ls_tool_msg + + @property + def has_documents(self) -> bool: + return bool(self.files) + + +async def precompute_kb_filesystem( + search_space_id: int, + query: str, + top_k: int = KB_TOP_K, +) -> _KBResult: + """Search the KB and build the scoped filesystem outside the agent. + + This is designed to be called via ``asyncio.gather`` alongside agent + graph compilation so the two run concurrently. 
+ """ + if not query: + return _KBResult() + + try: + search_results = await search_knowledge_base( + query=query, + search_space_id=search_space_id, + top_k=top_k, + ) + + if not search_results: + return _KBResult() + + new_files, _ = await build_scoped_filesystem( + documents=search_results, + search_space_id=search_space_id, + ) + + if not new_files: + return _KBResult() + + doc_paths = [ + p + for p, v in new_files.items() + if p.startswith("/documents/") and v is not None + ] + tool_call_id = f"auto_ls_{uuid.uuid4().hex[:12]}" + ai_msg = AIMessage( + content="", + tool_calls=[ + {"name": "ls", "args": {"path": "/documents"}, "id": tool_call_id} + ], + ) + tool_msg = ToolMessage( + content=str(doc_paths) if doc_paths else "No documents found.", + tool_call_id=tool_call_id, + ) + return _KBResult(files=new_files, ls_ai_msg=ai_msg, ls_tool_msg=tool_msg) + + except Exception: + logger.warning( + "KB pre-computation failed, proceeding without KB", exc_info=True + ) + return _KBResult() + + +# --------------------------------------------------------------------------- +# Filesystem middleware — no save_document, no persistence +# --------------------------------------------------------------------------- + + +class AutocompleteFilesystemMiddleware(SurfSenseFilesystemMiddleware): + """Filesystem middleware for autocomplete — read-only exploration only. + + Strips ``save_document`` (permanent KB persistence) and passes + ``search_space_id=None`` so ``write_file`` / ``edit_file`` stay ephemeral. 
+ """ + + def __init__(self) -> None: + super().__init__(search_space_id=None, created_by_id=None) + self.tools = [t for t in self.tools if t.name != "save_document"] + + +# --------------------------------------------------------------------------- +# Agent factory +# --------------------------------------------------------------------------- + + +async def _compile_agent( + llm: BaseChatModel, + app_name: str, + window_title: str, +) -> Any: + """Compile the agent graph (CPU-bound, runs in a thread).""" + system_prompt = _build_autocomplete_system_prompt(app_name, window_title) + final_system_prompt = system_prompt + "\n\n" + BASE_AGENT_PROMPT + + middleware = [ + AutocompleteFilesystemMiddleware(), + PatchToolCallsMiddleware(), + AnthropicPromptCachingMiddleware(unsupported_model_behavior="ignore"), + ] + + agent = await asyncio.to_thread( + create_agent, + llm, + system_prompt=final_system_prompt, + tools=[], + middleware=middleware, + ) + return agent.with_config({"recursion_limit": 200}) + + +async def create_autocomplete_agent( + llm: BaseChatModel, + *, + search_space_id: int, + kb_query: str, + app_name: str = "", + window_title: str = "", +) -> tuple[Any, _KBResult]: + """Create the autocomplete agent and pre-compute KB in parallel. + + Returns ``(agent, kb_result)`` so the caller can inject the pre-computed + filesystem into the agent's initial state without any middleware delay. + """ + agent, kb = await asyncio.gather( + _compile_agent(llm, app_name, window_title), + precompute_kb_filesystem(search_space_id, kb_query), + ) + return agent, kb + + +# --------------------------------------------------------------------------- +# JSON suggestion parsing (with fallback) +# --------------------------------------------------------------------------- + + +def _parse_suggestions(raw: str) -> list[str]: + """Extract a list of suggestion strings from the agent's output. + + Tries, in order: + 1. Direct ``json.loads`` + 2. Extract content between ```json ... 
``` fences + 3. Find the first ``[`` … ``]`` span + Falls back to wrapping the raw text as a single suggestion. + """ + text = raw.strip() + if not text: + return [] + + for candidate in _json_candidates(text): + try: + parsed = json.loads(candidate) + if isinstance(parsed, list) and all(isinstance(s, str) for s in parsed): + return [s for s in parsed if s.strip()] + except (json.JSONDecodeError, ValueError): + continue + + return [text] + + +def _json_candidates(text: str) -> list[str]: + """Yield candidate JSON strings from raw text.""" + candidates = [text] + + fence = re.search(r"```(?:json)?\s*\n?(.*?)```", text, re.DOTALL) + if fence: + candidates.append(fence.group(1).strip()) + + bracket = re.search(r"\[.*]", text, re.DOTALL) + if bracket: + candidates.append(bracket.group(0)) + + return candidates + + +# --------------------------------------------------------------------------- +# Streaming helper +# --------------------------------------------------------------------------- + + +async def stream_autocomplete_agent( + agent: Any, + input_data: dict[str, Any], + streaming_service: VercelStreamingService, + *, + emit_message_start: bool = True, +) -> AsyncGenerator[str, None]: + """Stream agent events as Vercel SSE, with thinking steps for tool calls. + + When ``emit_message_start`` is False the caller has already sent the + ``message_start`` event (e.g. to show preparation steps before the agent + runs). 
+ """ + thread_id = uuid.uuid4().hex + config = {"configurable": {"thread_id": thread_id}} + + text_buffer: list[str] = [] + active_tool_depth = 0 + thinking_step_counter = 0 + tool_step_ids: dict[str, str] = {} + step_titles: dict[str, str] = {} + completed_step_ids: set[str] = set() + last_active_step_id: str | None = None + + def next_thinking_step_id() -> str: + nonlocal thinking_step_counter + thinking_step_counter += 1 + return f"autocomplete-step-{thinking_step_counter}" + + def complete_current_step() -> str | None: + nonlocal last_active_step_id + if last_active_step_id and last_active_step_id not in completed_step_ids: + completed_step_ids.add(last_active_step_id) + title = step_titles.get(last_active_step_id, "Done") + event = streaming_service.format_thinking_step( + step_id=last_active_step_id, + title=title, + status="complete", + ) + last_active_step_id = None + return event + return None + + if emit_message_start: + yield streaming_service.format_message_start() + + gen_step_id = next_thinking_step_id() + last_active_step_id = gen_step_id + step_titles[gen_step_id] = "Generating suggestions" + yield streaming_service.format_thinking_step( + step_id=gen_step_id, + title="Generating suggestions", + status="in_progress", + ) + + try: + async for event in agent.astream_events( + input_data, config=config, version="v2" + ): + event_type = event.get("event", "") + if event_type == "on_chat_model_stream": + if active_tool_depth > 0: + continue + if "surfsense:internal" in event.get("tags", []): + continue + chunk = event.get("data", {}).get("chunk") + if chunk and hasattr(chunk, "content"): + content = chunk.content + if content and isinstance(content, str): + text_buffer.append(content) + + elif event_type == "on_chat_model_end": + if active_tool_depth > 0: + continue + if "surfsense:internal" in event.get("tags", []): + continue + output = event.get("data", {}).get("output") + if output and hasattr(output, "content"): + if getattr(output, "tool_calls", 
None): + continue + content = output.content + if content and isinstance(content, str) and not text_buffer: + text_buffer.append(content) + + elif event_type == "on_tool_start": + active_tool_depth += 1 + tool_name = event.get("name", "unknown_tool") + run_id = event.get("run_id", "") + tool_input = event.get("data", {}).get("input", {}) + + step_event = complete_current_step() + if step_event: + yield step_event + + tool_step_id = next_thinking_step_id() + tool_step_ids[run_id] = tool_step_id + last_active_step_id = tool_step_id + + title, items = _describe_tool_call(tool_name, tool_input) + step_titles[tool_step_id] = title + yield streaming_service.format_thinking_step( + step_id=tool_step_id, + title=title, + status="in_progress", + items=items, + ) + + elif event_type == "on_tool_end": + active_tool_depth = max(0, active_tool_depth - 1) + run_id = event.get("run_id", "") + step_id = tool_step_ids.pop(run_id, None) + if step_id and step_id not in completed_step_ids: + completed_step_ids.add(step_id) + title = step_titles.get(step_id, "Done") + yield streaming_service.format_thinking_step( + step_id=step_id, + title=title, + status="complete", + ) + if last_active_step_id == step_id: + last_active_step_id = None + + step_event = complete_current_step() + if step_event: + yield step_event + + raw_text = "".join(text_buffer) + suggestions = _parse_suggestions(raw_text) + + yield streaming_service.format_data( + "suggestions", {"options": suggestions} + ) + + yield streaming_service.format_finish() + yield streaming_service.format_done() + + except Exception as e: + logger.error(f"Autocomplete agent streaming error: {e}", exc_info=True) + yield streaming_service.format_error("Autocomplete failed. 
Please try again.") + yield streaming_service.format_done() + + +def _describe_tool_call(tool_name: str, tool_input: Any) -> tuple[str, list[str]]: + """Return a human-readable (title, items) for a tool call thinking step.""" + inp = tool_input if isinstance(tool_input, dict) else {} + if tool_name == "ls": + path = inp.get("path", "/") + return "Listing files", [path] + if tool_name == "read_file": + fp = inp.get("file_path", "") + display = fp if len(fp) <= 80 else "…" + fp[-77:] + return "Reading file", [display] + if tool_name == "write_file": + fp = inp.get("file_path", "") + display = fp if len(fp) <= 80 else "…" + fp[-77:] + return "Writing file", [display] + if tool_name == "edit_file": + fp = inp.get("file_path", "") + display = fp if len(fp) <= 80 else "…" + fp[-77:] + return "Editing file", [display] + if tool_name == "glob": + pat = inp.get("pattern", "") + base = inp.get("path", "/") + return "Searching files", [f"{pat} in {base}"] + if tool_name == "grep": + pat = inp.get("pattern", "") + path = inp.get("path", "") + display_pat = pat[:60] + ("…" if len(pat) > 60 else "") + return "Searching content", [ + f'"{display_pat}"' + (f" in {path}" if path else "") + ] + return f"Using {tool_name}", [] diff --git a/surfsense_backend/app/agents/new_chat/chat_deepagent.py b/surfsense_backend/app/agents/new_chat/chat_deepagent.py index ccc06f272..fc1e80d28 100644 --- a/surfsense_backend/app/agents/new_chat/chat_deepagent.py +++ b/surfsense_backend/app/agents/new_chat/chat_deepagent.py @@ -159,6 +159,7 @@ async def create_surfsense_deep_agent( additional_tools: Sequence[BaseTool] | None = None, firecrawl_api_key: str | None = None, thread_visibility: ChatVisibility | None = None, + mentioned_document_ids: list[int] | None = None, ): """ Create a SurfSense deep agent with configurable tools and prompts. 
@@ -451,6 +452,7 @@ async def create_surfsense_deep_agent( search_space_id=search_space_id, available_connectors=available_connectors, available_document_types=available_document_types, + mentioned_document_ids=mentioned_document_ids, ), SurfSenseFilesystemMiddleware( search_space_id=search_space_id, diff --git a/surfsense_backend/app/agents/new_chat/middleware/filesystem.py b/surfsense_backend/app/agents/new_chat/middleware/filesystem.py index 41b24f88b..d7697ef15 100644 --- a/surfsense_backend/app/agents/new_chat/middleware/filesystem.py +++ b/surfsense_backend/app/agents/new_chat/middleware/filesystem.py @@ -66,6 +66,16 @@ the ``, identify chunks marked `matched="true"`, then use those sections instead of reading the entire file sequentially. Use `` values as citation IDs in your answers. + +## User-Mentioned Documents + +When the `ls` output tags a file with `[MENTIONED BY USER — read deeply]`, +the user **explicitly selected** that document. These files are your highest- +priority sources: +1. **Always read them thoroughly** — scan the full ``, then read + all major sections, not just matched chunks. +2. **Prefer their content** over other search results when answering. +3. **Cite from them first** whenever applicable. 
""" # ============================================================================= diff --git a/surfsense_backend/app/agents/new_chat/middleware/knowledge_search.py b/surfsense_backend/app/agents/new_chat/middleware/knowledge_search.py index 3728f229c..7b0dd2f71 100644 --- a/surfsense_backend/app/agents/new_chat/middleware/knowledge_search.py +++ b/surfsense_backend/app/agents/new_chat/middleware/knowledge_search.py @@ -28,7 +28,13 @@ from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession from app.agents.new_chat.utils import parse_date_or_datetime, resolve_date_range -from app.db import NATIVE_TO_LEGACY_DOCTYPE, Document, Folder, shielded_async_session +from app.db import ( + NATIVE_TO_LEGACY_DOCTYPE, + Chunk, + Document, + Folder, + shielded_async_session, +) from app.retriever.chunks_hybrid_search import ChucksHybridSearchRetriever from app.utils.document_converters import embed_texts from app.utils.perf import get_perf_logger @@ -430,21 +436,36 @@ async def _get_folder_paths( def _build_synthetic_ls( existing_files: dict[str, Any] | None, new_files: dict[str, Any], + *, + mentioned_paths: set[str] | None = None, ) -> tuple[AIMessage, ToolMessage]: """Build a synthetic ls("/documents") tool-call + result for the LLM context. - Paths are listed with *new* (rank-ordered) files first, then existing files - that were already in state from prior turns. + Mentioned files are listed first. A separate header tells the LLM which + files the user explicitly selected; the path list itself stays clean so + paths can be passed directly to ``read_file`` without stripping tags. 
""" + _mentioned = mentioned_paths or set() merged: dict[str, Any] = {**(existing_files or {}), **new_files} doc_paths = [ p for p, v in merged.items() if p.startswith("/documents/") and v is not None ] new_set = set(new_files) - new_paths = [p for p in doc_paths if p in new_set] + mentioned_list = [p for p in doc_paths if p in _mentioned] + new_non_mentioned = [p for p in doc_paths if p in new_set and p not in _mentioned] old_paths = [p for p in doc_paths if p not in new_set] - ordered = new_paths + old_paths + ordered = mentioned_list + new_non_mentioned + old_paths + + parts: list[str] = [] + if mentioned_list: + parts.append( + "USER-MENTIONED documents (read these thoroughly before answering):" + ) + for p in mentioned_list: + parts.append(f" {p}") + parts.append("") + parts.append(str(ordered) if ordered else "No documents found.") tool_call_id = f"auto_ls_{uuid.uuid4().hex[:12]}" ai_msg = AIMessage( @@ -452,7 +473,7 @@ def _build_synthetic_ls( tool_calls=[{"name": "ls", "args": {"path": "/documents"}, "id": tool_call_id}], ) tool_msg = ToolMessage( - content=str(ordered) if ordered else "No documents found.", + content="\n".join(parts), tool_call_id=tool_call_id, ) return ai_msg, tool_msg @@ -524,12 +545,92 @@ async def search_knowledge_base( return results[:top_k] +async def fetch_mentioned_documents( + *, + document_ids: list[int], + search_space_id: int, +) -> list[dict[str, Any]]: + """Fetch explicitly mentioned documents with *all* their chunks. + + Returns the same dict structure as ``search_knowledge_base`` so results + can be merged directly into ``build_scoped_filesystem``. Unlike search + results, every chunk is included (no top-K limiting) and none are marked + as ``matched`` since the entire document is relevant by virtue of the + user's explicit mention. 
+ """ + if not document_ids: + return [] + + async with shielded_async_session() as session: + doc_result = await session.execute( + select(Document).where( + Document.id.in_(document_ids), + Document.search_space_id == search_space_id, + ) + ) + docs = {doc.id: doc for doc in doc_result.scalars().all()} + + if not docs: + return [] + + chunk_result = await session.execute( + select(Chunk.id, Chunk.content, Chunk.document_id) + .where(Chunk.document_id.in_(list(docs.keys()))) + .order_by(Chunk.document_id, Chunk.id) + ) + chunks_by_doc: dict[int, list[dict[str, Any]]] = {doc_id: [] for doc_id in docs} + for row in chunk_result.all(): + if row.document_id in chunks_by_doc: + chunks_by_doc[row.document_id].append( + {"chunk_id": row.id, "content": row.content} + ) + + results: list[dict[str, Any]] = [] + for doc_id in document_ids: + doc = docs.get(doc_id) + if doc is None: + continue + metadata = doc.document_metadata or {} + results.append( + { + "document_id": doc.id, + "content": "", + "score": 1.0, + "chunks": chunks_by_doc.get(doc.id, []), + "matched_chunk_ids": [], + "document": { + "id": doc.id, + "title": doc.title, + "document_type": ( + doc.document_type.value + if getattr(doc, "document_type", None) + else None + ), + "metadata": metadata, + }, + "source": ( + doc.document_type.value + if getattr(doc, "document_type", None) + else None + ), + "_user_mentioned": True, + } + ) + return results + + async def build_scoped_filesystem( *, documents: Sequence[dict[str, Any]], search_space_id: int, -) -> dict[str, dict[str, str]]: - """Build a StateBackend-compatible files dict from search results.""" +) -> tuple[dict[str, dict[str, str]], dict[int, str]]: + """Build a StateBackend-compatible files dict from search results. + + Returns ``(files, doc_id_to_path)`` so callers can reliably map a + document id back to its filesystem path without guessing by title. 
+ Paths are collision-proof: when two documents resolve to the same + path the doc-id is appended to disambiguate. + """ async with shielded_async_session() as session: folder_paths = await _get_folder_paths(session, search_space_id) doc_ids = [ @@ -551,6 +652,7 @@ async def build_scoped_filesystem( } files: dict[str, dict[str, str]] = {} + doc_id_to_path: dict[int, str] = {} for document in documents: doc_meta = document.get("document") or {} title = str(doc_meta.get("title") or "untitled") @@ -559,6 +661,9 @@ async def build_scoped_filesystem( base_folder = folder_paths.get(folder_id, "/documents") file_name = _safe_filename(title) path = f"{base_folder}/{file_name}" + if path in files: + stem = file_name.removesuffix(".xml") + path = f"{base_folder}/{stem} ({doc_id}).xml" matched_ids = set(document.get("matched_chunk_ids") or []) xml_content = _build_document_xml(document, matched_chunk_ids=matched_ids) files[path] = { @@ -567,7 +672,9 @@ async def build_scoped_filesystem( "created_at": "", "modified_at": "", } - return files + if isinstance(doc_id, int): + doc_id_to_path[doc_id] = path + return files, doc_id_to_path class KnowledgeBaseSearchMiddleware(AgentMiddleware): # type: ignore[type-arg] @@ -583,12 +690,14 @@ class KnowledgeBaseSearchMiddleware(AgentMiddleware): # type: ignore[type-arg] available_connectors: list[str] | None = None, available_document_types: list[str] | None = None, top_k: int = 10, + mentioned_document_ids: list[int] | None = None, ) -> None: self.llm = llm self.search_space_id = search_space_id self.available_connectors = available_connectors self.available_document_types = available_document_types self.top_k = top_k + self.mentioned_document_ids = mentioned_document_ids or [] async def _plan_search_inputs( self, @@ -680,6 +789,18 @@ class KnowledgeBaseSearchMiddleware(AgentMiddleware): # type: ignore[type-arg] user_text=user_text, ) + # --- 1. 
Fetch mentioned documents (user-selected, all chunks) --- + mentioned_results: list[dict[str, Any]] = [] + if self.mentioned_document_ids: + mentioned_results = await fetch_mentioned_documents( + document_ids=self.mentioned_document_ids, + search_space_id=self.search_space_id, + ) + # Clear after first turn so they are not re-fetched on subsequent + # messages within the same agent instance. + self.mentioned_document_ids = [] + + # --- 2. Run KB hybrid search --- search_results = await search_knowledge_base( query=planned_query, search_space_id=self.search_space_id, @@ -689,19 +810,50 @@ class KnowledgeBaseSearchMiddleware(AgentMiddleware): # type: ignore[type-arg] start_date=start_date, end_date=end_date, ) - new_files = await build_scoped_filesystem( - documents=search_results, + + # --- 3. Merge: mentioned first, then search (dedup by doc id) --- + seen_doc_ids: set[int] = set() + merged: list[dict[str, Any]] = [] + for doc in mentioned_results: + doc_id = (doc.get("document") or {}).get("id") + if doc_id is not None: + seen_doc_ids.add(doc_id) + merged.append(doc) + for doc in search_results: + doc_id = (doc.get("document") or {}).get("id") + if doc_id is not None and doc_id in seen_doc_ids: + continue + merged.append(doc) + + # --- 4. Build scoped filesystem --- + new_files, doc_id_to_path = await build_scoped_filesystem( + documents=merged, search_space_id=self.search_space_id, ) - ai_msg, tool_msg = _build_synthetic_ls(existing_files, new_files) + # Identify which paths belong to user-mentioned documents using + # the authoritative doc_id -> path mapping (no title guessing). 
+ mentioned_doc_ids = { + (d.get("document") or {}).get("id") for d in mentioned_results + } + mentioned_paths = { + doc_id_to_path[did] for did in mentioned_doc_ids if did in doc_id_to_path + } + + ai_msg, tool_msg = _build_synthetic_ls( + existing_files, + new_files, + mentioned_paths=mentioned_paths, + ) if t0 is not None: _perf_log.info( - "[kb_fs_middleware] completed in %.3fs query=%r optimized=%r new_files=%d total=%d", + "[kb_fs_middleware] completed in %.3fs query=%r optimized=%r " + "mentioned=%d new_files=%d total=%d", asyncio.get_event_loop().time() - t0, user_text[:80], planned_query[:120], + len(mentioned_results), len(new_files), len(new_files) + len(existing_files or {}), ) diff --git a/surfsense_backend/app/app.py b/surfsense_backend/app/app.py index bba2f1f3a..7b2b421ac 100644 --- a/surfsense_backend/app/app.py +++ b/surfsense_backend/app/app.py @@ -25,7 +25,12 @@ from app.agents.new_chat.checkpointer import ( close_checkpointer, setup_checkpointer_tables, ) -from app.config import config, initialize_image_gen_router, initialize_llm_router +from app.config import ( + config, + initialize_image_gen_router, + initialize_llm_router, + initialize_vision_llm_router, +) from app.db import User, create_db_and_tables, get_async_session from app.routes import router as crud_router from app.routes.auth_routes import router as auth_router @@ -223,6 +228,7 @@ async def lifespan(app: FastAPI): await setup_checkpointer_tables() initialize_llm_router() initialize_image_gen_router() + initialize_vision_llm_router() try: await asyncio.wait_for(seed_surfsense_docs(), timeout=120) except TimeoutError: diff --git a/surfsense_backend/app/celery_app.py b/surfsense_backend/app/celery_app.py index 684da6a13..bf2fdcb39 100644 --- a/surfsense_backend/app/celery_app.py +++ b/surfsense_backend/app/celery_app.py @@ -18,10 +18,15 @@ def init_worker(**kwargs): This ensures the Auto mode (LiteLLM Router) is available for background tasks like document summarization and image 
generation. """ - from app.config import initialize_image_gen_router, initialize_llm_router + from app.config import ( + initialize_image_gen_router, + initialize_llm_router, + initialize_vision_llm_router, + ) initialize_llm_router() initialize_image_gen_router() + initialize_vision_llm_router() # Get Celery configuration from environment diff --git a/surfsense_backend/app/config/__init__.py b/surfsense_backend/app/config/__init__.py index 7c4baf923..4c49a4f8b 100644 --- a/surfsense_backend/app/config/__init__.py +++ b/surfsense_backend/app/config/__init__.py @@ -102,6 +102,44 @@ def load_global_image_gen_configs(): return [] +def load_global_vision_llm_configs(): + global_config_file = BASE_DIR / "app" / "config" / "global_llm_config.yaml" + + if not global_config_file.exists(): + return [] + + try: + with open(global_config_file, encoding="utf-8") as f: + data = yaml.safe_load(f) + return data.get("global_vision_llm_configs", []) + except Exception as e: + print(f"Warning: Failed to load global vision LLM configs: {e}") + return [] + + +def load_vision_llm_router_settings(): + default_settings = { + "routing_strategy": "usage-based-routing", + "num_retries": 3, + "allowed_fails": 3, + "cooldown_time": 60, + } + + global_config_file = BASE_DIR / "app" / "config" / "global_llm_config.yaml" + + if not global_config_file.exists(): + return default_settings + + try: + with open(global_config_file, encoding="utf-8") as f: + data = yaml.safe_load(f) + settings = data.get("vision_llm_router_settings", {}) + return {**default_settings, **settings} + except Exception as e: + print(f"Warning: Failed to load vision LLM router settings: {e}") + return default_settings + + def load_image_gen_router_settings(): """ Load router settings for image generation Auto mode from YAML file. 
@@ -182,6 +220,29 @@ def initialize_image_gen_router(): print(f"Warning: Failed to initialize Image Generation Router: {e}") +def initialize_vision_llm_router(): + vision_configs = load_global_vision_llm_configs() + router_settings = load_vision_llm_router_settings() + + if not vision_configs: + print( + "Info: No global vision LLM configs found, " + "Vision LLM Auto mode will not be available" + ) + return + + try: + from app.services.vision_llm_router_service import VisionLLMRouterService + + VisionLLMRouterService.initialize(vision_configs, router_settings) + print( + f"Info: Vision LLM Router initialized with {len(vision_configs)} models " + f"(strategy: {router_settings.get('routing_strategy', 'usage-based-routing')})" + ) + except Exception as e: + print(f"Warning: Failed to initialize Vision LLM Router: {e}") + + class Config: # Check if ffmpeg is installed if not is_ffmpeg_installed(): @@ -335,6 +396,12 @@ class Config: # Router settings for Image Generation Auto mode IMAGE_GEN_ROUTER_SETTINGS = load_image_gen_router_settings() + # Global Vision LLM Configurations (optional) + GLOBAL_VISION_LLM_CONFIGS = load_global_vision_llm_configs() + + # Router settings for Vision LLM Auto mode + VISION_LLM_ROUTER_SETTINGS = load_vision_llm_router_settings() + # Chonkie Configuration | Edit this to your needs EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL") # Azure OpenAI credentials from environment variables diff --git a/surfsense_backend/app/config/global_llm_config.example.yaml b/surfsense_backend/app/config/global_llm_config.example.yaml index 6ca3e95e3..e382fdc74 100644 --- a/surfsense_backend/app/config/global_llm_config.example.yaml +++ b/surfsense_backend/app/config/global_llm_config.example.yaml @@ -17,7 +17,7 @@ # - Configure router_settings below to customize the load balancing behavior # # Structure matches NewLLMConfig: -# - LLM model configuration (provider, model_name, api_key, etc.) +# - Model configuration (provider, model_name, api_key, etc.) 
# - Prompt configuration (system_instructions, citations_enabled) # Router Settings for Auto Mode @@ -263,6 +263,82 @@ global_image_generation_configs: # rpm: 30 # litellm_params: {} +# ============================================================================= +# Vision LLM Configuration +# ============================================================================= +# These configurations power the vision autocomplete feature (screenshot analysis). +# Only vision-capable models should be used here (e.g. GPT-4o, Gemini Pro, Claude 3). +# Supported providers: OpenAI, Anthropic, Google, Azure OpenAI, Vertex AI, Bedrock, +# xAI, OpenRouter, Ollama, Groq, Together AI, Fireworks AI, DeepSeek, Mistral, Custom +# +# Auto mode (ID 0) uses LiteLLM Router for load balancing across all vision configs. + +# Router Settings for Vision LLM Auto Mode +vision_llm_router_settings: + routing_strategy: "usage-based-routing" + num_retries: 3 + allowed_fails: 3 + cooldown_time: 60 + +global_vision_llm_configs: + # Example: OpenAI GPT-4o (recommended for vision) + - id: -1 + name: "Global GPT-4o Vision" + description: "OpenAI's GPT-4o with strong vision capabilities" + provider: "OPENAI" + model_name: "gpt-4o" + api_key: "sk-your-openai-api-key-here" + api_base: "" + rpm: 500 + tpm: 100000 + litellm_params: + temperature: 0.3 + max_tokens: 1000 + + # Example: Google Gemini 2.0 Flash + - id: -2 + name: "Global Gemini 2.0 Flash" + description: "Google's fast vision model with large context" + provider: "GOOGLE" + model_name: "gemini-2.0-flash" + api_key: "your-google-ai-api-key-here" + api_base: "" + rpm: 1000 + tpm: 200000 + litellm_params: + temperature: 0.3 + max_tokens: 1000 + + # Example: Anthropic Claude 3.5 Sonnet + - id: -3 + name: "Global Claude 3.5 Sonnet Vision" + description: "Anthropic's Claude 3.5 Sonnet with vision support" + provider: "ANTHROPIC" + model_name: "claude-3-5-sonnet-20241022" + api_key: "sk-ant-your-anthropic-api-key-here" + api_base: "" + rpm: 1000 + tpm: 
100000 + litellm_params: + temperature: 0.3 + max_tokens: 1000 + + # Example: Azure OpenAI GPT-4o + # - id: -4 + # name: "Global Azure GPT-4o Vision" + # description: "Azure-hosted GPT-4o for vision analysis" + # provider: "AZURE_OPENAI" + # model_name: "azure/gpt-4o-deployment" + # api_key: "your-azure-api-key-here" + # api_base: "https://your-resource.openai.azure.com" + # api_version: "2024-02-15-preview" + # rpm: 500 + # tpm: 100000 + # litellm_params: + # temperature: 0.3 + # max_tokens: 1000 + # base_model: "gpt-4o" + # Notes: # - ID 0 is reserved for "Auto" mode - uses LiteLLM Router for load balancing # - Use negative IDs to distinguish global configs from user configs (NewLLMConfig in DB) @@ -283,3 +359,9 @@ global_image_generation_configs: # - The router uses litellm.aimage_generation() for async image generation # - Only RPM (requests per minute) is relevant for image generation rate limiting. # TPM (tokens per minute) does not apply since image APIs are billed/rate-limited per request, not per token. +# +# VISION LLM NOTES: +# - Vision configs use the same ID scheme (negative for global, positive for user DB) +# - Only use vision-capable models (GPT-4o, Gemini, Claude 3, etc.) 
+# - Lower temperature (0.3) is recommended for accurate screenshot analysis +# - Lower max_tokens (1000) is sufficient since autocomplete produces short suggestions diff --git a/surfsense_backend/app/config/vision_model_list_fallback.json b/surfsense_backend/app/config/vision_model_list_fallback.json new file mode 100644 index 000000000..830eb6517 --- /dev/null +++ b/surfsense_backend/app/config/vision_model_list_fallback.json @@ -0,0 +1,23 @@ +[ + {"value": "gpt-4o", "label": "GPT-4o", "provider": "OPENAI", "context_window": "128K"}, + {"value": "gpt-4o-mini", "label": "GPT-4o Mini", "provider": "OPENAI", "context_window": "128K"}, + {"value": "gpt-4-turbo", "label": "GPT-4 Turbo", "provider": "OPENAI", "context_window": "128K"}, + {"value": "claude-sonnet-4-20250514", "label": "Claude Sonnet 4", "provider": "ANTHROPIC", "context_window": "200K"}, + {"value": "claude-3-7-sonnet-20250219", "label": "Claude 3.7 Sonnet", "provider": "ANTHROPIC", "context_window": "200K"}, + {"value": "claude-3-5-sonnet-20241022", "label": "Claude 3.5 Sonnet", "provider": "ANTHROPIC", "context_window": "200K"}, + {"value": "claude-3-opus-20240229", "label": "Claude 3 Opus", "provider": "ANTHROPIC", "context_window": "200K"}, + {"value": "claude-3-haiku-20240307", "label": "Claude 3 Haiku", "provider": "ANTHROPIC", "context_window": "200K"}, + {"value": "gemini-2.5-flash", "label": "Gemini 2.5 Flash", "provider": "GOOGLE", "context_window": "1M"}, + {"value": "gemini-2.5-pro", "label": "Gemini 2.5 Pro", "provider": "GOOGLE", "context_window": "1M"}, + {"value": "gemini-2.0-flash", "label": "Gemini 2.0 Flash", "provider": "GOOGLE", "context_window": "1M"}, + {"value": "gemini-1.5-pro", "label": "Gemini 1.5 Pro", "provider": "GOOGLE", "context_window": "1M"}, + {"value": "gemini-1.5-flash", "label": "Gemini 1.5 Flash", "provider": "GOOGLE", "context_window": "1M"}, + {"value": "pixtral-large-latest", "label": "Pixtral Large", "provider": "MISTRAL", "context_window": "128K"}, + {"value": 
"pixtral-12b-2409", "label": "Pixtral 12B", "provider": "MISTRAL", "context_window": "128K"}, + {"value": "grok-2-vision-1212", "label": "Grok 2 Vision", "provider": "XAI", "context_window": "32K"}, + {"value": "llava", "label": "LLaVA", "provider": "OLLAMA"}, + {"value": "bakllava", "label": "BakLLaVA", "provider": "OLLAMA"}, + {"value": "llava-llama3", "label": "LLaVA Llama 3", "provider": "OLLAMA"}, + {"value": "llama-4-scout-17b-16e-instruct", "label": "Llama 4 Scout 17B", "provider": "GROQ", "context_window": "128K"}, + {"value": "meta-llama/Llama-4-Scout-17B-16E-Instruct", "label": "Llama 4 Scout 17B", "provider": "TOGETHER_AI", "context_window": "128K"} +] diff --git a/surfsense_backend/app/connectors/dropbox/client.py b/surfsense_backend/app/connectors/dropbox/client.py index dfae38f66..e89800191 100644 --- a/surfsense_backend/app/connectors/dropbox/client.py +++ b/surfsense_backend/app/connectors/dropbox/client.py @@ -225,6 +225,55 @@ class DropboxClient: return all_items, None + async def get_latest_cursor(self, path: str = "") -> tuple[str | None, str | None]: + """Get a cursor representing the current state of a folder. + + Uses /2/files/list_folder/get_latest_cursor so we can later call + get_changes to receive only incremental updates. + """ + resp = await self._request( + "/2/files/list_folder/get_latest_cursor", + {"path": path, "recursive": False, "include_non_downloadable_files": True}, + ) + if resp.status_code != 200: + return None, f"Failed to get cursor: {resp.status_code} - {resp.text}" + return resp.json().get("cursor"), None + + async def get_changes( + self, cursor: str + ) -> tuple[list[dict[str, Any]], str | None, str | None]: + """Fetch incremental changes since the given cursor. + + Calls /2/files/list_folder/continue and handles pagination. + Returns (entries, new_cursor, error). 
+ """ + all_entries: list[dict[str, Any]] = [] + + resp = await self._request("/2/files/list_folder/continue", {"cursor": cursor}) + if resp.status_code == 401: + return [], None, "Dropbox authentication expired (401)" + if resp.status_code != 200: + return [], None, f"Failed to get changes: {resp.status_code} - {resp.text}" + + data = resp.json() + all_entries.extend(data.get("entries", [])) + + while data.get("has_more"): + cursor = data["cursor"] + resp = await self._request( + "/2/files/list_folder/continue", {"cursor": cursor} + ) + if resp.status_code != 200: + return ( + all_entries, + data.get("cursor"), + f"Pagination failed: {resp.status_code}", + ) + data = resp.json() + all_entries.extend(data.get("entries", [])) + + return all_entries, data.get("cursor"), None + async def get_metadata(self, path: str) -> tuple[dict[str, Any] | None, str | None]: resp = await self._request("/2/files/get_metadata", {"path": path}) if resp.status_code != 200: diff --git a/surfsense_backend/app/connectors/dropbox/content_extractor.py b/surfsense_backend/app/connectors/dropbox/content_extractor.py index e89893b14..8cbc3e417 100644 --- a/surfsense_backend/app/connectors/dropbox/content_extractor.py +++ b/surfsense_backend/app/connectors/dropbox/content_extractor.py @@ -53,7 +53,8 @@ async def download_and_extract_content( file_name = file.get("name", "Unknown") file_id = file.get("id", "") - if should_skip_file(file): + skip, _unsup_ext = should_skip_file(file) + if skip: return None, {}, "Skipping non-indexable item" logger.info(f"Downloading file for content extraction: {file_name}") @@ -87,9 +88,13 @@ async def download_and_extract_content( if error: return None, metadata, error - from app.connectors.onedrive.content_extractor import _parse_file_to_markdown + from app.etl_pipeline.etl_document import EtlRequest + from app.etl_pipeline.etl_pipeline_service import EtlPipelineService - markdown = await _parse_file_to_markdown(temp_file_path, file_name) + result = await 
EtlPipelineService().extract( + EtlRequest(file_path=temp_file_path, filename=file_name) + ) + markdown = result.markdown_content return markdown, metadata, None except Exception as e: diff --git a/surfsense_backend/app/connectors/dropbox/file_types.py b/surfsense_backend/app/connectors/dropbox/file_types.py index e6d772a1c..d26306665 100644 --- a/surfsense_backend/app/connectors/dropbox/file_types.py +++ b/surfsense_backend/app/connectors/dropbox/file_types.py @@ -1,8 +1,8 @@ """File type handlers for Dropbox.""" -PAPER_EXTENSION = ".paper" +from app.etl_pipeline.file_classifier import should_skip_for_service -SKIP_EXTENSIONS: frozenset[str] = frozenset() +PAPER_EXTENSION = ".paper" MIME_TO_EXTENSION: dict[str, str] = { "application/pdf": ".pdf", @@ -42,17 +42,25 @@ def is_paper_file(item: dict) -> bool: return ext == PAPER_EXTENSION -def should_skip_file(item: dict) -> bool: +def should_skip_file(item: dict) -> tuple[bool, str | None]: """Skip folders and truly non-indexable files. Paper docs are non-downloadable but exportable, so they are NOT skipped. + Returns (should_skip, unsupported_extension_or_None). 
""" if is_folder(item): - return True + return True, None if is_paper_file(item): - return False + return False, None if not item.get("is_downloadable", True): - return True + return True, None + + from pathlib import PurePosixPath + + from app.config import config as app_config + name = item.get("name", "") - ext = get_extension_from_name(name).lower() - return ext in SKIP_EXTENSIONS + if should_skip_for_service(name, app_config.ETL_SERVICE): + ext = PurePosixPath(name).suffix.lower() + return True, ext + return False, None diff --git a/surfsense_backend/app/connectors/dropbox/folder_manager.py b/surfsense_backend/app/connectors/dropbox/folder_manager.py index 5453c8785..f9aa78873 100644 --- a/surfsense_backend/app/connectors/dropbox/folder_manager.py +++ b/surfsense_backend/app/connectors/dropbox/folder_manager.py @@ -64,8 +64,10 @@ async def get_files_in_folder( ) continue files.extend(sub_files) - elif not should_skip_file(item): - files.append(item) + else: + skip, _unsup_ext = should_skip_file(item) + if not skip: + files.append(item) return files, None diff --git a/surfsense_backend/app/connectors/google_drive/content_extractor.py b/surfsense_backend/app/connectors/google_drive/content_extractor.py index 1e94133b4..83ff32e82 100644 --- a/surfsense_backend/app/connectors/google_drive/content_extractor.py +++ b/surfsense_backend/app/connectors/google_drive/content_extractor.py @@ -1,12 +1,9 @@ """Content extraction for Google Drive files.""" -import asyncio import contextlib import logging import os import tempfile -import threading -import time from pathlib import Path from typing import Any @@ -20,6 +17,7 @@ from .file_types import ( get_export_mime_type, get_extension_from_mime, is_google_workspace_file, + should_skip_by_extension, should_skip_file, ) @@ -45,6 +43,11 @@ async def download_and_extract_content( if should_skip_file(mime_type): return None, {}, f"Skipping {mime_type}" + if not is_google_workspace_file(mime_type): + ext_skip, _unsup_ext = 
should_skip_by_extension(file_name) + if ext_skip: + return None, {}, f"Skipping unsupported extension: {file_name}" + logger.info(f"Downloading file for content extraction: {file_name} ({mime_type})") drive_metadata: dict[str, Any] = { @@ -97,7 +100,10 @@ async def download_and_extract_content( if error: return None, drive_metadata, error - markdown = await _parse_file_to_markdown(temp_file_path, file_name) + etl_filename = ( + file_name + extension if is_google_workspace_file(mime_type) else file_name + ) + markdown = await _parse_file_to_markdown(temp_file_path, etl_filename) return markdown, drive_metadata, None except Exception as e: @@ -110,99 +116,14 @@ async def download_and_extract_content( async def _parse_file_to_markdown(file_path: str, filename: str) -> str: - """Parse a local file to markdown using the configured ETL service.""" - lower = filename.lower() + """Parse a local file to markdown using the unified ETL pipeline.""" + from app.etl_pipeline.etl_document import EtlRequest + from app.etl_pipeline.etl_pipeline_service import EtlPipelineService - if lower.endswith((".md", ".markdown", ".txt")): - with open(file_path, encoding="utf-8") as f: - return f.read() - - if lower.endswith((".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm")): - from litellm import atranscription - - from app.config import config as app_config - - stt_service_type = ( - "local" - if app_config.STT_SERVICE and app_config.STT_SERVICE.startswith("local/") - else "external" - ) - if stt_service_type == "local": - from app.services.stt_service import stt_service - - t0 = time.monotonic() - logger.info( - f"[local-stt] START file={filename} thread={threading.current_thread().name}" - ) - result = await asyncio.to_thread(stt_service.transcribe_file, file_path) - logger.info( - f"[local-stt] END file={filename} elapsed={time.monotonic() - t0:.2f}s" - ) - text = result.get("text", "") - else: - with open(file_path, "rb") as audio_file: - kwargs: dict[str, Any] = { - "model": 
app_config.STT_SERVICE, - "file": audio_file, - "api_key": app_config.STT_SERVICE_API_KEY, - } - if app_config.STT_SERVICE_API_BASE: - kwargs["api_base"] = app_config.STT_SERVICE_API_BASE - resp = await atranscription(**kwargs) - text = resp.get("text", "") - - if not text: - raise ValueError("Transcription returned empty text") - return f"# Transcription of {filename}\n\n{text}" - - # Document files -- use configured ETL service - from app.config import config as app_config - - if app_config.ETL_SERVICE == "UNSTRUCTURED": - from langchain_unstructured import UnstructuredLoader - - from app.utils.document_converters import convert_document_to_markdown - - loader = UnstructuredLoader( - file_path, - mode="elements", - post_processors=[], - languages=["eng"], - include_orig_elements=False, - include_metadata=False, - strategy="auto", - ) - docs = await loader.aload() - return await convert_document_to_markdown(docs) - - if app_config.ETL_SERVICE == "LLAMACLOUD": - from app.tasks.document_processors.file_processors import ( - parse_with_llamacloud_retry, - ) - - result = await parse_with_llamacloud_retry( - file_path=file_path, estimated_pages=50 - ) - markdown_documents = await result.aget_markdown_documents(split_by_page=False) - if not markdown_documents: - raise RuntimeError(f"LlamaCloud returned no documents for {filename}") - return markdown_documents[0].text - - if app_config.ETL_SERVICE == "DOCLING": - from docling.document_converter import DocumentConverter - - converter = DocumentConverter() - t0 = time.monotonic() - logger.info( - f"[docling] START file={filename} thread={threading.current_thread().name}" - ) - result = await asyncio.to_thread(converter.convert, file_path) - logger.info( - f"[docling] END file={filename} elapsed={time.monotonic() - t0:.2f}s" - ) - return result.document.export_to_markdown() - - raise RuntimeError(f"Unknown ETL_SERVICE: {app_config.ETL_SERVICE}") + result = await EtlPipelineService().extract( + 
EtlRequest(file_path=file_path, filename=filename) + ) + return result.markdown_content async def download_and_process_file( @@ -236,10 +157,14 @@ async def download_and_process_file( file_name = file.get("name", "Unknown") mime_type = file.get("mimeType", "") - # Skip folders and shortcuts if should_skip_file(mime_type): return None, f"Skipping {mime_type}", None + if not is_google_workspace_file(mime_type): + ext_skip, _unsup_ext = should_skip_by_extension(file_name) + if ext_skip: + return None, f"Skipping unsupported extension: {file_name}", None + logger.info(f"Downloading file: {file_name} ({mime_type})") temp_file_path = None @@ -310,10 +235,13 @@ async def download_and_process_file( "." )[-1] + etl_filename = ( + file_name + extension if is_google_workspace_file(mime_type) else file_name + ) logger.info(f"Processing {file_name} with Surfsense's file processor") await process_file_in_background( file_path=temp_file_path, - filename=file_name, + filename=etl_filename, search_space_id=search_space_id, user_id=user_id, session=session, diff --git a/surfsense_backend/app/connectors/google_drive/file_types.py b/surfsense_backend/app/connectors/google_drive/file_types.py index dd6aff4d7..75dc1d4b3 100644 --- a/surfsense_backend/app/connectors/google_drive/file_types.py +++ b/surfsense_backend/app/connectors/google_drive/file_types.py @@ -1,5 +1,7 @@ """File type handlers for Google Drive.""" +from app.etl_pipeline.file_classifier import should_skip_for_service + GOOGLE_DOC = "application/vnd.google-apps.document" GOOGLE_SHEET = "application/vnd.google-apps.spreadsheet" GOOGLE_SLIDE = "application/vnd.google-apps.presentation" @@ -46,6 +48,21 @@ def should_skip_file(mime_type: str) -> bool: return mime_type in [GOOGLE_FOLDER, GOOGLE_SHORTCUT] +def should_skip_by_extension(filename: str) -> tuple[bool, str | None]: + """Check if the file extension is not parseable by the configured ETL service. + + Returns (should_skip, unsupported_extension_or_None). 
+ """ + from pathlib import PurePosixPath + + from app.config import config as app_config + + if should_skip_for_service(filename, app_config.ETL_SERVICE): + ext = PurePosixPath(filename).suffix.lower() + return True, ext + return False, None + + def get_export_mime_type(mime_type: str) -> str | None: """Get export MIME type for Google Workspace files.""" return EXPORT_FORMATS.get(mime_type) diff --git a/surfsense_backend/app/connectors/onedrive/content_extractor.py b/surfsense_backend/app/connectors/onedrive/content_extractor.py index 8917ba1fd..2238b8603 100644 --- a/surfsense_backend/app/connectors/onedrive/content_extractor.py +++ b/surfsense_backend/app/connectors/onedrive/content_extractor.py @@ -1,16 +1,9 @@ -"""Content extraction for OneDrive files. +"""Content extraction for OneDrive files.""" -Reuses the same ETL parsing logic as Google Drive since file parsing is -extension-based, not provider-specific. -""" - -import asyncio import contextlib import logging import os import tempfile -import threading -import time from pathlib import Path from typing import Any @@ -31,7 +24,8 @@ async def download_and_extract_content( item_id = file.get("id") file_name = file.get("name", "Unknown") - if should_skip_file(file): + skip, _unsup_ext = should_skip_file(file) + if skip: return None, {}, "Skipping non-indexable item" file_info = file.get("file", {}) @@ -84,98 +78,11 @@ async def download_and_extract_content( async def _parse_file_to_markdown(file_path: str, filename: str) -> str: - """Parse a local file to markdown using the configured ETL service. + """Parse a local file to markdown using the unified ETL pipeline.""" + from app.etl_pipeline.etl_document import EtlRequest + from app.etl_pipeline.etl_pipeline_service import EtlPipelineService - Same logic as Google Drive -- file parsing is extension-based. 
- """ - lower = filename.lower() - - if lower.endswith((".md", ".markdown", ".txt")): - with open(file_path, encoding="utf-8") as f: - return f.read() - - if lower.endswith((".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm")): - from litellm import atranscription - - from app.config import config as app_config - - stt_service_type = ( - "local" - if app_config.STT_SERVICE and app_config.STT_SERVICE.startswith("local/") - else "external" - ) - if stt_service_type == "local": - from app.services.stt_service import stt_service - - t0 = time.monotonic() - logger.info( - f"[local-stt] START file={filename} thread={threading.current_thread().name}" - ) - result = await asyncio.to_thread(stt_service.transcribe_file, file_path) - logger.info( - f"[local-stt] END file={filename} elapsed={time.monotonic() - t0:.2f}s" - ) - text = result.get("text", "") - else: - with open(file_path, "rb") as audio_file: - kwargs: dict[str, Any] = { - "model": app_config.STT_SERVICE, - "file": audio_file, - "api_key": app_config.STT_SERVICE_API_KEY, - } - if app_config.STT_SERVICE_API_BASE: - kwargs["api_base"] = app_config.STT_SERVICE_API_BASE - resp = await atranscription(**kwargs) - text = resp.get("text", "") - - if not text: - raise ValueError("Transcription returned empty text") - return f"# Transcription of {filename}\n\n{text}" - - from app.config import config as app_config - - if app_config.ETL_SERVICE == "UNSTRUCTURED": - from langchain_unstructured import UnstructuredLoader - - from app.utils.document_converters import convert_document_to_markdown - - loader = UnstructuredLoader( - file_path, - mode="elements", - post_processors=[], - languages=["eng"], - include_orig_elements=False, - include_metadata=False, - strategy="auto", - ) - docs = await loader.aload() - return await convert_document_to_markdown(docs) - - if app_config.ETL_SERVICE == "LLAMACLOUD": - from app.tasks.document_processors.file_processors import ( - parse_with_llamacloud_retry, - ) - - result = await 
parse_with_llamacloud_retry( - file_path=file_path, estimated_pages=50 - ) - markdown_documents = await result.aget_markdown_documents(split_by_page=False) - if not markdown_documents: - raise RuntimeError(f"LlamaCloud returned no documents for {filename}") - return markdown_documents[0].text - - if app_config.ETL_SERVICE == "DOCLING": - from docling.document_converter import DocumentConverter - - converter = DocumentConverter() - t0 = time.monotonic() - logger.info( - f"[docling] START file={filename} thread={threading.current_thread().name}" - ) - result = await asyncio.to_thread(converter.convert, file_path) - logger.info( - f"[docling] END file={filename} elapsed={time.monotonic() - t0:.2f}s" - ) - return result.document.export_to_markdown() - - raise RuntimeError(f"Unknown ETL_SERVICE: {app_config.ETL_SERVICE}") + result = await EtlPipelineService().extract( + EtlRequest(file_path=file_path, filename=filename) + ) + return result.markdown_content diff --git a/surfsense_backend/app/connectors/onedrive/file_types.py b/surfsense_backend/app/connectors/onedrive/file_types.py index 403fdc337..942b0be73 100644 --- a/surfsense_backend/app/connectors/onedrive/file_types.py +++ b/surfsense_backend/app/connectors/onedrive/file_types.py @@ -1,5 +1,7 @@ """File type handlers for Microsoft OneDrive.""" +from app.etl_pipeline.file_classifier import should_skip_for_service + ONEDRIVE_FOLDER_FACET = "folder" ONENOTE_MIME = "application/msonenote" @@ -38,13 +40,28 @@ def is_folder(item: dict) -> bool: return ONEDRIVE_FOLDER_FACET in item -def should_skip_file(item: dict) -> bool: - """Skip folders, OneNote files, remote items (shared links), and packages.""" +def should_skip_file(item: dict) -> tuple[bool, str | None]: + """Skip folders, OneNote files, remote items, packages, and unsupported extensions. + + Returns (should_skip, unsupported_extension_or_None). + The second element is only set when the skip is due to an unsupported extension. 
+ """ if is_folder(item): - return True + return True, None if "remoteItem" in item: - return True + return True, None if "package" in item: - return True + return True, None mime = item.get("file", {}).get("mimeType", "") - return mime in SKIP_MIME_TYPES + if mime in SKIP_MIME_TYPES: + return True, None + + from pathlib import PurePosixPath + + from app.config import config as app_config + + name = item.get("name", "") + if should_skip_for_service(name, app_config.ETL_SERVICE): + ext = PurePosixPath(name).suffix.lower() + return True, ext + return False, None diff --git a/surfsense_backend/app/connectors/onedrive/folder_manager.py b/surfsense_backend/app/connectors/onedrive/folder_manager.py index 6fa725ca1..a5d7fa713 100644 --- a/surfsense_backend/app/connectors/onedrive/folder_manager.py +++ b/surfsense_backend/app/connectors/onedrive/folder_manager.py @@ -71,8 +71,10 @@ async def get_files_in_folder( ) continue files.extend(sub_files) - elif not should_skip_file(item): - files.append(item) + else: + skip, _unsup_ext = should_skip_file(item) + if not skip: + files.append(item) return files, None diff --git a/surfsense_backend/app/db.py b/surfsense_backend/app/db.py index 90630cc83..4689313f7 100644 --- a/surfsense_backend/app/db.py +++ b/surfsense_backend/app/db.py @@ -64,6 +64,7 @@ class DocumentType(StrEnum): COMPOSIO_GOOGLE_DRIVE_CONNECTOR = "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" COMPOSIO_GMAIL_CONNECTOR = "COMPOSIO_GMAIL_CONNECTOR" COMPOSIO_GOOGLE_CALENDAR_CONNECTOR = "COMPOSIO_GOOGLE_CALENDAR_CONNECTOR" + LOCAL_FOLDER_FILE = "LOCAL_FOLDER_FILE" # Native Google document types → their legacy Composio equivalents. 
@@ -259,6 +260,24 @@ class ImageGenProvider(StrEnum): NSCALE = "NSCALE" +class VisionProvider(StrEnum): + OPENAI = "OPENAI" + ANTHROPIC = "ANTHROPIC" + GOOGLE = "GOOGLE" + AZURE_OPENAI = "AZURE_OPENAI" + VERTEX_AI = "VERTEX_AI" + BEDROCK = "BEDROCK" + XAI = "XAI" + OPENROUTER = "OPENROUTER" + OLLAMA = "OLLAMA" + GROQ = "GROQ" + TOGETHER_AI = "TOGETHER_AI" + FIREWORKS_AI = "FIREWORKS_AI" + DEEPSEEK = "DEEPSEEK" + MISTRAL = "MISTRAL" + CUSTOM = "CUSTOM" + + class LogLevel(StrEnum): DEBUG = "DEBUG" INFO = "INFO" @@ -376,6 +395,11 @@ class Permission(StrEnum): IMAGE_GENERATIONS_READ = "image_generations:read" IMAGE_GENERATIONS_DELETE = "image_generations:delete" + # Vision LLM Configs + VISION_CONFIGS_CREATE = "vision_configs:create" + VISION_CONFIGS_READ = "vision_configs:read" + VISION_CONFIGS_DELETE = "vision_configs:delete" + # Connectors CONNECTORS_CREATE = "connectors:create" CONNECTORS_READ = "connectors:read" @@ -444,6 +468,9 @@ DEFAULT_ROLE_PERMISSIONS = { # Image Generations (create and read, no delete) Permission.IMAGE_GENERATIONS_CREATE.value, Permission.IMAGE_GENERATIONS_READ.value, + # Vision Configs (create and read, no delete) + Permission.VISION_CONFIGS_CREATE.value, + Permission.VISION_CONFIGS_READ.value, # Connectors (no delete) Permission.CONNECTORS_CREATE.value, Permission.CONNECTORS_READ.value, @@ -477,6 +504,8 @@ DEFAULT_ROLE_PERMISSIONS = { Permission.VIDEO_PRESENTATIONS_READ.value, # Image Generations (read only) Permission.IMAGE_GENERATIONS_READ.value, + # Vision Configs (read only) + Permission.VISION_CONFIGS_READ.value, # Connectors (read only) Permission.CONNECTORS_READ.value, # Logs (read only) @@ -955,6 +984,7 @@ class Folder(BaseModel, TimestampMixin): onupdate=lambda: datetime.now(UTC), index=True, ) + folder_metadata = Column("metadata", JSONB, nullable=True) parent = relationship("Folder", remote_side="Folder.id", backref="children") search_space = relationship("SearchSpace", back_populates="folders") @@ -1039,6 +1069,26 @@ class 
Document(BaseModel, TimestampMixin): ) +class DocumentVersion(BaseModel, TimestampMixin): + __tablename__ = "document_versions" + __table_args__ = ( + UniqueConstraint("document_id", "version_number", name="uq_document_version"), + ) + + document_id = Column( + Integer, + ForeignKey("documents.id", ondelete="CASCADE"), + nullable=False, + index=True, + ) + version_number = Column(Integer, nullable=False) + source_markdown = Column(Text, nullable=True) + content_hash = Column(String, nullable=False) + title = Column(String, nullable=True) + + document = relationship("Document", backref="versions") + + class Chunk(BaseModel, TimestampMixin): __tablename__ = "chunks" @@ -1241,6 +1291,35 @@ class ImageGenerationConfig(BaseModel, TimestampMixin): user = relationship("User", back_populates="image_generation_configs") +class VisionLLMConfig(BaseModel, TimestampMixin): + __tablename__ = "vision_llm_configs" + + name = Column(String(100), nullable=False, index=True) + description = Column(String(500), nullable=True) + + provider = Column(SQLAlchemyEnum(VisionProvider), nullable=False) + custom_provider = Column(String(100), nullable=True) + model_name = Column(String(100), nullable=False) + + api_key = Column(String, nullable=False) + api_base = Column(String(500), nullable=True) + api_version = Column(String(50), nullable=True) + + litellm_params = Column(JSON, nullable=True, default={}) + + search_space_id = Column( + Integer, ForeignKey("searchspaces.id", ondelete="CASCADE"), nullable=False + ) + search_space = relationship( + "SearchSpace", back_populates="vision_llm_configs" + ) + + user_id = Column( + UUID(as_uuid=True), ForeignKey("user.id", ondelete="CASCADE"), nullable=False + ) + user = relationship("User", back_populates="vision_llm_configs") + + class ImageGeneration(BaseModel, TimestampMixin): """ Stores image generation requests and results using litellm.aimage_generation(). 
@@ -1329,6 +1408,9 @@ class SearchSpace(BaseModel, TimestampMixin): image_generation_config_id = Column( Integer, nullable=True, default=0 ) # For image generation, defaults to Auto mode + vision_llm_config_id = Column( + Integer, nullable=True, default=0 + ) # For vision/screenshot analysis, defaults to Auto mode user_id = Column( UUID(as_uuid=True), ForeignKey("user.id", ondelete="CASCADE"), nullable=False @@ -1407,6 +1489,12 @@ class SearchSpace(BaseModel, TimestampMixin): order_by="ImageGenerationConfig.id", cascade="all, delete-orphan", ) + vision_llm_configs = relationship( + "VisionLLMConfig", + back_populates="search_space", + order_by="VisionLLMConfig.id", + cascade="all, delete-orphan", + ) # RBAC relationships roles = relationship( @@ -1936,6 +2024,12 @@ if config.AUTH_TYPE == "GOOGLE": passive_deletes=True, ) + vision_llm_configs = relationship( + "VisionLLMConfig", + back_populates="user", + passive_deletes=True, + ) + # User memories for personalized AI responses memories = relationship( "UserMemory", @@ -2050,6 +2144,12 @@ else: passive_deletes=True, ) + vision_llm_configs = relationship( + "VisionLLMConfig", + back_populates="user", + passive_deletes=True, + ) + # User memories for personalized AI responses memories = relationship( "UserMemory", diff --git a/surfsense_backend/app/etl_pipeline/__init__.py b/surfsense_backend/app/etl_pipeline/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/surfsense_backend/app/etl_pipeline/constants.py b/surfsense_backend/app/etl_pipeline/constants.py new file mode 100644 index 000000000..f65759c13 --- /dev/null +++ b/surfsense_backend/app/etl_pipeline/constants.py @@ -0,0 +1,39 @@ +import ssl + +import httpx + +LLAMACLOUD_MAX_RETRIES = 5 +LLAMACLOUD_BASE_DELAY = 10 +LLAMACLOUD_MAX_DELAY = 120 +LLAMACLOUD_RETRYABLE_EXCEPTIONS = ( + ssl.SSLError, + httpx.ConnectError, + httpx.ConnectTimeout, + httpx.ReadError, + httpx.ReadTimeout, + httpx.WriteError, + httpx.WriteTimeout, + 
httpx.RemoteProtocolError, + httpx.LocalProtocolError, + ConnectionError, + ConnectionResetError, + TimeoutError, + OSError, +) + +UPLOAD_BYTES_PER_SECOND_SLOW = 100 * 1024 +MIN_UPLOAD_TIMEOUT = 120 +MAX_UPLOAD_TIMEOUT = 1800 +BASE_JOB_TIMEOUT = 600 +PER_PAGE_JOB_TIMEOUT = 60 + + +def calculate_upload_timeout(file_size_bytes: int) -> float: + estimated_time = (file_size_bytes / UPLOAD_BYTES_PER_SECOND_SLOW) * 1.5 + return max(MIN_UPLOAD_TIMEOUT, min(estimated_time, MAX_UPLOAD_TIMEOUT)) + + +def calculate_job_timeout(estimated_pages: int, file_size_bytes: int) -> float: + page_based_timeout = BASE_JOB_TIMEOUT + (estimated_pages * PER_PAGE_JOB_TIMEOUT) + size_based_timeout = BASE_JOB_TIMEOUT + (file_size_bytes / (10 * 1024 * 1024)) * 60 + return max(page_based_timeout, size_based_timeout) diff --git a/surfsense_backend/app/etl_pipeline/etl_document.py b/surfsense_backend/app/etl_pipeline/etl_document.py new file mode 100644 index 000000000..350c3299f --- /dev/null +++ b/surfsense_backend/app/etl_pipeline/etl_document.py @@ -0,0 +1,21 @@ +from pydantic import BaseModel, field_validator + + +class EtlRequest(BaseModel): + file_path: str + filename: str + estimated_pages: int = 0 + + @field_validator("filename") + @classmethod + def filename_must_not_be_empty(cls, v: str) -> str: + if not v.strip(): + raise ValueError("filename must not be empty") + return v + + +class EtlResult(BaseModel): + markdown_content: str + etl_service: str + actual_pages: int = 0 + content_type: str diff --git a/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py b/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py new file mode 100644 index 000000000..6e7ab3c4c --- /dev/null +++ b/surfsense_backend/app/etl_pipeline/etl_pipeline_service.py @@ -0,0 +1,90 @@ +from app.config import config as app_config +from app.etl_pipeline.etl_document import EtlRequest, EtlResult +from app.etl_pipeline.exceptions import ( + EtlServiceUnavailableError, + EtlUnsupportedFileError, +) +from 
app.etl_pipeline.file_classifier import FileCategory, classify_file +from app.etl_pipeline.parsers.audio import transcribe_audio +from app.etl_pipeline.parsers.direct_convert import convert_file_directly +from app.etl_pipeline.parsers.plaintext import read_plaintext + + +class EtlPipelineService: + """Single pipeline for extracting markdown from files. All callers use this.""" + + async def extract(self, request: EtlRequest) -> EtlResult: + category = classify_file(request.filename) + + if category == FileCategory.UNSUPPORTED: + raise EtlUnsupportedFileError( + f"File type not supported for parsing: {request.filename}" + ) + + if category == FileCategory.PLAINTEXT: + content = read_plaintext(request.file_path) + return EtlResult( + markdown_content=content, + etl_service="PLAINTEXT", + content_type="plaintext", + ) + + if category == FileCategory.DIRECT_CONVERT: + content = convert_file_directly(request.file_path, request.filename) + return EtlResult( + markdown_content=content, + etl_service="DIRECT_CONVERT", + content_type="direct_convert", + ) + + if category == FileCategory.AUDIO: + content = await transcribe_audio(request.file_path, request.filename) + return EtlResult( + markdown_content=content, + etl_service="AUDIO", + content_type="audio", + ) + + return await self._extract_document(request) + + async def _extract_document(self, request: EtlRequest) -> EtlResult: + from pathlib import PurePosixPath + + from app.utils.file_extensions import get_document_extensions_for_service + + etl_service = app_config.ETL_SERVICE + if not etl_service: + raise EtlServiceUnavailableError( + "No ETL_SERVICE configured. 
" + "Set ETL_SERVICE to UNSTRUCTURED, LLAMACLOUD, or DOCLING in your .env" + ) + + ext = PurePosixPath(request.filename).suffix.lower() + supported = get_document_extensions_for_service(etl_service) + if ext not in supported: + raise EtlUnsupportedFileError( + f"File type {ext} is not supported by {etl_service}" + ) + + if etl_service == "DOCLING": + from app.etl_pipeline.parsers.docling import parse_with_docling + + content = await parse_with_docling(request.file_path, request.filename) + elif etl_service == "UNSTRUCTURED": + from app.etl_pipeline.parsers.unstructured import parse_with_unstructured + + content = await parse_with_unstructured(request.file_path) + elif etl_service == "LLAMACLOUD": + from app.etl_pipeline.parsers.llamacloud import parse_with_llamacloud + + content = await parse_with_llamacloud( + request.file_path, request.estimated_pages + ) + else: + raise EtlServiceUnavailableError(f"Unknown ETL_SERVICE: {etl_service}") + + return EtlResult( + markdown_content=content, + etl_service=etl_service, + content_type="document", + ) diff --git a/surfsense_backend/app/etl_pipeline/exceptions.py b/surfsense_backend/app/etl_pipeline/exceptions.py new file mode 100644 index 000000000..26eecbef4 --- /dev/null +++ b/surfsense_backend/app/etl_pipeline/exceptions.py @@ -0,0 +1,10 @@ +class EtlParseError(Exception): + """Raised when an ETL parser fails to produce content.""" + + +class EtlServiceUnavailableError(Exception): + """Raised when the configured ETL_SERVICE is not recognised.""" + + +class EtlUnsupportedFileError(Exception): + """Raised when a file type cannot be parsed by any ETL pipeline.""" diff --git a/surfsense_backend/app/etl_pipeline/file_classifier.py b/surfsense_backend/app/etl_pipeline/file_classifier.py new file mode 100644 index 000000000..4e690bcdc --- /dev/null +++ b/surfsense_backend/app/etl_pipeline/file_classifier.py @@ -0,0 +1,137 @@ +from enum import Enum +from pathlib import PurePosixPath + +from app.utils.file_extensions import ( + 
DOCUMENT_EXTENSIONS, + get_document_extensions_for_service, +) + +PLAINTEXT_EXTENSIONS = frozenset( + { + ".md", + ".markdown", + ".txt", + ".text", + ".json", + ".jsonl", + ".yaml", + ".yml", + ".toml", + ".ini", + ".cfg", + ".conf", + ".xml", + ".css", + ".scss", + ".less", + ".sass", + ".py", + ".pyw", + ".pyi", + ".pyx", + ".js", + ".jsx", + ".ts", + ".tsx", + ".mjs", + ".cjs", + ".java", + ".kt", + ".kts", + ".scala", + ".groovy", + ".c", + ".h", + ".cpp", + ".cxx", + ".cc", + ".hpp", + ".hxx", + ".cs", + ".fs", + ".fsx", + ".go", + ".rs", + ".rb", + ".php", + ".pl", + ".pm", + ".lua", + ".swift", + ".m", + ".mm", + ".r", + ".jl", + ".sh", + ".bash", + ".zsh", + ".fish", + ".bat", + ".cmd", + ".ps1", + ".sql", + ".graphql", + ".gql", + ".env", + ".gitignore", + ".dockerignore", + ".editorconfig", + ".makefile", + ".cmake", + ".log", + ".rst", + ".tex", + ".bib", + ".org", + ".adoc", + ".asciidoc", + ".vue", + ".svelte", + ".astro", + ".tf", + ".hcl", + ".proto", + } +) + +AUDIO_EXTENSIONS = frozenset( + {".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm"} +) + +DIRECT_CONVERT_EXTENSIONS = frozenset({".csv", ".tsv", ".html", ".htm", ".xhtml"}) + + +class FileCategory(Enum): + PLAINTEXT = "plaintext" + AUDIO = "audio" + DIRECT_CONVERT = "direct_convert" + UNSUPPORTED = "unsupported" + DOCUMENT = "document" + + +def classify_file(filename: str) -> FileCategory: + suffix = PurePosixPath(filename).suffix.lower() + if suffix in PLAINTEXT_EXTENSIONS: + return FileCategory.PLAINTEXT + if suffix in AUDIO_EXTENSIONS: + return FileCategory.AUDIO + if suffix in DIRECT_CONVERT_EXTENSIONS: + return FileCategory.DIRECT_CONVERT + if suffix in DOCUMENT_EXTENSIONS: + return FileCategory.DOCUMENT + return FileCategory.UNSUPPORTED + + +def should_skip_for_service(filename: str, etl_service: str | None) -> bool: + """Return True if *filename* cannot be processed by *etl_service*. + + Plaintext, audio, and direct-convert files are parser-agnostic and never + skipped. 
Document files are checked against the per-parser extension set. + """ + category = classify_file(filename) + if category == FileCategory.UNSUPPORTED: + return True + if category == FileCategory.DOCUMENT: + suffix = PurePosixPath(filename).suffix.lower() + return suffix not in get_document_extensions_for_service(etl_service) + return False diff --git a/surfsense_backend/app/etl_pipeline/parsers/__init__.py b/surfsense_backend/app/etl_pipeline/parsers/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/surfsense_backend/app/etl_pipeline/parsers/audio.py b/surfsense_backend/app/etl_pipeline/parsers/audio.py new file mode 100644 index 000000000..cd49bafde --- /dev/null +++ b/surfsense_backend/app/etl_pipeline/parsers/audio.py @@ -0,0 +1,34 @@ +from litellm import atranscription + +from app.config import config as app_config + + +async def transcribe_audio(file_path: str, filename: str) -> str: + stt_service_type = ( + "local" + if app_config.STT_SERVICE and app_config.STT_SERVICE.startswith("local/") + else "external" + ) + + if stt_service_type == "local": + from app.services.stt_service import stt_service + + result = stt_service.transcribe_file(file_path) + text = result.get("text", "") + if not text: + raise ValueError("Transcription returned empty text") + else: + with open(file_path, "rb") as audio_file: + kwargs: dict = { + "model": app_config.STT_SERVICE, + "file": audio_file, + "api_key": app_config.STT_SERVICE_API_KEY, + } + if app_config.STT_SERVICE_API_BASE: + kwargs["api_base"] = app_config.STT_SERVICE_API_BASE + response = await atranscription(**kwargs) + text = response.get("text", "") + if not text: + raise ValueError("Transcription returned empty text") + + return f"# Transcription of {filename}\n\n{text}" diff --git a/surfsense_backend/app/etl_pipeline/parsers/direct_convert.py b/surfsense_backend/app/etl_pipeline/parsers/direct_convert.py new file mode 100644 index 000000000..c9e6e8647 --- /dev/null +++ 
b/surfsense_backend/app/etl_pipeline/parsers/direct_convert.py @@ -0,0 +1,3 @@ +from app.tasks.document_processors._direct_converters import convert_file_directly + +__all__ = ["convert_file_directly"] diff --git a/surfsense_backend/app/etl_pipeline/parsers/docling.py b/surfsense_backend/app/etl_pipeline/parsers/docling.py new file mode 100644 index 000000000..df0498148 --- /dev/null +++ b/surfsense_backend/app/etl_pipeline/parsers/docling.py @@ -0,0 +1,26 @@ +import warnings +from logging import ERROR, getLogger + + +async def parse_with_docling(file_path: str, filename: str) -> str: + from app.services.docling_service import create_docling_service + + docling_service = create_docling_service() + + pdfminer_logger = getLogger("pdfminer") + original_level = pdfminer_logger.level + + with warnings.catch_warnings(): + warnings.filterwarnings("ignore", category=UserWarning, module="pdfminer") + warnings.filterwarnings( + "ignore", message=".*Cannot set gray non-stroke color.*" + ) + warnings.filterwarnings("ignore", message=".*invalid float value.*") + pdfminer_logger.setLevel(ERROR) + + try: + result = await docling_service.process_document(file_path, filename) + finally: + pdfminer_logger.setLevel(original_level) + + return result["content"] diff --git a/surfsense_backend/app/etl_pipeline/parsers/llamacloud.py b/surfsense_backend/app/etl_pipeline/parsers/llamacloud.py new file mode 100644 index 000000000..ae2a34234 --- /dev/null +++ b/surfsense_backend/app/etl_pipeline/parsers/llamacloud.py @@ -0,0 +1,123 @@ +import asyncio +import logging +import os +import random + +import httpx + +from app.config import config as app_config +from app.etl_pipeline.constants import ( + LLAMACLOUD_BASE_DELAY, + LLAMACLOUD_MAX_DELAY, + LLAMACLOUD_MAX_RETRIES, + LLAMACLOUD_RETRYABLE_EXCEPTIONS, + PER_PAGE_JOB_TIMEOUT, + calculate_job_timeout, + calculate_upload_timeout, +) + + +async def parse_with_llamacloud(file_path: str, estimated_pages: int) -> str: + from llama_cloud_services 
import LlamaParse + from llama_cloud_services.parse.utils import ResultType + + file_size_bytes = os.path.getsize(file_path) + file_size_mb = file_size_bytes / (1024 * 1024) + + upload_timeout = calculate_upload_timeout(file_size_bytes) + job_timeout = calculate_job_timeout(estimated_pages, file_size_bytes) + + custom_timeout = httpx.Timeout( + connect=120.0, + read=upload_timeout, + write=upload_timeout, + pool=120.0, + ) + + logging.info( + f"LlamaCloud upload configured: file_size={file_size_mb:.1f}MB, " + f"pages={estimated_pages}, upload_timeout={upload_timeout:.0f}s, " + f"job_timeout={job_timeout:.0f}s" + ) + + last_exception = None + attempt_errors: list[str] = [] + + for attempt in range(1, LLAMACLOUD_MAX_RETRIES + 1): + try: + async with httpx.AsyncClient(timeout=custom_timeout) as custom_client: + parser = LlamaParse( + api_key=app_config.LLAMA_CLOUD_API_KEY, + num_workers=1, + verbose=True, + language="en", + result_type=ResultType.MD, + max_timeout=int(max(2000, job_timeout + upload_timeout)), + job_timeout_in_seconds=job_timeout, + job_timeout_extra_time_per_page_in_seconds=PER_PAGE_JOB_TIMEOUT, + custom_client=custom_client, + ) + result = await parser.aparse(file_path) + + if attempt > 1: + logging.info( + f"LlamaCloud upload succeeded on attempt {attempt} after " + f"{len(attempt_errors)} failures" + ) + + if hasattr(result, "get_markdown_documents"): + markdown_docs = result.get_markdown_documents(split_by_page=False) + if markdown_docs and hasattr(markdown_docs[0], "text"): + return markdown_docs[0].text + if hasattr(result, "pages") and result.pages: + return "\n\n".join( + p.md for p in result.pages if hasattr(p, "md") and p.md + ) + return str(result) + + if isinstance(result, list): + if result and hasattr(result[0], "text"): + return result[0].text + return "\n\n".join( + doc.page_content if hasattr(doc, "page_content") else str(doc) + for doc in result + ) + + return str(result) + + except LLAMACLOUD_RETRYABLE_EXCEPTIONS as e: + 
last_exception = e + error_type = type(e).__name__ + error_msg = str(e)[:200] + attempt_errors.append(f"Attempt {attempt}: {error_type} - {error_msg}") + + if attempt < LLAMACLOUD_MAX_RETRIES: + base_delay = min( + LLAMACLOUD_BASE_DELAY * (2 ** (attempt - 1)), + LLAMACLOUD_MAX_DELAY, + ) + jitter = base_delay * 0.25 * (2 * random.random() - 1) + delay = base_delay + jitter + + logging.warning( + f"LlamaCloud upload failed " + f"(attempt {attempt}/{LLAMACLOUD_MAX_RETRIES}): " + f"{error_type}. File: {file_size_mb:.1f}MB. " + f"Retrying in {delay:.0f}s..." + ) + await asyncio.sleep(delay) + else: + logging.error( + f"LlamaCloud upload failed after {LLAMACLOUD_MAX_RETRIES} " + f"attempts. File size: {file_size_mb:.1f}MB, " + f"Pages: {estimated_pages}. " + f"Errors: {'; '.join(attempt_errors)}" + ) + + except Exception: + raise + + raise last_exception or RuntimeError( + f"LlamaCloud parsing failed after {LLAMACLOUD_MAX_RETRIES} retries. " + f"File size: {file_size_mb:.1f}MB" + ) diff --git a/surfsense_backend/app/etl_pipeline/parsers/plaintext.py b/surfsense_backend/app/etl_pipeline/parsers/plaintext.py new file mode 100644 index 000000000..24bfb71e5 --- /dev/null +++ b/surfsense_backend/app/etl_pipeline/parsers/plaintext.py @@ -0,0 +1,8 @@ +def read_plaintext(file_path: str) -> str: + with open(file_path, encoding="utf-8", errors="replace") as f: + content = f.read() + if "\x00" in content: + raise ValueError( + f"File contains null bytes — likely a binary file opened as text: {file_path}" + ) + return content diff --git a/surfsense_backend/app/etl_pipeline/parsers/unstructured.py b/surfsense_backend/app/etl_pipeline/parsers/unstructured.py new file mode 100644 index 000000000..af8fb99b6 --- /dev/null +++ b/surfsense_backend/app/etl_pipeline/parsers/unstructured.py @@ -0,0 +1,14 @@ +async def parse_with_unstructured(file_path: str) -> str: + from langchain_unstructured import UnstructuredLoader + + loader = UnstructuredLoader( + file_path, + mode="elements", + 
post_processors=[], + languages=["eng"], + include_orig_elements=False, + include_metadata=False, + strategy="auto", + ) + docs = await loader.aload() + return "\n\n".join(doc.page_content for doc in docs if doc.page_content) diff --git a/surfsense_backend/app/indexing_pipeline/exceptions.py b/surfsense_backend/app/indexing_pipeline/exceptions.py index 9155e9baa..666fa4b9f 100644 --- a/surfsense_backend/app/indexing_pipeline/exceptions.py +++ b/surfsense_backend/app/indexing_pipeline/exceptions.py @@ -59,7 +59,7 @@ class PipelineMessages: LLM_AUTH = "LLM authentication failed. Check your API key." LLM_PERMISSION = "LLM request denied. Check your account permissions." - LLM_NOT_FOUND = "LLM model not found. Check your model configuration." + LLM_NOT_FOUND = "Model not found. Check your model configuration." LLM_BAD_REQUEST = "LLM rejected the request. Document content may be invalid." LLM_UNPROCESSABLE = ( "Document exceeds the LLM context window even after optimization." @@ -67,7 +67,7 @@ class PipelineMessages: LLM_RESPONSE = "LLM returned an invalid response." LLM_AUTH = "LLM authentication failed. Check your API key." LLM_PERMISSION = "LLM request denied. Check your account permissions." - LLM_NOT_FOUND = "LLM model not found. Check your model configuration." + LLM_NOT_FOUND = "Model not found. Check your model configuration." LLM_BAD_REQUEST = "LLM rejected the request. Document content may be invalid." LLM_UNPROCESSABLE = ( "Document exceeds the LLM context window even after optimization." 
diff --git a/surfsense_backend/app/routes/__init__.py b/surfsense_backend/app/routes/__init__.py index 1937f11cb..02367606b 100644 --- a/surfsense_backend/app/routes/__init__.py +++ b/surfsense_backend/app/routes/__init__.py @@ -3,6 +3,7 @@ from fastapi import APIRouter from .airtable_add_connector_route import ( router as airtable_add_connector_router, ) +from .autocomplete_routes import router as autocomplete_router from .chat_comments_routes import router as chat_comments_router from .circleback_webhook_route import router as circleback_webhook_router from .clickup_add_connector_route import router as clickup_add_connector_router @@ -48,6 +49,7 @@ from .stripe_routes import router as stripe_router from .surfsense_docs_routes import router as surfsense_docs_router from .teams_add_connector_route import router as teams_add_connector_router from .video_presentations_routes import router as video_presentations_router +from .vision_llm_routes import router as vision_llm_router from .youtube_routes import router as youtube_router router = APIRouter() @@ -67,6 +69,7 @@ router.include_router( ) # Video presentation status and streaming router.include_router(reports_router) # Report CRUD and multi-format export router.include_router(image_generation_router) # Image generation via litellm +router.include_router(vision_llm_router) # Vision LLM configs for screenshot analysis router.include_router(search_source_connectors_router) router.include_router(google_calendar_add_connector_router) router.include_router(google_gmail_add_connector_router) @@ -84,7 +87,7 @@ router.include_router(confluence_add_connector_router) router.include_router(clickup_add_connector_router) router.include_router(dropbox_add_connector_router) router.include_router(new_llm_config_router) # LLM configs with prompt configuration -router.include_router(model_list_router) # Dynamic LLM model catalogue from OpenRouter +router.include_router(model_list_router) # Dynamic model catalogue from OpenRouter 
router.include_router(logs_router) router.include_router(circleback_webhook_router) # Circleback meeting webhooks router.include_router(surfsense_docs_router) # Surfsense documentation for citations @@ -95,3 +98,4 @@ router.include_router(incentive_tasks_router) # Incentive tasks for earning fre router.include_router(stripe_router) # Stripe checkout for additional page packs router.include_router(youtube_router) # YouTube playlist resolution router.include_router(prompts_router) +router.include_router(autocomplete_router) # Lightweight autocomplete with KB context diff --git a/surfsense_backend/app/routes/airtable_add_connector_route.py b/surfsense_backend/app/routes/airtable_add_connector_route.py index fe359d2f3..1e0b1eb5d 100644 --- a/surfsense_backend/app/routes/airtable_add_connector_route.py +++ b/surfsense_backend/app/routes/airtable_add_connector_route.py @@ -1,7 +1,5 @@ import base64 -import hashlib import logging -import secrets from datetime import UTC, datetime, timedelta from uuid import UUID @@ -26,7 +24,11 @@ from app.utils.connector_naming import ( check_duplicate_connector, generate_unique_connector_name, ) -from app.utils.oauth_security import OAuthStateManager, TokenEncryption +from app.utils.oauth_security import ( + OAuthStateManager, + TokenEncryption, + generate_pkce_pair, +) logger = logging.getLogger(__name__) @@ -75,28 +77,6 @@ def make_basic_auth_header(client_id: str, client_secret: str) -> str: return f"Basic {b64}" -def generate_pkce_pair() -> tuple[str, str]: - """ - Generate PKCE code verifier and code challenge. 
- - Returns: - Tuple of (code_verifier, code_challenge) - """ - # Generate code verifier (43-128 characters) - code_verifier = ( - base64.urlsafe_b64encode(secrets.token_bytes(32)).decode("utf-8").rstrip("=") - ) - - # Generate code challenge (SHA256 hash of verifier, base64url encoded) - code_challenge = ( - base64.urlsafe_b64encode(hashlib.sha256(code_verifier.encode("utf-8")).digest()) - .decode("utf-8") - .rstrip("=") - ) - - return code_verifier, code_challenge - - @router.get("/auth/airtable/connector/add") async def connect_airtable(space_id: int, user: User = Depends(current_active_user)): """ diff --git a/surfsense_backend/app/routes/autocomplete_routes.py b/surfsense_backend/app/routes/autocomplete_routes.py new file mode 100644 index 000000000..a11b7dbc1 --- /dev/null +++ b/surfsense_backend/app/routes/autocomplete_routes.py @@ -0,0 +1,45 @@ +from fastapi import APIRouter, Depends +from fastapi.responses import StreamingResponse +from pydantic import BaseModel, Field +from sqlalchemy.ext.asyncio import AsyncSession + +from app.db import User, get_async_session +from app.services.new_streaming_service import VercelStreamingService +from app.services.vision_autocomplete_service import stream_vision_autocomplete +from app.users import current_active_user +from app.utils.rbac import check_search_space_access + +router = APIRouter(prefix="/autocomplete", tags=["autocomplete"]) + +MAX_SCREENSHOT_SIZE = 20 * 1024 * 1024 # 20 MB base64 ceiling + + +class VisionAutocompleteRequest(BaseModel): + screenshot: str = Field(..., max_length=MAX_SCREENSHOT_SIZE) + search_space_id: int + app_name: str = "" + window_title: str = "" + + +@router.post("/vision/stream") +async def vision_autocomplete_stream( + body: VisionAutocompleteRequest, + user: User = Depends(current_active_user), + session: AsyncSession = Depends(get_async_session), +): + await check_search_space_access(session, user, body.search_space_id) + + return StreamingResponse( + stream_vision_autocomplete( + 
body.screenshot, + body.search_space_id, + session, + app_name=body.app_name, + window_title=body.window_title, + ), + media_type="text/event-stream", + headers={ + **VercelStreamingService.get_response_headers(), + "X-Accel-Buffering": "no", + }, + ) diff --git a/surfsense_backend/app/routes/documents_routes.py b/surfsense_backend/app/routes/documents_routes.py index 6e69218f1..5008b1a10 100644 --- a/surfsense_backend/app/routes/documents_routes.py +++ b/surfsense_backend/app/routes/documents_routes.py @@ -1,7 +1,8 @@ # Force asyncio to use standard event loop before unstructured imports import asyncio -from fastapi import APIRouter, Depends, Form, HTTPException, UploadFile +from fastapi import APIRouter, Depends, Form, HTTPException, Query, UploadFile +from pydantic import BaseModel as PydanticBaseModel from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.future import select from sqlalchemy.orm import selectinload @@ -10,6 +11,8 @@ from app.db import ( Chunk, Document, DocumentType, + DocumentVersion, + Folder, Permission, SearchSpace, SearchSpaceMembership, @@ -17,6 +20,7 @@ from app.db import ( get_async_session, ) from app.schemas import ( + ChunkRead, DocumentRead, DocumentsCreate, DocumentStatusBatchResponse, @@ -26,6 +30,7 @@ from app.schemas import ( DocumentTitleSearchResponse, DocumentUpdate, DocumentWithChunksRead, + FolderRead, PaginatedResponse, ) from app.services.task_dispatcher import TaskDispatcher, get_task_dispatcher @@ -45,9 +50,7 @@ os.environ["UNSTRUCTURED_HAS_PATCHED_LOOP"] = "1" router = APIRouter() -MAX_FILES_PER_UPLOAD = 10 -MAX_FILE_SIZE_BYTES = 50 * 1024 * 1024 # 50 MB per file -MAX_TOTAL_SIZE_BYTES = 200 * 1024 * 1024 # 200 MB total +MAX_FILE_SIZE_BYTES = 500 * 1024 * 1024 # 500 MB per file @router.post("/documents") @@ -156,13 +159,6 @@ async def create_documents_file_upload( if not files: raise HTTPException(status_code=400, detail="No files provided") - if len(files) > MAX_FILES_PER_UPLOAD: - raise HTTPException( - 
status_code=413, - detail=f"Too many files. Maximum {MAX_FILES_PER_UPLOAD} files per upload.", - ) - - total_size = 0 for file in files: file_size = file.size or 0 if file_size > MAX_FILE_SIZE_BYTES: @@ -171,14 +167,6 @@ async def create_documents_file_upload( detail=f"File '{file.filename}' ({file_size / (1024 * 1024):.1f} MB) " f"exceeds the {MAX_FILE_SIZE_BYTES // (1024 * 1024)} MB per-file limit.", ) - total_size += file_size - - if total_size > MAX_TOTAL_SIZE_BYTES: - raise HTTPException( - status_code=413, - detail=f"Total upload size ({total_size / (1024 * 1024):.1f} MB) " - f"exceeds the {MAX_TOTAL_SIZE_BYTES // (1024 * 1024)} MB limit.", - ) # ===== Read all files concurrently to avoid blocking the event loop ===== async def _read_and_save(file: UploadFile) -> tuple[str, str, int]: @@ -206,16 +194,6 @@ async def create_documents_file_upload( saved_files = await asyncio.gather(*(_read_and_save(f) for f in files)) - actual_total_size = sum(size for _, _, size in saved_files) - if actual_total_size > MAX_TOTAL_SIZE_BYTES: - for temp_path, _, _ in saved_files: - os.unlink(temp_path) - raise HTTPException( - status_code=413, - detail=f"Total upload size ({actual_total_size / (1024 * 1024):.1f} MB) " - f"exceeds the {MAX_TOTAL_SIZE_BYTES // (1024 * 1024)} MB limit.", - ) - # ===== PHASE 1: Create pending documents for all files ===== created_documents: list[Document] = [] files_to_process: list[tuple[Document, str, str]] = [] @@ -451,13 +429,15 @@ async def read_documents( reason=doc.status.get("reason"), ) + raw_content = doc.content or "" api_documents.append( DocumentRead( id=doc.id, title=doc.title, document_type=doc.document_type, document_metadata=doc.document_metadata, - content=doc.content, + content="", + content_preview=raw_content[:300], content_hash=doc.content_hash, unique_identifier_hash=doc.unique_identifier_hash, created_at=doc.created_at, @@ -609,13 +589,15 @@ async def search_documents( reason=doc.status.get("reason"), ) + raw_content = 
doc.content or "" api_documents.append( DocumentRead( id=doc.id, title=doc.title, document_type=doc.document_type, document_metadata=doc.document_metadata, - content=doc.content, + content="", + content_preview=raw_content[:300], content_hash=doc.content_hash, unique_identifier_hash=doc.unique_identifier_hash, created_at=doc.created_at, @@ -884,16 +866,19 @@ async def get_document_type_counts( @router.get("/documents/by-chunk/{chunk_id}", response_model=DocumentWithChunksRead) async def get_document_by_chunk_id( chunk_id: int, + chunk_window: int = Query( + 5, ge=0, description="Number of chunks before/after the cited chunk to include" + ), session: AsyncSession = Depends(get_async_session), user: User = Depends(current_active_user), ): """ - Retrieves a document based on a chunk ID, including all its chunks ordered by creation time. - Requires DOCUMENTS_READ permission for the search space. - The document's embedding and chunk embeddings are excluded from the response. + Retrieves a document based on a chunk ID, including a window of chunks around the cited one. + Uses SQL-level pagination to avoid loading all chunks into memory. 
""" try: - # First, get the chunk and verify it exists + from sqlalchemy import and_, func, or_ + chunk_result = await session.execute(select(Chunk).filter(Chunk.id == chunk_id)) chunk = chunk_result.scalars().first() @@ -902,11 +887,8 @@ async def get_document_by_chunk_id( status_code=404, detail=f"Chunk with id {chunk_id} not found" ) - # Get the associated document document_result = await session.execute( - select(Document) - .options(selectinload(Document.chunks)) - .filter(Document.id == chunk.document_id) + select(Document).filter(Document.id == chunk.document_id) ) document = document_result.scalars().first() @@ -916,7 +898,6 @@ async def get_document_by_chunk_id( detail="Document not found", ) - # Check permission for the search space await check_permission( session, user, @@ -925,10 +906,38 @@ async def get_document_by_chunk_id( "You don't have permission to read documents in this search space", ) - # Sort chunks by creation time - sorted_chunks = sorted(document.chunks, key=lambda x: x.created_at) + total_result = await session.execute( + select(func.count()) + .select_from(Chunk) + .filter(Chunk.document_id == document.id) + ) + total_chunks = total_result.scalar() or 0 + + cited_idx_result = await session.execute( + select(func.count()) + .select_from(Chunk) + .filter( + Chunk.document_id == document.id, + or_( + Chunk.created_at < chunk.created_at, + and_(Chunk.created_at == chunk.created_at, Chunk.id < chunk.id), + ), + ) + ) + cited_idx = cited_idx_result.scalar() or 0 + + start = max(0, cited_idx - chunk_window) + end = min(total_chunks, cited_idx + chunk_window + 1) + + windowed_result = await session.execute( + select(Chunk) + .filter(Chunk.document_id == document.id) + .order_by(Chunk.created_at, Chunk.id) + .offset(start) + .limit(end - start) + ) + windowed_chunks = windowed_result.scalars().all() - # Return the document with its chunks return DocumentWithChunksRead( id=document.id, title=document.title, @@ -940,7 +949,9 @@ async def 
get_document_by_chunk_id( created_at=document.created_at, updated_at=document.updated_at, search_space_id=document.search_space_id, - chunks=sorted_chunks, + chunks=windowed_chunks, + total_chunks=total_chunks, + chunk_start_index=start, ) except HTTPException: raise @@ -950,6 +961,108 @@ async def get_document_by_chunk_id( ) from e +@router.get("/documents/watched-folders", response_model=list[FolderRead]) +async def get_watched_folders( + search_space_id: int, + session: AsyncSession = Depends(get_async_session), + user: User = Depends(current_active_user), +): + """Return root folders that are marked as watched (metadata->>'watched' = 'true').""" + await check_permission( + session, + user, + search_space_id, + Permission.DOCUMENTS_READ.value, + "You don't have permission to read documents in this search space", + ) + + folders = ( + ( + await session.execute( + select(Folder).where( + Folder.search_space_id == search_space_id, + Folder.parent_id.is_(None), + Folder.folder_metadata.isnot(None), + Folder.folder_metadata["watched"].astext == "true", + ) + ) + ) + .scalars() + .all() + ) + + return folders + + +@router.get( + "/documents/{document_id}/chunks", + response_model=PaginatedResponse[ChunkRead], +) +async def get_document_chunks_paginated( + document_id: int, + page: int = Query(0, ge=0), + page_size: int = Query(20, ge=1, le=100), + start_offset: int | None = Query( + None, ge=0, description="Direct offset; overrides page * page_size" + ), + session: AsyncSession = Depends(get_async_session), + user: User = Depends(current_active_user), +): + """ + Paginated chunk loading for a document. + Supports both page-based and offset-based access. 
+ """ + try: + from sqlalchemy import func + + doc_result = await session.execute( + select(Document).filter(Document.id == document_id) + ) + document = doc_result.scalars().first() + + if not document: + raise HTTPException(status_code=404, detail="Document not found") + + await check_permission( + session, + user, + document.search_space_id, + Permission.DOCUMENTS_READ.value, + "You don't have permission to read documents in this search space", + ) + + total_result = await session.execute( + select(func.count()) + .select_from(Chunk) + .filter(Chunk.document_id == document_id) + ) + total = total_result.scalar() or 0 + + offset = start_offset if start_offset is not None else page * page_size + chunks_result = await session.execute( + select(Chunk) + .filter(Chunk.document_id == document_id) + .order_by(Chunk.created_at, Chunk.id) + .offset(offset) + .limit(page_size) + ) + chunks = chunks_result.scalars().all() + + return PaginatedResponse( + items=chunks, + total=total, + page=offset // page_size if page_size else page, + page_size=page_size, + has_more=(offset + len(chunks)) < total, + ) + except HTTPException: + raise + except Exception as e: + raise HTTPException( + status_code=500, detail=f"Failed to fetch chunks: {e!s}" + ) from e + + @router.get("/documents/{document_id}", response_model=DocumentRead) async def read_document( document_id: int, @@ -980,13 +1093,14 @@ async def read_document( "You don't have permission to read documents in this search space", ) - # Convert database object to API-friendly format + raw_content = document.content or "" return DocumentRead( id=document.id, title=document.title, document_type=document.document_type, document_metadata=document.document_metadata, - content=document.content, + content=raw_content, + content_preview=raw_content[:300], content_hash=document.content_hash, unique_identifier_hash=document.unique_identifier_hash, created_at=document.created_at, @@ -1135,3 +1249,297 @@ async def delete_document( raise 
HTTPException( status_code=500, detail=f"Failed to delete document: {e!s}" ) from e + + +# ==================================================================== +# Version History Endpoints +# ==================================================================== + + +@router.get("/documents/{document_id}/versions") +async def list_document_versions( + document_id: int, + session: AsyncSession = Depends(get_async_session), + user: User = Depends(current_active_user), +): + """List all versions for a document, ordered by version_number descending.""" + document = ( + await session.execute(select(Document).where(Document.id == document_id)) + ).scalar_one_or_none() + if not document: + raise HTTPException(status_code=404, detail="Document not found") + + await check_permission( + session, user, document.search_space_id, Permission.DOCUMENTS_READ.value + ) + + versions = ( + ( + await session.execute( + select(DocumentVersion) + .where(DocumentVersion.document_id == document_id) + .order_by(DocumentVersion.version_number.desc()) + ) + ) + .scalars() + .all() + ) + + return [ + { + "version_number": v.version_number, + "title": v.title, + "content_hash": v.content_hash, + "created_at": v.created_at.isoformat() if v.created_at else None, + } + for v in versions + ] + + +@router.get("/documents/{document_id}/versions/{version_number}") +async def get_document_version( + document_id: int, + version_number: int, + session: AsyncSession = Depends(get_async_session), + user: User = Depends(current_active_user), +): + """Get full version content including source_markdown.""" + document = ( + await session.execute(select(Document).where(Document.id == document_id)) + ).scalar_one_or_none() + if not document: + raise HTTPException(status_code=404, detail="Document not found") + + await check_permission( + session, user, document.search_space_id, Permission.DOCUMENTS_READ.value + ) + + version = ( + await session.execute( + select(DocumentVersion).where( + 
DocumentVersion.document_id == document_id, + DocumentVersion.version_number == version_number, + ) + ) + ).scalar_one_or_none() + if not version: + raise HTTPException(status_code=404, detail="Version not found") + + return { + "version_number": version.version_number, + "title": version.title, + "content_hash": version.content_hash, + "source_markdown": version.source_markdown, + "created_at": version.created_at.isoformat() if version.created_at else None, + } + + +@router.post("/documents/{document_id}/versions/{version_number}/restore") +async def restore_document_version( + document_id: int, + version_number: int, + session: AsyncSession = Depends(get_async_session), + user: User = Depends(current_active_user), +): + """Restore a previous version: snapshot current state, then overwrite document content.""" + document = ( + await session.execute(select(Document).where(Document.id == document_id)) + ).scalar_one_or_none() + if not document: + raise HTTPException(status_code=404, detail="Document not found") + + await check_permission( + session, user, document.search_space_id, Permission.DOCUMENTS_UPDATE.value + ) + + version = ( + await session.execute( + select(DocumentVersion).where( + DocumentVersion.document_id == document_id, + DocumentVersion.version_number == version_number, + ) + ) + ).scalar_one_or_none() + if not version: + raise HTTPException(status_code=404, detail="Version not found") + + # Snapshot current state before restoring + from app.utils.document_versioning import create_version_snapshot + + await create_version_snapshot(session, document) + + # Restore the version's content onto the document + document.source_markdown = version.source_markdown + document.title = version.title or document.title + document.content_needs_reindexing = True + await session.commit() + + from app.tasks.celery_tasks.document_reindex_tasks import reindex_document_task + + reindex_document_task.delay(document_id, str(user.id)) + + return { + "message": f"Restored 
version {version_number}", + "document_id": document_id, + "restored_version": version_number, + } + + +# ===== Local folder indexing endpoints ===== + + +class FolderIndexRequest(PydanticBaseModel): + folder_path: str + folder_name: str + search_space_id: int + exclude_patterns: list[str] | None = None + file_extensions: list[str] | None = None + root_folder_id: int | None = None + enable_summary: bool = False + + +class FolderIndexFilesRequest(PydanticBaseModel): + folder_path: str + folder_name: str + search_space_id: int + target_file_paths: list[str] + root_folder_id: int | None = None + enable_summary: bool = False + + +@router.post("/documents/folder-index") +async def folder_index( + request: FolderIndexRequest, + session: AsyncSession = Depends(get_async_session), + user: User = Depends(current_active_user), +): + """Full-scan index of a local folder. Creates the root Folder row synchronously + and dispatches the heavy indexing work to a Celery task. + Returns the root_folder_id so the desktop can persist it. 
+ """ + from app.config import config as app_config + + if not app_config.is_self_hosted(): + raise HTTPException( + status_code=400, + detail="Local folder indexing is only available in self-hosted mode", + ) + + await check_permission( + session, + user, + request.search_space_id, + Permission.DOCUMENTS_CREATE.value, + "You don't have permission to create documents in this search space", + ) + + watched_metadata = { + "watched": True, + "folder_path": request.folder_path, + "exclude_patterns": request.exclude_patterns, + "file_extensions": request.file_extensions, + } + + root_folder_id = request.root_folder_id + if root_folder_id: + existing = ( + await session.execute(select(Folder).where(Folder.id == root_folder_id)) + ).scalar_one_or_none() + if not existing: + root_folder_id = None + else: + existing.folder_metadata = watched_metadata + await session.commit() + + if not root_folder_id: + root_folder = Folder( + name=request.folder_name, + search_space_id=request.search_space_id, + created_by_id=str(user.id), + position="a0", + folder_metadata=watched_metadata, + ) + session.add(root_folder) + await session.flush() + root_folder_id = root_folder.id + await session.commit() + + from app.tasks.celery_tasks.document_tasks import index_local_folder_task + + index_local_folder_task.delay( + search_space_id=request.search_space_id, + user_id=str(user.id), + folder_path=request.folder_path, + folder_name=request.folder_name, + exclude_patterns=request.exclude_patterns, + file_extensions=request.file_extensions, + root_folder_id=root_folder_id, + enable_summary=request.enable_summary, + ) + + return { + "message": "Folder indexing started", + "status": "processing", + "root_folder_id": root_folder_id, + } + + +@router.post("/documents/folder-index-files") +async def folder_index_files( + request: FolderIndexFilesRequest, + session: AsyncSession = Depends(get_async_session), + user: User = Depends(current_active_user), +): + """Index multiple files within a watched 
folder (batched chokidar trigger). + Validates that all target_file_paths are under folder_path. + Dispatches a single Celery task that processes them in parallel. + """ + from app.config import config as app_config + + if not app_config.is_self_hosted(): + raise HTTPException( + status_code=400, + detail="Local folder indexing is only available in self-hosted mode", + ) + + if not request.target_file_paths: + raise HTTPException( + status_code=400, detail="target_file_paths must not be empty" + ) + + await check_permission( + session, + user, + request.search_space_id, + Permission.DOCUMENTS_CREATE.value, + "You don't have permission to create documents in this search space", + ) + + from pathlib import Path + + for fp in request.target_file_paths: + try: + Path(fp).relative_to(request.folder_path) + except ValueError as err: + raise HTTPException( + status_code=400, + detail=f"target_file_path {fp} must be inside folder_path", + ) from err + + from app.tasks.celery_tasks.document_tasks import index_local_folder_task + + index_local_folder_task.delay( + search_space_id=request.search_space_id, + user_id=str(user.id), + folder_path=request.folder_path, + folder_name=request.folder_name, + target_file_paths=request.target_file_paths, + root_folder_id=request.root_folder_id, + enable_summary=request.enable_summary, + ) + + return { + "message": f"Batch indexing started for {len(request.target_file_paths)} file(s)", + "status": "processing", + "file_count": len(request.target_file_paths), + } diff --git a/surfsense_backend/app/routes/dropbox_add_connector_route.py b/surfsense_backend/app/routes/dropbox_add_connector_route.py index 941e5c00f..1dba64467 100644 --- a/surfsense_backend/app/routes/dropbox_add_connector_route.py +++ b/surfsense_backend/app/routes/dropbox_add_connector_route.py @@ -311,9 +311,11 @@ async def dropbox_callback( ) existing_cursor = db_connector.config.get("cursor") + existing_folder_cursors = db_connector.config.get("folder_cursors") 
db_connector.config = { **connector_config, "cursor": existing_cursor, + "folder_cursors": existing_folder_cursors, "auth_expired": False, } flag_modified(db_connector, "config") diff --git a/surfsense_backend/app/routes/editor_routes.py b/surfsense_backend/app/routes/editor_routes.py index f54f18def..829b2cf69 100644 --- a/surfsense_backend/app/routes/editor_routes.py +++ b/surfsense_backend/app/routes/editor_routes.py @@ -15,11 +15,10 @@ import pypandoc import typst from fastapi import APIRouter, Depends, HTTPException, Query from fastapi.responses import StreamingResponse -from sqlalchemy import select +from sqlalchemy import func, select from sqlalchemy.ext.asyncio import AsyncSession -from sqlalchemy.orm import selectinload -from app.db import Document, DocumentType, Permission, User, get_async_session +from app.db import Chunk, Document, DocumentType, Permission, User, get_async_session from app.routes.reports_routes import ( _FILE_EXTENSIONS, _MEDIA_TYPES, @@ -44,6 +43,9 @@ router = APIRouter() async def get_editor_content( search_space_id: int, document_id: int, + max_length: int | None = Query( + None, description="Truncate source_markdown to this many characters" + ), session: AsyncSession = Depends(get_async_session), user: User = Depends(current_active_user), ): @@ -65,9 +67,7 @@ async def get_editor_content( ) result = await session.execute( - select(Document) - .options(selectinload(Document.chunks)) - .filter( + select(Document).filter( Document.id == document_id, Document.search_space_id == search_space_id, ) @@ -77,80 +77,152 @@ async def get_editor_content( if not document: raise HTTPException(status_code=404, detail="Document not found") - # Priority 1: Return source_markdown if it exists (check `is not None` to allow empty strings) - if document.source_markdown is not None: + count_result = await session.execute( + select(func.count()).select_from(Chunk).filter(Chunk.document_id == document_id) + ) + chunk_count = count_result.scalar() or 0 + + 
def _build_response(md: str) -> dict: + size_bytes = len(md.encode("utf-8")) + truncated = False + output_md = md + if max_length is not None and size_bytes > max_length: + output_md = md[:max_length] + truncated = True return { "document_id": document.id, "title": document.title, "document_type": document.document_type.value, - "source_markdown": document.source_markdown, + "source_markdown": output_md, + "content_size_bytes": size_bytes, + "chunk_count": chunk_count, + "truncated": truncated, "updated_at": document.updated_at.isoformat() if document.updated_at else None, } - # Priority 2: Lazy-migrate from blocknote_document (pure Python, no external deps) + if document.source_markdown is not None: + return _build_response(document.source_markdown) + if document.blocknote_document: from app.utils.blocknote_to_markdown import blocknote_to_markdown markdown = blocknote_to_markdown(document.blocknote_document) if markdown: - # Persist the migration so we don't repeat it document.source_markdown = markdown await session.commit() - return { - "document_id": document.id, - "title": document.title, - "document_type": document.document_type.value, - "source_markdown": markdown, - "updated_at": document.updated_at.isoformat() - if document.updated_at - else None, - } + return _build_response(markdown) - # Priority 3: For NOTE type with no content, return empty markdown if document.document_type == DocumentType.NOTE: empty_markdown = "" document.source_markdown = empty_markdown await session.commit() - return { - "document_id": document.id, - "title": document.title, - "document_type": document.document_type.value, - "source_markdown": empty_markdown, - "updated_at": document.updated_at.isoformat() - if document.updated_at - else None, - } + return _build_response(empty_markdown) - # Priority 4: Reconstruct from chunks - chunks = sorted(document.chunks, key=lambda c: c.id) + chunk_contents_result = await session.execute( + select(Chunk.content) + .filter(Chunk.document_id 
== document_id) + .order_by(Chunk.id) + ) + chunk_contents = chunk_contents_result.scalars().all() - if not chunks: + if not chunk_contents: + doc_status = document.status or {} + state = ( + doc_status.get("state", "ready") + if isinstance(doc_status, dict) + else "ready" + ) + if state in ("pending", "processing"): + raise HTTPException( + status_code=409, + detail="This document is still being processed. Please wait a moment and try again.", + ) raise HTTPException( status_code=400, - detail="This document has no content and cannot be edited. Please re-upload to enable editing.", + detail="This document has no viewable content yet. It may still be syncing. Try again in a few seconds, or re-upload if the issue persists.", ) - markdown_content = "\n\n".join(chunk.content for chunk in chunks) + markdown_content = "\n\n".join(chunk_contents) if not markdown_content.strip(): raise HTTPException( status_code=400, - detail="This document has empty content and cannot be edited.", + detail="This document appears to be empty. Try re-uploading or editing it to add content.", ) - # Persist the lazy migration document.source_markdown = markdown_content await session.commit() - return { - "document_id": document.id, - "title": document.title, - "document_type": document.document_type.value, - "source_markdown": markdown_content, - "updated_at": document.updated_at.isoformat() if document.updated_at else None, - } + return _build_response(markdown_content) + + +@router.get( + "/search-spaces/{search_space_id}/documents/{document_id}/download-markdown" +) +async def download_document_markdown( + search_space_id: int, + document_id: int, + session: AsyncSession = Depends(get_async_session), + user: User = Depends(current_active_user), +): + """ + Download the full document content as a .md file. + Reconstructs markdown from source_markdown or chunks. 
+ """ + await check_permission( + session, + user, + search_space_id, + Permission.DOCUMENTS_READ.value, + "You don't have permission to read documents in this search space", + ) + + result = await session.execute( + select(Document).filter( + Document.id == document_id, + Document.search_space_id == search_space_id, + ) + ) + document = result.scalars().first() + + if not document: + raise HTTPException(status_code=404, detail="Document not found") + + markdown: str | None = document.source_markdown + if markdown is None and document.blocknote_document: + from app.utils.blocknote_to_markdown import blocknote_to_markdown + + markdown = blocknote_to_markdown(document.blocknote_document) + if markdown is None: + chunk_contents_result = await session.execute( + select(Chunk.content) + .filter(Chunk.document_id == document_id) + .order_by(Chunk.id) + ) + chunk_contents = chunk_contents_result.scalars().all() + if chunk_contents: + markdown = "\n\n".join(chunk_contents) + + if not markdown or not markdown.strip(): + raise HTTPException( + status_code=400, detail="Document has no content to download" + ) + + safe_title = ( + "".join( + c if c.isalnum() or c in " -_" else "_" + for c in (document.title or "document") + ).strip()[:80] + or "document" + ) + + return StreamingResponse( + io.BytesIO(markdown.encode("utf-8")), + media_type="text/markdown; charset=utf-8", + headers={"Content-Disposition": f'attachment; filename="{safe_title}.md"'}, + ) @router.post("/search-spaces/{search_space_id}/documents/{document_id}/save") @@ -258,9 +330,7 @@ async def export_document( ) result = await session.execute( - select(Document) - .options(selectinload(Document.chunks)) - .filter( + select(Document).filter( Document.id == document_id, Document.search_space_id == search_space_id, ) @@ -269,16 +339,20 @@ async def export_document( if not document: raise HTTPException(status_code=404, detail="Document not found") - # Resolve markdown content (same priority as editor-content 
endpoint) markdown_content: str | None = document.source_markdown if markdown_content is None and document.blocknote_document: from app.utils.blocknote_to_markdown import blocknote_to_markdown markdown_content = blocknote_to_markdown(document.blocknote_document) if markdown_content is None: - chunks = sorted(document.chunks, key=lambda c: c.id) - if chunks: - markdown_content = "\n\n".join(chunk.content for chunk in chunks) + chunk_contents_result = await session.execute( + select(Chunk.content) + .filter(Chunk.document_id == document_id) + .order_by(Chunk.id) + ) + chunk_contents = chunk_contents_result.scalars().all() + if chunk_contents: + markdown_content = "\n\n".join(chunk_contents) if not markdown_content or not markdown_content.strip(): raise HTTPException(status_code=400, detail="Document has no content to export") diff --git a/surfsense_backend/app/routes/folders_routes.py b/surfsense_backend/app/routes/folders_routes.py index d688e692a..2dc9bceac 100644 --- a/surfsense_backend/app/routes/folders_routes.py +++ b/surfsense_backend/app/routes/folders_routes.py @@ -192,6 +192,33 @@ async def get_folder_breadcrumb( ) from e +@router.patch("/folders/{folder_id}/watched") +async def stop_watching_folder( + folder_id: int, + session: AsyncSession = Depends(get_async_session), + user: User = Depends(current_active_user), +): + """Clear the watched flag from a folder's metadata.""" + folder = await session.get(Folder, folder_id) + if not folder: + raise HTTPException(status_code=404, detail="Folder not found") + + await check_permission( + session, + user, + folder.search_space_id, + Permission.DOCUMENTS_UPDATE.value, + "You don't have permission to update folders in this search space", + ) + + if folder.folder_metadata and isinstance(folder.folder_metadata, dict): + updated = {**folder.folder_metadata, "watched": False} + folder.folder_metadata = updated + await session.commit() + + return {"message": "Folder watch status updated"} + + 
@router.put("/folders/{folder_id}", response_model=FolderRead) async def update_folder( folder_id: int, @@ -340,7 +367,7 @@ async def delete_folder( session: AsyncSession = Depends(get_async_session), user: User = Depends(current_active_user), ): - """Delete a folder and cascade-delete subfolders. Documents are async-deleted via Celery.""" + """Mark documents for deletion and dispatch Celery to delete docs first, then folders.""" try: folder = await session.get(Folder, folder_id) if not folder: @@ -372,30 +399,29 @@ async def delete_folder( ) await session.commit() - await session.execute(Folder.__table__.delete().where(Folder.id == folder_id)) - await session.commit() + try: + from app.tasks.celery_tasks.document_tasks import ( + delete_folder_documents_task, + ) - if document_ids: - try: - from app.tasks.celery_tasks.document_tasks import ( - delete_folder_documents_task, - ) - - delete_folder_documents_task.delay(document_ids) - except Exception as err: + delete_folder_documents_task.delay( + document_ids, folder_subtree_ids=list(subtree_ids) + ) + except Exception as err: + if document_ids: await session.execute( Document.__table__.update() .where(Document.id.in_(document_ids)) .values(status={"state": "ready"}) ) await session.commit() - raise HTTPException( - status_code=503, - detail="Folder deleted but document cleanup could not be queued. Documents have been restored.", - ) from err + raise HTTPException( + status_code=503, + detail="Could not queue folder deletion. 
Documents have been restored.", + ) from err return { - "message": "Folder deleted successfully", + "message": "Folder deletion started", "documents_queued_for_deletion": len(document_ids), } diff --git a/surfsense_backend/app/routes/google_calendar_add_connector_route.py b/surfsense_backend/app/routes/google_calendar_add_connector_route.py index 9a2308bec..d7ccf62ca 100644 --- a/surfsense_backend/app/routes/google_calendar_add_connector_route.py +++ b/surfsense_backend/app/routes/google_calendar_add_connector_route.py @@ -28,7 +28,11 @@ from app.utils.connector_naming import ( check_duplicate_connector, generate_unique_connector_name, ) -from app.utils.oauth_security import OAuthStateManager, TokenEncryption +from app.utils.oauth_security import ( + OAuthStateManager, + TokenEncryption, + generate_code_verifier, +) logger = logging.getLogger(__name__) @@ -96,9 +100,14 @@ async def connect_calendar(space_id: int, user: User = Depends(current_active_us flow = get_google_flow() - # Generate secure state parameter with HMAC signature + code_verifier = generate_code_verifier() + flow.code_verifier = code_verifier + + # Generate secure state parameter with HMAC signature (includes PKCE code_verifier) state_manager = get_state_manager() - state_encoded = state_manager.generate_secure_state(space_id, user.id) + state_encoded = state_manager.generate_secure_state( + space_id, user.id, code_verifier=code_verifier + ) auth_url, _ = flow.authorization_url( access_type="offline", @@ -146,8 +155,11 @@ async def reauth_calendar( flow = get_google_flow() + code_verifier = generate_code_verifier() + flow.code_verifier = code_verifier + state_manager = get_state_manager() - extra: dict = {"connector_id": connector_id} + extra: dict = {"connector_id": connector_id, "code_verifier": code_verifier} if return_url and return_url.startswith("/"): extra["return_url"] = return_url state_encoded = state_manager.generate_secure_state(space_id, user.id, **extra) @@ -225,6 +237,7 @@ async def 
calendar_callback( user_id = UUID(data["user_id"]) space_id = data["space_id"] + code_verifier = data.get("code_verifier") # Validate redirect URI (security: ensure it matches configured value) if not config.GOOGLE_CALENDAR_REDIRECT_URI: @@ -233,6 +246,7 @@ async def calendar_callback( ) flow = get_google_flow() + flow.code_verifier = code_verifier flow.fetch_token(code=code) creds = flow.credentials diff --git a/surfsense_backend/app/routes/google_drive_add_connector_route.py b/surfsense_backend/app/routes/google_drive_add_connector_route.py index 1c9391610..8706326b7 100644 --- a/surfsense_backend/app/routes/google_drive_add_connector_route.py +++ b/surfsense_backend/app/routes/google_drive_add_connector_route.py @@ -41,7 +41,11 @@ from app.utils.connector_naming import ( check_duplicate_connector, generate_unique_connector_name, ) -from app.utils.oauth_security import OAuthStateManager, TokenEncryption +from app.utils.oauth_security import ( + OAuthStateManager, + TokenEncryption, + generate_code_verifier, +) # Relax token scope validation for Google OAuth os.environ["OAUTHLIB_RELAX_TOKEN_SCOPE"] = "1" @@ -127,14 +131,19 @@ async def connect_drive(space_id: int, user: User = Depends(current_active_user) flow = get_google_flow() - # Generate secure state parameter with HMAC signature + code_verifier = generate_code_verifier() + flow.code_verifier = code_verifier + + # Generate secure state parameter with HMAC signature (includes PKCE code_verifier) state_manager = get_state_manager() - state_encoded = state_manager.generate_secure_state(space_id, user.id) + state_encoded = state_manager.generate_secure_state( + space_id, user.id, code_verifier=code_verifier + ) # Generate authorization URL auth_url, _ = flow.authorization_url( - access_type="offline", # Get refresh token - prompt="consent", # Force consent screen to get refresh token + access_type="offline", + prompt="consent", include_granted_scopes="true", state=state_encoded, ) @@ -193,8 +202,11 @@ async def 
reauth_drive( flow = get_google_flow() + code_verifier = generate_code_verifier() + flow.code_verifier = code_verifier + state_manager = get_state_manager() - extra: dict = {"connector_id": connector_id} + extra: dict = {"connector_id": connector_id, "code_verifier": code_verifier} if return_url and return_url.startswith("/"): extra["return_url"] = return_url state_encoded = state_manager.generate_secure_state(space_id, user.id, **extra) @@ -285,6 +297,7 @@ async def drive_callback( space_id = data["space_id"] reauth_connector_id = data.get("connector_id") reauth_return_url = data.get("return_url") + code_verifier = data.get("code_verifier") logger.info( f"Processing Google Drive callback for user {user_id}, space {space_id}" @@ -296,8 +309,9 @@ async def drive_callback( status_code=500, detail="GOOGLE_DRIVE_REDIRECT_URI not configured" ) - # Exchange authorization code for tokens + # Exchange authorization code for tokens (restore PKCE code_verifier from state) flow = get_google_flow() + flow.code_verifier = code_verifier flow.fetch_token(code=code) creds = flow.credentials diff --git a/surfsense_backend/app/routes/google_gmail_add_connector_route.py b/surfsense_backend/app/routes/google_gmail_add_connector_route.py index 750a64819..dd8feb1c7 100644 --- a/surfsense_backend/app/routes/google_gmail_add_connector_route.py +++ b/surfsense_backend/app/routes/google_gmail_add_connector_route.py @@ -28,7 +28,11 @@ from app.utils.connector_naming import ( check_duplicate_connector, generate_unique_connector_name, ) -from app.utils.oauth_security import OAuthStateManager, TokenEncryption +from app.utils.oauth_security import ( + OAuthStateManager, + TokenEncryption, + generate_code_verifier, +) logger = logging.getLogger(__name__) @@ -109,9 +113,14 @@ async def connect_gmail(space_id: int, user: User = Depends(current_active_user) flow = get_google_flow() - # Generate secure state parameter with HMAC signature + code_verifier = generate_code_verifier() + flow.code_verifier 
= code_verifier + + # Generate secure state parameter with HMAC signature (includes PKCE code_verifier) state_manager = get_state_manager() - state_encoded = state_manager.generate_secure_state(space_id, user.id) + state_encoded = state_manager.generate_secure_state( + space_id, user.id, code_verifier=code_verifier + ) auth_url, _ = flow.authorization_url( access_type="offline", @@ -164,8 +173,11 @@ async def reauth_gmail( flow = get_google_flow() + code_verifier = generate_code_verifier() + flow.code_verifier = code_verifier + state_manager = get_state_manager() - extra: dict = {"connector_id": connector_id} + extra: dict = {"connector_id": connector_id, "code_verifier": code_verifier} if return_url and return_url.startswith("/"): extra["return_url"] = return_url state_encoded = state_manager.generate_secure_state(space_id, user.id, **extra) @@ -256,6 +268,7 @@ async def gmail_callback( user_id = UUID(data["user_id"]) space_id = data["space_id"] + code_verifier = data.get("code_verifier") # Validate redirect URI (security: ensure it matches configured value) if not config.GOOGLE_GMAIL_REDIRECT_URI: @@ -264,6 +277,7 @@ async def gmail_callback( ) flow = get_google_flow() + flow.code_verifier = code_verifier flow.fetch_token(code=code) creds = flow.credentials diff --git a/surfsense_backend/app/routes/model_list_routes.py b/surfsense_backend/app/routes/model_list_routes.py index ef6e30514..79ae7221f 100644 --- a/surfsense_backend/app/routes/model_list_routes.py +++ b/surfsense_backend/app/routes/model_list_routes.py @@ -1,5 +1,5 @@ """ -API route for fetching the available LLM models catalogue. +API route for fetching the available models catalogue. Serves a dynamically-updated list sourced from the OpenRouter public API, with a local JSON fallback when the API is unreachable. @@ -30,7 +30,7 @@ async def list_available_models( user: User = Depends(current_active_user), ): """ - Return all available LLM models grouped by provider. 
+ Return all available models grouped by provider. The list is sourced from the OpenRouter public API and cached for 1 hour. If the API is unreachable, a local fallback file is used instead. diff --git a/surfsense_backend/app/routes/new_llm_config_routes.py b/surfsense_backend/app/routes/new_llm_config_routes.py index f784bd273..78907c719 100644 --- a/surfsense_backend/app/routes/new_llm_config_routes.py +++ b/surfsense_backend/app/routes/new_llm_config_routes.py @@ -1,7 +1,7 @@ """ API routes for NewLLMConfig CRUD operations. -NewLLMConfig combines LLM model settings with prompt configuration: +NewLLMConfig combines model settings with prompt configuration: - LLM provider, model, API key, etc. - Configurable system instructions - Citation toggle diff --git a/surfsense_backend/app/routes/search_source_connectors_routes.py b/surfsense_backend/app/routes/search_source_connectors_routes.py index b73b8c789..bb20da65d 100644 --- a/surfsense_backend/app/routes/search_source_connectors_routes.py +++ b/surfsense_backend/app/routes/search_source_connectors_routes.py @@ -55,23 +55,12 @@ from app.schemas import ( ) from app.services.composio_service import ComposioService, get_composio_service from app.services.notification_service import NotificationService -from app.tasks.connector_indexers import ( - index_airtable_records, - index_clickup_tasks, - index_confluence_pages, - index_crawled_urls, - index_discord_messages, - index_elasticsearch_documents, - index_github_repos, - index_google_calendar_events, - index_google_gmail_messages, - index_jira_issues, - index_linear_issues, - index_luma_events, - index_notion_pages, - index_slack_messages, -) from app.users import current_active_user + +# NOTE: connector indexer functions are imported lazily inside each +# ``run_*_indexing`` helper to break a circular import cycle: +# connector_indexers.__init__ → airtable_indexer → airtable_history +# → app.routes.__init__ → this file → connector_indexers (not ready yet) from 
app.utils.connector_naming import ensure_unique_connector_name from app.utils.indexing_locks import ( acquire_connector_indexing_lock, @@ -1378,6 +1367,8 @@ async def run_slack_indexing( start_date: Start date for indexing end_date: End date for indexing """ + from app.tasks.connector_indexers import index_slack_messages + await _run_indexing_with_notifications( session=session, connector_id=connector_id, @@ -1824,6 +1815,8 @@ async def run_notion_indexing_with_new_session( Create a new session and run the Notion indexing task. This prevents session leaks by creating a dedicated session for the background task. """ + from app.tasks.connector_indexers import index_notion_pages + async with async_session_maker() as session: await _run_indexing_with_notifications( session=session, @@ -1858,6 +1851,8 @@ async def run_notion_indexing( start_date: Start date for indexing end_date: End date for indexing """ + from app.tasks.connector_indexers import index_notion_pages + await _run_indexing_with_notifications( session=session, connector_id=connector_id, @@ -1910,6 +1905,8 @@ async def run_github_indexing( start_date: Start date for indexing end_date: End date for indexing """ + from app.tasks.connector_indexers import index_github_repos + await _run_indexing_with_notifications( session=session, connector_id=connector_id, @@ -1961,6 +1958,8 @@ async def run_linear_indexing( start_date: Start date for indexing end_date: End date for indexing """ + from app.tasks.connector_indexers import index_linear_issues + await _run_indexing_with_notifications( session=session, connector_id=connector_id, @@ -2011,6 +2010,8 @@ async def run_discord_indexing( start_date: Start date for indexing end_date: End date for indexing """ + from app.tasks.connector_indexers import index_discord_messages + await _run_indexing_with_notifications( session=session, connector_id=connector_id, @@ -2113,6 +2114,8 @@ async def run_jira_indexing( start_date: Start date for indexing end_date: End date for 
indexing """ + from app.tasks.connector_indexers import index_jira_issues + await _run_indexing_with_notifications( session=session, connector_id=connector_id, @@ -2166,6 +2169,8 @@ async def run_confluence_indexing( start_date: Start date for indexing end_date: End date for indexing """ + from app.tasks.connector_indexers import index_confluence_pages + await _run_indexing_with_notifications( session=session, connector_id=connector_id, @@ -2217,6 +2222,8 @@ async def run_clickup_indexing( start_date: Start date for indexing end_date: End date for indexing """ + from app.tasks.connector_indexers import index_clickup_tasks + await _run_indexing_with_notifications( session=session, connector_id=connector_id, @@ -2268,6 +2275,8 @@ async def run_airtable_indexing( start_date: Start date for indexing end_date: End date for indexing """ + from app.tasks.connector_indexers import index_airtable_records + await _run_indexing_with_notifications( session=session, connector_id=connector_id, @@ -2321,6 +2330,8 @@ async def run_google_calendar_indexing( start_date: Start date for indexing end_date: End date for indexing """ + from app.tasks.connector_indexers import index_google_calendar_events + await _run_indexing_with_notifications( session=session, connector_id=connector_id, @@ -2370,6 +2381,7 @@ async def run_google_gmail_indexing( start_date: Start date for indexing end_date: End date for indexing """ + from app.tasks.connector_indexers import index_google_gmail_messages # Create a wrapper function that calls index_google_gmail_messages with max_messages async def gmail_indexing_wrapper( @@ -2465,6 +2477,8 @@ async def run_google_drive_indexing( stage="fetching", ) + total_unsupported = 0 + # Index each folder with indexing options for folder in items.folders: try: @@ -2472,6 +2486,7 @@ async def run_google_drive_indexing( indexed_count, skipped_count, error_message, + unsupported_count, ) = await index_google_drive_files( session, connector_id, @@ -2485,6 +2500,7 @@ 
async def run_google_drive_indexing( include_subfolders=indexing_options.include_subfolders, ) total_skipped += skipped_count + total_unsupported += unsupported_count if error_message: errors.append(f"Folder '{folder.name}': {error_message}") else: @@ -2560,6 +2576,7 @@ async def run_google_drive_indexing( indexed_count=total_indexed, error_message=error_message, skipped_count=total_skipped, + unsupported_count=total_unsupported, ) except Exception as e: @@ -2630,7 +2647,12 @@ async def run_onedrive_indexing( stage="fetching", ) - total_indexed, total_skipped, error_message = await index_onedrive_files( + ( + total_indexed, + total_skipped, + error_message, + total_unsupported, + ) = await index_onedrive_files( session, connector_id, search_space_id, @@ -2671,6 +2693,7 @@ async def run_onedrive_indexing( indexed_count=total_indexed, error_message=error_message, skipped_count=total_skipped, + unsupported_count=total_unsupported, ) except Exception as e: @@ -2738,7 +2761,12 @@ async def run_dropbox_indexing( stage="fetching", ) - total_indexed, total_skipped, error_message = await index_dropbox_files( + ( + total_indexed, + total_skipped, + error_message, + total_unsupported, + ) = await index_dropbox_files( session, connector_id, search_space_id, @@ -2779,6 +2807,7 @@ async def run_dropbox_indexing( indexed_count=total_indexed, error_message=error_message, skipped_count=total_skipped, + unsupported_count=total_unsupported, ) except Exception as e: @@ -2836,6 +2865,8 @@ async def run_luma_indexing( start_date: Start date for indexing end_date: End date for indexing """ + from app.tasks.connector_indexers import index_luma_events + await _run_indexing_with_notifications( session=session, connector_id=connector_id, @@ -2888,6 +2919,8 @@ async def run_elasticsearch_indexing( start_date: Start date for indexing end_date: End date for indexing """ + from app.tasks.connector_indexers import index_elasticsearch_documents + await _run_indexing_with_notifications( 
session=session, connector_id=connector_id, @@ -2938,6 +2971,8 @@ async def run_web_page_indexing( start_date: Start date for indexing end_date: End date for indexing """ + from app.tasks.connector_indexers import index_crawled_urls + await _run_indexing_with_notifications( session=session, connector_id=connector_id, diff --git a/surfsense_backend/app/routes/search_spaces_routes.py b/surfsense_backend/app/routes/search_spaces_routes.py index 7f6638e2c..78be97aa1 100644 --- a/surfsense_backend/app/routes/search_spaces_routes.py +++ b/surfsense_backend/app/routes/search_spaces_routes.py @@ -14,6 +14,7 @@ from app.db import ( SearchSpaceMembership, SearchSpaceRole, User, + VisionLLMConfig, get_async_session, get_default_roles_config, ) @@ -483,6 +484,63 @@ async def _get_image_gen_config_by_id( return None +async def _get_vision_llm_config_by_id( + session: AsyncSession, config_id: int | None +) -> dict | None: + if config_id is None: + return None + + if config_id == 0: + return { + "id": 0, + "name": "Auto (Fastest)", + "description": "Automatically routes requests across available vision LLM providers", + "provider": "AUTO", + "model_name": "auto", + "is_global": True, + "is_auto_mode": True, + } + + if config_id < 0: + for cfg in config.GLOBAL_VISION_LLM_CONFIGS: + if cfg.get("id") == config_id: + return { + "id": cfg.get("id"), + "name": cfg.get("name"), + "description": cfg.get("description"), + "provider": cfg.get("provider"), + "custom_provider": cfg.get("custom_provider"), + "model_name": cfg.get("model_name"), + "api_base": cfg.get("api_base") or None, + "api_version": cfg.get("api_version") or None, + "litellm_params": cfg.get("litellm_params", {}), + "is_global": True, + } + return None + + result = await session.execute( + select(VisionLLMConfig).filter(VisionLLMConfig.id == config_id) + ) + db_config = result.scalars().first() + if db_config: + return { + "id": db_config.id, + "name": db_config.name, + "description": db_config.description, + "provider": 
db_config.provider.value if db_config.provider else None, + "custom_provider": db_config.custom_provider, + "model_name": db_config.model_name, + "api_base": db_config.api_base, + "api_version": db_config.api_version, + "litellm_params": db_config.litellm_params or {}, + "created_at": db_config.created_at.isoformat() + if db_config.created_at + else None, + "search_space_id": db_config.search_space_id, + } + return None + + @router.get( "/search-spaces/{search_space_id}/llm-preferences", response_model=LLMPreferencesRead, @@ -522,14 +580,19 @@ async def get_llm_preferences( image_generation_config = await _get_image_gen_config_by_id( session, search_space.image_generation_config_id ) + vision_llm_config = await _get_vision_llm_config_by_id( + session, search_space.vision_llm_config_id + ) return LLMPreferencesRead( agent_llm_id=search_space.agent_llm_id, document_summary_llm_id=search_space.document_summary_llm_id, image_generation_config_id=search_space.image_generation_config_id, + vision_llm_config_id=search_space.vision_llm_config_id, agent_llm=agent_llm, document_summary_llm=document_summary_llm, image_generation_config=image_generation_config, + vision_llm_config=vision_llm_config, ) except HTTPException: @@ -589,14 +652,19 @@ async def update_llm_preferences( image_generation_config = await _get_image_gen_config_by_id( session, search_space.image_generation_config_id ) + vision_llm_config = await _get_vision_llm_config_by_id( + session, search_space.vision_llm_config_id + ) return LLMPreferencesRead( agent_llm_id=search_space.agent_llm_id, document_summary_llm_id=search_space.document_summary_llm_id, image_generation_config_id=search_space.image_generation_config_id, + vision_llm_config_id=search_space.vision_llm_config_id, agent_llm=agent_llm, document_summary_llm=document_summary_llm, image_generation_config=image_generation_config, + vision_llm_config=vision_llm_config, ) except HTTPException: diff --git a/surfsense_backend/app/routes/vision_llm_routes.py 
b/surfsense_backend/app/routes/vision_llm_routes.py new file mode 100644 index 000000000..eddd5e367 --- /dev/null +++ b/surfsense_backend/app/routes/vision_llm_routes.py @@ -0,0 +1,295 @@ +import logging + +from fastapi import APIRouter, Depends, HTTPException +from pydantic import BaseModel +from sqlalchemy import select +from sqlalchemy.ext.asyncio import AsyncSession + +from app.config import config +from app.db import ( + Permission, + User, + VisionLLMConfig, + get_async_session, +) +from app.schemas import ( + GlobalVisionLLMConfigRead, + VisionLLMConfigCreate, + VisionLLMConfigRead, + VisionLLMConfigUpdate, +) +from app.services.vision_model_list_service import get_vision_model_list +from app.users import current_active_user +from app.utils.rbac import check_permission + +router = APIRouter() +logger = logging.getLogger(__name__) + + +# ============================================================================= +# Vision Model Catalogue (from OpenRouter, filtered for image-input models) +# ============================================================================= + + +class VisionModelListItem(BaseModel): + value: str + label: str + provider: str + context_window: str | None = None + + +@router.get("/vision-models", response_model=list[VisionModelListItem]) +async def list_vision_models( + user: User = Depends(current_active_user), +): + """Return vision-capable models sourced from OpenRouter (filtered by image input).""" + try: + return await get_vision_model_list() + except Exception as e: + logger.exception("Failed to fetch vision model list") + raise HTTPException( + status_code=500, detail=f"Failed to fetch vision model list: {e!s}" + ) from e + + +# ============================================================================= +# Global Vision LLM Configs (from YAML) +# ============================================================================= + + +@router.get( + "/global-vision-llm-configs", + response_model=list[GlobalVisionLLMConfigRead], +) 
+async def get_global_vision_llm_configs( + user: User = Depends(current_active_user), +): + try: + global_configs = config.GLOBAL_VISION_LLM_CONFIGS + safe_configs = [] + + if global_configs and len(global_configs) > 0: + safe_configs.append( + { + "id": 0, + "name": "Auto (Fastest)", + "description": "Automatically routes across available vision LLM providers.", + "provider": "AUTO", + "custom_provider": None, + "model_name": "auto", + "api_base": None, + "api_version": None, + "litellm_params": {}, + "is_global": True, + "is_auto_mode": True, + } + ) + + for cfg in global_configs: + safe_configs.append( + { + "id": cfg.get("id"), + "name": cfg.get("name"), + "description": cfg.get("description"), + "provider": cfg.get("provider"), + "custom_provider": cfg.get("custom_provider"), + "model_name": cfg.get("model_name"), + "api_base": cfg.get("api_base") or None, + "api_version": cfg.get("api_version") or None, + "litellm_params": cfg.get("litellm_params", {}), + "is_global": True, + } + ) + + return safe_configs + except Exception as e: + logger.exception("Failed to fetch global vision LLM configs") + raise HTTPException( + status_code=500, detail=f"Failed to fetch configs: {e!s}" + ) from e + + +# ============================================================================= +# VisionLLMConfig CRUD +# ============================================================================= + + +@router.post("/vision-llm-configs", response_model=VisionLLMConfigRead) +async def create_vision_llm_config( + config_data: VisionLLMConfigCreate, + session: AsyncSession = Depends(get_async_session), + user: User = Depends(current_active_user), +): + try: + await check_permission( + session, + user, + config_data.search_space_id, + Permission.VISION_CONFIGS_CREATE.value, + "You don't have permission to create vision LLM configs in this search space", + ) + + db_config = VisionLLMConfig(**config_data.model_dump(), user_id=user.id) + session.add(db_config) + await session.commit() + 
await session.refresh(db_config) + return db_config + + except HTTPException: + raise + except Exception as e: + await session.rollback() + logger.exception("Failed to create VisionLLMConfig") + raise HTTPException( + status_code=500, detail=f"Failed to create config: {e!s}" + ) from e + + +@router.get("/vision-llm-configs", response_model=list[VisionLLMConfigRead]) +async def list_vision_llm_configs( + search_space_id: int, + skip: int = 0, + limit: int = 100, + session: AsyncSession = Depends(get_async_session), + user: User = Depends(current_active_user), +): + try: + await check_permission( + session, + user, + search_space_id, + Permission.VISION_CONFIGS_READ.value, + "You don't have permission to view vision LLM configs in this search space", + ) + + result = await session.execute( + select(VisionLLMConfig) + .filter(VisionLLMConfig.search_space_id == search_space_id) + .order_by(VisionLLMConfig.created_at.desc()) + .offset(skip) + .limit(limit) + ) + return result.scalars().all() + + except HTTPException: + raise + except Exception as e: + logger.exception("Failed to list VisionLLMConfigs") + raise HTTPException( + status_code=500, detail=f"Failed to fetch configs: {e!s}" + ) from e + + +@router.get( + "/vision-llm-configs/{config_id}", response_model=VisionLLMConfigRead +) +async def get_vision_llm_config( + config_id: int, + session: AsyncSession = Depends(get_async_session), + user: User = Depends(current_active_user), +): + try: + result = await session.execute( + select(VisionLLMConfig).filter(VisionLLMConfig.id == config_id) + ) + db_config = result.scalars().first() + if not db_config: + raise HTTPException(status_code=404, detail="Config not found") + + await check_permission( + session, + user, + db_config.search_space_id, + Permission.VISION_CONFIGS_READ.value, + "You don't have permission to view vision LLM configs in this search space", + ) + return db_config + + except HTTPException: + raise + except Exception as e: + logger.exception("Failed to 
get VisionLLMConfig") + raise HTTPException( + status_code=500, detail=f"Failed to fetch config: {e!s}" + ) from e + + +@router.put( + "/vision-llm-configs/{config_id}", response_model=VisionLLMConfigRead +) +async def update_vision_llm_config( + config_id: int, + update_data: VisionLLMConfigUpdate, + session: AsyncSession = Depends(get_async_session), + user: User = Depends(current_active_user), +): + try: + result = await session.execute( + select(VisionLLMConfig).filter(VisionLLMConfig.id == config_id) + ) + db_config = result.scalars().first() + if not db_config: + raise HTTPException(status_code=404, detail="Config not found") + + await check_permission( + session, + user, + db_config.search_space_id, + Permission.VISION_CONFIGS_CREATE.value, + "You don't have permission to update vision LLM configs in this search space", + ) + + for key, value in update_data.model_dump(exclude_unset=True).items(): + setattr(db_config, key, value) + + await session.commit() + await session.refresh(db_config) + return db_config + + except HTTPException: + raise + except Exception as e: + await session.rollback() + logger.exception("Failed to update VisionLLMConfig") + raise HTTPException( + status_code=500, detail=f"Failed to update config: {e!s}" + ) from e + + +@router.delete("/vision-llm-configs/{config_id}", response_model=dict) +async def delete_vision_llm_config( + config_id: int, + session: AsyncSession = Depends(get_async_session), + user: User = Depends(current_active_user), +): + try: + result = await session.execute( + select(VisionLLMConfig).filter(VisionLLMConfig.id == config_id) + ) + db_config = result.scalars().first() + if not db_config: + raise HTTPException(status_code=404, detail="Config not found") + + await check_permission( + session, + user, + db_config.search_space_id, + Permission.VISION_CONFIGS_DELETE.value, + "You don't have permission to delete vision LLM configs in this search space", + ) + + await session.delete(db_config) + await session.commit() 
+ return { + "message": "Vision LLM config deleted successfully", + "id": config_id, + } + + except HTTPException: + raise + except Exception as e: + await session.rollback() + logger.exception("Failed to delete VisionLLMConfig") + raise HTTPException( + status_code=500, detail=f"Failed to delete config: {e!s}" + ) from e diff --git a/surfsense_backend/app/schemas/__init__.py b/surfsense_backend/app/schemas/__init__.py index b94a30c19..fdf34672b 100644 --- a/surfsense_backend/app/schemas/__init__.py +++ b/surfsense_backend/app/schemas/__init__.py @@ -125,6 +125,13 @@ from .video_presentations import ( VideoPresentationRead, VideoPresentationUpdate, ) +from .vision_llm import ( + GlobalVisionLLMConfigRead, + VisionLLMConfigCreate, + VisionLLMConfigPublic, + VisionLLMConfigRead, + VisionLLMConfigUpdate, +) __all__ = [ # Folder schemas @@ -163,6 +170,8 @@ __all__ = [ "FolderUpdate", "GlobalImageGenConfigRead", "GlobalNewLLMConfigRead", + # Vision LLM Config schemas + "GlobalVisionLLMConfigRead", "GoogleDriveIndexRequest", "GoogleDriveIndexingOptions", # Base schemas @@ -264,4 +273,8 @@ __all__ = [ "VideoPresentationCreate", "VideoPresentationRead", "VideoPresentationUpdate", + "VisionLLMConfigCreate", + "VisionLLMConfigPublic", + "VisionLLMConfigRead", + "VisionLLMConfigUpdate", ] diff --git a/surfsense_backend/app/schemas/documents.py b/surfsense_backend/app/schemas/documents.py index c022a09d2..49d2836b2 100644 --- a/surfsense_backend/app/schemas/documents.py +++ b/surfsense_backend/app/schemas/documents.py @@ -53,25 +53,26 @@ class DocumentRead(BaseModel): title: str document_type: DocumentType document_metadata: dict - content: str # Changed to string to match frontend + content: str = "" + content_preview: str = "" content_hash: str unique_identifier_hash: str | None created_at: datetime updated_at: datetime | None search_space_id: int folder_id: int | None = None - created_by_id: UUID | None = None # User who created/uploaded this document + created_by_id: UUID 
| None = None created_by_name: str | None = None created_by_email: str | None = None - status: DocumentStatusSchema | None = ( - None # Processing status (ready, processing, failed) - ) + status: DocumentStatusSchema | None = None model_config = ConfigDict(from_attributes=True) class DocumentWithChunksRead(DocumentRead): chunks: list[ChunkRead] = [] + total_chunks: int = 0 + chunk_start_index: int = 0 model_config = ConfigDict(from_attributes=True) diff --git a/surfsense_backend/app/schemas/folders.py b/surfsense_backend/app/schemas/folders.py index 263817182..a7e065144 100644 --- a/surfsense_backend/app/schemas/folders.py +++ b/surfsense_backend/app/schemas/folders.py @@ -1,6 +1,7 @@ """Pydantic schemas for folder CRUD, move, and reorder operations.""" from datetime import datetime +from typing import Any from uuid import UUID from pydantic import BaseModel, ConfigDict, Field @@ -34,6 +35,9 @@ class FolderRead(BaseModel): created_by_id: UUID | None created_at: datetime updated_at: datetime + metadata: dict[str, Any] | None = Field( + default=None, validation_alias="folder_metadata" + ) model_config = ConfigDict(from_attributes=True) diff --git a/surfsense_backend/app/schemas/new_llm_config.py b/surfsense_backend/app/schemas/new_llm_config.py index 9863665b6..a466f2c99 100644 --- a/surfsense_backend/app/schemas/new_llm_config.py +++ b/surfsense_backend/app/schemas/new_llm_config.py @@ -1,7 +1,7 @@ """ Pydantic schemas for the NewLLMConfig API. -NewLLMConfig combines LLM model settings with prompt configuration: +NewLLMConfig combines model settings with prompt configuration: - LLM provider, model, API key, etc. 
- Configurable system instructions - Citation toggle @@ -26,7 +26,7 @@ class NewLLMConfigBase(BaseModel): None, max_length=500, description="Optional description" ) - # LLM Model Configuration + # Model Configuration provider: LiteLLMProvider = Field(..., description="LiteLLM provider type") custom_provider: str | None = Field( None, max_length=100, description="Custom provider name when provider is CUSTOM" @@ -71,7 +71,7 @@ class NewLLMConfigUpdate(BaseModel): name: str | None = Field(None, max_length=100) description: str | None = Field(None, max_length=500) - # LLM Model Configuration + # Model Configuration provider: LiteLLMProvider | None = None custom_provider: str | None = Field(None, max_length=100) model_name: str | None = Field(None, max_length=100) @@ -106,7 +106,7 @@ class NewLLMConfigPublic(BaseModel): name: str description: str | None = None - # LLM Model Configuration (no api_key) + # Model Configuration (no api_key) provider: LiteLLMProvider custom_provider: str | None = None model_name: str @@ -149,7 +149,7 @@ class GlobalNewLLMConfigRead(BaseModel): name: str description: str | None = None - # LLM Model Configuration (no api_key) + # Model Configuration (no api_key) provider: str # String because YAML doesn't enforce enum, "AUTO" for Auto mode custom_provider: str | None = None model_name: str @@ -182,6 +182,9 @@ class LLMPreferencesRead(BaseModel): image_generation_config_id: int | None = Field( None, description="ID of the image generation config to use" ) + vision_llm_config_id: int | None = Field( + None, description="ID of the vision LLM config to use for vision/screenshot analysis" + ) agent_llm: dict[str, Any] | None = Field( None, description="Full config for agent LLM" ) @@ -191,6 +194,9 @@ class LLMPreferencesRead(BaseModel): image_generation_config: dict[str, Any] | None = Field( None, description="Full config for image generation" ) + vision_llm_config: dict[str, Any] | None = Field( + None, description="Full config for vision LLM" + ) 
model_config = ConfigDict(from_attributes=True) @@ -207,3 +213,6 @@ class LLMPreferencesUpdate(BaseModel): image_generation_config_id: int | None = Field( None, description="ID of the image generation config to use" ) + vision_llm_config_id: int | None = Field( + None, description="ID of the vision LLM config to use for vision/screenshot analysis" + ) diff --git a/surfsense_backend/app/schemas/vision_llm.py b/surfsense_backend/app/schemas/vision_llm.py new file mode 100644 index 000000000..ab2e609dc --- /dev/null +++ b/surfsense_backend/app/schemas/vision_llm.py @@ -0,0 +1,75 @@ +import uuid +from datetime import datetime +from typing import Any + +from pydantic import BaseModel, ConfigDict, Field + +from app.db import VisionProvider + + +class VisionLLMConfigBase(BaseModel): + name: str = Field(..., max_length=100) + description: str | None = Field(None, max_length=500) + provider: VisionProvider = Field(...) + custom_provider: str | None = Field(None, max_length=100) + model_name: str = Field(..., max_length=100) + api_key: str = Field(...) + api_base: str | None = Field(None, max_length=500) + api_version: str | None = Field(None, max_length=50) + litellm_params: dict[str, Any] | None = Field(default=None) + + +class VisionLLMConfigCreate(VisionLLMConfigBase): + search_space_id: int = Field(...) 
+ + +class VisionLLMConfigUpdate(BaseModel): + name: str | None = Field(None, max_length=100) + description: str | None = Field(None, max_length=500) + provider: VisionProvider | None = None + custom_provider: str | None = Field(None, max_length=100) + model_name: str | None = Field(None, max_length=100) + api_key: str | None = None + api_base: str | None = Field(None, max_length=500) + api_version: str | None = Field(None, max_length=50) + litellm_params: dict[str, Any] | None = None + + +class VisionLLMConfigRead(VisionLLMConfigBase): + id: int + created_at: datetime + search_space_id: int + user_id: uuid.UUID + + model_config = ConfigDict(from_attributes=True) + + +class VisionLLMConfigPublic(BaseModel): + id: int + name: str + description: str | None = None + provider: VisionProvider + custom_provider: str | None = None + model_name: str + api_base: str | None = None + api_version: str | None = None + litellm_params: dict[str, Any] | None = None + created_at: datetime + search_space_id: int + user_id: uuid.UUID + + model_config = ConfigDict(from_attributes=True) + + +class GlobalVisionLLMConfigRead(BaseModel): + id: int = Field(...) 
+ name: str + description: str | None = None + provider: str + custom_provider: str | None = None + model_name: str + api_base: str | None = None + api_version: str | None = None + litellm_params: dict[str, Any] | None = None + is_global: bool = True + is_auto_mode: bool = False diff --git a/surfsense_backend/app/services/docling_service.py b/surfsense_backend/app/services/docling_service.py index 82eaf7f74..af9a7d2d5 100644 --- a/surfsense_backend/app/services/docling_service.py +++ b/surfsense_backend/app/services/docling_service.py @@ -111,9 +111,8 @@ class DoclingService: pipeline_options=pipeline_options, backend=PyPdfiumDocumentBackend ) - # Initialize DocumentConverter self.converter = DocumentConverter( - format_options={InputFormat.PDF: pdf_format_option} + format_options={InputFormat.PDF: pdf_format_option}, ) acceleration_type = "GPU (WSL2)" if self.use_gpu else "CPU" diff --git a/surfsense_backend/app/services/llm_service.py b/surfsense_backend/app/services/llm_service.py index 59f52a4eb..e531aeabb 100644 --- a/surfsense_backend/app/services/llm_service.py +++ b/surfsense_backend/app/services/llm_service.py @@ -405,6 +405,123 @@ async def get_document_summary_llm( ) +async def get_vision_llm( + session: AsyncSession, search_space_id: int +) -> ChatLiteLLM | ChatLiteLLMRouter | None: + """Get the search space's vision LLM instance for screenshot analysis. 
+ + Resolves from the dedicated VisionLLMConfig system: + - Auto mode (ID 0): VisionLLMRouterService + - Global (negative ID): YAML configs + - DB (positive ID): VisionLLMConfig table + """ + from app.db import VisionLLMConfig + from app.services.vision_llm_router_service import ( + VISION_PROVIDER_MAP, + VisionLLMRouterService, + get_global_vision_llm_config, + is_vision_auto_mode, + ) + + try: + result = await session.execute( + select(SearchSpace).where(SearchSpace.id == search_space_id) + ) + search_space = result.scalars().first() + if not search_space: + logger.error(f"Search space {search_space_id} not found") + return None + + config_id = search_space.vision_llm_config_id + if config_id is None: + logger.error( + f"No vision LLM configured for search space {search_space_id}" + ) + return None + + if is_vision_auto_mode(config_id): + if not VisionLLMRouterService.is_initialized(): + logger.error( + "Vision Auto mode requested but Vision LLM Router not initialized" + ) + return None + try: + return ChatLiteLLMRouter( + router=VisionLLMRouterService.get_router(), + streaming=True, + ) + except Exception as e: + logger.error(f"Failed to create vision ChatLiteLLMRouter: {e}") + return None + + if config_id < 0: + global_cfg = get_global_vision_llm_config(config_id) + if not global_cfg: + logger.error(f"Global vision LLM config {config_id} not found") + return None + + if global_cfg.get("custom_provider"): + model_string = ( + f"{global_cfg['custom_provider']}/{global_cfg['model_name']}" + ) + else: + prefix = VISION_PROVIDER_MAP.get( + global_cfg["provider"].upper(), + global_cfg["provider"].lower(), + ) + model_string = f"{prefix}/{global_cfg['model_name']}" + + litellm_kwargs = { + "model": model_string, + "api_key": global_cfg["api_key"], + } + if global_cfg.get("api_base"): + litellm_kwargs["api_base"] = global_cfg["api_base"] + if global_cfg.get("litellm_params"): + litellm_kwargs.update(global_cfg["litellm_params"]) + + return ChatLiteLLM(**litellm_kwargs) 
+ + result = await session.execute( + select(VisionLLMConfig).where( + VisionLLMConfig.id == config_id, + VisionLLMConfig.search_space_id == search_space_id, + ) + ) + vision_cfg = result.scalars().first() + if not vision_cfg: + logger.error( + f"Vision LLM config {config_id} not found in search space {search_space_id}" + ) + return None + + if vision_cfg.custom_provider: + model_string = f"{vision_cfg.custom_provider}/{vision_cfg.model_name}" + else: + prefix = VISION_PROVIDER_MAP.get( + vision_cfg.provider.value.upper(), + vision_cfg.provider.value.lower(), + ) + model_string = f"{prefix}/{vision_cfg.model_name}" + + litellm_kwargs = { + "model": model_string, + "api_key": vision_cfg.api_key, + } + if vision_cfg.api_base: + litellm_kwargs["api_base"] = vision_cfg.api_base + if vision_cfg.litellm_params: + litellm_kwargs.update(vision_cfg.litellm_params) + + return ChatLiteLLM(**litellm_kwargs) + + except Exception as e: + logger.error( + f"Error getting vision LLM for search space {search_space_id}: {e!s}" + ) + return None + + # Backward-compatible alias (LLM preferences are now per-search-space, not per-user) async def get_user_long_context_llm( session: AsyncSession, diff --git a/surfsense_backend/app/services/model_list_service.py b/surfsense_backend/app/services/model_list_service.py index ebc0e0d7c..2a81c2d52 100644 --- a/surfsense_backend/app/services/model_list_service.py +++ b/surfsense_backend/app/services/model_list_service.py @@ -1,5 +1,5 @@ """ -Service for fetching and caching the available LLM model list. +Service for fetching and caching the available model list. Uses the OpenRouter public API as the primary source, with a local fallback JSON file when the API is unreachable. 
diff --git a/surfsense_backend/app/services/notification_service.py b/surfsense_backend/app/services/notification_service.py index 5e40a3b42..5ffee12d7 100644 --- a/surfsense_backend/app/services/notification_service.py +++ b/surfsense_backend/app/services/notification_service.py @@ -421,6 +421,7 @@ class ConnectorIndexingNotificationHandler(BaseNotificationHandler): error_message: str | None = None, is_warning: bool = False, skipped_count: int | None = None, + unsupported_count: int | None = None, ) -> Notification: """ Update notification when connector indexing completes. @@ -428,10 +429,11 @@ class ConnectorIndexingNotificationHandler(BaseNotificationHandler): Args: session: Database session notification: Notification to update - indexed_count: Total number of items indexed + indexed_count: Total number of files indexed error_message: Error message if indexing failed, or warning message (optional) is_warning: If True, treat error_message as a warning (success case) rather than an error - skipped_count: Number of items skipped (e.g., duplicates) - optional + skipped_count: Number of files skipped (e.g., unchanged) - optional + unsupported_count: Number of files skipped because the ETL parser doesn't support them Returns: Updated notification @@ -440,52 +442,45 @@ class ConnectorIndexingNotificationHandler(BaseNotificationHandler): "connector_name", "Connector" ) - # Build the skipped text if there are skipped items - skipped_text = "" - if skipped_count and skipped_count > 0: - skipped_item_text = "item" if skipped_count == 1 else "items" - skipped_text = ( - f" ({skipped_count} {skipped_item_text} skipped - already indexed)" - ) + unsupported_text = "" + if unsupported_count and unsupported_count > 0: + file_word = "file was" if unsupported_count == 1 else "files were" + unsupported_text = f" {unsupported_count} {file_word} not supported." 
- # If there's an error message but items were indexed, treat it as a warning (partial success) - # If is_warning is True, treat it as success even with 0 items (e.g., duplicates found) - # Otherwise, treat it as a failure if error_message: if indexed_count > 0: - # Partial success with warnings (e.g., duplicate content from other connectors) title = f"Ready: {connector_name}" - item_text = "item" if indexed_count == 1 else "items" - message = f"Now searchable! {indexed_count} {item_text} synced{skipped_text}. Note: {error_message}" + file_text = "file" if indexed_count == 1 else "files" + message = f"Now searchable! {indexed_count} {file_text} synced.{unsupported_text} Note: {error_message}" status = "completed" elif is_warning: - # Warning case (e.g., duplicates found) - treat as success title = f"Ready: {connector_name}" - message = f"Sync completed{skipped_text}. {error_message}" + message = f"Sync complete.{unsupported_text} {error_message}" status = "completed" else: - # Complete failure title = f"Failed: {connector_name}" message = f"Sync failed: {error_message}" + if unsupported_text: + message += unsupported_text status = "failed" else: title = f"Ready: {connector_name}" if indexed_count == 0: - if skipped_count and skipped_count > 0: - skipped_item_text = "item" if skipped_count == 1 else "items" - message = f"Already up to date! {skipped_count} {skipped_item_text} skipped (already indexed)." + if unsupported_count and unsupported_count > 0: + message = f"Sync complete.{unsupported_text}" else: - message = "Already up to date! No new items to sync." + message = "Already up to date!" else: - item_text = "item" if indexed_count == 1 else "items" - message = ( - f"Now searchable! {indexed_count} {item_text} synced{skipped_text}." - ) + file_text = "file" if indexed_count == 1 else "files" + message = f"Now searchable! {indexed_count} {file_text} synced." 
+ if unsupported_text: + message += unsupported_text status = "completed" metadata_updates = { "indexed_count": indexed_count, "skipped_count": skipped_count or 0, + "unsupported_count": unsupported_count or 0, "sync_stage": "completed" if (not error_message or is_warning or indexed_count > 0) else "failed", diff --git a/surfsense_backend/app/services/page_limit_service.py b/surfsense_backend/app/services/page_limit_service.py index 080d05b5d..47fe07fc6 100644 --- a/surfsense_backend/app/services/page_limit_service.py +++ b/surfsense_backend/app/services/page_limit_service.py @@ -3,7 +3,7 @@ Service for managing user page limits for ETL services. """ import os -from pathlib import Path +from pathlib import Path, PurePosixPath from sqlalchemy import select from sqlalchemy.ext.asyncio import AsyncSession @@ -223,10 +223,155 @@ class PageLimitService: # Estimate ~2000 characters per page return max(1, content_length // 2000) + @staticmethod + def estimate_pages_from_metadata( + file_name_or_ext: str, file_size: int | str | None = None + ) -> int: + """Size-based page estimation from file name/extension and byte size. + + Pure function — no file I/O, no database access. Used by cloud + connectors (which only have API metadata) and as the internal + fallback for :meth:`estimate_pages_before_processing`. + + ``file_name_or_ext`` can be a full filename (``"report.pdf"``) or + a bare extension (``".pdf"``). ``file_size`` may be an int, a + stringified int from a cloud API, or *None*. 
+ """ + if file_size is not None: + try: + file_size = int(file_size) + except (ValueError, TypeError): + file_size = 0 + else: + file_size = 0 + + if file_size <= 0: + return 1 + + ext = PurePosixPath(file_name_or_ext).suffix.lower() if file_name_or_ext else "" + if not ext and file_name_or_ext.startswith("."): + ext = file_name_or_ext.lower() + file_ext = ext + + if file_ext == ".pdf": + return max(1, file_size // (100 * 1024)) + + if file_ext in { + ".doc", + ".docx", + ".docm", + ".dot", + ".dotm", + ".odt", + ".ott", + ".sxw", + ".stw", + ".uot", + ".rtf", + ".pages", + ".wpd", + ".wps", + ".abw", + ".zabw", + ".cwk", + ".hwp", + ".lwp", + ".mcw", + ".mw", + ".sdw", + ".vor", + }: + return max(1, file_size // (50 * 1024)) + + if file_ext in { + ".ppt", + ".pptx", + ".pptm", + ".pot", + ".potx", + ".odp", + ".otp", + ".sxi", + ".sti", + ".uop", + ".key", + ".sda", + ".sdd", + ".sdp", + }: + return max(1, file_size // (200 * 1024)) + + if file_ext in { + ".xls", + ".xlsx", + ".xlsm", + ".xlsb", + ".xlw", + ".xlr", + ".ods", + ".ots", + ".fods", + ".numbers", + ".123", + ".wk1", + ".wk2", + ".wk3", + ".wk4", + ".wks", + ".wb1", + ".wb2", + ".wb3", + ".wq1", + ".wq2", + ".csv", + ".tsv", + ".slk", + ".sylk", + ".dif", + ".dbf", + ".prn", + ".qpw", + ".602", + ".et", + ".eth", + }: + return max(1, file_size // (100 * 1024)) + + if file_ext in {".epub"}: + return max(1, file_size // (50 * 1024)) + + if file_ext in {".txt", ".log", ".md", ".markdown", ".htm", ".html", ".xml"}: + return max(1, file_size // 3000) + + if file_ext in { + ".jpg", + ".jpeg", + ".png", + ".gif", + ".bmp", + ".tiff", + ".webp", + ".svg", + ".cgm", + ".odg", + ".pbd", + }: + return 1 + + if file_ext in {".mp3", ".m4a", ".wav", ".mpga"}: + return max(1, file_size // (1024 * 1024)) + + if file_ext in {".mp4", ".mpeg", ".webm"}: + return max(1, file_size // (5 * 1024 * 1024)) + + return max(1, file_size // (80 * 1024)) + def estimate_pages_before_processing(self, file_path: str) -> int: """ - 
Estimate page count from file before processing (to avoid unnecessary API calls). - This is called BEFORE sending to ETL services to prevent cost on rejected files. + Estimate page count from a local file before processing. + + For PDFs, attempts to read the actual page count via pypdf. + For everything else, delegates to :meth:`estimate_pages_from_metadata`. Args: file_path: Path to the file @@ -240,7 +385,6 @@ class PageLimitService: file_ext = Path(file_path).suffix.lower() file_size = os.path.getsize(file_path) - # PDF files - try to get actual page count if file_ext == ".pdf": try: import pypdf @@ -249,153 +393,6 @@ class PageLimitService: pdf_reader = pypdf.PdfReader(f) return len(pdf_reader.pages) except Exception: - # If PDF reading fails, fall back to size estimation - # Typical PDF: ~100KB per page (conservative estimate) - return max(1, file_size // (100 * 1024)) + pass # fall through to size-based estimation - # Word Processing Documents - # Microsoft Word, LibreOffice Writer, WordPerfect, Pages, etc. - elif file_ext in [ - ".doc", - ".docx", - ".docm", - ".dot", - ".dotm", # Microsoft Word - ".odt", - ".ott", - ".sxw", - ".stw", - ".uot", # OpenDocument/StarOffice Writer - ".rtf", # Rich Text Format - ".pages", # Apple Pages - ".wpd", - ".wps", # WordPerfect, Microsoft Works - ".abw", - ".zabw", # AbiWord - ".cwk", - ".hwp", - ".lwp", - ".mcw", - ".mw", - ".sdw", - ".vor", # Other word processors - ]: - # Typical word document: ~50KB per page (conservative) - return max(1, file_size // (50 * 1024)) - - # Presentation Documents - # PowerPoint, Impress, Keynote, etc. 
- elif file_ext in [ - ".ppt", - ".pptx", - ".pptm", - ".pot", - ".potx", # Microsoft PowerPoint - ".odp", - ".otp", - ".sxi", - ".sti", - ".uop", # OpenDocument/StarOffice Impress - ".key", # Apple Keynote - ".sda", - ".sdd", - ".sdp", # StarOffice Draw/Impress - ]: - # Typical presentation: ~200KB per slide (conservative) - return max(1, file_size // (200 * 1024)) - - # Spreadsheet Documents - # Excel, Calc, Numbers, Lotus, etc. - elif file_ext in [ - ".xls", - ".xlsx", - ".xlsm", - ".xlsb", - ".xlw", - ".xlr", # Microsoft Excel - ".ods", - ".ots", - ".fods", # OpenDocument Spreadsheet - ".numbers", # Apple Numbers - ".123", - ".wk1", - ".wk2", - ".wk3", - ".wk4", - ".wks", # Lotus 1-2-3 - ".wb1", - ".wb2", - ".wb3", - ".wq1", - ".wq2", # Quattro Pro - ".csv", - ".tsv", - ".slk", - ".sylk", - ".dif", - ".dbf", - ".prn", - ".qpw", # Data formats - ".602", - ".et", - ".eth", # Other spreadsheets - ]: - # Spreadsheets typically have 1 sheet = 1 page for ETL - # Conservative: ~100KB per sheet - return max(1, file_size // (100 * 1024)) - - # E-books - elif file_ext in [".epub"]: - # E-books vary widely, estimate by size - # Typical e-book: ~50KB per page - return max(1, file_size // (50 * 1024)) - - # Plain Text and Markup Files - elif file_ext in [ - ".txt", - ".log", # Plain text - ".md", - ".markdown", # Markdown - ".htm", - ".html", - ".xml", # Markup - ]: - # Plain text: ~3000 bytes per page - return max(1, file_size // 3000) - - # Image Files - # Each image is typically processed as 1 page - elif file_ext in [ - ".jpg", - ".jpeg", # JPEG - ".png", # PNG - ".gif", # GIF - ".bmp", # Bitmap - ".tiff", # TIFF - ".webp", # WebP - ".svg", # SVG - ".cgm", # Computer Graphics Metafile - ".odg", - ".pbd", # OpenDocument Graphics - ]: - # Each image = 1 page - return 1 - - # Audio Files (transcription = typically 1 page per minute) - # Note: These should be handled by audio transcription flow, not ETL - elif file_ext in [".mp3", ".m4a", ".wav", ".mpga"]: - # Audio files: 
estimate based on duration - # Fallback: ~1MB per minute of audio, 1 page per minute transcript - return max(1, file_size // (1024 * 1024)) - - # Video Files (typically not processed for pages, but just in case) - elif file_ext in [".mp4", ".mpeg", ".webm"]: - # Video files: very rough estimate - # Typically wouldn't be page-based, but use conservative estimate - return max(1, file_size // (5 * 1024 * 1024)) - - # Other/Unknown Document Types - else: - # Conservative estimate: ~80KB per page - # This catches: .sgl, .sxg, .uof, .uos1, .uos2, .web, and any future formats - return max(1, file_size // (80 * 1024)) + return self.estimate_pages_from_metadata(file_ext, file_size) diff --git a/surfsense_backend/app/services/vision_autocomplete_service.py b/surfsense_backend/app/services/vision_autocomplete_service.py new file mode 100644 index 000000000..c28962b31 --- /dev/null +++ b/surfsense_backend/app/services/vision_autocomplete_service.py @@ -0,0 +1,158 @@ +"""Vision autocomplete service — agent-based with scoped filesystem. + +Optimized pipeline: +1. Start the SSE stream immediately so the UI shows progress. +2. Derive a KB search query from window_title (no separate LLM call). +3. Run KB filesystem pre-computation and agent graph compilation in PARALLEL. +4. Inject pre-computed KB files as initial state and stream the agent. 
+""" + +import logging +from collections.abc import AsyncGenerator + +from langchain_core.messages import HumanMessage +from sqlalchemy.ext.asyncio import AsyncSession + +from app.agents.autocomplete import create_autocomplete_agent, stream_autocomplete_agent +from app.services.llm_service import get_vision_llm +from app.services.new_streaming_service import VercelStreamingService + +logger = logging.getLogger(__name__) + +PREP_STEP_ID = "autocomplete-prep" + + +def _derive_kb_query(app_name: str, window_title: str) -> str: + parts = [p for p in (window_title, app_name) if p] + return " ".join(parts) + + +def _is_vision_unsupported_error(e: Exception) -> bool: + msg = str(e).lower() + return "content must be a string" in msg or "does not support image" in msg + + +# --------------------------------------------------------------------------- +# Main entry point +# --------------------------------------------------------------------------- + + +async def stream_vision_autocomplete( + screenshot_data_url: str, + search_space_id: int, + session: AsyncSession, + *, + app_name: str = "", + window_title: str = "", +) -> AsyncGenerator[str, None]: + """Analyze a screenshot with a vision-LLM agent and stream a text completion.""" + streaming = VercelStreamingService() + vision_error_msg = ( + "The selected model does not support vision. " + "Please set a vision-capable model (e.g. GPT-4o, Gemini) in your search space settings." 
+ ) + + llm = await get_vision_llm(session, search_space_id) + if not llm: + yield streaming.format_message_start() + yield streaming.format_error("No Vision LLM configured for this search space") + yield streaming.format_done() + return + + # Start SSE stream immediately so the UI has something to show + yield streaming.format_message_start() + + kb_query = _derive_kb_query(app_name, window_title) + + # Show a preparation step while KB search + agent compile run + yield streaming.format_thinking_step( + step_id=PREP_STEP_ID, + title="Searching knowledge base", + status="in_progress", + items=[kb_query] if kb_query else [], + ) + + try: + agent, kb = await create_autocomplete_agent( + llm, + search_space_id=search_space_id, + kb_query=kb_query, + app_name=app_name, + window_title=window_title, + ) + except Exception as e: + if _is_vision_unsupported_error(e): + logger.warning("Vision autocomplete: model does not support vision: %s", e) + yield streaming.format_error(vision_error_msg) + yield streaming.format_done() + return + logger.error("Failed to create autocomplete agent: %s", e, exc_info=True) + yield streaming.format_error("Autocomplete failed. Please try again.") + yield streaming.format_done() + return + + has_kb = kb.has_documents + doc_count = len(kb.files) if has_kb else 0 # type: ignore[arg-type] + + yield streaming.format_thinking_step( + step_id=PREP_STEP_ID, + title="Searching knowledge base", + status="complete", + items=[f"Found {doc_count} document{'s' if doc_count != 1 else ''}"] + if kb_query + else ["Skipped"], + ) + + # Build agent input with pre-computed KB as initial state + if has_kb: + instruction = ( + "Analyze this screenshot, then explore the knowledge base documents " + "listed above — read the chunk index of any document whose title " + "looks relevant and check matched chunks for useful facts. " + "Finally, generate a concise autocomplete for the active text area, " + "enhanced with any relevant KB information you found." 
+ ) + else: + instruction = ( + "Analyze this screenshot and generate a concise autocomplete " + "for the active text area based on what you see." + ) + + user_message = HumanMessage( + content=[ + {"type": "text", "text": instruction}, + {"type": "image_url", "image_url": {"url": screenshot_data_url}}, + ] + ) + + input_data: dict = {"messages": [user_message]} + + if has_kb: + input_data["files"] = kb.files + input_data["messages"] = [kb.ls_ai_msg, kb.ls_tool_msg, user_message] + logger.info( + "Autocomplete: injected %d KB files into agent initial state", doc_count + ) + else: + logger.info( + "Autocomplete: no KB documents found, proceeding with screenshot only" + ) + + # Stream the agent (message_start already sent above) + try: + async for sse in stream_autocomplete_agent( + agent, + input_data, + streaming, + emit_message_start=False, + ): + yield sse + except Exception as e: + if _is_vision_unsupported_error(e): + logger.warning("Vision autocomplete: model does not support vision: %s", e) + yield streaming.format_error(vision_error_msg) + yield streaming.format_done() + else: + logger.error("Vision autocomplete streaming error: %s", e, exc_info=True) + yield streaming.format_error("Autocomplete failed. 
Please try again.") + yield streaming.format_done() diff --git a/surfsense_backend/app/services/vision_llm_router_service.py b/surfsense_backend/app/services/vision_llm_router_service.py new file mode 100644 index 000000000..0d782ab2b --- /dev/null +++ b/surfsense_backend/app/services/vision_llm_router_service.py @@ -0,0 +1,193 @@ +import logging +from typing import Any + +from litellm import Router + +logger = logging.getLogger(__name__) + +VISION_AUTO_MODE_ID = 0 + +VISION_PROVIDER_MAP = { + "OPENAI": "openai", + "ANTHROPIC": "anthropic", + "GOOGLE": "gemini", + "AZURE_OPENAI": "azure", + "VERTEX_AI": "vertex_ai", + "BEDROCK": "bedrock", + "XAI": "xai", + "OPENROUTER": "openrouter", + "OLLAMA": "ollama_chat", + "GROQ": "groq", + "TOGETHER_AI": "together_ai", + "FIREWORKS_AI": "fireworks_ai", + "DEEPSEEK": "openai", + "MISTRAL": "mistral", + "CUSTOM": "custom", +} + + +class VisionLLMRouterService: + _instance = None + _router: Router | None = None + _model_list: list[dict] = [] + _router_settings: dict = {} + _initialized: bool = False + + def __new__(cls): + if cls._instance is None: + cls._instance = super().__new__(cls) + return cls._instance + + @classmethod + def get_instance(cls) -> "VisionLLMRouterService": + if cls._instance is None: + cls._instance = cls() + return cls._instance + + @classmethod + def initialize( + cls, + global_configs: list[dict], + router_settings: dict | None = None, + ) -> None: + instance = cls.get_instance() + + if instance._initialized: + logger.debug("Vision LLM Router already initialized, skipping") + return + + model_list = [] + for config in global_configs: + deployment = cls._config_to_deployment(config) + if deployment: + model_list.append(deployment) + + if not model_list: + logger.warning( + "No valid vision LLM configs found for router initialization" + ) + return + + instance._model_list = model_list + instance._router_settings = router_settings or {} + + default_settings = { + "routing_strategy": "usage-based-routing", 
+ "num_retries": 3, + "allowed_fails": 3, + "cooldown_time": 60, + "retry_after": 5, + } + + final_settings = {**default_settings, **instance._router_settings} + + try: + instance._router = Router( + model_list=model_list, + routing_strategy=final_settings.get( + "routing_strategy", "usage-based-routing" + ), + num_retries=final_settings.get("num_retries", 3), + allowed_fails=final_settings.get("allowed_fails", 3), + cooldown_time=final_settings.get("cooldown_time", 60), + set_verbose=False, + ) + instance._initialized = True + logger.info( + "Vision LLM Router initialized with %d deployments, strategy: %s", + len(model_list), + final_settings.get("routing_strategy"), + ) + except Exception as e: + logger.error(f"Failed to initialize Vision LLM Router: {e}") + instance._router = None + + @classmethod + def _config_to_deployment(cls, config: dict) -> dict | None: + try: + if not config.get("model_name") or not config.get("api_key"): + return None + + if config.get("custom_provider"): + model_string = f"{config['custom_provider']}/{config['model_name']}" + else: + provider = config.get("provider", "").upper() + provider_prefix = VISION_PROVIDER_MAP.get(provider, provider.lower()) + model_string = f"{provider_prefix}/{config['model_name']}" + + litellm_params: dict[str, Any] = { + "model": model_string, + "api_key": config.get("api_key"), + } + + if config.get("api_base"): + litellm_params["api_base"] = config["api_base"] + + if config.get("api_version"): + litellm_params["api_version"] = config["api_version"] + + if config.get("litellm_params"): + litellm_params.update(config["litellm_params"]) + + deployment: dict[str, Any] = { + "model_name": "auto", + "litellm_params": litellm_params, + } + + if config.get("rpm"): + deployment["rpm"] = config["rpm"] + if config.get("tpm"): + deployment["tpm"] = config["tpm"] + + return deployment + + except Exception as e: + logger.warning(f"Failed to convert vision config to deployment: {e}") + return None + + @classmethod + def 
get_router(cls) -> Router | None: + instance = cls.get_instance() + return instance._router + + @classmethod + def is_initialized(cls) -> bool: + instance = cls.get_instance() + return instance._initialized and instance._router is not None + + @classmethod + def get_model_count(cls) -> int: + instance = cls.get_instance() + return len(instance._model_list) + + +def is_vision_auto_mode(config_id: int | None) -> bool: + return config_id == VISION_AUTO_MODE_ID + + +def build_vision_model_string( + provider: str, model_name: str, custom_provider: str | None +) -> str: + if custom_provider: + return f"{custom_provider}/{model_name}" + prefix = VISION_PROVIDER_MAP.get(provider.upper(), provider.lower()) + return f"{prefix}/{model_name}" + + +def get_global_vision_llm_config(config_id: int) -> dict | None: + from app.config import config + + if config_id == VISION_AUTO_MODE_ID: + return { + "id": VISION_AUTO_MODE_ID, + "name": "Auto (Fastest)", + "provider": "AUTO", + "model_name": "auto", + "is_auto_mode": True, + } + if config_id > 0: + return None + for cfg in config.GLOBAL_VISION_LLM_CONFIGS: + if cfg.get("id") == config_id: + return cfg + return None diff --git a/surfsense_backend/app/services/vision_model_list_service.py b/surfsense_backend/app/services/vision_model_list_service.py new file mode 100644 index 000000000..09893dd06 --- /dev/null +++ b/surfsense_backend/app/services/vision_model_list_service.py @@ -0,0 +1,132 @@ +""" +Service for fetching and caching the vision-capable model list. + +Reuses the same OpenRouter public API and local fallback as the LLM model +list service, but filters for models that accept image input. 
+""" + +import json +import logging +import time +from pathlib import Path + +import httpx + +logger = logging.getLogger(__name__) + +OPENROUTER_API_URL = "https://openrouter.ai/api/v1/models" +FALLBACK_FILE = Path(__file__).parent.parent / "config" / "vision_model_list_fallback.json" +CACHE_TTL_SECONDS = 86400 # 24 hours + +_cache: list[dict] | None = None +_cache_timestamp: float = 0 + +OPENROUTER_SLUG_TO_VISION_PROVIDER: dict[str, str] = { + "openai": "OPENAI", + "anthropic": "ANTHROPIC", + "google": "GOOGLE", + "mistralai": "MISTRAL", + "x-ai": "XAI", +} + + +def _format_context_length(length: int | None) -> str | None: + if not length: + return None + if length >= 1_000_000: + return f"{length / 1_000_000:g}M" + if length >= 1_000: + return f"{length / 1_000:g}K" + return str(length) + + +async def _fetch_from_openrouter() -> list[dict] | None: + try: + async with httpx.AsyncClient(timeout=15) as client: + response = await client.get(OPENROUTER_API_URL) + response.raise_for_status() + data = response.json() + return data.get("data", []) + except Exception as e: + logger.warning("Failed to fetch from OpenRouter API for vision models: %s", e) + return None + + +def _load_fallback() -> list[dict]: + try: + with open(FALLBACK_FILE, encoding="utf-8") as f: + return json.load(f) + except Exception as e: + logger.error("Failed to load vision model fallback list: %s", e) + return [] + + +def _is_vision_model(model: dict) -> bool: + """Return True if the model accepts image input and outputs text.""" + arch = model.get("architecture", {}) + input_mods = arch.get("input_modalities", []) + output_mods = arch.get("output_modalities", []) + return "image" in input_mods and "text" in output_mods + + +def _process_vision_models(raw_models: list[dict]) -> list[dict]: + processed: list[dict] = [] + + for model in raw_models: + model_id: str = model.get("id", "") + name: str = model.get("name", "") + context_length = model.get("context_length") + + if "/" not in model_id: + 
continue + + if not _is_vision_model(model): + continue + + provider_slug, model_name = model_id.split("/", 1) + context_window = _format_context_length(context_length) + + processed.append( + { + "value": model_id, + "label": name, + "provider": "OPENROUTER", + "context_window": context_window, + } + ) + + native_provider = OPENROUTER_SLUG_TO_VISION_PROVIDER.get(provider_slug) + if native_provider: + if native_provider == "GOOGLE" and not model_name.startswith("gemini-"): + continue + + processed.append( + { + "value": model_name, + "label": name, + "provider": native_provider, + "context_window": context_window, + } + ) + + return processed + + +async def get_vision_model_list() -> list[dict]: + global _cache, _cache_timestamp + + if _cache is not None and (time.time() - _cache_timestamp) < CACHE_TTL_SECONDS: + return _cache + + raw_models = await _fetch_from_openrouter() + + if raw_models is None: + logger.info("Using fallback vision model list") + return _load_fallback() + + processed = _process_vision_models(raw_models) + + _cache = processed + _cache_timestamp = time.time() + + return processed diff --git a/surfsense_backend/app/tasks/celery_tasks/document_tasks.py b/surfsense_backend/app/tasks/celery_tasks/document_tasks.py index 662b41f2a..4e9249d34 100644 --- a/surfsense_backend/app/tasks/celery_tasks/document_tasks.py +++ b/surfsense_backend/app/tasks/celery_tasks/document_tasks.py @@ -1,6 +1,7 @@ """Celery tasks for document processing.""" import asyncio +import contextlib import logging import os from uuid import UUID @@ -10,6 +11,7 @@ from app.config import config from app.services.notification_service import NotificationService from app.services.task_logging_service import TaskLoggingService from app.tasks.celery_tasks import get_celery_session_maker +from app.tasks.connector_indexers.local_folder_indexer import index_local_folder from app.tasks.document_processors import ( add_extension_received_document, add_youtube_video_document, @@ -141,21 
+143,30 @@ async def _delete_document_background(document_id: int) -> None: retry_backoff_max=300, max_retries=5, ) -def delete_folder_documents_task(self, document_ids: list[int]): - """Celery task to batch-delete documents orphaned by folder deletion.""" +def delete_folder_documents_task( + self, + document_ids: list[int], + folder_subtree_ids: list[int] | None = None, +): + """Celery task to delete documents first, then the folder rows.""" loop = asyncio.new_event_loop() asyncio.set_event_loop(loop) try: - loop.run_until_complete(_delete_folder_documents(document_ids)) + loop.run_until_complete( + _delete_folder_documents(document_ids, folder_subtree_ids) + ) finally: loop.close() -async def _delete_folder_documents(document_ids: list[int]) -> None: - """Delete chunks in batches, then document rows for each orphaned document.""" +async def _delete_folder_documents( + document_ids: list[int], + folder_subtree_ids: list[int] | None = None, +) -> None: + """Delete chunks in batches, then document rows, then folder rows.""" from sqlalchemy import delete as sa_delete, select - from app.db import Chunk, Document + from app.db import Chunk, Document, Folder async with get_celery_session_maker()() as session: batch_size = 500 @@ -177,6 +188,12 @@ async def _delete_folder_documents(document_ids: list[int]) -> None: await session.delete(doc) await session.commit() + if folder_subtree_ids: + await session.execute( + sa_delete(Folder).where(Folder.id.in_(folder_subtree_ids)) + ) + await session.commit() + @celery_app.task( name="delete_search_space_background", @@ -1243,3 +1260,154 @@ async def _process_circleback_meeting( heartbeat_task.cancel() if notification: _stop_heartbeat(notification.id) + + +# ===== Local folder indexing task ===== + + +@celery_app.task(name="index_local_folder", bind=True) +def index_local_folder_task( + self, + search_space_id: int, + user_id: str, + folder_path: str, + folder_name: str, + exclude_patterns: list[str] | None = None, + 
file_extensions: list[str] | None = None, + root_folder_id: int | None = None, + enable_summary: bool = False, + target_file_paths: list[str] | None = None, +): + """Celery task to index a local folder. Config is passed directly — no connector row.""" + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + + try: + loop.run_until_complete( + _index_local_folder_async( + search_space_id=search_space_id, + user_id=user_id, + folder_path=folder_path, + folder_name=folder_name, + exclude_patterns=exclude_patterns, + file_extensions=file_extensions, + root_folder_id=root_folder_id, + enable_summary=enable_summary, + target_file_paths=target_file_paths, + ) + ) + finally: + loop.close() + + +async def _index_local_folder_async( + search_space_id: int, + user_id: str, + folder_path: str, + folder_name: str, + exclude_patterns: list[str] | None = None, + file_extensions: list[str] | None = None, + root_folder_id: int | None = None, + enable_summary: bool = False, + target_file_paths: list[str] | None = None, +): + """Run local folder indexing with notification + heartbeat.""" + is_batch = bool(target_file_paths) + is_full_scan = not target_file_paths + file_count = len(target_file_paths) if target_file_paths else None + + if is_batch: + doc_name = f"{folder_name} ({file_count} file{'s' if file_count != 1 else ''})" + else: + doc_name = folder_name + + notification = None + notification_id: int | None = None + heartbeat_task = None + + async with get_celery_session_maker()() as session: + try: + notification = ( + await NotificationService.document_processing.notify_processing_started( + session=session, + user_id=UUID(user_id), + document_type="LOCAL_FOLDER_FILE", + document_name=doc_name, + search_space_id=search_space_id, + ) + ) + notification_id = notification.id + _start_heartbeat(notification_id) + heartbeat_task = asyncio.create_task(_run_heartbeat_loop(notification_id)) + except Exception: + logger.warning( + "Failed to create notification for local 
folder indexing", + exc_info=True, + ) + + async def _heartbeat_progress(completed_count: int) -> None: + """Refresh heartbeat and optionally update notification progress.""" + if notification: + with contextlib.suppress(Exception): + await NotificationService.document_processing.notify_processing_progress( + session=session, + notification=notification, + stage="indexing", + stage_message=f"Syncing files ({completed_count}/{file_count or '?'})", + ) + + try: + _indexed, _skipped_or_failed, _rfid, err = await index_local_folder( + session=session, + search_space_id=search_space_id, + user_id=user_id, + folder_path=folder_path, + folder_name=folder_name, + exclude_patterns=exclude_patterns, + file_extensions=file_extensions, + root_folder_id=root_folder_id, + enable_summary=enable_summary, + target_file_paths=target_file_paths, + on_heartbeat_callback=_heartbeat_progress + if (is_batch or is_full_scan) + else None, + ) + + if notification: + try: + await session.refresh(notification) + if err: + await NotificationService.document_processing.notify_processing_completed( + session=session, + notification=notification, + error_message=err, + ) + else: + await NotificationService.document_processing.notify_processing_completed( + session=session, + notification=notification, + ) + except Exception: + logger.warning( + "Failed to update notification after local folder indexing", + exc_info=True, + ) + + except Exception as e: + logger.exception(f"Local folder indexing failed: {e}") + if notification: + try: + await session.refresh(notification) + await NotificationService.document_processing.notify_processing_completed( + session=session, + notification=notification, + error_message=str(e)[:200], + ) + except Exception: + pass + raise + finally: + if heartbeat_task: + heartbeat_task.cancel() + if notification_id is not None: + _stop_heartbeat(notification_id) diff --git a/surfsense_backend/app/tasks/chat/stream_new_chat.py 
b/surfsense_backend/app/tasks/chat/stream_new_chat.py index 7c1e3b7ea..5ff907459 100644 --- a/surfsense_backend/app/tasks/chat/stream_new_chat.py +++ b/surfsense_backend/app/tasks/chat/stream_new_chat.py @@ -39,7 +39,6 @@ from app.agents.new_chat.llm_config import ( ) from app.db import ( ChatVisibility, - Document, NewChatMessage, NewChatThread, Report, @@ -63,74 +62,6 @@ _perf_log = get_perf_logger() _background_tasks: set[asyncio.Task] = set() -def format_mentioned_documents_as_context(documents: list[Document]) -> str: - """ - Format mentioned documents as context for the agent. - - Uses the same XML structure as knowledge_base.format_documents_for_context - to ensure citations work properly with chunk IDs. - """ - if not documents: - return "" - - context_parts = [""] - context_parts.append( - "The user has explicitly mentioned the following documents from their knowledge base. " - "These documents are directly relevant to the query and should be prioritized as primary sources. " - "Use [citation:CHUNK_ID] format for citations (e.g., [citation:123])." 
- ) - context_parts.append("") - - for doc in documents: - # Build metadata JSON - metadata = doc.document_metadata or {} - metadata_json = json.dumps(metadata, ensure_ascii=False) - - # Get URL from metadata - url = ( - metadata.get("url") - or metadata.get("source") - or metadata.get("page_url") - or "" - ) - - context_parts.append("") - context_parts.append("") - context_parts.append(f" {doc.id}") - context_parts.append( - f" {doc.document_type.value}" - ) - context_parts.append(f" <![CDATA[{doc.title}]]>") - context_parts.append(f" ") - context_parts.append( - f" " - ) - context_parts.append("") - context_parts.append("") - context_parts.append("") - - # Use chunks if available (preferred for proper citations) - if hasattr(doc, "chunks") and doc.chunks: - for chunk in doc.chunks: - context_parts.append( - f" " - ) - else: - # Fallback to document content if chunks not loaded - # Use document ID as chunk ID prefix for consistency - context_parts.append( - f" " - ) - - context_parts.append("") - context_parts.append("") - context_parts.append("") - - context_parts.append("") - - return "\n".join(context_parts) - - def format_mentioned_surfsense_docs_as_context( documents: list[SurfsenseDocsDocument], ) -> str: @@ -1317,6 +1248,7 @@ async def stream_new_chat( firecrawl_api_key=firecrawl_api_key, thread_visibility=visibility, disabled_tools=disabled_tools, + mentioned_document_ids=mentioned_document_ids, ) _perf_log.info( "[stream_new_chat] Agent created in %.3fs", time.perf_counter() - _t0 @@ -1340,18 +1272,9 @@ async def stream_new_chat( thread.needs_history_bootstrap = False await session.commit() - # Fetch mentioned documents if any (with chunks for proper citations) - mentioned_documents: list[Document] = [] - if mentioned_document_ids: - result = await session.execute( - select(Document) - .options(selectinload(Document.chunks)) - .filter( - Document.id.in_(mentioned_document_ids), - Document.search_space_id == search_space_id, - ) - ) - mentioned_documents = 
list(result.scalars().all()) + # Mentioned KB documents are now handled by KnowledgeBaseSearchMiddleware + # which merges them into the scoped filesystem with full document + # structure. Only SurfSense docs and report context are inlined here. # Fetch mentioned SurfSense docs if any mentioned_surfsense_docs: list[SurfsenseDocsDocument] = [] @@ -1379,15 +1302,10 @@ async def stream_new_chat( ) recent_reports = list(recent_reports_result.scalars().all()) - # Format the user query with context (mentioned documents + SurfSense docs) + # Format the user query with context (SurfSense docs + reports only) final_query = user_query context_parts = [] - if mentioned_documents: - context_parts.append( - format_mentioned_documents_as_context(mentioned_documents) - ) - if mentioned_surfsense_docs: context_parts.append( format_mentioned_surfsense_docs_as_context(mentioned_surfsense_docs) @@ -1479,7 +1397,7 @@ async def stream_new_chat( yield streaming_service.format_start_step() # Initial thinking step - analyzing the request - if mentioned_documents or mentioned_surfsense_docs: + if mentioned_surfsense_docs: initial_title = "Analyzing referenced content" action_verb = "Analyzing" else: @@ -1490,18 +1408,6 @@ async def stream_new_chat( query_text = user_query[:80] + ("..." if len(user_query) > 80 else "") processing_parts.append(query_text) - if mentioned_documents: - doc_names = [] - for doc in mentioned_documents: - title = doc.title - if len(title) > 30: - title = title[:27] + "..." - doc_names.append(title) - if len(doc_names) == 1: - processing_parts.append(f"[{doc_names[0]}]") - else: - processing_parts.append(f"[{len(doc_names)} documents]") - if mentioned_surfsense_docs: doc_names = [] for doc in mentioned_surfsense_docs: @@ -1527,7 +1433,7 @@ async def stream_new_chat( # These ORM objects (with eagerly-loaded chunks) can be very large. # They're only needed to build context strings already copied into # final_query / langchain_messages — release them before streaming. 
- del mentioned_documents, mentioned_surfsense_docs, recent_reports + del mentioned_surfsense_docs, recent_reports del langchain_messages, final_query # Check if this is the first assistant response so we can generate diff --git a/surfsense_backend/app/tasks/connector_indexers/__init__.py b/surfsense_backend/app/tasks/connector_indexers/__init__.py index 9a1d17fd5..1b032d54a 100644 --- a/surfsense_backend/app/tasks/connector_indexers/__init__.py +++ b/surfsense_backend/app/tasks/connector_indexers/__init__.py @@ -42,9 +42,9 @@ from .jira_indexer import index_jira_issues # Issue tracking and project management from .linear_indexer import index_linear_issues -from .luma_indexer import index_luma_events # Documentation and knowledge management +from .luma_indexer import index_luma_events from .notion_indexer import index_notion_pages from .obsidian_indexer import index_obsidian_vault from .slack_indexer import index_slack_messages diff --git a/surfsense_backend/app/tasks/connector_indexers/dropbox_indexer.py b/surfsense_backend/app/tasks/connector_indexers/dropbox_indexer.py index 1b039add7..4a49944c2 100644 --- a/surfsense_backend/app/tasks/connector_indexers/dropbox_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/dropbox_indexer.py @@ -28,6 +28,7 @@ from app.indexing_pipeline.connector_document import ConnectorDocument from app.indexing_pipeline.document_hashing import compute_identifier_hash from app.indexing_pipeline.indexing_pipeline_service import IndexingPipelineService from app.services.llm_service import get_user_long_context_llm +from app.services.page_limit_service import PageLimitService from app.services.task_logging_service import TaskLoggingService from app.tasks.connector_indexers.base import ( check_document_by_unique_identifier, @@ -50,7 +51,10 @@ async def _should_skip_file( file_id = file.get("id", "") file_name = file.get("name", "Unknown") - if skip_item(file): + skip, unsup_ext = skip_item(file) + if skip: + if unsup_ext: + 
return True, f"unsupported:{unsup_ext}" return True, "folder/non-downloadable" if not file_id: return True, "missing file_id" @@ -250,6 +254,121 @@ async def _download_and_index( return batch_indexed, download_failed + batch_failed +async def _remove_document(session: AsyncSession, file_id: str, search_space_id: int): + """Remove a document that was deleted in Dropbox.""" + primary_hash = compute_identifier_hash( + DocumentType.DROPBOX_FILE.value, file_id, search_space_id + ) + existing = await check_document_by_unique_identifier(session, primary_hash) + + if not existing: + result = await session.execute( + select(Document).where( + Document.search_space_id == search_space_id, + Document.document_type == DocumentType.DROPBOX_FILE, + cast(Document.document_metadata["dropbox_file_id"], String) == file_id, + ) + ) + existing = result.scalar_one_or_none() + + if existing: + await session.delete(existing) + + +async def _index_with_delta_sync( + dropbox_client: DropboxClient, + session: AsyncSession, + connector_id: int, + search_space_id: int, + user_id: str, + cursor: str, + task_logger: TaskLoggingService, + log_entry: object, + max_files: int, + on_heartbeat_callback: HeartbeatCallbackType | None = None, + enable_summary: bool = True, +) -> tuple[int, int, int, str]: + """Delta sync using Dropbox cursor-based change tracking. + + Returns (indexed_count, skipped_count, new_cursor). + """ + await task_logger.log_task_progress( + log_entry, + f"Starting delta sync from cursor: {cursor[:20]}...", + {"stage": "delta_sync", "cursor_prefix": cursor[:20]}, + ) + + entries, new_cursor, error = await dropbox_client.get_changes(cursor) + if error: + err_lower = error.lower() + if "401" in error or "authentication expired" in err_lower: + raise Exception( + f"Dropbox authentication failed. Please re-authenticate. 
(Error: {error})" + ) + raise Exception(f"Failed to fetch Dropbox changes: {error}") + + if not entries: + logger.info("No changes detected since last sync") + return 0, 0, 0, new_cursor or cursor + + logger.info(f"Processing {len(entries)} change entries") + + renamed_count = 0 + skipped = 0 + unsupported_count = 0 + files_to_download: list[dict] = [] + files_processed = 0 + + for entry in entries: + if files_processed >= max_files: + break + files_processed += 1 + + tag = entry.get(".tag") + + if tag == "deleted": + path_lower = entry.get("path_lower", "") + name = entry.get("name", "") + file_id = entry.get("id", "") + if file_id: + await _remove_document(session, file_id, search_space_id) + logger.debug(f"Processed deletion: {name or path_lower}") + continue + + if tag != "file": + continue + + skip, msg = await _should_skip_file(session, entry, search_space_id) + if skip: + if msg and msg.startswith("unsupported:"): + unsupported_count += 1 + elif msg and "renamed" in msg.lower(): + renamed_count += 1 + else: + skipped += 1 + continue + + files_to_download.append(entry) + + batch_indexed, failed = await _download_and_index( + dropbox_client, + session, + files_to_download, + connector_id=connector_id, + search_space_id=search_space_id, + user_id=user_id, + enable_summary=enable_summary, + on_heartbeat=on_heartbeat_callback, + ) + + indexed = renamed_count + batch_indexed + logger.info( + f"Delta sync complete: {indexed} indexed, {skipped} skipped, " + f"{unsupported_count} unsupported, {failed} failed" + ) + return indexed, skipped, unsupported_count, new_cursor or cursor + + async def _index_full_scan( dropbox_client: DropboxClient, session: AsyncSession, @@ -265,8 +384,11 @@ async def _index_full_scan( incremental_sync: bool = True, on_heartbeat_callback: HeartbeatCallbackType | None = None, enable_summary: bool = True, -) -> tuple[int, int]: - """Full scan indexing of a folder.""" +) -> tuple[int, int, int]: + """Full scan indexing of a folder. 
+ + Returns (indexed, skipped, unsupported_count). + """ await task_logger.log_task_progress( log_entry, f"Starting full scan of folder: {folder_name}", @@ -278,8 +400,15 @@ async def _index_full_scan( }, ) + page_limit_service = PageLimitService(session) + pages_used, pages_limit = await page_limit_service.get_page_usage(user_id) + remaining_quota = pages_limit - pages_used + batch_estimated_pages = 0 + page_limit_reached = False + renamed_count = 0 skipped = 0 + unsupported_count = 0 files_to_download: list[dict] = [] all_files, error = await get_files_in_folder( @@ -299,14 +428,36 @@ async def _index_full_scan( if incremental_sync: skip, msg = await _should_skip_file(session, file, search_space_id) if skip: - if msg and "renamed" in msg.lower(): + if msg and msg.startswith("unsupported:"): + unsupported_count += 1 + elif msg and "renamed" in msg.lower(): renamed_count += 1 else: skipped += 1 continue - elif skip_item(file): + else: + item_skip, item_unsup = skip_item(file) + if item_skip: + if item_unsup: + unsupported_count += 1 + else: + skipped += 1 + continue + + file_pages = PageLimitService.estimate_pages_from_metadata( + file.get("name", ""), file.get("size") + ) + if batch_estimated_pages + file_pages > remaining_quota: + if not page_limit_reached: + logger.warning( + "Page limit reached during Dropbox full scan, " + "skipping remaining files" + ) + page_limit_reached = True skipped += 1 continue + + batch_estimated_pages += file_pages files_to_download.append(file) batch_indexed, failed = await _download_and_index( @@ -320,11 +471,20 @@ async def _index_full_scan( on_heartbeat=on_heartbeat_callback, ) + if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0: + pages_to_deduct = max( + 1, batch_estimated_pages * batch_indexed // len(files_to_download) + ) + await page_limit_service.update_page_usage( + user_id, pages_to_deduct, allow_exceed=True + ) + indexed = renamed_count + batch_indexed logger.info( - f"Full scan complete: {indexed} 
indexed, {skipped} skipped, {failed} failed" + f"Full scan complete: {indexed} indexed, {skipped} skipped, " + f"{unsupported_count} unsupported, {failed} failed" ) - return indexed, skipped + return indexed, skipped, unsupported_count async def _index_selected_files( @@ -338,12 +498,18 @@ async def _index_selected_files( enable_summary: bool, incremental_sync: bool = True, on_heartbeat: HeartbeatCallbackType | None = None, -) -> tuple[int, int, list[str]]: +) -> tuple[int, int, int, list[str]]: """Index user-selected files using the parallel pipeline.""" + page_limit_service = PageLimitService(session) + pages_used, pages_limit = await page_limit_service.get_page_usage(user_id) + remaining_quota = pages_limit - pages_used + batch_estimated_pages = 0 + files_to_download: list[dict] = [] errors: list[str] = [] renamed_count = 0 skipped = 0 + unsupported_count = 0 for file_path, file_name in file_paths: file, error = await get_file_by_path(dropbox_client, file_path) @@ -355,15 +521,31 @@ async def _index_selected_files( if incremental_sync: skip, msg = await _should_skip_file(session, file, search_space_id) if skip: - if msg and "renamed" in msg.lower(): + if msg and msg.startswith("unsupported:"): + unsupported_count += 1 + elif msg and "renamed" in msg.lower(): renamed_count += 1 else: skipped += 1 continue - elif skip_item(file): - skipped += 1 + else: + item_skip, item_unsup = skip_item(file) + if item_skip: + if item_unsup: + unsupported_count += 1 + else: + skipped += 1 + continue + + file_pages = PageLimitService.estimate_pages_from_metadata( + file.get("name", ""), file.get("size") + ) + if batch_estimated_pages + file_pages > remaining_quota: + display = file_name or file_path + errors.append(f"File '{display}': page limit would be exceeded") continue + batch_estimated_pages += file_pages files_to_download.append(file) batch_indexed, _failed = await _download_and_index( @@ -377,7 +559,15 @@ async def _index_selected_files( on_heartbeat=on_heartbeat, ) - 
return renamed_count + batch_indexed, skipped, errors + if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0: + pages_to_deduct = max( + 1, batch_estimated_pages * batch_indexed // len(files_to_download) + ) + await page_limit_service.update_page_usage( + user_id, pages_to_deduct, allow_exceed=True + ) + + return renamed_count + batch_indexed, skipped, unsupported_count, errors async def index_dropbox_files( @@ -386,7 +576,7 @@ async def index_dropbox_files( search_space_id: int, user_id: str, items_dict: dict, -) -> tuple[int, int, str | None]: +) -> tuple[int, int, str | None, int]: """Index Dropbox files for a specific connector. items_dict format: @@ -417,7 +607,7 @@ async def index_dropbox_files( await task_logger.log_task_failure( log_entry, error_msg, None, {"error_type": "ConnectorNotFound"} ) - return 0, 0, error_msg + return 0, 0, error_msg, 0 token_encrypted = connector.config.get("_token_encrypted", False) if token_encrypted and not config.SECRET_KEY: @@ -428,7 +618,7 @@ async def index_dropbox_files( "Missing SECRET_KEY", {"error_type": "MissingSecretKey"}, ) - return 0, 0, error_msg + return 0, 0, error_msg, 0 connector_enable_summary = getattr(connector, "enable_summary", True) dropbox_client = DropboxClient(session, connector_id) @@ -437,9 +627,13 @@ async def index_dropbox_files( max_files = indexing_options.get("max_files", 500) incremental_sync = indexing_options.get("incremental_sync", True) include_subfolders = indexing_options.get("include_subfolders", True) + use_delta_sync = indexing_options.get("use_delta_sync", True) + + folder_cursors: dict = connector.config.get("folder_cursors", {}) total_indexed = 0 total_skipped = 0 + total_unsupported = 0 selected_files = items_dict.get("files", []) if selected_files: @@ -447,7 +641,7 @@ async def index_dropbox_files( (f.get("path", f.get("path_lower", f.get("id", ""))), f.get("name")) for f in selected_files ] - indexed, skipped, file_errors = await _index_selected_files( + 
indexed, skipped, unsupported, file_errors = await _index_selected_files( dropbox_client, session, file_tuples, @@ -459,6 +653,7 @@ async def index_dropbox_files( ) total_indexed += indexed total_skipped += skipped + total_unsupported += unsupported if file_errors: logger.warning( f"File indexing errors for connector {connector_id}: {file_errors}" @@ -471,25 +666,66 @@ async def index_dropbox_files( ) folder_name = folder.get("name", "Root") - logger.info(f"Using full scan for folder {folder_name}") - indexed, skipped = await _index_full_scan( - dropbox_client, - session, - connector_id, - search_space_id, - user_id, - folder_path, - folder_name, - task_logger, - log_entry, - max_files, - include_subfolders, - incremental_sync=incremental_sync, - enable_summary=connector_enable_summary, + saved_cursor = folder_cursors.get(folder_path) + can_use_delta = ( + use_delta_sync and saved_cursor and connector.last_indexed_at ) + + if can_use_delta: + logger.info(f"Using delta sync for folder {folder_name}") + indexed, skipped, unsup, new_cursor = await _index_with_delta_sync( + dropbox_client, + session, + connector_id, + search_space_id, + user_id, + saved_cursor, + task_logger, + log_entry, + max_files, + enable_summary=connector_enable_summary, + ) + folder_cursors[folder_path] = new_cursor + total_unsupported += unsup + else: + logger.info(f"Using full scan for folder {folder_name}") + indexed, skipped, unsup = await _index_full_scan( + dropbox_client, + session, + connector_id, + search_space_id, + user_id, + folder_path, + folder_name, + task_logger, + log_entry, + max_files, + include_subfolders, + incremental_sync=incremental_sync, + enable_summary=connector_enable_summary, + ) + total_unsupported += unsup + total_indexed += indexed total_skipped += skipped + # Persist latest cursor for this folder + try: + latest_cursor, cursor_err = await dropbox_client.get_latest_cursor( + folder_path + ) + if latest_cursor and not cursor_err: + folder_cursors[folder_path] = 
latest_cursor + except Exception as e: + logger.warning(f"Failed to get latest cursor for {folder_path}: {e}") + + # Persist folder cursors to connector config + if folders: + cfg = dict(connector.config) + cfg["folder_cursors"] = folder_cursors + connector.config = cfg + flag_modified(connector, "config") + if total_indexed > 0 or folders: await update_connector_last_indexed(session, connector, True) @@ -498,12 +734,18 @@ async def index_dropbox_files( await task_logger.log_task_success( log_entry, f"Successfully completed Dropbox indexing for connector {connector_id}", - {"files_processed": total_indexed, "files_skipped": total_skipped}, + { + "files_processed": total_indexed, + "files_skipped": total_skipped, + "files_unsupported": total_unsupported, + }, ) logger.info( - f"Dropbox indexing completed: {total_indexed} indexed, {total_skipped} skipped" + f"Dropbox indexing completed: {total_indexed} indexed, " + f"{total_skipped} skipped, {total_unsupported} unsupported" ) - return total_indexed, total_skipped, None + + return total_indexed, total_skipped, None, total_unsupported except SQLAlchemyError as db_error: await session.rollback() @@ -514,7 +756,7 @@ async def index_dropbox_files( {"error_type": "SQLAlchemyError"}, ) logger.error(f"Database error: {db_error!s}", exc_info=True) - return 0, 0, f"Database error: {db_error!s}" + return 0, 0, f"Database error: {db_error!s}", 0 except Exception as e: await session.rollback() await task_logger.log_task_failure( @@ -524,4 +766,4 @@ async def index_dropbox_files( {"error_type": type(e).__name__}, ) logger.error(f"Failed to index Dropbox files: {e!s}", exc_info=True) - return 0, 0, f"Failed to index Dropbox files: {e!s}" + return 0, 0, f"Failed to index Dropbox files: {e!s}", 0 diff --git a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py index b03d305f7..b11087fe6 100644 --- 
a/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/google_drive_indexer.py @@ -25,7 +25,11 @@ from app.connectors.google_drive import ( get_files_in_folder, get_start_page_token, ) -from app.connectors.google_drive.file_types import should_skip_file as skip_mime +from app.connectors.google_drive.file_types import ( + is_google_workspace_file, + should_skip_by_extension, + should_skip_file as skip_mime, +) from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType from app.indexing_pipeline.connector_document import ConnectorDocument from app.indexing_pipeline.document_hashing import compute_identifier_hash @@ -34,6 +38,7 @@ from app.indexing_pipeline.indexing_pipeline_service import ( PlaceholderInfo, ) from app.services.llm_service import get_user_long_context_llm +from app.services.page_limit_service import PageLimitService from app.services.task_logging_service import TaskLoggingService from app.tasks.connector_indexers.base import ( check_document_by_unique_identifier, @@ -77,6 +82,10 @@ async def _should_skip_file( if skip_mime(mime_type): return True, "folder/shortcut" + if not is_google_workspace_file(mime_type): + ext_skip, unsup_ext = should_skip_by_extension(file_name) + if ext_skip: + return True, f"unsupported:{unsup_ext}" if not file_id: return True, "missing file_id" @@ -327,6 +336,12 @@ async def _process_single_file( return 1, 0, 0 return 0, 1, 0 + page_limit_service = PageLimitService(session) + estimated_pages = PageLimitService.estimate_pages_from_metadata( + file_name, file.get("size") + ) + await page_limit_service.check_page_limit(user_id, estimated_pages) + markdown, drive_metadata, error = await download_and_extract_content( drive_client, file ) @@ -363,6 +378,9 @@ async def _process_single_file( ) await pipeline.index(document, connector_doc, user_llm) + await page_limit_service.update_page_usage( + user_id, estimated_pages, 
allow_exceed=True + ) logger.info(f"Successfully indexed Google Drive file: {file_name}") return 1, 0, 0 @@ -458,18 +476,24 @@ async def _index_selected_files( user_id: str, enable_summary: bool, on_heartbeat: HeartbeatCallbackType | None = None, -) -> tuple[int, int, list[str]]: +) -> tuple[int, int, int, list[str]]: """Index user-selected files using the parallel pipeline. Phase 1 (serial): fetch metadata + skip checks. Phase 2+3 (parallel): download, ETL, index via _download_and_index. - Returns (indexed_count, skipped_count, errors). + Returns (indexed_count, skipped_count, unsupported_count, errors). """ + page_limit_service = PageLimitService(session) + pages_used, pages_limit = await page_limit_service.get_page_usage(user_id) + remaining_quota = pages_limit - pages_used + batch_estimated_pages = 0 + files_to_download: list[dict] = [] errors: list[str] = [] renamed_count = 0 skipped = 0 + unsupported_count = 0 for file_id, file_name in file_ids: file, error = await get_file_by_id(drive_client, file_id) @@ -480,12 +504,23 @@ async def _index_selected_files( skip, msg = await _should_skip_file(session, file, search_space_id) if skip: - if msg and "renamed" in msg.lower(): + if msg and msg.startswith("unsupported:"): + unsupported_count += 1 + elif msg and "renamed" in msg.lower(): renamed_count += 1 else: skipped += 1 continue + file_pages = PageLimitService.estimate_pages_from_metadata( + file.get("name", ""), file.get("size") + ) + if batch_estimated_pages + file_pages > remaining_quota: + display = file_name or file_id + errors.append(f"File '{display}': page limit would be exceeded") + continue + + batch_estimated_pages += file_pages files_to_download.append(file) await _create_drive_placeholders( @@ -507,7 +542,15 @@ async def _index_selected_files( on_heartbeat=on_heartbeat, ) - return renamed_count + batch_indexed, skipped, errors + if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0: + pages_to_deduct = max( + 1, 
batch_estimated_pages * batch_indexed // len(files_to_download) + ) + await page_limit_service.update_page_usage( + user_id, pages_to_deduct, allow_exceed=True + ) + + return renamed_count + batch_indexed, skipped, unsupported_count, errors # --------------------------------------------------------------------------- @@ -530,8 +573,11 @@ async def _index_full_scan( include_subfolders: bool = False, on_heartbeat_callback: HeartbeatCallbackType | None = None, enable_summary: bool = True, -) -> tuple[int, int]: - """Full scan indexing of a folder.""" +) -> tuple[int, int, int]: + """Full scan indexing of a folder. + + Returns (indexed, skipped, unsupported_count). + """ await task_logger.log_task_progress( log_entry, f"Starting full scan of folder: {folder_name} (include_subfolders={include_subfolders})", @@ -545,8 +591,15 @@ async def _index_full_scan( # ------------------------------------------------------------------ # Phase 1 (serial): collect files, run skip checks, track renames # ------------------------------------------------------------------ + page_limit_service = PageLimitService(session) + pages_used, pages_limit = await page_limit_service.get_page_usage(user_id) + remaining_quota = pages_limit - pages_used + batch_estimated_pages = 0 + page_limit_reached = False + renamed_count = 0 skipped = 0 + unsupported_count = 0 files_processed = 0 files_to_download: list[dict] = [] folders_to_process = [(folder_id, folder_name)] @@ -587,12 +640,28 @@ async def _index_full_scan( skip, msg = await _should_skip_file(session, file, search_space_id) if skip: - if msg and "renamed" in msg.lower(): + if msg and msg.startswith("unsupported:"): + unsupported_count += 1 + elif msg and "renamed" in msg.lower(): renamed_count += 1 else: skipped += 1 continue + file_pages = PageLimitService.estimate_pages_from_metadata( + file.get("name", ""), file.get("size") + ) + if batch_estimated_pages + file_pages > remaining_quota: + if not page_limit_reached: + logger.warning( + "Page 
limit reached during Google Drive full scan, " + "skipping remaining files" + ) + page_limit_reached = True + skipped += 1 + continue + + batch_estimated_pages += file_pages files_to_download.append(file) page_token = next_token @@ -636,11 +705,20 @@ async def _index_full_scan( on_heartbeat=on_heartbeat_callback, ) + if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0: + pages_to_deduct = max( + 1, batch_estimated_pages * batch_indexed // len(files_to_download) + ) + await page_limit_service.update_page_usage( + user_id, pages_to_deduct, allow_exceed=True + ) + indexed = renamed_count + batch_indexed logger.info( - f"Full scan complete: {indexed} indexed, {skipped} skipped, {failed} failed" + f"Full scan complete: {indexed} indexed, {skipped} skipped, " + f"{unsupported_count} unsupported, {failed} failed" ) - return indexed, skipped + return indexed, skipped, unsupported_count async def _index_with_delta_sync( @@ -658,8 +736,11 @@ async def _index_with_delta_sync( include_subfolders: bool = False, on_heartbeat_callback: HeartbeatCallbackType | None = None, enable_summary: bool = True, -) -> tuple[int, int]: - """Delta sync using change tracking.""" +) -> tuple[int, int, int]: + """Delta sync using change tracking. + + Returns (indexed, skipped, unsupported_count). 
+ """ await task_logger.log_task_progress( log_entry, f"Starting delta sync from token: {start_page_token[:20]}...", @@ -679,15 +760,22 @@ async def _index_with_delta_sync( if not changes: logger.info("No changes detected since last sync") - return 0, 0 + return 0, 0, 0 logger.info(f"Processing {len(changes)} changes") # ------------------------------------------------------------------ # Phase 1 (serial): handle removals, collect files for download # ------------------------------------------------------------------ + page_limit_service = PageLimitService(session) + pages_used, pages_limit = await page_limit_service.get_page_usage(user_id) + remaining_quota = pages_limit - pages_used + batch_estimated_pages = 0 + page_limit_reached = False + renamed_count = 0 skipped = 0 + unsupported_count = 0 files_to_download: list[dict] = [] files_processed = 0 @@ -709,12 +797,28 @@ async def _index_with_delta_sync( skip, msg = await _should_skip_file(session, file, search_space_id) if skip: - if msg and "renamed" in msg.lower(): + if msg and msg.startswith("unsupported:"): + unsupported_count += 1 + elif msg and "renamed" in msg.lower(): renamed_count += 1 else: skipped += 1 continue + file_pages = PageLimitService.estimate_pages_from_metadata( + file.get("name", ""), file.get("size") + ) + if batch_estimated_pages + file_pages > remaining_quota: + if not page_limit_reached: + logger.warning( + "Page limit reached during Google Drive delta sync, " + "skipping remaining files" + ) + page_limit_reached = True + skipped += 1 + continue + + batch_estimated_pages += file_pages files_to_download.append(file) # ------------------------------------------------------------------ @@ -742,11 +846,20 @@ async def _index_with_delta_sync( on_heartbeat=on_heartbeat_callback, ) + if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0: + pages_to_deduct = max( + 1, batch_estimated_pages * batch_indexed // len(files_to_download) + ) + await 
page_limit_service.update_page_usage( + user_id, pages_to_deduct, allow_exceed=True + ) + indexed = renamed_count + batch_indexed logger.info( - f"Delta sync complete: {indexed} indexed, {skipped} skipped, {failed} failed" + f"Delta sync complete: {indexed} indexed, {skipped} skipped, " + f"{unsupported_count} unsupported, {failed} failed" ) - return indexed, skipped + return indexed, skipped, unsupported_count # --------------------------------------------------------------------------- @@ -766,8 +879,11 @@ async def index_google_drive_files( max_files: int = 500, include_subfolders: bool = False, on_heartbeat_callback: HeartbeatCallbackType | None = None, -) -> tuple[int, int, str | None]: - """Index Google Drive files for a specific connector.""" +) -> tuple[int, int, str | None, int]: + """Index Google Drive files for a specific connector. + + Returns (indexed, skipped, error_or_none, unsupported_count). + """ task_logger = TaskLoggingService(session, search_space_id) log_entry = await task_logger.log_task_start( task_name="google_drive_files_indexing", @@ -793,7 +909,7 @@ async def index_google_drive_files( await task_logger.log_task_failure( log_entry, error_msg, None, {"error_type": "ConnectorNotFound"} ) - return 0, 0, error_msg + return 0, 0, error_msg, 0 await task_logger.log_task_progress( log_entry, @@ -812,7 +928,7 @@ async def index_google_drive_files( "Missing Composio account", {"error_type": "MissingComposioAccount"}, ) - return 0, 0, error_msg + return 0, 0, error_msg, 0 pre_built_credentials = build_composio_credentials(connected_account_id) else: token_encrypted = connector.config.get("_token_encrypted", False) @@ -827,6 +943,7 @@ async def index_google_drive_files( 0, 0, "SECRET_KEY not configured but credentials are marked as encrypted", + 0, ) connector_enable_summary = getattr(connector, "enable_summary", True) @@ -839,7 +956,7 @@ async def index_google_drive_files( await task_logger.log_task_failure( log_entry, error_msg, {"error_type": 
"MissingParameter"} ) - return 0, 0, error_msg + return 0, 0, error_msg, 0 target_folder_id = folder_id target_folder_name = folder_name or "Selected Folder" @@ -850,9 +967,11 @@ async def index_google_drive_files( use_delta_sync and start_page_token and connector.last_indexed_at ) + documents_unsupported = 0 + if can_use_delta: logger.info(f"Using delta sync for connector {connector_id}") - documents_indexed, documents_skipped = await _index_with_delta_sync( + documents_indexed, documents_skipped, du = await _index_with_delta_sync( drive_client, session, connector, @@ -868,8 +987,9 @@ async def index_google_drive_files( on_heartbeat_callback, connector_enable_summary, ) + documents_unsupported += du logger.info("Running reconciliation scan after delta sync") - ri, rs = await _index_full_scan( + ri, rs, ru = await _index_full_scan( drive_client, session, connector, @@ -887,9 +1007,14 @@ async def index_google_drive_files( ) documents_indexed += ri documents_skipped += rs + documents_unsupported += ru else: logger.info(f"Using full scan for connector {connector_id}") - documents_indexed, documents_skipped = await _index_full_scan( + ( + documents_indexed, + documents_skipped, + documents_unsupported, + ) = await _index_full_scan( drive_client, session, connector, @@ -924,14 +1049,17 @@ async def index_google_drive_files( { "files_processed": documents_indexed, "files_skipped": documents_skipped, + "files_unsupported": documents_unsupported, "sync_type": "delta" if can_use_delta else "full", "folder": target_folder_name, }, ) logger.info( - f"Google Drive indexing completed: {documents_indexed} indexed, {documents_skipped} skipped" + f"Google Drive indexing completed: {documents_indexed} indexed, " + f"{documents_skipped} skipped, {documents_unsupported} unsupported" ) - return documents_indexed, documents_skipped, None + + return documents_indexed, documents_skipped, None, documents_unsupported except SQLAlchemyError as db_error: await session.rollback() @@ -942,7 
+1070,7 @@ async def index_google_drive_files( {"error_type": "SQLAlchemyError"}, ) logger.error(f"Database error: {db_error!s}", exc_info=True) - return 0, 0, f"Database error: {db_error!s}" + return 0, 0, f"Database error: {db_error!s}", 0 except Exception as e: await session.rollback() await task_logger.log_task_failure( @@ -952,7 +1080,7 @@ async def index_google_drive_files( {"error_type": type(e).__name__}, ) logger.error(f"Failed to index Google Drive files: {e!s}", exc_info=True) - return 0, 0, f"Failed to index Google Drive files: {e!s}" + return 0, 0, f"Failed to index Google Drive files: {e!s}", 0 async def index_google_drive_single_file( @@ -1154,7 +1282,7 @@ async def index_google_drive_selected_files( session, connector_id, credentials=pre_built_credentials ) - indexed, skipped, errors = await _index_selected_files( + indexed, skipped, unsupported, errors = await _index_selected_files( drive_client, session, files, @@ -1165,6 +1293,11 @@ async def index_google_drive_selected_files( on_heartbeat=on_heartbeat_callback, ) + if unsupported > 0: + file_text = "file was" if unsupported == 1 else "files were" + unsup_msg = f"{unsupported} {file_text} not supported" + errors.append(unsup_msg) + await session.commit() if errors: @@ -1172,7 +1305,12 @@ async def index_google_drive_selected_files( log_entry, f"Batch file indexing completed with {len(errors)} error(s)", "; ".join(errors), - {"indexed": indexed, "skipped": skipped, "error_count": len(errors)}, + { + "indexed": indexed, + "skipped": skipped, + "unsupported": unsupported, + "error_count": len(errors), + }, ) else: await task_logger.log_task_success( diff --git a/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py b/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py new file mode 100644 index 000000000..7f42f4638 --- /dev/null +++ b/surfsense_backend/app/tasks/connector_indexers/local_folder_indexer.py @@ -0,0 +1,1083 @@ +""" +Local folder indexer. 
+ +Indexes files from a local folder on disk. Supports: +- Full-scan mode (startup reconciliation / manual trigger) +- Batch mode (chokidar real-time trigger, 1..N files) +- Filesystem folder structure mirroring into DB Folder rows +- Document versioning via create_version_snapshot +- ETL-based file parsing for binary formats (PDF, DOCX, images, audio, etc.) + +Desktop-only: all change detection is driven by chokidar in the desktop app. +Config (folder_path, exclude_patterns, etc.) is passed in from the caller — +no connector row is read. +""" + +import asyncio +import os +from collections.abc import Awaitable, Callable +from datetime import UTC, datetime +from pathlib import Path + +from sqlalchemy import select +from sqlalchemy.exc import IntegrityError, SQLAlchemyError +from sqlalchemy.ext.asyncio import AsyncSession + +from app.db import ( + Document, + DocumentStatus, + DocumentType, + Folder, +) +from app.indexing_pipeline.connector_document import ConnectorDocument +from app.indexing_pipeline.document_hashing import compute_identifier_hash +from app.indexing_pipeline.indexing_pipeline_service import IndexingPipelineService +from app.services.llm_service import get_user_long_context_llm +from app.services.page_limit_service import PageLimitExceededError, PageLimitService +from app.services.task_logging_service import TaskLoggingService +from app.tasks.celery_tasks import get_celery_session_maker +from app.utils.document_versioning import create_version_snapshot + +from .base import ( + check_document_by_unique_identifier, + logger, +) + +HeartbeatCallbackType = Callable[[int], Awaitable[None]] + + +def _estimate_pages_safe(page_limit_service: PageLimitService, file_path: str) -> int: + """Estimate page count with a file-size fallback.""" + try: + return page_limit_service.estimate_pages_before_processing(file_path) + except Exception: + file_size = os.path.getsize(file_path) + return max(1, file_size // (80 * 1024)) + + +async def _check_page_limit_or_skip( + 
page_limit_service: PageLimitService, + user_id: str, + file_path: str, +) -> int: + """Estimate pages and check the limit; raises PageLimitExceededError if over quota. + + Returns the estimated page count on success. + """ + estimated = _estimate_pages_safe(page_limit_service, file_path) + await page_limit_service.check_page_limit(user_id, estimated) + return estimated + + +def _compute_final_pages( + page_limit_service: PageLimitService, + estimated_pages: int, + content_length: int, +) -> int: + """Return the final page count as max(estimated, actual).""" + actual = page_limit_service.estimate_pages_from_content_length(content_length) + return max(estimated_pages, actual) + + +DEFAULT_EXCLUDE_PATTERNS = [ + ".git", + "node_modules", + "__pycache__", + ".DS_Store", + ".obsidian", + ".trash", +] + + +def scan_folder( + folder_path: str, + file_extensions: list[str] | None = None, + exclude_patterns: list[str] | None = None, +) -> list[dict]: + """Walk a directory and return a list of file entries. + + Args: + folder_path: Absolute path to the folder to scan. + file_extensions: If provided, only include files with these extensions + (e.g. [".md", ".txt"]). ``None`` means include all files. + exclude_patterns: Directory/file names to exclude. Any path component + matching one of these strings is skipped. + + Returns: + List of dicts with keys: path, relative_path, name, modified_at, size. 
+ """ + root = Path(folder_path) + if not root.exists(): + raise ValueError(f"Folder path does not exist: {folder_path}") + + if exclude_patterns is None: + exclude_patterns = [] + + files: list[dict] = [] + for dirpath, dirnames, filenames in os.walk(root): + rel_dir = Path(dirpath).relative_to(root) + + dirnames[:] = [d for d in dirnames if d not in exclude_patterns] + + if any(part in exclude_patterns for part in rel_dir.parts): + continue + + for fname in filenames: + if fname in exclude_patterns: + continue + + full = Path(dirpath) / fname + + if ( + file_extensions is not None + and full.suffix.lower() not in file_extensions + ): + continue + + try: + stat = full.stat() + rel_path = full.relative_to(root) + files.append( + { + "path": str(full), + "relative_path": str(rel_path), + "name": full.name, + "modified_at": datetime.fromtimestamp(stat.st_mtime, tz=UTC), + "size": stat.st_size, + } + ) + except OSError as e: + logger.warning(f"Could not stat file {full}: {e}") + + return files + + +async def _read_file_content(file_path: str, filename: str) -> str: + """Read file content via the unified ETL pipeline. + + All file types (plaintext, audio, direct-convert, document) are handled + by ``EtlPipelineService``. + """ + from app.etl_pipeline.etl_document import EtlRequest + from app.etl_pipeline.etl_pipeline_service import EtlPipelineService + + result = await EtlPipelineService().extract( + EtlRequest(file_path=file_path, filename=filename) + ) + return result.markdown_content + + +def _content_hash(content: str, search_space_id: int) -> str: + """SHA-256 hash of content scoped to a search space. + + Matches the format used by ``compute_content_hash`` in the unified + pipeline so that dedup checks are consistent. 
+ """ + import hashlib + + return hashlib.sha256(f"{search_space_id}:{content}".encode()).hexdigest() + + +async def _compute_file_content_hash( + file_path: str, + filename: str, + search_space_id: int, +) -> tuple[str, str]: + """Read a file (via ETL if needed) and compute its content hash. + + Returns (content_text, content_hash). + """ + content = await _read_file_content(file_path, filename) + return content, _content_hash(content, search_space_id) + + +async def _mirror_folder_structure( + session: AsyncSession, + folder_path: str, + folder_name: str, + search_space_id: int, + user_id: str, + root_folder_id: int | None = None, + exclude_patterns: list[str] | None = None, +) -> tuple[dict[str, int], int]: + """Mirror the local filesystem directory structure into DB Folder rows. + + Returns (mapping, root_folder_id) where mapping is + relative_dir_path -> folder_id. The empty string key maps to the root folder. + """ + root = Path(folder_path) + if exclude_patterns is None: + exclude_patterns = [] + + subdirs: list[str] = [] + for dirpath, dirnames, _ in os.walk(root): + dirnames[:] = [d for d in dirnames if d not in exclude_patterns] + rel = Path(dirpath).relative_to(root) + if any(part in exclude_patterns for part in rel.parts): + continue + rel_str = str(rel) if str(rel) != "." 
else "" + if rel_str: + subdirs.append(rel_str) + + subdirs.sort(key=lambda p: p.count(os.sep)) + + mapping: dict[str, int] = {} + + if root_folder_id: + existing = ( + await session.execute(select(Folder).where(Folder.id == root_folder_id)) + ).scalar_one_or_none() + if existing: + mapping[""] = existing.id + else: + root_folder_id = None + + if not root_folder_id: + root_folder = Folder( + name=folder_name, + search_space_id=search_space_id, + created_by_id=user_id, + position="a0", + ) + session.add(root_folder) + await session.flush() + mapping[""] = root_folder.id + root_folder_id = root_folder.id + + for rel_dir in subdirs: + dir_parts = Path(rel_dir).parts + dir_name = dir_parts[-1] + parent_rel = str(Path(*dir_parts[:-1])) if len(dir_parts) > 1 else "" + + parent_id = mapping.get(parent_rel, mapping[""]) + + existing_folder = ( + await session.execute( + select(Folder).where( + Folder.name == dir_name, + Folder.parent_id == parent_id, + Folder.search_space_id == search_space_id, + ) + ) + ).scalar_one_or_none() + + if existing_folder: + mapping[rel_dir] = existing_folder.id + else: + new_folder = Folder( + name=dir_name, + parent_id=parent_id, + search_space_id=search_space_id, + created_by_id=user_id, + position="a0", + ) + session.add(new_folder) + await session.flush() + mapping[rel_dir] = new_folder.id + + await session.flush() + return mapping, root_folder_id + + +async def _resolve_folder_for_file( + session: AsyncSession, + rel_path: str, + root_folder_id: int, + search_space_id: int, + user_id: str, +) -> int: + """Given a file's relative path, ensure all parent Folder rows exist and + return the folder_id for the file's immediate parent directory. + + For a file at "notes/daily/today.md", this ensures Folder rows exist for + "notes" and "notes/daily", and returns the id of "notes/daily". + For a file at "readme.md" (root level), returns root_folder_id. 
+ """ + parent_dir = str(Path(rel_path).parent) + if parent_dir == ".": + return root_folder_id + + parts = Path(parent_dir).parts + current_parent_id = root_folder_id + + for part in parts: + existing = ( + await session.execute( + select(Folder).where( + Folder.name == part, + Folder.parent_id == current_parent_id, + Folder.search_space_id == search_space_id, + ) + ) + ).scalar_one_or_none() + + if existing: + current_parent_id = existing.id + else: + new_folder = Folder( + name=part, + parent_id=current_parent_id, + search_space_id=search_space_id, + created_by_id=user_id, + position="a0", + ) + session.add(new_folder) + await session.flush() + current_parent_id = new_folder.id + + return current_parent_id + + +async def _cleanup_empty_folder_chain( + session: AsyncSession, + folder_id: int, + root_folder_id: int, +) -> None: + """Walk up from folder_id toward root, deleting empty folders (no docs, no + children). Stops at root_folder_id which is never deleted.""" + current_id = folder_id + while current_id and current_id != root_folder_id: + has_doc = ( + await session.execute( + select(Document.id).where(Document.folder_id == current_id).limit(1) + ) + ).scalar_one_or_none() + if has_doc is not None: + break + + has_child = ( + await session.execute( + select(Folder.id).where(Folder.parent_id == current_id).limit(1) + ) + ).scalar_one_or_none() + if has_child is not None: + break + + folder = ( + await session.execute(select(Folder).where(Folder.id == current_id)) + ).scalar_one_or_none() + if not folder: + break + + parent_id = folder.parent_id + await session.delete(folder) + await session.flush() + current_id = parent_id + + +async def _cleanup_empty_folders( + session: AsyncSession, + root_folder_id: int, + search_space_id: int, + existing_dirs_on_disk: set[str], + folder_mapping: dict[str, int], +) -> None: + """Delete Folder rows that are empty (no docs, no children) and no longer on disk.""" + from sqlalchemy import delete as sa_delete + + id_to_rel: 
dict[int, str] = {fid: rel for rel, fid in folder_mapping.items() if rel} + + all_folders = ( + ( + await session.execute( + select(Folder).where( + Folder.search_space_id == search_space_id, + Folder.id != root_folder_id, + ) + ) + ) + .scalars() + .all() + ) + + candidates: list[Folder] = [] + for folder in all_folders: + rel = id_to_rel.get(folder.id) + if rel and rel in existing_dirs_on_disk: + continue + candidates.append(folder) + + changed = True + while changed: + changed = False + remaining: list[Folder] = [] + for folder in candidates: + doc_exists = ( + await session.execute( + select(Document.id).where(Document.folder_id == folder.id).limit(1) + ) + ).scalar_one_or_none() + if doc_exists is not None: + remaining.append(folder) + continue + + child_exists = ( + await session.execute( + select(Folder.id).where(Folder.parent_id == folder.id).limit(1) + ) + ).scalar_one_or_none() + if child_exists is not None: + remaining.append(folder) + continue + + await session.execute(sa_delete(Folder).where(Folder.id == folder.id)) + changed = True + candidates = remaining + + +def _build_connector_doc( + title: str, + content: str, + relative_path: str, + folder_name: str, + *, + search_space_id: int, + user_id: str, + enable_summary: bool, +) -> ConnectorDocument: + """Build a ConnectorDocument from a local file's extracted content.""" + unique_id = f"{folder_name}:{relative_path}" + metadata = { + "folder_name": folder_name, + "file_path": relative_path, + "document_type": "Local Folder File", + "connector_type": "Local Folder", + } + fallback_summary = f"File: {title}\n\n{content[:4000]}" + + return ConnectorDocument( + title=title, + source_markdown=content, + unique_id=unique_id, + document_type=DocumentType.LOCAL_FOLDER_FILE, + search_space_id=search_space_id, + connector_id=None, + created_by_id=user_id, + should_summarize=enable_summary, + fallback_summary=fallback_summary, + metadata=metadata, + ) + + +async def index_local_folder( + session: AsyncSession, + 
search_space_id: int, + user_id: str, + folder_path: str, + folder_name: str, + exclude_patterns: list[str] | None = None, + file_extensions: list[str] | None = None, + root_folder_id: int | None = None, + enable_summary: bool = False, + target_file_paths: list[str] | None = None, + on_heartbeat_callback: HeartbeatCallbackType | None = None, +) -> tuple[int, int, int | None, str | None]: + """Index files from a local folder. + + Supports two modes: + - Batch (target_file_paths set): processes 1..N files. + Single-file uses the caller's session; multi-file fans out with per-file sessions. + - Full scan (no target paths): walks entire folder, handles new/changed/deleted files. + + Returns (indexed_count, skipped_count, root_folder_id, error_or_warning_message). + """ + task_logger = TaskLoggingService(session, search_space_id) + + log_entry = await task_logger.log_task_start( + task_name="local_folder_indexing", + source="local_folder_indexing_task", + message=f"Starting local folder indexing for {folder_name}", + metadata={ + "folder_path": folder_path, + "user_id": str(user_id), + "target_file_paths_count": len(target_file_paths) + if target_file_paths + else None, + }, + ) + + try: + if not folder_path or not os.path.exists(folder_path): + await task_logger.log_task_failure( + log_entry, + f"Folder path missing or does not exist: {folder_path}", + "Folder not found", + {}, + ) + return ( + 0, + 0, + root_folder_id, + f"Folder path missing or does not exist: {folder_path}", + ) + + if exclude_patterns is None: + exclude_patterns = DEFAULT_EXCLUDE_PATTERNS + + # ==================================================================== + # BATCH MODE (1..N files) + # ==================================================================== + if target_file_paths: + if len(target_file_paths) == 1: + indexed, skipped, err = await _index_single_file( + session=session, + search_space_id=search_space_id, + user_id=user_id, + folder_path=folder_path, + folder_name=folder_name, + 
target_file_path=target_file_paths[0], + enable_summary=enable_summary, + root_folder_id=root_folder_id, + task_logger=task_logger, + log_entry=log_entry, + ) + return indexed, skipped, root_folder_id, err + + indexed, failed, err = await _index_batch_files( + search_space_id=search_space_id, + user_id=user_id, + folder_path=folder_path, + folder_name=folder_name, + target_file_paths=target_file_paths, + enable_summary=enable_summary, + root_folder_id=root_folder_id, + on_progress_callback=on_heartbeat_callback, + ) + if err: + await task_logger.log_task_success( + log_entry, + f"Batch indexing: {indexed} indexed, {failed} failed", + {"indexed": indexed, "failed": failed}, + ) + else: + await task_logger.log_task_success( + log_entry, + f"Batch indexing complete: {indexed} indexed", + {"indexed": indexed, "failed": failed}, + ) + return indexed, failed, root_folder_id, err + + # ==================================================================== + # FULL-SCAN MODE + # ==================================================================== + + await task_logger.log_task_progress( + log_entry, "Mirroring folder structure", {"stage": "folder_mirror"} + ) + + folder_mapping, root_folder_id = await _mirror_folder_structure( + session=session, + folder_path=folder_path, + folder_name=folder_name, + search_space_id=search_space_id, + user_id=user_id, + root_folder_id=root_folder_id, + exclude_patterns=exclude_patterns, + ) + await session.flush() + + try: + files = scan_folder(folder_path, file_extensions, exclude_patterns) + except Exception as e: + await task_logger.log_task_failure( + log_entry, f"Failed to scan folder: {e}", "Scan error", {} + ) + return 0, 0, root_folder_id, f"Failed to scan folder: {e}" + + logger.info(f"Found {len(files)} files in folder") + + indexed_count = 0 + skipped_count = 0 + failed_count = 0 + + page_limit_service = PageLimitService(session) + + # ================================================================ + # PHASE 1: Pre-filter files 
(mtime / content-hash), version changed + # ================================================================ + connector_docs: list[ConnectorDocument] = [] + file_meta_map: dict[str, dict] = {} + seen_unique_hashes: set[str] = set() + + for file_info in files: + try: + relative_path = file_info["relative_path"] + file_path_abs = file_info["path"] + + unique_identifier = f"{folder_name}:{relative_path}" + unique_identifier_hash = compute_identifier_hash( + DocumentType.LOCAL_FOLDER_FILE.value, + unique_identifier, + search_space_id, + ) + seen_unique_hashes.add(unique_identifier_hash) + + existing_document = await check_document_by_unique_identifier( + session, unique_identifier_hash + ) + + if existing_document: + stored_mtime = (existing_document.document_metadata or {}).get( + "mtime" + ) + current_mtime = file_info["modified_at"].timestamp() + + if stored_mtime and abs(current_mtime - stored_mtime) < 1.0: + if not DocumentStatus.is_state( + existing_document.status, DocumentStatus.READY + ): + existing_document.status = DocumentStatus.ready() + skipped_count += 1 + continue + + try: + estimated_pages = await _check_page_limit_or_skip( + page_limit_service, user_id, file_path_abs + ) + except PageLimitExceededError: + logger.warning( + f"Page limit exceeded, skipping: {file_path_abs}" + ) + failed_count += 1 + continue + + try: + content, content_hash = await _compute_file_content_hash( + file_path_abs, file_info["relative_path"], search_space_id + ) + except Exception as read_err: + logger.warning(f"Could not read {file_path_abs}: {read_err}") + skipped_count += 1 + continue + + if existing_document.content_hash == content_hash: + meta = dict(existing_document.document_metadata or {}) + meta["mtime"] = current_mtime + existing_document.document_metadata = meta + if not DocumentStatus.is_state( + existing_document.status, DocumentStatus.READY + ): + existing_document.status = DocumentStatus.ready() + skipped_count += 1 + continue + + await 
create_version_snapshot(session, existing_document) + else: + try: + estimated_pages = await _check_page_limit_or_skip( + page_limit_service, user_id, file_path_abs + ) + except PageLimitExceededError: + logger.warning( + f"Page limit exceeded, skipping: {file_path_abs}" + ) + failed_count += 1 + continue + + try: + content, content_hash = await _compute_file_content_hash( + file_path_abs, file_info["relative_path"], search_space_id + ) + except Exception as read_err: + logger.warning(f"Could not read {file_path_abs}: {read_err}") + skipped_count += 1 + continue + + if not content.strip(): + skipped_count += 1 + continue + + doc = _build_connector_doc( + title=file_info["name"], + content=content, + relative_path=relative_path, + folder_name=folder_name, + search_space_id=search_space_id, + user_id=user_id, + enable_summary=enable_summary, + ) + connector_docs.append(doc) + file_meta_map[unique_identifier] = { + "relative_path": relative_path, + "mtime": file_info["modified_at"].timestamp(), + "estimated_pages": estimated_pages, + "content_length": len(content), + } + + except Exception as e: + logger.exception(f"Phase 1 error for {file_info.get('path')}: {e}") + failed_count += 1 + + # ================================================================ + # PHASE 1.5: Delete documents no longer on disk + # ================================================================ + all_root_folder_ids = set(folder_mapping.values()) + all_db_folders = ( + ( + await session.execute( + select(Folder.id).where( + Folder.search_space_id == search_space_id, + ) + ) + ) + .scalars() + .all() + ) + all_root_folder_ids.update(all_db_folders) + + all_folder_docs = ( + ( + await session.execute( + select(Document).where( + Document.document_type == DocumentType.LOCAL_FOLDER_FILE, + Document.search_space_id == search_space_id, + Document.folder_id.in_(list(all_root_folder_ids)), + ) + ) + ) + .scalars() + .all() + ) + + for doc in all_folder_docs: + if doc.unique_identifier_hash not in 
seen_unique_hashes: + await session.delete(doc) + + await session.flush() + + # ================================================================ + # PHASE 2: Index via unified pipeline + # ================================================================ + if connector_docs: + from app.indexing_pipeline.document_hashing import ( + compute_unique_identifier_hash, + ) + + pipeline = IndexingPipelineService(session) + doc_map = {compute_unique_identifier_hash(cd): cd for cd in connector_docs} + documents = await pipeline.prepare_for_indexing(connector_docs) + + # Assign folder_id immediately so docs appear in the correct + # folder while still pending/processing (visible via Zero sync). + for document in documents: + cd = doc_map.get(document.unique_identifier_hash) + if cd is None: + continue + rel_path = (cd.metadata or {}).get("file_path", "") + parent_dir = str(Path(rel_path).parent) if rel_path else "" + if parent_dir == ".": + parent_dir = "" + document.folder_id = folder_mapping.get( + parent_dir, folder_mapping.get("") + ) + try: + await session.commit() + except IntegrityError: + await session.rollback() + for document in documents: + await session.refresh(document) + + llm = await get_user_long_context_llm(session, user_id, search_space_id) + + for document in documents: + connector_doc = doc_map.get(document.unique_identifier_hash) + if connector_doc is None: + failed_count += 1 + continue + + result = await pipeline.index(document, connector_doc, llm) + + if DocumentStatus.is_state(result.status, DocumentStatus.READY): + indexed_count += 1 + + unique_id = connector_doc.unique_id + mtime_info = file_meta_map.get(unique_id, {}) + + doc_meta = dict(result.document_metadata or {}) + doc_meta["mtime"] = mtime_info.get("mtime") + result.document_metadata = doc_meta + + est = mtime_info.get("estimated_pages", 1) + content_len = mtime_info.get("content_length", 0) + final_pages = _compute_final_pages( + page_limit_service, est, content_len + ) + await 
page_limit_service.update_page_usage( + user_id, final_pages, allow_exceed=True + ) + else: + failed_count += 1 + + if on_heartbeat_callback and indexed_count % 5 == 0: + await on_heartbeat_callback(indexed_count) + + # Cleanup empty folders + existing_dirs = set() + for dirpath, dirnames, _ in os.walk(folder_path): + dirnames[:] = [d for d in dirnames if d not in exclude_patterns] + rel = str(Path(dirpath).relative_to(folder_path)) + if rel == ".": + rel = "" + if rel and not any(part in exclude_patterns for part in Path(rel).parts): + existing_dirs.add(rel) + + root_fid = folder_mapping.get("") + if root_fid: + await _cleanup_empty_folders( + session, root_fid, search_space_id, existing_dirs, folder_mapping + ) + + try: + await session.commit() + except Exception as e: + if "duplicate key value violates unique constraint" in str(e).lower(): + logger.warning(f"Duplicate key during commit: {e}") + await session.rollback() + else: + raise + + warning_parts = [] + if failed_count > 0: + warning_parts.append(f"{failed_count} failed") + warning_message = ", ".join(warning_parts) if warning_parts else None + + await task_logger.log_task_success( + log_entry, + f"Completed local folder indexing for {folder_name}", + { + "indexed": indexed_count, + "skipped": skipped_count, + "failed": failed_count, + }, + ) + + return indexed_count, skipped_count, root_folder_id, warning_message + + except SQLAlchemyError as e: + logger.exception(f"Database error during local folder indexing: {e}") + await session.rollback() + await task_logger.log_task_failure( + log_entry, f"DB error: {e}", "Database error", {} + ) + return 0, 0, root_folder_id, f"Database error: {e}" + + except Exception as e: + logger.exception(f"Error during local folder indexing: {e}") + await task_logger.log_task_failure( + log_entry, f"Error: {e}", "Unexpected error", {} + ) + return 0, 0, root_folder_id, str(e) + + +BATCH_CONCURRENCY = 5 + + +async def _index_batch_files( + search_space_id: int, + user_id: str, 
+ folder_path: str, + folder_name: str, + target_file_paths: list[str], + enable_summary: bool, + root_folder_id: int | None, + on_progress_callback: HeartbeatCallbackType | None = None, +) -> tuple[int, int, str | None]: + """Process multiple files in parallel with bounded concurrency. + + Each file gets its own DB session so they can run concurrently. + Returns (indexed_count, failed_count, error_summary_or_none). + """ + semaphore = asyncio.Semaphore(BATCH_CONCURRENCY) + indexed = 0 + failed = 0 + errors: list[str] = [] + lock = asyncio.Lock() + completed = 0 + + async def process_one(file_path: str) -> None: + nonlocal indexed, failed, completed + async with semaphore: + try: + async with get_celery_session_maker()() as file_session: + task_logger = TaskLoggingService(file_session, search_space_id) + log_entry = await task_logger.log_task_start( + task_name="local_folder_indexing", + source="local_folder_batch_indexing", + message=f"Batch: indexing {Path(file_path).name}", + metadata={"file_path": file_path}, + ) + ix, _sk, err = await _index_single_file( + session=file_session, + search_space_id=search_space_id, + user_id=user_id, + folder_path=folder_path, + folder_name=folder_name, + target_file_path=file_path, + enable_summary=enable_summary, + root_folder_id=root_folder_id, + task_logger=task_logger, + log_entry=log_entry, + ) + async with lock: + indexed += ix + if err: + failed += 1 + errors.append(f"{Path(file_path).name}: {err}") + completed += 1 + if on_progress_callback and completed % BATCH_CONCURRENCY == 0: + await on_progress_callback(completed) + except Exception as exc: + logger.exception(f"Batch: error processing {file_path}: {exc}") + async with lock: + failed += 1 + completed += 1 + errors.append(f"{Path(file_path).name}: {exc}") + + await asyncio.gather(*[process_one(fp) for fp in target_file_paths]) + + if on_progress_callback: + await on_progress_callback(completed) + + error_summary = None + if errors: + error_summary = f"{failed} file(s) 
failed: " + "; ".join(errors[:5]) + if len(errors) > 5: + error_summary += f" ... and {len(errors) - 5} more" + + return indexed, failed, error_summary + + +async def _index_single_file( + session: AsyncSession, + search_space_id: int, + user_id: str, + folder_path: str, + folder_name: str, + target_file_path: str, + enable_summary: bool, + root_folder_id: int | None, + task_logger, + log_entry, +) -> tuple[int, int, str | None]: + """Process a single file (chokidar real-time trigger).""" + try: + full_path = Path(target_file_path) + if not full_path.exists(): + rel = str(full_path.relative_to(folder_path)) + unique_id = f"{folder_name}:{rel}" + uid_hash = compute_identifier_hash( + DocumentType.LOCAL_FOLDER_FILE.value, unique_id, search_space_id + ) + existing = await check_document_by_unique_identifier(session, uid_hash) + if existing: + deleted_folder_id = existing.folder_id + await session.delete(existing) + await session.flush() + if deleted_folder_id and root_folder_id: + await _cleanup_empty_folder_chain( + session, deleted_folder_id, root_folder_id + ) + await session.commit() + return 0, 0, None + return 0, 0, None + + rel_path = str(full_path.relative_to(folder_path)) + + unique_id = f"{folder_name}:{rel_path}" + uid_hash = compute_identifier_hash( + DocumentType.LOCAL_FOLDER_FILE.value, unique_id, search_space_id + ) + + page_limit_service = PageLimitService(session) + try: + estimated_pages = await _check_page_limit_or_skip( + page_limit_service, user_id, str(full_path) + ) + except PageLimitExceededError as e: + return 0, 1, f"Page limit exceeded: {e}" + + try: + content, content_hash = await _compute_file_content_hash( + str(full_path), full_path.name, search_space_id + ) + except Exception as e: + return 0, 1, f"Could not read file: {e}" + + if not content.strip(): + return 0, 1, None + + existing = await check_document_by_unique_identifier(session, uid_hash) + + if existing: + if existing.content_hash == content_hash: + mtime = 
full_path.stat().st_mtime + meta = dict(existing.document_metadata or {}) + meta["mtime"] = mtime + existing.document_metadata = meta + await session.commit() + return 0, 1, None + + await create_version_snapshot(session, existing) + + mtime = full_path.stat().st_mtime + + connector_doc = _build_connector_doc( + title=full_path.name, + content=content, + relative_path=rel_path, + folder_name=folder_name, + search_space_id=search_space_id, + user_id=user_id, + enable_summary=enable_summary, + ) + + pipeline = IndexingPipelineService(session) + llm = await get_user_long_context_llm(session, user_id, search_space_id) + documents = await pipeline.prepare_for_indexing([connector_doc]) + + if not documents: + return 0, 1, None + + db_doc = documents[0] + + if root_folder_id: + try: + db_doc.folder_id = await _resolve_folder_for_file( + session, rel_path, root_folder_id, search_space_id, user_id + ) + await session.commit() + except IntegrityError: + await session.rollback() + await session.refresh(db_doc) + + await pipeline.index(db_doc, connector_doc, llm) + + await session.refresh(db_doc) + doc_meta = dict(db_doc.document_metadata or {}) + doc_meta["mtime"] = mtime + db_doc.document_metadata = doc_meta + await session.commit() + + indexed = ( + 1 if DocumentStatus.is_state(db_doc.status, DocumentStatus.READY) else 0 + ) + failed_msg = None if indexed else "Indexing failed" + + if indexed: + final_pages = _compute_final_pages( + page_limit_service, estimated_pages, len(content) + ) + await page_limit_service.update_page_usage( + user_id, final_pages, allow_exceed=True + ) + await task_logger.log_task_success( + log_entry, + f"Single file indexed: {rel_path}", + {"file": rel_path, "pages_processed": final_pages}, + ) + return indexed, 0 if indexed else 1, failed_msg + + except Exception as e: + logger.exception(f"Error indexing single file {target_file_path}: {e}") + await session.rollback() + return 0, 0, str(e) diff --git 
a/surfsense_backend/app/tasks/connector_indexers/onedrive_indexer.py b/surfsense_backend/app/tasks/connector_indexers/onedrive_indexer.py index 748cb0988..06517f542 100644 --- a/surfsense_backend/app/tasks/connector_indexers/onedrive_indexer.py +++ b/surfsense_backend/app/tasks/connector_indexers/onedrive_indexer.py @@ -28,6 +28,7 @@ from app.indexing_pipeline.connector_document import ConnectorDocument from app.indexing_pipeline.document_hashing import compute_identifier_hash from app.indexing_pipeline.indexing_pipeline_service import IndexingPipelineService from app.services.llm_service import get_user_long_context_llm +from app.services.page_limit_service import PageLimitService from app.services.task_logging_service import TaskLoggingService from app.tasks.connector_indexers.base import ( check_document_by_unique_identifier, @@ -55,7 +56,10 @@ async def _should_skip_file( file_id = file.get("id") file_name = file.get("name", "Unknown") - if skip_item(file): + skip, unsup_ext = skip_item(file) + if skip: + if unsup_ext: + return True, f"unsupported:{unsup_ext}" return True, "folder/onenote/remote" if not file_id: return True, "missing file_id" @@ -289,12 +293,18 @@ async def _index_selected_files( user_id: str, enable_summary: bool, on_heartbeat: HeartbeatCallbackType | None = None, -) -> tuple[int, int, list[str]]: +) -> tuple[int, int, int, list[str]]: """Index user-selected files using the parallel pipeline.""" + page_limit_service = PageLimitService(session) + pages_used, pages_limit = await page_limit_service.get_page_usage(user_id) + remaining_quota = pages_limit - pages_used + batch_estimated_pages = 0 + files_to_download: list[dict] = [] errors: list[str] = [] renamed_count = 0 skipped = 0 + unsupported_count = 0 for file_id, file_name in file_ids: file, error = await get_file_by_id(onedrive_client, file_id) @@ -305,12 +315,23 @@ async def _index_selected_files( skip, msg = await _should_skip_file(session, file, search_space_id) if skip: - if msg and 
"renamed" in msg.lower(): + if msg and msg.startswith("unsupported:"): + unsupported_count += 1 + elif msg and "renamed" in msg.lower(): renamed_count += 1 else: skipped += 1 continue + file_pages = PageLimitService.estimate_pages_from_metadata( + file.get("name", ""), file.get("size") + ) + if batch_estimated_pages + file_pages > remaining_quota: + display = file_name or file_id + errors.append(f"File '{display}': page limit would be exceeded") + continue + + batch_estimated_pages += file_pages files_to_download.append(file) batch_indexed, _failed = await _download_and_index( @@ -324,7 +345,15 @@ async def _index_selected_files( on_heartbeat=on_heartbeat, ) - return renamed_count + batch_indexed, skipped, errors + if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0: + pages_to_deduct = max( + 1, batch_estimated_pages * batch_indexed // len(files_to_download) + ) + await page_limit_service.update_page_usage( + user_id, pages_to_deduct, allow_exceed=True + ) + + return renamed_count + batch_indexed, skipped, unsupported_count, errors # --------------------------------------------------------------------------- @@ -346,8 +375,11 @@ async def _index_full_scan( include_subfolders: bool = True, on_heartbeat_callback: HeartbeatCallbackType | None = None, enable_summary: bool = True, -) -> tuple[int, int]: - """Full scan indexing of a folder.""" +) -> tuple[int, int, int]: + """Full scan indexing of a folder. + + Returns (indexed, skipped, unsupported_count). 
+ """ await task_logger.log_task_progress( log_entry, f"Starting full scan of folder: {folder_name}", @@ -358,8 +390,15 @@ async def _index_full_scan( }, ) + page_limit_service = PageLimitService(session) + pages_used, pages_limit = await page_limit_service.get_page_usage(user_id) + remaining_quota = pages_limit - pages_used + batch_estimated_pages = 0 + page_limit_reached = False + renamed_count = 0 skipped = 0 + unsupported_count = 0 files_to_download: list[dict] = [] all_files, error = await get_files_in_folder( @@ -378,11 +417,28 @@ async def _index_full_scan( for file in all_files[:max_files]: skip, msg = await _should_skip_file(session, file, search_space_id) if skip: - if msg and "renamed" in msg.lower(): + if msg and msg.startswith("unsupported:"): + unsupported_count += 1 + elif msg and "renamed" in msg.lower(): renamed_count += 1 else: skipped += 1 continue + + file_pages = PageLimitService.estimate_pages_from_metadata( + file.get("name", ""), file.get("size") + ) + if batch_estimated_pages + file_pages > remaining_quota: + if not page_limit_reached: + logger.warning( + "Page limit reached during OneDrive full scan, " + "skipping remaining files" + ) + page_limit_reached = True + skipped += 1 + continue + + batch_estimated_pages += file_pages files_to_download.append(file) batch_indexed, failed = await _download_and_index( @@ -396,11 +452,20 @@ async def _index_full_scan( on_heartbeat=on_heartbeat_callback, ) + if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0: + pages_to_deduct = max( + 1, batch_estimated_pages * batch_indexed // len(files_to_download) + ) + await page_limit_service.update_page_usage( + user_id, pages_to_deduct, allow_exceed=True + ) + indexed = renamed_count + batch_indexed logger.info( - f"Full scan complete: {indexed} indexed, {skipped} skipped, {failed} failed" + f"Full scan complete: {indexed} indexed, {skipped} skipped, " + f"{unsupported_count} unsupported, {failed} failed" ) - return indexed, skipped + 
return indexed, skipped, unsupported_count async def _index_with_delta_sync( @@ -416,8 +481,11 @@ async def _index_with_delta_sync( max_files: int, on_heartbeat_callback: HeartbeatCallbackType | None = None, enable_summary: bool = True, -) -> tuple[int, int, str | None]: - """Delta sync using OneDrive change tracking. Returns (indexed, skipped, new_delta_link).""" +) -> tuple[int, int, int, str | None]: + """Delta sync using OneDrive change tracking. + + Returns (indexed, skipped, unsupported_count, new_delta_link). + """ await task_logger.log_task_progress( log_entry, "Starting delta sync", @@ -437,12 +505,19 @@ async def _index_with_delta_sync( if not changes: logger.info("No changes detected since last sync") - return 0, 0, new_delta_link + return 0, 0, 0, new_delta_link logger.info(f"Processing {len(changes)} delta changes") + page_limit_service = PageLimitService(session) + pages_used, pages_limit = await page_limit_service.get_page_usage(user_id) + remaining_quota = pages_limit - pages_used + batch_estimated_pages = 0 + page_limit_reached = False + renamed_count = 0 skipped = 0 + unsupported_count = 0 files_to_download: list[dict] = [] files_processed = 0 @@ -465,12 +540,28 @@ async def _index_with_delta_sync( skip, msg = await _should_skip_file(session, change, search_space_id) if skip: - if msg and "renamed" in msg.lower(): + if msg and msg.startswith("unsupported:"): + unsupported_count += 1 + elif msg and "renamed" in msg.lower(): renamed_count += 1 else: skipped += 1 continue + file_pages = PageLimitService.estimate_pages_from_metadata( + change.get("name", ""), change.get("size") + ) + if batch_estimated_pages + file_pages > remaining_quota: + if not page_limit_reached: + logger.warning( + "Page limit reached during OneDrive delta sync, " + "skipping remaining files" + ) + page_limit_reached = True + skipped += 1 + continue + + batch_estimated_pages += file_pages files_to_download.append(change) batch_indexed, failed = await _download_and_index( @@ 
-484,11 +575,20 @@ async def _index_with_delta_sync( on_heartbeat=on_heartbeat_callback, ) + if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0: + pages_to_deduct = max( + 1, batch_estimated_pages * batch_indexed // len(files_to_download) + ) + await page_limit_service.update_page_usage( + user_id, pages_to_deduct, allow_exceed=True + ) + indexed = renamed_count + batch_indexed logger.info( - f"Delta sync complete: {indexed} indexed, {skipped} skipped, {failed} failed" + f"Delta sync complete: {indexed} indexed, {skipped} skipped, " + f"{unsupported_count} unsupported, {failed} failed" ) - return indexed, skipped, new_delta_link + return indexed, skipped, unsupported_count, new_delta_link # --------------------------------------------------------------------------- @@ -502,7 +602,7 @@ async def index_onedrive_files( search_space_id: int, user_id: str, items_dict: dict, -) -> tuple[int, int, str | None]: +) -> tuple[int, int, str | None, int]: """Index OneDrive files for a specific connector. 
items_dict format: @@ -529,7 +629,7 @@ async def index_onedrive_files( await task_logger.log_task_failure( log_entry, error_msg, None, {"error_type": "ConnectorNotFound"} ) - return 0, 0, error_msg + return 0, 0, error_msg, 0 token_encrypted = connector.config.get("_token_encrypted", False) if token_encrypted and not config.SECRET_KEY: @@ -540,7 +640,7 @@ async def index_onedrive_files( "Missing SECRET_KEY", {"error_type": "MissingSecretKey"}, ) - return 0, 0, error_msg + return 0, 0, error_msg, 0 connector_enable_summary = getattr(connector, "enable_summary", True) onedrive_client = OneDriveClient(session, connector_id) @@ -552,12 +652,13 @@ async def index_onedrive_files( total_indexed = 0 total_skipped = 0 + total_unsupported = 0 # Index selected individual files selected_files = items_dict.get("files", []) if selected_files: file_tuples = [(f["id"], f.get("name")) for f in selected_files] - indexed, skipped, _errors = await _index_selected_files( + indexed, skipped, unsupported, _errors = await _index_selected_files( onedrive_client, session, file_tuples, @@ -568,6 +669,7 @@ async def index_onedrive_files( ) total_indexed += indexed total_skipped += skipped + total_unsupported += unsupported # Index selected folders folders = items_dict.get("folders", []) @@ -581,7 +683,7 @@ async def index_onedrive_files( if can_use_delta: logger.info(f"Using delta sync for folder {folder_name}") - indexed, skipped, new_delta_link = await _index_with_delta_sync( + indexed, skipped, unsup, new_delta_link = await _index_with_delta_sync( onedrive_client, session, connector_id, @@ -596,6 +698,7 @@ async def index_onedrive_files( ) total_indexed += indexed total_skipped += skipped + total_unsupported += unsup if new_delta_link: await session.refresh(connector) @@ -605,7 +708,7 @@ async def index_onedrive_files( flag_modified(connector, "config") # Reconciliation full scan - ri, rs = await _index_full_scan( + ri, rs, ru = await _index_full_scan( onedrive_client, session, 
connector_id, @@ -621,9 +724,10 @@ async def index_onedrive_files( ) total_indexed += ri total_skipped += rs + total_unsupported += ru else: logger.info(f"Using full scan for folder {folder_name}") - indexed, skipped = await _index_full_scan( + indexed, skipped, unsup = await _index_full_scan( onedrive_client, session, connector_id, @@ -639,6 +743,7 @@ async def index_onedrive_files( ) total_indexed += indexed total_skipped += skipped + total_unsupported += unsup # Store new delta link for this folder _, new_delta_link, _ = await onedrive_client.get_delta(folder_id=folder_id) @@ -657,12 +762,18 @@ async def index_onedrive_files( await task_logger.log_task_success( log_entry, f"Successfully completed OneDrive indexing for connector {connector_id}", - {"files_processed": total_indexed, "files_skipped": total_skipped}, + { + "files_processed": total_indexed, + "files_skipped": total_skipped, + "files_unsupported": total_unsupported, + }, ) logger.info( - f"OneDrive indexing completed: {total_indexed} indexed, {total_skipped} skipped" + f"OneDrive indexing completed: {total_indexed} indexed, " + f"{total_skipped} skipped, {total_unsupported} unsupported" ) - return total_indexed, total_skipped, None + + return total_indexed, total_skipped, None, total_unsupported except SQLAlchemyError as db_error: await session.rollback() @@ -673,7 +784,7 @@ async def index_onedrive_files( {"error_type": "SQLAlchemyError"}, ) logger.error(f"Database error: {db_error!s}", exc_info=True) - return 0, 0, f"Database error: {db_error!s}" + return 0, 0, f"Database error: {db_error!s}", 0 except Exception as e: await session.rollback() await task_logger.log_task_failure( @@ -683,4 +794,4 @@ async def index_onedrive_files( {"error_type": type(e).__name__}, ) logger.error(f"Failed to index OneDrive files: {e!s}", exc_info=True) - return 0, 0, f"Failed to index OneDrive files: {e!s}" + return 0, 0, f"Failed to index OneDrive files: {e!s}", 0 diff --git 
a/surfsense_backend/app/tasks/document_processors/__init__.py b/surfsense_backend/app/tasks/document_processors/__init__.py index e70c41cb4..f82c10883 100644 --- a/surfsense_backend/app/tasks/document_processors/__init__.py +++ b/surfsense_backend/app/tasks/document_processors/__init__.py @@ -1,43 +1,17 @@ """ Document processors module for background tasks. -This module provides a collection of document processors for different content types -and sources. Each processor is responsible for handling a specific type of document -processing task in the background. - -Available processors: -- Extension processor: Handle documents from browser extension -- Markdown processor: Process markdown files -- File processors: Handle files using different ETL services (Unstructured, LlamaCloud, Docling) -- YouTube processor: Process YouTube videos and extract transcripts +Content extraction is handled by ``app.etl_pipeline.EtlPipelineService``. +This package keeps orchestration (save, notify, page-limit) and +non-ETL processors (extension, markdown, youtube). 
""" -# URL crawler -# Extension processor from .extension_processor import add_extension_received_document - -# File processors -from .file_processors import ( - add_received_file_document_using_docling, - add_received_file_document_using_llamacloud, - add_received_file_document_using_unstructured, -) - -# Markdown processor from .markdown_processor import add_received_markdown_file_document - -# YouTube processor from .youtube_processor import add_youtube_video_document __all__ = [ - # Extension processing "add_extension_received_document", - "add_received_file_document_using_docling", - "add_received_file_document_using_llamacloud", - # File processing with different ETL services - "add_received_file_document_using_unstructured", - # Markdown file processing "add_received_markdown_file_document", - # YouTube video processing "add_youtube_video_document", ] diff --git a/surfsense_backend/app/tasks/document_processors/_direct_converters.py b/surfsense_backend/app/tasks/document_processors/_direct_converters.py new file mode 100644 index 000000000..bbff4838e --- /dev/null +++ b/surfsense_backend/app/tasks/document_processors/_direct_converters.py @@ -0,0 +1,91 @@ +""" +Lossless file-to-markdown converters for text-based formats. + +These converters handle file types that can be faithfully represented as +markdown without any external ETL/OCR service: + +- CSV / TSV → markdown table (stdlib ``csv``) +- HTML / HTM / XHTML → markdown (``markdownify``) +""" + +from __future__ import annotations + +import csv +from collections.abc import Callable +from pathlib import Path + +from markdownify import markdownify + +# The stdlib csv module defaults to a 128 KB field-size limit which is too +# small for real-world exports (e.g. chat logs, CRM dumps). We raise it once +# at import time so every csv.reader call in this module can handle large fields. 
+csv.field_size_limit(2**31 - 1) + + +def _escape_pipe(cell: str) -> str: + """Escape literal pipe characters inside a markdown table cell.""" + return cell.replace("|", "\\|") + + +def csv_to_markdown(file_path: str, *, delimiter: str = ",") -> str: + """Convert a CSV (or TSV) file to a markdown table. + + The first row is treated as the header. An empty file returns an + empty string so the caller can decide how to handle it. + """ + with open(file_path, encoding="utf-8", newline="") as fh: + reader = csv.reader(fh, delimiter=delimiter) + rows = list(reader) + + if not rows: + return "" + + header, *body = rows + col_count = len(header) + + lines: list[str] = [] + + header_cells = [_escape_pipe(c.strip()) for c in header] + lines.append("| " + " | ".join(header_cells) + " |") + lines.append("| " + " | ".join(["---"] * col_count) + " |") + + for row in body: + padded = row + [""] * (col_count - len(row)) + cells = [_escape_pipe(c.strip()) for c in padded[:col_count]] + lines.append("| " + " | ".join(cells) + " |") + + return "\n".join(lines) + "\n" + + +def tsv_to_markdown(file_path: str) -> str: + """Convert a TSV file to a markdown table.""" + return csv_to_markdown(file_path, delimiter="\t") + + +def html_to_markdown(file_path: str) -> str: + """Convert an HTML file to markdown via ``markdownify``.""" + html = Path(file_path).read_text(encoding="utf-8") + return markdownify(html).strip() + + +_CONVERTER_MAP: dict[str, Callable[..., str]] = { + ".csv": csv_to_markdown, + ".tsv": tsv_to_markdown, + ".html": html_to_markdown, + ".htm": html_to_markdown, + ".xhtml": html_to_markdown, +} + + +def convert_file_directly(file_path: str, filename: str) -> str: + """Dispatch to the appropriate lossless converter based on file extension. + + Raises ``ValueError`` if the extension is not supported. 
+ """ + suffix = Path(filename).suffix.lower() + converter = _CONVERTER_MAP.get(suffix) + if converter is None: + raise ValueError( + f"No direct converter for extension '{suffix}' (file: {filename})" + ) + return converter(file_path) diff --git a/surfsense_backend/app/tasks/document_processors/_helpers.py b/surfsense_backend/app/tasks/document_processors/_helpers.py new file mode 100644 index 000000000..9cd7b87c9 --- /dev/null +++ b/surfsense_backend/app/tasks/document_processors/_helpers.py @@ -0,0 +1,193 @@ +""" +Document helper functions for deduplication, migration, and connector updates. + +Provides reusable logic shared across file processors and ETL strategies. +""" + +import logging + +from sqlalchemy.ext.asyncio import AsyncSession + +from app.db import Document, DocumentStatus, DocumentType +from app.utils.document_converters import generate_unique_identifier_hash + +from .base import ( + check_document_by_unique_identifier, + check_duplicate_document, +) + +# --------------------------------------------------------------------------- +# Unique identifier helpers +# --------------------------------------------------------------------------- + + +def get_google_drive_unique_identifier( + connector: dict | None, + filename: str, + search_space_id: int, +) -> tuple[str, str | None]: + """ + Get unique identifier hash, using file_id for Google Drive (stable across renames). + + Returns: + Tuple of (primary_hash, legacy_hash or None). + For Google Drive: (file_id-based hash, filename-based hash for migration). + For other sources: (filename-based hash, None). 
+ """ + if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE: + metadata = connector.get("metadata", {}) + file_id = metadata.get("google_drive_file_id") + + if file_id: + primary_hash = generate_unique_identifier_hash( + DocumentType.GOOGLE_DRIVE_FILE, file_id, search_space_id + ) + legacy_hash = generate_unique_identifier_hash( + DocumentType.GOOGLE_DRIVE_FILE, filename, search_space_id + ) + return primary_hash, legacy_hash + + primary_hash = generate_unique_identifier_hash( + DocumentType.FILE, filename, search_space_id + ) + return primary_hash, None + + +# --------------------------------------------------------------------------- +# Document deduplication and migration +# --------------------------------------------------------------------------- + + +async def handle_existing_document_update( + session: AsyncSession, + existing_document: Document, + content_hash: str, + connector: dict | None, + filename: str, + primary_hash: str, +) -> tuple[bool, Document | None]: + """ + Handle update logic for an existing document. 
+ + Returns: + Tuple of (should_skip_processing, document_to_return): + - (True, document): Content unchanged, return existing document + - (False, None): Content changed, needs re-processing + """ + if existing_document.unique_identifier_hash != primary_hash: + existing_document.unique_identifier_hash = primary_hash + logging.info(f"Migrated document to file_id-based identifier: {filename}") + + if existing_document.content_hash == content_hash: + if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE: + connector_metadata = connector.get("metadata", {}) + new_name = connector_metadata.get("google_drive_file_name") + doc_metadata = existing_document.document_metadata or {} + old_name = doc_metadata.get("FILE_NAME") or doc_metadata.get( + "google_drive_file_name" + ) + + if new_name and old_name and old_name != new_name: + from sqlalchemy.orm.attributes import flag_modified + + existing_document.title = new_name + if not existing_document.document_metadata: + existing_document.document_metadata = {} + existing_document.document_metadata["FILE_NAME"] = new_name + existing_document.document_metadata["google_drive_file_name"] = new_name + flag_modified(existing_document, "document_metadata") + await session.commit() + logging.info( + f"File renamed in Google Drive: '{old_name}' → '{new_name}' " + f"(no re-processing needed)" + ) + + logging.info(f"Document for file {filename} unchanged. Skipping.") + return True, existing_document + + # Content has changed — guard against content_hash collision before + # expensive ETL processing. + collision_doc = await check_duplicate_document(session, content_hash) + if collision_doc and collision_doc.id != existing_document.id: + logging.warning( + "Content-hash collision for %s: identical content exists in " + "document #%s (%s). 
Skipping re-processing.", + filename, + collision_doc.id, + collision_doc.document_type, + ) + if DocumentStatus.is_state( + existing_document.status, DocumentStatus.PENDING + ) or DocumentStatus.is_state( + existing_document.status, DocumentStatus.PROCESSING + ): + await session.delete(existing_document) + await session.commit() + return True, None + + return True, existing_document + + logging.info(f"Content changed for file {filename}. Updating document.") + return False, None + + +async def find_existing_document_with_migration( + session: AsyncSession, + primary_hash: str, + legacy_hash: str | None, + content_hash: str | None = None, +) -> Document | None: + """ + Find existing document, checking primary hash, legacy hash, and content_hash. + + Supports migration from filename-based to file_id-based hashing for + Google Drive files, with content_hash fallback for cross-source dedup. + """ + existing_document = await check_document_by_unique_identifier(session, primary_hash) + + if not existing_document and legacy_hash: + existing_document = await check_document_by_unique_identifier( + session, legacy_hash + ) + if existing_document: + logging.info( + "Found legacy document (filename-based hash), " + "will migrate to file_id-based hash" + ) + + if not existing_document and content_hash: + existing_document = await check_duplicate_document(session, content_hash) + if existing_document: + logging.info( + f"Found duplicate content from different source (content_hash match). 
" + f"Original document ID: {existing_document.id}, " + f"type: {existing_document.document_type}" + ) + + return existing_document + + +# --------------------------------------------------------------------------- +# Connector helpers +# --------------------------------------------------------------------------- + + +async def update_document_from_connector( + document: Document | None, + connector: dict | None, + session: AsyncSession, +) -> None: + """Update document type, metadata, and connector_id from connector info.""" + if not document or not connector: + return + if "type" in connector: + document.document_type = connector["type"] + if "metadata" in connector: + if not document.document_metadata: + document.document_metadata = connector["metadata"] + else: + merged = {**document.document_metadata, **connector["metadata"]} + document.document_metadata = merged + if "connector_id" in connector: + document.connector_id = connector["connector_id"] + await session.commit() diff --git a/surfsense_backend/app/tasks/document_processors/_save.py b/surfsense_backend/app/tasks/document_processors/_save.py new file mode 100644 index 000000000..ae45f7a69 --- /dev/null +++ b/surfsense_backend/app/tasks/document_processors/_save.py @@ -0,0 +1,204 @@ +""" +Unified document save/update logic for file processors. 
+""" + +import logging + +from sqlalchemy.exc import SQLAlchemyError +from sqlalchemy.ext.asyncio import AsyncSession + +from app.db import Document, DocumentStatus, DocumentType +from app.services.llm_service import get_user_long_context_llm +from app.utils.document_converters import ( + create_document_chunks, + embed_text, + generate_content_hash, + generate_document_summary, +) + +from ._helpers import ( + find_existing_document_with_migration, + get_google_drive_unique_identifier, + handle_existing_document_update, +) +from .base import get_current_timestamp, safe_set_chunks + +# --------------------------------------------------------------------------- +# Summary generation +# --------------------------------------------------------------------------- + + +async def _generate_summary( + markdown_content: str, + file_name: str, + etl_service: str, + user_llm, + enable_summary: bool, +) -> tuple[str, list[float]]: + """ + Generate a document summary and embedding. + + Docling uses its own large-document summary strategy; other ETL services + use the standard ``generate_document_summary`` helper. 
+ """ + if not enable_summary: + summary = f"File: {file_name}\n\n{markdown_content[:4000]}" + return summary, embed_text(summary) + + if etl_service == "DOCLING": + from app.services.docling_service import create_docling_service + + docling_service = create_docling_service() + summary_text = await docling_service.process_large_document_summary( + content=markdown_content, llm=user_llm, document_title=file_name + ) + + meta = { + "file_name": file_name, + "etl_service": etl_service, + "document_type": "File Document", + } + parts = ["# DOCUMENT METADATA"] + for key, value in meta.items(): + if value: + formatted_key = key.replace("_", " ").title() + parts.append(f"**{formatted_key}:** {value}") + + enhanced = "\n".join(parts) + "\n\n# DOCUMENT SUMMARY\n\n" + summary_text + return enhanced, embed_text(enhanced) + + # Standard summary (Unstructured / LlamaCloud / others) + meta = { + "file_name": file_name, + "etl_service": etl_service, + "document_type": "File Document", + } + return await generate_document_summary(markdown_content, user_llm, meta) + + +# --------------------------------------------------------------------------- +# Unified save function +# --------------------------------------------------------------------------- + + +async def save_file_document( + session: AsyncSession, + file_name: str, + markdown_content: str, + search_space_id: int, + user_id: str, + etl_service: str, + connector: dict | None = None, + enable_summary: bool = True, +) -> Document | None: + """ + Process and store a file document with deduplication and migration support. + + Handles both creating new documents and updating existing ones. This is + the single implementation behind the per-ETL-service wrapper functions. 
+ + Args: + session: Database session + file_name: Name of the processed file + markdown_content: Markdown content to store + search_space_id: ID of the search space + user_id: ID of the user + etl_service: Name of the ETL service (UNSTRUCTURED, LLAMACLOUD, DOCLING) + connector: Optional connector info for Google Drive files + enable_summary: Whether to generate an AI summary + + Returns: + Document object if successful, None if duplicate detected + """ + try: + primary_hash, legacy_hash = get_google_drive_unique_identifier( + connector, file_name, search_space_id + ) + content_hash = generate_content_hash(markdown_content, search_space_id) + + existing_document = await find_existing_document_with_migration( + session, primary_hash, legacy_hash, content_hash + ) + + if existing_document: + should_skip, doc = await handle_existing_document_update( + session, + existing_document, + content_hash, + connector, + file_name, + primary_hash, + ) + if should_skip: + return doc + + user_llm = await get_user_long_context_llm(session, user_id, search_space_id) + if not user_llm: + raise RuntimeError( + f"No long context LLM configured for user {user_id} " + f"in search space {search_space_id}" + ) + + summary_content, summary_embedding = await _generate_summary( + markdown_content, file_name, etl_service, user_llm, enable_summary + ) + chunks = await create_document_chunks(markdown_content) + doc_metadata = {"FILE_NAME": file_name, "ETL_SERVICE": etl_service} + + if existing_document: + existing_document.title = file_name + existing_document.content = summary_content + existing_document.content_hash = content_hash + existing_document.embedding = summary_embedding + existing_document.document_metadata = doc_metadata + await safe_set_chunks(session, existing_document, chunks) + existing_document.source_markdown = markdown_content + existing_document.content_needs_reindexing = False + existing_document.updated_at = get_current_timestamp() + existing_document.status = 
DocumentStatus.ready() + + await session.commit() + await session.refresh(existing_document) + return existing_document + + doc_type = DocumentType.FILE + if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE: + doc_type = DocumentType.GOOGLE_DRIVE_FILE + + document = Document( + search_space_id=search_space_id, + title=file_name, + document_type=doc_type, + document_metadata=doc_metadata, + content=summary_content, + embedding=summary_embedding, + chunks=chunks, + content_hash=content_hash, + unique_identifier_hash=primary_hash, + source_markdown=markdown_content, + content_needs_reindexing=False, + updated_at=get_current_timestamp(), + created_by_id=user_id, + connector_id=connector.get("connector_id") if connector else None, + status=DocumentStatus.ready(), + ) + session.add(document) + await session.commit() + await session.refresh(document) + return document + + except SQLAlchemyError as db_error: + await session.rollback() + if "ix_documents_content_hash" in str(db_error): + logging.warning( + "content_hash collision during commit for %s (%s). Skipping.", + file_name, + etl_service, + ) + return None + raise db_error + except Exception as e: + await session.rollback() + raise RuntimeError( + f"Failed to process file document using {etl_service}: {e!s}" + ) from e diff --git a/surfsense_backend/app/tasks/document_processors/file_processors.py b/surfsense_backend/app/tasks/document_processors/file_processors.py index 6c0ae1870..c765dbd87 100644 --- a/surfsense_backend/app/tasks/document_processors/file_processors.py +++ b/surfsense_backend/app/tasks/document_processors/file_processors.py @@ -1,905 +1,271 @@ """ -File document processors for different ETL services (Unstructured, LlamaCloud, Docling). +File document processors orchestrating content extraction and indexing. + +Delegates content extraction to ``app.etl_pipeline.EtlPipelineService`` and +keeps only orchestration concerns (notifications, logging, page limits, saving). 
""" -import asyncio +from __future__ import annotations + import contextlib import logging -import ssl -import warnings -from logging import ERROR, getLogger +import os +from dataclasses import dataclass, field -import httpx from fastapi import HTTPException -from langchain_core.documents import Document as LangChainDocument -from litellm import atranscription -from sqlalchemy.exc import SQLAlchemyError from sqlalchemy.ext.asyncio import AsyncSession -from app.config import config as app_config -from app.db import Document, DocumentStatus, DocumentType, Log, Notification -from app.indexing_pipeline.adapters.file_upload_adapter import UploadDocumentAdapter -from app.services.llm_service import get_user_long_context_llm +from app.db import Document, Log, Notification from app.services.notification_service import NotificationService from app.services.task_logging_service import TaskLoggingService -from app.utils.document_converters import ( - convert_document_to_markdown, - create_document_chunks, - embed_text, - generate_content_hash, - generate_document_summary, - generate_unique_identifier_hash, -) -from .base import ( - check_document_by_unique_identifier, - check_duplicate_document, - get_current_timestamp, - safe_set_chunks, -) +from ._helpers import update_document_from_connector +from ._save import save_file_document from .markdown_processor import add_received_markdown_file_document -# Constants for LlamaCloud retry configuration -LLAMACLOUD_MAX_RETRIES = 5 # Increased from 3 for large file resilience -LLAMACLOUD_BASE_DELAY = 10 # Base delay in seconds for exponential backoff -LLAMACLOUD_MAX_DELAY = 120 # Maximum delay between retries (2 minutes) -LLAMACLOUD_RETRYABLE_EXCEPTIONS = ( - ssl.SSLError, - httpx.ConnectError, - httpx.ConnectTimeout, - httpx.ReadTimeout, - httpx.WriteTimeout, - httpx.RemoteProtocolError, - httpx.LocalProtocolError, - ConnectionError, - ConnectionResetError, - TimeoutError, - OSError, # Catches various network-level errors -) - -# 
Timeout calculation constants -UPLOAD_BYTES_PER_SECOND_SLOW = ( - 100 * 1024 -) # 100 KB/s (conservative for slow connections) -MIN_UPLOAD_TIMEOUT = 120 # Minimum 2 minutes for any file -MAX_UPLOAD_TIMEOUT = 1800 # Maximum 30 minutes for very large files -BASE_JOB_TIMEOUT = 600 # 10 minutes base for job processing -PER_PAGE_JOB_TIMEOUT = 60 # 1 minute per page for processing +__all__ = [ + "process_file_in_background", + "process_file_in_background_with_document", + "save_file_document", +] -def get_google_drive_unique_identifier( - connector: dict | None, - filename: str, - search_space_id: int, -) -> tuple[str, str | None]: - """ - Get unique identifier hash for a file, with special handling for Google Drive. - - For Google Drive files, uses file_id as the unique identifier (doesn't change on rename). - For other files, uses filename. - - Args: - connector: Optional connector info dict with type and metadata - filename: The filename (used for non-Google Drive files or as fallback) - search_space_id: The search space ID - - Returns: - Tuple of (primary_hash, legacy_hash or None) - - For Google Drive: (file_id_based_hash, filename_based_hash for migration) - - For other sources: (filename_based_hash, None) - """ - if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE: - metadata = connector.get("metadata", {}) - file_id = metadata.get("google_drive_file_id") - - if file_id: - # New method: use file_id as unique identifier (doesn't change on rename) - primary_hash = generate_unique_identifier_hash( - DocumentType.GOOGLE_DRIVE_FILE, file_id, search_space_id - ) - # Legacy method: for backward compatibility with existing documents - # that were indexed with filename-based hash - legacy_hash = generate_unique_identifier_hash( - DocumentType.GOOGLE_DRIVE_FILE, filename, search_space_id - ) - return primary_hash, legacy_hash - - # For non-Google Drive files, use filename as before - primary_hash = generate_unique_identifier_hash( - DocumentType.FILE, 
filename, search_space_id - ) - return primary_hash, None +# --------------------------------------------------------------------------- +# Processing context (bundles parameters shared across handler functions) +# --------------------------------------------------------------------------- -async def handle_existing_document_update( - session: AsyncSession, - existing_document: Document, - content_hash: str, - connector: dict | None, - filename: str, - primary_hash: str, -) -> tuple[bool, Document | None]: - """ - Handle update logic for an existing document. +@dataclass +class _ProcessingContext: + session: AsyncSession + file_path: str + filename: str + search_space_id: int + user_id: str + task_logger: TaskLoggingService + log_entry: Log + connector: dict | None = None + notification: Notification | None = None + enable_summary: bool = field(init=False) - Args: - session: Database session - existing_document: The existing document found in database - content_hash: Hash of the new content - connector: Optional connector info - filename: Current filename - primary_hash: The primary hash (file_id based for Google Drive) - - Returns: - Tuple of (should_skip_processing, document_to_return) - - (True, document): Content unchanged, just return existing document - - (False, None): Content changed, need to re-process - """ - # Check if this document needs hash migration (found via legacy hash) - if existing_document.unique_identifier_hash != primary_hash: - existing_document.unique_identifier_hash = primary_hash - logging.info(f"Migrated document to file_id-based identifier: {filename}") - - # Check if content has changed - if existing_document.content_hash == content_hash: - # Content unchanged - check if we need to update metadata (e.g., filename changed) - if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE: - connector_metadata = connector.get("metadata", {}) - new_name = connector_metadata.get("google_drive_file_name") - # Check both possible 
keys for old name (FILE_NAME is used in stored documents) - doc_metadata = existing_document.document_metadata or {} - old_name = doc_metadata.get("FILE_NAME") or doc_metadata.get( - "google_drive_file_name" - ) - - if new_name and old_name and old_name != new_name: - # File was renamed - update title and metadata, skip expensive processing - from sqlalchemy.orm.attributes import flag_modified - - existing_document.title = new_name - if not existing_document.document_metadata: - existing_document.document_metadata = {} - existing_document.document_metadata["FILE_NAME"] = new_name - existing_document.document_metadata["google_drive_file_name"] = new_name - flag_modified(existing_document, "document_metadata") - await session.commit() - logging.info( - f"File renamed in Google Drive: '{old_name}' → '{new_name}' (no re-processing needed)" - ) - - logging.info(f"Document for file {filename} unchanged. Skipping.") - return True, existing_document - else: - # Content has changed — guard against content_hash collision before - # expensive ETL processing. A collision means the exact same content - # already lives in a *different* document (e.g. a manual upload of the - # same file). Proceeding would trigger a unique-constraint violation - # on ix_documents_content_hash. - collision_doc = await check_duplicate_document(session, content_hash) - if collision_doc and collision_doc.id != existing_document.id: - logging.warning( - "Content-hash collision for %s: identical content exists in " - "document #%s (%s). Skipping re-processing.", - filename, - collision_doc.id, - collision_doc.document_type, - ) - if DocumentStatus.is_state( - existing_document.status, DocumentStatus.PENDING - ) or DocumentStatus.is_state( - existing_document.status, DocumentStatus.PROCESSING - ): - # Pending/processing doc has no real content yet — remove it - # so the UI doesn't show a contentless entry. 
- await session.delete(existing_document) - await session.commit() - return True, None - - # Document already has valid content — keep it as-is. - return True, existing_document - - logging.info(f"Content changed for file {filename}. Updating document.") - return False, None - - -async def find_existing_document_with_migration( - session: AsyncSession, - primary_hash: str, - legacy_hash: str | None, - content_hash: str | None = None, -) -> Document | None: - """ - Find existing document, checking both new hash and legacy hash for migration, - with fallback to content_hash for cross-source deduplication. - - Args: - session: Database session - primary_hash: The primary hash (file_id based for Google Drive) - legacy_hash: The legacy hash (filename based) for migration, or None - content_hash: The content hash for fallback deduplication, or None - - Returns: - Existing document if found, None otherwise - """ - # First check with primary hash (new method) - existing_document = await check_document_by_unique_identifier(session, primary_hash) - - # If not found and we have a legacy hash, check with that (migration path) - if not existing_document and legacy_hash: - existing_document = await check_document_by_unique_identifier( - session, legacy_hash - ) - if existing_document: - logging.info( - "Found legacy document (filename-based hash), will migrate to file_id-based hash" - ) - - # Fallback: check by content_hash to catch duplicates from different sources - # This prevents unique constraint violations when the same content exists - # under a different unique_identifier (e.g., manual upload vs Google Drive) - if not existing_document and content_hash: - existing_document = await check_duplicate_document(session, content_hash) - if existing_document: - logging.info( - f"Found duplicate content from different source (content_hash match). 
" - f"Original document ID: {existing_document.id}, type: {existing_document.document_type}" - ) - - return existing_document - - -def calculate_upload_timeout(file_size_bytes: int) -> float: - """ - Calculate appropriate upload timeout based on file size. - - Assumes a conservative slow connection speed to handle worst-case scenarios. - - Args: - file_size_bytes: Size of the file in bytes - - Returns: - Timeout in seconds - """ - # Calculate time needed at slow connection speed - # Add 50% buffer for network variability and SSL overhead - estimated_time = (file_size_bytes / UPLOAD_BYTES_PER_SECOND_SLOW) * 1.5 - - # Clamp to reasonable bounds - return max(MIN_UPLOAD_TIMEOUT, min(estimated_time, MAX_UPLOAD_TIMEOUT)) - - -def calculate_job_timeout(estimated_pages: int, file_size_bytes: int) -> float: - """ - Calculate job processing timeout based on page count and file size. - - Args: - estimated_pages: Estimated number of pages - file_size_bytes: Size of the file in bytes - - Returns: - Timeout in seconds - """ - # Base timeout + time per page - page_based_timeout = BASE_JOB_TIMEOUT + (estimated_pages * PER_PAGE_JOB_TIMEOUT) - - # Also consider file size (large images take longer to process) - # ~1 minute per 10MB of file size - size_based_timeout = BASE_JOB_TIMEOUT + (file_size_bytes / (10 * 1024 * 1024)) * 60 - - # Use the larger of the two estimates - return max(page_based_timeout, size_based_timeout) - - -async def parse_with_llamacloud_retry( - file_path: str, - estimated_pages: int, - task_logger: TaskLoggingService | None = None, - log_entry: Log | None = None, -): - """ - Parse a file with LlamaCloud with retry logic for transient SSL/connection errors. - - Uses dynamic timeout calculations based on file size and page count to handle - very large files reliably. 
- - Args: - file_path: Path to the file to parse - estimated_pages: Estimated number of pages for timeout calculation - task_logger: Optional task logger for progress updates - log_entry: Optional log entry for progress updates - - Returns: - LlamaParse result object - - Raises: - Exception: If all retries fail - """ - import os - import random - - from llama_cloud_services import LlamaParse - from llama_cloud_services.parse.utils import ResultType - - # Get file size for timeout calculations - file_size_bytes = os.path.getsize(file_path) - file_size_mb = file_size_bytes / (1024 * 1024) - - # Calculate dynamic timeouts based on file size and page count - upload_timeout = calculate_upload_timeout(file_size_bytes) - job_timeout = calculate_job_timeout(estimated_pages, file_size_bytes) - - # HTTP client timeouts - scaled based on file size - # Write timeout is critical for large file uploads - custom_timeout = httpx.Timeout( - connect=120.0, # 2 minutes to establish connection (handles slow DNS, etc.) 
- read=upload_timeout, # Dynamic based on file size - write=upload_timeout, # Dynamic based on file size (upload time) - pool=120.0, # 2 minutes to acquire connection from pool - ) - - logging.info( - f"LlamaCloud upload configured: file_size={file_size_mb:.1f}MB, " - f"pages={estimated_pages}, upload_timeout={upload_timeout:.0f}s, " - f"job_timeout={job_timeout:.0f}s" - ) - - last_exception = None - attempt_errors = [] - - for attempt in range(1, LLAMACLOUD_MAX_RETRIES + 1): - try: - # Create a fresh httpx client for each attempt - async with httpx.AsyncClient(timeout=custom_timeout) as custom_client: - # Create LlamaParse parser instance with optimized settings - parser = LlamaParse( - api_key=app_config.LLAMA_CLOUD_API_KEY, - num_workers=1, # Use single worker for file processing - verbose=True, - language="en", - result_type=ResultType.MD, - # Timeout settings for large files - max_timeout=int(max(2000, job_timeout + upload_timeout)), - job_timeout_in_seconds=job_timeout, - job_timeout_extra_time_per_page_in_seconds=PER_PAGE_JOB_TIMEOUT, - # Use our custom client with larger timeouts - custom_client=custom_client, - ) - - # Parse the file asynchronously - result = await parser.aparse(file_path) - - # Success - log if we had previous failures - if attempt > 1: - logging.info( - f"LlamaCloud upload succeeded on attempt {attempt} after " - f"{len(attempt_errors)} failures" - ) - - return result - - except LLAMACLOUD_RETRYABLE_EXCEPTIONS as e: - last_exception = e - error_type = type(e).__name__ - error_msg = str(e)[:200] - attempt_errors.append(f"Attempt {attempt}: {error_type} - {error_msg}") - - if attempt < LLAMACLOUD_MAX_RETRIES: - # Calculate exponential backoff with jitter - # Base delay doubles each attempt, capped at max delay - base_delay = min( - LLAMACLOUD_BASE_DELAY * (2 ** (attempt - 1)), LLAMACLOUD_MAX_DELAY - ) - # Add random jitter (±25%) to prevent thundering herd - jitter = base_delay * 0.25 * (2 * random.random() - 1) - delay = base_delay + 
jitter - - if task_logger and log_entry: - await task_logger.log_task_progress( - log_entry, - f"LlamaCloud upload failed (attempt {attempt}/{LLAMACLOUD_MAX_RETRIES}), retrying in {delay:.0f}s", - { - "error_type": error_type, - "error_message": error_msg, - "attempt": attempt, - "retry_delay": delay, - "file_size_mb": round(file_size_mb, 1), - "upload_timeout": upload_timeout, - }, - ) - else: - logging.warning( - f"LlamaCloud upload failed (attempt {attempt}/{LLAMACLOUD_MAX_RETRIES}): " - f"{error_type}. File: {file_size_mb:.1f}MB. Retrying in {delay:.0f}s..." - ) - - await asyncio.sleep(delay) - else: - logging.error( - f"LlamaCloud upload failed after {LLAMACLOUD_MAX_RETRIES} attempts. " - f"File size: {file_size_mb:.1f}MB, Pages: {estimated_pages}. " - f"Errors: {'; '.join(attempt_errors)}" - ) - - except Exception: - # Non-retryable exception, raise immediately - raise - - # All retries exhausted - raise last_exception or RuntimeError( - f"LlamaCloud parsing failed after {LLAMACLOUD_MAX_RETRIES} retries. " - f"File size: {file_size_mb:.1f}MB" - ) - - -async def add_received_file_document_using_unstructured( - session: AsyncSession, - file_name: str, - unstructured_processed_elements: list[LangChainDocument], - search_space_id: int, - user_id: str, - connector: dict | None = None, - enable_summary: bool = True, -) -> Document | None: - """ - Process and store a file document using Unstructured service. 
- - Args: - session: Database session - file_name: Name of the processed file - unstructured_processed_elements: Processed elements from Unstructured - search_space_id: ID of the search space - user_id: ID of the user - connector: Optional connector info for Google Drive files - - Returns: - Document object if successful, None if failed - """ - try: - file_in_markdown = await convert_document_to_markdown( - unstructured_processed_elements + def __post_init__(self) -> None: + self.enable_summary = ( + self.connector.get("enable_summary", True) if self.connector else True ) - # Generate unique identifier hash (uses file_id for Google Drive, filename for others) - primary_hash, legacy_hash = get_google_drive_unique_identifier( - connector, file_name, search_space_id - ) - # Generate content hash - content_hash = generate_content_hash(file_in_markdown, search_space_id) - - # Check if document exists (with migration support for Google Drive and content_hash fallback) - existing_document = await find_existing_document_with_migration( - session, primary_hash, legacy_hash, content_hash - ) - - if existing_document: - # Handle existing document (rename detection, content change check) - should_skip, doc = await handle_existing_document_update( - session, - existing_document, - content_hash, - connector, - file_name, - primary_hash, - ) - if should_skip: - return doc - # Content changed - continue to update - - # Get user's long context LLM (needed for both create and update) - user_llm = await get_user_long_context_llm(session, user_id, search_space_id) - if not user_llm: - raise RuntimeError( - f"No long context LLM configured for user {user_id} in search space {search_space_id}" - ) - - # Generate summary with metadata - document_metadata = { - "file_name": file_name, - "etl_service": "UNSTRUCTURED", - "document_type": "File Document", - } - if enable_summary: - summary_content, summary_embedding = await generate_document_summary( - file_in_markdown, user_llm, 
document_metadata - ) - else: - summary_content = f"File: {file_name}\n\n{file_in_markdown[:4000]}" - summary_embedding = embed_text(summary_content) - - # Process chunks - chunks = await create_document_chunks(file_in_markdown) - - # Update or create document - if existing_document: - # Update existing document - existing_document.title = file_name - existing_document.content = summary_content - existing_document.content_hash = content_hash - existing_document.embedding = summary_embedding - existing_document.document_metadata = { - "FILE_NAME": file_name, - "ETL_SERVICE": "UNSTRUCTURED", - } - await safe_set_chunks(session, existing_document, chunks) - existing_document.source_markdown = file_in_markdown - existing_document.content_needs_reindexing = False - existing_document.updated_at = get_current_timestamp() - existing_document.status = DocumentStatus.ready() - - await session.commit() - await session.refresh(existing_document) - document = existing_document - else: - # Create new document - doc_type = DocumentType.FILE - if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE: - doc_type = DocumentType.GOOGLE_DRIVE_FILE - - document = Document( - search_space_id=search_space_id, - title=file_name, - document_type=doc_type, - document_metadata={ - "FILE_NAME": file_name, - "ETL_SERVICE": "UNSTRUCTURED", - }, - content=summary_content, - embedding=summary_embedding, - chunks=chunks, - content_hash=content_hash, - unique_identifier_hash=primary_hash, - source_markdown=file_in_markdown, - content_needs_reindexing=False, - updated_at=get_current_timestamp(), - created_by_id=user_id, - connector_id=connector.get("connector_id") if connector else None, - status=DocumentStatus.ready(), - ) - - session.add(document) - await session.commit() - await session.refresh(document) - - return document - except SQLAlchemyError as db_error: - await session.rollback() - if "ix_documents_content_hash" in str(db_error): - logging.warning( - "content_hash 
collision during commit for %s (Unstructured). Skipping.", - file_name, - ) - return None - raise db_error - except Exception as e: - await session.rollback() - raise RuntimeError(f"Failed to process file document: {e!s}") from e +# --------------------------------------------------------------------------- +# Notification helper +# --------------------------------------------------------------------------- -async def add_received_file_document_using_llamacloud( - session: AsyncSession, - file_name: str, - llamacloud_markdown_document: str, - search_space_id: int, - user_id: str, - connector: dict | None = None, - enable_summary: bool = True, -) -> Document | None: - """ - Process and store document content parsed by LlamaCloud. - - Args: - session: Database session - file_name: Name of the processed file - llamacloud_markdown_document: Markdown content from LlamaCloud parsing - search_space_id: ID of the search space - user_id: ID of the user - connector: Optional connector info for Google Drive files - - Returns: - Document object if successful, None if failed - """ - try: - # Combine all markdown documents into one - file_in_markdown = llamacloud_markdown_document - - # Generate unique identifier hash (uses file_id for Google Drive, filename for others) - primary_hash, legacy_hash = get_google_drive_unique_identifier( - connector, file_name, search_space_id - ) - - # Generate content hash - content_hash = generate_content_hash(file_in_markdown, search_space_id) - - # Check if document exists (with migration support for Google Drive and content_hash fallback) - existing_document = await find_existing_document_with_migration( - session, primary_hash, legacy_hash, content_hash - ) - - if existing_document: - # Handle existing document (rename detection, content change check) - should_skip, doc = await handle_existing_document_update( - session, - existing_document, - content_hash, - connector, - file_name, - primary_hash, - ) - if should_skip: - return doc - # 
Content changed - continue to update - - # Get user's long context LLM (needed for both create and update) - user_llm = await get_user_long_context_llm(session, user_id, search_space_id) - if not user_llm: - raise RuntimeError( - f"No long context LLM configured for user {user_id} in search space {search_space_id}" - ) - - # Generate summary with metadata - document_metadata = { - "file_name": file_name, - "etl_service": "LLAMACLOUD", - "document_type": "File Document", - } - if enable_summary: - summary_content, summary_embedding = await generate_document_summary( - file_in_markdown, user_llm, document_metadata - ) - else: - summary_content = f"File: {file_name}\n\n{file_in_markdown[:4000]}" - summary_embedding = embed_text(summary_content) - - # Process chunks - chunks = await create_document_chunks(file_in_markdown) - - # Update or create document - if existing_document: - existing_document.title = file_name - existing_document.content = summary_content - existing_document.content_hash = content_hash - existing_document.embedding = summary_embedding - existing_document.document_metadata = { - "FILE_NAME": file_name, - "ETL_SERVICE": "LLAMACLOUD", - } - await safe_set_chunks(session, existing_document, chunks) - existing_document.source_markdown = file_in_markdown - existing_document.content_needs_reindexing = False - existing_document.updated_at = get_current_timestamp() - existing_document.status = DocumentStatus.ready() - - await session.commit() - await session.refresh(existing_document) - document = existing_document - else: - doc_type = DocumentType.FILE - if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE: - doc_type = DocumentType.GOOGLE_DRIVE_FILE - - document = Document( - search_space_id=search_space_id, - title=file_name, - document_type=doc_type, - document_metadata={ - "FILE_NAME": file_name, - "ETL_SERVICE": "LLAMACLOUD", - }, - content=summary_content, - embedding=summary_embedding, - chunks=chunks, - 
content_hash=content_hash, - unique_identifier_hash=primary_hash, - source_markdown=file_in_markdown, - content_needs_reindexing=False, - updated_at=get_current_timestamp(), - created_by_id=user_id, - connector_id=connector.get("connector_id") if connector else None, - status=DocumentStatus.ready(), - ) - - session.add(document) - await session.commit() - await session.refresh(document) - - return document - except SQLAlchemyError as db_error: - await session.rollback() - if "ix_documents_content_hash" in str(db_error): - logging.warning( - "content_hash collision during commit for %s (LlamaCloud). Skipping.", - file_name, - ) - return None - raise db_error - except Exception as e: - await session.rollback() - raise RuntimeError( - f"Failed to process file document using LlamaCloud: {e!s}" - ) from e - - -async def add_received_file_document_using_docling( - session: AsyncSession, - file_name: str, - docling_markdown_document: str, - search_space_id: int, - user_id: str, - connector: dict | None = None, - enable_summary: bool = True, -) -> Document | None: - """ - Process and store document content parsed by Docling. 
- - Args: - session: Database session - file_name: Name of the processed file - docling_markdown_document: Markdown content from Docling parsing - search_space_id: ID of the search space - user_id: ID of the user - connector: Optional connector info for Google Drive files - - Returns: - Document object if successful, None if failed - """ - try: - file_in_markdown = docling_markdown_document - - # Generate unique identifier hash (uses file_id for Google Drive, filename for others) - primary_hash, legacy_hash = get_google_drive_unique_identifier( - connector, file_name, search_space_id - ) - - # Generate content hash - content_hash = generate_content_hash(file_in_markdown, search_space_id) - - # Check if document exists (with migration support for Google Drive and content_hash fallback) - existing_document = await find_existing_document_with_migration( - session, primary_hash, legacy_hash, content_hash - ) - - if existing_document: - # Handle existing document (rename detection, content change check) - should_skip, doc = await handle_existing_document_update( - session, - existing_document, - content_hash, - connector, - file_name, - primary_hash, - ) - if should_skip: - return doc - # Content changed - continue to update - - # Get user's long context LLM (needed for both create and update) - user_llm = await get_user_long_context_llm(session, user_id, search_space_id) - if not user_llm: - raise RuntimeError( - f"No long context LLM configured for user {user_id} in search_space {search_space_id}" - ) - - if enable_summary: - from app.services.docling_service import create_docling_service - - docling_service = create_docling_service() - - summary_content = await docling_service.process_large_document_summary( - content=file_in_markdown, llm=user_llm, document_title=file_name - ) - - document_metadata = { - "file_name": file_name, - "etl_service": "DOCLING", - "document_type": "File Document", - } - metadata_parts = ["# DOCUMENT METADATA"] - for key, value in 
document_metadata.items(): - if value: - formatted_key = key.replace("_", " ").title() - metadata_parts.append(f"**{formatted_key}:** {value}") - - metadata_section = "\n".join(metadata_parts) - enhanced_summary_content = ( - f"{metadata_section}\n\n# DOCUMENT SUMMARY\n\n{summary_content}" - ) - else: - enhanced_summary_content = f"File: {file_name}\n\n{file_in_markdown[:4000]}" - - summary_embedding = embed_text(enhanced_summary_content) - - # Process chunks - chunks = await create_document_chunks(file_in_markdown) - - # Update or create document - if existing_document: - # Update existing document - existing_document.title = file_name - existing_document.content = enhanced_summary_content - existing_document.content_hash = content_hash - existing_document.embedding = summary_embedding - existing_document.document_metadata = { - "FILE_NAME": file_name, - "ETL_SERVICE": "DOCLING", - } - await safe_set_chunks(session, existing_document, chunks) - existing_document.source_markdown = file_in_markdown - existing_document.content_needs_reindexing = False - existing_document.updated_at = get_current_timestamp() - existing_document.status = DocumentStatus.ready() # Mark as ready - - await session.commit() - await session.refresh(existing_document) - document = existing_document - else: - # Create new document - # Determine document type based on connector - doc_type = DocumentType.FILE - if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE: - doc_type = DocumentType.GOOGLE_DRIVE_FILE - - document = Document( - search_space_id=search_space_id, - title=file_name, - document_type=doc_type, - document_metadata={ - "FILE_NAME": file_name, - "ETL_SERVICE": "DOCLING", - }, - content=enhanced_summary_content, - embedding=summary_embedding, - chunks=chunks, - content_hash=content_hash, - unique_identifier_hash=primary_hash, - source_markdown=file_in_markdown, - content_needs_reindexing=False, - updated_at=get_current_timestamp(), - created_by_id=user_id, - 
connector_id=connector.get("connector_id") if connector else None, - status=DocumentStatus.ready(), # Mark as ready - ) - - session.add(document) - await session.commit() - await session.refresh(document) - - return document - except SQLAlchemyError as db_error: - await session.rollback() - if "ix_documents_content_hash" in str(db_error): - logging.warning( - "content_hash collision during commit for %s (Docling). Skipping.", - file_name, - ) - return None - raise db_error - except Exception as e: - await session.rollback() - raise RuntimeError( - f"Failed to process file document using Docling: {e!s}" - ) from e - - -async def _update_document_from_connector( - document: Document | None, connector: dict | None, session: AsyncSession +async def _notify( + ctx: _ProcessingContext, + stage: str, + stage_message: str | None = None, + **kwargs, ) -> None: - """Helper to update document type, metadata, and connector_id from connector info.""" - if document and connector: - if "type" in connector: - document.document_type = connector["type"] - if "metadata" in connector: - # Merge with existing document_metadata (the actual column name) - if not document.document_metadata: - document.document_metadata = connector["metadata"] - else: - # Expand existing metadata with connector metadata - merged = {**document.document_metadata, **connector["metadata"]} - document.document_metadata = merged - # Set connector_id if provided for de-indexing support - if "connector_id" in connector: - document.connector_id = connector["connector_id"] - await session.commit() + """Send a processing-progress notification if one is attached.""" + if not ctx.notification: + return + await NotificationService.document_processing.notify_processing_progress( + ctx.session, + ctx.notification, + stage=stage, + stage_message=stage_message, + **kwargs, + ) + + +# --------------------------------------------------------------------------- +# Page-limit helpers +# 
--------------------------------------------------------------------------- + + +def _estimate_pages_safe(page_limit_service, file_path: str) -> int: + """Estimate page count with a file-size fallback.""" + try: + return page_limit_service.estimate_pages_before_processing(file_path) + except Exception: + file_size = os.path.getsize(file_path) + return max(1, file_size // (80 * 1024)) + + +async def _log_page_divergence( + task_logger: TaskLoggingService, + log_entry: Log, + filename: str, + estimated: int, + actual: int, + final: int, +) -> None: + """Log a warning when the actual page count far exceeds the pre-estimate.""" + if actual > estimated * 1.5: + await task_logger.log_task_progress( + log_entry, + f"Actual page count higher than estimate: {filename}", + { + "estimated_before": estimated, + "actual_pages": actual, + "using_count": final, + }, + ) + + +# =================================================================== +# Handlers for process_file_in_background (legacy / connector path) +# =================================================================== + + +async def _process_non_document_upload(ctx: _ProcessingContext) -> Document | None: + """Extract content from a non-document file (plaintext/direct_convert/audio) via the unified ETL pipeline.""" + from app.etl_pipeline.etl_document import EtlRequest + from app.etl_pipeline.etl_pipeline_service import EtlPipelineService + + await _notify(ctx, "parsing", "Processing file") + await ctx.task_logger.log_task_progress( + ctx.log_entry, + f"Processing file: {ctx.filename}", + {"processing_stage": "extracting"}, + ) + + etl_result = await EtlPipelineService().extract( + EtlRequest(file_path=ctx.file_path, filename=ctx.filename) + ) + + with contextlib.suppress(Exception): + os.unlink(ctx.file_path) + + await _notify(ctx, "chunking") + + result = await add_received_markdown_file_document( + ctx.session, + ctx.filename, + etl_result.markdown_content, + ctx.search_space_id, + ctx.user_id, + ctx.connector, + 
) + if ctx.connector: + await update_document_from_connector(result, ctx.connector, ctx.session) + + if result: + await ctx.task_logger.log_task_success( + ctx.log_entry, + f"Successfully processed file: {ctx.filename}", + { + "document_id": result.id, + "content_hash": result.content_hash, + "file_type": etl_result.content_type, + "etl_service": etl_result.etl_service, + }, + ) + else: + await ctx.task_logger.log_task_success( + ctx.log_entry, + f"File already exists (duplicate): {ctx.filename}", + {"duplicate_detected": True, "file_type": etl_result.content_type}, + ) + return result + + +# --------------------------------------------------------------------------- +# Document file processing (ETL service dispatch) +# --------------------------------------------------------------------------- + + +async def _process_document_upload(ctx: _ProcessingContext) -> Document | None: + """Route a document file to the configured ETL service via the unified pipeline.""" + from app.etl_pipeline.etl_document import EtlRequest + from app.etl_pipeline.etl_pipeline_service import EtlPipelineService + from app.services.page_limit_service import PageLimitExceededError, PageLimitService + + page_limit_service = PageLimitService(ctx.session) + estimated_pages = _estimate_pages_safe(page_limit_service, ctx.file_path) + + await ctx.task_logger.log_task_progress( + ctx.log_entry, + f"Estimated {estimated_pages} pages for file: {ctx.filename}", + {"estimated_pages": estimated_pages, "file_type": "document"}, + ) + + try: + await page_limit_service.check_page_limit(ctx.user_id, estimated_pages) + except PageLimitExceededError as e: + await ctx.task_logger.log_task_failure( + ctx.log_entry, + f"Page limit exceeded before processing: {ctx.filename}", + str(e), + { + "error_type": "PageLimitExceeded", + "pages_used": e.pages_used, + "pages_limit": e.pages_limit, + "estimated_pages": estimated_pages, + }, + ) + with contextlib.suppress(Exception): + os.unlink(ctx.file_path) + raise 
HTTPException(status_code=403, detail=str(e)) from e + + await _notify(ctx, "parsing", "Extracting content") + + etl_result = await EtlPipelineService().extract( + EtlRequest( + file_path=ctx.file_path, + filename=ctx.filename, + estimated_pages=estimated_pages, + ) + ) + + with contextlib.suppress(Exception): + os.unlink(ctx.file_path) + + await _notify(ctx, "chunking") + + result = await save_file_document( + ctx.session, + ctx.filename, + etl_result.markdown_content, + ctx.search_space_id, + ctx.user_id, + etl_result.etl_service, + ctx.connector, + enable_summary=ctx.enable_summary, + ) + + if result: + await page_limit_service.update_page_usage( + ctx.user_id, estimated_pages, allow_exceed=True + ) + if ctx.connector: + await update_document_from_connector(result, ctx.connector, ctx.session) + await ctx.task_logger.log_task_success( + ctx.log_entry, + f"Successfully processed file: {ctx.filename}", + { + "document_id": result.id, + "content_hash": result.content_hash, + "file_type": "document", + "etl_service": etl_result.etl_service, + "pages_processed": estimated_pages, + }, + ) + else: + await ctx.task_logger.log_task_success( + ctx.log_entry, + f"Document already exists (duplicate): {ctx.filename}", + { + "duplicate_detected": True, + "file_type": "document", + "etl_service": etl_result.etl_service, + }, + ) + return result + + +# =================================================================== +# Public orchestrators +# =================================================================== async def process_file_in_background( @@ -910,726 +276,36 @@ async def process_file_in_background( session: AsyncSession, task_logger: TaskLoggingService, log_entry: Log, - connector: dict - | None = None, # Optional: {"type": "GOOGLE_DRIVE_FILE", "metadata": {...}} - notification: Notification - | None = None, # Optional notification for progress updates + connector: dict | None = None, + notification: Notification | None = None, ) -> Document | None: + ctx = 
_ProcessingContext( + session=session, + file_path=file_path, + filename=filename, + search_space_id=search_space_id, + user_id=user_id, + task_logger=task_logger, + log_entry=log_entry, + connector=connector, + notification=notification, + ) + try: - # Check if the file is a markdown or text file - if filename.lower().endswith((".md", ".markdown", ".txt")): - # Update notification: parsing stage - if notification: - await ( - NotificationService.document_processing.notify_processing_progress( - session, - notification, - stage="parsing", - stage_message="Reading file", - ) - ) + from app.etl_pipeline.file_classifier import ( + FileCategory as EtlFileCategory, + classify_file as etl_classify, + ) - await task_logger.log_task_progress( - log_entry, - f"Processing markdown/text file: {filename}", - {"file_type": "markdown", "processing_stage": "reading_file"}, - ) + category = etl_classify(filename) - # For markdown files, read the content directly - with open(file_path, encoding="utf-8") as f: - markdown_content = f.read() + if category == EtlFileCategory.DOCUMENT: + return await _process_document_upload(ctx) + return await _process_non_document_upload(ctx) - # Clean up the temp file - import os - - try: - os.unlink(file_path) - except Exception as e: - print("Error deleting temp file", e) - pass - - # Update notification: chunking stage - if notification: - await ( - NotificationService.document_processing.notify_processing_progress( - session, notification, stage="chunking" - ) - ) - - await task_logger.log_task_progress( - log_entry, - f"Creating document from markdown content: {filename}", - { - "processing_stage": "creating_document", - "content_length": len(markdown_content), - }, - ) - - # Process markdown directly through specialized function - result = await add_received_markdown_file_document( - session, filename, markdown_content, search_space_id, user_id, connector - ) - - if connector: - await _update_document_from_connector(result, connector, session) 
- - if result: - await task_logger.log_task_success( - log_entry, - f"Successfully processed markdown file: {filename}", - { - "document_id": result.id, - "content_hash": result.content_hash, - "file_type": "markdown", - }, - ) - return result - else: - await task_logger.log_task_success( - log_entry, - f"Markdown file already exists (duplicate): {filename}", - {"duplicate_detected": True, "file_type": "markdown"}, - ) - return None - - # Check if the file is an audio file - elif filename.lower().endswith( - (".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm") - ): - # Update notification: parsing stage (transcription) - if notification: - await ( - NotificationService.document_processing.notify_processing_progress( - session, - notification, - stage="parsing", - stage_message="Transcribing audio", - ) - ) - - await task_logger.log_task_progress( - log_entry, - f"Processing audio file for transcription: {filename}", - {"file_type": "audio", "processing_stage": "starting_transcription"}, - ) - - # Determine STT service type - stt_service_type = ( - "local" - if app_config.STT_SERVICE - and app_config.STT_SERVICE.startswith("local/") - else "external" - ) - - # Check if using local STT service - if stt_service_type == "local": - # Use local Faster-Whisper for transcription - from app.services.stt_service import stt_service - - try: - result = stt_service.transcribe_file(file_path) - transcribed_text = result.get("text", "") - - if not transcribed_text: - raise ValueError("Transcription returned empty text") - - # Add metadata about the transcription - transcribed_text = ( - f"# Transcription of {filename}\n\n{transcribed_text}" - ) - except Exception as e: - raise HTTPException( - status_code=422, - detail=f"Failed to transcribe audio file {filename}: {e!s}", - ) from e - - await task_logger.log_task_progress( - log_entry, - f"Local STT transcription completed: {filename}", - { - "processing_stage": "local_transcription_complete", - "language": 
result.get("language"), - "confidence": result.get("language_probability"), - "duration": result.get("duration"), - }, - ) - else: - # Use LiteLLM for audio transcription - with open(file_path, "rb") as audio_file: - transcription_kwargs = { - "model": app_config.STT_SERVICE, - "file": audio_file, - "api_key": app_config.STT_SERVICE_API_KEY, - } - if app_config.STT_SERVICE_API_BASE: - transcription_kwargs["api_base"] = ( - app_config.STT_SERVICE_API_BASE - ) - - transcription_response = await atranscription( - **transcription_kwargs - ) - - # Extract the transcribed text - transcribed_text = transcription_response.get("text", "") - - if not transcribed_text: - raise ValueError("Transcription returned empty text") - - # Add metadata about the transcription - transcribed_text = ( - f"# Transcription of {filename}\n\n{transcribed_text}" - ) - - await task_logger.log_task_progress( - log_entry, - f"Transcription completed, creating document: {filename}", - { - "processing_stage": "transcription_complete", - "transcript_length": len(transcribed_text), - }, - ) - - # Update notification: chunking stage - if notification: - await ( - NotificationService.document_processing.notify_processing_progress( - session, notification, stage="chunking" - ) - ) - - # Clean up the temp file - try: - os.unlink(file_path) - except Exception as e: - print("Error deleting temp file", e) - pass - - # Process transcription as markdown document - result = await add_received_markdown_file_document( - session, filename, transcribed_text, search_space_id, user_id, connector - ) - - if connector: - await _update_document_from_connector(result, connector, session) - - if result: - await task_logger.log_task_success( - log_entry, - f"Successfully transcribed and processed audio file: {filename}", - { - "document_id": result.id, - "content_hash": result.content_hash, - "file_type": "audio", - "transcript_length": len(transcribed_text), - "stt_service": stt_service_type, - }, - ) - return result - 
else: - await task_logger.log_task_success( - log_entry, - f"Audio file transcript already exists (duplicate): {filename}", - {"duplicate_detected": True, "file_type": "audio"}, - ) - return None - - else: - # Import page limit service - from app.services.page_limit_service import ( - PageLimitExceededError, - PageLimitService, - ) - - # Initialize page limit service - page_limit_service = PageLimitService(session) - - # CRITICAL: Estimate page count BEFORE making expensive ETL API calls - # This prevents users from incurring costs on files that would exceed their limit - try: - estimated_pages_before = ( - page_limit_service.estimate_pages_before_processing(file_path) - ) - except Exception: - # If estimation fails, use a conservative estimate based on file size - import os - - file_size = os.path.getsize(file_path) - estimated_pages_before = max( - 1, file_size // (80 * 1024) - ) # ~80KB per page - - await task_logger.log_task_progress( - log_entry, - f"Estimated {estimated_pages_before} pages for file: {filename}", - { - "estimated_pages": estimated_pages_before, - "file_type": "document", - }, - ) - - # Check page limit BEFORE calling ETL service to avoid unnecessary costs - try: - await page_limit_service.check_page_limit( - user_id, estimated_pages_before - ) - except PageLimitExceededError as e: - await task_logger.log_task_failure( - log_entry, - f"Page limit exceeded before processing: {filename}", - str(e), - { - "error_type": "PageLimitExceeded", - "pages_used": e.pages_used, - "pages_limit": e.pages_limit, - "estimated_pages": estimated_pages_before, - }, - ) - # Clean up the temp file - import os - - with contextlib.suppress(Exception): - os.unlink(file_path) - - raise HTTPException( - status_code=403, - detail=str(e), - ) from e - - if app_config.ETL_SERVICE == "UNSTRUCTURED": - # Update notification: parsing stage - if notification: - await NotificationService.document_processing.notify_processing_progress( - session, - notification, - 
stage="parsing", - stage_message="Extracting content", - ) - - await task_logger.log_task_progress( - log_entry, - f"Processing file with Unstructured ETL: {filename}", - { - "file_type": "document", - "etl_service": "UNSTRUCTURED", - "processing_stage": "loading", - }, - ) - - from langchain_unstructured import UnstructuredLoader - - # Process the file - loader = UnstructuredLoader( - file_path, - mode="elements", - post_processors=[], - languages=["eng"], - include_orig_elements=False, - include_metadata=False, - strategy="auto", - ) - - docs = await loader.aload() - - # Update notification: chunking stage - if notification: - await NotificationService.document_processing.notify_processing_progress( - session, notification, stage="chunking", chunks_count=len(docs) - ) - - await task_logger.log_task_progress( - log_entry, - f"Unstructured ETL completed, creating document: {filename}", - {"processing_stage": "etl_complete", "elements_count": len(docs)}, - ) - - # Verify actual page count from parsed documents - actual_pages = page_limit_service.estimate_pages_from_elements(docs) - - # Use the higher of the two estimates for safety (in case pre-estimate was too low) - final_page_count = max(estimated_pages_before, actual_pages) - - # If actual is significantly higher than estimate, log a warning - if actual_pages > estimated_pages_before * 1.5: - await task_logger.log_task_progress( - log_entry, - f"Actual page count higher than estimate: {filename}", - { - "estimated_before": estimated_pages_before, - "actual_pages": actual_pages, - "using_count": final_page_count, - }, - ) - - # Clean up the temp file - import os - - try: - os.unlink(file_path) - except Exception as e: - print("Error deleting temp file", e) - pass - - enable_summary = ( - connector.get("enable_summary", True) if connector else True - ) - result = await add_received_file_document_using_unstructured( - session, - filename, - docs, - search_space_id, - user_id, - connector, - 
enable_summary=enable_summary, - ) - - if connector: - await _update_document_from_connector(result, connector, session) - - if result: - # Update page usage after successful processing - # allow_exceed=True because document was already created after passing initial check - await page_limit_service.update_page_usage( - user_id, final_page_count, allow_exceed=True - ) - - await task_logger.log_task_success( - log_entry, - f"Successfully processed file with Unstructured: {filename}", - { - "document_id": result.id, - "content_hash": result.content_hash, - "file_type": "document", - "etl_service": "UNSTRUCTURED", - "pages_processed": final_page_count, - }, - ) - return result - else: - await task_logger.log_task_success( - log_entry, - f"Document already exists (duplicate): {filename}", - { - "duplicate_detected": True, - "file_type": "document", - "etl_service": "UNSTRUCTURED", - }, - ) - return None - - elif app_config.ETL_SERVICE == "LLAMACLOUD": - # Update notification: parsing stage - if notification: - await NotificationService.document_processing.notify_processing_progress( - session, - notification, - stage="parsing", - stage_message="Extracting content", - ) - - await task_logger.log_task_progress( - log_entry, - f"Processing file with LlamaCloud ETL: {filename}", - { - "file_type": "document", - "etl_service": "LLAMACLOUD", - "processing_stage": "parsing", - "estimated_pages": estimated_pages_before, - }, - ) - - # Parse file with retry logic for SSL/connection errors (common with large files) - result = await parse_with_llamacloud_retry( - file_path=file_path, - estimated_pages=estimated_pages_before, - task_logger=task_logger, - log_entry=log_entry, - ) - - # Clean up the temp file - import os - - try: - os.unlink(file_path) - except Exception as e: - print("Error deleting temp file", e) - pass - - # Get markdown documents from the result - markdown_documents = await result.aget_markdown_documents( - split_by_page=False - ) - - # Update notification: 
chunking stage - if notification: - await NotificationService.document_processing.notify_processing_progress( - session, - notification, - stage="chunking", - chunks_count=len(markdown_documents), - ) - - await task_logger.log_task_progress( - log_entry, - f"LlamaCloud parsing completed, creating documents: {filename}", - { - "processing_stage": "parsing_complete", - "documents_count": len(markdown_documents), - }, - ) - - # Check if LlamaCloud returned any documents - if not markdown_documents or len(markdown_documents) == 0: - await task_logger.log_task_failure( - log_entry, - f"LlamaCloud parsing returned no documents: {filename}", - "ETL service returned empty document list", - { - "error_type": "EmptyDocumentList", - "etl_service": "LLAMACLOUD", - }, - ) - raise ValueError( - f"LlamaCloud parsing returned no documents for {filename}" - ) - - # Verify actual page count from parsed markdown documents - actual_pages = page_limit_service.estimate_pages_from_markdown( - markdown_documents - ) - - # Use the higher of the two estimates for safety (in case pre-estimate was too low) - final_page_count = max(estimated_pages_before, actual_pages) - - # If actual is significantly higher than estimate, log a warning - if actual_pages > estimated_pages_before * 1.5: - await task_logger.log_task_progress( - log_entry, - f"Actual page count higher than estimate: {filename}", - { - "estimated_before": estimated_pages_before, - "actual_pages": actual_pages, - "using_count": final_page_count, - }, - ) - - # Track if any document was successfully created (not a duplicate) - any_doc_created = False - last_created_doc = None - - for doc in markdown_documents: - # Extract text content from the markdown documents - markdown_content = doc.text - - enable_summary = ( - connector.get("enable_summary", True) if connector else True - ) - doc_result = await add_received_file_document_using_llamacloud( - session, - filename, - llamacloud_markdown_document=markdown_content, - 
search_space_id=search_space_id, - user_id=user_id, - connector=connector, - enable_summary=enable_summary, - ) - - # Track if this document was successfully created - if doc_result: - any_doc_created = True - last_created_doc = doc_result - - # Update page usage once after processing all documents - # Only update if at least one document was created (not all duplicates) - if any_doc_created: - # Update page usage after successful processing - # allow_exceed=True because document was already created after passing initial check - await page_limit_service.update_page_usage( - user_id, final_page_count, allow_exceed=True - ) - - if connector: - await _update_document_from_connector( - last_created_doc, connector, session - ) - - await task_logger.log_task_success( - log_entry, - f"Successfully processed file with LlamaCloud: {filename}", - { - "document_id": last_created_doc.id, - "content_hash": last_created_doc.content_hash, - "file_type": "document", - "etl_service": "LLAMACLOUD", - "pages_processed": final_page_count, - "documents_count": len(markdown_documents), - }, - ) - return last_created_doc - else: - # All documents were duplicates (markdown_documents was not empty, but all returned None) - await task_logger.log_task_success( - log_entry, - f"Document already exists (duplicate): {filename}", - { - "duplicate_detected": True, - "file_type": "document", - "etl_service": "LLAMACLOUD", - "documents_count": len(markdown_documents), - }, - ) - return None - - elif app_config.ETL_SERVICE == "DOCLING": - # Update notification: parsing stage - if notification: - await NotificationService.document_processing.notify_processing_progress( - session, - notification, - stage="parsing", - stage_message="Extracting content", - ) - - await task_logger.log_task_progress( - log_entry, - f"Processing file with Docling ETL: {filename}", - { - "file_type": "document", - "etl_service": "DOCLING", - "processing_stage": "parsing", - }, - ) - - # Use Docling service for document 
processing - from app.services.docling_service import create_docling_service - - # Create Docling service - docling_service = create_docling_service() - - # Suppress pdfminer warnings that can cause processing to hang - # These warnings are harmless but can spam logs and potentially halt processing - # Suppress both Python warnings and logging warnings from pdfminer - pdfminer_logger = getLogger("pdfminer") - original_level = pdfminer_logger.level - - with warnings.catch_warnings(): - warnings.filterwarnings( - "ignore", category=UserWarning, module="pdfminer" - ) - warnings.filterwarnings( - "ignore", - message=".*Cannot set gray non-stroke color.*", - ) - warnings.filterwarnings("ignore", message=".*invalid float value.*") - - # Temporarily suppress pdfminer logging warnings - pdfminer_logger.setLevel(ERROR) - - try: - # Process the document - result = await docling_service.process_document( - file_path, filename - ) - finally: - # Restore original logging level - pdfminer_logger.setLevel(original_level) - - # Clean up the temp file - import os - - try: - os.unlink(file_path) - except Exception as e: - print("Error deleting temp file", e) - pass - - await task_logger.log_task_progress( - log_entry, - f"Docling parsing completed, creating document: {filename}", - { - "processing_stage": "parsing_complete", - "content_length": len(result["content"]), - }, - ) - - # Verify actual page count from content length - actual_pages = page_limit_service.estimate_pages_from_content_length( - len(result["content"]) - ) - - # Use the higher of the two estimates for safety (in case pre-estimate was too low) - final_page_count = max(estimated_pages_before, actual_pages) - - # If actual is significantly higher than estimate, log a warning - if actual_pages > estimated_pages_before * 1.5: - await task_logger.log_task_progress( - log_entry, - f"Actual page count higher than estimate: {filename}", - { - "estimated_before": estimated_pages_before, - "actual_pages": actual_pages, - 
"using_count": final_page_count, - }, - ) - - # Update notification: chunking stage - if notification: - await NotificationService.document_processing.notify_processing_progress( - session, notification, stage="chunking" - ) - - enable_summary = ( - connector.get("enable_summary", True) if connector else True - ) - doc_result = await add_received_file_document_using_docling( - session, - filename, - docling_markdown_document=result["content"], - search_space_id=search_space_id, - user_id=user_id, - connector=connector, - enable_summary=enable_summary, - ) - - if doc_result: - # Update page usage after successful processing - # allow_exceed=True because document was already created after passing initial check - await page_limit_service.update_page_usage( - user_id, final_page_count, allow_exceed=True - ) - - if connector: - await _update_document_from_connector( - doc_result, connector, session - ) - - await task_logger.log_task_success( - log_entry, - f"Successfully processed file with Docling: {filename}", - { - "document_id": doc_result.id, - "content_hash": doc_result.content_hash, - "file_type": "document", - "etl_service": "DOCLING", - "pages_processed": final_page_count, - }, - ) - return doc_result - else: - await task_logger.log_task_success( - log_entry, - f"Document already exists (duplicate): {filename}", - { - "duplicate_detected": True, - "file_type": "document", - "etl_service": "DOCLING", - }, - ) - return None except Exception as e: await session.rollback() - # For page limit errors, use the detailed message from the exception from app.services.page_limit_service import PageLimitExceededError if isinstance(e, PageLimitExceededError): @@ -1645,10 +321,88 @@ async def process_file_in_background( str(e), {"error_type": type(e).__name__, "filename": filename}, ) - import logging - logging.error(f"Error processing file in background: {error_message}") - raise # Re-raise so the wrapper can also handle it + raise + + +# 
=================================================================== +# 2-phase handler (process_file_in_background_with_document) +# =================================================================== + + +async def _extract_file_content( + file_path: str, + filename: str, + session: AsyncSession, + user_id: str, + task_logger: TaskLoggingService, + log_entry: Log, + notification: Notification | None, +) -> tuple[str, str]: + """ + Extract markdown content from a file regardless of type. + + Returns: + Tuple of (markdown_content, etl_service_name). + """ + from app.etl_pipeline.etl_document import EtlRequest + from app.etl_pipeline.etl_pipeline_service import EtlPipelineService + from app.etl_pipeline.file_classifier import ( + FileCategory, + classify_file as etl_classify, + ) + + category = etl_classify(filename) + estimated_pages = 0 + + if notification: + stage_messages = { + FileCategory.PLAINTEXT: "Reading file", + FileCategory.DIRECT_CONVERT: "Converting file", + FileCategory.AUDIO: "Transcribing audio", + FileCategory.UNSUPPORTED: "Unsupported file type", + FileCategory.DOCUMENT: "Extracting content", + } + await NotificationService.document_processing.notify_processing_progress( + session, + notification, + stage="parsing", + stage_message=stage_messages.get(category, "Processing"), + ) + + await task_logger.log_task_progress( + log_entry, + f"Processing {category.value} file: {filename}", + {"file_type": category.value, "processing_stage": "extracting"}, + ) + + if category == FileCategory.DOCUMENT: + from app.services.page_limit_service import PageLimitService + + page_limit_service = PageLimitService(session) + estimated_pages = _estimate_pages_safe(page_limit_service, file_path) + await page_limit_service.check_page_limit(user_id, estimated_pages) + + result = await EtlPipelineService().extract( + EtlRequest( + file_path=file_path, + filename=filename, + estimated_pages=estimated_pages, + ) + ) + + if category == FileCategory.DOCUMENT: + await 
page_limit_service.update_page_usage( + user_id, estimated_pages, allow_exceed=True + ) + + with contextlib.suppress(Exception): + os.unlink(file_path) + + if not result.markdown_content: + raise RuntimeError(f"Failed to extract content from file: {filename}") + + return result.markdown_content, result.etl_service async def process_file_in_background_with_document( @@ -1667,272 +421,50 @@ async def process_file_in_background_with_document( """ Process file and update existing pending document (2-phase pattern). - This function is Phase 2 of the real-time document status updates: - - Phase 1 (API): Created document with pending status - - Phase 2 (this): Process file and update document to ready/failed - - The document already exists with pending status. This function: - 1. Parses the file content (markdown, audio, or ETL services) - 2. Updates the document with content, embeddings, and chunks - 3. Sets status to 'ready' on success - - Args: - document: Existing document with pending status - file_path: Path to the uploaded file - filename: Original filename - search_space_id: ID of the search space - user_id: ID of the user - session: Database session - task_logger: Task logging service - log_entry: Log entry for this task - connector: Optional connector info for Google Drive files - notification: Optional notification for progress updates - - Returns: - Updated Document object if successful, None if duplicate content detected + Phase 1 (API layer): Created document with pending status. + Phase 2 (this function): Process file and update document to ready/failed. 
""" - import os - - from app.config import config as app_config + from app.indexing_pipeline.adapters.file_upload_adapter import ( + UploadDocumentAdapter, + ) from app.services.llm_service import get_user_long_context_llm + from app.utils.document_converters import generate_content_hash + + from .base import check_duplicate_document doc_id = document.id try: - markdown_content = None - etl_service = None - - # ===== STEP 1: Parse file content based on type ===== - - # Check if the file is a markdown or text file - if filename.lower().endswith((".md", ".markdown", ".txt")): - # Update notification: parsing stage - if notification: - await ( - NotificationService.document_processing.notify_processing_progress( - session, - notification, - stage="parsing", - stage_message="Reading file", - ) - ) - - await task_logger.log_task_progress( - log_entry, - f"Processing markdown/text file: {filename}", - {"file_type": "markdown", "processing_stage": "reading_file"}, - ) - - # Read markdown content directly - with open(file_path, encoding="utf-8") as f: - markdown_content = f.read() - etl_service = "MARKDOWN" - - # Clean up temp file - with contextlib.suppress(Exception): - os.unlink(file_path) - - # Check if the file is an audio file - elif filename.lower().endswith( - (".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm") - ): - # Update notification: parsing stage (transcription) - if notification: - await ( - NotificationService.document_processing.notify_processing_progress( - session, - notification, - stage="parsing", - stage_message="Transcribing audio", - ) - ) - - await task_logger.log_task_progress( - log_entry, - f"Processing audio file for transcription: {filename}", - {"file_type": "audio", "processing_stage": "starting_transcription"}, - ) - - # Transcribe audio - stt_service_type = ( - "local" - if app_config.STT_SERVICE - and app_config.STT_SERVICE.startswith("local/") - else "external" - ) - - if stt_service_type == "local": - from 
app.services.stt_service import stt_service - - result = stt_service.transcribe_file(file_path) - transcribed_text = result.get("text", "") - if not transcribed_text: - raise ValueError("Transcription returned empty text") - markdown_content = ( - f"# Transcription of {filename}\n\n{transcribed_text}" - ) - else: - with open(file_path, "rb") as audio_file: - transcription_kwargs = { - "model": app_config.STT_SERVICE, - "file": audio_file, - "api_key": app_config.STT_SERVICE_API_KEY, - } - if app_config.STT_SERVICE_API_BASE: - transcription_kwargs["api_base"] = ( - app_config.STT_SERVICE_API_BASE - ) - transcription_response = await atranscription( - **transcription_kwargs - ) - transcribed_text = transcription_response.get("text", "") - if not transcribed_text: - raise ValueError("Transcription returned empty text") - markdown_content = ( - f"# Transcription of {filename}\n\n{transcribed_text}" - ) - - etl_service = "AUDIO_TRANSCRIPTION" - # Clean up temp file - with contextlib.suppress(Exception): - os.unlink(file_path) - - else: - # Document files - use ETL service - from app.services.page_limit_service import ( - PageLimitExceededError, - PageLimitService, - ) - - page_limit_service = PageLimitService(session) - - # Estimate page count - try: - estimated_pages = page_limit_service.estimate_pages_before_processing( - file_path - ) - except Exception: - file_size = os.path.getsize(file_path) - estimated_pages = max(1, file_size // (80 * 1024)) - - # Check page limit - await page_limit_service.check_page_limit(user_id, estimated_pages) - - if app_config.ETL_SERVICE == "UNSTRUCTURED": - if notification: - await NotificationService.document_processing.notify_processing_progress( - session, - notification, - stage="parsing", - stage_message="Extracting content", - ) - - from langchain_unstructured import UnstructuredLoader - - loader = UnstructuredLoader( - file_path, - mode="elements", - post_processors=[], - languages=["eng"], - include_orig_elements=False, - 
include_metadata=False, - strategy="auto", - ) - docs = await loader.aload() - markdown_content = await convert_document_to_markdown(docs) - actual_pages = page_limit_service.estimate_pages_from_elements(docs) - final_page_count = max(estimated_pages, actual_pages) - etl_service = "UNSTRUCTURED" - - # Update page usage - await page_limit_service.update_page_usage( - user_id, final_page_count, allow_exceed=True - ) - - elif app_config.ETL_SERVICE == "LLAMACLOUD": - if notification: - await NotificationService.document_processing.notify_processing_progress( - session, - notification, - stage="parsing", - stage_message="Extracting content", - ) - - result = await parse_with_llamacloud_retry( - file_path=file_path, - estimated_pages=estimated_pages, - task_logger=task_logger, - log_entry=log_entry, - ) - markdown_documents = await result.aget_markdown_documents( - split_by_page=False - ) - if not markdown_documents: - raise RuntimeError( - f"LlamaCloud parsing returned no documents: {filename}" - ) - markdown_content = markdown_documents[0].text - etl_service = "LLAMACLOUD" - - # Update page usage - await page_limit_service.update_page_usage( - user_id, estimated_pages, allow_exceed=True - ) - - elif app_config.ETL_SERVICE == "DOCLING": - if notification: - await NotificationService.document_processing.notify_processing_progress( - session, - notification, - stage="parsing", - stage_message="Extracting content", - ) - - # Suppress logging during Docling import - getLogger("docling.pipeline.base_pipeline").setLevel(ERROR) - getLogger("docling.document_converter").setLevel(ERROR) - getLogger( - "docling_core.transforms.chunker.hierarchical_chunker" - ).setLevel(ERROR) - - from docling.document_converter import DocumentConverter - - converter = DocumentConverter() - result = converter.convert(file_path) - markdown_content = result.document.export_to_markdown() - etl_service = "DOCLING" - - # Update page usage - await page_limit_service.update_page_usage( - user_id, 
estimated_pages, allow_exceed=True - ) - - else: - raise RuntimeError(f"Unknown ETL_SERVICE: {app_config.ETL_SERVICE}") - - # Clean up temp file - with contextlib.suppress(Exception): - os.unlink(file_path) + # Step 1: extract content + markdown_content, etl_service = await _extract_file_content( + file_path, + filename, + session, + user_id, + task_logger, + log_entry, + notification, + ) if not markdown_content: raise RuntimeError(f"Failed to extract content from file: {filename}") - # ===== STEP 2: Check for duplicate content ===== + # Step 2: duplicate check content_hash = generate_content_hash(markdown_content, search_space_id) - existing_by_content = await check_duplicate_document(session, content_hash) if existing_by_content and existing_by_content.id != doc_id: - # Duplicate content found - mark this document as failed logging.info( f"Duplicate content detected for {filename}, " f"matches document {existing_by_content.id}" ) return None - # ===== STEP 3+4: Index via pipeline ===== + # Step 3: index via pipeline if notification: await NotificationService.document_processing.notify_processing_progress( - session, notification, stage="chunking" + session, + notification, + stage="chunking", ) user_llm = await get_user_long_context_llm(session, user_id, search_space_id) @@ -1957,7 +489,6 @@ async def process_file_in_background_with_document( "file_type": etl_service, }, ) - return document except Exception as e: diff --git a/surfsense_backend/app/tasks/document_processors/markdown_processor.py b/surfsense_backend/app/tasks/document_processors/markdown_processor.py index 2fb711bf8..0ff340c0e 100644 --- a/surfsense_backend/app/tasks/document_processors/markdown_processor.py +++ b/surfsense_backend/app/tasks/document_processors/markdown_processor.py @@ -14,88 +14,19 @@ from app.utils.document_converters import ( create_document_chunks, generate_content_hash, generate_document_summary, - generate_unique_identifier_hash, ) +from ._helpers import ( + 
find_existing_document_with_migration, + get_google_drive_unique_identifier, +) from .base import ( - check_document_by_unique_identifier, check_duplicate_document, get_current_timestamp, safe_set_chunks, ) -def _get_google_drive_unique_identifier( - connector: dict | None, - filename: str, - search_space_id: int, -) -> tuple[str, str | None]: - """ - Get unique identifier hash for a file, with special handling for Google Drive. - - For Google Drive files, uses file_id as the unique identifier (doesn't change on rename). - For other files, uses filename. - - Args: - connector: Optional connector info dict with type and metadata - filename: The filename (used for non-Google Drive files or as fallback) - search_space_id: The search space ID - - Returns: - Tuple of (primary_hash, legacy_hash or None) - """ - if connector and connector.get("type") == DocumentType.GOOGLE_DRIVE_FILE: - metadata = connector.get("metadata", {}) - file_id = metadata.get("google_drive_file_id") - - if file_id: - primary_hash = generate_unique_identifier_hash( - DocumentType.GOOGLE_DRIVE_FILE, file_id, search_space_id - ) - legacy_hash = generate_unique_identifier_hash( - DocumentType.GOOGLE_DRIVE_FILE, filename, search_space_id - ) - return primary_hash, legacy_hash - - primary_hash = generate_unique_identifier_hash( - DocumentType.FILE, filename, search_space_id - ) - return primary_hash, None - - -async def _find_existing_document_with_migration( - session: AsyncSession, - primary_hash: str, - legacy_hash: str | None, - content_hash: str | None = None, -) -> Document | None: - """ - Find existing document, checking both new hash and legacy hash for migration, - with fallback to content_hash for cross-source deduplication. 
- """ - existing_document = await check_document_by_unique_identifier(session, primary_hash) - - if not existing_document and legacy_hash: - existing_document = await check_document_by_unique_identifier( - session, legacy_hash - ) - if existing_document: - logging.info( - "Found legacy document (filename-based hash), will migrate to file_id-based hash" - ) - - # Fallback: check by content_hash to catch duplicates from different sources - if not existing_document and content_hash: - existing_document = await check_duplicate_document(session, content_hash) - if existing_document: - logging.info( - f"Found duplicate content from different source (content_hash match). " - f"Original document ID: {existing_document.id}, type: {existing_document.document_type}" - ) - - return existing_document - - async def _handle_existing_document_update( session: AsyncSession, existing_document: Document, @@ -224,7 +155,7 @@ async def add_received_markdown_file_document( try: # Generate unique identifier hash (uses file_id for Google Drive, filename for others) - primary_hash, legacy_hash = _get_google_drive_unique_identifier( + primary_hash, legacy_hash = get_google_drive_unique_identifier( connector, file_name, search_space_id ) @@ -232,7 +163,7 @@ async def add_received_markdown_file_document( content_hash = generate_content_hash(file_in_markdown, search_space_id) # Check if document exists (with migration support for Google Drive and content_hash fallback) - existing_document = await _find_existing_document_with_migration( + existing_document = await find_existing_document_with_migration( session, primary_hash, legacy_hash, content_hash ) diff --git a/surfsense_backend/app/utils/document_versioning.py b/surfsense_backend/app/utils/document_versioning.py new file mode 100644 index 000000000..e6ad1fb06 --- /dev/null +++ b/surfsense_backend/app/utils/document_versioning.py @@ -0,0 +1,107 @@ +"""Document versioning: snapshot creation and cleanup. 
+ +Rules: +- 30-minute debounce window: if the latest version was created < 30 min ago, + overwrite it instead of creating a new row. +- Maximum 20 versions per document. +- Versions older than 90 days are cleaned up. +""" + +from datetime import UTC, datetime, timedelta + +from sqlalchemy import delete, func, select +from sqlalchemy.ext.asyncio import AsyncSession + +from app.db import Document, DocumentVersion + +MAX_VERSIONS_PER_DOCUMENT = 20 +DEBOUNCE_MINUTES = 30 +RETENTION_DAYS = 90 + + +def _now() -> datetime: + return datetime.now(UTC) + + +async def create_version_snapshot( + session: AsyncSession, + document: Document, +) -> DocumentVersion | None: + """Snapshot the document's current state into a DocumentVersion row. + + Returns the created/updated DocumentVersion, or None if nothing was done. + """ + now = _now() + + latest = ( + await session.execute( + select(DocumentVersion) + .where(DocumentVersion.document_id == document.id) + .order_by(DocumentVersion.version_number.desc()) + .limit(1) + ) + ).scalar_one_or_none() + + if latest is not None: + age = now - latest.created_at.replace(tzinfo=UTC) + if age < timedelta(minutes=DEBOUNCE_MINUTES): + latest.source_markdown = document.source_markdown + latest.content_hash = document.content_hash + latest.title = document.title + latest.created_at = now + await session.flush() + return latest + + max_num = ( + await session.execute( + select(func.coalesce(func.max(DocumentVersion.version_number), 0)).where( + DocumentVersion.document_id == document.id + ) + ) + ).scalar_one() + + version = DocumentVersion( + document_id=document.id, + version_number=max_num + 1, + source_markdown=document.source_markdown, + content_hash=document.content_hash, + title=document.title, + created_at=now, + ) + session.add(version) + await session.flush() + + # Cleanup: remove versions older than 90 days + cutoff = now - timedelta(days=RETENTION_DAYS) + await session.execute( + delete(DocumentVersion).where( + 
DocumentVersion.document_id == document.id, + DocumentVersion.created_at < cutoff, + ) + ) + + # Cleanup: cap at MAX_VERSIONS_PER_DOCUMENT + count = ( + await session.execute( + select(func.count()) + .select_from(DocumentVersion) + .where(DocumentVersion.document_id == document.id) + ) + ).scalar_one() + + if count > MAX_VERSIONS_PER_DOCUMENT: + excess = count - MAX_VERSIONS_PER_DOCUMENT + oldest_ids_result = await session.execute( + select(DocumentVersion.id) + .where(DocumentVersion.document_id == document.id) + .order_by(DocumentVersion.version_number.asc()) + .limit(excess) + ) + oldest_ids = [row[0] for row in oldest_ids_result.all()] + if oldest_ids: + await session.execute( + delete(DocumentVersion).where(DocumentVersion.id.in_(oldest_ids)) + ) + + await session.flush() + return version diff --git a/surfsense_backend/app/utils/file_extensions.py b/surfsense_backend/app/utils/file_extensions.py new file mode 100644 index 000000000..8d432ce56 --- /dev/null +++ b/surfsense_backend/app/utils/file_extensions.py @@ -0,0 +1,124 @@ +"""Per-parser document extension sets for the ETL pipeline. + +Every consumer (file_classifier, connector-level skip checks, ETL pipeline +validation) imports from here so there is a single source of truth. + +Extensions already covered by PLAINTEXT_EXTENSIONS, AUDIO_EXTENSIONS, or +DIRECT_CONVERT_EXTENSIONS in file_classifier are NOT repeated here -- these +sets are exclusively for the "document" ETL path (Docling / LlamaParse / +Unstructured). 
+""" + +from pathlib import PurePosixPath + +# --------------------------------------------------------------------------- +# Per-parser document extension sets (from official documentation) +# --------------------------------------------------------------------------- + +DOCLING_DOCUMENT_EXTENSIONS: frozenset[str] = frozenset( + { + ".pdf", + ".docx", + ".xlsx", + ".pptx", + ".png", + ".jpg", + ".jpeg", + ".tiff", + ".tif", + ".bmp", + ".webp", + } +) + +LLAMAPARSE_DOCUMENT_EXTENSIONS: frozenset[str] = frozenset( + { + ".pdf", + ".docx", + ".doc", + ".xlsx", + ".xls", + ".pptx", + ".ppt", + ".docm", + ".dot", + ".dotm", + ".pptm", + ".pot", + ".potx", + ".xlsm", + ".xlsb", + ".xlw", + ".rtf", + ".epub", + ".png", + ".jpg", + ".jpeg", + ".gif", + ".bmp", + ".tiff", + ".tif", + ".webp", + ".svg", + ".odt", + ".ods", + ".odp", + ".hwp", + ".hwpx", + } +) + +UNSTRUCTURED_DOCUMENT_EXTENSIONS: frozenset[str] = frozenset( + { + ".pdf", + ".docx", + ".doc", + ".xlsx", + ".xls", + ".pptx", + ".ppt", + ".png", + ".jpg", + ".jpeg", + ".bmp", + ".tiff", + ".tif", + ".heic", + ".rtf", + ".epub", + ".odt", + ".eml", + ".msg", + ".p7s", + } +) + +# --------------------------------------------------------------------------- +# Union (used by classify_file for routing) + service lookup +# --------------------------------------------------------------------------- + +DOCUMENT_EXTENSIONS: frozenset[str] = ( + DOCLING_DOCUMENT_EXTENSIONS + | LLAMAPARSE_DOCUMENT_EXTENSIONS + | UNSTRUCTURED_DOCUMENT_EXTENSIONS +) + +_SERVICE_MAP: dict[str, frozenset[str]] = { + "DOCLING": DOCLING_DOCUMENT_EXTENSIONS, + "LLAMACLOUD": LLAMAPARSE_DOCUMENT_EXTENSIONS, + "UNSTRUCTURED": UNSTRUCTURED_DOCUMENT_EXTENSIONS, +} + + +def get_document_extensions_for_service(etl_service: str | None) -> frozenset[str]: + """Return the document extensions supported by *etl_service*. + + Falls back to the full union when the service is ``None`` or unknown. 
+ """ + return _SERVICE_MAP.get(etl_service or "", DOCUMENT_EXTENSIONS) + + +def is_supported_document_extension(filename: str) -> bool: + """Return True if the file's extension is in the supported document set.""" + suffix = PurePosixPath(filename).suffix.lower() + return suffix in DOCUMENT_EXTENSIONS diff --git a/surfsense_backend/app/utils/oauth_security.py b/surfsense_backend/app/utils/oauth_security.py index 5135cdef4..c39b1e9b1 100644 --- a/surfsense_backend/app/utils/oauth_security.py +++ b/surfsense_backend/app/utils/oauth_security.py @@ -11,6 +11,8 @@ import hmac import json import logging import time +from random import SystemRandom +from string import ascii_letters, digits from uuid import UUID from cryptography.fernet import Fernet @@ -18,6 +20,25 @@ from fastapi import HTTPException logger = logging.getLogger(__name__) +_PKCE_CHARS = ascii_letters + digits + "-._~" +_PKCE_RNG = SystemRandom() + + +def generate_code_verifier(length: int = 128) -> str: + """Generate a PKCE code_verifier (RFC 7636, 43-128 unreserved chars).""" + return "".join(_PKCE_RNG.choice(_PKCE_CHARS) for _ in range(length)) + + +def generate_pkce_pair(length: int = 128) -> tuple[str, str]: + """Generate a PKCE code_verifier and its S256 code_challenge.""" + verifier = generate_code_verifier(length) + challenge = ( + base64.urlsafe_b64encode(hashlib.sha256(verifier.encode()).digest()) + .decode() + .rstrip("=") + ) + return verifier, challenge + class OAuthStateManager: """Manages secure OAuth state parameters with HMAC signatures.""" diff --git a/surfsense_backend/pyproject.toml b/surfsense_backend/pyproject.toml index 8e3f48b11..893aa77f9 100644 --- a/surfsense_backend/pyproject.toml +++ b/surfsense_backend/pyproject.toml @@ -46,8 +46,6 @@ dependencies = [ "redis>=5.2.1", "firecrawl-py>=4.9.0", "boto3>=1.35.0", - "litellm>=1.80.10", - "langchain-litellm>=0.3.5", "fake-useragent>=2.2.0", "trafilatura>=2.0.0", "fastapi-users[oauth,sqlalchemy]>=15.0.3", @@ -75,6 +73,8 @@ dependencies 
= [ "langchain-community>=0.4.1", "deepagents>=0.4.12", "stripe>=15.0.0", + "litellm>=1.83.0", + "langchain-litellm>=0.6.4", ] [dependency-groups] diff --git a/surfsense_backend/tests/integration/document_upload/conftest.py b/surfsense_backend/tests/integration/document_upload/conftest.py index 1f1c7df59..62f4f6b47 100644 --- a/surfsense_backend/tests/integration/document_upload/conftest.py +++ b/surfsense_backend/tests/integration/document_upload/conftest.py @@ -3,6 +3,7 @@ Prerequisites: PostgreSQL + pgvector only. External system boundaries are mocked: + - ETL parsing — LlamaParse (external API) and Docling (heavy library) - LLM summarization, text embedding, text chunking (external APIs) - Redis heartbeat (external infrastructure) - Task dispatch is swapped via DI (InlineTaskDispatcher) @@ -11,6 +12,7 @@ External system boundaries are mocked: from __future__ import annotations import contextlib +import os from collections.abc import AsyncGenerator from unittest.mock import AsyncMock, MagicMock @@ -298,3 +300,59 @@ def _mock_redis_heartbeat(monkeypatch): "app.tasks.celery_tasks.document_tasks._run_heartbeat_loop", AsyncMock(), ) + + +_MOCK_ETL_MARKDOWN = "# Mocked Document\n\nThis is mocked ETL content." + + +@pytest.fixture(autouse=True) +def _mock_etl_parsing(monkeypatch): + """Mock ETL parsing services — LlamaParse and Docling are external boundaries. + + Preserves the real contract: empty/corrupt files raise an error just like + the actual services would, so tests covering failure paths keep working. 
+ """ + + def _reject_empty(file_path: str) -> None: + if os.path.getsize(file_path) == 0: + raise RuntimeError(f"Cannot parse empty file: {file_path}") + + # -- LlamaParse mock (external API) -------------------------------- + + async def _fake_llamacloud_parse(file_path: str, estimated_pages: int) -> str: + _reject_empty(file_path) + return _MOCK_ETL_MARKDOWN + + monkeypatch.setattr( + "app.etl_pipeline.parsers.llamacloud.parse_with_llamacloud", + _fake_llamacloud_parse, + ) + + # -- Docling mock (heavy library boundary) ------------------------- + + async def _fake_docling_parse(file_path: str, filename: str) -> str: + _reject_empty(file_path) + return _MOCK_ETL_MARKDOWN + + monkeypatch.setattr( + "app.etl_pipeline.parsers.docling.parse_with_docling", + _fake_docling_parse, + ) + + class _FakeDoclingResult: + class Document: + @staticmethod + def export_to_markdown(): + return _MOCK_ETL_MARKDOWN + + document = Document() + + class _FakeDocumentConverter: + def convert(self, file_path): + _reject_empty(file_path) + return _FakeDoclingResult() + + monkeypatch.setattr( + "docling.document_converter.DocumentConverter", + _FakeDocumentConverter, + ) diff --git a/surfsense_backend/tests/integration/document_upload/test_upload_limits.py b/surfsense_backend/tests/integration/document_upload/test_upload_limits.py index a8dab43f0..a56398baa 100644 --- a/surfsense_backend/tests/integration/document_upload/test_upload_limits.py +++ b/surfsense_backend/tests/integration/document_upload/test_upload_limits.py @@ -2,12 +2,11 @@ Integration tests for backend file upload limit enforcement. These tests verify that the API rejects uploads that exceed: - - Max files per upload (10) - - Max per-file size (50 MB) - - Max total upload size (200 MB) + - Max per-file size (500 MB) -The limits mirror the frontend's DocumentUploadTab.tsx constants and are -enforced server-side to protect against direct API calls. 
+No file count or total size limits are enforced — the frontend batches +uploads in groups of 5 and there is no cap on how many files a user can +upload in a single session. Prerequisites: - PostgreSQL + pgvector @@ -24,60 +23,12 @@ pytestmark = pytest.mark.integration # --------------------------------------------------------------------------- -# Test A: File count limit -# --------------------------------------------------------------------------- - - -class TestFileCountLimit: - """Uploading more than 10 files in a single request should be rejected.""" - - async def test_11_files_returns_413( - self, - client: httpx.AsyncClient, - headers: dict[str, str], - search_space_id: int, - ): - files = [ - ("files", (f"file_{i}.txt", io.BytesIO(b"test content"), "text/plain")) - for i in range(11) - ] - resp = await client.post( - "/api/v1/documents/fileupload", - headers=headers, - files=files, - data={"search_space_id": str(search_space_id)}, - ) - assert resp.status_code == 413 - assert "too many files" in resp.json()["detail"].lower() - - async def test_10_files_accepted( - self, - client: httpx.AsyncClient, - headers: dict[str, str], - search_space_id: int, - cleanup_doc_ids: list[int], - ): - files = [ - ("files", (f"file_{i}.txt", io.BytesIO(b"test content"), "text/plain")) - for i in range(10) - ] - resp = await client.post( - "/api/v1/documents/fileupload", - headers=headers, - files=files, - data={"search_space_id": str(search_space_id)}, - ) - assert resp.status_code == 200 - cleanup_doc_ids.extend(resp.json().get("document_ids", [])) - - -# --------------------------------------------------------------------------- -# Test B: Per-file size limit +# Test: Per-file size limit (500 MB) # --------------------------------------------------------------------------- class TestPerFileSizeLimit: - """A single file exceeding 50 MB should be rejected.""" + """A single file exceeding 500 MB should be rejected.""" async def test_oversized_file_returns_413( self, @@ -85,7 
+36,7 @@ class TestPerFileSizeLimit: headers: dict[str, str], search_space_id: int, ): - oversized = io.BytesIO(b"\x00" * (50 * 1024 * 1024 + 1)) + oversized = io.BytesIO(b"\x00" * (500 * 1024 * 1024 + 1)) resp = await client.post( "/api/v1/documents/fileupload", headers=headers, @@ -102,11 +53,11 @@ class TestPerFileSizeLimit: search_space_id: int, cleanup_doc_ids: list[int], ): - at_limit = io.BytesIO(b"\x00" * (50 * 1024 * 1024)) + at_limit = io.BytesIO(b"\x00" * (500 * 1024 * 1024)) resp = await client.post( "/api/v1/documents/fileupload", headers=headers, - files=[("files", ("exact50mb.txt", at_limit, "text/plain"))], + files=[("files", ("exact500mb.txt", at_limit, "text/plain"))], data={"search_space_id": str(search_space_id)}, ) assert resp.status_code == 200 @@ -114,26 +65,23 @@ class TestPerFileSizeLimit: # --------------------------------------------------------------------------- -# Test C: Total upload size limit +# Test: Multiple files accepted without count limit # --------------------------------------------------------------------------- -class TestTotalSizeLimit: - """Multiple files whose combined size exceeds 200 MB should be rejected.""" +class TestNoFileCountLimit: + """Many files in a single request should be accepted.""" - async def test_total_size_over_200mb_returns_413( + async def test_many_files_accepted( self, client: httpx.AsyncClient, headers: dict[str, str], search_space_id: int, + cleanup_doc_ids: list[int], ): - chunk_size = 45 * 1024 * 1024 # 45 MB each files = [ - ( - "files", - (f"chunk_{i}.txt", io.BytesIO(b"\x00" * chunk_size), "text/plain"), - ) - for i in range(5) # 5 x 45 MB = 225 MB > 200 MB + ("files", (f"file_{i}.txt", io.BytesIO(b"test content"), "text/plain")) + for i in range(20) ] resp = await client.post( "/api/v1/documents/fileupload", @@ -141,5 +89,5 @@ class TestTotalSizeLimit: files=files, data={"search_space_id": str(search_space_id)}, ) - assert resp.status_code == 413 - assert "total upload size" in 
resp.json()["detail"].lower() + assert resp.status_code == 200 + cleanup_doc_ids.extend(resp.json().get("document_ids", [])) diff --git a/surfsense_backend/tests/integration/google_unification/test_drive_indexer_credentials.py b/surfsense_backend/tests/integration/google_unification/test_drive_indexer_credentials.py index 5bb0b6137..e669fa143 100644 --- a/surfsense_backend/tests/integration/google_unification/test_drive_indexer_credentials.py +++ b/surfsense_backend/tests/integration/google_unification/test_drive_indexer_credentials.py @@ -124,7 +124,7 @@ async def test_composio_connector_without_account_id_returns_error( maker = make_session_factory(async_engine) async with maker() as session: - count, _skipped, error = await index_google_drive_files( + count, _skipped, error, _unsupported = await index_google_drive_files( session=session, connector_id=data["connector_id"], search_space_id=data["search_space_id"], diff --git a/surfsense_backend/tests/integration/indexing_pipeline/test_local_folder_pipeline.py b/surfsense_backend/tests/integration/indexing_pipeline/test_local_folder_pipeline.py new file mode 100644 index 000000000..000f43aa8 --- /dev/null +++ b/surfsense_backend/tests/integration/indexing_pipeline/test_local_folder_pipeline.py @@ -0,0 +1,1180 @@ +"""Integration tests for local folder indexer — Tier 3 (I1-I5), Tier 4 (F1-F7), Tier 5 (P1), Tier 6 (B1-B2).""" + +import os +from contextlib import asynccontextmanager +from pathlib import Path + +import pytest +from sqlalchemy import func, select +from sqlalchemy.ext.asyncio import AsyncSession + +from app.db import ( + Document, + DocumentStatus, + DocumentType, + DocumentVersion, + Folder, + SearchSpace, + User, +) + +pytestmark = pytest.mark.integration + +UNIFIED_FIXTURES = ( + "patched_summarize", + "patched_embed_texts", + "patched_chunk_text", +) + + +class _FakeSessionMaker: + """Wraps an existing AsyncSession so ``async with factory()`` yields it + without closing it. 
Used to route batch-mode DB operations through the + test's savepoint-wrapped session.""" + + def __init__(self, session: AsyncSession): + self._session = session + + def __call__(self): + @asynccontextmanager + async def _ctx(): + yield self._session + + return _ctx() + + +@pytest.fixture +def patched_batch_sessions(monkeypatch, db_session): + """Make ``_index_batch_files`` use the test session and run sequentially.""" + monkeypatch.setattr( + "app.tasks.connector_indexers.local_folder_indexer.get_celery_session_maker", + lambda: _FakeSessionMaker(db_session), + ) + monkeypatch.setattr( + "app.tasks.connector_indexers.local_folder_indexer.BATCH_CONCURRENCY", + 1, + ) + + +# ==================================================================== +# Tier 3: Full Indexer Integration (I1-I5) +# ==================================================================== + + +class TestFullIndexer: + @pytest.mark.usefixtures(*UNIFIED_FIXTURES) + async def test_i1_new_file_indexed( + self, + db_session: AsyncSession, + db_user: User, + db_search_space: SearchSpace, + tmp_path: Path, + ): + """I1: Single new .md file is indexed with status READY.""" + from app.tasks.connector_indexers.local_folder_indexer import index_local_folder + + (tmp_path / "note.md").write_text("# Hello World\n\nContent here.") + + count, _skipped, _root_folder_id, err = await index_local_folder( + session=db_session, + search_space_id=db_search_space.id, + user_id=str(db_user.id), + folder_path=str(tmp_path), + folder_name="test-folder", + ) + + assert err is None + assert count == 1 + + docs = ( + ( + await db_session.execute( + select(Document).where( + Document.document_type == DocumentType.LOCAL_FOLDER_FILE, + Document.search_space_id == db_search_space.id, + ) + ) + ) + .scalars() + .all() + ) + assert len(docs) == 1 + assert docs[0].document_type == DocumentType.LOCAL_FOLDER_FILE + assert DocumentStatus.is_state(docs[0].status, DocumentStatus.READY) + + @pytest.mark.usefixtures(*UNIFIED_FIXTURES) + 
async def test_i2_unchanged_skipped( + self, + db_session: AsyncSession, + db_user: User, + db_search_space: SearchSpace, + tmp_path: Path, + ): + """I2: Second run on unchanged directory creates no new documents.""" + from app.tasks.connector_indexers.local_folder_indexer import index_local_folder + + (tmp_path / "note.md").write_text("# Hello\n\nSame content.") + + count1, _, root_folder_id, _ = await index_local_folder( + session=db_session, + search_space_id=db_search_space.id, + user_id=str(db_user.id), + folder_path=str(tmp_path), + folder_name="test-folder", + ) + assert count1 == 1 + + count2, _, _, _ = await index_local_folder( + session=db_session, + search_space_id=db_search_space.id, + user_id=str(db_user.id), + folder_path=str(tmp_path), + folder_name="test-folder", + root_folder_id=root_folder_id, + ) + assert count2 == 0 + + total = ( + await db_session.execute( + select(func.count()) + .select_from(Document) + .where( + Document.document_type == DocumentType.LOCAL_FOLDER_FILE, + Document.search_space_id == db_search_space.id, + ) + ) + ).scalar_one() + assert total == 1 + + @pytest.mark.usefixtures(*UNIFIED_FIXTURES) + async def test_i3_changed_reindexed( + self, + db_session: AsyncSession, + db_user: User, + db_search_space: SearchSpace, + tmp_path: Path, + ): + """I3: Modified file content triggers re-index and creates a version.""" + from app.tasks.connector_indexers.local_folder_indexer import index_local_folder + + f = tmp_path / "note.md" + f.write_text("# Version 1\n\nOriginal.") + + _, _, root_folder_id, _ = await index_local_folder( + session=db_session, + search_space_id=db_search_space.id, + user_id=str(db_user.id), + folder_path=str(tmp_path), + folder_name="test-folder", + ) + + f.write_text("# Version 2\n\nUpdated.") + os.utime(f, (f.stat().st_atime + 10, f.stat().st_mtime + 10)) + + count, _, _, _ = await index_local_folder( + session=db_session, + search_space_id=db_search_space.id, + user_id=str(db_user.id), + 
folder_path=str(tmp_path), + folder_name="test-folder", + root_folder_id=root_folder_id, + ) + assert count == 1 + + versions = ( + ( + await db_session.execute( + select(DocumentVersion) + .join(Document) + .where( + Document.document_type == DocumentType.LOCAL_FOLDER_FILE, + Document.search_space_id == db_search_space.id, + ) + ) + ) + .scalars() + .all() + ) + assert len(versions) >= 1 + + @pytest.mark.usefixtures(*UNIFIED_FIXTURES) + async def test_i4_deleted_removed( + self, + db_session: AsyncSession, + db_user: User, + db_search_space: SearchSpace, + tmp_path: Path, + ): + """I4: Deleted file is removed from DB on re-sync.""" + from app.tasks.connector_indexers.local_folder_indexer import index_local_folder + + f = tmp_path / "to_delete.md" + f.write_text("# Delete me") + + _, _, root_folder_id, _ = await index_local_folder( + session=db_session, + search_space_id=db_search_space.id, + user_id=str(db_user.id), + folder_path=str(tmp_path), + folder_name="test-folder", + ) + + docs_before = ( + await db_session.execute( + select(func.count()) + .select_from(Document) + .where( + Document.document_type == DocumentType.LOCAL_FOLDER_FILE, + Document.search_space_id == db_search_space.id, + ) + ) + ).scalar_one() + assert docs_before == 1 + + f.unlink() + + await index_local_folder( + session=db_session, + search_space_id=db_search_space.id, + user_id=str(db_user.id), + folder_path=str(tmp_path), + folder_name="test-folder", + root_folder_id=root_folder_id, + ) + + docs_after = ( + await db_session.execute( + select(func.count()) + .select_from(Document) + .where( + Document.document_type == DocumentType.LOCAL_FOLDER_FILE, + Document.search_space_id == db_search_space.id, + ) + ) + ).scalar_one() + assert docs_after == 0 + + @pytest.mark.usefixtures(*UNIFIED_FIXTURES) + async def test_i5_single_file_mode( + self, + db_session: AsyncSession, + db_user: User, + db_search_space: SearchSpace, + tmp_path: Path, + ): + """I5: Batch mode with a single file only processes 
that file.""" + from app.tasks.connector_indexers.local_folder_indexer import index_local_folder + + (tmp_path / "a.md").write_text("File A") + (tmp_path / "b.md").write_text("File B") + (tmp_path / "c.md").write_text("File C") + + count, _, _, _ = await index_local_folder( + session=db_session, + search_space_id=db_search_space.id, + user_id=str(db_user.id), + folder_path=str(tmp_path), + folder_name="test-folder", + target_file_paths=[str(tmp_path / "b.md")], + ) + assert count == 1 + + docs = ( + ( + await db_session.execute( + select(Document).where( + Document.document_type == DocumentType.LOCAL_FOLDER_FILE, + Document.search_space_id == db_search_space.id, + ) + ) + ) + .scalars() + .all() + ) + assert len(docs) == 1 + assert docs[0].title == "b.md" + + +# ==================================================================== +# Tier 4: Folder Mirroring (F1-F7) +# ==================================================================== + + +class TestFolderMirroring: + @pytest.mark.usefixtures(*UNIFIED_FIXTURES) + async def test_f1_root_folder_created( + self, + db_session: AsyncSession, + db_user: User, + db_search_space: SearchSpace, + tmp_path: Path, + ): + """F1: First sync creates a root Folder and returns root_folder_id.""" + from app.tasks.connector_indexers.local_folder_indexer import index_local_folder + + (tmp_path / "root.md").write_text("Root file") + + _, _, root_folder_id, _ = await index_local_folder( + session=db_session, + search_space_id=db_search_space.id, + user_id=str(db_user.id), + folder_path=str(tmp_path), + folder_name="test-folder", + ) + + assert root_folder_id is not None + + root_folder = ( + await db_session.execute(select(Folder).where(Folder.id == root_folder_id)) + ).scalar_one() + assert root_folder.name == "test-folder" + + @pytest.mark.usefixtures(*UNIFIED_FIXTURES) + async def test_f2_nested_folder_rows( + self, + db_session: AsyncSession, + db_user: User, + db_search_space: SearchSpace, + tmp_path: Path, + ): + """F2: Nested 
dirs create Folder rows with correct parent_id chain.""" + from app.tasks.connector_indexers.local_folder_indexer import index_local_folder + + daily = tmp_path / "notes" / "daily" + daily.mkdir(parents=True) + weekly = tmp_path / "notes" / "weekly" + weekly.mkdir(parents=True) + (daily / "today.md").write_text("today") + (weekly / "review.md").write_text("review") + + await index_local_folder( + session=db_session, + search_space_id=db_search_space.id, + user_id=str(db_user.id), + folder_path=str(tmp_path), + folder_name="test-folder", + ) + + folders = ( + ( + await db_session.execute( + select(Folder).where(Folder.search_space_id == db_search_space.id) + ) + ) + .scalars() + .all() + ) + + folder_names = {f.name for f in folders} + assert "notes" in folder_names + assert "daily" in folder_names + assert "weekly" in folder_names + + notes_folder = next(f for f in folders if f.name == "notes") + daily_folder = next(f for f in folders if f.name == "daily") + weekly_folder = next(f for f in folders if f.name == "weekly") + + assert daily_folder.parent_id == notes_folder.id + assert weekly_folder.parent_id == notes_folder.id + + @pytest.mark.usefixtures(*UNIFIED_FIXTURES) + async def test_f3_resync_reuses_folders( + self, + db_session: AsyncSession, + db_user: User, + db_search_space: SearchSpace, + tmp_path: Path, + ): + """F3: Re-sync reuses existing Folder rows, no duplicates.""" + from app.tasks.connector_indexers.local_folder_indexer import index_local_folder + + sub = tmp_path / "docs" + sub.mkdir() + (sub / "file.md").write_text("content") + + _, _, root_folder_id, _ = await index_local_folder( + session=db_session, + search_space_id=db_search_space.id, + user_id=str(db_user.id), + folder_path=str(tmp_path), + folder_name="test-folder", + ) + + folders_before = ( + ( + await db_session.execute( + select(Folder).where(Folder.search_space_id == db_search_space.id) + ) + ) + .scalars() + .all() + ) + ids_before = {f.id for f in folders_before} + + await 
index_local_folder( + session=db_session, + search_space_id=db_search_space.id, + user_id=str(db_user.id), + folder_path=str(tmp_path), + folder_name="test-folder", + root_folder_id=root_folder_id, + ) + + folders_after = ( + ( + await db_session.execute( + select(Folder).where(Folder.search_space_id == db_search_space.id) + ) + ) + .scalars() + .all() + ) + ids_after = {f.id for f in folders_after} + + assert ids_before == ids_after + + @pytest.mark.usefixtures(*UNIFIED_FIXTURES) + async def test_f4_folder_id_assigned( + self, + db_session: AsyncSession, + db_user: User, + db_search_space: SearchSpace, + tmp_path: Path, + ): + """F4: Documents get correct folder_id based on their directory.""" + from app.tasks.connector_indexers.local_folder_indexer import index_local_folder + + daily = tmp_path / "notes" / "daily" + daily.mkdir(parents=True) + (daily / "today.md").write_text("today note") + (tmp_path / "root.md").write_text("root note") + + _, _, root_folder_id, _ = await index_local_folder( + session=db_session, + search_space_id=db_search_space.id, + user_id=str(db_user.id), + folder_path=str(tmp_path), + folder_name="test-folder", + ) + + docs = ( + ( + await db_session.execute( + select(Document).where( + Document.document_type == DocumentType.LOCAL_FOLDER_FILE, + Document.search_space_id == db_search_space.id, + ) + ) + ) + .scalars() + .all() + ) + + today_doc = next(d for d in docs if d.title == "today.md") + root_doc = next(d for d in docs if d.title == "root.md") + + daily_folder = ( + await db_session.execute(select(Folder).where(Folder.name == "daily")) + ).scalar_one() + + assert today_doc.folder_id == daily_folder.id + + assert root_doc.folder_id == root_folder_id + + @pytest.mark.usefixtures(*UNIFIED_FIXTURES) + async def test_f5_empty_folder_cleanup( + self, + db_session: AsyncSession, + db_user: User, + db_search_space: SearchSpace, + tmp_path: Path, + ): + """F5: Deleted dir's empty Folder row is cleaned up on re-sync.""" + import shutil + + from 
app.tasks.connector_indexers.local_folder_indexer import index_local_folder + + daily = tmp_path / "notes" / "daily" + daily.mkdir(parents=True) + weekly = tmp_path / "notes" / "weekly" + weekly.mkdir(parents=True) + (daily / "today.md").write_text("today") + (weekly / "review.md").write_text("review") + + _, _, root_folder_id, _ = await index_local_folder( + session=db_session, + search_space_id=db_search_space.id, + user_id=str(db_user.id), + folder_path=str(tmp_path), + folder_name="test-folder", + ) + + weekly_folder = ( + await db_session.execute(select(Folder).where(Folder.name == "weekly")) + ).scalar_one_or_none() + assert weekly_folder is not None + + shutil.rmtree(weekly) + + await index_local_folder( + session=db_session, + search_space_id=db_search_space.id, + user_id=str(db_user.id), + folder_path=str(tmp_path), + folder_name="test-folder", + root_folder_id=root_folder_id, + ) + + weekly_after = ( + await db_session.execute(select(Folder).where(Folder.name == "weekly")) + ).scalar_one_or_none() + assert weekly_after is None + + daily_after = ( + await db_session.execute(select(Folder).where(Folder.name == "daily")) + ).scalar_one_or_none() + assert daily_after is not None + + @pytest.mark.usefixtures(*UNIFIED_FIXTURES) + async def test_f6_single_file_creates_subfolder( + self, + db_session: AsyncSession, + db_user: User, + db_search_space: SearchSpace, + tmp_path: Path, + ): + """F6: Single-file mode creates missing Folder rows and assigns correct folder_id.""" + from app.tasks.connector_indexers.local_folder_indexer import index_local_folder + + (tmp_path / "root.md").write_text("root") + + _, _, root_folder_id, _ = await index_local_folder( + session=db_session, + search_space_id=db_search_space.id, + user_id=str(db_user.id), + folder_path=str(tmp_path), + folder_name="test-folder", + ) + + sub = tmp_path / "notes" / "daily" + sub.mkdir(parents=True) + (sub / "new.md").write_text("new note in subfolder") + + count, _, _, _ = await index_local_folder( 
+ session=db_session, + search_space_id=db_search_space.id, + user_id=str(db_user.id), + folder_path=str(tmp_path), + folder_name="test-folder", + target_file_paths=[str(sub / "new.md")], + root_folder_id=root_folder_id, + ) + assert count == 1 + + doc = ( + await db_session.execute( + select(Document).where( + Document.document_type == DocumentType.LOCAL_FOLDER_FILE, + Document.title == "new.md", + ) + ) + ).scalar_one() + + daily_folder = ( + await db_session.execute(select(Folder).where(Folder.name == "daily")) + ).scalar_one() + + assert doc.folder_id == daily_folder.id + assert daily_folder.parent_id is not None + + notes_folder = ( + await db_session.execute(select(Folder).where(Folder.name == "notes")) + ).scalar_one() + assert daily_folder.parent_id == notes_folder.id + assert notes_folder.parent_id == root_folder_id + + @pytest.mark.usefixtures(*UNIFIED_FIXTURES) + async def test_f7_single_file_delete_cleans_empty_folders( + self, + db_session: AsyncSession, + db_user: User, + db_search_space: SearchSpace, + tmp_path: Path, + ): + """F7: Deleting the only file in a subfolder via batch mode removes empty Folder rows.""" + from app.tasks.connector_indexers.local_folder_indexer import index_local_folder + + sub = tmp_path / "notes" / "ephemeral" + sub.mkdir(parents=True) + (sub / "temp.md").write_text("temporary") + (tmp_path / "keep.md").write_text("keep this") + + _, _, root_folder_id, _ = await index_local_folder( + session=db_session, + search_space_id=db_search_space.id, + user_id=str(db_user.id), + folder_path=str(tmp_path), + folder_name="test-folder", + ) + + eph_folder = ( + await db_session.execute(select(Folder).where(Folder.name == "ephemeral")) + ).scalar_one_or_none() + assert eph_folder is not None + + target = sub / "temp.md" + target.unlink() + + await index_local_folder( + session=db_session, + search_space_id=db_search_space.id, + user_id=str(db_user.id), + folder_path=str(tmp_path), + folder_name="test-folder", + 
target_file_paths=[str(target)], + root_folder_id=root_folder_id, + ) + + eph_after = ( + await db_session.execute(select(Folder).where(Folder.name == "ephemeral")) + ).scalar_one_or_none() + assert eph_after is None + + notes_after = ( + await db_session.execute(select(Folder).where(Folder.name == "notes")) + ).scalar_one_or_none() + assert notes_after is None + + +# ==================================================================== +# Tier 6: Batch Mode (B1-B2) +# ==================================================================== + + +class TestBatchMode: + @pytest.mark.usefixtures(*UNIFIED_FIXTURES) + async def test_b1_batch_indexes_multiple_files( + self, + db_session: AsyncSession, + db_user: User, + db_search_space: SearchSpace, + tmp_path: Path, + patched_batch_sessions, + ): + """B1: Batch with 3 files indexes all of them.""" + from app.tasks.connector_indexers.local_folder_indexer import index_local_folder + + (tmp_path / "a.md").write_text("File A content") + (tmp_path / "b.md").write_text("File B content") + (tmp_path / "c.md").write_text("File C content") + + count, failed, _root_folder_id, err = await index_local_folder( + session=db_session, + search_space_id=db_search_space.id, + user_id=str(db_user.id), + folder_path=str(tmp_path), + folder_name="test-folder", + target_file_paths=[ + str(tmp_path / "a.md"), + str(tmp_path / "b.md"), + str(tmp_path / "c.md"), + ], + ) + + assert count == 3 + assert failed == 0 + assert err is None + + docs = ( + ( + await db_session.execute( + select(Document).where( + Document.document_type == DocumentType.LOCAL_FOLDER_FILE, + Document.search_space_id == db_search_space.id, + ) + ) + ) + .scalars() + .all() + ) + assert len(docs) == 3 + assert {d.title for d in docs} == {"a.md", "b.md", "c.md"} + assert all( + DocumentStatus.is_state(d.status, DocumentStatus.READY) for d in docs + ) + + @pytest.mark.usefixtures(*UNIFIED_FIXTURES) + async def test_b2_partial_failure( + self, + db_session: AsyncSession, + db_user: 
User, + db_search_space: SearchSpace, + tmp_path: Path, + patched_batch_sessions, + ): + """B2: One unreadable file fails gracefully; the other two still get indexed.""" + from app.tasks.connector_indexers.local_folder_indexer import index_local_folder + + (tmp_path / "good1.md").write_text("Good file one") + (tmp_path / "good2.md").write_text("Good file two") + (tmp_path / "bad.md").write_bytes(b"\x00binary garbage") + + count, failed, _, err = await index_local_folder( + session=db_session, + search_space_id=db_search_space.id, + user_id=str(db_user.id), + folder_path=str(tmp_path), + folder_name="test-folder", + target_file_paths=[ + str(tmp_path / "good1.md"), + str(tmp_path / "bad.md"), + str(tmp_path / "good2.md"), + ], + ) + + assert count == 2 + assert failed == 1 + assert err is not None + + docs = ( + ( + await db_session.execute( + select(Document).where( + Document.document_type == DocumentType.LOCAL_FOLDER_FILE, + Document.search_space_id == db_search_space.id, + ) + ) + ) + .scalars() + .all() + ) + assert len(docs) == 2 + assert {d.title for d in docs} == {"good1.md", "good2.md"} + + +# ==================================================================== +# Tier 5: Pipeline Integration (P1) +# ==================================================================== + + +class TestPipelineIntegration: + @pytest.mark.usefixtures(*UNIFIED_FIXTURES) + async def test_p1_local_folder_file_through_pipeline( + self, + db_session: AsyncSession, + db_user: User, + db_search_space: SearchSpace, + mocker, + ): + """P1: LOCAL_FOLDER_FILE ConnectorDocument through prepare+index to READY.""" + from app.indexing_pipeline.connector_document import ConnectorDocument + from app.indexing_pipeline.indexing_pipeline_service import ( + IndexingPipelineService, + ) + + doc = ConnectorDocument( + title="Test Local File", + source_markdown="## Local file\n\nContent from disk.", + unique_id="test-folder:test.md", + document_type=DocumentType.LOCAL_FOLDER_FILE, + 
search_space_id=db_search_space.id, + connector_id=None, + created_by_id=str(db_user.id), + ) + + service = IndexingPipelineService(session=db_session) + prepared = await service.prepare_for_indexing([doc]) + assert len(prepared) == 1 + + db_doc = prepared[0] + result = await service.index(db_doc, doc, llm=mocker.Mock()) + assert result is not None + + docs = ( + ( + await db_session.execute( + select(Document).where( + Document.document_type == DocumentType.LOCAL_FOLDER_FILE, + Document.search_space_id == db_search_space.id, + ) + ) + ) + .scalars() + .all() + ) + assert len(docs) == 1 + assert DocumentStatus.is_state(docs[0].status, DocumentStatus.READY) + + +# ==================================================================== +# Tier 7: Direct Converters (DC1-DC4) +# ==================================================================== + + +class TestDirectConvert: + @pytest.mark.usefixtures(*UNIFIED_FIXTURES) + async def test_dc1_csv_produces_markdown_table( + self, + db_session: AsyncSession, + db_user: User, + db_search_space: SearchSpace, + tmp_path: Path, + ): + """DC1: CSV file is indexed as a markdown table, not raw comma-separated text.""" + from app.tasks.connector_indexers.local_folder_indexer import index_local_folder + + (tmp_path / "data.csv").write_text("name,age,city\nAlice,30,NYC\nBob,25,LA\n") + + count, _skipped, _root_folder_id, err = await index_local_folder( + session=db_session, + search_space_id=db_search_space.id, + user_id=str(db_user.id), + folder_path=str(tmp_path), + folder_name="test-folder", + ) + + assert err is None + assert count == 1 + + doc = ( + await db_session.execute( + select(Document).where( + Document.document_type == DocumentType.LOCAL_FOLDER_FILE, + Document.search_space_id == db_search_space.id, + ) + ) + ).scalar_one() + + assert "| name" in doc.source_markdown + assert "| Alice" in doc.source_markdown + assert "name,age,city" not in doc.source_markdown + + @pytest.mark.usefixtures(*UNIFIED_FIXTURES) + async def 
test_dc2_tsv_produces_markdown_table( + self, + db_session: AsyncSession, + db_user: User, + db_search_space: SearchSpace, + tmp_path: Path, + ): + """DC2: TSV file is indexed as a markdown table.""" + from app.tasks.connector_indexers.local_folder_indexer import index_local_folder + + (tmp_path / "data.tsv").write_text( + "name\tage\tcity\nAlice\t30\tNYC\nBob\t25\tLA\n" + ) + + count, _skipped, _root_folder_id, err = await index_local_folder( + session=db_session, + search_space_id=db_search_space.id, + user_id=str(db_user.id), + folder_path=str(tmp_path), + folder_name="test-folder", + ) + + assert err is None + assert count == 1 + + doc = ( + await db_session.execute( + select(Document).where( + Document.document_type == DocumentType.LOCAL_FOLDER_FILE, + Document.search_space_id == db_search_space.id, + ) + ) + ).scalar_one() + + assert "| name" in doc.source_markdown + assert "| Alice" in doc.source_markdown + + @pytest.mark.usefixtures(*UNIFIED_FIXTURES) + async def test_dc3_html_produces_clean_markdown( + self, + db_session: AsyncSession, + db_user: User, + db_search_space: SearchSpace, + tmp_path: Path, + ): + """DC3: HTML file is indexed as clean markdown, not raw HTML.""" + from app.tasks.connector_indexers.local_folder_indexer import index_local_folder + + (tmp_path / "page.html").write_text("

Title

Hello world

") + + count, _skipped, _root_folder_id, err = await index_local_folder( + session=db_session, + search_space_id=db_search_space.id, + user_id=str(db_user.id), + folder_path=str(tmp_path), + folder_name="test-folder", + ) + + assert err is None + assert count == 1 + + doc = ( + await db_session.execute( + select(Document).where( + Document.document_type == DocumentType.LOCAL_FOLDER_FILE, + Document.search_space_id == db_search_space.id, + ) + ) + ).scalar_one() + + assert "Title" in doc.source_markdown + assert "

" not in doc.source_markdown + + @pytest.mark.usefixtures(*UNIFIED_FIXTURES) + async def test_dc4_csv_single_file_mode( + self, + db_session: AsyncSession, + db_user: User, + db_search_space: SearchSpace, + tmp_path: Path, + ): + """DC4: CSV via single-file batch mode also produces a markdown table.""" + from app.tasks.connector_indexers.local_folder_indexer import index_local_folder + + (tmp_path / "data.csv").write_text("name,age,city\nAlice,30,NYC\nBob,25,LA\n") + + count, _skipped, _root_folder_id, err = await index_local_folder( + session=db_session, + search_space_id=db_search_space.id, + user_id=str(db_user.id), + folder_path=str(tmp_path), + folder_name="test-folder", + target_file_paths=[str(tmp_path / "data.csv")], + ) + + assert err is None + assert count == 1 + + doc = ( + await db_session.execute( + select(Document).where( + Document.document_type == DocumentType.LOCAL_FOLDER_FILE, + Document.search_space_id == db_search_space.id, + ) + ) + ).scalar_one() + + assert "| name" in doc.source_markdown + assert "name,age,city" not in doc.source_markdown + + +# ==================================================================== +# Tier 8: Page Limits (PL1-PL6) +# ==================================================================== + + +class TestPageLimits: + @pytest.mark.usefixtures(*UNIFIED_FIXTURES) + async def test_pl1_full_scan_increments_pages_used( + self, + db_session: AsyncSession, + db_user: User, + db_search_space: SearchSpace, + tmp_path: Path, + ): + """PL1: Successful full-scan sync increments user.pages_used.""" + from app.tasks.connector_indexers.local_folder_indexer import index_local_folder + + db_user.pages_used = 0 + db_user.pages_limit = 500 + await db_session.flush() + + (tmp_path / "note.md").write_text("# Hello World\n\nContent here.") + + count, _skipped, _root_folder_id, err = await index_local_folder( + session=db_session, + search_space_id=db_search_space.id, + user_id=str(db_user.id), + folder_path=str(tmp_path), + 
folder_name="test-folder", + ) + + assert err is None + assert count == 1 + + await db_session.refresh(db_user) + assert db_user.pages_used > 0, "pages_used should increase after indexing" + + @pytest.mark.usefixtures(*UNIFIED_FIXTURES) + async def test_pl2_full_scan_blocked_when_limit_exhausted( + self, + db_session: AsyncSession, + db_user: User, + db_search_space: SearchSpace, + tmp_path: Path, + ): + """PL2: Full-scan skips file when page limit is exhausted.""" + from app.tasks.connector_indexers.local_folder_indexer import index_local_folder + + db_user.pages_used = 100 + db_user.pages_limit = 100 + await db_session.flush() + + (tmp_path / "note.md").write_text("# Hello World\n\nContent here.") + + count, _skipped, _root_folder_id, _err = await index_local_folder( + session=db_session, + search_space_id=db_search_space.id, + user_id=str(db_user.id), + folder_path=str(tmp_path), + folder_name="test-folder", + ) + + assert count == 0 + + await db_session.refresh(db_user) + assert db_user.pages_used == 100, "pages_used should not change on rejection" + + @pytest.mark.usefixtures(*UNIFIED_FIXTURES) + async def test_pl3_single_file_increments_pages_used( + self, + db_session: AsyncSession, + db_user: User, + db_search_space: SearchSpace, + tmp_path: Path, + ): + """PL3: Single-file mode increments user.pages_used on success.""" + from app.tasks.connector_indexers.local_folder_indexer import index_local_folder + + db_user.pages_used = 0 + db_user.pages_limit = 500 + await db_session.flush() + + (tmp_path / "note.md").write_text("# Hello World\n\nContent here.") + + count, _skipped, _root_folder_id, err = await index_local_folder( + session=db_session, + search_space_id=db_search_space.id, + user_id=str(db_user.id), + folder_path=str(tmp_path), + folder_name="test-folder", + target_file_paths=[str(tmp_path / "note.md")], + ) + + assert err is None + assert count == 1 + + await db_session.refresh(db_user) + assert db_user.pages_used > 0, "pages_used should increase 
after indexing" + + @pytest.mark.usefixtures(*UNIFIED_FIXTURES) + async def test_pl4_single_file_blocked_when_limit_exhausted( + self, + db_session: AsyncSession, + db_user: User, + db_search_space: SearchSpace, + tmp_path: Path, + ): + """PL4: Single-file mode skips file when page limit is exhausted.""" + from app.tasks.connector_indexers.local_folder_indexer import index_local_folder + + db_user.pages_used = 100 + db_user.pages_limit = 100 + await db_session.flush() + + (tmp_path / "note.md").write_text("# Hello World\n\nContent here.") + + count, _skipped, _root_folder_id, err = await index_local_folder( + session=db_session, + search_space_id=db_search_space.id, + user_id=str(db_user.id), + folder_path=str(tmp_path), + folder_name="test-folder", + target_file_paths=[str(tmp_path / "note.md")], + ) + + assert count == 0 + assert err is not None + assert "page limit" in err.lower() + + await db_session.refresh(db_user) + assert db_user.pages_used == 100, "pages_used should not change on rejection" + + @pytest.mark.usefixtures(*UNIFIED_FIXTURES) + async def test_pl5_unchanged_resync_no_extra_pages( + self, + db_session: AsyncSession, + db_user: User, + db_search_space: SearchSpace, + tmp_path: Path, + ): + """PL5: Re-syncing an unchanged file does not consume additional pages.""" + from app.tasks.connector_indexers.local_folder_indexer import index_local_folder + + db_user.pages_used = 0 + db_user.pages_limit = 500 + await db_session.flush() + + (tmp_path / "note.md").write_text("# Hello\n\nSame content.") + + count1, _, root_folder_id, _ = await index_local_folder( + session=db_session, + search_space_id=db_search_space.id, + user_id=str(db_user.id), + folder_path=str(tmp_path), + folder_name="test-folder", + ) + assert count1 == 1 + + await db_session.refresh(db_user) + pages_after_first = db_user.pages_used + assert pages_after_first > 0 + + count2, _, _, _ = await index_local_folder( + session=db_session, + search_space_id=db_search_space.id, + 
user_id=str(db_user.id), + folder_path=str(tmp_path), + folder_name="test-folder", + root_folder_id=root_folder_id, + ) + assert count2 == 0 + + await db_session.refresh(db_user) + assert db_user.pages_used == pages_after_first, ( + "pages_used should not increase for unchanged files" + ) + + @pytest.mark.usefixtures(*UNIFIED_FIXTURES) + async def test_pl6_batch_partial_page_limit_exhaustion( + self, + db_session: AsyncSession, + db_user: User, + db_search_space: SearchSpace, + tmp_path: Path, + patched_batch_sessions, + ): + """PL6: Batch mode with a very low page limit: some files succeed, rest fail.""" + from app.tasks.connector_indexers.local_folder_indexer import index_local_folder + + db_user.pages_used = 0 + db_user.pages_limit = 1 + await db_session.flush() + + (tmp_path / "a.md").write_text("File A content") + (tmp_path / "b.md").write_text("File B content") + (tmp_path / "c.md").write_text("File C content") + + count, failed, _root_folder_id, _err = await index_local_folder( + session=db_session, + search_space_id=db_search_space.id, + user_id=str(db_user.id), + folder_path=str(tmp_path), + folder_name="test-folder", + target_file_paths=[ + str(tmp_path / "a.md"), + str(tmp_path / "b.md"), + str(tmp_path / "c.md"), + ], + ) + + assert count >= 1, "at least one file should succeed" + assert failed >= 1, "at least one file should fail due to page limit" + assert count + failed == 3 + + await db_session.refresh(db_user) + assert db_user.pages_used > 0 + assert db_user.pages_used <= db_user.pages_limit + 1 diff --git a/surfsense_backend/tests/integration/test_document_versioning.py b/surfsense_backend/tests/integration/test_document_versioning.py new file mode 100644 index 000000000..9bd03d219 --- /dev/null +++ b/surfsense_backend/tests/integration/test_document_versioning.py @@ -0,0 +1,167 @@ +"""Integration tests for document versioning snapshot + cleanup.""" + +from datetime import UTC, datetime, timedelta + +import pytest +import pytest_asyncio +from 
sqlalchemy import func, select +from sqlalchemy.ext.asyncio import AsyncSession + +from app.db import Document, DocumentType, DocumentVersion, SearchSpace, User + +pytestmark = pytest.mark.integration + + +@pytest_asyncio.fixture +async def db_document( + db_session: AsyncSession, db_user: User, db_search_space: SearchSpace +) -> Document: + doc = Document( + title="Test Doc", + document_type=DocumentType.LOCAL_FOLDER_FILE, + document_metadata={}, + content="Summary of test doc.", + content_hash="abc123", + unique_identifier_hash="local_folder:test-folder:test.md", + source_markdown="# Test\n\nOriginal content.", + search_space_id=db_search_space.id, + created_by_id=db_user.id, + ) + db_session.add(doc) + await db_session.flush() + return doc + + +async def _version_count(session: AsyncSession, document_id: int) -> int: + result = await session.execute( + select(func.count()) + .select_from(DocumentVersion) + .where(DocumentVersion.document_id == document_id) + ) + return result.scalar_one() + + +async def _get_versions( + session: AsyncSession, document_id: int +) -> list[DocumentVersion]: + result = await session.execute( + select(DocumentVersion) + .where(DocumentVersion.document_id == document_id) + .order_by(DocumentVersion.version_number) + ) + return list(result.scalars().all()) + + +class TestCreateVersionSnapshot: + """V1-V5: TDD slices for create_version_snapshot.""" + + async def test_v1_creates_first_version(self, db_session, db_document): + """V1: First snapshot creates version 1 with the document's current state.""" + from app.utils.document_versioning import create_version_snapshot + + await create_version_snapshot(db_session, db_document) + + versions = await _get_versions(db_session, db_document.id) + assert len(versions) == 1 + assert versions[0].version_number == 1 + assert versions[0].source_markdown == "# Test\n\nOriginal content." 
+ assert versions[0].content_hash == "abc123" + assert versions[0].title == "Test Doc" + assert versions[0].document_id == db_document.id + + async def test_v2_creates_version_2_after_30_min( + self, db_session, db_document, monkeypatch + ): + """V2: After 30+ minutes, a new version is created (not overwritten).""" + from app.utils.document_versioning import create_version_snapshot + + t0 = datetime(2025, 1, 1, 12, 0, 0, tzinfo=UTC) + monkeypatch.setattr("app.utils.document_versioning._now", lambda: t0) + await create_version_snapshot(db_session, db_document) + + # Simulate content change and time passing + db_document.source_markdown = "# Test\n\nUpdated content." + db_document.content_hash = "def456" + t1 = t0 + timedelta(minutes=31) + monkeypatch.setattr("app.utils.document_versioning._now", lambda: t1) + await create_version_snapshot(db_session, db_document) + + versions = await _get_versions(db_session, db_document.id) + assert len(versions) == 2 + assert versions[0].version_number == 1 + assert versions[1].version_number == 2 + assert versions[1].source_markdown == "# Test\n\nUpdated content." + + async def test_v3_overwrites_within_30_min( + self, db_session, db_document, monkeypatch + ): + """V3: Within 30 minutes, the latest version is overwritten.""" + from app.utils.document_versioning import create_version_snapshot + + t0 = datetime(2025, 1, 1, 12, 0, 0, tzinfo=UTC) + monkeypatch.setattr("app.utils.document_versioning._now", lambda: t0) + await create_version_snapshot(db_session, db_document) + count_after_first = await _version_count(db_session, db_document.id) + assert count_after_first == 1 + + # Simulate quick edit within 30 minutes + db_document.source_markdown = "# Test\n\nQuick edit." 
+ db_document.content_hash = "quick123" + t1 = t0 + timedelta(minutes=10) + monkeypatch.setattr("app.utils.document_versioning._now", lambda: t1) + await create_version_snapshot(db_session, db_document) + + count_after_second = await _version_count(db_session, db_document.id) + assert count_after_second == 1 # still 1, not 2 + + versions = await _get_versions(db_session, db_document.id) + assert versions[0].source_markdown == "# Test\n\nQuick edit." + assert versions[0].content_hash == "quick123" + + async def test_v4_cleanup_90_day_old_versions( + self, db_session, db_document, monkeypatch + ): + """V4: Versions older than 90 days are cleaned up.""" + from app.utils.document_versioning import create_version_snapshot + + base = datetime(2025, 1, 1, 12, 0, 0, tzinfo=UTC) + + # Create 5 versions spread across time: 3 older than 90 days, 2 recent + for i in range(5): + db_document.source_markdown = f"Content v{i + 1}" + db_document.content_hash = f"hash_{i + 1}" + t = base + timedelta(days=i) if i < 3 else base + timedelta(days=100 + i) + monkeypatch.setattr("app.utils.document_versioning._now", lambda _t=t: _t) + await create_version_snapshot(db_session, db_document) + + # Now trigger cleanup from a "current" time that makes the first 3 versions > 90 days old + now = base + timedelta(days=200) + monkeypatch.setattr("app.utils.document_versioning._now", lambda: now) + db_document.source_markdown = "Content v6" + db_document.content_hash = "hash_6" + await create_version_snapshot(db_session, db_document) + + versions = await _get_versions(db_session, db_document.id) + # The first 3 (old) should be cleaned up; versions 4, 5, 6 remain + for v in versions: + age = now - v.created_at.replace(tzinfo=UTC) + assert age <= timedelta(days=90), f"Version {v.version_number} is too old" + + async def test_v5_cap_at_20_versions(self, db_session, db_document, monkeypatch): + """V5: More than 20 versions triggers cap — oldest gets deleted.""" + from app.utils.document_versioning 
import create_version_snapshot + + base = datetime(2025, 6, 1, 12, 0, 0, tzinfo=UTC) + + # Create 21 versions (all within 90 days, each 31 min apart) + for i in range(21): + db_document.source_markdown = f"Content v{i + 1}" + db_document.content_hash = f"hash_{i + 1}" + t = base + timedelta(minutes=31 * i) + monkeypatch.setattr("app.utils.document_versioning._now", lambda _t=t: _t) + await create_version_snapshot(db_session, db_document) + + versions = await _get_versions(db_session, db_document.id) + assert len(versions) == 20 + # The lowest version_number should be 2 (version 1 was the oldest and got capped) + assert versions[0].version_number == 2 diff --git a/surfsense_backend/tests/unit/connector_indexers/test_content_extraction.py b/surfsense_backend/tests/unit/connector_indexers/test_content_extraction.py new file mode 100644 index 000000000..cd112e09f --- /dev/null +++ b/surfsense_backend/tests/unit/connector_indexers/test_content_extraction.py @@ -0,0 +1,244 @@ +"""Tests that each cloud connector's download_and_extract_content correctly +produces markdown from a real file via the unified ETL pipeline. + +Only the cloud client is mocked (system boundary). The ETL pipeline runs for +real so we know the full path from "cloud gives us bytes" to "we get markdown +back" actually works. +""" + +from unittest.mock import AsyncMock, MagicMock + +import pytest + +pytestmark = pytest.mark.unit + +_TXT_CONTENT = "Hello from the cloud connector test." 
+_CSV_CONTENT = "name,age\nAlice,30\nBob,25\n" + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +async def _write_file(dest_path: str, content: str) -> None: + """Simulate a cloud client writing downloaded bytes to disk.""" + with open(dest_path, "w", encoding="utf-8") as f: + f.write(content) + + +def _make_download_side_effect(content: str): + """Return an async side-effect that writes *content* to the dest path + and returns ``None`` (success).""" + + async def _side_effect(*args): + dest_path = args[-1] + await _write_file(dest_path, content) + return None + + return _side_effect + + +# =================================================================== +# Google Drive +# =================================================================== + + +class TestGoogleDriveContentExtraction: + async def test_txt_file_returns_markdown(self): + from app.connectors.google_drive.content_extractor import ( + download_and_extract_content, + ) + + client = MagicMock() + client.download_file_to_disk = AsyncMock( + side_effect=_make_download_side_effect(_TXT_CONTENT), + ) + + file = {"id": "f1", "name": "notes.txt", "mimeType": "text/plain"} + + markdown, metadata, error = await download_and_extract_content(client, file) + + assert error is None + assert _TXT_CONTENT in markdown + assert metadata["google_drive_file_id"] == "f1" + assert metadata["google_drive_file_name"] == "notes.txt" + + async def test_csv_file_returns_markdown_table(self): + from app.connectors.google_drive.content_extractor import ( + download_and_extract_content, + ) + + client = MagicMock() + client.download_file_to_disk = AsyncMock( + side_effect=_make_download_side_effect(_CSV_CONTENT), + ) + + file = {"id": "f2", "name": "data.csv", "mimeType": "text/csv"} + + markdown, _metadata, error = await download_and_extract_content(client, file) + + assert error is None + assert "Alice" 
in markdown + assert "Bob" in markdown + assert "|" in markdown + + async def test_download_error_returns_error_message(self): + from app.connectors.google_drive.content_extractor import ( + download_and_extract_content, + ) + + client = MagicMock() + client.download_file_to_disk = AsyncMock(return_value="Network timeout") + + file = {"id": "f3", "name": "doc.txt", "mimeType": "text/plain"} + + markdown, _metadata, error = await download_and_extract_content(client, file) + + assert markdown is None + assert error == "Network timeout" + + +# =================================================================== +# OneDrive +# =================================================================== + + +class TestOneDriveContentExtraction: + async def test_txt_file_returns_markdown(self): + from app.connectors.onedrive.content_extractor import ( + download_and_extract_content, + ) + + client = MagicMock() + client.download_file_to_disk = AsyncMock( + side_effect=_make_download_side_effect(_TXT_CONTENT), + ) + + file = { + "id": "od-1", + "name": "report.txt", + "file": {"mimeType": "text/plain"}, + } + + markdown, metadata, error = await download_and_extract_content(client, file) + + assert error is None + assert _TXT_CONTENT in markdown + assert metadata["onedrive_file_id"] == "od-1" + assert metadata["onedrive_file_name"] == "report.txt" + + async def test_csv_file_returns_markdown_table(self): + from app.connectors.onedrive.content_extractor import ( + download_and_extract_content, + ) + + client = MagicMock() + client.download_file_to_disk = AsyncMock( + side_effect=_make_download_side_effect(_CSV_CONTENT), + ) + + file = { + "id": "od-2", + "name": "data.csv", + "file": {"mimeType": "text/csv"}, + } + + markdown, _metadata, error = await download_and_extract_content(client, file) + + assert error is None + assert "Alice" in markdown + assert "|" in markdown + + async def test_download_error_returns_error_message(self): + from app.connectors.onedrive.content_extractor 
import ( + download_and_extract_content, + ) + + client = MagicMock() + client.download_file_to_disk = AsyncMock(return_value="403 Forbidden") + + file = { + "id": "od-3", + "name": "secret.txt", + "file": {"mimeType": "text/plain"}, + } + + markdown, _metadata, error = await download_and_extract_content(client, file) + + assert markdown is None + assert error == "403 Forbidden" + + +# =================================================================== +# Dropbox +# =================================================================== + + +class TestDropboxContentExtraction: + async def test_txt_file_returns_markdown(self): + from app.connectors.dropbox.content_extractor import ( + download_and_extract_content, + ) + + client = MagicMock() + client.download_file_to_disk = AsyncMock( + side_effect=_make_download_side_effect(_TXT_CONTENT), + ) + + file = { + "id": "dbx-1", + "name": "memo.txt", + ".tag": "file", + "path_lower": "/memo.txt", + } + + markdown, metadata, error = await download_and_extract_content(client, file) + + assert error is None + assert _TXT_CONTENT in markdown + assert metadata["dropbox_file_id"] == "dbx-1" + assert metadata["dropbox_file_name"] == "memo.txt" + + async def test_csv_file_returns_markdown_table(self): + from app.connectors.dropbox.content_extractor import ( + download_and_extract_content, + ) + + client = MagicMock() + client.download_file_to_disk = AsyncMock( + side_effect=_make_download_side_effect(_CSV_CONTENT), + ) + + file = { + "id": "dbx-2", + "name": "data.csv", + ".tag": "file", + "path_lower": "/data.csv", + } + + markdown, _metadata, error = await download_and_extract_content(client, file) + + assert error is None + assert "Alice" in markdown + assert "|" in markdown + + async def test_download_error_returns_error_message(self): + from app.connectors.dropbox.content_extractor import ( + download_and_extract_content, + ) + + client = MagicMock() + client.download_file_to_disk = AsyncMock(return_value="Rate limited") + + 
file = { + "id": "dbx-3", + "name": "big.txt", + ".tag": "file", + "path_lower": "/big.txt", + } + + markdown, _metadata, error = await download_and_extract_content(client, file) + + assert markdown is None + assert error == "Rate limited" diff --git a/surfsense_backend/tests/unit/connector_indexers/test_dropbox_parallel.py b/surfsense_backend/tests/unit/connector_indexers/test_dropbox_parallel.py index 76f8806dc..f72135d05 100644 --- a/surfsense_backend/tests/unit/connector_indexers/test_dropbox_parallel.py +++ b/surfsense_backend/tests/unit/connector_indexers/test_dropbox_parallel.py @@ -8,6 +8,10 @@ import pytest from app.db import DocumentType from app.tasks.connector_indexers.dropbox_indexer import ( _download_files_parallel, + _index_full_scan, + _index_selected_files, + _index_with_delta_sync, + index_dropbox_files, ) pytestmark = pytest.mark.unit @@ -234,3 +238,610 @@ async def test_heartbeat_fires_during_parallel_downloads( assert len(docs) == 3 assert failed == 0 assert len(heartbeat_calls) >= 1, "Heartbeat should have fired at least once" + + +# --------------------------------------------------------------------------- +# D1-D2: _index_full_scan tests +# --------------------------------------------------------------------------- + + +def _folder_dict(name: str) -> dict: + return {".tag": "folder", "name": name} + + +@pytest.fixture +def full_scan_mocks(mock_dropbox_client, monkeypatch): + """Wire up mocks for _index_full_scan in isolation.""" + import app.tasks.connector_indexers.dropbox_indexer as _mod + + mock_session = AsyncMock() + mock_task_logger = MagicMock() + mock_task_logger.log_task_progress = AsyncMock() + mock_log_entry = MagicMock() + + skip_results: dict[str, tuple[bool, str | None]] = {} + + monkeypatch.setattr("app.config.config.ETL_SERVICE", "LLAMACLOUD") + + async def _fake_skip(session, file, search_space_id): + from app.connectors.dropbox.file_types import should_skip_file as _skip + + item_skip, unsup_ext = _skip(file) + if 
item_skip: + if unsup_ext: + return True, f"unsupported:{unsup_ext}" + return True, "folder/non-downloadable" + return skip_results.get(file.get("id", ""), (False, None)) + + monkeypatch.setattr(_mod, "_should_skip_file", _fake_skip) + + download_and_index_mock = AsyncMock(return_value=(0, 0)) + monkeypatch.setattr(_mod, "_download_and_index", download_and_index_mock) + + from app.services.page_limit_service import PageLimitService as _RealPLS + + mock_page_limit_instance = MagicMock() + mock_page_limit_instance.get_page_usage = AsyncMock(return_value=(0, 999_999)) + mock_page_limit_instance.update_page_usage = AsyncMock() + + class _MockPageLimitService: + estimate_pages_from_metadata = staticmethod( + _RealPLS.estimate_pages_from_metadata + ) + + def __init__(self, session): + self.get_page_usage = mock_page_limit_instance.get_page_usage + self.update_page_usage = mock_page_limit_instance.update_page_usage + + monkeypatch.setattr(_mod, "PageLimitService", _MockPageLimitService) + + return { + "dropbox_client": mock_dropbox_client, + "session": mock_session, + "task_logger": mock_task_logger, + "log_entry": mock_log_entry, + "skip_results": skip_results, + "download_and_index_mock": download_and_index_mock, + } + + +async def _run_full_scan(mocks, monkeypatch, page_files, *, max_files=500): + import app.tasks.connector_indexers.dropbox_indexer as _mod + + monkeypatch.setattr( + _mod, + "get_files_in_folder", + AsyncMock(return_value=(page_files, None)), + ) + return await _index_full_scan( + mocks["dropbox_client"], + mocks["session"], + _CONNECTOR_ID, + _SEARCH_SPACE_ID, + _USER_ID, + "", + "Root", + mocks["task_logger"], + mocks["log_entry"], + max_files, + enable_summary=True, + ) + + +async def test_full_scan_three_phase_counts(full_scan_mocks, monkeypatch): + """Skipped files excluded, renames counted as indexed, new files downloaded.""" + page_files = [ + _folder_dict("SubFolder"), + _make_file_dict("skip1", "unchanged.txt"), + _make_file_dict("rename1", 
"renamed.txt"), + _make_file_dict("new1", "new1.txt"), + _make_file_dict("new2", "new2.txt"), + ] + + full_scan_mocks["skip_results"]["skip1"] = (True, "unchanged") + full_scan_mocks["skip_results"]["rename1"] = ( + True, + "File renamed: 'old' -> 'renamed.txt'", + ) + + full_scan_mocks["download_and_index_mock"].return_value = (2, 0) + + indexed, skipped, _unsupported = await _run_full_scan( + full_scan_mocks, monkeypatch, page_files + ) + + assert indexed == 3 # 1 renamed + 2 from batch + assert skipped == 2 # 1 folder + 1 unchanged + + call_args = full_scan_mocks["download_and_index_mock"].call_args + call_files = call_args[0][2] + assert len(call_files) == 2 + assert {f["id"] for f in call_files} == {"new1", "new2"} + + +async def test_full_scan_respects_max_files(full_scan_mocks, monkeypatch): + """Only max_files non-folder items are considered.""" + page_files = [_make_file_dict(f"f{i}", f"file{i}.txt") for i in range(10)] + + full_scan_mocks["download_and_index_mock"].return_value = (3, 0) + + await _run_full_scan(full_scan_mocks, monkeypatch, page_files, max_files=3) + + call_files = full_scan_mocks["download_and_index_mock"].call_args[0][2] + assert len(call_files) == 3 + + +# --------------------------------------------------------------------------- +# D3-D5: _index_selected_files tests +# --------------------------------------------------------------------------- + + +@pytest.fixture +def selected_files_mocks(mock_dropbox_client, monkeypatch): + """Wire up mocks for _index_selected_files tests.""" + import app.tasks.connector_indexers.dropbox_indexer as _mod + + mock_session = AsyncMock() + + get_file_results: dict[str, tuple[dict | None, str | None]] = {} + + async def _fake_get_file(client, path): + return get_file_results.get(path, (None, f"Not configured: {path}")) + + monkeypatch.setattr(_mod, "get_file_by_path", _fake_get_file) + + skip_results: dict[str, tuple[bool, str | None]] = {} + + async def _fake_skip(session, file, search_space_id): + 
return skip_results.get(file["id"], (False, None)) + + monkeypatch.setattr(_mod, "_should_skip_file", _fake_skip) + + download_and_index_mock = AsyncMock(return_value=(0, 0)) + monkeypatch.setattr(_mod, "_download_and_index", download_and_index_mock) + + from app.services.page_limit_service import PageLimitService as _RealPLS + + mock_page_limit_instance = MagicMock() + mock_page_limit_instance.get_page_usage = AsyncMock(return_value=(0, 999_999)) + mock_page_limit_instance.update_page_usage = AsyncMock() + + class _MockPageLimitService: + estimate_pages_from_metadata = staticmethod( + _RealPLS.estimate_pages_from_metadata + ) + + def __init__(self, session): + self.get_page_usage = mock_page_limit_instance.get_page_usage + self.update_page_usage = mock_page_limit_instance.update_page_usage + + monkeypatch.setattr(_mod, "PageLimitService", _MockPageLimitService) + + return { + "dropbox_client": mock_dropbox_client, + "session": mock_session, + "get_file_results": get_file_results, + "skip_results": skip_results, + "download_and_index_mock": download_and_index_mock, + } + + +async def _run_selected(mocks, file_tuples): + return await _index_selected_files( + mocks["dropbox_client"], + mocks["session"], + file_tuples, + connector_id=_CONNECTOR_ID, + search_space_id=_SEARCH_SPACE_ID, + user_id=_USER_ID, + enable_summary=True, + ) + + +async def test_selected_files_single_file_indexed(selected_files_mocks): + selected_files_mocks["get_file_results"]["/report.pdf"] = ( + _make_file_dict("f1", "report.pdf"), + None, + ) + selected_files_mocks["download_and_index_mock"].return_value = (1, 0) + + indexed, skipped, _unsupported, errors = await _run_selected( + selected_files_mocks, + [("/report.pdf", "report.pdf")], + ) + + assert indexed == 1 + assert skipped == 0 + assert errors == [] + + +async def test_selected_files_fetch_failure_isolation(selected_files_mocks): + selected_files_mocks["get_file_results"]["/first.txt"] = ( + _make_file_dict("f1", "first.txt"), + None, + 
) + selected_files_mocks["get_file_results"]["/mid.txt"] = (None, "HTTP 404") + selected_files_mocks["get_file_results"]["/third.txt"] = ( + _make_file_dict("f3", "third.txt"), + None, + ) + selected_files_mocks["download_and_index_mock"].return_value = (2, 0) + + indexed, skipped, _unsupported, errors = await _run_selected( + selected_files_mocks, + [ + ("/first.txt", "first.txt"), + ("/mid.txt", "mid.txt"), + ("/third.txt", "third.txt"), + ], + ) + + assert indexed == 2 + assert skipped == 0 + assert len(errors) == 1 + assert "mid.txt" in errors[0] + + +async def test_selected_files_skip_rename_counting(selected_files_mocks): + for path, fid, fname in [ + ("/unchanged.txt", "s1", "unchanged.txt"), + ("/renamed.txt", "r1", "renamed.txt"), + ("/new1.txt", "n1", "new1.txt"), + ("/new2.txt", "n2", "new2.txt"), + ]: + selected_files_mocks["get_file_results"][path] = ( + _make_file_dict(fid, fname), + None, + ) + + selected_files_mocks["skip_results"]["s1"] = (True, "unchanged") + selected_files_mocks["skip_results"]["r1"] = ( + True, + "File renamed: 'old' -> 'renamed.txt'", + ) + selected_files_mocks["download_and_index_mock"].return_value = (2, 0) + + indexed, skipped, _unsupported, errors = await _run_selected( + selected_files_mocks, + [ + ("/unchanged.txt", "unchanged.txt"), + ("/renamed.txt", "renamed.txt"), + ("/new1.txt", "new1.txt"), + ("/new2.txt", "new2.txt"), + ], + ) + + assert indexed == 3 # 1 renamed + 2 batch + assert skipped == 1 + assert errors == [] + + mock = selected_files_mocks["download_and_index_mock"] + call_files = mock.call_args[0][2] + assert len(call_files) == 2 + assert {f["id"] for f in call_files} == {"n1", "n2"} + + +# --------------------------------------------------------------------------- +# E1-E4: _index_with_delta_sync tests +# --------------------------------------------------------------------------- + + +async def test_delta_sync_deletions_call_remove_document(monkeypatch): + """E1: deleted entries are processed via 
_remove_document.""" + import app.tasks.connector_indexers.dropbox_indexer as _mod + + entries = [ + { + ".tag": "deleted", + "name": "gone.txt", + "path_lower": "/gone.txt", + "id": "id:del1", + }, + { + ".tag": "deleted", + "name": "also_gone.pdf", + "path_lower": "/also_gone.pdf", + "id": "id:del2", + }, + ] + + mock_client = MagicMock() + mock_client.get_changes = AsyncMock(return_value=(entries, "new-cursor", None)) + + remove_calls: list[str] = [] + + async def _fake_remove(session, file_id, search_space_id): + remove_calls.append(file_id) + + monkeypatch.setattr(_mod, "_remove_document", _fake_remove) + monkeypatch.setattr(_mod, "_download_and_index", AsyncMock(return_value=(0, 0))) + + mock_task_logger = MagicMock() + mock_task_logger.log_task_progress = AsyncMock() + + _indexed, _skipped, _unsupported, cursor = await _index_with_delta_sync( + mock_client, + AsyncMock(), + _CONNECTOR_ID, + _SEARCH_SPACE_ID, + _USER_ID, + "old-cursor", + mock_task_logger, + MagicMock(), + max_files=500, + enable_summary=True, + ) + + assert sorted(remove_calls) == ["id:del1", "id:del2"] + assert cursor == "new-cursor" + + +async def test_delta_sync_upserts_filtered_and_downloaded(monkeypatch): + """E2: modified/new file entries go through skip filter then download+index.""" + import app.tasks.connector_indexers.dropbox_indexer as _mod + + entries = [ + _make_file_dict("mod1", "modified1.txt"), + _make_file_dict("mod2", "modified2.txt"), + ] + + mock_client = MagicMock() + mock_client.get_changes = AsyncMock(return_value=(entries, "cursor-v2", None)) + + monkeypatch.setattr( + _mod, "_should_skip_file", AsyncMock(return_value=(False, None)) + ) + + download_mock = AsyncMock(return_value=(2, 0)) + monkeypatch.setattr(_mod, "_download_and_index", download_mock) + + mock_task_logger = MagicMock() + mock_task_logger.log_task_progress = AsyncMock() + + indexed, skipped, _unsupported, cursor = await _index_with_delta_sync( + mock_client, + AsyncMock(), + _CONNECTOR_ID, + 
_SEARCH_SPACE_ID, + _USER_ID, + "cursor-v1", + mock_task_logger, + MagicMock(), + max_files=500, + enable_summary=True, + ) + + assert indexed == 2 + assert skipped == 0 + assert cursor == "cursor-v2" + + downloaded_files = download_mock.call_args[0][2] + assert len(downloaded_files) == 2 + assert {f["id"] for f in downloaded_files} == {"mod1", "mod2"} + + +async def test_delta_sync_mix_deletions_and_upserts(monkeypatch): + """E3: deletions processed, then remaining upserts filtered and indexed.""" + import app.tasks.connector_indexers.dropbox_indexer as _mod + + entries = [ + { + ".tag": "deleted", + "name": "removed.txt", + "path_lower": "/removed.txt", + "id": "id:del1", + }, + { + ".tag": "deleted", + "name": "trashed.pdf", + "path_lower": "/trashed.pdf", + "id": "id:del2", + }, + _make_file_dict("mod1", "updated.txt"), + _make_file_dict("new1", "brandnew.docx"), + ] + + mock_client = MagicMock() + mock_client.get_changes = AsyncMock(return_value=(entries, "final-cursor", None)) + + remove_calls: list[str] = [] + + async def _fake_remove(session, file_id, search_space_id): + remove_calls.append(file_id) + + monkeypatch.setattr(_mod, "_remove_document", _fake_remove) + monkeypatch.setattr( + _mod, "_should_skip_file", AsyncMock(return_value=(False, None)) + ) + + download_mock = AsyncMock(return_value=(2, 0)) + monkeypatch.setattr(_mod, "_download_and_index", download_mock) + + mock_task_logger = MagicMock() + mock_task_logger.log_task_progress = AsyncMock() + + indexed, skipped, _unsupported, cursor = await _index_with_delta_sync( + mock_client, + AsyncMock(), + _CONNECTOR_ID, + _SEARCH_SPACE_ID, + _USER_ID, + "old-cursor", + mock_task_logger, + MagicMock(), + max_files=500, + enable_summary=True, + ) + + assert sorted(remove_calls) == ["id:del1", "id:del2"] + assert indexed == 2 + assert skipped == 0 + assert cursor == "final-cursor" + + downloaded_files = download_mock.call_args[0][2] + assert {f["id"] for f in downloaded_files} == {"mod1", "new1"} + + +async 
def test_delta_sync_returns_new_cursor(monkeypatch): + """E4: the new cursor from the API response is returned.""" + import app.tasks.connector_indexers.dropbox_indexer as _mod + + mock_client = MagicMock() + mock_client.get_changes = AsyncMock(return_value=([], "brand-new-cursor-xyz", None)) + + monkeypatch.setattr(_mod, "_download_and_index", AsyncMock(return_value=(0, 0))) + + mock_task_logger = MagicMock() + mock_task_logger.log_task_progress = AsyncMock() + + indexed, skipped, _unsupported, cursor = await _index_with_delta_sync( + mock_client, + AsyncMock(), + _CONNECTOR_ID, + _SEARCH_SPACE_ID, + _USER_ID, + "old-cursor", + mock_task_logger, + MagicMock(), + max_files=500, + enable_summary=True, + ) + + assert cursor == "brand-new-cursor-xyz" + assert indexed == 0 + assert skipped == 0 + + +# --------------------------------------------------------------------------- +# F1-F3: index_dropbox_files orchestrator tests +# --------------------------------------------------------------------------- + + +@pytest.fixture +def orchestrator_mocks(monkeypatch): + """Wire up mocks for index_dropbox_files orchestrator tests.""" + import app.tasks.connector_indexers.dropbox_indexer as _mod + + mock_connector = MagicMock() + mock_connector.config = {"_token_encrypted": False} + mock_connector.last_indexed_at = None + mock_connector.enable_summary = True + + monkeypatch.setattr( + _mod, + "get_connector_by_id", + AsyncMock(return_value=mock_connector), + ) + + mock_task_logger = MagicMock() + mock_task_logger.log_task_start = AsyncMock(return_value=MagicMock()) + mock_task_logger.log_task_progress = AsyncMock() + mock_task_logger.log_task_success = AsyncMock() + mock_task_logger.log_task_failure = AsyncMock() + monkeypatch.setattr( + _mod, "TaskLoggingService", MagicMock(return_value=mock_task_logger) + ) + + monkeypatch.setattr(_mod, "update_connector_last_indexed", AsyncMock()) + + full_scan_mock = AsyncMock(return_value=(5, 2, 0)) + monkeypatch.setattr(_mod, 
"_index_full_scan", full_scan_mock) + + delta_sync_mock = AsyncMock(return_value=(3, 1, 0, "delta-cursor-new")) + monkeypatch.setattr(_mod, "_index_with_delta_sync", delta_sync_mock) + + mock_client = MagicMock() + mock_client.get_latest_cursor = AsyncMock(return_value=("latest-cursor-abc", None)) + monkeypatch.setattr(_mod, "DropboxClient", MagicMock(return_value=mock_client)) + + return { + "connector": mock_connector, + "full_scan_mock": full_scan_mock, + "delta_sync_mock": delta_sync_mock, + "mock_client": mock_client, + } + + +async def test_orchestrator_uses_delta_sync_when_cursor_and_last_indexed( + orchestrator_mocks, +): + """F1: with cursor + last_indexed_at + use_delta_sync, calls delta sync.""" + from datetime import UTC, datetime + + connector = orchestrator_mocks["connector"] + connector.config = { + "_token_encrypted": False, + "folder_cursors": {"/docs": "saved-cursor-123"}, + } + connector.last_indexed_at = datetime(2026, 1, 1, tzinfo=UTC) + + mock_session = AsyncMock() + mock_session.commit = AsyncMock() + + _indexed, _skipped, error, _unsupported = await index_dropbox_files( + mock_session, + _CONNECTOR_ID, + _SEARCH_SPACE_ID, + _USER_ID, + { + "folders": [{"path": "/docs", "name": "Docs"}], + "files": [], + "indexing_options": {"use_delta_sync": True}, + }, + ) + + assert error is None + orchestrator_mocks["delta_sync_mock"].assert_called_once() + orchestrator_mocks["full_scan_mock"].assert_not_called() + + +async def test_orchestrator_falls_back_to_full_scan_without_cursor( + orchestrator_mocks, +): + """F2: without cursor, falls back to full scan.""" + connector = orchestrator_mocks["connector"] + connector.config = {"_token_encrypted": False} + connector.last_indexed_at = None + + mock_session = AsyncMock() + mock_session.commit = AsyncMock() + + _indexed, _skipped, error, _unsupported = await index_dropbox_files( + mock_session, + _CONNECTOR_ID, + _SEARCH_SPACE_ID, + _USER_ID, + { + "folders": [{"path": "/docs", "name": "Docs"}], + "files": 
[], + "indexing_options": {"use_delta_sync": True}, + }, + ) + + assert error is None + orchestrator_mocks["full_scan_mock"].assert_called_once() + orchestrator_mocks["delta_sync_mock"].assert_not_called() + + +async def test_orchestrator_persists_cursor_after_sync(orchestrator_mocks): + """F3: after sync, persists new cursor to connector config.""" + connector = orchestrator_mocks["connector"] + connector.config = {"_token_encrypted": False} + connector.last_indexed_at = None + + mock_session = AsyncMock() + mock_session.commit = AsyncMock() + + await index_dropbox_files( + mock_session, + _CONNECTOR_ID, + _SEARCH_SPACE_ID, + _USER_ID, + { + "folders": [{"path": "/docs", "name": "Docs"}], + "files": [], + }, + ) + + assert "folder_cursors" in connector.config + assert connector.config["folder_cursors"]["/docs"] == "latest-cursor-abc" diff --git a/surfsense_backend/tests/unit/connector_indexers/test_google_drive_parallel.py b/surfsense_backend/tests/unit/connector_indexers/test_google_drive_parallel.py index 3fe8a183d..0ae096361 100644 --- a/surfsense_backend/tests/unit/connector_indexers/test_google_drive_parallel.py +++ b/surfsense_backend/tests/unit/connector_indexers/test_google_drive_parallel.py @@ -248,12 +248,33 @@ def _folder_dict(file_id: str, name: str) -> dict: } +def _make_page_limit_session(pages_used=0, pages_limit=999_999): + """Build a mock DB session that real PageLimitService can operate against.""" + + class _FakeUser: + def __init__(self, pu, pl): + self.pages_used = pu + self.pages_limit = pl + + fake_user = _FakeUser(pages_used, pages_limit) + session = AsyncMock() + + def _make_result(*_a, **_kw): + r = MagicMock() + r.first.return_value = (fake_user.pages_used, fake_user.pages_limit) + r.unique.return_value.scalar_one_or_none.return_value = fake_user + return r + + session.execute = AsyncMock(side_effect=_make_result) + return session, fake_user + + @pytest.fixture def full_scan_mocks(mock_drive_client, monkeypatch): """Wire up all mocks 
needed to call _index_full_scan in isolation.""" import app.tasks.connector_indexers.google_drive_indexer as _mod - mock_session = AsyncMock() + mock_session, _ = _make_page_limit_session() mock_connector = MagicMock() mock_task_logger = MagicMock() mock_task_logger.log_task_progress = AsyncMock() @@ -345,7 +366,7 @@ async def test_full_scan_three_phase_counts(full_scan_mocks, monkeypatch): full_scan_mocks["download_mock"].return_value = (mock_docs, 0) full_scan_mocks["batch_mock"].return_value = ([], 2, 0) - indexed, skipped = await _run_full_scan(full_scan_mocks) + indexed, skipped, _unsupported = await _run_full_scan(full_scan_mocks) assert indexed == 3 # 1 renamed + 2 from batch assert skipped == 1 # 1 unchanged @@ -472,11 +493,11 @@ async def test_delta_sync_removals_serial_rest_parallel(monkeypatch): AsyncMock(return_value=MagicMock()), ) - mock_session = AsyncMock() + mock_session, _ = _make_page_limit_session() mock_task_logger = MagicMock() mock_task_logger.log_task_progress = AsyncMock() - indexed, skipped = await _index_with_delta_sync( + indexed, skipped, _unsupported = await _index_with_delta_sync( MagicMock(), mock_session, MagicMock(), @@ -512,7 +533,7 @@ def selected_files_mocks(mock_drive_client, monkeypatch): """Wire up mocks for _index_selected_files tests.""" import app.tasks.connector_indexers.google_drive_indexer as _mod - mock_session = AsyncMock() + mock_session, _ = _make_page_limit_session() get_file_results: dict[str, tuple[dict | None, str | None]] = {} @@ -568,7 +589,7 @@ async def test_selected_files_single_file_indexed(selected_files_mocks): ) selected_files_mocks["download_and_index_mock"].return_value = (1, 0) - indexed, skipped, errors = await _run_selected( + indexed, skipped, _unsup, errors = await _run_selected( selected_files_mocks, [("f1", "report.pdf")], ) @@ -592,7 +613,7 @@ async def test_selected_files_fetch_failure_isolation(selected_files_mocks): ) selected_files_mocks["download_and_index_mock"].return_value = (2, 0) - 
indexed, skipped, errors = await _run_selected( + indexed, skipped, _unsup, errors = await _run_selected( selected_files_mocks, [("f1", "first.txt"), ("f2", "mid.txt"), ("f3", "third.txt")], ) @@ -626,7 +647,7 @@ async def test_selected_files_skip_rename_counting(selected_files_mocks): selected_files_mocks["download_and_index_mock"].return_value = (2, 0) - indexed, skipped, errors = await _run_selected( + indexed, skipped, _unsup, errors = await _run_selected( selected_files_mocks, [ ("s1", "unchanged.txt"), diff --git a/surfsense_backend/tests/unit/connector_indexers/test_local_folder_scan.py b/surfsense_backend/tests/unit/connector_indexers/test_local_folder_scan.py new file mode 100644 index 000000000..c6e7b160c --- /dev/null +++ b/surfsense_backend/tests/unit/connector_indexers/test_local_folder_scan.py @@ -0,0 +1,78 @@ +"""Unit tests for scan_folder() pure logic — Tier 2 TDD slices (S1-S4).""" + +from pathlib import Path + +import pytest + +pytestmark = pytest.mark.unit + + +class TestScanFolder: + """S1-S4: scan_folder() with real tmp_path filesystem.""" + + def test_s1_single_md_file(self, tmp_path: Path): + """S1: scan_folder on a dir with one .md file returns correct entry.""" + from app.tasks.connector_indexers.local_folder_indexer import scan_folder + + md = tmp_path / "note.md" + md.write_text("# Hello") + + results = scan_folder(str(tmp_path)) + + assert len(results) == 1 + entry = results[0] + assert entry["relative_path"] == "note.md" + assert entry["size"] > 0 + assert "modified_at" in entry + assert entry["path"] == str(md) + + def test_s2_extension_filter(self, tmp_path: Path): + """S2: file_extensions filter returns only matching files.""" + from app.tasks.connector_indexers.local_folder_indexer import scan_folder + + (tmp_path / "a.md").write_text("md") + (tmp_path / "b.txt").write_text("txt") + (tmp_path / "c.pdf").write_bytes(b"%PDF") + + results = scan_folder(str(tmp_path), file_extensions=[".md"]) + names = {r["relative_path"] for r in 
results} + + assert names == {"a.md"} + + def test_s3_exclude_patterns(self, tmp_path: Path): + """S3: exclude_patterns skips files inside excluded directories.""" + from app.tasks.connector_indexers.local_folder_indexer import scan_folder + + (tmp_path / "good.md").write_text("good") + nm = tmp_path / "node_modules" + nm.mkdir() + (nm / "dep.js").write_text("module") + git = tmp_path / ".git" + git.mkdir() + (git / "config").write_text("gitconfig") + + results = scan_folder(str(tmp_path), exclude_patterns=["node_modules", ".git"]) + names = {r["relative_path"] for r in results} + + assert "good.md" in names + assert not any("node_modules" in n for n in names) + assert not any(".git" in n for n in names) + + def test_s4_nested_dirs(self, tmp_path: Path): + """S4: nested subdirectories produce correct relative paths.""" + from app.tasks.connector_indexers.local_folder_indexer import scan_folder + + daily = tmp_path / "notes" / "daily" + daily.mkdir(parents=True) + weekly = tmp_path / "notes" / "weekly" + weekly.mkdir(parents=True) + (daily / "today.md").write_text("today") + (weekly / "review.md").write_text("review") + (tmp_path / "root.txt").write_text("root") + + results = scan_folder(str(tmp_path)) + paths = {r["relative_path"] for r in results} + + assert "notes/daily/today.md" in paths or "notes\\daily\\today.md" in paths + assert "notes/weekly/review.md" in paths or "notes\\weekly\\review.md" in paths + assert "root.txt" in paths diff --git a/surfsense_backend/tests/unit/connector_indexers/test_page_limits.py b/surfsense_backend/tests/unit/connector_indexers/test_page_limits.py new file mode 100644 index 000000000..573ee43d8 --- /dev/null +++ b/surfsense_backend/tests/unit/connector_indexers/test_page_limits.py @@ -0,0 +1,684 @@ +"""Tests for page limit enforcement in connector indexers. 
+ +Covers: + A) PageLimitService.estimate_pages_from_metadata — pure function (no mocks) + B) Page-limit quota gating in _index_selected_files tested through the + real PageLimitService with a mock DB session (system boundary). + Google Drive is the primary, with OneDrive/Dropbox smoke tests. +""" + +from unittest.mock import AsyncMock, MagicMock + +import pytest + +from app.services.page_limit_service import PageLimitService + +pytestmark = pytest.mark.unit + +_USER_ID = "00000000-0000-0000-0000-000000000001" +_CONNECTOR_ID = 42 +_SEARCH_SPACE_ID = 1 + + +# =================================================================== +# A) PageLimitService.estimate_pages_from_metadata — pure function +# No mocks: it's a staticmethod with no I/O. +# =================================================================== + + +class TestEstimatePagesFromMetadata: + """Vertical slices for the page estimation staticmethod.""" + + def test_pdf_100kb_returns_1(self): + assert PageLimitService.estimate_pages_from_metadata(".pdf", 100 * 1024) == 1 + + def test_pdf_500kb_returns_5(self): + assert PageLimitService.estimate_pages_from_metadata(".pdf", 500 * 1024) == 5 + + def test_pdf_1mb(self): + assert PageLimitService.estimate_pages_from_metadata(".pdf", 1024 * 1024) == 10 + + def test_docx_50kb_returns_1(self): + assert PageLimitService.estimate_pages_from_metadata(".docx", 50 * 1024) == 1 + + def test_docx_200kb(self): + assert PageLimitService.estimate_pages_from_metadata(".docx", 200 * 1024) == 4 + + def test_pptx_uses_200kb_per_page(self): + assert PageLimitService.estimate_pages_from_metadata(".pptx", 600 * 1024) == 3 + + def test_xlsx_uses_100kb_per_page(self): + assert PageLimitService.estimate_pages_from_metadata(".xlsx", 300 * 1024) == 3 + + def test_txt_uses_3000_bytes_per_page(self): + assert PageLimitService.estimate_pages_from_metadata(".txt", 9000) == 3 + + def test_image_always_returns_1(self): + for ext in (".jpg", ".png", ".gif", ".webp"): + assert 
PageLimitService.estimate_pages_from_metadata(ext, 5_000_000) == 1 + + def test_audio_uses_1mb_per_page(self): + assert ( + PageLimitService.estimate_pages_from_metadata(".mp3", 3 * 1024 * 1024) == 3 + ) + + def test_video_uses_5mb_per_page(self): + assert ( + PageLimitService.estimate_pages_from_metadata(".mp4", 15 * 1024 * 1024) == 3 + ) + + def test_unknown_ext_uses_80kb_per_page(self): + assert PageLimitService.estimate_pages_from_metadata(".xyz", 160 * 1024) == 2 + + def test_zero_size_returns_1(self): + assert PageLimitService.estimate_pages_from_metadata(".pdf", 0) == 1 + + def test_negative_size_returns_1(self): + assert PageLimitService.estimate_pages_from_metadata(".pdf", -500) == 1 + + def test_minimum_is_always_1(self): + assert PageLimitService.estimate_pages_from_metadata(".pdf", 50) == 1 + + def test_epub_uses_50kb_per_page(self): + assert PageLimitService.estimate_pages_from_metadata(".epub", 250 * 1024) == 5 + + +# =================================================================== +# B) Page-limit enforcement in connector indexers +# System boundary mocked: DB session (for PageLimitService) +# System boundary mocked: external API clients, download/ETL +# NOT mocked: PageLimitService itself (our own code) +# =================================================================== + + +class _FakeUser: + """Stands in for the User ORM model at the DB boundary.""" + + def __init__(self, pages_used: int = 0, pages_limit: int = 100): + self.pages_used = pages_used + self.pages_limit = pages_limit + + +def _make_page_limit_session(pages_used: int = 0, pages_limit: int = 100): + """Build a mock DB session that real PageLimitService can operate against. + + Every ``session.execute()`` returns a result compatible with both + ``get_page_usage`` (.first() → tuple) and ``update_page_usage`` + (.unique().scalar_one_or_none() → User-like). 
+ """ + fake_user = _FakeUser(pages_used, pages_limit) + session = AsyncMock() + + def _make_result(*_args, **_kwargs): + result = MagicMock() + result.first.return_value = (fake_user.pages_used, fake_user.pages_limit) + result.unique.return_value.scalar_one_or_none.return_value = fake_user + return result + + session.execute = AsyncMock(side_effect=_make_result) + return session, fake_user + + +def _make_gdrive_file(file_id: str, name: str, size: int = 80 * 1024) -> dict: + return { + "id": file_id, + "name": name, + "mimeType": "application/octet-stream", + "size": str(size), + } + + +# --------------------------------------------------------------------------- +# Google Drive: _index_selected_files +# --------------------------------------------------------------------------- + + +@pytest.fixture +def gdrive_selected_mocks(monkeypatch): + """Mocks for Google Drive _index_selected_files — only system boundaries.""" + import app.tasks.connector_indexers.google_drive_indexer as _mod + + session, fake_user = _make_page_limit_session(0, 100) + + get_file_results: dict[str, tuple[dict | None, str | None]] = {} + + async def _fake_get_file(client, file_id): + return get_file_results.get(file_id, (None, f"Not configured: {file_id}")) + + monkeypatch.setattr(_mod, "get_file_by_id", _fake_get_file) + monkeypatch.setattr( + _mod, "_should_skip_file", AsyncMock(return_value=(False, None)) + ) + + download_and_index_mock = AsyncMock(return_value=(0, 0)) + monkeypatch.setattr(_mod, "_download_and_index", download_and_index_mock) + + pipeline_mock = MagicMock() + pipeline_mock.create_placeholder_documents = AsyncMock(return_value=0) + monkeypatch.setattr( + _mod, "IndexingPipelineService", MagicMock(return_value=pipeline_mock) + ) + + return { + "mod": _mod, + "session": session, + "fake_user": fake_user, + "get_file_results": get_file_results, + "download_and_index_mock": download_and_index_mock, + } + + +async def _run_gdrive_selected(mocks, file_ids): + from 
app.tasks.connector_indexers.google_drive_indexer import ( + _index_selected_files, + ) + + return await _index_selected_files( + MagicMock(), + mocks["session"], + file_ids, + connector_id=_CONNECTOR_ID, + search_space_id=_SEARCH_SPACE_ID, + user_id=_USER_ID, + enable_summary=True, + ) + + +async def test_gdrive_files_within_quota_are_downloaded(gdrive_selected_mocks): + """Files whose cumulative estimated pages fit within remaining quota + are sent to _download_and_index.""" + m = gdrive_selected_mocks + m["fake_user"].pages_used = 0 + m["fake_user"].pages_limit = 100 + + for fid in ("f1", "f2", "f3"): + m["get_file_results"][fid] = ( + _make_gdrive_file(fid, f"{fid}.xyz", size=80 * 1024), + None, + ) + m["download_and_index_mock"].return_value = (3, 0) + + indexed, _skipped, _unsup, errors = await _run_gdrive_selected( + m, [("f1", "f1.xyz"), ("f2", "f2.xyz"), ("f3", "f3.xyz")] + ) + + assert indexed == 3 + assert errors == [] + call_files = m["download_and_index_mock"].call_args[0][2] + assert len(call_files) == 3 + + +async def test_gdrive_files_exceeding_quota_rejected(gdrive_selected_mocks): + """Files whose pages would exceed remaining quota are rejected.""" + m = gdrive_selected_mocks + m["fake_user"].pages_used = 98 + m["fake_user"].pages_limit = 100 + + m["get_file_results"]["big"] = ( + _make_gdrive_file("big", "huge.pdf", size=500 * 1024), + None, + ) + + indexed, _skipped, _unsup, errors = await _run_gdrive_selected( + m, [("big", "huge.pdf")] + ) + + assert indexed == 0 + assert len(errors) == 1 + assert "page limit" in errors[0].lower() + + +async def test_gdrive_quota_mix_partial_indexing(gdrive_selected_mocks): + """3rd file pushes over quota → only first two indexed.""" + m = gdrive_selected_mocks + m["fake_user"].pages_used = 0 + m["fake_user"].pages_limit = 2 + + for fid in ("f1", "f2", "f3"): + m["get_file_results"][fid] = ( + _make_gdrive_file(fid, f"{fid}.xyz", size=80 * 1024), + None, + ) + m["download_and_index_mock"].return_value = (2, 0) 
+ + indexed, _skipped, _unsup, errors = await _run_gdrive_selected( + m, [("f1", "f1.xyz"), ("f2", "f2.xyz"), ("f3", "f3.xyz")] + ) + + assert indexed == 2 + assert len(errors) == 1 + call_files = m["download_and_index_mock"].call_args[0][2] + assert {f["id"] for f in call_files} == {"f1", "f2"} + + +async def test_gdrive_proportional_page_deduction(gdrive_selected_mocks): + """Pages deducted are proportional to successfully indexed files.""" + m = gdrive_selected_mocks + m["fake_user"].pages_used = 0 + m["fake_user"].pages_limit = 100 + + for fid in ("f1", "f2", "f3", "f4"): + m["get_file_results"][fid] = ( + _make_gdrive_file(fid, f"{fid}.xyz", size=80 * 1024), + None, + ) + m["download_and_index_mock"].return_value = (2, 2) + + await _run_gdrive_selected( + m, + [("f1", "f1.xyz"), ("f2", "f2.xyz"), ("f3", "f3.xyz"), ("f4", "f4.xyz")], + ) + + assert m["fake_user"].pages_used == 2 + + +async def test_gdrive_no_deduction_when_nothing_indexed(gdrive_selected_mocks): + """If batch_indexed == 0, user's pages_used stays unchanged.""" + m = gdrive_selected_mocks + m["fake_user"].pages_used = 5 + m["fake_user"].pages_limit = 100 + + m["get_file_results"]["f1"] = ( + _make_gdrive_file("f1", "f1.xyz", size=80 * 1024), + None, + ) + m["download_and_index_mock"].return_value = (0, 1) + + await _run_gdrive_selected(m, [("f1", "f1.xyz")]) + + assert m["fake_user"].pages_used == 5 + + +async def test_gdrive_zero_quota_rejects_all(gdrive_selected_mocks): + """When pages_used == pages_limit, every file is rejected.""" + m = gdrive_selected_mocks + m["fake_user"].pages_used = 100 + m["fake_user"].pages_limit = 100 + + for fid in ("f1", "f2"): + m["get_file_results"][fid] = ( + _make_gdrive_file(fid, f"{fid}.xyz", size=80 * 1024), + None, + ) + + indexed, _skipped, _unsup, errors = await _run_gdrive_selected( + m, [("f1", "f1.xyz"), ("f2", "f2.xyz")] + ) + + assert indexed == 0 + assert len(errors) == 2 + + +# 
--------------------------------------------------------------------------- +# Google Drive: _index_full_scan +# --------------------------------------------------------------------------- + + +@pytest.fixture +def gdrive_full_scan_mocks(monkeypatch): + import app.tasks.connector_indexers.google_drive_indexer as _mod + + session, fake_user = _make_page_limit_session(0, 100) + mock_task_logger = MagicMock() + mock_task_logger.log_task_progress = AsyncMock() + + monkeypatch.setattr( + _mod, "_should_skip_file", AsyncMock(return_value=(False, None)) + ) + + download_mock = AsyncMock(return_value=([], 0)) + monkeypatch.setattr(_mod, "_download_files_parallel", download_mock) + + batch_mock = AsyncMock(return_value=([], 0, 0)) + pipeline_mock = MagicMock() + pipeline_mock.index_batch_parallel = batch_mock + pipeline_mock.create_placeholder_documents = AsyncMock(return_value=0) + monkeypatch.setattr( + _mod, "IndexingPipelineService", MagicMock(return_value=pipeline_mock) + ) + monkeypatch.setattr( + _mod, "get_user_long_context_llm", AsyncMock(return_value=MagicMock()) + ) + + return { + "mod": _mod, + "session": session, + "fake_user": fake_user, + "task_logger": mock_task_logger, + "download_mock": download_mock, + "batch_mock": batch_mock, + } + + +async def _run_gdrive_full_scan(mocks, max_files=500): + from app.tasks.connector_indexers.google_drive_indexer import _index_full_scan + + return await _index_full_scan( + MagicMock(), + mocks["session"], + MagicMock(), + _CONNECTOR_ID, + _SEARCH_SPACE_ID, + _USER_ID, + "folder-root", + "My Folder", + mocks["task_logger"], + MagicMock(), + max_files, + include_subfolders=False, + enable_summary=True, + ) + + +async def test_gdrive_full_scan_skips_over_quota(gdrive_full_scan_mocks, monkeypatch): + m = gdrive_full_scan_mocks + m["fake_user"].pages_used = 0 + m["fake_user"].pages_limit = 2 + + page_files = [ + _make_gdrive_file(f"f{i}", f"file{i}.xyz", size=80 * 1024) for i in range(5) + ] + monkeypatch.setattr( + m["mod"], 
+ "get_files_in_folder", + AsyncMock(return_value=(page_files, None, None)), + ) + m["download_mock"].return_value = ([], 0) + m["batch_mock"].return_value = ([], 2, 0) + + _indexed, skipped, _unsup = await _run_gdrive_full_scan(m) + + call_files = m["download_mock"].call_args[0][1] + assert len(call_files) == 2 + assert skipped == 3 + + +async def test_gdrive_full_scan_deducts_after_indexing( + gdrive_full_scan_mocks, monkeypatch +): + m = gdrive_full_scan_mocks + m["fake_user"].pages_used = 0 + m["fake_user"].pages_limit = 100 + + page_files = [ + _make_gdrive_file(f"f{i}", f"file{i}.xyz", size=80 * 1024) for i in range(3) + ] + monkeypatch.setattr( + m["mod"], + "get_files_in_folder", + AsyncMock(return_value=(page_files, None, None)), + ) + mock_docs = [MagicMock() for _ in range(3)] + m["download_mock"].return_value = (mock_docs, 0) + m["batch_mock"].return_value = ([], 3, 0) + + await _run_gdrive_full_scan(m) + + assert m["fake_user"].pages_used == 3 + + +# --------------------------------------------------------------------------- +# Google Drive: _index_with_delta_sync +# --------------------------------------------------------------------------- + + +async def test_gdrive_delta_sync_skips_over_quota(monkeypatch): + import app.tasks.connector_indexers.google_drive_indexer as _mod + + session, _ = _make_page_limit_session(0, 2) + + changes = [ + { + "fileId": f"mod{i}", + "file": _make_gdrive_file(f"mod{i}", f"mod{i}.xyz", size=80 * 1024), + } + for i in range(5) + ] + monkeypatch.setattr( + _mod, + "fetch_all_changes", + AsyncMock(return_value=(changes, "new-token", None)), + ) + monkeypatch.setattr(_mod, "categorize_change", lambda change: "modified") + monkeypatch.setattr( + _mod, "_should_skip_file", AsyncMock(return_value=(False, None)) + ) + + download_mock = AsyncMock(return_value=([], 0)) + monkeypatch.setattr(_mod, "_download_files_parallel", download_mock) + + batch_mock = AsyncMock(return_value=([], 2, 0)) + pipeline_mock = MagicMock() + 
pipeline_mock.index_batch_parallel = batch_mock + pipeline_mock.create_placeholder_documents = AsyncMock(return_value=0) + monkeypatch.setattr( + _mod, "IndexingPipelineService", MagicMock(return_value=pipeline_mock) + ) + monkeypatch.setattr( + _mod, "get_user_long_context_llm", AsyncMock(return_value=MagicMock()) + ) + + mock_task_logger = MagicMock() + mock_task_logger.log_task_progress = AsyncMock() + + _indexed, skipped, _unsupported = await _mod._index_with_delta_sync( + MagicMock(), + session, + MagicMock(), + _CONNECTOR_ID, + _SEARCH_SPACE_ID, + _USER_ID, + "folder-root", + "start-token", + mock_task_logger, + MagicMock(), + max_files=500, + enable_summary=True, + ) + + call_files = download_mock.call_args[0][1] + assert len(call_files) == 2 + assert skipped == 3 + + +# =================================================================== +# C) OneDrive smoke tests — verify page limit wiring +# =================================================================== + + +def _make_onedrive_file(file_id: str, name: str, size: int = 80 * 1024) -> dict: + return { + "id": file_id, + "name": name, + "file": {"mimeType": "application/octet-stream"}, + "size": str(size), + "lastModifiedDateTime": "2026-01-01T00:00:00Z", + } + + +@pytest.fixture +def onedrive_selected_mocks(monkeypatch): + import app.tasks.connector_indexers.onedrive_indexer as _mod + + session, fake_user = _make_page_limit_session(0, 100) + + get_file_results: dict[str, tuple[dict | None, str | None]] = {} + + async def _fake_get_file(client, file_id): + return get_file_results.get(file_id, (None, f"Not found: {file_id}")) + + monkeypatch.setattr(_mod, "get_file_by_id", _fake_get_file) + monkeypatch.setattr( + _mod, "_should_skip_file", AsyncMock(return_value=(False, None)) + ) + + download_and_index_mock = AsyncMock(return_value=(0, 0)) + monkeypatch.setattr(_mod, "_download_and_index", download_and_index_mock) + + pipeline_mock = MagicMock() + pipeline_mock.create_placeholder_documents = 
AsyncMock(return_value=0) + monkeypatch.setattr( + _mod, "IndexingPipelineService", MagicMock(return_value=pipeline_mock) + ) + + return { + "session": session, + "fake_user": fake_user, + "get_file_results": get_file_results, + "download_and_index_mock": download_and_index_mock, + } + + +async def _run_onedrive_selected(mocks, file_ids): + from app.tasks.connector_indexers.onedrive_indexer import _index_selected_files + + return await _index_selected_files( + MagicMock(), + mocks["session"], + file_ids, + connector_id=_CONNECTOR_ID, + search_space_id=_SEARCH_SPACE_ID, + user_id=_USER_ID, + enable_summary=True, + ) + + +async def test_onedrive_over_quota_rejected(onedrive_selected_mocks): + """OneDrive: files exceeding quota produce errors, not downloads.""" + m = onedrive_selected_mocks + m["fake_user"].pages_used = 99 + m["fake_user"].pages_limit = 100 + + m["get_file_results"]["big"] = ( + _make_onedrive_file("big", "huge.pdf", size=500 * 1024), + None, + ) + + indexed, _skipped, _unsup, errors = await _run_onedrive_selected( + m, [("big", "huge.pdf")] + ) + + assert indexed == 0 + assert len(errors) == 1 + assert "page limit" in errors[0].lower() + + +async def test_onedrive_deducts_after_success(onedrive_selected_mocks): + """OneDrive: pages_used increases after successful indexing.""" + m = onedrive_selected_mocks + m["fake_user"].pages_used = 0 + m["fake_user"].pages_limit = 100 + + for fid in ("f1", "f2"): + m["get_file_results"][fid] = ( + _make_onedrive_file(fid, f"{fid}.xyz", size=80 * 1024), + None, + ) + m["download_and_index_mock"].return_value = (2, 0) + + await _run_onedrive_selected(m, [("f1", "f1.xyz"), ("f2", "f2.xyz")]) + + assert m["fake_user"].pages_used == 2 + + +# =================================================================== +# D) Dropbox smoke tests — verify page limit wiring +# =================================================================== + + +def _make_dropbox_file(file_path: str, name: str, size: int = 80 * 1024) -> dict: + 
return { + "id": f"id:{file_path}", + "name": name, + ".tag": "file", + "path_lower": file_path, + "size": str(size), + "server_modified": "2026-01-01T00:00:00Z", + "content_hash": f"hash_{name}", + } + + +@pytest.fixture +def dropbox_selected_mocks(monkeypatch): + import app.tasks.connector_indexers.dropbox_indexer as _mod + + session, fake_user = _make_page_limit_session(0, 100) + + get_file_results: dict[str, tuple[dict | None, str | None]] = {} + + async def _fake_get_file(client, file_path): + return get_file_results.get(file_path, (None, f"Not found: {file_path}")) + + monkeypatch.setattr(_mod, "get_file_by_path", _fake_get_file) + monkeypatch.setattr( + _mod, "_should_skip_file", AsyncMock(return_value=(False, None)) + ) + + download_and_index_mock = AsyncMock(return_value=(0, 0)) + monkeypatch.setattr(_mod, "_download_and_index", download_and_index_mock) + + pipeline_mock = MagicMock() + pipeline_mock.create_placeholder_documents = AsyncMock(return_value=0) + monkeypatch.setattr( + _mod, "IndexingPipelineService", MagicMock(return_value=pipeline_mock) + ) + + return { + "session": session, + "fake_user": fake_user, + "get_file_results": get_file_results, + "download_and_index_mock": download_and_index_mock, + } + + +async def _run_dropbox_selected(mocks, file_paths): + from app.tasks.connector_indexers.dropbox_indexer import _index_selected_files + + return await _index_selected_files( + MagicMock(), + mocks["session"], + file_paths, + connector_id=_CONNECTOR_ID, + search_space_id=_SEARCH_SPACE_ID, + user_id=_USER_ID, + enable_summary=True, + ) + + +async def test_dropbox_over_quota_rejected(dropbox_selected_mocks): + """Dropbox: files exceeding quota produce errors, not downloads.""" + m = dropbox_selected_mocks + m["fake_user"].pages_used = 99 + m["fake_user"].pages_limit = 100 + + m["get_file_results"]["/huge.pdf"] = ( + _make_dropbox_file("/huge.pdf", "huge.pdf", size=500 * 1024), + None, + ) + + indexed, _skipped, _unsup, errors = await 
_run_dropbox_selected( + m, [("/huge.pdf", "huge.pdf")] + ) + + assert indexed == 0 + assert len(errors) == 1 + assert "page limit" in errors[0].lower() + + +async def test_dropbox_deducts_after_success(dropbox_selected_mocks): + """Dropbox: pages_used increases after successful indexing.""" + m = dropbox_selected_mocks + m["fake_user"].pages_used = 0 + m["fake_user"].pages_limit = 100 + + for name in ("f1.xyz", "f2.xyz"): + path = f"/{name}" + m["get_file_results"][path] = ( + _make_dropbox_file(path, name, size=80 * 1024), + None, + ) + m["download_and_index_mock"].return_value = (2, 0) + + await _run_dropbox_selected(m, [("/f1.xyz", "f1.xyz"), ("/f2.xyz", "f2.xyz")]) + + assert m["fake_user"].pages_used == 2 diff --git a/surfsense_backend/tests/unit/connectors/__init__.py b/surfsense_backend/tests/unit/connectors/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/surfsense_backend/tests/unit/connectors/test_dropbox_client.py b/surfsense_backend/tests/unit/connectors/test_dropbox_client.py new file mode 100644 index 000000000..31cafe550 --- /dev/null +++ b/surfsense_backend/tests/unit/connectors/test_dropbox_client.py @@ -0,0 +1,123 @@ +"""Tests for DropboxClient delta-sync methods (get_latest_cursor, get_changes).""" + +from unittest.mock import AsyncMock, MagicMock + +import pytest + +from app.connectors.dropbox.client import DropboxClient + +pytestmark = pytest.mark.unit + + +def _make_client() -> DropboxClient: + """Create a DropboxClient with a mocked DB session so no real DB needed.""" + client = DropboxClient.__new__(DropboxClient) + client._session = MagicMock() + client._connector_id = 1 + return client + + +# ---------- C1: get_latest_cursor ---------- + + +async def test_get_latest_cursor_returns_cursor_string(monkeypatch): + client = _make_client() + + fake_resp = MagicMock() + fake_resp.status_code = 200 + fake_resp.json.return_value = {"cursor": "AAHbKxRZ9enq…"} + + monkeypatch.setattr(client, "_request", 
AsyncMock(return_value=fake_resp)) + + cursor, error = await client.get_latest_cursor("/my-folder") + + assert cursor == "AAHbKxRZ9enq…" + assert error is None + client._request.assert_called_once_with( + "/2/files/list_folder/get_latest_cursor", + { + "path": "/my-folder", + "recursive": False, + "include_non_downloadable_files": True, + }, + ) + + +# ---------- C2: get_changes returns entries and new cursor ---------- + + +async def test_get_changes_returns_entries_and_cursor(monkeypatch): + client = _make_client() + + fake_resp = MagicMock() + fake_resp.status_code = 200 + fake_resp.json.return_value = { + "entries": [ + {".tag": "file", "name": "new.txt", "id": "id:abc"}, + {".tag": "deleted", "name": "old.txt"}, + ], + "cursor": "cursor-v2", + "has_more": False, + } + monkeypatch.setattr(client, "_request", AsyncMock(return_value=fake_resp)) + + entries, new_cursor, error = await client.get_changes("cursor-v1") + + assert error is None + assert new_cursor == "cursor-v2" + assert len(entries) == 2 + assert entries[0]["name"] == "new.txt" + assert entries[1][".tag"] == "deleted" + + +# ---------- C3: get_changes handles pagination ---------- + + +async def test_get_changes_handles_pagination(monkeypatch): + client = _make_client() + + page1 = MagicMock() + page1.status_code = 200 + page1.json.return_value = { + "entries": [{".tag": "file", "name": "a.txt", "id": "id:a"}], + "cursor": "cursor-page2", + "has_more": True, + } + page2 = MagicMock() + page2.status_code = 200 + page2.json.return_value = { + "entries": [{".tag": "file", "name": "b.txt", "id": "id:b"}], + "cursor": "cursor-final", + "has_more": False, + } + + request_mock = AsyncMock(side_effect=[page1, page2]) + monkeypatch.setattr(client, "_request", request_mock) + + entries, new_cursor, error = await client.get_changes("cursor-v1") + + assert error is None + assert new_cursor == "cursor-final" + assert len(entries) == 2 + assert {e["name"] for e in entries} == {"a.txt", "b.txt"} + assert 
request_mock.call_count == 2 + + +# ---------- C4: get_changes raises on 401 ---------- + + +async def test_get_changes_returns_error_on_401(monkeypatch): + client = _make_client() + + fake_resp = MagicMock() + fake_resp.status_code = 401 + fake_resp.text = "Unauthorized" + + monkeypatch.setattr(client, "_request", AsyncMock(return_value=fake_resp)) + + entries, new_cursor, error = await client.get_changes("old-cursor") + + assert error is not None + assert "401" in error + assert entries == [] + assert new_cursor is None diff --git a/surfsense_backend/tests/unit/connectors/test_dropbox_file_types.py b/surfsense_backend/tests/unit/connectors/test_dropbox_file_types.py new file mode 100644 index 000000000..b4715e083 --- /dev/null +++ b/surfsense_backend/tests/unit/connectors/test_dropbox_file_types.py @@ -0,0 +1,173 @@ +"""Tests for Dropbox file type filtering (should_skip_file).""" + +import pytest + +from app.connectors.dropbox.file_types import should_skip_file + +pytestmark = pytest.mark.unit + + +# --------------------------------------------------------------------------- +# Structural skips (independent of ETL service) +# --------------------------------------------------------------------------- + + +def test_folder_item_is_skipped(): + item = {".tag": "folder", "name": "My Folder"} + skip, ext = should_skip_file(item) + assert skip is True + assert ext is None + + +def test_paper_file_is_not_skipped(): + item = {".tag": "file", "name": "notes.paper", "is_downloadable": False} + skip, ext = should_skip_file(item) + assert skip is False + assert ext is None + + +def test_non_downloadable_item_is_skipped(): + item = {".tag": "file", "name": "locked.gdoc", "is_downloadable": False} + skip, ext = should_skip_file(item) + assert skip is True + assert ext is None + + +# --------------------------------------------------------------------------- +# Extension-based skips (require ETL service context) +# 
--------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "filename", + [ + "archive.zip", + "backup.tar", + "data.gz", + "stuff.rar", + "pack.7z", + "program.exe", + "lib.dll", + "module.so", + "image.dmg", + "disk.iso", + "movie.mov", + "clip.avi", + "video.mkv", + "film.wmv", + "stream.flv", + "favicon.ico", + "raw.cr2", + "photo.nef", + "image.arw", + "pic.dng", + "design.psd", + "vector.ai", + "mockup.sketch", + "proto.fig", + "font.ttf", + "font.otf", + "font.woff", + "font.woff2", + "model.stl", + "scene.fbx", + "mesh.blend", + "local.db", + "data.sqlite", + "access.mdb", + ], +) +def test_non_parseable_extensions_are_skipped(filename, mocker): + mocker.patch("app.config.config.ETL_SERVICE", "DOCLING") + item = {".tag": "file", "name": filename} + skip, ext = should_skip_file(item) + assert skip is True, f"{filename} should be skipped" + assert ext is not None + + +@pytest.mark.parametrize( + "filename", + [ + "report.pdf", + "document.docx", + "sheet.xlsx", + "slides.pptx", + "readme.txt", + "data.csv", + "page.html", + "notes.md", + "config.json", + "feed.xml", + ], +) +def test_parseable_documents_are_not_skipped(filename, mocker): + """Files in plaintext/direct_convert/universal document sets are never skipped.""" + for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"): + mocker.patch("app.config.config.ETL_SERVICE", service) + item = {".tag": "file", "name": filename} + skip, ext = should_skip_file(item) + assert skip is False, f"{filename} should NOT be skipped with {service}" + assert ext is None + + +@pytest.mark.parametrize( + "filename", + ["photo.jpg", "image.jpeg", "screenshot.png", "scan.bmp", "page.tiff", "doc.tif"], +) +def test_universal_images_are_not_skipped(filename, mocker): + """Images supported by all parsers are never skipped.""" + for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"): + mocker.patch("app.config.config.ETL_SERVICE", service) + item = {".tag": "file", "name": 
filename} + skip, ext = should_skip_file(item) + assert skip is False, f"{filename} should NOT be skipped with {service}" + assert ext is None + + +@pytest.mark.parametrize( + "filename,service,expected_skip", + [ + ("old.doc", "DOCLING", True), + ("old.doc", "LLAMACLOUD", False), + ("old.doc", "UNSTRUCTURED", False), + ("legacy.xls", "DOCLING", True), + ("legacy.xls", "LLAMACLOUD", False), + ("legacy.xls", "UNSTRUCTURED", False), + ("deck.ppt", "DOCLING", True), + ("deck.ppt", "LLAMACLOUD", False), + ("deck.ppt", "UNSTRUCTURED", False), + ("icon.svg", "DOCLING", True), + ("icon.svg", "LLAMACLOUD", False), + ("anim.gif", "DOCLING", True), + ("anim.gif", "LLAMACLOUD", False), + ("photo.webp", "DOCLING", False), + ("photo.webp", "LLAMACLOUD", False), + ("photo.webp", "UNSTRUCTURED", True), + ("live.heic", "DOCLING", True), + ("live.heic", "UNSTRUCTURED", False), + ("macro.docm", "DOCLING", True), + ("macro.docm", "LLAMACLOUD", False), + ("mail.eml", "DOCLING", True), + ("mail.eml", "UNSTRUCTURED", False), + ], +) +def test_parser_specific_extensions(filename, service, expected_skip, mocker): + mocker.patch("app.config.config.ETL_SERVICE", service) + item = {".tag": "file", "name": filename} + skip, ext = should_skip_file(item) + assert skip is expected_skip, ( + f"{filename} with {service}: expected skip={expected_skip}" + ) + if expected_skip: + assert ext is not None + else: + assert ext is None + + +def test_returns_unsupported_extension(mocker): + """When a file is skipped due to unsupported extension, the ext string is returned.""" + mocker.patch("app.config.config.ETL_SERVICE", "DOCLING") + item = {".tag": "file", "name": "old.doc"} + skip, ext = should_skip_file(item) + assert skip is True + assert ext == ".doc" diff --git a/surfsense_backend/tests/unit/connectors/test_dropbox_reauth.py b/surfsense_backend/tests/unit/connectors/test_dropbox_reauth.py new file mode 100644 index 000000000..85281354c --- /dev/null +++ 
b/surfsense_backend/tests/unit/connectors/test_dropbox_reauth.py @@ -0,0 +1,43 @@ +"""Test that Dropbox re-auth preserves folder_cursors in connector config.""" + +import pytest + +pytestmark = pytest.mark.unit + + +def test_reauth_preserves_folder_cursors(): + """G1: re-authentication preserves folder_cursors alongside cursor.""" + old_config = { + "access_token": "old-token-enc", + "refresh_token": "old-refresh-enc", + "cursor": "old-cursor-abc", + "folder_cursors": {"/docs": "cursor-docs-123", "/photos": "cursor-photos-456"}, + "_token_encrypted": True, + "auth_expired": True, + } + + new_connector_config = { + "access_token": "new-token-enc", + "refresh_token": "new-refresh-enc", + "token_type": "bearer", + "expires_in": 14400, + "expires_at": "2026-04-06T16:00:00+00:00", + "_token_encrypted": True, + } + + existing_cursor = old_config.get("cursor") + existing_folder_cursors = old_config.get("folder_cursors") + merged_config = { + **new_connector_config, + "cursor": existing_cursor, + "folder_cursors": existing_folder_cursors, + "auth_expired": False, + } + + assert merged_config["access_token"] == "new-token-enc" + assert merged_config["cursor"] == "old-cursor-abc" + assert merged_config["folder_cursors"] == { + "/docs": "cursor-docs-123", + "/photos": "cursor-photos-456", + } + assert merged_config["auth_expired"] is False diff --git a/surfsense_backend/tests/unit/connectors/test_google_drive_file_types.py b/surfsense_backend/tests/unit/connectors/test_google_drive_file_types.py new file mode 100644 index 000000000..ab602468d --- /dev/null +++ b/surfsense_backend/tests/unit/connectors/test_google_drive_file_types.py @@ -0,0 +1,80 @@ +"""Tests for Google Drive file type filtering.""" + +import pytest + +from app.connectors.google_drive.file_types import should_skip_by_extension + +pytestmark = pytest.mark.unit + + +@pytest.mark.parametrize( + "filename", + [ + "malware.exe", + "archive.zip", + "video.mov", + "font.woff2", + "model.blend", + ], +) +def 
test_unsupported_extensions_are_skipped_regardless_of_service(filename, mocker): + """Truly unsupported files are skipped no matter which ETL service is configured.""" + for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"): + mocker.patch("app.config.config.ETL_SERVICE", service) + skip, _ext = should_skip_by_extension(filename) + assert skip is True + + +@pytest.mark.parametrize( + "filename", + [ + "report.pdf", + "doc.docx", + "sheet.xlsx", + "slides.pptx", + "readme.txt", + "data.csv", + "photo.png", + "notes.md", + ], +) +def test_universal_extensions_are_not_skipped(filename, mocker): + """Files supported by all parsers (or handled by plaintext/direct_convert) are never skipped.""" + for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"): + mocker.patch("app.config.config.ETL_SERVICE", service) + skip, ext = should_skip_by_extension(filename) + assert skip is False, f"{filename} should NOT be skipped with {service}" + assert ext is None + + +@pytest.mark.parametrize( + "filename,service,expected_skip", + [ + ("macro.docm", "DOCLING", True), + ("macro.docm", "LLAMACLOUD", False), + ("mail.eml", "DOCLING", True), + ("mail.eml", "UNSTRUCTURED", False), + ("photo.gif", "DOCLING", True), + ("photo.gif", "LLAMACLOUD", False), + ("photo.heic", "UNSTRUCTURED", False), + ("photo.heic", "DOCLING", True), + ], +) +def test_parser_specific_extensions(filename, service, expected_skip, mocker): + mocker.patch("app.config.config.ETL_SERVICE", service) + skip, ext = should_skip_by_extension(filename) + assert skip is expected_skip, ( + f"{filename} with {service}: expected skip={expected_skip}" + ) + if expected_skip: + assert ext is not None, "unsupported extension should be returned" + else: + assert ext is None + + +def test_returns_unsupported_extension(mocker): + """When a file is skipped, the unsupported extension string is returned.""" + mocker.patch("app.config.config.ETL_SERVICE", "DOCLING") + skip, ext = should_skip_by_extension("macro.docm") + assert skip is 
True + assert ext == ".docm" diff --git a/surfsense_backend/tests/unit/connectors/test_onedrive_file_types.py b/surfsense_backend/tests/unit/connectors/test_onedrive_file_types.py new file mode 100644 index 000000000..1d9124c47 --- /dev/null +++ b/surfsense_backend/tests/unit/connectors/test_onedrive_file_types.py @@ -0,0 +1,118 @@ +"""Tests for OneDrive file type filtering.""" + +import pytest + +from app.connectors.onedrive.file_types import should_skip_file + +pytestmark = pytest.mark.unit + + +# --------------------------------------------------------------------------- +# Structural skips (independent of ETL service) +# --------------------------------------------------------------------------- + + +def test_folder_is_skipped(): + item = {"folder": {}, "name": "My Folder"} + skip, ext = should_skip_file(item) + assert skip is True + assert ext is None + + +def test_remote_item_is_skipped(): + item = {"remoteItem": {}, "name": "shared.docx"} + skip, ext = should_skip_file(item) + assert skip is True + assert ext is None + + +def test_package_is_skipped(): + item = {"package": {}, "name": "notebook"} + skip, ext = should_skip_file(item) + assert skip is True + assert ext is None + + +def test_onenote_is_skipped(): + item = {"name": "notes", "file": {"mimeType": "application/msonenote"}} + skip, ext = should_skip_file(item) + assert skip is True + assert ext is None + + +# --------------------------------------------------------------------------- +# Extension-based skips (require ETL service context) +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "filename", + [ + "malware.exe", + "archive.zip", + "video.mov", + "font.woff2", + "model.blend", + ], +) +def test_unsupported_extensions_are_skipped(filename, mocker): + mocker.patch("app.config.config.ETL_SERVICE", "DOCLING") + item = {"name": filename, "file": {"mimeType": "application/octet-stream"}} + skip, ext = should_skip_file(item) + assert skip 
is True, f"{filename} should be skipped" + assert ext is not None + + +@pytest.mark.parametrize( + "filename", + [ + "report.pdf", + "doc.docx", + "sheet.xlsx", + "slides.pptx", + "readme.txt", + "data.csv", + "photo.png", + "notes.md", + ], +) +def test_universal_files_are_not_skipped(filename, mocker): + for service in ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED"): + mocker.patch("app.config.config.ETL_SERVICE", service) + item = {"name": filename, "file": {"mimeType": "application/octet-stream"}} + skip, ext = should_skip_file(item) + assert skip is False, f"{filename} should NOT be skipped with {service}" + assert ext is None + + +@pytest.mark.parametrize( + "filename,service,expected_skip", + [ + ("macro.docm", "DOCLING", True), + ("macro.docm", "LLAMACLOUD", False), + ("mail.eml", "DOCLING", True), + ("mail.eml", "UNSTRUCTURED", False), + ("photo.heic", "UNSTRUCTURED", False), + ("photo.heic", "DOCLING", True), + ], +) +def test_parser_specific_extensions(filename, service, expected_skip, mocker): + mocker.patch("app.config.config.ETL_SERVICE", service) + item = {"name": filename, "file": {"mimeType": "application/octet-stream"}} + skip, ext = should_skip_file(item) + assert skip is expected_skip, ( + f"{filename} with {service}: expected skip={expected_skip}" + ) + if expected_skip: + assert ext is not None + else: + assert ext is None + + +def test_returns_unsupported_extension(mocker): + """When a file is skipped due to unsupported extension, the ext string is returned.""" + mocker.patch("app.config.config.ETL_SERVICE", "DOCLING") + item = {"name": "mail.eml", "file": {"mimeType": "application/octet-stream"}} + skip, ext = should_skip_file(item) + assert skip is True + assert ext == ".eml" diff --git a/surfsense_backend/tests/unit/etl_pipeline/conftest.py b/surfsense_backend/tests/unit/etl_pipeline/conftest.py new file mode 100644 index 000000000..082ab9771 --- /dev/null +++ b/surfsense_backend/tests/unit/etl_pipeline/conftest.py @@ -0,0 +1,27 @@ 
+"""Pre-register the etl_pipeline package to avoid circular imports during unit tests.""" + +import sys +import types +from pathlib import Path + +_BACKEND = Path(__file__).resolve().parents[3] + + +def _stub_package(dotted: str, fs_dir: Path) -> None: + if dotted not in sys.modules: + mod = types.ModuleType(dotted) + mod.__path__ = [str(fs_dir)] + mod.__package__ = dotted + sys.modules[dotted] = mod + + parts = dotted.split(".") + if len(parts) > 1: + parent_dotted = ".".join(parts[:-1]) + parent = sys.modules.get(parent_dotted) + if parent is not None: + setattr(parent, parts[-1], sys.modules[dotted]) + + +_stub_package("app", _BACKEND / "app") +_stub_package("app.etl_pipeline", _BACKEND / "app" / "etl_pipeline") +_stub_package("app.etl_pipeline.parsers", _BACKEND / "app" / "etl_pipeline" / "parsers") diff --git a/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py b/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py new file mode 100644 index 000000000..769b1dc53 --- /dev/null +++ b/surfsense_backend/tests/unit/etl_pipeline/test_etl_pipeline_service.py @@ -0,0 +1,461 @@ +"""Tests for EtlPipelineService -- the unified ETL pipeline public interface.""" + +import pytest + +from app.etl_pipeline.etl_document import EtlRequest +from app.etl_pipeline.etl_pipeline_service import EtlPipelineService + +pytestmark = pytest.mark.unit + + +async def test_extract_txt_file_returns_markdown(tmp_path): + """Tracer bullet: a .txt file is read and returned as-is in an EtlResult.""" + txt_file = tmp_path / "hello.txt" + txt_file.write_text("Hello, world!", encoding="utf-8") + + service = EtlPipelineService() + result = await service.extract( + EtlRequest(file_path=str(txt_file), filename="hello.txt") + ) + + assert result.markdown_content == "Hello, world!" 
+ assert result.etl_service == "PLAINTEXT" + assert result.content_type == "plaintext" + + +async def test_extract_md_file(tmp_path): + """A .md file is classified as PLAINTEXT and extracted.""" + md_file = tmp_path / "readme.md" + md_file.write_text("# Title\n\nBody text.", encoding="utf-8") + + result = await EtlPipelineService().extract( + EtlRequest(file_path=str(md_file), filename="readme.md") + ) + + assert result.markdown_content == "# Title\n\nBody text." + assert result.etl_service == "PLAINTEXT" + assert result.content_type == "plaintext" + + +async def test_extract_markdown_file(tmp_path): + """A .markdown file is classified as PLAINTEXT and extracted.""" + md_file = tmp_path / "notes.markdown" + md_file.write_text("Some notes.", encoding="utf-8") + + result = await EtlPipelineService().extract( + EtlRequest(file_path=str(md_file), filename="notes.markdown") + ) + + assert result.markdown_content == "Some notes." + assert result.etl_service == "PLAINTEXT" + + +async def test_extract_python_file(tmp_path): + """A .py source code file is classified as PLAINTEXT.""" + py_file = tmp_path / "script.py" + py_file.write_text("print('hello')", encoding="utf-8") + + result = await EtlPipelineService().extract( + EtlRequest(file_path=str(py_file), filename="script.py") + ) + + assert result.markdown_content == "print('hello')" + assert result.etl_service == "PLAINTEXT" + assert result.content_type == "plaintext" + + +async def test_extract_js_file(tmp_path): + """A .js source code file is classified as PLAINTEXT.""" + js_file = tmp_path / "app.js" + js_file.write_text("console.log('hi');", encoding="utf-8") + + result = await EtlPipelineService().extract( + EtlRequest(file_path=str(js_file), filename="app.js") + ) + + assert result.markdown_content == "console.log('hi');" + assert result.etl_service == "PLAINTEXT" + + +async def test_extract_csv_returns_markdown_table(tmp_path): + """A .csv file is converted to a markdown table.""" + csv_file = tmp_path / 
"data.csv" + csv_file.write_text("name,age\nAlice,30\nBob,25\n", encoding="utf-8") + + result = await EtlPipelineService().extract( + EtlRequest(file_path=str(csv_file), filename="data.csv") + ) + + assert "| name | age |" in result.markdown_content + assert "| Alice | 30 |" in result.markdown_content + assert result.etl_service == "DIRECT_CONVERT" + assert result.content_type == "direct_convert" + + +async def test_extract_tsv_returns_markdown_table(tmp_path): + """A .tsv file is converted to a markdown table.""" + tsv_file = tmp_path / "data.tsv" + tsv_file.write_text("x\ty\n1\t2\n", encoding="utf-8") + + result = await EtlPipelineService().extract( + EtlRequest(file_path=str(tsv_file), filename="data.tsv") + ) + + assert "| x | y |" in result.markdown_content + assert result.etl_service == "DIRECT_CONVERT" + + +async def test_extract_html_returns_markdown(tmp_path): + """An .html file is converted to markdown.""" + html_file = tmp_path / "page.html" + html_file.write_text("

<h1>Title</h1><p>Body</p>

", encoding="utf-8") + + result = await EtlPipelineService().extract( + EtlRequest(file_path=str(html_file), filename="page.html") + ) + + assert "Title" in result.markdown_content + assert "Body" in result.markdown_content + assert result.etl_service == "DIRECT_CONVERT" + + +async def test_extract_mp3_returns_transcription(tmp_path, mocker): + """An .mp3 audio file is transcribed via litellm.atranscription.""" + audio_file = tmp_path / "recording.mp3" + audio_file.write_bytes(b"\x00" * 100) + + mocker.patch("app.config.config.STT_SERVICE", "openai/whisper-1") + mocker.patch("app.config.config.STT_SERVICE_API_KEY", "fake-key") + mocker.patch("app.config.config.STT_SERVICE_API_BASE", None) + + mock_transcription = mocker.patch( + "app.etl_pipeline.parsers.audio.atranscription", + return_value={"text": "Hello from audio"}, + ) + + result = await EtlPipelineService().extract( + EtlRequest(file_path=str(audio_file), filename="recording.mp3") + ) + + assert "Hello from audio" in result.markdown_content + assert result.etl_service == "AUDIO" + assert result.content_type == "audio" + mock_transcription.assert_called_once() + + +# --------------------------------------------------------------------------- +# Slice 7 - DOCLING document parsing +# --------------------------------------------------------------------------- + + +async def test_extract_pdf_with_docling(tmp_path, mocker): + """A .pdf file with ETL_SERVICE=DOCLING returns parsed markdown.""" + pdf_file = tmp_path / "report.pdf" + pdf_file.write_bytes(b"%PDF-1.4 fake") + + mocker.patch("app.config.config.ETL_SERVICE", "DOCLING") + + fake_docling = mocker.AsyncMock() + fake_docling.process_document.return_value = {"content": "# Parsed PDF"} + mocker.patch( + "app.services.docling_service.create_docling_service", + return_value=fake_docling, + ) + + result = await EtlPipelineService().extract( + EtlRequest(file_path=str(pdf_file), filename="report.pdf") + ) + + assert result.markdown_content == "# Parsed PDF" + 
assert result.etl_service == "DOCLING" + assert result.content_type == "document" + + +# --------------------------------------------------------------------------- +# Slice 8 - UNSTRUCTURED document parsing +# --------------------------------------------------------------------------- + + +async def test_extract_pdf_with_unstructured(tmp_path, mocker): + """A .pdf file with ETL_SERVICE=UNSTRUCTURED returns parsed markdown.""" + pdf_file = tmp_path / "report.pdf" + pdf_file.write_bytes(b"%PDF-1.4 fake") + + mocker.patch("app.config.config.ETL_SERVICE", "UNSTRUCTURED") + + class FakeDoc: + def __init__(self, text): + self.page_content = text + + fake_loader_instance = mocker.AsyncMock() + fake_loader_instance.aload.return_value = [ + FakeDoc("Page 1 content"), + FakeDoc("Page 2 content"), + ] + mocker.patch( + "langchain_unstructured.UnstructuredLoader", + return_value=fake_loader_instance, + ) + + result = await EtlPipelineService().extract( + EtlRequest(file_path=str(pdf_file), filename="report.pdf") + ) + + assert "Page 1 content" in result.markdown_content + assert "Page 2 content" in result.markdown_content + assert result.etl_service == "UNSTRUCTURED" + assert result.content_type == "document" + + +# --------------------------------------------------------------------------- +# Slice 9 - LLAMACLOUD document parsing +# --------------------------------------------------------------------------- + + +async def test_extract_pdf_with_llamacloud(tmp_path, mocker): + """A .pdf file with ETL_SERVICE=LLAMACLOUD returns parsed markdown.""" + pdf_file = tmp_path / "report.pdf" + pdf_file.write_bytes(b"%PDF-1.4 fake content " * 10) + + mocker.patch("app.config.config.ETL_SERVICE", "LLAMACLOUD") + mocker.patch("app.config.config.LLAMA_CLOUD_API_KEY", "fake-key", create=True) + + class FakeDoc: + text = "# LlamaCloud parsed" + + class FakeJobResult: + pages = [] + + def get_markdown_documents(self, split_by_page=True): + return [FakeDoc()] + + fake_parser = 
mocker.AsyncMock() + fake_parser.aparse.return_value = FakeJobResult() + mocker.patch( + "llama_cloud_services.LlamaParse", + return_value=fake_parser, + ) + mocker.patch( + "llama_cloud_services.parse.utils.ResultType", + mocker.MagicMock(MD="md"), + ) + + result = await EtlPipelineService().extract( + EtlRequest(file_path=str(pdf_file), filename="report.pdf", estimated_pages=5) + ) + + assert result.markdown_content == "# LlamaCloud parsed" + assert result.etl_service == "LLAMACLOUD" + assert result.content_type == "document" + + +# --------------------------------------------------------------------------- +# Slice 10 - unknown extension falls through to document ETL +# --------------------------------------------------------------------------- + + +async def test_unknown_extension_uses_document_etl(tmp_path, mocker): + """An allowlisted document extension (.docx) routes to the document ETL path.""" + docx_file = tmp_path / "doc.docx" + docx_file.write_bytes(b"PK fake docx") + + mocker.patch("app.config.config.ETL_SERVICE", "DOCLING") + + fake_docling = mocker.AsyncMock() + fake_docling.process_document.return_value = {"content": "Docx content"} + mocker.patch( + "app.services.docling_service.create_docling_service", + return_value=fake_docling, + ) + + result = await EtlPipelineService().extract( + EtlRequest(file_path=str(docx_file), filename="doc.docx") + ) + + assert result.markdown_content == "Docx content" + assert result.content_type == "document" + + +# --------------------------------------------------------------------------- +# Slice 11 - EtlRequest validation +# --------------------------------------------------------------------------- + + +def test_etl_request_requires_filename(): + """EtlRequest rejects missing filename.""" + with pytest.raises(ValueError, match="filename must not be empty"): + EtlRequest(file_path="/tmp/some.txt", filename="") + + +# --------------------------------------------------------------------------- +# Slice 12 - unknown 
ETL_SERVICE raises EtlServiceUnavailableError +# --------------------------------------------------------------------------- + + +async def test_unknown_etl_service_raises(tmp_path, mocker): + """An unknown ETL_SERVICE raises EtlServiceUnavailableError.""" + from app.etl_pipeline.exceptions import EtlServiceUnavailableError + + pdf_file = tmp_path / "report.pdf" + pdf_file.write_bytes(b"%PDF fake") + + mocker.patch("app.config.config.ETL_SERVICE", "NONEXISTENT") + + with pytest.raises(EtlServiceUnavailableError, match="Unknown ETL_SERVICE"): + await EtlPipelineService().extract( + EtlRequest(file_path=str(pdf_file), filename="report.pdf") + ) + + +# --------------------------------------------------------------------------- +# Slice 13 - unsupported file types are rejected before reaching any parser +# --------------------------------------------------------------------------- + + +def test_unknown_extension_classified_as_unsupported(): + """An unknown extension defaults to UNSUPPORTED (allowlist behaviour).""" + from app.etl_pipeline.file_classifier import FileCategory, classify_file + + assert classify_file("random.xyz") == FileCategory.UNSUPPORTED + + +@pytest.mark.parametrize( + "filename", + [ + "malware.exe", + "archive.zip", + "video.mov", + "font.woff2", + "model.blend", + "data.parquet", + "package.deb", + "firmware.bin", + ], +) +def test_unsupported_extensions_classified_correctly(filename): + """Extensions not in any allowlist are classified as UNSUPPORTED.""" + from app.etl_pipeline.file_classifier import FileCategory, classify_file + + assert classify_file(filename) == FileCategory.UNSUPPORTED + + +@pytest.mark.parametrize( + "filename,expected", + [ + ("report.pdf", "document"), + ("doc.docx", "document"), + ("slides.pptx", "document"), + ("sheet.xlsx", "document"), + ("photo.png", "document"), + ("photo.jpg", "document"), + ("book.epub", "document"), + ("letter.odt", "document"), + ("readme.md", "plaintext"), + ("data.csv", "direct_convert"), + ], 
+) +def test_parseable_extensions_classified_correctly(filename, expected): + """Parseable files are classified into their correct category.""" + from app.etl_pipeline.file_classifier import FileCategory, classify_file + + result = classify_file(filename) + assert result != FileCategory.UNSUPPORTED + assert result.value == expected + + +async def test_extract_unsupported_file_raises_error(tmp_path): + """EtlPipelineService.extract() raises EtlUnsupportedFileError for .exe files.""" + from app.etl_pipeline.exceptions import EtlUnsupportedFileError + + exe_file = tmp_path / "program.exe" + exe_file.write_bytes(b"\x00" * 10) + + with pytest.raises(EtlUnsupportedFileError, match="not supported"): + await EtlPipelineService().extract( + EtlRequest(file_path=str(exe_file), filename="program.exe") + ) + + +async def test_extract_zip_raises_unsupported_error(tmp_path): + """EtlPipelineService.extract() raises EtlUnsupportedFileError for .zip archives.""" + from app.etl_pipeline.exceptions import EtlUnsupportedFileError + + zip_file = tmp_path / "archive.zip" + zip_file.write_bytes(b"PK\x03\x04") + + with pytest.raises(EtlUnsupportedFileError, match="not supported"): + await EtlPipelineService().extract( + EtlRequest(file_path=str(zip_file), filename="archive.zip") + ) + + +# --------------------------------------------------------------------------- +# Slice 14 - should_skip_for_service (per-parser document filtering) +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize( + "filename,etl_service,expected_skip", + [ + ("file.eml", "DOCLING", True), + ("file.eml", "UNSTRUCTURED", False), + ("file.docm", "LLAMACLOUD", False), + ("file.docm", "DOCLING", True), + ("file.txt", "DOCLING", False), + ("file.csv", "LLAMACLOUD", False), + ("file.mp3", "UNSTRUCTURED", False), + ("file.exe", "LLAMACLOUD", True), + ("file.pdf", "DOCLING", False), + ("file.webp", "DOCLING", False), + ("file.webp", "UNSTRUCTURED", True), + 
("file.gif", "LLAMACLOUD", False), + ("file.gif", "DOCLING", True), + ("file.heic", "UNSTRUCTURED", False), + ("file.heic", "DOCLING", True), + ("file.svg", "LLAMACLOUD", False), + ("file.svg", "DOCLING", True), + ("file.p7s", "UNSTRUCTURED", False), + ("file.p7s", "LLAMACLOUD", True), + ], +) +def test_should_skip_for_service(filename, etl_service, expected_skip): + from app.etl_pipeline.file_classifier import should_skip_for_service + + assert should_skip_for_service(filename, etl_service) is expected_skip, ( + f"{filename} with {etl_service}: expected skip={expected_skip}" + ) + + +# --------------------------------------------------------------------------- +# Slice 14b - ETL pipeline rejects per-parser incompatible documents +# --------------------------------------------------------------------------- + + +async def test_extract_docm_with_docling_raises_unsupported(tmp_path, mocker): + """Docling cannot parse .docm -- pipeline should reject before dispatching.""" + from app.etl_pipeline.exceptions import EtlUnsupportedFileError + + mocker.patch("app.config.config.ETL_SERVICE", "DOCLING") + + docm_file = tmp_path / "macro.docm" + docm_file.write_bytes(b"\x00" * 10) + + with pytest.raises(EtlUnsupportedFileError, match="not supported by DOCLING"): + await EtlPipelineService().extract( + EtlRequest(file_path=str(docm_file), filename="macro.docm") + ) + + +async def test_extract_eml_with_docling_raises_unsupported(tmp_path, mocker): + """Docling cannot parse .eml -- pipeline should reject before dispatching.""" + from app.etl_pipeline.exceptions import EtlUnsupportedFileError + + mocker.patch("app.config.config.ETL_SERVICE", "DOCLING") + + eml_file = tmp_path / "mail.eml" + eml_file.write_bytes(b"From: test@example.com") + + with pytest.raises(EtlUnsupportedFileError, match="not supported by DOCLING"): + await EtlPipelineService().extract( + EtlRequest(file_path=str(eml_file), filename="mail.eml") + ) diff --git 
a/surfsense_backend/tests/unit/middleware/test_knowledge_search.py b/surfsense_backend/tests/unit/middleware/test_knowledge_search.py index 163dd0d1d..a8cf5c93b 100644 --- a/surfsense_backend/tests/unit/middleware/test_knowledge_search.py +++ b/surfsense_backend/tests/unit/middleware/test_knowledge_search.py @@ -248,7 +248,7 @@ class TestKnowledgeBaseSearchMiddlewarePlanner: return [] async def fake_build_scoped_filesystem(**kwargs): - return {} + return {}, {} monkeypatch.setattr( "app.agents.new_chat.middleware.knowledge_search.search_knowledge_base", @@ -298,7 +298,7 @@ class TestKnowledgeBaseSearchMiddlewarePlanner: return [] async def fake_build_scoped_filesystem(**kwargs): - return {} + return {}, {} monkeypatch.setattr( "app.agents.new_chat.middleware.knowledge_search.search_knowledge_base", @@ -334,7 +334,7 @@ class TestKnowledgeBaseSearchMiddlewarePlanner: return [] async def fake_build_scoped_filesystem(**kwargs): - return {} + return {}, {} monkeypatch.setattr( "app.agents.new_chat.middleware.knowledge_search.search_knowledge_base", diff --git a/surfsense_backend/tests/unit/services/__init__.py b/surfsense_backend/tests/unit/services/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/surfsense_backend/tests/unit/services/test_docling_image_support.py b/surfsense_backend/tests/unit/services/test_docling_image_support.py new file mode 100644 index 000000000..11ffc0ed1 --- /dev/null +++ b/surfsense_backend/tests/unit/services/test_docling_image_support.py @@ -0,0 +1,70 @@ +"""Test that DoclingService does NOT restrict allowed_formats, letting Docling +accept all its supported formats (PDF, DOCX, PPTX, XLSX, IMAGE, etc.).""" + +from enum import Enum +from unittest.mock import MagicMock, patch + +import pytest + +pytestmark = pytest.mark.unit + + +class _FakeInputFormat(Enum): + PDF = "pdf" + IMAGE = "image" + DOCX = "docx" + PPTX = "pptx" + XLSX = "xlsx" + + +def test_docling_service_does_not_restrict_allowed_formats(): + 
"""DoclingService should NOT pass allowed_formats to DocumentConverter, + so Docling defaults to accepting every InputFormat it supports.""" + + mock_converter_cls = MagicMock() + mock_backend = MagicMock() + + fake_pipeline_options_cls = MagicMock() + fake_pipeline_options = MagicMock() + fake_pipeline_options_cls.return_value = fake_pipeline_options + + fake_pdf_format_option_cls = MagicMock() + + with patch.dict( + "sys.modules", + { + "docling": MagicMock(), + "docling.backend": MagicMock(), + "docling.backend.pypdfium2_backend": MagicMock( + PyPdfiumDocumentBackend=mock_backend + ), + "docling.datamodel": MagicMock(), + "docling.datamodel.base_models": MagicMock(InputFormat=_FakeInputFormat), + "docling.datamodel.pipeline_options": MagicMock( + PdfPipelineOptions=fake_pipeline_options_cls + ), + "docling.document_converter": MagicMock( + DocumentConverter=mock_converter_cls, + PdfFormatOption=fake_pdf_format_option_cls, + ), + }, + ): + from importlib import reload + + import app.services.docling_service as mod + + reload(mod) + + mod.DoclingService() + + call_kwargs = mock_converter_cls.call_args + assert call_kwargs is not None, "DocumentConverter was never called" + + _, kwargs = call_kwargs + assert "allowed_formats" not in kwargs, ( + f"allowed_formats should not be passed — let Docling accept all formats. 
" + f"Got: {kwargs.get('allowed_formats')}" + ) + assert _FakeInputFormat.PDF in kwargs.get("format_options", {}), ( + "format_options should still configure PDF pipeline options" + ) diff --git a/surfsense_backend/tests/unit/utils/__init__.py b/surfsense_backend/tests/unit/utils/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/surfsense_backend/tests/unit/utils/test_file_extensions.py b/surfsense_backend/tests/unit/utils/test_file_extensions.py new file mode 100644 index 000000000..c33b39f05 --- /dev/null +++ b/surfsense_backend/tests/unit/utils/test_file_extensions.py @@ -0,0 +1,154 @@ +"""Tests for the DOCUMENT_EXTENSIONS allowlist module.""" + +import pytest + +pytestmark = pytest.mark.unit + + +def test_pdf_is_supported_document(): + from app.utils.file_extensions import is_supported_document_extension + + assert is_supported_document_extension("report.pdf") is True + + +def test_exe_is_not_supported_document(): + from app.utils.file_extensions import is_supported_document_extension + + assert is_supported_document_extension("malware.exe") is False + + +@pytest.mark.parametrize( + "filename", + [ + "report.pdf", + "doc.docx", + "old.doc", + "sheet.xlsx", + "legacy.xls", + "slides.pptx", + "deck.ppt", + "macro.docm", + "macro.xlsm", + "macro.pptm", + "photo.png", + "photo.jpg", + "photo.jpeg", + "scan.bmp", + "scan.tiff", + "scan.tif", + "photo.webp", + "anim.gif", + "iphone.heic", + "manual.rtf", + "book.epub", + "letter.odt", + "data.ods", + "presentation.odp", + "inbox.eml", + "outlook.msg", + "korean.hwpx", + "korean.hwp", + "template.dot", + "template.dotm", + "template.pot", + "template.potx", + "binary.xlsb", + "workspace.xlw", + "vector.svg", + "signature.p7s", + ], +) +def test_document_extensions_are_supported(filename): + from app.utils.file_extensions import is_supported_document_extension + + assert is_supported_document_extension(filename) is True, ( + f"{filename} should be supported" + ) + + +@pytest.mark.parametrize( + 
"filename", + [ + "malware.exe", + "archive.zip", + "video.mov", + "font.woff2", + "model.blend", + "random.xyz", + "data.parquet", + "package.deb", + ], +) +def test_non_document_extensions_are_not_supported(filename): + from app.utils.file_extensions import is_supported_document_extension + + assert is_supported_document_extension(filename) is False, ( + f"{filename} should NOT be supported" + ) + + +# --------------------------------------------------------------------------- +# Per-parser extension sets +# --------------------------------------------------------------------------- + + +def test_union_equals_all_three_sets(): + from app.utils.file_extensions import ( + DOCLING_DOCUMENT_EXTENSIONS, + DOCUMENT_EXTENSIONS, + LLAMAPARSE_DOCUMENT_EXTENSIONS, + UNSTRUCTURED_DOCUMENT_EXTENSIONS, + ) + + expected = ( + DOCLING_DOCUMENT_EXTENSIONS + | LLAMAPARSE_DOCUMENT_EXTENSIONS + | UNSTRUCTURED_DOCUMENT_EXTENSIONS + ) + assert expected == DOCUMENT_EXTENSIONS + + +def test_get_extensions_for_docling(): + from app.utils.file_extensions import get_document_extensions_for_service + + exts = get_document_extensions_for_service("DOCLING") + assert ".pdf" in exts + assert ".webp" in exts + assert ".docx" in exts + assert ".eml" not in exts + assert ".docm" not in exts + assert ".gif" not in exts + assert ".heic" not in exts + + +def test_get_extensions_for_llamacloud(): + from app.utils.file_extensions import get_document_extensions_for_service + + exts = get_document_extensions_for_service("LLAMACLOUD") + assert ".docm" in exts + assert ".gif" in exts + assert ".svg" in exts + assert ".hwp" in exts + assert ".eml" not in exts + assert ".heic" not in exts + + +def test_get_extensions_for_unstructured(): + from app.utils.file_extensions import get_document_extensions_for_service + + exts = get_document_extensions_for_service("UNSTRUCTURED") + assert ".eml" in exts + assert ".heic" in exts + assert ".p7s" in exts + assert ".docm" not in exts + assert ".gif" not in exts + 
assert ".svg" not in exts + + +def test_get_extensions_for_none_returns_union(): + from app.utils.file_extensions import ( + DOCUMENT_EXTENSIONS, + get_document_extensions_for_service, + ) + + assert get_document_extensions_for_service(None) == DOCUMENT_EXTENSIONS diff --git a/surfsense_backend/uv.lock b/surfsense_backend/uv.lock index 8de78705d..c35bbf7d7 100644 --- a/surfsense_backend/uv.lock +++ b/surfsense_backend/uv.lock @@ -62,7 +62,7 @@ wheels = [ [[package]] name = "aiohttp" -version = "3.13.3" +version = "3.13.5" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "aiohappyeyeballs" }, @@ -73,76 +73,76 @@ dependencies = [ { name = "propcache" }, { name = "yarl" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/50/42/32cf8e7704ceb4481406eb87161349abb46a57fee3f008ba9cb610968646/aiohttp-3.13.3.tar.gz", hash = "sha256:a949eee43d3782f2daae4f4a2819b2cb9b0c5d3b7f7a927067cc84dafdbb9f88", size = 7844556 } +sdist = { url = "https://files.pythonhosted.org/packages/77/9a/152096d4808df8e4268befa55fba462f440f14beab85e8ad9bf990516918/aiohttp-3.13.5.tar.gz", hash = "sha256:9d98cc980ecc96be6eb4c1994ce35d28d8b1f5e5208a23b421187d1209dbb7d1", size = 7858271 } wheels = [ - { url = "https://files.pythonhosted.org/packages/a0/be/4fc11f202955a69e0db803a12a062b8379c970c7c84f4882b6da17337cc1/aiohttp-3.13.3-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:b903a4dfee7d347e2d87697d0713be59e0b87925be030c9178c5faa58ea58d5c", size = 739732 }, - { url = "https://files.pythonhosted.org/packages/97/2c/621d5b851f94fa0bb7430d6089b3aa970a9d9b75196bc93bb624b0db237a/aiohttp-3.13.3-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:a45530014d7a1e09f4a55f4f43097ba0fd155089372e105e4bff4ca76cb1b168", size = 494293 }, - { url = "https://files.pythonhosted.org/packages/5d/43/4be01406b78e1be8320bb8316dc9c42dbab553d281c40364e0f862d5661c/aiohttp-3.13.3-cp312-cp312-macosx_11_0_arm64.whl", hash = 
"sha256:27234ef6d85c914f9efeb77ff616dbf4ad2380be0cda40b4db086ffc7ddd1b7d", size = 493533 }, - { url = "https://files.pythonhosted.org/packages/8d/a8/5a35dc56a06a2c90d4742cbf35294396907027f80eea696637945a106f25/aiohttp-3.13.3-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d32764c6c9aafb7fb55366a224756387cd50bfa720f32b88e0e6fa45b27dcf29", size = 1737839 }, - { url = "https://files.pythonhosted.org/packages/bf/62/4b9eeb331da56530bf2e198a297e5303e1c1ebdceeb00fe9b568a65c5a0c/aiohttp-3.13.3-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:b1a6102b4d3ebc07dad44fbf07b45bb600300f15b552ddf1851b5390202ea2e3", size = 1703932 }, - { url = "https://files.pythonhosted.org/packages/7c/f6/af16887b5d419e6a367095994c0b1332d154f647e7dc2bd50e61876e8e3d/aiohttp-3.13.3-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c014c7ea7fb775dd015b2d3137378b7be0249a448a1612268b5a90c2d81de04d", size = 1771906 }, - { url = "https://files.pythonhosted.org/packages/ce/83/397c634b1bcc24292fa1e0c7822800f9f6569e32934bdeef09dae7992dfb/aiohttp-3.13.3-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:2b8d8ddba8f95ba17582226f80e2de99c7a7948e66490ef8d947e272a93e9463", size = 1871020 }, - { url = "https://files.pythonhosted.org/packages/86/f6/a62cbbf13f0ac80a70f71b1672feba90fdb21fd7abd8dbf25c0105fb6fa3/aiohttp-3.13.3-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9ae8dd55c8e6c4257eae3a20fd2c8f41edaea5992ed67156642493b8daf3cecc", size = 1755181 }, - { url = "https://files.pythonhosted.org/packages/0a/87/20a35ad487efdd3fba93d5843efdfaa62d2f1479eaafa7453398a44faf13/aiohttp-3.13.3-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:01ad2529d4b5035578f5081606a465f3b814c542882804e2e8cda61adf5c71bf", size = 1561794 }, - { url = 
"https://files.pythonhosted.org/packages/de/95/8fd69a66682012f6716e1bc09ef8a1a2a91922c5725cb904689f112309c4/aiohttp-3.13.3-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:bb4f7475e359992b580559e008c598091c45b5088f28614e855e42d39c2f1033", size = 1697900 }, - { url = "https://files.pythonhosted.org/packages/e5/66/7b94b3b5ba70e955ff597672dad1691333080e37f50280178967aff68657/aiohttp-3.13.3-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:c19b90316ad3b24c69cd78d5c9b4f3aa4497643685901185b65166293d36a00f", size = 1728239 }, - { url = "https://files.pythonhosted.org/packages/47/71/6f72f77f9f7d74719692ab65a2a0252584bf8d5f301e2ecb4c0da734530a/aiohttp-3.13.3-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:96d604498a7c782cb15a51c406acaea70d8c027ee6b90c569baa6e7b93073679", size = 1740527 }, - { url = "https://files.pythonhosted.org/packages/fa/b4/75ec16cbbd5c01bdaf4a05b19e103e78d7ce1ef7c80867eb0ace42ff4488/aiohttp-3.13.3-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:084911a532763e9d3dd95adf78a78f4096cd5f58cdc18e6fdbc1b58417a45423", size = 1554489 }, - { url = "https://files.pythonhosted.org/packages/52/8f/bc518c0eea29f8406dcf7ed1f96c9b48e3bc3995a96159b3fc11f9e08321/aiohttp-3.13.3-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:7a4a94eb787e606d0a09404b9c38c113d3b099d508021faa615d70a0131907ce", size = 1767852 }, - { url = "https://files.pythonhosted.org/packages/9d/f2/a07a75173124f31f11ea6f863dc44e6f09afe2bca45dd4e64979490deab1/aiohttp-3.13.3-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:87797e645d9d8e222e04160ee32aa06bc5c163e8499f24db719e7852ec23093a", size = 1722379 }, - { url = "https://files.pythonhosted.org/packages/3c/4a/1a3fee7c21350cac78e5c5cef711bac1b94feca07399f3d406972e2d8fcd/aiohttp-3.13.3-cp312-cp312-win32.whl", hash = "sha256:b04be762396457bef43f3597c991e192ee7da460a4953d7e647ee4b1c28e7046", size = 428253 }, - { url = 
"https://files.pythonhosted.org/packages/d9/b7/76175c7cb4eb73d91ad63c34e29fc4f77c9386bba4a65b53ba8e05ee3c39/aiohttp-3.13.3-cp312-cp312-win_amd64.whl", hash = "sha256:e3531d63d3bdfa7e3ac5e9b27b2dd7ec9df3206a98e0b3445fa906f233264c57", size = 455407 }, - { url = "https://files.pythonhosted.org/packages/97/8a/12ca489246ca1faaf5432844adbfce7ff2cc4997733e0af120869345643a/aiohttp-3.13.3-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:5dff64413671b0d3e7d5918ea490bdccb97a4ad29b3f311ed423200b2203e01c", size = 734190 }, - { url = "https://files.pythonhosted.org/packages/32/08/de43984c74ed1fca5c014808963cc83cb00d7bb06af228f132d33862ca76/aiohttp-3.13.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:87b9aab6d6ed88235aa2970294f496ff1a1f9adcd724d800e9b952395a80ffd9", size = 491783 }, - { url = "https://files.pythonhosted.org/packages/17/f8/8dd2cf6112a5a76f81f81a5130c57ca829d101ad583ce57f889179accdda/aiohttp-3.13.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:425c126c0dc43861e22cb1c14ba4c8e45d09516d0a3ae0a3f7494b79f5f233a3", size = 490704 }, - { url = "https://files.pythonhosted.org/packages/6d/40/a46b03ca03936f832bc7eaa47cfbb1ad012ba1be4790122ee4f4f8cba074/aiohttp-3.13.3-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7f9120f7093c2a32d9647abcaf21e6ad275b4fbec5b55969f978b1a97c7c86bf", size = 1720652 }, - { url = "https://files.pythonhosted.org/packages/f7/7e/917fe18e3607af92657e4285498f500dca797ff8c918bd7d90b05abf6c2a/aiohttp-3.13.3-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:697753042d57f4bf7122cab985bf15d0cef23c770864580f5af4f52023a56bd6", size = 1692014 }, - { url = "https://files.pythonhosted.org/packages/71/b6/cefa4cbc00d315d68973b671cf105b21a609c12b82d52e5d0c9ae61d2a09/aiohttp-3.13.3-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:6de499a1a44e7de70735d0b39f67c8f25eb3d91eb3103be99ca0fa882cdd987d", 
size = 1759777 }, - { url = "https://files.pythonhosted.org/packages/fb/e3/e06ee07b45e59e6d81498b591fc589629be1553abb2a82ce33efe2a7b068/aiohttp-3.13.3-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:37239e9f9a7ea9ac5bf6b92b0260b01f8a22281996da609206a84df860bc1261", size = 1861276 }, - { url = "https://files.pythonhosted.org/packages/7c/24/75d274228acf35ceeb2850b8ce04de9dd7355ff7a0b49d607ee60c29c518/aiohttp-3.13.3-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f76c1e3fe7d7c8afad7ed193f89a292e1999608170dcc9751a7462a87dfd5bc0", size = 1743131 }, - { url = "https://files.pythonhosted.org/packages/04/98/3d21dde21889b17ca2eea54fdcff21b27b93f45b7bb94ca029c31ab59dc3/aiohttp-3.13.3-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:fc290605db2a917f6e81b0e1e0796469871f5af381ce15c604a3c5c7e51cb730", size = 1556863 }, - { url = "https://files.pythonhosted.org/packages/9e/84/da0c3ab1192eaf64782b03971ab4055b475d0db07b17eff925e8c93b3aa5/aiohttp-3.13.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:4021b51936308aeea0367b8f006dc999ca02bc118a0cc78c303f50a2ff6afb91", size = 1682793 }, - { url = "https://files.pythonhosted.org/packages/ff/0f/5802ada182f575afa02cbd0ec5180d7e13a402afb7c2c03a9aa5e5d49060/aiohttp-3.13.3-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:49a03727c1bba9a97d3e93c9f93ca03a57300f484b6e935463099841261195d3", size = 1716676 }, - { url = "https://files.pythonhosted.org/packages/3f/8c/714d53bd8b5a4560667f7bbbb06b20c2382f9c7847d198370ec6526af39c/aiohttp-3.13.3-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:3d9908a48eb7416dc1f4524e69f1d32e5d90e3981e4e37eb0aa1cd18f9cfa2a4", size = 1733217 }, - { url = "https://files.pythonhosted.org/packages/7d/79/e2176f46d2e963facea939f5be2d26368ce543622be6f00a12844d3c991f/aiohttp-3.13.3-cp313-cp313-musllinux_1_2_riscv64.whl", hash = 
"sha256:2712039939ec963c237286113c68dbad80a82a4281543f3abf766d9d73228998", size = 1552303 }, - { url = "https://files.pythonhosted.org/packages/ab/6a/28ed4dea1759916090587d1fe57087b03e6c784a642b85ef48217b0277ae/aiohttp-3.13.3-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:7bfdc049127717581866fa4708791220970ce291c23e28ccf3922c700740fdc0", size = 1763673 }, - { url = "https://files.pythonhosted.org/packages/e8/35/4a3daeb8b9fab49240d21c04d50732313295e4bd813a465d840236dd0ce1/aiohttp-3.13.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:8057c98e0c8472d8846b9c79f56766bcc57e3e8ac7bfd510482332366c56c591", size = 1721120 }, - { url = "https://files.pythonhosted.org/packages/bc/9f/d643bb3c5fb99547323e635e251c609fbbc660d983144cfebec529e09264/aiohttp-3.13.3-cp313-cp313-win32.whl", hash = "sha256:1449ceddcdbcf2e0446957863af03ebaaa03f94c090f945411b61269e2cb5daf", size = 427383 }, - { url = "https://files.pythonhosted.org/packages/4e/f1/ab0395f8a79933577cdd996dd2f9aa6014af9535f65dddcf88204682fe62/aiohttp-3.13.3-cp313-cp313-win_amd64.whl", hash = "sha256:693781c45a4033d31d4187d2436f5ac701e7bbfe5df40d917736108c1cc7436e", size = 453899 }, - { url = "https://files.pythonhosted.org/packages/99/36/5b6514a9f5d66f4e2597e40dea2e3db271e023eb7a5d22defe96ba560996/aiohttp-3.13.3-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:ea37047c6b367fd4bd632bff8077449b8fa034b69e812a18e0132a00fae6e808", size = 737238 }, - { url = "https://files.pythonhosted.org/packages/f7/49/459327f0d5bcd8c6c9ca69e60fdeebc3622861e696490d8674a6d0cb90a6/aiohttp-3.13.3-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:6fc0e2337d1a4c3e6acafda6a78a39d4c14caea625124817420abceed36e2415", size = 492292 }, - { url = "https://files.pythonhosted.org/packages/e8/0b/b97660c5fd05d3495b4eb27f2d0ef18dc1dc4eff7511a9bf371397ff0264/aiohttp-3.13.3-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:c685f2d80bb67ca8c3837823ad76196b3694b0159d232206d1e461d3d434666f", size = 493021 }, - { url = 
"https://files.pythonhosted.org/packages/54/d4/438efabdf74e30aeceb890c3290bbaa449780583b1270b00661126b8aae4/aiohttp-3.13.3-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:48e377758516d262bde50c2584fc6c578af272559c409eecbdd2bae1601184d6", size = 1717263 }, - { url = "https://files.pythonhosted.org/packages/71/f2/7bddc7fd612367d1459c5bcf598a9e8f7092d6580d98de0e057eb42697ad/aiohttp-3.13.3-cp314-cp314-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:34749271508078b261c4abb1767d42b8d0c0cc9449c73a4df494777dc55f0687", size = 1669107 }, - { url = "https://files.pythonhosted.org/packages/00/5a/1aeaecca40e22560f97610a329e0e5efef5e0b5afdf9f857f0d93839ab2e/aiohttp-3.13.3-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:82611aeec80eb144416956ec85b6ca45a64d76429c1ed46ae1b5f86c6e0c9a26", size = 1760196 }, - { url = "https://files.pythonhosted.org/packages/f8/f8/0ff6992bea7bd560fc510ea1c815f87eedd745fe035589c71ce05612a19a/aiohttp-3.13.3-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:2fff83cfc93f18f215896e3a190e8e5cb413ce01553901aca925176e7568963a", size = 1843591 }, - { url = "https://files.pythonhosted.org/packages/e3/d1/e30e537a15f53485b61f5be525f2157da719819e8377298502aebac45536/aiohttp-3.13.3-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:bbe7d4cecacb439e2e2a8a1a7b935c25b812af7a5fd26503a66dadf428e79ec1", size = 1720277 }, - { url = "https://files.pythonhosted.org/packages/84/45/23f4c451d8192f553d38d838831ebbc156907ea6e05557f39563101b7717/aiohttp-3.13.3-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:b928f30fe49574253644b1ca44b1b8adbd903aa0da4b9054a6c20fc7f4092a25", size = 1548575 }, - { url = 
"https://files.pythonhosted.org/packages/6a/ed/0a42b127a43712eda7807e7892c083eadfaf8429ca8fb619662a530a3aab/aiohttp-3.13.3-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:7b5e8fe4de30df199155baaf64f2fcd604f4c678ed20910db8e2c66dc4b11603", size = 1679455 }, - { url = "https://files.pythonhosted.org/packages/2e/b5/c05f0c2b4b4fe2c9d55e73b6d3ed4fd6c9dc2684b1d81cbdf77e7fad9adb/aiohttp-3.13.3-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:8542f41a62bcc58fc7f11cf7c90e0ec324ce44950003feb70640fc2a9092c32a", size = 1687417 }, - { url = "https://files.pythonhosted.org/packages/c9/6b/915bc5dad66aef602b9e459b5a973529304d4e89ca86999d9d75d80cbd0b/aiohttp-3.13.3-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:5e1d8c8b8f1d91cd08d8f4a3c2b067bfca6ec043d3ff36de0f3a715feeedf926", size = 1729968 }, - { url = "https://files.pythonhosted.org/packages/11/3b/e84581290a9520024a08640b63d07673057aec5ca548177a82026187ba73/aiohttp-3.13.3-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:90455115e5da1c3c51ab619ac57f877da8fd6d73c05aacd125c5ae9819582aba", size = 1545690 }, - { url = "https://files.pythonhosted.org/packages/f5/04/0c3655a566c43fd647c81b895dfe361b9f9ad6d58c19309d45cff52d6c3b/aiohttp-3.13.3-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:042e9e0bcb5fba81886c8b4fbb9a09d6b8a00245fd8d88e4d989c1f96c74164c", size = 1746390 }, - { url = "https://files.pythonhosted.org/packages/1f/53/71165b26978f719c3419381514c9690bd5980e764a09440a10bb816ea4ab/aiohttp-3.13.3-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:2eb752b102b12a76ca02dff751a801f028b4ffbbc478840b473597fc91a9ed43", size = 1702188 }, - { url = "https://files.pythonhosted.org/packages/29/a7/cbe6c9e8e136314fa1980da388a59d2f35f35395948a08b6747baebb6aa6/aiohttp-3.13.3-cp314-cp314-win32.whl", hash = "sha256:b556c85915d8efaed322bf1bdae9486aa0f3f764195a0fb6ee962e5c71ef5ce1", size = 433126 }, - { url = 
"https://files.pythonhosted.org/packages/de/56/982704adea7d3b16614fc5936014e9af85c0e34b58f9046655817f04306e/aiohttp-3.13.3-cp314-cp314-win_amd64.whl", hash = "sha256:9bf9f7a65e7aa20dd764151fb3d616c81088f91f8df39c3893a536e279b4b984", size = 459128 }, - { url = "https://files.pythonhosted.org/packages/6c/2a/3c79b638a9c3d4658d345339d22070241ea341ed4e07b5ac60fb0f418003/aiohttp-3.13.3-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:05861afbbec40650d8a07ea324367cb93e9e8cc7762e04dd4405df99fa65159c", size = 769512 }, - { url = "https://files.pythonhosted.org/packages/29/b9/3e5014d46c0ab0db8707e0ac2711ed28c4da0218c358a4e7c17bae0d8722/aiohttp-3.13.3-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:2fc82186fadc4a8316768d61f3722c230e2c1dcab4200d52d2ebdf2482e47592", size = 506444 }, - { url = "https://files.pythonhosted.org/packages/90/03/c1d4ef9a054e151cd7839cdc497f2638f00b93cbe8043983986630d7a80c/aiohttp-3.13.3-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:0add0900ff220d1d5c5ebbf99ed88b0c1bbf87aa7e4262300ed1376a6b13414f", size = 510798 }, - { url = "https://files.pythonhosted.org/packages/ea/76/8c1e5abbfe8e127c893fe7ead569148a4d5a799f7cf958d8c09f3eedf097/aiohttp-3.13.3-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:568f416a4072fbfae453dcf9a99194bbb8bdeab718e08ee13dfa2ba0e4bebf29", size = 1868835 }, - { url = "https://files.pythonhosted.org/packages/8e/ac/984c5a6f74c363b01ff97adc96a3976d9c98940b8969a1881575b279ac5d/aiohttp-3.13.3-cp314-cp314t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:add1da70de90a2569c5e15249ff76a631ccacfe198375eead4aadf3b8dc849dc", size = 1720486 }, - { url = "https://files.pythonhosted.org/packages/b2/9a/b7039c5f099c4eb632138728828b33428585031a1e658d693d41d07d89d1/aiohttp-3.13.3-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = 
"sha256:10b47b7ba335d2e9b1239fa571131a87e2d8ec96b333e68b2a305e7a98b0bae2", size = 1847951 }, - { url = "https://files.pythonhosted.org/packages/3c/02/3bec2b9a1ba3c19ff89a43a19324202b8eb187ca1e928d8bdac9bbdddebd/aiohttp-3.13.3-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:3dd4dce1c718e38081c8f35f323209d4c1df7d4db4bab1b5c88a6b4d12b74587", size = 1941001 }, - { url = "https://files.pythonhosted.org/packages/37/df/d879401cedeef27ac4717f6426c8c36c3091c6e9f08a9178cc87549c537f/aiohttp-3.13.3-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:34bac00a67a812570d4a460447e1e9e06fae622946955f939051e7cc895cfab8", size = 1797246 }, - { url = "https://files.pythonhosted.org/packages/8d/15/be122de1f67e6953add23335c8ece6d314ab67c8bebb3f181063010795a7/aiohttp-3.13.3-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:a19884d2ee70b06d9204b2727a7b9f983d0c684c650254679e716b0b77920632", size = 1627131 }, - { url = "https://files.pythonhosted.org/packages/12/12/70eedcac9134cfa3219ab7af31ea56bc877395b1ac30d65b1bc4b27d0438/aiohttp-3.13.3-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:5f8ca7f2bb6ba8348a3614c7918cc4bb73268c5ac2a207576b7afea19d3d9f64", size = 1795196 }, - { url = "https://files.pythonhosted.org/packages/32/11/b30e1b1cd1f3054af86ebe60df96989c6a414dd87e27ad16950eee420bea/aiohttp-3.13.3-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:b0d95340658b9d2f11d9697f59b3814a9d3bb4b7a7c20b131df4bcef464037c0", size = 1782841 }, - { url = "https://files.pythonhosted.org/packages/88/0d/d98a9367b38912384a17e287850f5695c528cff0f14f791ce8ee2e4f7796/aiohttp-3.13.3-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = "sha256:a1e53262fd202e4b40b70c3aff944a8155059beedc8a89bba9dc1f9ef06a1b56", size = 1795193 }, - { url = 
"https://files.pythonhosted.org/packages/43/a5/a2dfd1f5ff5581632c7f6a30e1744deda03808974f94f6534241ef60c751/aiohttp-3.13.3-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:d60ac9663f44168038586cab2157e122e46bdef09e9368b37f2d82d354c23f72", size = 1621979 }, - { url = "https://files.pythonhosted.org/packages/fa/f0/12973c382ae7c1cccbc4417e129c5bf54c374dfb85af70893646e1f0e749/aiohttp-3.13.3-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:90751b8eed69435bac9ff4e3d2f6b3af1f57e37ecb0fbeee59c0174c9e2d41df", size = 1822193 }, - { url = "https://files.pythonhosted.org/packages/3c/5f/24155e30ba7f8c96918af1350eb0663e2430aad9e001c0489d89cd708ab1/aiohttp-3.13.3-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:fc353029f176fd2b3ec6cfc71be166aba1936fe5d73dd1992ce289ca6647a9aa", size = 1769801 }, - { url = "https://files.pythonhosted.org/packages/eb/f8/7314031ff5c10e6ece114da79b338ec17eeff3a079e53151f7e9f43c4723/aiohttp-3.13.3-cp314-cp314t-win32.whl", hash = "sha256:2e41b18a58da1e474a057b3d35248d8320029f61d70a37629535b16a0c8f3767", size = 466523 }, - { url = "https://files.pythonhosted.org/packages/b4/63/278a98c715ae467624eafe375542d8ba9b4383a016df8fdefe0ae28382a7/aiohttp-3.13.3-cp314-cp314t-win_amd64.whl", hash = "sha256:44531a36aa2264a1860089ffd4dce7baf875ee5a6079d5fb42e261c704ef7344", size = 499694 }, + { url = "https://files.pythonhosted.org/packages/be/6f/353954c29e7dcce7cf00280a02c75f30e133c00793c7a2ed3776d7b2f426/aiohttp-3.13.5-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:023ecba036ddd840b0b19bf195bfae970083fd7024ce1ac22e9bba90464620e9", size = 748876 }, + { url = "https://files.pythonhosted.org/packages/f5/1b/428a7c64687b3b2e9cd293186695affc0e1e54a445d0361743b231f11066/aiohttp-3.13.5-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:15c933ad7920b7d9a20de151efcd05a6e38302cbf0e10c9b2acb9a42210a2416", size = 499557 }, + { url = 
"https://files.pythonhosted.org/packages/29/47/7be41556bfbb6917069d6a6634bb7dd5e163ba445b783a90d40f5ac7e3a7/aiohttp-3.13.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ab2899f9fa2f9f741896ebb6fa07c4c883bfa5c7f2ddd8cf2aafa86fa981b2d2", size = 500258 }, + { url = "https://files.pythonhosted.org/packages/67/84/c9ecc5828cb0b3695856c07c0a6817a99d51e2473400f705275a2b3d9239/aiohttp-3.13.5-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a60eaa2d440cd4707696b52e40ed3e2b0f73f65be07fd0ef23b6b539c9c0b0b4", size = 1749199 }, + { url = "https://files.pythonhosted.org/packages/f0/d3/3c6d610e66b495657622edb6ae7c7fd31b2e9086b4ec50b47897ad6042a9/aiohttp-3.13.5-cp312-cp312-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:55b3bdd3292283295774ab585160c4004f4f2f203946997f49aac032c84649e9", size = 1721013 }, + { url = "https://files.pythonhosted.org/packages/49/a0/24409c12217456df0bae7babe3b014e460b0b38a8e60753d6cb339f6556d/aiohttp-3.13.5-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c2b2355dc094e5f7d45a7bb262fe7207aa0460b37a0d87027dcf21b5d890e7d5", size = 1781501 }, + { url = "https://files.pythonhosted.org/packages/98/9d/b65ec649adc5bccc008b0957a9a9c691070aeac4e41cea18559fef49958b/aiohttp-3.13.5-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:b38765950832f7d728297689ad78f5f2cf79ff82487131c4d26fe6ceecdc5f8e", size = 1878981 }, + { url = "https://files.pythonhosted.org/packages/57/d8/8d44036d7eb7b6a8ec4c5494ea0c8c8b94fbc0ed3991c1a7adf230df03bf/aiohttp-3.13.5-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b18f31b80d5a33661e08c89e202edabf1986e9b49c42b4504371daeaa11b47c1", size = 1767934 }, + { url = 
"https://files.pythonhosted.org/packages/31/04/d3f8211f273356f158e3464e9e45484d3fb8c4ce5eb2f6fe9405c3273983/aiohttp-3.13.5-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:33add2463dde55c4f2d9635c6ab33ce154e5ecf322bd26d09af95c5f81cfa286", size = 1566671 }, + { url = "https://files.pythonhosted.org/packages/41/db/073e4ebe00b78e2dfcacff734291651729a62953b48933d765dc513bf798/aiohttp-3.13.5-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:327cc432fdf1356fb4fbc6fe833ad4e9f6aacb71a8acaa5f1855e4b25910e4a9", size = 1705219 }, + { url = "https://files.pythonhosted.org/packages/48/45/7dfba71a2f9fd97b15c95c06819de7eb38113d2cdb6319669195a7d64270/aiohttp-3.13.5-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:7c35b0bf0b48a70b4cb4fc5d7bed9b932532728e124874355de1a0af8ec4bc88", size = 1743049 }, + { url = "https://files.pythonhosted.org/packages/18/71/901db0061e0f717d226386a7f471bb59b19566f2cae5f0d93874b017271f/aiohttp-3.13.5-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:df23d57718f24badef8656c49743e11a89fd6f5358fa8a7b96e728fda2abf7d3", size = 1749557 }, + { url = "https://files.pythonhosted.org/packages/08/d5/41eebd16066e59cd43728fe74bce953d7402f2b4ddfdfef2c0e9f17ca274/aiohttp-3.13.5-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:02e048037a6501a5ec1f6fc9736135aec6eb8a004ce48838cb951c515f32c80b", size = 1558931 }, + { url = "https://files.pythonhosted.org/packages/30/e6/4a799798bf05740e66c3a1161079bda7a3dd8e22ca392481d7a7f9af82a6/aiohttp-3.13.5-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:31cebae8b26f8a615d2b546fee45d5ffb76852ae6450e2a03f42c9102260d6fe", size = 1774125 }, + { url = "https://files.pythonhosted.org/packages/84/63/7749337c90f92bc2cb18f9560d67aa6258c7060d1397d21529b8004fcf6f/aiohttp-3.13.5-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:888e78eb5ca55a615d285c3c09a7a91b42e9dd6fc699b166ebd5dee87c9ccf14", size = 1732427 }, + { url = 
"https://files.pythonhosted.org/packages/98/de/cf2f44ff98d307e72fb97d5f5bbae3bfcb442f0ea9790c0bf5c5c2331404/aiohttp-3.13.5-cp312-cp312-win32.whl", hash = "sha256:8bd3ec6376e68a41f9f95f5ed170e2fcf22d4eb27a1f8cb361d0508f6e0557f3", size = 433534 }, + { url = "https://files.pythonhosted.org/packages/aa/ca/eadf6f9c8fa5e31d40993e3db153fb5ed0b11008ad5d9de98a95045bed84/aiohttp-3.13.5-cp312-cp312-win_amd64.whl", hash = "sha256:110e448e02c729bcebb18c60b9214a87ba33bac4a9fa5e9a5f139938b56c6cb1", size = 460446 }, + { url = "https://files.pythonhosted.org/packages/78/e9/d76bf503005709e390122d34e15256b88f7008e246c4bdbe915cd4f1adce/aiohttp-3.13.5-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:a5029cc80718bbd545123cd8fe5d15025eccaaaace5d0eeec6bd556ad6163d61", size = 742930 }, + { url = "https://files.pythonhosted.org/packages/57/00/4b7b70223deaebd9bb85984d01a764b0d7bd6526fcdc73cca83bcbe7243e/aiohttp-3.13.5-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:4bb6bf5811620003614076bdc807ef3b5e38244f9d25ca5fe888eaccea2a9832", size = 496927 }, + { url = "https://files.pythonhosted.org/packages/9c/f5/0fb20fb49f8efdcdce6cd8127604ad2c503e754a8f139f5e02b01626523f/aiohttp-3.13.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:a84792f8631bf5a94e52d9cc881c0b824ab42717165a5579c760b830d9392ac9", size = 497141 }, + { url = "https://files.pythonhosted.org/packages/3b/86/b7c870053e36a94e8951b803cb5b909bfbc9b90ca941527f5fcafbf6b0fa/aiohttp-3.13.5-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:57653eac22c6a4c13eb22ecf4d673d64a12f266e72785ab1c8b8e5940d0e8090", size = 1732476 }, + { url = "https://files.pythonhosted.org/packages/b5/e5/4e161f84f98d80c03a238671b4136e6530453d65262867d989bbe78244d0/aiohttp-3.13.5-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:e5e5f7debc7a57af53fdf5c5009f9391d9f4c12867049d509bf7bb164a6e295b", size = 1706507 }, + { url = 
"https://files.pythonhosted.org/packages/d4/56/ea11a9f01518bd5a2a2fcee869d248c4b8a0cfa0bb13401574fa31adf4d4/aiohttp-3.13.5-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:c719f65bebcdf6716f10e9eff80d27567f7892d8988c06de12bbbd39307c6e3a", size = 1773465 }, + { url = "https://files.pythonhosted.org/packages/eb/40/333ca27fb74b0383f17c90570c748f7582501507307350a79d9f9f3c6eb1/aiohttp-3.13.5-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d97f93fdae594d886c5a866636397e2bcab146fd7a132fd6bb9ce182224452f8", size = 1873523 }, + { url = "https://files.pythonhosted.org/packages/f0/d2/e2f77eef1acb7111405433c707dc735e63f67a56e176e72e9e7a2cd3f493/aiohttp-3.13.5-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:3df334e39d4c2f899a914f1dba283c1aadc311790733f705182998c6f7cae665", size = 1754113 }, + { url = "https://files.pythonhosted.org/packages/fb/56/3f653d7f53c89669301ec9e42c95233e2a0c0a6dd051269e6e678db4fdb0/aiohttp-3.13.5-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:fe6970addfea9e5e081401bcbadf865d2b6da045472f58af08427e108d618540", size = 1562351 }, + { url = "https://files.pythonhosted.org/packages/ec/a6/9b3e91eb8ae791cce4ee736da02211c85c6f835f1bdfac0594a8a3b7018c/aiohttp-3.13.5-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:7becdf835feff2f4f335d7477f121af787e3504b48b449ff737afb35869ba7bb", size = 1693205 }, + { url = "https://files.pythonhosted.org/packages/98/fc/bfb437a99a2fcebd6b6eaec609571954de2ed424f01c352f4b5504371dd3/aiohttp-3.13.5-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:676e5651705ad5d8a70aeb8eb6936c436d8ebbd56e63436cb7dd9bb36d2a9a46", size = 1730618 }, + { url = "https://files.pythonhosted.org/packages/e4/b6/c8534862126191a034f68153194c389addc285a0f1347d85096d349bbc15/aiohttp-3.13.5-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = 
"sha256:9b16c653d38eb1a611cc898c41e76859ca27f119d25b53c12875fd0474ae31a8", size = 1745185 }, + { url = "https://files.pythonhosted.org/packages/0b/93/4ca8ee2ef5236e2707e0fd5fecb10ce214aee1ff4ab307af9c558bda3b37/aiohttp-3.13.5-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:999802d5fa0389f58decd24b537c54aa63c01c3219ce17d1214cbda3c2b22d2d", size = 1557311 }, + { url = "https://files.pythonhosted.org/packages/57/ae/76177b15f18c5f5d094f19901d284025db28eccc5ae374d1d254181d33f4/aiohttp-3.13.5-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:ec707059ee75732b1ba130ed5f9580fe10ff75180c812bc267ded039db5128c6", size = 1773147 }, + { url = "https://files.pythonhosted.org/packages/01/a4/62f05a0a98d88af59d93b7fcac564e5f18f513cb7471696ac286db970d6a/aiohttp-3.13.5-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:2d6d44a5b48132053c2f6cd5c8cb14bc67e99a63594e336b0f2af81e94d5530c", size = 1730356 }, + { url = "https://files.pythonhosted.org/packages/e4/85/fc8601f59dfa8c9523808281f2da571f8b4699685f9809a228adcc90838d/aiohttp-3.13.5-cp313-cp313-win32.whl", hash = "sha256:329f292ed14d38a6c4c435e465f48bebb47479fd676a0411936cc371643225cc", size = 432637 }, + { url = "https://files.pythonhosted.org/packages/c0/1b/ac685a8882896acf0f6b31d689e3792199cfe7aba37969fa91da63a7fa27/aiohttp-3.13.5-cp313-cp313-win_amd64.whl", hash = "sha256:69f571de7500e0557801c0b51f4780482c0ec5fe2ac851af5a92cfce1af1cb83", size = 458896 }, + { url = "https://files.pythonhosted.org/packages/5d/ce/46572759afc859e867a5bc8ec3487315869013f59281ce61764f76d879de/aiohttp-3.13.5-cp314-cp314-macosx_10_13_universal2.whl", hash = "sha256:eb4639f32fd4a9904ab8fb45bf3383ba71137f3d9d4ba25b3b3f3109977c5b8c", size = 745721 }, + { url = "https://files.pythonhosted.org/packages/13/fe/8a2efd7626dbe6049b2ef8ace18ffda8a4dfcbe1bcff3ac30c0c7575c20b/aiohttp-3.13.5-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:7e5dc4311bd5ac493886c63cbf76ab579dbe4641268e7c74e48e774c74b6f2be", size = 497663 }, + { url = 
"https://files.pythonhosted.org/packages/9b/91/cc8cc78a111826c54743d88651e1687008133c37e5ee615fee9b57990fac/aiohttp-3.13.5-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:756c3c304d394977519824449600adaf2be0ccee76d206ee339c5e76b70ded25", size = 499094 }, + { url = "https://files.pythonhosted.org/packages/0a/33/a8362cb15cf16a3af7e86ed11962d5cd7d59b449202dc576cdc731310bde/aiohttp-3.13.5-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ecc26751323224cf8186efcf7fbcbc30f4e1d8c7970659daf25ad995e4032a56", size = 1726701 }, + { url = "https://files.pythonhosted.org/packages/45/0c/c091ac5c3a17114bd76cbf85d674650969ddf93387876cf67f754204bd77/aiohttp-3.13.5-cp314-cp314-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:10a75acfcf794edf9d8db50e5a7ec5fc818b2a8d3f591ce93bc7b1210df016d2", size = 1683360 }, + { url = "https://files.pythonhosted.org/packages/23/73/bcee1c2b79bc275e964d1446c55c54441a461938e70267c86afaae6fba27/aiohttp-3.13.5-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:0f7a18f258d124cd678c5fe072fe4432a4d5232b0657fca7c1847f599233c83a", size = 1773023 }, + { url = "https://files.pythonhosted.org/packages/c7/ef/720e639df03004fee2d869f771799d8c23046dec47d5b81e396c7cda583a/aiohttp-3.13.5-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:df6104c009713d3a89621096f3e3e88cc323fd269dbd7c20afe18535094320be", size = 1853795 }, + { url = "https://files.pythonhosted.org/packages/bd/c9/989f4034fb46841208de7aeeac2c6d8300745ab4f28c42f629ba77c2d916/aiohttp-3.13.5-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:241a94f7de7c0c3b616627aaad530fe2cb620084a8b144d3be7b6ecfe95bae3b", size = 1730405 }, + { url = 
"https://files.pythonhosted.org/packages/ce/75/ee1fd286ca7dc599d824b5651dad7b3be7ff8d9a7e7b3fe9820d9180f7db/aiohttp-3.13.5-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:c974fb66180e58709b6fc402846f13791240d180b74de81d23913abe48e96d94", size = 1558082 }, + { url = "https://files.pythonhosted.org/packages/c3/20/1e9e6650dfc436340116b7aa89ff8cb2bbdf0abc11dfaceaad8f74273a10/aiohttp-3.13.5-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:6e27ea05d184afac78aabbac667450c75e54e35f62238d44463131bd3f96753d", size = 1692346 }, + { url = "https://files.pythonhosted.org/packages/d8/40/8ebc6658d48ea630ac7903912fe0dd4e262f0e16825aa4c833c56c9f1f56/aiohttp-3.13.5-cp314-cp314-musllinux_1_2_armv7l.whl", hash = "sha256:a79a6d399cef33a11b6f004c67bb07741d91f2be01b8d712d52c75711b1e07c7", size = 1698891 }, + { url = "https://files.pythonhosted.org/packages/d8/78/ea0ae5ec8ba7a5c10bdd6e318f1ba5e76fcde17db8275188772afc7917a4/aiohttp-3.13.5-cp314-cp314-musllinux_1_2_ppc64le.whl", hash = "sha256:c632ce9c0b534fbe25b52c974515ed674937c5b99f549a92127c85f771a78772", size = 1742113 }, + { url = "https://files.pythonhosted.org/packages/8a/66/9d308ed71e3f2491be1acb8769d96c6f0c47d92099f3bc9119cada27b357/aiohttp-3.13.5-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:fceedde51fbd67ee2bcc8c0b33d0126cc8b51ef3bbde2f86662bd6d5a6f10ec5", size = 1553088 }, + { url = "https://files.pythonhosted.org/packages/da/a6/6cc25ed8dfc6e00c90f5c6d126a98e2cf28957ad06fa1036bd34b6f24a2c/aiohttp-3.13.5-cp314-cp314-musllinux_1_2_s390x.whl", hash = "sha256:f92995dfec9420bb69ae629abf422e516923ba79ba4403bc750d94fb4a6c68c1", size = 1757976 }, + { url = "https://files.pythonhosted.org/packages/c1/2b/cce5b0ffe0de99c83e5e36d8f828e4161e415660a9f3e58339d07cce3006/aiohttp-3.13.5-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:20ae0ff08b1f2c8788d6fb85afcb798654ae6ba0b747575f8562de738078457b", size = 1712444 }, + { url = 
"https://files.pythonhosted.org/packages/6c/cf/9e1795b4160c58d29421eafd1a69c6ce351e2f7c8d3c6b7e4ca44aea1a5b/aiohttp-3.13.5-cp314-cp314-win32.whl", hash = "sha256:b20df693de16f42b2472a9c485e1c948ee55524786a0a34345511afdd22246f3", size = 438128 }, + { url = "https://files.pythonhosted.org/packages/22/4d/eaedff67fc805aeba4ba746aec891b4b24cebb1a7d078084b6300f79d063/aiohttp-3.13.5-cp314-cp314-win_amd64.whl", hash = "sha256:f85c6f327bf0b8c29da7d93b1cabb6363fb5e4e160a32fa241ed2dce21b73162", size = 464029 }, + { url = "https://files.pythonhosted.org/packages/79/11/c27d9332ee20d68dd164dc12a6ecdef2e2e35ecc97ed6cf0d2442844624b/aiohttp-3.13.5-cp314-cp314t-macosx_10_13_universal2.whl", hash = "sha256:1efb06900858bb618ff5cee184ae2de5828896c448403d51fb633f09e109be0a", size = 778758 }, + { url = "https://files.pythonhosted.org/packages/04/fb/377aead2e0a3ba5f09b7624f702a964bdf4f08b5b6728a9799830c80041e/aiohttp-3.13.5-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:fee86b7c4bd29bdaf0d53d14739b08a106fdda809ca5fe032a15f52fae5fe254", size = 512883 }, + { url = "https://files.pythonhosted.org/packages/bb/a6/aa109a33671f7a5d3bd78b46da9d852797c5e665bfda7d6b373f56bff2ec/aiohttp-3.13.5-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:20058e23909b9e65f9da62b396b77dfa95965cbe840f8def6e572538b1d32e36", size = 516668 }, + { url = "https://files.pythonhosted.org/packages/79/b3/ca078f9f2fa9563c36fb8ef89053ea2bb146d6f792c5104574d49d8acb63/aiohttp-3.13.5-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8cf20a8d6868cb15a73cab329ffc07291ba8c22b1b88176026106ae39aa6df0f", size = 1883461 }, + { url = "https://files.pythonhosted.org/packages/b7/e3/a7ad633ca1ca497b852233a3cce6906a56c3225fb6d9217b5e5e60b7419d/aiohttp-3.13.5-cp314-cp314t-manylinux2014_armv7l.manylinux_2_17_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:330f5da04c987f1d5bdb8ae189137c77139f36bd1cb23779ca1a354a4b027800", size = 1747661 }, + { url = 
"https://files.pythonhosted.org/packages/33/b9/cd6fe579bed34a906d3d783fe60f2fa297ef55b27bb4538438ee49d4dc41/aiohttp-3.13.5-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:6f1cbf0c7926d315c3c26c2da41fd2b5d2fe01ac0e157b78caefc51a782196cf", size = 1863800 }, + { url = "https://files.pythonhosted.org/packages/c0/3f/2c1e2f5144cefa889c8afd5cf431994c32f3b29da9961698ff4e3811b79a/aiohttp-3.13.5-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:53fc049ed6390d05423ba33103ded7281fe897cf97878f369a527070bd95795b", size = 1958382 }, + { url = "https://files.pythonhosted.org/packages/66/1d/f31ec3f1013723b3babe3609e7f119c2c2fb6ef33da90061a705ef3e1bc8/aiohttp-3.13.5-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:898703aa2667e3c5ca4c54ca36cd73f58b7a38ef87a5606414799ebce4d3fd3a", size = 1803724 }, + { url = "https://files.pythonhosted.org/packages/0e/b4/57712dfc6f1542f067daa81eb61da282fab3e6f1966fca25db06c4fc62d5/aiohttp-3.13.5-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:0494a01ca9584eea1e5fbd6d748e61ecff218c51b576ee1999c23db7066417d8", size = 1640027 }, + { url = "https://files.pythonhosted.org/packages/25/3c/734c878fb43ec083d8e31bf029daae1beafeae582d1b35da234739e82ee7/aiohttp-3.13.5-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:6cf81fe010b8c17b09495cbd15c1d35afbc8fb405c0c9cf4738e5ae3af1d65be", size = 1806644 }, + { url = "https://files.pythonhosted.org/packages/20/a5/f671e5cbec1c21d044ff3078223f949748f3a7f86b14e34a365d74a5d21f/aiohttp-3.13.5-cp314-cp314t-musllinux_1_2_armv7l.whl", hash = "sha256:c564dd5f09ddc9d8f2c2d0a301cd30a79a2cc1b46dd1a73bef8f0038863d016b", size = 1791630 }, + { url = "https://files.pythonhosted.org/packages/0b/63/fb8d0ad63a0b8a99be97deac8c04dacf0785721c158bdf23d679a87aa99e/aiohttp-3.13.5-cp314-cp314t-musllinux_1_2_ppc64le.whl", hash = 
"sha256:2994be9f6e51046c4f864598fd9abeb4fba6e88f0b2152422c9666dcd4aea9c6", size = 1809403 }, + { url = "https://files.pythonhosted.org/packages/59/0c/bfed7f30662fcf12206481c2aac57dedee43fe1c49275e85b3a1e1742294/aiohttp-3.13.5-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:157826e2fa245d2ef46c83ea8a5faf77ca19355d278d425c29fda0beb3318037", size = 1634924 }, + { url = "https://files.pythonhosted.org/packages/17/d6/fd518d668a09fd5a3319ae5e984d4d80b9a4b3df4e21c52f02251ef5a32e/aiohttp-3.13.5-cp314-cp314t-musllinux_1_2_s390x.whl", hash = "sha256:a8aca50daa9493e9e13c0f566201a9006f080e7c50e5e90d0b06f53146a54500", size = 1836119 }, + { url = "https://files.pythonhosted.org/packages/78/b7/15fb7a9d52e112a25b621c67b69c167805cb1f2ab8f1708a5c490d1b52fe/aiohttp-3.13.5-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:3b13560160d07e047a93f23aaa30718606493036253d5430887514715b67c9d9", size = 1772072 }, + { url = "https://files.pythonhosted.org/packages/7e/df/57ba7f0c4a553fc2bd8b6321df236870ec6fd64a2a473a8a13d4f733214e/aiohttp-3.13.5-cp314-cp314t-win32.whl", hash = "sha256:9a0f4474b6ea6818b41f82172d799e4b3d29e22c2c520ce4357856fced9af2f8", size = 471819 }, + { url = "https://files.pythonhosted.org/packages/62/29/2f8418269e46454a26171bfdd6a055d74febf32234e474930f2f60a17145/aiohttp-3.13.5-cp314-cp314t-win_amd64.whl", hash = "sha256:18a2f6c1182c51baa1d28d68fea51513cb2a76612f038853c0ad3c145423d3d9", size = 505441 }, ] [[package]] @@ -1023,14 +1023,14 @@ wheels = [ [[package]] name = "click" -version = "8.3.1" +version = "8.1.8" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "colorama", marker = "sys_platform == 'win32'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/3d/fa/656b739db8587d7b5dfa22e22ed02566950fbfbcdc20311993483657a5c0/click-8.3.1.tar.gz", hash = "sha256:12ff4785d337a1bb490bb7e9c2b1ee5da3112e94a8622f26a6c77f5d2fc6842a", size = 295065 } +sdist = { url = 
"https://files.pythonhosted.org/packages/b9/2e/0090cbf739cee7d23781ad4b89a9894a41538e4fcf4c31dcdd705b78eb8b/click-8.1.8.tar.gz", hash = "sha256:ed53c9d8990d83c2a27deae68e4ee337473f6330c040a31d4225c9574d16096a", size = 226593 } wheels = [ - { url = "https://files.pythonhosted.org/packages/98/78/01c019cdb5d6498122777c1a43056ebb3ebfeef2076d9d026bfe15583b2b/click-8.3.1-py3-none-any.whl", hash = "sha256:981153a64e25f12d547d3426c367a4857371575ee7ad18df2a6183ab0545b2a6", size = 108274 }, + { url = "https://files.pythonhosted.org/packages/7e/d4/7ebdbd03970677812aac39c869717059dbb71a4cfc033ca6e5221787892c/click-8.1.8-py3-none-any.whl", hash = "sha256:63c132bbbed01578a06712a2d1f497bb62d9c1c0d329b7903a866228027263b2", size = 98188 }, ] [[package]] @@ -2984,14 +2984,14 @@ wheels = [ [[package]] name = "importlib-metadata" -version = "8.7.1" +version = "8.5.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "zipp" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/f3/49/3b30cad09e7771a4982d9975a8cbf64f00d4a1ececb53297f1d9a7be1b10/importlib_metadata-8.7.1.tar.gz", hash = "sha256:49fef1ae6440c182052f407c8d34a68f72efc36db9ca90dc0113398f2fdde8bb", size = 57107 } +sdist = { url = "https://files.pythonhosted.org/packages/cd/12/33e59336dca5be0c398a7482335911a33aa0e20776128f038019f1a95f1b/importlib_metadata-8.5.0.tar.gz", hash = "sha256:71522656f0abace1d072b9e5481a48f07c138e00f079c38c8f883823f9c26bd7", size = 55304 } wheels = [ - { url = "https://files.pythonhosted.org/packages/fa/5e/f8e9a1d23b9c20a551a8a02ea3637b4642e22c2626e3a13a9a29cdea99eb/importlib_metadata-8.7.1-py3-none-any.whl", hash = "sha256:5a1f80bf1daa489495071efbb095d75a634cf28a8bc299581244063b53176151", size = 27865 }, + { url = "https://files.pythonhosted.org/packages/a0/d9/a1e041c5e7caa9a05c925f4bdbdfb7f006d1f74996af53467bc394c97be7/importlib_metadata-8.5.0-py3-none-any.whl", hash = "sha256:45e54197d28b7a7f1559e60b95e7c567032b602131fbd588f1497f47880aa68b", size = 26514 }, ] 
[[package]] @@ -3222,7 +3222,7 @@ wheels = [ [[package]] name = "jsonschema" -version = "4.26.0" +version = "4.23.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "attrs" }, @@ -3230,9 +3230,9 @@ dependencies = [ { name = "referencing" }, { name = "rpds-py" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/b3/fc/e067678238fa451312d4c62bf6e6cf5ec56375422aee02f9cb5f909b3047/jsonschema-4.26.0.tar.gz", hash = "sha256:0c26707e2efad8aa1bfc5b7ce170f3fccc2e4918ff85989ba9ffa9facb2be326", size = 366583 } +sdist = { url = "https://files.pythonhosted.org/packages/38/2e/03362ee4034a4c917f697890ccd4aec0800ccf9ded7f511971c75451deec/jsonschema-4.23.0.tar.gz", hash = "sha256:d71497fef26351a33265337fa77ffeb82423f3ea21283cd9467bb03999266bc4", size = 325778 } wheels = [ - { url = "https://files.pythonhosted.org/packages/69/90/f63fb5873511e014207a475e2bb4e8b2e570d655b00ac19a9a0ca0a385ee/jsonschema-4.26.0-py3-none-any.whl", hash = "sha256:d489f15263b8d200f8387e64b4c3a75f06629559fb73deb8fdfb525f2dab50ce", size = 90630 }, + { url = "https://files.pythonhosted.org/packages/69/4a/4f9dbeb84e8850557c02365a0eee0649abe5eb1d84af92a25731c6c0f922/jsonschema-4.23.0-py3-none-any.whl", hash = "sha256:fbadb6f8b144a8f8cf9f0b89ba94501d143e50411a1278633f56a7acf7fd5566", size = 88462 }, ] [[package]] @@ -3533,7 +3533,7 @@ wheels = [ [[package]] name = "langchain-litellm" -version = "0.6.2" +version = "0.6.4" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "cryptography" }, @@ -3541,9 +3541,9 @@ dependencies = [ { name = "langchain-core" }, { name = "litellm" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/ee/6f/ba0490ec0fbc9d97cd9433749455fb4b5fbec3852bcbe113a0278ec1d32d/langchain_litellm-0.6.2.tar.gz", hash = "sha256:93372df7c3f1802358746e2c0a94012d8c27d9f9b57b769b23f6af2264bbaabb", size = 332878 } +sdist = { url = 
"https://files.pythonhosted.org/packages/68/37/ccc1f284a42900ca5b267a50da8e50145e9f264b32ee955ce91aa360d188/langchain_litellm-0.6.4.tar.gz", hash = "sha256:663281db392b3de1f07f891d0f80f9d4b26c0f0d2abbf854ef9b186d99c309ee", size = 339457 } wheels = [ - { url = "https://files.pythonhosted.org/packages/da/14/ad857a3f56fa4ea0879ac9d6ee5248c883663d0bad94bf8741e1ab6ab200/langchain_litellm-0.6.2-py3-none-any.whl", hash = "sha256:98af79dbcdea4b492e9601351bc5fd15fdd368e021183b8540f0d0b6b6b1589c", size = 24865 }, + { url = "https://files.pythonhosted.org/packages/43/e8/25c50bbad7a05106c7af65557e165d6cb6159c90854dae61de59debe735d/langchain_litellm-0.6.4-py3-none-any.whl", hash = "sha256:60f4e37be1a47dc88f94fac7085675ef8fa04bba92f48735792d82f492120744", size = 26360 }, ] [[package]] @@ -3709,7 +3709,7 @@ wheels = [ [[package]] name = "litellm" -version = "1.82.6" +version = "1.83.4" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "aiohttp" }, @@ -3725,9 +3725,9 @@ dependencies = [ { name = "tiktoken" }, { name = "tokenizers" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/29/75/1c537aa458426a9127a92bc2273787b2f987f4e5044e21f01f2eed5244fd/litellm-1.82.6.tar.gz", hash = "sha256:2aa1c2da21fe940c33613aa447119674a3ad4d2ad5eb064e4d5ce5ee42420136", size = 17414147 } +sdist = { url = "https://files.pythonhosted.org/packages/03/c4/30469c06ae7437a4406bc11e3c433cfd380a6771068cca15ea918dcd158f/litellm-1.83.4.tar.gz", hash = "sha256:6458d2030a41229460b321adee00517a91dbd8e63213cc953d355cb41d16f2d4", size = 17733899 } wheels = [ - { url = "https://files.pythonhosted.org/packages/02/6c/5327667e6dbe9e98cbfbd4261c8e91386a52e38f41419575854248bbab6a/litellm-1.82.6-py3-none-any.whl", hash = "sha256:164a3ef3e19f309e3cabc199bef3d2045212712fefdfa25fc7f75884a5b5b205", size = 15591595 }, + { url = "https://files.pythonhosted.org/packages/b8/bd/df19d3f8f6654535ee343a341fd921f81c411abf601a53e3eaef58129b02/litellm-1.83.4-py3-none-any.whl", hash = 
"sha256:17d7b4d48d47aca988ea4f762ddda5e7bd72cda3270192b22813d0330869d7b4", size = 16015555 }, ] [[package]] @@ -6766,11 +6766,11 @@ wheels = [ [[package]] name = "python-dotenv" -version = "1.2.2" +version = "1.0.1" source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/82/ed/0301aeeac3e5353ef3d94b6ec08bbcabd04a72018415dcb29e588514bba8/python_dotenv-1.2.2.tar.gz", hash = "sha256:2c371a91fbd7ba082c2c1dc1f8bf89ca22564a087c2c287cd9b662adde799cf3", size = 50135 } +sdist = { url = "https://files.pythonhosted.org/packages/bc/57/e84d88dfe0aec03b7a2d4327012c1627ab5f03652216c63d49846d7a6c58/python-dotenv-1.0.1.tar.gz", hash = "sha256:e324ee90a023d808f1959c46bcbc04446a10ced277783dc6ee09987c37ec10ca", size = 39115 } wheels = [ - { url = "https://files.pythonhosted.org/packages/0b/d7/1959b9648791274998a9c3526f6d0ec8fd2233e4d4acce81bbae76b44b2a/python_dotenv-1.2.2-py3-none-any.whl", hash = "sha256:1d8214789a24de455a8b8bd8ae6fe3c6b69a5e3d64aa8a8e5d68e694bbcb285a", size = 22101 }, + { url = "https://files.pythonhosted.org/packages/6a/3e/b68c118422ec867fa7ab88444e1274aa40681c606d59ac27de5a5588f082/python_dotenv-1.0.1-py3-none-any.whl", hash = "sha256:f7b63ef50f1b690dddf550d03497b66d609393b40b564ed0d674909a68ebf16a", size = 19863 }, ] [[package]] @@ -8049,12 +8049,12 @@ requires-dist = [ { name = "langchain", specifier = ">=1.2.13" }, { name = "langchain-community", specifier = ">=0.4.1" }, { name = "langchain-daytona", specifier = ">=0.0.2" }, - { name = "langchain-litellm", specifier = ">=0.3.5" }, + { name = "langchain-litellm", specifier = ">=0.6.4" }, { name = "langchain-unstructured", specifier = ">=1.0.1" }, { name = "langgraph", specifier = ">=1.1.3" }, { name = "langgraph-checkpoint-postgres", specifier = ">=3.0.2" }, { name = "linkup-sdk", specifier = ">=0.2.4" }, - { name = "litellm", specifier = ">=1.80.10" }, + { name = "litellm", specifier = ">=1.83.0" }, { name = "llama-cloud-services", specifier = ">=0.6.25" }, { 
name = "markdown", specifier = ">=3.7" }, { name = "markdownify", specifier = ">=0.14.1" }, diff --git a/surfsense_desktop/.env b/surfsense_desktop/.env index d053aac97..40e151c10 100644 --- a/surfsense_desktop/.env +++ b/surfsense_desktop/.env @@ -3,4 +3,8 @@ # The hosted web frontend URL. Used to intercept OAuth redirects and keep them # inside the desktop app. Set to your production frontend domain. -HOSTED_FRONTEND_URL=https://surfsense.net +HOSTED_FRONTEND_URL=https://surfsense.com + +# PostHog analytics (leave empty to disable) +POSTHOG_KEY= +POSTHOG_HOST=https://assets.surfsense.com diff --git a/surfsense_desktop/.npmrc b/surfsense_desktop/.npmrc new file mode 100644 index 000000000..d67f37488 --- /dev/null +++ b/surfsense_desktop/.npmrc @@ -0,0 +1 @@ +node-linker=hoisted diff --git a/surfsense_desktop/electron-builder.yml b/surfsense_desktop/electron-builder.yml index eaca0f19b..2c46c827a 100644 --- a/surfsense_desktop/electron-builder.yml +++ b/surfsense_desktop/electron-builder.yml @@ -9,10 +9,19 @@ directories: files: - dist/**/* - "!node_modules" + - node_modules/node-gyp-build/**/* + - node_modules/bindings/**/* + - node_modules/file-uri-to-path/**/* + - node_modules/node-mac-permissions/**/* + - "!node_modules/node-mac-permissions/src" + - "!node_modules/node-mac-permissions/binding.gyp" - "!src" - "!scripts" - "!release" extraResources: + - from: assets/ + to: assets/ + filter: ["*.ico", "*.png", "*.icns"] - from: ../surfsense_web/.next/standalone/surfsense_web/ to: standalone/ filter: @@ -29,12 +38,20 @@ extraResources: filter: ["**/*"] asarUnpack: - "**/*.node" + - "node_modules/node-gyp-build/**/*" + - "node_modules/bindings/**/*" + - "node_modules/file-uri-to-path/**/*" + - "node_modules/node-mac-permissions/**/*" mac: icon: assets/icon.icns category: public.app-category.productivity artifactName: "${productName}-${version}-${arch}.${ext}" - hardenedRuntime: true + hardenedRuntime: false gatekeeperAssess: false + extendInfo: + 
NSAccessibilityUsageDescription: "SurfSense uses accessibility features to insert suggestions into the active application." + NSScreenCaptureUsageDescription: "SurfSense uses screen capture to analyze your screen and provide context-aware writing suggestions." + NSAppleEventsUsageDescription: "SurfSense uses Apple Events to interact with the active application." target: - target: dmg arch: [x64, arm64] @@ -44,7 +61,7 @@ win: icon: assets/icon.ico target: - target: nsis - arch: [x64, arm64] + arch: [x64] nsis: oneClick: false perMachine: false diff --git a/surfsense_desktop/package.json b/surfsense_desktop/package.json index bd0cc67ab..634783e47 100644 --- a/surfsense_desktop/package.json +++ b/surfsense_desktop/package.json @@ -4,19 +4,25 @@ "description": "SurfSense Desktop App", "main": "dist/main.js", "scripts": { - "dev": "concurrently -k \"pnpm --dir ../surfsense_web dev\" \"wait-on http://localhost:3000 && electron .\"", + "dev": "pnpm build && concurrently -k \"pnpm --dir ../surfsense_web dev\" \"wait-on http://localhost:3000 && electron .\"", "build": "node scripts/build-electron.mjs", "pack:dir": "pnpm build && electron-builder --dir --config electron-builder.yml", "dist": "pnpm build && electron-builder --config electron-builder.yml", "dist:mac": "pnpm build && electron-builder --mac --config electron-builder.yml", "dist:win": "pnpm build && electron-builder --win --config electron-builder.yml", "dist:linux": "pnpm build && electron-builder --linux --config electron-builder.yml", - "typecheck": "tsc --noEmit" + "typecheck": "tsc --noEmit", + "postinstall": "electron-rebuild" + }, + "homepage": "https://github.com/MODSetter/SurfSense", + "author": { + "name": "MODSetter", + "email": "rohan@surfsense.com" }, - "author": "MODSetter", "license": "MIT", "packageManager": "pnpm@10.24.0", "devDependencies": { + "@electron/rebuild": "^4.0.3", "@types/node": "^25.5.0", "concurrently": "^9.2.1", "dotenv": "^17.3.1", @@ -27,7 +33,13 @@ "wait-on": "^9.0.4" }, 
"dependencies": { + "bindings": "^1.5.0", + "chokidar": "^5.0.0", + "electron-store": "^11.0.2", "electron-updater": "^6.8.3", - "get-port-please": "^3.2.0" + "get-port-please": "^3.2.0", + "node-mac-permissions": "^2.5.0", + "node-machine-id": "^1.1.12", + "posthog-node": "^5.29.0" } } diff --git a/surfsense_desktop/pnpm-lock.yaml b/surfsense_desktop/pnpm-lock.yaml index ea65be0bb..e7b84cc01 100644 --- a/surfsense_desktop/pnpm-lock.yaml +++ b/surfsense_desktop/pnpm-lock.yaml @@ -8,13 +8,34 @@ importers: .: dependencies: + bindings: + specifier: ^1.5.0 + version: 1.5.0 + chokidar: + specifier: ^5.0.0 + version: 5.0.0 + electron-store: + specifier: ^11.0.2 + version: 11.0.2 electron-updater: specifier: ^6.8.3 version: 6.8.3 get-port-please: specifier: ^3.2.0 version: 3.2.0 + node-mac-permissions: + specifier: ^2.5.0 + version: 2.5.0 + node-machine-id: + specifier: ^1.1.12 + version: 1.1.12 + posthog-node: + specifier: ^5.29.0 + version: 5.29.0(rxjs@7.8.2) devDependencies: + '@electron/rebuild': + specifier: ^4.0.3 + version: 4.0.3 '@types/node': specifier: ^25.5.0 version: 25.5.0 @@ -293,6 +314,9 @@ packages: resolution: {integrity: sha512-+1VkjdD0QBLPodGrJUeqarH8VAIvQODIbwh9XpP5Syisf7YoQgsJKPNFoqqLQlu+VQ/tVSshMR6loPMn8U+dPg==} engines: {node: '>=14'} + '@posthog/core@1.25.0': + resolution: {integrity: sha512-XKaHvRFIIN7Dw84r1eKimV1rl9DS+9XMCPPZ7P3+l8fE+rDsmumebiTFsY+q40bVXflcGW9wB+57LH0lvcGmhw==} + '@sindresorhus/is@4.6.0': resolution: {integrity: sha512-t09vSN3MdfsyCHoFcTRCH/iUtG7OJ0CsjzB8cjAmKc/va/kIgeDI/TxsigdncE/4be734m0cvIYwNaV4i2XqAw==} engines: {node: '>=10'} @@ -343,6 +367,7 @@ packages: '@xmldom/xmldom@0.8.11': resolution: {integrity: sha512-cQzWCtO6C8TQiYl1ruKNn2U6Ao4o4WBBcbL61yJl84x+j5sOWWFU9X7DpND8XZG3daDppSsigMdfAIl2upQBRw==} engines: {node: '>=10.0.0'} + deprecated: this version has critical issues, please update to the latest version abbrev@3.0.1: resolution: {integrity: 
sha512-AO2ac6pjRB3SJmGJo+v5/aK6Omggp6fsLrs6wN9bd35ulu4cCwaAU9+7ZhXjeqHVkaHThLuzH0nZr0YpCDhygg==} @@ -352,6 +377,14 @@ packages: resolution: {integrity: sha512-MnA+YT8fwfJPgBx3m60MNqakm30XOkyIoH1y6huTQvC0PwZG7ki8NacLBcrPbNoo8vEZy7Jpuk7+jMO+CUovTQ==} engines: {node: '>= 14'} + ajv-formats@3.0.1: + resolution: {integrity: sha512-8iUql50EUR+uUcdRQ3HDqa6EVyo3docL8g5WJ3FNcWmu62IbkGUue/pEyLBW8VGKKucTPgqeks4fIU1DA4yowQ==} + peerDependencies: + ajv: ^8.0.0 + peerDependenciesMeta: + ajv: + optional: true + ajv-keywords@3.5.2: resolution: {integrity: sha512-5p6WTN0DdTGVQk6VjcEju19IgaHudalcfabD7yhDGeA6bcQnmL+CpveLJq/3hvfwd1aof6L386Ougkx6RfyMIQ==} peerDependencies: @@ -360,6 +393,9 @@ packages: ajv@6.14.0: resolution: {integrity: sha512-IWrosm/yrn43eiKqkfkHis7QioDleaXQHdDVPKg0FSwwd/DuvyX79TZnFOnYpB7dcsFAMmtFztZuXPDvSePkFw==} + ajv@8.18.0: + resolution: {integrity: sha512-PlXPeEWMXMZ7sPYOHqmDyCJzcfNrUr3fGNKtezX14ykXOEIvyK81d+qydx89KY5O71FKMPaQ2vBfBFI5NHR63A==} + ansi-regex@5.0.1: resolution: {integrity: sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==} engines: {node: '>=8'} @@ -411,6 +447,9 @@ packages: resolution: {integrity: sha512-+q/t7Ekv1EDY2l6Gda6LLiX14rU9TV20Wa3ofeQmwPFZbOMo9DXrLbOjFaaclkXKWidIaopwAObQDqwWtGUjqg==} engines: {node: '>= 4.0.0'} + atomically@2.1.1: + resolution: {integrity: sha512-P4w9o2dqARji6P7MHprklbfiArZAWvo07yW7qs3pdljb3BWr12FIB7W+p0zJiuiVsUpRO0iZn1kFFcpPegg0tQ==} + axios@1.13.6: resolution: {integrity: sha512-ChTCHMouEe2kn713WHbQGcuYrr6fXTBiu460OTwWrWob16g1bXn4vtz07Ope7ewMozJAnEquLk5lWQWtBig9DQ==} @@ -424,6 +463,9 @@ packages: base64-js@1.5.1: resolution: {integrity: sha512-AKpaYlHn8t4SVbOHCy+b5+KKgvR4vrsD8vbvrbiQJps7fKDTkjkDry6ji0rUJjC0kzbNePLwzxq8iypo41qeWA==} + bindings@1.5.0: + resolution: {integrity: sha512-p2q/t/mhvuOj/UeLlV6566GD/guowlr0hHxClI0W9m7MWYkL1F0hLo+0Aexs9HSPCtR1SXQ0TD3MMKrXZajbiQ==} + bl@4.1.0: resolution: {integrity: 
sha512-1W07cM9gS6DcLperZfFSj+bWLtaPGSOHWhPiGzXmvVJbRLdG82sH/Kn8EtW1VqWVA54AKf2h5k5BbnIbwF3h6w==} @@ -477,6 +519,10 @@ packages: resolution: {integrity: sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==} engines: {node: '>=10'} + chokidar@5.0.0: + resolution: {integrity: sha512-TQMmc3w+5AxjpL8iIiwebF73dRDF4fBIieAqGn9RGCWaEVwQ6Fb2cGe31Yns0RRIzii5goJ1Y7xbMwo1TxMplw==} + engines: {node: '>= 20.19.0'} + chownr@3.0.0: resolution: {integrity: sha512-+IxzY9BZOQd/XuYPRmrvEVjF/nqj5kgT4kEq7VofrDoM1MxoRjEWkrCC3EtLi59TVawxTAn+orJwFQcrqEN1+g==} engines: {node: '>=18'} @@ -546,6 +592,10 @@ packages: engines: {node: '>=18'} hasBin: true + conf@15.1.0: + resolution: {integrity: sha512-Uy5YN9KEu0WWDaZAVJ5FAmZoaJt9rdK6kH+utItPyGsCqCgaTKkrmZx3zoE0/3q6S3bcp3Ihkk+ZqPxWxFK5og==} + engines: {node: '>=20'} + core-util-is@1.0.2: resolution: {integrity: sha512-3lqz5YjWTYnW6dlDa5TLaTCcShfar1e40rmcJVwCBJC6mWlFuj0eCHIElmG1g5kyuJ/GD+8Wn4FFCcz4gJPfaQ==} @@ -559,6 +609,10 @@ packages: resolution: {integrity: sha512-uV2QOWP2nWzsy2aMp8aRibhi9dlzF5Hgh5SHaB9OiTGEyDTiJJyx0uy51QXdyWbtAHNua4XJzUKca3OzKUd3vA==} engines: {node: '>= 8'} + debounce-fn@6.0.0: + resolution: {integrity: sha512-rBMW+F2TXryBwB54Q0d8drNEI+TfoS9JpNTAoVpukbWEhjXQq4rySFYLaqXMFXwdv61Zb2OHtj5bviSoimqxRQ==} + engines: {node: '>=18'} + debug@4.4.3: resolution: {integrity: sha512-RGwwWnwQvkVfavKVt22FGLw+xYSdzARwm0ru6DhTVA3umU5hZc28V3kO4stgYryrTlLpuvgI9GiijltAjNbcqA==} engines: {node: '>=6.0'} @@ -610,6 +664,10 @@ packages: os: [darwin] hasBin: true + dot-prop@10.1.0: + resolution: {integrity: sha512-MVUtAugQMOff5RnBy2d9N31iG0lNwg1qAoAOn7pOK5wf94WIaE3My2p3uwTQuvS2AcqchkcR3bHByjaM0mmi7Q==} + engines: {node: '>=20'} + dotenv-expand@11.0.7: resolution: {integrity: sha512-zIHwmZPRshsCdpMDyVsqGmgyP0yT8GAgXUnkdAoJisxvf33k7yO6OuoKmcTGuXPWSsm8Oh88nZicRLA9Y0rUeA==} engines: {node: '>=12'} @@ -645,6 +703,10 @@ packages: electron-publish@26.8.1: resolution: {integrity: 
sha512-q+jrSTIh/Cv4eGZa7oVR+grEJo/FoLMYBAnSL5GCtqwUpr1T+VgKB/dn1pnzxIxqD8S/jP1yilT9VrwCqINR4w==} + electron-store@11.0.2: + resolution: {integrity: sha512-4VkNRdN+BImL2KcCi41WvAYbh6zLX5AUTi4so68yPqiItjbgTjqpEnGAqasgnG+lB6GuAyUltKwVopp6Uv+gwQ==} + engines: {node: '>=20'} + electron-updater@6.8.3: resolution: {integrity: sha512-Z6sgw3jgbikWKXei1ENdqFOxBP0WlXg3TtKfz0rgw2vIZFJUyI4pD7ZN7jrkm7EoMK+tcm/qTnPUdqfZukBlBQ==} @@ -673,6 +735,10 @@ packages: resolution: {integrity: sha512-+h1lkLKhZMTYjog1VEpJNG7NZJWcuc2DDk/qsqSTRRCOXiLjeQ1d1/udrUGhqMxUgAlwKNZ0cf2uqan5GLuS2A==} engines: {node: '>=6'} + env-paths@3.0.0: + resolution: {integrity: sha512-dtJUTepzMW3Lm/NPxRf3wP4642UWhjL2sQxc+ym2YMj1m/H2zDNQOlezafzkHwn6sMstjHTwG6iQQsctDW/b1A==} + engines: {node: ^12.20.0 || ^14.13.1 || >=16.0.0} + err-code@2.0.3: resolution: {integrity: sha512-2bmlRpNKBxT/CRmPOlyISQpNj+qSeYvcym/uT0Jx2bMOlKLtSy1ZmLuVxSEKKyor/N5yhvp/ZiG1oE3DEYMSFA==} @@ -726,6 +792,9 @@ packages: fast-json-stable-stringify@2.1.0: resolution: {integrity: sha512-lhd/wF+Lk98HZoTCtlVraHtfh5XYijIjalXck7saUtuanSDyLMxnHhSXEDJqHxD7msR8D0uCmqlkwjCV8xvwHw==} + fast-uri@3.1.0: + resolution: {integrity: sha512-iPeeDKJSWf4IEOasVVrknXpaBV0IApz/gp7S2bb7Z4Lljbl2MGJRqInZiUrQwV16cpzw/D3S5j5Julj/gT52AA==} + fd-slicer@1.1.0: resolution: {integrity: sha512-cE1qsB/VwyQozZ+q1dGxR8LBYNZeofhEdUNGSMbQD3Gw2lAzX9Zb3uIU6Ebc/Fmyjo9AWWfnn0AUCHqtevs/8g==} @@ -738,6 +807,9 @@ packages: picomatch: optional: true + file-uri-to-path@1.0.0: + resolution: {integrity: sha512-0Zt+s3L7Vf1biwWZ29aARiVYLx7iMGnEUl9x33fbB/j3jR81u/O2LbqK+Bm1CDSNDKVtJ/YjwY7TUd5SkeLQLw==} + filelist@1.0.6: resolution: {integrity: sha512-5giy2PkLYY1cP39p17Ech+2xlpTRL9HLspOfEgm0L6CwBXBTgsK5ou0JtzYuepxkaQ/tvhCFIJ5uXo0OrM2DxA==} @@ -953,6 +1025,12 @@ packages: json-schema-traverse@0.4.1: resolution: {integrity: sha512-xbbCH5dCYU5T8LcEhhuh7HJ88HXuW3qsI3Y0zOZFKfZEHcpWiHU/Jxzk629Brsab/mMiHQti9wMP+845RPe3Vg==} + json-schema-traverse@1.0.0: + resolution: {integrity: 
sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==} + + json-schema-typed@8.0.2: + resolution: {integrity: sha512-fQhoXdcvc3V28x7C7BMs4P5+kNlgUURe2jmUT1T//oBRMDrqy1QPelJimwZGo7Hg9VPV3EQV5Bnq4hbFy2vetA==} + json-stringify-safe@5.0.1: resolution: {integrity: sha512-ZClg6AaYvamvYEE82d3Iyd3vSSIjQ+odgjaTzRuO3s7toCdFKczob2i0zCh7JE8kWn17yvAWhUVxvqGwUalsRA==} @@ -983,6 +1061,9 @@ packages: lodash@4.17.23: resolution: {integrity: sha512-LgVTMpQtIopCi79SJeDiP0TfWi5CNEc/L/aRdTh3yIvmZXTnheWpKjSZhnvMl8iXbC1tFg9gdHHDMLoV7CnG+w==} + lodash@4.18.1: + resolution: {integrity: sha512-dMInicTPVE8d1e5otfwmmjlxkZoUpiVLwyeTdUsi/Caj/gfzzblBcCE5sRHV/AsjuCmxWrte2TNGSYuCeCq+0Q==} + log-symbols@4.1.0: resolution: {integrity: sha512-8XPvpAA8uyhfteu8pIvQxpJZ7SYYdpUivZpGy6sFsBuKRY/7rQGavedeB8aK+Zkyq6upMFVL/9AW6vOYzfRyLg==} engines: {node: '>=10'} @@ -1027,6 +1108,10 @@ packages: resolution: {integrity: sha512-OqbOk5oEQeAZ8WXWydlu9HJjz9WVdEIvamMCcXmuqUYjTknH/sqsWvhQ3vgwKFRR1HpjvNBKQ37nbJgYzGqGcg==} engines: {node: '>=6'} + mimic-function@5.0.1: + resolution: {integrity: sha512-VP79XUPxV2CigYP3jWwAUFSku2aKqBH7uTAapFWCBqutsbmDo96KY5o8uh6U+/YSIn5OxJnXp73beVkpqMIGhA==} + engines: {node: '>=18'} + mimic-response@1.0.1: resolution: {integrity: sha512-j5EctnkH7amfV/q5Hgmoal1g2QHFJRraOtmx0JpIqkxhBhI/lJSl1nMpQ45hVarwNETOoWEimndZ4QK0RHxuxQ==} engines: {node: '>=4'} @@ -1103,6 +1188,9 @@ packages: node-addon-api@1.7.2: resolution: {integrity: sha512-ibPK3iA+vaY1eEjESkQkM0BbCqFOaZMiXRTtdB0u7b4djtY6JnsjvPdUHVMg6xQt3B8fpTTWHI9A+ADjM9frzg==} + node-addon-api@7.1.1: + resolution: {integrity: sha512-5m3bsyrjFWE1xf7nz7YXdN4udnVtXK6/Yfgn5qnahL6bCkf2yKt4k3nuTKAtT4r3IG8JNR2ncsIMdZuAzJjHQQ==} + node-api-version@0.2.1: resolution: {integrity: sha512-2xP/IGGMmmSQpI1+O/k72jF/ykvZ89JeuKX3TLJAYPDVLUalrshrLHkeVcCCZqG/eEa635cr8IBYzgnDvM2O8Q==} @@ -1111,6 +1199,13 @@ packages: engines: {node: ^18.17.0 || >=20.5.0} hasBin: true + node-mac-permissions@2.5.0: + resolution: {integrity: 
sha512-zR8SVCaN3WqV1xwWd04XVAdzm3UTdjbxciLrZtB0Cc7F2Kd34AJfhPD4hm1HU0YH3oGUZO4X9OBLY5ijSTHsGw==} + os: [darwin] + + node-machine-id@1.1.12: + resolution: {integrity: sha512-QNABxbrPa3qEIfrE6GOJ7BYIuignnJw7iQ2YPbc3Nla1HzRJjXzZOiikfF8m7eAMfichLt3M4VgLOetqgDmgGQ==} + nopt@8.1.0: resolution: {integrity: sha512-ieGu42u/Qsa4TFktmaKEwM6MQH0pOWnaB3htzh0JRtx84+Mebc0cbZYN5bC+6WTZ4+77xrL9Pn5m7CV6VIkV7A==} engines: {node: ^18.17.0 || >=20.5.0} @@ -1180,6 +1275,15 @@ packages: resolution: {integrity: sha512-uysumyrvkUX0rX/dEVqt8gC3sTBzd4zoWfLeS29nb53imdaXVvLINYXTI2GNqzaMuvacNx4uJQ8+b3zXR0pkgQ==} engines: {node: '>=10.4.0'} + posthog-node@5.29.0: + resolution: {integrity: sha512-po7N55haSKxV8VOulkBZJja938yILShl6+fFjoUV3iQgOBCg4Muu615/xRg8mpNiz+UASvL0EEiGvIxdhXfj6Q==} + engines: {node: ^20.20.0 || >=22.22.0} + peerDependencies: + rxjs: ^7.0.0 + peerDependenciesMeta: + rxjs: + optional: true + postject@1.0.0-alpha.6: resolution: {integrity: sha512-b9Eb8h2eVqNE8edvKdwqkrY6O7kAwmI8kcnBv1NScolYJbo59XUF0noFq+lxbC1yN20bmC0WBEbDC5H/7ASb0A==} engines: {node: '>=14.0.0'} @@ -1222,10 +1326,18 @@ packages: resolution: {integrity: sha512-9u/sniCrY3D5WdsERHzHE4G2YCXqoG5FTHUiCC4SIbr6XcLZBY05ya9EKjYek9O5xOAwjGq+1JdGBAS7Q9ScoA==} engines: {node: '>= 6'} + readdirp@5.0.0: + resolution: {integrity: sha512-9u/XQ1pvrQtYyMpZe7DXKv2p5CNvyVwzUB6uhLAnQwHMSgKMBR62lc7AHljaeteeHXn11XTAaLLUVZYVZyuRBQ==} + engines: {node: '>= 20.19.0'} + require-directory@2.1.1: resolution: {integrity: sha512-fGxEI7+wsG9xrvdjsrlmL22OMTTiHRwAMroiEeMgq8gzoLC/PQr7RsRDSTLUg/bZAZtF+TVIkHc6/4RIKrui+Q==} engines: {node: '>=0.10.0'} + require-from-string@2.0.2: + resolution: {integrity: sha512-Xf0nWe6RseziFMu+Ap9biiUbmplq6S9/p+7w7YXP/JBHhrUDDUhwa+vANyubuqfZWTveU//DYVGsDG7RKL/vEw==} + engines: {node: '>=0.10.0'} + resedit@1.7.2: resolution: {integrity: sha512-vHjcY2MlAITJhC0eRD/Vv8Vlgmu9Sd3LX9zZvtGzU5ZImdTN3+d6e/4mnTyV8vEbyf1sgNIrWxhWlrys52OkEA==} engines: {node: '>=12', npm: '>=6'} @@ -1365,6 +1477,12 @@ packages: resolution: 
{integrity: sha512-yDPMNjp4WyfYBkHnjIRLfca1i6KMyGCtsVgoKe/z1+6vukgaENdgGBZt+ZmKPc4gavvEZ5OgHfHdrazhgNyG7w==} engines: {node: '>=12'} + stubborn-fs@2.0.0: + resolution: {integrity: sha512-Y0AvSwDw8y+nlSNFXMm2g6L51rBGdAQT20J3YSOqxC53Lo3bjWRtr2BKcfYoAf352WYpsZSTURrA0tqhfgudPA==} + + stubborn-utils@1.0.2: + resolution: {integrity: sha512-zOh9jPYI+xrNOyisSelgym4tolKTJCQd5GBhK0+0xJvcYDcwlOoxF/rnFKQ2KRZknXSG9jWAp66fwP6AxN9STg==} + sumchecker@3.0.1: resolution: {integrity: sha512-MvjXzkz/BOfyVDkG0oFOtBxHX2u3gKbMHIF/dXblZsgD3BWOFLmHovIpZY7BykJdAjcqRCBi1WYBNdEC9yI7vg==} engines: {node: '>= 8.0'} @@ -1377,6 +1495,10 @@ packages: resolution: {integrity: sha512-MpUEN2OodtUzxvKQl72cUF7RQ5EiHsGvSsVG0ia9c5RbWGL2CI4C7EpPS8UTBIplnlzZiNuV56w+FuNxy3ty2Q==} engines: {node: '>=10'} + tagged-tag@1.0.0: + resolution: {integrity: sha512-yEFYrVhod+hdNyx7g5Bnkkb0G6si8HJurOoOEgC8B/O0uXLHlaey/65KRv6cuWBNhBgHKAROVpc7QyYqE5gFng==} + engines: {node: '>=20'} + tar@7.5.11: resolution: {integrity: sha512-ChjMH33/KetonMTAtpYdgUFr0tbz69Fp2v7zWxQfYZX4g5ZN2nOBXm1R2xyA+lMIKrLKIoKAwFj93jE/avX9cQ==} engines: {node: '>=18'} @@ -1419,11 +1541,19 @@ packages: resolution: {integrity: sha512-34R7HTnG0XIJcBSn5XhDd7nNFPRcXYRZrBB2O2jdKqYODldSzBAqzsWoZYYvduky73toYS/ESqxPvkDf/F0XMg==} engines: {node: '>=10'} + type-fest@5.5.0: + resolution: {integrity: sha512-PlBfpQwiUvGViBNX84Yxwjsdhd1TUlXr6zjX7eoirtCPIr08NAmxwa+fcYBTeRQxHo9YC9wwF3m9i700sHma8g==} + engines: {node: '>=20'} + typescript@5.9.3: resolution: {integrity: sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==} engines: {node: '>=14.17'} hasBin: true + uint8array-extras@1.5.0: + resolution: {integrity: sha512-rvKSBiC5zqCCiDZ9kAOszZcDvdAHwwIKJG33Ykj43OKcWsnmcBRL09YTU4nOeHZ8Y2a7l1MgTd08SBe9A8Qj6A==} + engines: {node: '>=18'} + undici-types@7.16.0: resolution: {integrity: sha512-Zz+aZWSj8LE6zoxD+xrjh4VfkIG8Ya6LvYkZqtUQGJPZjYl53ypCaUwWqo7eI0x66KBGeRo+mlBEkMSeSZ38Nw==} @@ -1467,6 +1597,9 @@ packages: wcwidth@1.0.1: 
resolution: {integrity: sha512-XHPEwS0q6TaxcvG85+8EYkbiCux2XtWG2mkc47Ng2A77BQu9+DqIOJldST4HgPkuea7dvKSj5VgX3P1d4rW8Tg==} + when-exit@2.1.5: + resolution: {integrity: sha512-VGkKJ564kzt6Ms1dbgPP/yuIoQCrsFAnRbptpC5wOEsDaNsbCB2bnfnaA8i/vRs5tjUSEOtIuvl9/MyVsvQZCg==} + which@2.0.2: resolution: {integrity: sha512-BLI3Tl1TW3Pvl70l3yq3Y64i+awpwXqsGBYWkkqMtnbXgrMD+yj7rhW0kuEDxzJaYXGjEW5ogapKNMEKNMjibA==} engines: {node: '>= 8'} @@ -1764,6 +1897,8 @@ snapshots: '@pkgjs/parseargs@0.11.0': optional: true + '@posthog/core@1.25.0': {} + '@sindresorhus/is@4.6.0': {} '@standard-schema/spec@1.1.0': {} @@ -1827,6 +1962,10 @@ snapshots: agent-base@7.1.4: {} + ajv-formats@3.0.1(ajv@8.18.0): + optionalDependencies: + ajv: 8.18.0 + ajv-keywords@3.5.2(ajv@6.14.0): dependencies: ajv: 6.14.0 @@ -1838,6 +1977,13 @@ snapshots: json-schema-traverse: 0.4.1 uri-js: 4.4.1 + ajv@8.18.0: + dependencies: + fast-deep-equal: 3.1.3 + fast-uri: 3.1.0 + json-schema-traverse: 1.0.0 + require-from-string: 2.0.2 + ansi-regex@5.0.1: {} ansi-regex@6.2.2: {} @@ -1909,6 +2055,11 @@ snapshots: at-least-node@1.0.0: {} + atomically@2.1.1: + dependencies: + stubborn-fs: 2.0.0 + when-exit: 2.1.5 + axios@1.13.6: dependencies: follow-redirects: 1.15.11 @@ -1923,6 +2074,10 @@ snapshots: base64-js@1.5.1: {} + bindings@1.5.0: + dependencies: + file-uri-to-path: 1.0.0 + bl@4.1.0: dependencies: buffer: 5.7.1 @@ -2019,6 +2174,10 @@ snapshots: ansi-styles: 4.3.0 supports-color: 7.2.0 + chokidar@5.0.0: + dependencies: + readdirp: 5.0.0 + chownr@3.0.0: {} chromium-pickle-js@0.2.0: {} @@ -2079,6 +2238,18 @@ snapshots: tree-kill: 1.2.2 yargs: 17.7.2 + conf@15.1.0: + dependencies: + ajv: 8.18.0 + ajv-formats: 3.0.1(ajv@8.18.0) + atomically: 2.1.1 + debounce-fn: 6.0.0 + dot-prop: 10.1.0 + env-paths: 3.0.0 + json-schema-typed: 8.0.2 + semver: 7.7.4 + uint8array-extras: 1.5.0 + core-util-is@1.0.2: optional: true @@ -2096,6 +2267,10 @@ snapshots: shebang-command: 2.0.0 which: 2.0.2 + debounce-fn@6.0.0: + dependencies: + 
mimic-function: 5.0.1 + debug@4.4.3: dependencies: ms: 2.1.3 @@ -2161,6 +2336,10 @@ snapshots: verror: 1.10.1 optional: true + dot-prop@10.1.0: + dependencies: + type-fest: 5.5.0 + dotenv-expand@11.0.7: dependencies: dotenv: 16.6.1 @@ -2219,6 +2398,11 @@ snapshots: transitivePeerDependencies: - supports-color + electron-store@11.0.2: + dependencies: + conf: 15.1.0 + type-fest: 5.5.0 + electron-updater@6.8.3: dependencies: builder-util-runtime: 9.5.1 @@ -2237,7 +2421,7 @@ snapshots: '@electron/asar': 3.4.1 debug: 4.4.3 fs-extra: 7.0.1 - lodash: 4.17.23 + lodash: 4.18.1 temp: 0.9.4 optionalDependencies: '@electron/windows-sign': 1.2.2 @@ -2267,6 +2451,8 @@ snapshots: env-paths@2.2.1: {} + env-paths@3.0.0: {} + err-code@2.0.3: {} es-define-property@1.0.1: {} @@ -2340,6 +2526,8 @@ snapshots: fast-json-stable-stringify@2.1.0: {} + fast-uri@3.1.0: {} + fd-slicer@1.1.0: dependencies: pend: 1.2.0 @@ -2348,6 +2536,8 @@ snapshots: optionalDependencies: picomatch: 4.0.3 + file-uri-to-path@1.0.0: {} + filelist@1.0.6: dependencies: minimatch: 5.1.9 @@ -2595,6 +2785,10 @@ snapshots: json-schema-traverse@0.4.1: {} + json-schema-traverse@1.0.0: {} + + json-schema-typed@8.0.2: {} + json-stringify-safe@5.0.1: optional: true @@ -2622,6 +2816,8 @@ snapshots: lodash@4.17.23: {} + lodash@4.18.1: {} + log-symbols@4.1.0: dependencies: chalk: 4.1.2 @@ -2668,6 +2864,8 @@ snapshots: mimic-fn@2.1.0: {} + mimic-function@5.0.1: {} + mimic-response@1.0.1: {} mimic-response@3.1.0: {} @@ -2739,6 +2937,8 @@ snapshots: node-addon-api@1.7.2: optional: true + node-addon-api@7.1.1: {} + node-api-version@0.2.1: dependencies: semver: 7.7.4 @@ -2758,6 +2958,13 @@ snapshots: transitivePeerDependencies: - supports-color + node-mac-permissions@2.5.0: + dependencies: + bindings: 1.5.0 + node-addon-api: 7.1.1 + + node-machine-id@1.1.12: {} + nopt@8.1.0: dependencies: abbrev: 3.0.1 @@ -2820,6 +3027,12 @@ snapshots: base64-js: 1.5.1 xmlbuilder: 15.1.1 + posthog-node@5.29.0(rxjs@7.8.2): + dependencies: + 
'@posthog/core': 1.25.0 + optionalDependencies: + rxjs: 7.8.2 + postject@1.0.0-alpha.6: dependencies: commander: 9.5.0 @@ -2863,8 +3076,12 @@ snapshots: string_decoder: 1.3.0 util-deprecate: 1.0.2 + readdirp@5.0.0: {} + require-directory@2.1.1: {} + require-from-string@2.0.2: {} + resedit@1.7.2: dependencies: pe-library: 0.4.1 @@ -3002,6 +3219,12 @@ snapshots: dependencies: ansi-regex: 6.2.2 + stubborn-fs@2.0.0: + dependencies: + stubborn-utils: 1.0.2 + + stubborn-utils@1.0.2: {} + sumchecker@3.0.1: dependencies: debug: 4.4.3 @@ -3016,6 +3239,8 @@ snapshots: dependencies: has-flag: 4.0.0 + tagged-tag@1.0.0: {} + tar@7.5.11: dependencies: '@isaacs/fs-minipass': 4.0.1 @@ -3062,8 +3287,14 @@ snapshots: type-fest@0.13.1: optional: true + type-fest@5.5.0: + dependencies: + tagged-tag: 1.0.0 + typescript@5.9.3: {} + uint8array-extras@1.5.0: {} + undici-types@7.16.0: {} undici-types@7.18.2: {} @@ -3109,6 +3340,8 @@ snapshots: dependencies: defaults: 1.0.4 + when-exit@2.1.5: {} + which@2.0.2: dependencies: isexe: 2.0.0 diff --git a/surfsense_desktop/scripts/build-electron.mjs b/surfsense_desktop/scripts/build-electron.mjs index 923830296..90d76ef7a 100644 --- a/surfsense_desktop/scripts/build-electron.mjs +++ b/surfsense_desktop/scripts/build-electron.mjs @@ -104,13 +104,19 @@ async function buildElectron() { bundle: true, platform: 'node', target: 'node18', - external: ['electron'], + external: ['electron', 'node-mac-permissions', 'bindings', 'file-uri-to-path'], sourcemap: true, minify: false, define: { 'process.env.HOSTED_FRONTEND_URL': JSON.stringify( process.env.HOSTED_FRONTEND_URL || desktopEnv.HOSTED_FRONTEND_URL || 'https://surfsense.net' ), + 'process.env.POSTHOG_KEY': JSON.stringify( + process.env.POSTHOG_KEY || desktopEnv.POSTHOG_KEY || '' + ), + 'process.env.POSTHOG_HOST': JSON.stringify( + process.env.POSTHOG_HOST || desktopEnv.POSTHOG_HOST || 'https://assets.surfsense.com' + ), }, }; diff --git a/surfsense_desktop/src/ipc/channels.ts 
b/surfsense_desktop/src/ipc/channels.ts index 25ec1bc0e..39e75f046 100644 --- a/surfsense_desktop/src/ipc/channels.ts +++ b/surfsense_desktop/src/ipc/channels.ts @@ -6,4 +6,39 @@ export const IPC_CHANNELS = { SET_QUICK_ASK_MODE: 'set-quick-ask-mode', GET_QUICK_ASK_MODE: 'get-quick-ask-mode', REPLACE_TEXT: 'replace-text', + // Permissions + GET_PERMISSIONS_STATUS: 'get-permissions-status', + REQUEST_ACCESSIBILITY: 'request-accessibility', + REQUEST_SCREEN_RECORDING: 'request-screen-recording', + RESTART_APP: 'restart-app', + // Autocomplete + AUTOCOMPLETE_CONTEXT: 'autocomplete-context', + ACCEPT_SUGGESTION: 'accept-suggestion', + DISMISS_SUGGESTION: 'dismiss-suggestion', + SET_AUTOCOMPLETE_ENABLED: 'set-autocomplete-enabled', + GET_AUTOCOMPLETE_ENABLED: 'get-autocomplete-enabled', + // Folder sync channels + FOLDER_SYNC_SELECT_FOLDER: 'folder-sync:select-folder', + FOLDER_SYNC_ADD_FOLDER: 'folder-sync:add-folder', + FOLDER_SYNC_REMOVE_FOLDER: 'folder-sync:remove-folder', + FOLDER_SYNC_GET_FOLDERS: 'folder-sync:get-folders', + FOLDER_SYNC_GET_STATUS: 'folder-sync:get-status', + FOLDER_SYNC_FILE_CHANGED: 'folder-sync:file-changed', + FOLDER_SYNC_WATCHER_READY: 'folder-sync:watcher-ready', + FOLDER_SYNC_PAUSE: 'folder-sync:pause', + FOLDER_SYNC_RESUME: 'folder-sync:resume', + FOLDER_SYNC_RENDERER_READY: 'folder-sync:renderer-ready', + FOLDER_SYNC_GET_PENDING_EVENTS: 'folder-sync:get-pending-events', + FOLDER_SYNC_ACK_EVENTS: 'folder-sync:ack-events', + BROWSE_FILES: 'browse:files', + READ_LOCAL_FILES: 'browse:read-local-files', + // Auth token sync across windows + GET_AUTH_TOKENS: 'auth:get-tokens', + SET_AUTH_TOKENS: 'auth:set-tokens', + // Keyboard shortcut configuration + GET_SHORTCUTS: 'shortcuts:get', + SET_SHORTCUTS: 'shortcuts:set', + // Active search space + GET_ACTIVE_SEARCH_SPACE: 'search-space:get-active', + SET_ACTIVE_SEARCH_SPACE: 'search-space:set-active', } as const; diff --git a/surfsense_desktop/src/ipc/handlers.ts 
b/surfsense_desktop/src/ipc/handlers.ts index 18e343719..200fa75bd 100644 --- a/surfsense_desktop/src/ipc/handlers.ts +++ b/surfsense_desktop/src/ipc/handlers.ts @@ -1,5 +1,32 @@ import { app, ipcMain, shell } from 'electron'; import { IPC_CHANNELS } from './channels'; +import { + getPermissionsStatus, + requestAccessibility, + requestScreenRecording, + restartApp, +} from '../modules/permissions'; +import { + selectFolder, + addWatchedFolder, + removeWatchedFolder, + getWatchedFolders, + getWatcherStatus, + getPendingFileEvents, + acknowledgeFileEvents, + pauseWatcher, + resumeWatcher, + markRendererReady, + browseFiles, + readLocalFiles, +} from '../modules/folder-watcher'; +import { getShortcuts, setShortcuts, type ShortcutConfig } from '../modules/shortcuts'; +import { getActiveSearchSpaceId, setActiveSearchSpaceId } from '../modules/active-search-space'; +import { reregisterQuickAsk } from '../modules/quick-ask'; +import { reregisterAutocomplete } from '../modules/autocomplete'; +import { reregisterGeneralAssist } from '../modules/tray'; + +let authTokens: { bearer: string; refresh: string } | null = null; export function registerIpcHandlers(): void { ipcMain.on(IPC_CHANNELS.OPEN_EXTERNAL, (_event, url: string) => { @@ -16,4 +43,81 @@ export function registerIpcHandlers(): void { ipcMain.handle(IPC_CHANNELS.GET_APP_VERSION, () => { return app.getVersion(); }); + + ipcMain.handle(IPC_CHANNELS.GET_PERMISSIONS_STATUS, () => { + return getPermissionsStatus(); + }); + + ipcMain.handle(IPC_CHANNELS.REQUEST_ACCESSIBILITY, () => { + requestAccessibility(); + }); + + ipcMain.handle(IPC_CHANNELS.REQUEST_SCREEN_RECORDING, () => { + requestScreenRecording(); + }); + + ipcMain.handle(IPC_CHANNELS.RESTART_APP, () => { + restartApp(); + }); + + // Folder sync handlers + ipcMain.handle(IPC_CHANNELS.FOLDER_SYNC_SELECT_FOLDER, () => selectFolder()); + + ipcMain.handle(IPC_CHANNELS.FOLDER_SYNC_ADD_FOLDER, (_event, config) => + addWatchedFolder(config) + ); + + 
ipcMain.handle(IPC_CHANNELS.FOLDER_SYNC_REMOVE_FOLDER, (_event, folderPath: string) => + removeWatchedFolder(folderPath) + ); + + ipcMain.handle(IPC_CHANNELS.FOLDER_SYNC_GET_FOLDERS, () => getWatchedFolders()); + + ipcMain.handle(IPC_CHANNELS.FOLDER_SYNC_GET_STATUS, () => getWatcherStatus()); + + ipcMain.handle(IPC_CHANNELS.FOLDER_SYNC_PAUSE, () => pauseWatcher()); + + ipcMain.handle(IPC_CHANNELS.FOLDER_SYNC_RESUME, () => resumeWatcher()); + + ipcMain.handle(IPC_CHANNELS.FOLDER_SYNC_RENDERER_READY, () => { + markRendererReady(); + }); + + ipcMain.handle(IPC_CHANNELS.FOLDER_SYNC_GET_PENDING_EVENTS, () => + getPendingFileEvents() + ); + + ipcMain.handle(IPC_CHANNELS.FOLDER_SYNC_ACK_EVENTS, (_event, eventIds: string[]) => + acknowledgeFileEvents(eventIds) + ); + + ipcMain.handle(IPC_CHANNELS.BROWSE_FILES, () => browseFiles()); + + ipcMain.handle(IPC_CHANNELS.READ_LOCAL_FILES, (_event, paths: string[]) => + readLocalFiles(paths) + ); + + ipcMain.handle(IPC_CHANNELS.SET_AUTH_TOKENS, (_event, tokens: { bearer: string; refresh: string }) => { + authTokens = tokens; + }); + + ipcMain.handle(IPC_CHANNELS.GET_AUTH_TOKENS, () => { + return authTokens; + }); + + ipcMain.handle(IPC_CHANNELS.GET_SHORTCUTS, () => getShortcuts()); + + ipcMain.handle(IPC_CHANNELS.GET_ACTIVE_SEARCH_SPACE, () => getActiveSearchSpaceId()); + + ipcMain.handle(IPC_CHANNELS.SET_ACTIVE_SEARCH_SPACE, (_event, id: string) => + setActiveSearchSpaceId(id) + ); + + ipcMain.handle(IPC_CHANNELS.SET_SHORTCUTS, async (_event, config: Partial) => { + const updated = await setShortcuts(config); + if (config.generalAssist) await reregisterGeneralAssist(); + if (config.quickAsk) await reregisterQuickAsk(); + if (config.autocomplete) await reregisterAutocomplete(); + return updated; + }); } diff --git a/surfsense_desktop/src/main.ts b/surfsense_desktop/src/main.ts index 3ab41073b..231553f9a 100644 --- a/surfsense_desktop/src/main.ts +++ b/surfsense_desktop/src/main.ts @@ -1,12 +1,18 @@ import { app, BrowserWindow } 
from 'electron'; + +let isQuitting = false; import { registerGlobalErrorHandlers, showErrorDialog } from './modules/errors'; import { startNextServer } from './modules/server'; -import { createMainWindow } from './modules/window'; +import { createMainWindow, getMainWindow } from './modules/window'; import { setupDeepLinks, handlePendingDeepLink } from './modules/deep-links'; import { setupAutoUpdater } from './modules/auto-updater'; import { setupMenu } from './modules/menu'; import { registerQuickAsk, unregisterQuickAsk } from './modules/quick-ask'; +import { registerAutocomplete, unregisterAutocomplete } from './modules/autocomplete'; +import { registerFolderWatcher, unregisterFolderWatcher } from './modules/folder-watcher'; import { registerIpcHandlers } from './ipc/handlers'; +import { createTray, destroyTray } from './modules/tray'; +import { initAnalytics, shutdownAnalytics, trackEvent } from './modules/analytics'; registerGlobalErrorHandlers(); @@ -16,8 +22,9 @@ if (!setupDeepLinks()) { registerIpcHandlers(); -// App lifecycle app.whenReady().then(async () => { + initAnalytics(); + trackEvent('desktop_app_launched'); setupMenu(); try { await startNextServer(); @@ -26,25 +33,55 @@ app.whenReady().then(async () => { setTimeout(() => app.quit(), 0); return; } - createMainWindow(); - registerQuickAsk(); + + await createTray(); + + const win = createMainWindow('/dashboard'); + + // Minimize to tray instead of closing the app + win.on('close', (e) => { + if (!isQuitting) { + e.preventDefault(); + win.hide(); + } + }); + + await registerQuickAsk(); + await registerAutocomplete(); + registerFolderWatcher(); setupAutoUpdater(); handlePendingDeepLink(); app.on('activate', () => { - if (BrowserWindow.getAllWindows().length === 0) { - createMainWindow(); + const mw = getMainWindow(); + if (!mw || mw.isDestroyed()) { + createMainWindow('/dashboard'); + } else { + mw.show(); + mw.focus(); } }); }); +// Keep running in the background — the tray "Quit" calls app.exit() 
app.on('window-all-closed', () => { - if (process.platform !== 'darwin') { - app.quit(); - } + // Do nothing: the app stays alive in the tray }); -app.on('will-quit', () => { - unregisterQuickAsk(); +app.on('before-quit', () => { + isQuitting = true; +}); + +let didCleanup = false; +app.on('will-quit', async (e) => { + if (didCleanup) return; + didCleanup = true; + e.preventDefault(); + unregisterQuickAsk(); + unregisterAutocomplete(); + unregisterFolderWatcher(); + destroyTray(); + await shutdownAnalytics(); + app.exit(); }); diff --git a/surfsense_desktop/src/modules/active-search-space.ts b/surfsense_desktop/src/modules/active-search-space.ts new file mode 100644 index 000000000..e5f55c8f4 --- /dev/null +++ b/surfsense_desktop/src/modules/active-search-space.ts @@ -0,0 +1,24 @@ +const STORE_KEY = 'activeSearchSpaceId'; +// eslint-disable-next-line @typescript-eslint/no-explicit-any +let store: any = null; + +async function getStore() { + if (!store) { + const { default: Store } = await import('electron-store'); + store = new Store({ + name: 'active-search-space', + defaults: { [STORE_KEY]: null as string | null }, + }); + } + return store; +} + +export async function getActiveSearchSpaceId(): Promise { + const s = await getStore(); + return (s.get(STORE_KEY) as string | null) ?? 
null; +} + +export async function setActiveSearchSpaceId(id: string): Promise { + const s = await getStore(); + s.set(STORE_KEY, id); +} diff --git a/surfsense_desktop/src/modules/analytics.ts b/surfsense_desktop/src/modules/analytics.ts new file mode 100644 index 000000000..0bbcb3026 --- /dev/null +++ b/surfsense_desktop/src/modules/analytics.ts @@ -0,0 +1,50 @@ +import { PostHog } from 'posthog-node'; +import { machineIdSync } from 'node-machine-id'; +import { app } from 'electron'; + +let client: PostHog | null = null; +let distinctId = ''; + +export function initAnalytics(): void { + const key = process.env.POSTHOG_KEY; + if (!key) return; + + try { + distinctId = machineIdSync(true); + } catch { + return; + } + + client = new PostHog(key, { + host: process.env.POSTHOG_HOST || 'https://assets.surfsense.com', + flushAt: 20, + flushInterval: 10000, + }); +} + +export function trackEvent(event: string, properties?: Record): void { + if (!client) return; + + try { + client.capture({ + distinctId, + event, + properties: { + platform: 'desktop', + app_version: app.getVersion(), + os: process.platform, + ...properties, + }, + }); + } catch { + // Analytics should never break the app + } +} + +export async function shutdownAnalytics(): Promise { + if (!client) return; + + const timeout = new Promise((resolve) => setTimeout(resolve, 3000)); + await Promise.race([client.shutdown(), timeout]); + client = null; +} diff --git a/surfsense_desktop/src/modules/autocomplete/index.ts b/surfsense_desktop/src/modules/autocomplete/index.ts new file mode 100644 index 000000000..d4eb727fd --- /dev/null +++ b/surfsense_desktop/src/modules/autocomplete/index.ts @@ -0,0 +1,143 @@ +import { clipboard, globalShortcut, ipcMain, screen } from 'electron'; +import { IPC_CHANNELS } from '../../ipc/channels'; +import { getFrontmostApp, getWindowTitle, hasAccessibilityPermission, simulatePaste } from '../platform'; +import { hasScreenRecordingPermission, requestAccessibility, 
requestScreenRecording } from '../permissions'; +import { captureScreen } from './screenshot'; +import { createSuggestionWindow, destroySuggestion, getSuggestionWindow } from './suggestion-window'; +import { getShortcuts } from '../shortcuts'; +import { getActiveSearchSpaceId } from '../active-search-space'; +import { trackEvent } from '../analytics'; + +let currentShortcut = ''; +let autocompleteEnabled = true; +let savedClipboard = ''; +let sourceApp = ''; + +function isSurfSenseWindow(): boolean { + const app = getFrontmostApp(); + return app === 'Electron' || app === 'SurfSense' || app === 'surfsense-desktop'; +} + +async function triggerAutocomplete(): Promise { + if (!autocompleteEnabled) return; + if (isSurfSenseWindow()) return; + + if (!hasScreenRecordingPermission()) { + requestScreenRecording(); + return; + } + + sourceApp = getFrontmostApp(); + const windowTitle = getWindowTitle(); + savedClipboard = clipboard.readText(); + + const screenshot = await captureScreen(); + if (!screenshot) { + console.error('[autocomplete] Screenshot capture failed'); + return; + } + + const searchSpaceId = await getActiveSearchSpaceId(); + if (!searchSpaceId) { + console.warn('[autocomplete] No active search space. 
Select a search space first.'); + return; + } + trackEvent('desktop_autocomplete_triggered', { search_space_id: searchSpaceId }); + const cursor = screen.getCursorScreenPoint(); + const win = createSuggestionWindow(cursor.x, cursor.y); + + win.webContents.once('did-finish-load', () => { + const sw = getSuggestionWindow(); + setTimeout(() => { + if (sw && !sw.isDestroyed()) { + sw.webContents.send(IPC_CHANNELS.AUTOCOMPLETE_CONTEXT, { + screenshot, + searchSpaceId, + appName: sourceApp, + windowTitle, + }); + } + }, 300); + }); +} + +async function acceptAndInject(text: string): Promise { + if (!sourceApp) return; + + if (!hasAccessibilityPermission()) { + requestAccessibility(); + return; + } + + clipboard.writeText(text); + destroySuggestion(); + + try { + await new Promise((r) => setTimeout(r, 50)); + simulatePaste(); + await new Promise((r) => setTimeout(r, 100)); + clipboard.writeText(savedClipboard); + } catch { + clipboard.writeText(savedClipboard); + } +} + +let ipcRegistered = false; + +function registerIpcHandlers(): void { + if (ipcRegistered) return; + ipcRegistered = true; + + ipcMain.handle(IPC_CHANNELS.ACCEPT_SUGGESTION, async (_event, text: string) => { + trackEvent('desktop_autocomplete_accepted'); + await acceptAndInject(text); + }); + ipcMain.handle(IPC_CHANNELS.DISMISS_SUGGESTION, () => { + trackEvent('desktop_autocomplete_dismissed'); + destroySuggestion(); + }); + ipcMain.handle(IPC_CHANNELS.SET_AUTOCOMPLETE_ENABLED, (_event, enabled: boolean) => { + autocompleteEnabled = enabled; + if (!enabled) { + destroySuggestion(); + } + }); + ipcMain.handle(IPC_CHANNELS.GET_AUTOCOMPLETE_ENABLED, () => autocompleteEnabled); +} + +function autocompleteHandler(): void { + const sw = getSuggestionWindow(); + if (sw && !sw.isDestroyed()) { + destroySuggestion(); + return; + } + triggerAutocomplete(); +} + +async function registerShortcut(): Promise { + const shortcuts = await getShortcuts(); + currentShortcut = shortcuts.autocomplete; + + const ok = 
globalShortcut.register(currentShortcut, autocompleteHandler); + + if (!ok) { + console.error(`[autocomplete] Failed to register shortcut ${currentShortcut}`); + } else { + console.log(`[autocomplete] Registered shortcut ${currentShortcut}`); + } +} + +export async function registerAutocomplete(): Promise { + registerIpcHandlers(); + await registerShortcut(); +} + +export function unregisterAutocomplete(): void { + if (currentShortcut) globalShortcut.unregister(currentShortcut); + destroySuggestion(); +} + +export async function reregisterAutocomplete(): Promise { + unregisterAutocomplete(); + await registerShortcut(); +} diff --git a/surfsense_desktop/src/modules/autocomplete/screenshot.ts b/surfsense_desktop/src/modules/autocomplete/screenshot.ts new file mode 100644 index 000000000..22b7c1b14 --- /dev/null +++ b/surfsense_desktop/src/modules/autocomplete/screenshot.ts @@ -0,0 +1,27 @@ +import { desktopCapturer, screen } from 'electron'; + +/** + * Captures the primary display as a base64-encoded PNG data URL. + * Uses the display's actual size for full-resolution capture. 
+ */ +export async function captureScreen(): Promise { + try { + const primaryDisplay = screen.getPrimaryDisplay(); + const { width, height } = primaryDisplay.size; + + const sources = await desktopCapturer.getSources({ + types: ['screen'], + thumbnailSize: { width, height }, + }); + + if (!sources.length) { + console.error('[screenshot] No screen sources found'); + return null; + } + + return sources[0].thumbnail.toDataURL(); + } catch (err) { + console.error('[screenshot] Failed to capture screen:', err); + return null; + } +} diff --git a/surfsense_desktop/src/modules/autocomplete/suggestion-window.ts b/surfsense_desktop/src/modules/autocomplete/suggestion-window.ts new file mode 100644 index 000000000..8f61b2901 --- /dev/null +++ b/surfsense_desktop/src/modules/autocomplete/suggestion-window.ts @@ -0,0 +1,112 @@ +import { BrowserWindow, screen, shell } from 'electron'; +import path from 'path'; +import { getServerPort } from '../server'; + +const TOOLTIP_WIDTH = 420; +const TOOLTIP_HEIGHT = 38; +const MAX_HEIGHT = 400; + +let suggestionWindow: BrowserWindow | null = null; +let resizeTimer: ReturnType | null = null; +let cursorOrigin = { x: 0, y: 0 }; + +const CURSOR_GAP = 20; + +function positionOnScreen(cursorX: number, cursorY: number, w: number, h: number): { x: number; y: number } { + const display = screen.getDisplayNearestPoint({ x: cursorX, y: cursorY }); + const { x: dx, y: dy, width: dw, height: dh } = display.workArea; + + const x = Math.max(dx, Math.min(cursorX, dx + dw - w)); + + const spaceBelow = (dy + dh) - (cursorY + CURSOR_GAP); + const y = spaceBelow >= h + ? 
cursorY + CURSOR_GAP + : cursorY - h - CURSOR_GAP; + + return { x, y: Math.max(dy, y) }; +} + +function stopResizePolling(): void { + if (resizeTimer) { clearInterval(resizeTimer); resizeTimer = null; } +} + +function startResizePolling(win: BrowserWindow): void { + stopResizePolling(); + let lastH = 0; + resizeTimer = setInterval(async () => { + if (!win || win.isDestroyed()) { stopResizePolling(); return; } + try { + const h: number = await win.webContents.executeJavaScript( + `document.body.scrollHeight` + ); + if (h > 0 && h !== lastH) { + lastH = h; + const clamped = Math.min(h, MAX_HEIGHT); + const pos = positionOnScreen(cursorOrigin.x, cursorOrigin.y, TOOLTIP_WIDTH, clamped); + win.setBounds({ x: pos.x, y: pos.y, width: TOOLTIP_WIDTH, height: clamped }); + } + } catch {} + }, 150); +} + +export function getSuggestionWindow(): BrowserWindow | null { + return suggestionWindow; +} + +export function destroySuggestion(): void { + stopResizePolling(); + if (suggestionWindow && !suggestionWindow.isDestroyed()) { + suggestionWindow.close(); + } + suggestionWindow = null; +} + +export function createSuggestionWindow(x: number, y: number): BrowserWindow { + destroySuggestion(); + cursorOrigin = { x, y }; + + const pos = positionOnScreen(x, y, TOOLTIP_WIDTH, TOOLTIP_HEIGHT); + + suggestionWindow = new BrowserWindow({ + width: TOOLTIP_WIDTH, + height: TOOLTIP_HEIGHT, + x: pos.x, + y: pos.y, + frame: false, + transparent: true, + focusable: false, + alwaysOnTop: true, + skipTaskbar: true, + hasShadow: true, + type: 'panel', + webPreferences: { + preload: path.join(__dirname, 'preload.js'), + contextIsolation: true, + nodeIntegration: false, + sandbox: true, + }, + show: false, + }); + + suggestionWindow.loadURL(`http://localhost:${getServerPort()}/desktop/suggestion?t=${Date.now()}`); + + suggestionWindow.once('ready-to-show', () => { + suggestionWindow?.showInactive(); + if (suggestionWindow) startResizePolling(suggestionWindow); + }); + + 
suggestionWindow.webContents.setWindowOpenHandler(({ url }) => { + if (url.startsWith('http://localhost')) { + return { action: 'allow' }; + } + shell.openExternal(url); + return { action: 'deny' }; + }); + + suggestionWindow.on('closed', () => { + stopResizePolling(); + suggestionWindow = null; + }); + + return suggestionWindow; +} diff --git a/surfsense_desktop/src/modules/folder-watcher.ts b/surfsense_desktop/src/modules/folder-watcher.ts new file mode 100644 index 000000000..969dabe97 --- /dev/null +++ b/surfsense_desktop/src/modules/folder-watcher.ts @@ -0,0 +1,534 @@ +import { BrowserWindow, dialog } from 'electron'; +import chokidar, { type FSWatcher } from 'chokidar'; +import { randomUUID } from 'crypto'; +import * as path from 'path'; +import * as fs from 'fs'; +import { IPC_CHANNELS } from '../ipc/channels'; + +export interface WatchedFolderConfig { + path: string; + name: string; + excludePatterns: string[]; + fileExtensions: string[] | null; + rootFolderId: number | null; + searchSpaceId: number; + active: boolean; +} + +interface WatcherEntry { + config: WatchedFolderConfig; + watcher: FSWatcher | null; +} + +type MtimeMap = Record; +type FolderSyncAction = 'add' | 'change' | 'unlink'; + +export interface FolderSyncFileChangedEvent { + id: string; + rootFolderId: number | null; + searchSpaceId: number; + folderPath: string; + folderName: string; + relativePath: string; + fullPath: string; + action: FolderSyncAction; + timestamp: number; +} + +const STORE_KEY = 'watchedFolders'; +const OUTBOX_STORE_KEY = 'events'; +const MTIME_TOLERANCE_S = 1.0; + +let store: any = null; +let mtimeStore: any = null; +let outboxStore: any = null; +let watchers: Map = new Map(); + +/** + * In-memory cache of mtime maps, keyed by folder path. + * Persisted to electron-store on mutation. 
+ */ +const mtimeMaps: Map = new Map(); + +let rendererReady = false; +const outboxEvents: Map = new Map(); +let outboxLoaded = false; + +export function markRendererReady() { + rendererReady = true; +} + +async function getStore() { + if (!store) { + const { default: Store } = await import('electron-store'); + store = new Store({ + name: 'folder-watcher', + defaults: { + [STORE_KEY]: [] as WatchedFolderConfig[], + }, + }); + } + return store; +} + +async function getMtimeStore() { + if (!mtimeStore) { + const { default: Store } = await import('electron-store'); + mtimeStore = new Store({ + name: 'folder-mtime-maps', + defaults: {} as Record, + }); + } + return mtimeStore; +} + +async function getOutboxStore() { + if (!outboxStore) { + const { default: Store } = await import('electron-store'); + outboxStore = new Store({ + name: 'folder-sync-outbox', + defaults: { + [OUTBOX_STORE_KEY]: [] as FolderSyncFileChangedEvent[], + }, + }); + } + return outboxStore; +} + +function makeEventKey(event: Pick): string { + return `${event.folderPath}:${event.relativePath}`; +} + +function persistOutbox() { + getOutboxStore().then((s) => { + s.set(OUTBOX_STORE_KEY, Array.from(outboxEvents.values())); + }); +} + +async function loadOutbox() { + if (outboxLoaded) return; + const s = await getOutboxStore(); + const stored: FolderSyncFileChangedEvent[] = s.get(OUTBOX_STORE_KEY, []); + outboxEvents.clear(); + for (const event of stored) { + if (!event?.id || !event.folderPath || !event.relativePath) continue; + outboxEvents.set(makeEventKey(event), event); + } + outboxLoaded = true; +} + +function sendFileChangedEvent( + data: Omit +) { + const event: FolderSyncFileChangedEvent = { + id: randomUUID(), + ...data, + }; + + outboxEvents.set(makeEventKey(event), event); + persistOutbox(); + + if (rendererReady) { + sendToRenderer(IPC_CHANNELS.FOLDER_SYNC_FILE_CHANGED, event); + } +} + +function loadMtimeMap(folderPath: string): MtimeMap { + return mtimeMaps.get(folderPath) ?? 
{}; +} + +function persistMtimeMap(folderPath: string) { + const map = mtimeMaps.get(folderPath) ?? {}; + getMtimeStore().then((s) => s.set(folderPath, map)); +} + +function walkFolderMtimes(config: WatchedFolderConfig): MtimeMap { + const root = config.path; + const result: MtimeMap = {}; + const excludes = new Set(config.excludePatterns); + + function walk(dir: string) { + let entries: fs.Dirent[]; + try { + entries = fs.readdirSync(dir, { withFileTypes: true }); + } catch { + return; + } + + for (const entry of entries) { + const name = entry.name; + + if (name.startsWith('.') || excludes.has(name)) continue; + + const full = path.join(dir, name); + + if (entry.isDirectory()) { + walk(full); + } else if (entry.isFile()) { + if ( + config.fileExtensions && + config.fileExtensions.length > 0 + ) { + const ext = path.extname(name).toLowerCase(); + if (!config.fileExtensions.includes(ext)) continue; + } + + try { + const stat = fs.statSync(full); + const rel = path.relative(root, full); + result[rel] = stat.mtimeMs; + } catch { + // File may have been removed between readdir and stat + } + } + } + } + + walk(root); + return result; +} + +function getMainWindow(): BrowserWindow | null { + const windows = BrowserWindow.getAllWindows(); + return windows.length > 0 ? windows[0] : null; +} + +function sendToRenderer(channel: string, data: any) { + const win = getMainWindow(); + if (win && !win.isDestroyed()) { + win.webContents.send(channel, data); + } +} + +async function startWatcher(config: WatchedFolderConfig) { + if (watchers.has(config.path)) { + return; + } + + const ms = await getMtimeStore(); + const storedMap: MtimeMap = ms.get(config.path) ?? 
{}; + mtimeMaps.set(config.path, { ...storedMap }); + + const ignored = [ + /(^|[/\\])\../, // dotfiles by default + ...config.excludePatterns.map((p) => `**/${p}/**`), + ]; + + const watcher = chokidar.watch(config.path, { + persistent: true, + ignoreInitial: true, + awaitWriteFinish: { + stabilityThreshold: 500, + pollInterval: 100, + }, + ignored, + }); + + let ready = false; + + watcher.on('ready', () => { + ready = true; + + const currentMap = walkFolderMtimes(config); + const storedSnapshot = loadMtimeMap(config.path); + const now = Date.now(); + + // Track which files are unchanged so we can selectively update the mtime map + const unchangedMap: MtimeMap = {}; + + for (const [rel, currentMtime] of Object.entries(currentMap)) { + const storedMtime = storedSnapshot[rel]; + if (storedMtime === undefined) { + sendFileChangedEvent({ + rootFolderId: config.rootFolderId, + searchSpaceId: config.searchSpaceId, + folderPath: config.path, + folderName: config.name, + relativePath: rel, + fullPath: path.join(config.path, rel), + action: 'add', + timestamp: now, + }); + } else if (Math.abs(currentMtime - storedMtime) >= MTIME_TOLERANCE_S * 1000) { + sendFileChangedEvent({ + rootFolderId: config.rootFolderId, + searchSpaceId: config.searchSpaceId, + folderPath: config.path, + folderName: config.name, + relativePath: rel, + fullPath: path.join(config.path, rel), + action: 'change', + timestamp: now, + }); + } else { + unchangedMap[rel] = currentMtime; + } + } + + for (const rel of Object.keys(storedSnapshot)) { + if (!(rel in currentMap)) { + sendFileChangedEvent({ + rootFolderId: config.rootFolderId, + searchSpaceId: config.searchSpaceId, + folderPath: config.path, + folderName: config.name, + relativePath: rel, + fullPath: path.join(config.path, rel), + action: 'unlink', + timestamp: now, + }); + } + } + + // Only update the mtime map for unchanged files; changed files keep their + // stored mtime so they'll be re-detected if the app crashes before indexing. 
+ mtimeMaps.set(config.path, unchangedMap); + persistMtimeMap(config.path); + + sendToRenderer(IPC_CHANNELS.FOLDER_SYNC_WATCHER_READY, { + rootFolderId: config.rootFolderId, + folderPath: config.path, + }); + }); + + const handleFileEvent = (filePath: string, action: FolderSyncAction) => { + if (!ready) return; + + const relativePath = path.relative(config.path, filePath); + + if ( + config.fileExtensions && + config.fileExtensions.length > 0 + ) { + const ext = path.extname(filePath).toLowerCase(); + if (!config.fileExtensions.includes(ext)) return; + } + + const map = mtimeMaps.get(config.path); + if (map) { + if (action === 'unlink') { + delete map[relativePath]; + } else { + try { + map[relativePath] = fs.statSync(filePath).mtimeMs; + } catch { + // File may have been removed between event and stat + } + } + persistMtimeMap(config.path); + } + + sendFileChangedEvent({ + rootFolderId: config.rootFolderId, + searchSpaceId: config.searchSpaceId, + folderPath: config.path, + folderName: config.name, + relativePath, + fullPath: filePath, + action, + timestamp: Date.now(), + }); + }; + + watcher.on('add', (fp) => handleFileEvent(fp, 'add')); + watcher.on('change', (fp) => handleFileEvent(fp, 'change')); + watcher.on('unlink', (fp) => handleFileEvent(fp, 'unlink')); + + watchers.set(config.path, { config, watcher }); +} + +function stopWatcher(folderPath: string) { + persistMtimeMap(folderPath); + const entry = watchers.get(folderPath); + if (entry?.watcher) { + entry.watcher.close(); + } + watchers.delete(folderPath); +} + +export async function selectFolder(): Promise { + const result = await dialog.showOpenDialog({ + properties: ['openDirectory'], + title: 'Select a folder to watch', + }); + if (result.canceled || result.filePaths.length === 0) { + return null; + } + return result.filePaths[0]; +} + +export async function addWatchedFolder( + config: WatchedFolderConfig +): Promise { + const s = await getStore(); + const folders: WatchedFolderConfig[] = 
s.get(STORE_KEY, []); + + const existing = folders.findIndex((f: WatchedFolderConfig) => f.path === config.path); + if (existing >= 0) { + folders[existing] = config; + } else { + folders.push(config); + } + + s.set(STORE_KEY, folders); + + if (config.active) { + await startWatcher(config); + } + + return folders; +} + +export async function removeWatchedFolder( + folderPath: string +): Promise { + const s = await getStore(); + const folders: WatchedFolderConfig[] = s.get(STORE_KEY, []); + const updated = folders.filter((f: WatchedFolderConfig) => f.path !== folderPath); + s.set(STORE_KEY, updated); + + stopWatcher(folderPath); + + mtimeMaps.delete(folderPath); + const ms = await getMtimeStore(); + ms.delete(folderPath); + + return updated; +} + +export async function getWatchedFolders(): Promise { + const s = await getStore(); + return s.get(STORE_KEY, []); +} + +export async function getWatcherStatus(): Promise< + { path: string; active: boolean; watching: boolean }[] +> { + const s = await getStore(); + const folders: WatchedFolderConfig[] = s.get(STORE_KEY, []); + return folders.map((f: WatchedFolderConfig) => ({ + path: f.path, + active: f.active, + watching: watchers.has(f.path), + })); +} + +export async function getPendingFileEvents(): Promise { + await loadOutbox(); + return Array.from(outboxEvents.values()).sort((a, b) => a.timestamp - b.timestamp); +} + +export async function acknowledgeFileEvents(eventIds: string[]): Promise<{ acknowledged: number }> { + if (!eventIds || eventIds.length === 0) return { acknowledged: 0 }; + await loadOutbox(); + + const ackSet = new Set(eventIds); + let acknowledged = 0; + + for (const [key, event] of outboxEvents.entries()) { + if (ackSet.has(event.id)) { + outboxEvents.delete(key); + acknowledged += 1; + } + } + + if (acknowledged > 0) { + persistOutbox(); + } + + return { acknowledged }; +} + +export async function pauseWatcher(): Promise { + for (const [, entry] of watchers) { + if (entry.watcher) { + await 
entry.watcher.close(); + entry.watcher = null; + } + } +} + +export async function resumeWatcher(): Promise { + for (const [, entry] of watchers) { + if (!entry.watcher && entry.config.active) { + await startWatcher(entry.config); + } + } +} + +export async function registerFolderWatcher(): Promise { + await loadOutbox(); + const s = await getStore(); + const folders: WatchedFolderConfig[] = s.get(STORE_KEY, []); + + for (const config of folders) { + if (config.active && fs.existsSync(config.path)) { + await startWatcher(config); + } + } +} + +export async function unregisterFolderWatcher(): Promise { + for (const [folderPath] of watchers) { + stopWatcher(folderPath); + } + watchers.clear(); +} + +export async function browseFiles(): Promise { + const result = await dialog.showOpenDialog({ + properties: ['openFile', 'multiSelections'], + title: 'Select files', + }); + if (result.canceled || result.filePaths.length === 0) return null; + return result.filePaths; +} + +const MIME_MAP: Record = { + '.pdf': 'application/pdf', + '.docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', + '.xlsx': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', + '.pptx': 'application/vnd.openxmlformats-officedocument.presentationml.presentation', + '.html': 'text/html', '.htm': 'text/html', + '.csv': 'text/csv', + '.txt': 'text/plain', + '.md': 'text/markdown', '.markdown': 'text/markdown', + '.mp3': 'audio/mpeg', '.mpeg': 'audio/mpeg', '.mpga': 'audio/mpeg', + '.mp4': 'audio/mp4', '.m4a': 'audio/mp4', + '.wav': 'audio/wav', + '.webm': 'audio/webm', + '.jpg': 'image/jpeg', '.jpeg': 'image/jpeg', + '.png': 'image/png', + '.bmp': 'image/bmp', + '.webp': 'image/webp', + '.tiff': 'image/tiff', + '.doc': 'application/msword', + '.rtf': 'application/rtf', + '.xml': 'application/xml', + '.epub': 'application/epub+zip', + '.xls': 'application/vnd.ms-excel', + '.ppt': 'application/vnd.ms-powerpoint', + '.eml': 'message/rfc822', + '.odt': 
'application/vnd.oasis.opendocument.text', + '.msg': 'application/vnd.ms-outlook', +}; + +export interface LocalFileData { + name: string; + data: ArrayBuffer; + mimeType: string; + size: number; +} + +export function readLocalFiles(filePaths: string[]): LocalFileData[] { + return filePaths.map((p) => { + const buf = fs.readFileSync(p); + const ext = path.extname(p).toLowerCase(); + return { + name: path.basename(p), + data: buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength), + mimeType: MIME_MAP[ext] || 'application/octet-stream', + size: buf.byteLength, + }; + }); +} diff --git a/surfsense_desktop/src/modules/permissions.ts b/surfsense_desktop/src/modules/permissions.ts new file mode 100644 index 000000000..02786113e --- /dev/null +++ b/surfsense_desktop/src/modules/permissions.ts @@ -0,0 +1,51 @@ +import { app } from 'electron'; + +type PermissionStatus = 'authorized' | 'denied' | 'not determined' | 'restricted' | 'limited'; + +export interface PermissionsStatus { + accessibility: PermissionStatus; + screenRecording: PermissionStatus; +} + +function isMac(): boolean { + return process.platform === 'darwin'; +} + +function getNodeMacPermissions() { + return require('node-mac-permissions'); +} + +export function getPermissionsStatus(): PermissionsStatus { + if (!isMac()) { + return { accessibility: 'authorized', screenRecording: 'authorized' }; + } + + const perms = getNodeMacPermissions(); + return { + accessibility: perms.getAuthStatus('accessibility'), + screenRecording: perms.getAuthStatus('screen'), + }; +} + +export function requestAccessibility(): void { + if (!isMac()) return; + const perms = getNodeMacPermissions(); + perms.askForAccessibilityAccess(); +} + +export function hasScreenRecordingPermission(): boolean { + if (!isMac()) return true; + const perms = getNodeMacPermissions(); + return perms.getAuthStatus('screen') === 'authorized'; +} + +export function requestScreenRecording(): void { + if (!isMac()) return; + const perms = 
getNodeMacPermissions(); + perms.askForScreenCaptureAccess(); +} + +export function restartApp(): void { + app.relaunch(); + app.exit(0); +} diff --git a/surfsense_desktop/src/modules/platform.ts b/surfsense_desktop/src/modules/platform.ts index 37e126799..2b4d1f4a1 100644 --- a/surfsense_desktop/src/modules/platform.ts +++ b/surfsense_desktop/src/modules/platform.ts @@ -1,16 +1,20 @@ import { execSync } from 'child_process'; import { systemPreferences } from 'electron'; +const EXEC_OPTS = { windowsHide: true } as const; + export function getFrontmostApp(): string { try { if (process.platform === 'darwin') { return execSync( - 'osascript -e \'tell application "System Events" to get name of first application process whose frontmost is true\'' + 'osascript -e \'tell application "System Events" to get name of first application process whose frontmost is true\'', + EXEC_OPTS, ).toString().trim(); } if (process.platform === 'win32') { return execSync( - 'powershell -command "Add-Type \'using System; using System.Runtime.InteropServices; public class W { [DllImport(\\\"user32.dll\\\")] public static extern IntPtr GetForegroundWindow(); }\'; (Get-Process | Where-Object { $_.MainWindowHandle -eq [W]::GetForegroundWindow() }).ProcessName"' + 'powershell -NoProfile -NonInteractive -command "Add-Type \'using System; using System.Runtime.InteropServices; public class W { [DllImport(\\\"user32.dll\\\")] public static extern IntPtr GetForegroundWindow(); }\'; (Get-Process | Where-Object { $_.MainWindowHandle -eq [W]::GetForegroundWindow() }).ProcessName"', + EXEC_OPTS, ).toString().trim(); } } catch { @@ -19,33 +23,25 @@ export function getFrontmostApp(): string { return ''; } -export function getSelectedText(): string { - try { - if (process.platform === 'darwin') { - return execSync( - 'osascript -e \'tell application "System Events" to get value of attribute "AXSelectedText" of focused UI element of first application process whose frontmost is true\'' - ).toString().trim(); - 
} - // Windows: no reliable accessibility API for selected text across apps - } catch { - return ''; - } - return ''; -} - -export function simulateCopy(): void { - if (process.platform === 'darwin') { - execSync('osascript -e \'tell application "System Events" to keystroke "c" using command down\''); - } else if (process.platform === 'win32') { - execSync('powershell -command "Add-Type -AssemblyName System.Windows.Forms; [System.Windows.Forms.SendKeys]::SendWait(\'^c\')"'); - } -} - export function simulatePaste(): void { if (process.platform === 'darwin') { - execSync('osascript -e \'tell application "System Events" to keystroke "v" using command down\''); + execSync('osascript -e \'tell application "System Events" to keystroke "v" using command down\'', EXEC_OPTS); } else if (process.platform === 'win32') { - execSync('powershell -command "Add-Type -AssemblyName System.Windows.Forms; [System.Windows.Forms.SendKeys]::SendWait(\'^v\')"'); + execSync('powershell -NoProfile -NonInteractive -command "Add-Type -AssemblyName System.Windows.Forms; [System.Windows.Forms.SendKeys]::SendWait(\'^v\')"', EXEC_OPTS); + } +} + +export function simulateCopy(): boolean { + try { + if (process.platform === 'darwin') { + execSync('osascript -e \'tell application "System Events" to keystroke "c" using command down\'', EXEC_OPTS); + } else if (process.platform === 'win32') { + execSync('powershell -NoProfile -NonInteractive -command "Add-Type -AssemblyName System.Windows.Forms; [System.Windows.Forms.SendKeys]::SendWait(\'^c\')"', EXEC_OPTS); + } + return true; + } catch (err) { + console.error('[simulateCopy] Failed:', err); + return false; } } @@ -53,3 +49,28 @@ export function checkAccessibilityPermission(): boolean { if (process.platform !== 'darwin') return true; return systemPreferences.isTrustedAccessibilityClient(true); } + +export function getWindowTitle(): string { + try { + if (process.platform === 'darwin') { + return execSync( + 'osascript -e \'tell application "System 
Events" to get title of front window of first application process whose frontmost is true\'', + EXEC_OPTS, + ).toString().trim(); + } + if (process.platform === 'win32') { + return execSync( + 'powershell -NoProfile -NonInteractive -command "(Get-Process | Where-Object { $_.MainWindowHandle -eq (Add-Type -MemberDefinition \'[DllImport(\\\"user32.dll\\\")] public static extern IntPtr GetForegroundWindow();\' -Name W -PassThru)::GetForegroundWindow() }).MainWindowTitle"', + EXEC_OPTS, + ).toString().trim(); + } + } catch { + return ''; + } + return ''; +} + +export function hasAccessibilityPermission(): boolean { + if (process.platform !== 'darwin') return true; + return systemPreferences.isTrustedAccessibilityClient(false); +} diff --git a/surfsense_desktop/src/modules/quick-ask.ts b/surfsense_desktop/src/modules/quick-ask.ts index 52bfc6054..b31ae1bcd 100644 --- a/surfsense_desktop/src/modules/quick-ask.ts +++ b/surfsense_desktop/src/modules/quick-ask.ts @@ -1,13 +1,17 @@ import { BrowserWindow, clipboard, globalShortcut, ipcMain, screen, shell } from 'electron'; import path from 'path'; import { IPC_CHANNELS } from '../ipc/channels'; -import { checkAccessibilityPermission, getFrontmostApp, simulatePaste } from './platform'; +import { checkAccessibilityPermission, getFrontmostApp, simulateCopy, simulatePaste } from './platform'; import { getServerPort } from './server'; +import { getShortcuts } from './shortcuts'; +import { getActiveSearchSpaceId } from './active-search-space'; +import { trackEvent } from './analytics'; -const SHORTCUT = 'CommandOrControl+Option+S'; +let currentShortcut = ''; let quickAskWindow: BrowserWindow | null = null; let pendingText = ''; let pendingMode = ''; +let pendingSearchSpaceId: string | null = null; let sourceApp = ''; let savedClipboard = ''; @@ -52,7 +56,9 @@ function createQuickAskWindow(x: number, y: number): BrowserWindow { skipTaskbar: true, }); - quickAskWindow.loadURL(`http://localhost:${getServerPort()}/dashboard`); + const 
spaceId = pendingSearchSpaceId; + const route = spaceId ? `/dashboard/${spaceId}/new-chat` : '/dashboard'; + quickAskWindow.loadURL(`http://localhost:${getServerPort()}${route}?quickAssist=true`); quickAskWindow.once('ready-to-show', () => { quickAskWindow?.show(); @@ -77,29 +83,55 @@ function createQuickAskWindow(x: number, y: number): BrowserWindow { return quickAskWindow; } -export function registerQuickAsk(): void { - const ok = globalShortcut.register(SHORTCUT, () => { - if (quickAskWindow && !quickAskWindow.isDestroyed()) { - destroyQuickAsk(); - return; - } +async function openQuickAsk(text: string): Promise { + pendingText = text; + pendingMode = 'quick-assist'; + pendingSearchSpaceId = await getActiveSearchSpaceId(); + const cursor = screen.getCursorScreenPoint(); + const pos = clampToScreen(cursor.x, cursor.y, 450, 750); + createQuickAskWindow(pos.x, pos.y); +} - sourceApp = getFrontmostApp(); - savedClipboard = clipboard.readText(); +async function quickAskHandler(): Promise { + console.log('[quick-ask] Handler triggered'); - const text = savedClipboard.trim(); - if (!text) return; - - pendingText = text; - const cursor = screen.getCursorScreenPoint(); - const pos = clampToScreen(cursor.x, cursor.y, 450, 750); - createQuickAskWindow(pos.x, pos.y); - }); - - if (!ok) { - console.log(`Quick-ask: failed to register ${SHORTCUT}`); + if (quickAskWindow && !quickAskWindow.isDestroyed()) { + console.log('[quick-ask] Window already open, closing'); + destroyQuickAsk(); + return; } + if (!checkAccessibilityPermission()) { + console.log('[quick-ask] Accessibility permission denied'); + return; + } + + savedClipboard = clipboard.readText(); + console.log('[quick-ask] Saved clipboard length:', savedClipboard.length); + + const copyOk = simulateCopy(); + console.log('[quick-ask] simulateCopy result:', copyOk); + + await new Promise((r) => setTimeout(r, 300)); + + const afterCopy = clipboard.readText(); + const selected = afterCopy.trim(); + console.log('[quick-ask] 
Clipboard after copy length:', afterCopy.length, 'changed:', afterCopy !== savedClipboard); + + const text = selected || savedClipboard.trim(); + + sourceApp = getFrontmostApp(); + console.log('[quick-ask] Source app:', sourceApp, '| Opening Quick Assist with', text.length, 'chars', selected ? '(selected)' : text ? '(clipboard fallback)' : '(empty)'); + trackEvent('desktop_quick_ask_opened', { has_selected_text: !!selected }); + openQuickAsk(text); +} + +let ipcRegistered = false; + +function registerIpcHandlers(): void { + if (ipcRegistered) return; + ipcRegistered = true; + ipcMain.handle(IPC_CHANNELS.QUICK_ASK_TEXT, () => { const text = pendingText; pendingText = ''; @@ -122,6 +154,7 @@ export function registerQuickAsk(): void { if (!checkAccessibilityPermission()) return; + trackEvent('desktop_quick_ask_replaced'); clipboard.writeText(text); destroyQuickAsk(); @@ -136,6 +169,24 @@ export function registerQuickAsk(): void { }); } -export function unregisterQuickAsk(): void { - globalShortcut.unregister(SHORTCUT); +async function registerShortcut(): Promise { + const shortcuts = await getShortcuts(); + currentShortcut = shortcuts.quickAsk; + + const ok = globalShortcut.register(currentShortcut, () => { quickAskHandler(); }); + console.log(`[quick-ask] Register ${currentShortcut}: ${ok ? 
'OK' : 'FAILED'}`); +} + +export async function registerQuickAsk(): Promise { + registerIpcHandlers(); + await registerShortcut(); +} + +export function unregisterQuickAsk(): void { + if (currentShortcut) globalShortcut.unregister(currentShortcut); +} + +export async function reregisterQuickAsk(): Promise { + unregisterQuickAsk(); + await registerShortcut(); } diff --git a/surfsense_desktop/src/modules/shortcuts.ts b/surfsense_desktop/src/modules/shortcuts.ts new file mode 100644 index 000000000..6948a005e --- /dev/null +++ b/surfsense_desktop/src/modules/shortcuts.ts @@ -0,0 +1,44 @@ +export interface ShortcutConfig { + generalAssist: string; + quickAsk: string; + autocomplete: string; +} + +const DEFAULTS: ShortcutConfig = { + generalAssist: 'CommandOrControl+Shift+S', + quickAsk: 'CommandOrControl+Alt+S', + autocomplete: 'CommandOrControl+Shift+Space', +}; + +const STORE_KEY = 'shortcuts'; +// eslint-disable-next-line @typescript-eslint/no-explicit-any -- lazily imported ESM module; matches folder-watcher.ts pattern +let store: any = null; + +async function getStore() { + if (!store) { + const { default: Store } = await import('electron-store'); + store = new Store({ + name: 'keyboard-shortcuts', + defaults: { [STORE_KEY]: DEFAULTS }, + }); + } + return store; +} + +export async function getShortcuts(): Promise { + const s = await getStore(); + const stored = s.get(STORE_KEY) as Partial | undefined; + return { ...DEFAULTS, ...stored }; +} + +export async function setShortcuts(config: Partial): Promise { + const s = await getStore(); + const current = (s.get(STORE_KEY) as ShortcutConfig) ?? 
DEFAULTS; + const merged = { ...current, ...config }; + s.set(STORE_KEY, merged); + return merged; +} + +export function getDefaults(): ShortcutConfig { + return { ...DEFAULTS }; +} diff --git a/surfsense_desktop/src/modules/tray.ts b/surfsense_desktop/src/modules/tray.ts new file mode 100644 index 000000000..1749145a1 --- /dev/null +++ b/surfsense_desktop/src/modules/tray.ts @@ -0,0 +1,77 @@ +import { app, globalShortcut, Menu, nativeImage, Tray } from 'electron'; +import path from 'path'; +import { getMainWindow, createMainWindow } from './window'; +import { getShortcuts } from './shortcuts'; + +let tray: Tray | null = null; +let currentShortcut: string | null = null; + +function getTrayIcon(): nativeImage { + const iconName = process.platform === 'win32' ? 'icon.ico' : 'icon.png'; + const iconPath = app.isPackaged + ? path.join(process.resourcesPath, 'assets', iconName) + : path.join(__dirname, '..', 'assets', iconName); + const img = nativeImage.createFromPath(iconPath); + return img.resize({ width: 16, height: 16 }); +} + +function showMainWindow(): void { + let win = getMainWindow(); + if (!win || win.isDestroyed()) { + win = createMainWindow('/dashboard'); + } else { + win.show(); + win.focus(); + } +} + +function registerShortcut(accelerator: string): void { + if (currentShortcut) { + globalShortcut.unregister(currentShortcut); + currentShortcut = null; + } + if (!accelerator) return; + try { + const ok = globalShortcut.register(accelerator, showMainWindow); + if (ok) { + currentShortcut = accelerator; + } else { + console.warn(`[tray] Failed to register General Assist shortcut: ${accelerator}`); + } + } catch (err) { + console.error(`[tray] Error registering General Assist shortcut:`, err); + } +} + +export async function createTray(): Promise { + if (tray) return; + + tray = new Tray(getTrayIcon()); + tray.setToolTip('SurfSense'); + + const contextMenu = Menu.buildFromTemplate([ + { label: 'Open SurfSense', click: showMainWindow }, + { type: 'separator' 
}, + { label: 'Quit', click: () => { app.exit(0); } }, + ]); + + tray.setContextMenu(contextMenu); + tray.on('double-click', showMainWindow); + + const shortcuts = await getShortcuts(); + registerShortcut(shortcuts.generalAssist); +} + +export async function reregisterGeneralAssist(): Promise { + const shortcuts = await getShortcuts(); + registerShortcut(shortcuts.generalAssist); +} + +export function destroyTray(): void { + if (currentShortcut) { + globalShortcut.unregister(currentShortcut); + currentShortcut = null; + } + tray?.destroy(); + tray = null; +} diff --git a/surfsense_desktop/src/modules/window.ts b/surfsense_desktop/src/modules/window.ts index 245814cad..9cd216501 100644 --- a/surfsense_desktop/src/modules/window.ts +++ b/surfsense_desktop/src/modules/window.ts @@ -2,6 +2,7 @@ import { app, BrowserWindow, shell, session } from 'electron'; import path from 'path'; import { showErrorDialog } from './errors'; import { getServerPort } from './server'; +import { setActiveSearchSpaceId } from './active-search-space'; const isDev = !app.isPackaged; const HOSTED_FRONTEND_URL = process.env.HOSTED_FRONTEND_URL as string; @@ -12,7 +13,7 @@ export function getMainWindow(): BrowserWindow | null { return mainWindow; } -export function createMainWindow(): BrowserWindow { +export function createMainWindow(initialPath = '/dashboard'): BrowserWindow { mainWindow = new BrowserWindow({ width: 1280, height: 800, @@ -33,7 +34,7 @@ export function createMainWindow(): BrowserWindow { mainWindow?.show(); }); - mainWindow.loadURL(`http://localhost:${getServerPort()}/dashboard`); + mainWindow.loadURL(`http://localhost:${getServerPort()}${initialPath}`); mainWindow.webContents.setWindowOpenHandler(({ url }) => { if (url.startsWith('http://localhost')) { @@ -55,6 +56,16 @@ export function createMainWindow(): BrowserWindow { showErrorDialog('Page failed to load', new Error(`${errorDescription} (${errorCode})\n${validatedURL}`)); }); + // Auto-sync active search space from URL 
navigation + const syncSearchSpace = (url: string) => { + const match = url.match(/\/dashboard\/(\d+)/); + if (match) { + setActiveSearchSpaceId(match[1]); + } + }; + mainWindow.webContents.on('did-navigate', (_event, url) => syncSearchSpace(url)); + mainWindow.webContents.on('did-navigate-in-page', (_event, url) => syncSearchSpace(url)); + if (isDev) { mainWindow.webContents.openDevTools(); } diff --git a/surfsense_desktop/src/preload.ts b/surfsense_desktop/src/preload.ts index 264ec25b3..4d9537c91 100644 --- a/surfsense_desktop/src/preload.ts +++ b/surfsense_desktop/src/preload.ts @@ -21,4 +21,66 @@ contextBridge.exposeInMainWorld('electronAPI', { setQuickAskMode: (mode: string) => ipcRenderer.invoke(IPC_CHANNELS.SET_QUICK_ASK_MODE, mode), getQuickAskMode: () => ipcRenderer.invoke(IPC_CHANNELS.GET_QUICK_ASK_MODE), replaceText: (text: string) => ipcRenderer.invoke(IPC_CHANNELS.REPLACE_TEXT, text), + // Permissions + getPermissionsStatus: () => ipcRenderer.invoke(IPC_CHANNELS.GET_PERMISSIONS_STATUS), + requestAccessibility: () => ipcRenderer.invoke(IPC_CHANNELS.REQUEST_ACCESSIBILITY), + requestScreenRecording: () => ipcRenderer.invoke(IPC_CHANNELS.REQUEST_SCREEN_RECORDING), + restartApp: () => ipcRenderer.invoke(IPC_CHANNELS.RESTART_APP), + // Autocomplete + onAutocompleteContext: (callback: (data: { screenshot: string; searchSpaceId?: string; appName?: string; windowTitle?: string }) => void) => { + const listener = (_event: unknown, data: { screenshot: string; searchSpaceId?: string; appName?: string; windowTitle?: string }) => callback(data); + ipcRenderer.on(IPC_CHANNELS.AUTOCOMPLETE_CONTEXT, listener); + return () => { + ipcRenderer.removeListener(IPC_CHANNELS.AUTOCOMPLETE_CONTEXT, listener); + }; + }, + acceptSuggestion: (text: string) => ipcRenderer.invoke(IPC_CHANNELS.ACCEPT_SUGGESTION, text), + dismissSuggestion: () => ipcRenderer.invoke(IPC_CHANNELS.DISMISS_SUGGESTION), + setAutocompleteEnabled: (enabled: boolean) => 
ipcRenderer.invoke(IPC_CHANNELS.SET_AUTOCOMPLETE_ENABLED, enabled), + getAutocompleteEnabled: () => ipcRenderer.invoke(IPC_CHANNELS.GET_AUTOCOMPLETE_ENABLED), + + // Folder sync + selectFolder: () => ipcRenderer.invoke(IPC_CHANNELS.FOLDER_SYNC_SELECT_FOLDER), + addWatchedFolder: (config: any) => ipcRenderer.invoke(IPC_CHANNELS.FOLDER_SYNC_ADD_FOLDER, config), + removeWatchedFolder: (folderPath: string) => ipcRenderer.invoke(IPC_CHANNELS.FOLDER_SYNC_REMOVE_FOLDER, folderPath), + getWatchedFolders: () => ipcRenderer.invoke(IPC_CHANNELS.FOLDER_SYNC_GET_FOLDERS), + getWatcherStatus: () => ipcRenderer.invoke(IPC_CHANNELS.FOLDER_SYNC_GET_STATUS), + onFileChanged: (callback: (data: any) => void) => { + const listener = (_event: unknown, data: any) => callback(data); + ipcRenderer.on(IPC_CHANNELS.FOLDER_SYNC_FILE_CHANGED, listener); + return () => { + ipcRenderer.removeListener(IPC_CHANNELS.FOLDER_SYNC_FILE_CHANGED, listener); + }; + }, + onWatcherReady: (callback: (data: any) => void) => { + const listener = (_event: unknown, data: any) => callback(data); + ipcRenderer.on(IPC_CHANNELS.FOLDER_SYNC_WATCHER_READY, listener); + return () => { + ipcRenderer.removeListener(IPC_CHANNELS.FOLDER_SYNC_WATCHER_READY, listener); + }; + }, + pauseWatcher: () => ipcRenderer.invoke(IPC_CHANNELS.FOLDER_SYNC_PAUSE), + resumeWatcher: () => ipcRenderer.invoke(IPC_CHANNELS.FOLDER_SYNC_RESUME), + signalRendererReady: () => ipcRenderer.invoke(IPC_CHANNELS.FOLDER_SYNC_RENDERER_READY), + getPendingFileEvents: () => ipcRenderer.invoke(IPC_CHANNELS.FOLDER_SYNC_GET_PENDING_EVENTS), + acknowledgeFileEvents: (eventIds: string[]) => ipcRenderer.invoke(IPC_CHANNELS.FOLDER_SYNC_ACK_EVENTS, eventIds), + + // Browse files via native dialog + browseFiles: () => ipcRenderer.invoke(IPC_CHANNELS.BROWSE_FILES), + readLocalFiles: (paths: string[]) => ipcRenderer.invoke(IPC_CHANNELS.READ_LOCAL_FILES, paths), + + // Auth token sync across windows + getAuthTokens: () => 
ipcRenderer.invoke(IPC_CHANNELS.GET_AUTH_TOKENS), + setAuthTokens: (bearer: string, refresh: string) => + ipcRenderer.invoke(IPC_CHANNELS.SET_AUTH_TOKENS, { bearer, refresh }), + + // Keyboard shortcut configuration + getShortcuts: () => ipcRenderer.invoke(IPC_CHANNELS.GET_SHORTCUTS), + setShortcuts: (config: Record) => + ipcRenderer.invoke(IPC_CHANNELS.SET_SHORTCUTS, config), + + // Active search space + getActiveSearchSpace: () => ipcRenderer.invoke(IPC_CHANNELS.GET_ACTIVE_SEARCH_SPACE), + setActiveSearchSpace: (id: string) => + ipcRenderer.invoke(IPC_CHANNELS.SET_ACTIVE_SEARCH_SPACE, id), }); diff --git a/surfsense_web/.env.example b/surfsense_web/.env.example index b674d8e9b..b448c1f71 100644 --- a/surfsense_web/.env.example +++ b/surfsense_web/.env.example @@ -7,4 +7,7 @@ NEXT_PUBLIC_ZERO_CACHE_URL=http://localhost:4848 DATABASE_URL=postgresql://postgres:[YOUR-PASSWORD]@db.sdsf.supabase.co:5432/postgres # Deployment mode (optional) -NEXT_PUBLIC_DEPLOYMENT_MODE="self-hosted" or "cloud" \ No newline at end of file +NEXT_PUBLIC_DEPLOYMENT_MODE="self-hosted" or "cloud" + +# PostHog analytics (optional, leave empty to disable) +NEXT_PUBLIC_POSTHOG_KEY= \ No newline at end of file diff --git a/surfsense_web/app/(home)/changelog/page.tsx b/surfsense_web/app/(home)/changelog/page.tsx index 8d38cb687..c14218dab 100644 --- a/surfsense_web/app/(home)/changelog/page.tsx +++ b/surfsense_web/app/(home)/changelog/page.tsx @@ -29,7 +29,7 @@ interface ChangelogPageItem { export default async function ChangelogPage() { const allPages = source.getPages() as ChangelogPageItem[]; - const sortedChangelogs = allPages.sort((a, b) => { + const sortedChangelogs = allPages.toSorted((a, b) => { const dateA = new Date(a.data.date).getTime(); const dateB = new Date(b.data.date).getTime(); return dateB - dateA; diff --git a/surfsense_web/app/(home)/login/LocalLoginForm.tsx b/surfsense_web/app/(home)/login/LocalLoginForm.tsx index c40e16915..07a4db4d3 100644 --- 
a/surfsense_web/app/(home)/login/LocalLoginForm.tsx +++ b/surfsense_web/app/(home)/login/LocalLoginForm.tsx @@ -161,10 +161,10 @@ export function LocalLoginForm() { placeholder="you@example.com" value={username} onChange={(e) => setUsername(e.target.value)} - className={`mt-1 block w-full rounded-md border px-3 py-1.5 md:py-2 shadow-sm focus:outline-none focus:ring-2 focus:ring-offset-2 bg-background text-foreground transition-all ${ + className={`mt-1 block w-full rounded-md border px-3 py-1.5 md:py-2 shadow-sm focus:outline-none focus:ring-1 bg-background text-foreground transition-all ${ error.title - ? "border-destructive focus:border-destructive focus:ring-destructive" - : "border-border focus:border-primary focus:ring-primary" + ? "border-destructive focus:border-destructive focus:ring-destructive/40" + : "border-border focus:border-primary focus:ring-primary/40" }`} disabled={isLoggingIn} /> @@ -183,10 +183,10 @@ export function LocalLoginForm() { placeholder="Enter your password" value={password} onChange={(e) => setPassword(e.target.value)} - className={`mt-1 block w-full rounded-md border pr-10 px-3 py-1.5 md:py-2 shadow-sm focus:outline-none focus:ring-2 focus:ring-offset-2 bg-background text-foreground transition-all ${ + className={`mt-1 block w-full rounded-md border pr-10 px-3 py-1.5 md:py-2 shadow-sm focus:outline-none focus:ring-1 bg-background text-foreground transition-all ${ error.title - ? "border-destructive focus:border-destructive focus:ring-destructive" - : "border-border focus:border-primary focus:ring-primary" + ? 
"border-destructive focus:border-destructive focus:ring-destructive/40" + : "border-border focus:border-primary focus:ring-primary/40" }`} disabled={isLoggingIn} /> diff --git a/surfsense_web/app/(home)/register/page.tsx b/surfsense_web/app/(home)/register/page.tsx index a1f4e6f0f..f9e387cf7 100644 --- a/surfsense_web/app/(home)/register/page.tsx +++ b/surfsense_web/app/(home)/register/page.tsx @@ -229,10 +229,7 @@ export default function RegisterPage() {
-
-
diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx index 20d4a8e53..7308e1e26 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/connector-edit-view.tsx @@ -272,7 +272,7 @@ export const ConnectorEditView: FC = ({ {/* AI Summary toggle */} - {/* Date range selector - not shown for file-based connectors (Drive, Dropbox, OneDrive), Webcrawler, or GitHub (indexes full repo snapshots) */} + {/* Date range selector - not shown for file-based connectors (Drive, Dropbox, OneDrive), Webcrawler, GitHub, or Local Folder */} {connector.connector_type !== "GOOGLE_DRIVE_CONNECTOR" && connector.connector_type !== "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" && connector.connector_type !== "DROPBOX_CONNECTOR" && @@ -293,9 +293,7 @@ export const ConnectorEditView: FC = ({ /> )} - {/* Periodic sync - shown for all indexable connectors */} {(() => { - // Check if Google Drive (regular or Composio) has folders/files selected const isGoogleDrive = connector.connector_type === "GOOGLE_DRIVE_CONNECTOR"; const isComposioGoogleDrive = connector.connector_type === "COMPOSIO_GOOGLE_DRIVE_CONNECTOR"; @@ -371,16 +369,10 @@ export const ConnectorEditView: FC = ({ size="sm" onClick={handleDisconnectConfirm} disabled={isDisconnecting} - className="text-xs sm:text-sm flex-1 sm:flex-initial h-10 sm:h-auto py-2 sm:py-2" + className="relative text-xs sm:text-sm flex-1 sm:flex-initial h-10 sm:h-auto py-2 sm:py-2" > - {isDisconnecting ? 
( - <> - - Disconnecting - - ) : ( - "Confirm Disconnect" - )} + Confirm Disconnect + {isDisconnecting && } )} diff --git a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/indexing-configuration-view.tsx b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/indexing-configuration-view.tsx index 564cb87ee..e583cbe17 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/indexing-configuration-view.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/connector-configs/views/indexing-configuration-view.tsx @@ -158,7 +158,7 @@ export const IndexingConfigurationView: FC = ({ {/* AI Summary toggle */} - {/* Date range selector - not shown for file-based connectors (Drive, Dropbox, OneDrive), Webcrawler, or GitHub (indexes full repo snapshots) */} + {/* Date range selector - not shown for file-based connectors (Drive, Dropbox, OneDrive), Webcrawler, GitHub, or Local Folder */} {config.connectorType !== "GOOGLE_DRIVE_CONNECTOR" && config.connectorType !== "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" && config.connectorType !== "DROPBOX_CONNECTOR" && @@ -179,9 +179,10 @@ export const IndexingConfigurationView: FC = ({ /> )} - {/* Periodic sync - not shown for Google Drive (regular and Composio) */} {config.connectorType !== "GOOGLE_DRIVE_CONNECTOR" && - config.connectorType !== "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" && ( + config.connectorType !== "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" && + config.connectorType !== "DROPBOX_CONNECTOR" && + config.connectorType !== "ONEDRIVE_CONNECTOR" && ( = ({ const hasActiveConnectors = filteredOAuthConnectorTypes.length > 0 || filteredNonOAuthConnectors.length > 0; + const hasFilteredResults = hasActiveConnectors || standaloneDocuments.length > 0; + return ( - {hasSources ? ( + {hasSources && !hasFilteredResults && searchQuery ? ( +
+ +

No connectors found

+

Try a different search term

+
+ ) : hasSources ? (
{/* Active Connectors Section */} {hasActiveConnectors && ( @@ -302,7 +310,7 @@ export const ActiveConnectorsTab: FC = ({ ) : (
- +

No active sources

diff --git a/surfsense_web/components/assistant-ui/connector-popup/tabs/all-connectors-tab.tsx b/surfsense_web/components/assistant-ui/connector-popup/tabs/all-connectors-tab.tsx index ad2418865..bbbf6dd57 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/tabs/all-connectors-tab.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/tabs/all-connectors-tab.tsx @@ -1,8 +1,10 @@ "use client"; +import { Search } from "lucide-react"; import type { FC } from "react"; import { EnumConnectorName } from "@/contracts/enums/connector"; import type { SearchSourceConnector } from "@/contracts/types/connector.types"; +import { usePlatform } from "@/hooks/use-platform"; import { isSelfHosted } from "@/lib/env-config"; import { ConnectorCard } from "../components/connector-card"; import { @@ -74,31 +76,27 @@ export const AllConnectorsTab: FC = ({ onManage, onViewAccountsList, }) => { - // Check if self-hosted mode (for showing self-hosted only connectors) const selfHosted = isSelfHosted(); + const { isDesktop } = usePlatform(); + + const matchesSearch = (title: string, description: string) => + title.toLowerCase().includes(searchQuery.toLowerCase()) || + description.toLowerCase().includes(searchQuery.toLowerCase()); + + const passesDeploymentFilter = (c: { selfHostedOnly?: boolean; desktopOnly?: boolean }) => + (!c.selfHostedOnly || selfHosted) && (!c.desktopOnly || isDesktop); // Filter connectors based on search and deployment mode const filteredOAuth = OAUTH_CONNECTORS.filter( - (c) => - // Filter by search query - (c.title.toLowerCase().includes(searchQuery.toLowerCase()) || - c.description.toLowerCase().includes(searchQuery.toLowerCase())) && - // Filter self-hosted only connectors in cloud mode - (!("selfHostedOnly" in c) || !c.selfHostedOnly || selfHosted) + (c) => matchesSearch(c.title, c.description) && passesDeploymentFilter(c) ); const filteredCrawlers = CRAWLERS.filter( - (c) => - (c.title.toLowerCase().includes(searchQuery.toLowerCase()) || 
- c.description.toLowerCase().includes(searchQuery.toLowerCase())) && - (!("selfHostedOnly" in c) || !c.selfHostedOnly || selfHosted) + (c) => matchesSearch(c.title, c.description) && passesDeploymentFilter(c) ); const filteredOther = OTHER_CONNECTORS.filter( - (c) => - (c.title.toLowerCase().includes(searchQuery.toLowerCase()) || - c.description.toLowerCase().includes(searchQuery.toLowerCase())) && - (!("selfHostedOnly" in c) || !c.selfHostedOnly || selfHosted) + (c) => matchesSearch(c.title, c.description) && passesDeploymentFilter(c) ); // Filter Composio connectors @@ -290,6 +288,18 @@ export const AllConnectorsTab: FC = ({ moreIntegrationsOther.length > 0 || moreIntegrationsCrawlers.length > 0; + const hasAnyResults = hasDocumentFileConnectors || hasMoreIntegrations; + + if (!hasAnyResults && searchQuery) { + return ( +

+ +

No connectors found

+

Try a different search term

+
+ ); + } + return (
{/* Document/Files Connectors */} diff --git a/surfsense_web/components/assistant-ui/connector-popup/views/connector-accounts-list-view.tsx b/surfsense_web/components/assistant-ui/connector-popup/views/connector-accounts-list-view.tsx index 5dfc252c2..b4c049c5c 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/views/connector-accounts-list-view.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/views/connector-accounts-list-view.tsx @@ -173,9 +173,7 @@ export const ConnectorAccountsListView: FC = ({ )}
- - {isConnecting ? "Connecting" : buttonText} - + {buttonText}
diff --git a/surfsense_web/components/assistant-ui/connector-popup/views/youtube-crawler-view.tsx b/surfsense_web/components/assistant-ui/connector-popup/views/youtube-crawler-view.tsx index 7ec85f4d3..8982b16a8 100644 --- a/surfsense_web/components/assistant-ui/connector-popup/views/youtube-crawler-view.tsx +++ b/surfsense_web/components/assistant-ui/connector-popup/views/youtube-crawler-view.tsx @@ -335,16 +335,10 @@ export const YouTubeCrawlerView: FC = ({ searchSpaceId, diff --git a/surfsense_web/components/assistant-ui/document-upload-popup.tsx b/surfsense_web/components/assistant-ui/document-upload-popup.tsx index c34890dff..0b38979a5 100644 --- a/surfsense_web/components/assistant-ui/document-upload-popup.tsx +++ b/surfsense_web/components/assistant-ui/document-upload-popup.tsx @@ -125,38 +125,33 @@ const DocumentUploadPopupContent: FC<{ onPointerDownOutside={(e) => e.preventDefault()} onInteractOutside={(e) => e.preventDefault()} onEscapeKeyDown={(e) => e.preventDefault()} - className="select-none max-w-4xl w-[95vw] sm:w-full h-[calc(100dvh-2rem)] sm:h-[85vh] flex flex-col p-0 gap-0 overflow-hidden border border-border ring-0 bg-muted dark:bg-muted text-foreground [&>button]:right-3 sm:[&>button]:right-12 [&>button]:top-3 sm:[&>button]:top-10 [&>button]:opacity-80 hover:[&>button]:opacity-100 [&>button]:z-[100] [&>button_svg]:size-4 sm:[&>button_svg]:size-5" + className="select-none max-w-2xl w-[95vw] sm:w-[640px] h-[min(440px,75dvh)] sm:h-[min(520px,80vh)] flex flex-col p-0 gap-0 overflow-hidden border border-border ring-0 bg-muted dark:bg-muted text-foreground [&>button]:right-3 sm:[&>button]:right-6 [&>button]:top-5 sm:[&>button]:top-8 [&>button]:opacity-80 [&>button]:hover:opacity-100 [&>button]:hover:bg-foreground/10 [&>button]:z-[100] [&>button>svg]:size-4 sm:[&>button>svg]:size-5" > Upload Document - {/* Scrollable container for mobile */}
- {/* Header - scrolls with content on mobile */} -
- {/* Upload header */} -
-
-

- Upload Documents -

-

- Upload and sync your documents to your search space -

-
+
+
+

Upload Documents

+

+ Upload and sync your documents to your search space +

- {/* Content */} -
+
{!isLoading && !hasDocumentSummaryLLM ? ( - + LLM Configuration Required

{isAutoMode && !hasGlobalConfigs - ? "Auto mode is selected but no global LLM configurations are available. Please configure a custom LLM in Settings to process and summarize your uploaded documents." - : "You need to configure a Document Summary LLM before uploading files. This LLM is used to process and summarize your uploaded documents."} + ? "Auto mode requires a global LLM configuration. Please add one in Settings" + : "A Document Summary LLM is required to process uploads, configure one in Settings"}

- - {/* Bottom fade shadow - hidden on very small screens */} -
); diff --git a/surfsense_web/components/assistant-ui/image.tsx b/surfsense_web/components/assistant-ui/image.tsx index 65059bcdc..59781abcf 100644 --- a/surfsense_web/components/assistant-ui/image.tsx +++ b/surfsense_web/components/assistant-ui/image.tsx @@ -3,6 +3,7 @@ import type { ImageMessagePartComponent } from "@assistant-ui/react"; import { cva, type VariantProps } from "class-variance-authority"; import { ImageIcon, ImageOffIcon } from "lucide-react"; +import NextImage from "next/image"; import { memo, type PropsWithChildren, useEffect, useRef, useState } from "react"; import { createPortal } from "react-dom"; import { cn } from "@/lib/utils"; @@ -86,8 +87,8 @@ function ImagePreview({ >
- ) : ( - // biome-ignore lint/performance/noImgElement: intentional for dynamic external URLs + ) : isDataOrBlobUrl(src) ? ( + // biome-ignore lint/performance/noImgElement: data/blob URLs need plain img + ) : ( + // biome-ignore lint/performance/noImgElement: intentional for dynamic external URLs + // {alt} { + // if (typeof src === "string") setLoadedSrc(src); + // onLoad?.(e); + // }} + // onError={(e) => { + // if (typeof src === "string") setErrorSrc(src); + // onError?.(e); + // }} + // {...props} + // /> + { + if (typeof src === "string") setLoadedSrc(src); + onLoad?.(); + }} + onError={() => { + if (typeof src === "string") setErrorSrc(src); + onError?.(); + }} + unoptimized={false} + {...props} + /> )}
); @@ -126,7 +161,10 @@ type ImageZoomProps = PropsWithChildren<{ src: string; alt?: string; }>; - +function isDataOrBlobUrl(src: string | undefined): boolean { + if (!src || typeof src !== "string") return false; + return src.startsWith("data:") || src.startsWith("blob:"); +} function ImageZoom({ src, alt = "Image preview", children }: ImageZoomProps) { const [isMounted, setIsMounted] = useState(false); const [isOpen, setIsOpen] = useState(false); @@ -177,22 +215,39 @@ function ImageZoom({ src, alt = "Image preview", children }: ImageZoomProps) { aria-label="Close zoomed image" > {/** biome-ignore lint/performance/noImgElement: */} - {alt} { - e.stopPropagation(); - handleClose(); - }} - onKeyDown={(e) => { - if (e.key === "Enter") { + {isDataOrBlobUrl(src) ? ( + // biome-ignore lint/performance/noImgElement: data/blob URLs need plain img + {alt} { e.stopPropagation(); handleClose(); - } - }} - /> + }} + onKeyDown={(e) => { + if (e.key === "Enter") { + e.stopPropagation(); + handleClose(); + } + }} + /> + ) : ( + { + e.stopPropagation(); + handleClose(); + }} + unoptimized={false} + /> + )} , document.body )} diff --git a/surfsense_web/components/assistant-ui/inline-citation.tsx b/surfsense_web/components/assistant-ui/inline-citation.tsx index 15ad11d94..0c99090e6 100644 --- a/surfsense_web/components/assistant-ui/inline-citation.tsx +++ b/surfsense_web/components/assistant-ui/inline-citation.tsx @@ -32,7 +32,7 @@ export const InlineCitation: FC = ({ chunkId, isDocsChunk =
{group.tools.map((tool) => { - const isDisabled = disabledTools.includes(tool.name); + const isDisabled = disabledToolsSet.has(tool.name); const ToolIcon = getToolIcon(tool.name); return (
= ({ isBlockedByOtherUser = false const iconKey = group.connectorIcon ?? ""; const iconInfo = CONNECTOR_TOOL_ICON_PATHS[iconKey]; const toolNames = group.tools.map((t) => t.name); - const allDisabled = toolNames.every((n) => disabledTools.includes(n)); + const allDisabled = toolNames.every((n) => disabledToolsSet.has(n)); return (
= ({ isBlockedByOtherUser = false >
Manage Tools
= ({ isBlockedByOtherUser = false {group.label}
{group.tools.map((tool) => { - const isDisabled = disabledTools.includes(tool.name); + const isDisabled = disabledToolsSet.has(tool.name); const ToolIcon = getToolIcon(tool.name); const row = (
@@ -1115,7 +1093,7 @@ const ComposerAction: FC = ({ isBlockedByOtherUser = false const iconKey = group.connectorIcon ?? ""; const iconInfo = CONNECTOR_TOOL_ICON_PATHS[iconKey]; const toolNames = group.tools.map((t) => t.name); - const allDisabled = toolNames.every((n) => disabledTools.includes(n)); + const allDisabled = toolNames.every((n) => disabledToolsSet.has(n)); const groupDef = TOOL_GROUPS.find((g) => g.label === group.label); const row = (
@@ -1146,7 +1124,11 @@ const ComposerAction: FC = ({ isBlockedByOtherUser = false {row} {groupDef?.tooltip ?? - group.tools.map((t) => t.description).join(" · ")} + group.tools.flatMap((t, i) => + i === 0 + ? [t.description] + : [, t.description] + )} ); diff --git a/surfsense_web/components/assistant-ui/tool-fallback.tsx b/surfsense_web/components/assistant-ui/tool-fallback.tsx index 40118d2e4..b658dba6d 100644 --- a/surfsense_web/components/assistant-ui/tool-fallback.tsx +++ b/surfsense_web/components/assistant-ui/tool-fallback.tsx @@ -26,7 +26,8 @@ export const ToolFallback: ToolCallMessagePartComponent = ({ ); const serializedResult = useMemo( - () => (result !== undefined && typeof result !== "string" ? JSON.stringify(result, null, 2) : null), + () => + result !== undefined && typeof result !== "string" ? JSON.stringify(result, null, 2) : null, [result] ); diff --git a/surfsense_web/components/chat-comments/comment-composer/comment-composer.tsx b/surfsense_web/components/chat-comments/comment-composer/comment-composer.tsx index 3e9b4504f..1c4383388 100644 --- a/surfsense_web/components/chat-comments/comment-composer/comment-composer.tsx +++ b/surfsense_web/components/chat-comments/comment-composer/comment-composer.tsx @@ -1,6 +1,6 @@ "use client"; -import { ArrowUp, Send, X } from "lucide-react"; +import { ArrowUp } from "lucide-react"; import { useCallback, useEffect, useRef, useState } from "react"; import { Button } from "@/components/ui/button"; import { Popover, PopoverAnchor, PopoverContent } from "@/components/ui/popover"; @@ -15,13 +15,14 @@ function convertDisplayToData(displayContent: string, mentions: InsertedMention[ const sortedMentions = [...mentions].sort((a, b) => b.displayName.length - a.displayName.length); - for (const mention of sortedMentions) { - const displayPattern = new RegExp( - `@${escapeRegExp(mention.displayName)}(?=\\s|$|[.,!?;:])`, - "g" - ); - const dataFormat = `@[${mention.id}]`; - result = result.replace(displayPattern, 
dataFormat); + const mentionPatterns = sortedMentions.map((mention) => ({ + pattern: new RegExp(`@${escapeRegExp(mention.displayName)}(?=\\s|$|[.,!?;:])`, "g"), + dataFormat: `@[${mention.id}]`, + })); + + for (const { pattern, dataFormat } of mentionPatterns) { + pattern.lastIndex = 0; // reset global regex state + result = result.replace(pattern, dataFormat); } return result; @@ -306,7 +307,6 @@ export function CommentComposer({ onClick={onCancel} disabled={isSubmitting} > - Cancel )} @@ -317,14 +317,7 @@ export function CommentComposer({ disabled={!canSubmit} className={cn(!canSubmit && "opacity-50", compact && "size-8 shrink-0 rounded-full")} > - {compact ? ( - - ) : ( - <> - - {submitLabel} - - )} + {compact ? : submitLabel}
diff --git a/surfsense_web/components/chat-comments/comment-item/comment-actions.tsx b/surfsense_web/components/chat-comments/comment-item/comment-actions.tsx index 5c0e27779..9638ac01c 100644 --- a/surfsense_web/components/chat-comments/comment-item/comment-actions.tsx +++ b/surfsense_web/components/chat-comments/comment-item/comment-actions.tsx @@ -1,6 +1,6 @@ "use client"; -import { MoreHorizontal, Pencil, Trash2 } from "lucide-react"; +import { MoreHorizontal, PenLine, Trash2 } from "lucide-react"; import { Button } from "@/components/ui/button"; import { DropdownMenu, @@ -21,15 +21,15 @@ export function CommentActions({ canEdit, canDelete, onEdit, onDelete }: Comment {canEdit && ( - + Edit )} diff --git a/surfsense_web/components/chat-comments/comment-item/comment-item.tsx b/surfsense_web/components/chat-comments/comment-item/comment-item.tsx index 4996fe01b..eb374ba49 100644 --- a/surfsense_web/components/chat-comments/comment-item/comment-item.tsx +++ b/surfsense_web/components/chat-comments/comment-item/comment-item.tsx @@ -198,7 +198,7 @@ export function CommentItem({ display helpers +// --------------------------------------------------------------------------- + +export function keyEventToAccelerator(e: React.KeyboardEvent): string | null { + const parts: string[] = []; + if (e.ctrlKey || e.metaKey) parts.push("CommandOrControl"); + if (e.altKey) parts.push("Alt"); + if (e.shiftKey) parts.push("Shift"); + + const key = e.key; + if (["Control", "Meta", "Alt", "Shift"].includes(key)) return null; + + if (key === " ") parts.push("Space"); + else if (key.length === 1) parts.push(key.toUpperCase()); + else parts.push(key); + + if (parts.length < 2) return null; + return parts.join("+"); +} + +export function acceleratorToDisplay(accel: string): string[] { + if (!accel) return []; + return accel.split("+").map((part) => { + if (part === "CommandOrControl") return "Ctrl"; + if (part === "Space") return "Space"; + return part; + }); +} + +export const 
DEFAULT_SHORTCUTS = { + generalAssist: "CommandOrControl+Shift+S", + quickAsk: "CommandOrControl+Alt+S", + autocomplete: "CommandOrControl+Shift+Space", +}; + +// --------------------------------------------------------------------------- +// Kbd pill component +// --------------------------------------------------------------------------- + +export function Kbd({ keys, className }: { keys: string[]; className?: string }) { + return ( + + {keys.map((key, i) => ( + 3 && "px-1.5" + )} + > + {key} + + ))} + + ); +} + +// --------------------------------------------------------------------------- +// Shortcut recorder component +// --------------------------------------------------------------------------- + +export function ShortcutRecorder({ + value, + onChange, + onReset, + defaultValue, + label, + description, + icon: Icon, +}: { + value: string; + onChange: (accelerator: string) => void; + onReset: () => void; + defaultValue: string; + label: string; + description: string; + icon: React.ElementType; +}) { + const [recording, setRecording] = useState(false); + const inputRef = useRef(null); + + const handleKeyDown = useCallback( + (e: React.KeyboardEvent) => { + if (!recording) return; + e.preventDefault(); + e.stopPropagation(); + + if (e.key === "Escape") { + setRecording(false); + return; + } + + const accel = keyEventToAccelerator(e); + if (accel) { + onChange(accel); + setRecording(false); + } + }, + [recording, onChange] + ); + + const displayKeys = acceleratorToDisplay(value); + const isDefault = value === defaultValue; + + return ( +
+ {/* Icon */} +
+ +
+ + {/* Label + description */} +
+

{label}

+

{description}

+
+ + {/* Actions */} +
+ {!isDefault && ( + + )} + +
+
+ ); +} diff --git a/surfsense_web/components/documents/DocumentNode.tsx b/surfsense_web/components/documents/DocumentNode.tsx index dd7731f22..d8e37df1c 100644 --- a/surfsense_web/components/documents/DocumentNode.tsx +++ b/surfsense_web/components/documents/DocumentNode.tsx @@ -5,6 +5,7 @@ import { Clock, Download, Eye, + History, MoreHorizontal, Move, PenLine, @@ -39,6 +40,7 @@ import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip import type { DocumentTypeEnum } from "@/contracts/types/document.types"; import { cn } from "@/lib/utils"; import { DND_TYPES } from "./FolderNode"; +import { isVersionableType } from "./version-history"; const EDITABLE_DOCUMENT_TYPES = new Set(["FILE", "NOTE"]); @@ -60,6 +62,7 @@ interface DocumentNodeProps { onDelete: (doc: DocumentNodeDoc) => void; onMove: (doc: DocumentNodeDoc) => void; onExport?: (doc: DocumentNodeDoc, format: string) => void; + onVersionHistory?: (doc: DocumentNodeDoc) => void; contextMenuOpen?: boolean; onContextMenuOpenChange?: (open: boolean) => void; } @@ -74,6 +77,7 @@ export const DocumentNode = React.memo(function DocumentNode({ onDelete, onMove, onExport, + onVersionHistory, contextMenuOpen, onContextMenuOpenChange, }: DocumentNodeProps) { @@ -102,7 +106,9 @@ export const DocumentNode = React.memo(function DocumentNode({ const isProcessing = statusState === "pending" || statusState === "processing"; const [dropdownOpen, setDropdownOpen] = useState(false); const [exporting, setExporting] = useState(null); + const [titleTooltipOpen, setTitleTooltipOpen] = useState(false); const rowRef = useRef(null); + const titleRef = useRef(null); const handleExport = useCallback( (format: string) => { @@ -114,6 +120,14 @@ export const DocumentNode = React.memo(function DocumentNode({ [doc, onExport] ); + const handleTitleTooltipOpenChange = useCallback((open: boolean) => { + if (open && titleRef.current) { + setTitleTooltipOpen(titleRef.current.scrollWidth > titleRef.current.clientWidth); + } 
else { + setTitleTooltipOpen(false); + } + }, []); + const attachRef = useCallback( (node: HTMLDivElement | null) => { (rowRef as React.MutableRefObject).current = node; @@ -193,14 +207,32 @@ export const DocumentNode = React.memo(function DocumentNode({ ); })()} - {doc.title} + + + + {doc.title} + + + + {doc.title} + + - - {getDocumentTypeIcon( - doc.document_type as DocumentTypeEnum, - "h-3.5 w-3.5 text-muted-foreground" - )} - + {getDocumentTypeIcon( + doc.document_type as DocumentTypeEnum, + "h-3.5 w-3.5 text-muted-foreground" + ) && ( + + {getDocumentTypeIcon( + doc.document_type as DocumentTypeEnum, + "h-3.5 w-3.5 text-muted-foreground" + )} + + )} @@ -219,7 +251,7 @@ export const DocumentNode = React.memo(function DocumentNode({ e.stopPropagation()}> - onPreview(doc)}> + onPreview(doc)} disabled={isProcessing}> Open @@ -235,7 +267,7 @@ export const DocumentNode = React.memo(function DocumentNode({ {onExport && ( - + Export @@ -244,11 +276,13 @@ export const DocumentNode = React.memo(function DocumentNode({ )} - onDelete(doc)} - > + {onVersionHistory && isVersionableType(doc.document_type) && ( + onVersionHistory(doc)}> + + Versions + + )} + onDelete(doc)}> Delete @@ -259,7 +293,7 @@ export const DocumentNode = React.memo(function DocumentNode({ {contextMenuOpen && ( e.stopPropagation()}> - onPreview(doc)}> + onPreview(doc)} disabled={isProcessing}> Open @@ -275,7 +309,7 @@ export const DocumentNode = React.memo(function DocumentNode({ {onExport && ( - + Export @@ -284,11 +318,13 @@ export const DocumentNode = React.memo(function DocumentNode({ )} - onDelete(doc)} - > + {onVersionHistory && isVersionableType(doc.document_type) && ( + onVersionHistory(doc)}> + + Versions + + )} + onDelete(doc)}> Delete diff --git a/surfsense_web/components/documents/FolderNode.tsx b/surfsense_web/components/documents/FolderNode.tsx index 6a36f724f..2ec430871 100644 --- a/surfsense_web/components/documents/FolderNode.tsx +++ b/surfsense_web/components/documents/FolderNode.tsx 
@@ -1,14 +1,18 @@ "use client"; import { + AlertCircle, ChevronDown, ChevronRight, + Eye, + EyeOff, Folder, FolderOpen, FolderPlus, MoreHorizontal, Move, PenLine, + RefreshCw, Trash2, } from "lucide-react"; import React, { useCallback, useEffect, useRef, useState } from "react"; @@ -27,6 +31,8 @@ import { DropdownMenuItem, DropdownMenuTrigger, } from "@/components/ui/dropdown-menu"; +import { Spinner } from "@/components/ui/spinner"; +import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip"; import { cn } from "@/lib/utils"; import type { FolderSelectionState } from "./FolderTreeView"; @@ -50,8 +56,8 @@ interface FolderNodeProps { depth: number; isExpanded: boolean; isRenaming: boolean; - childCount: number; selectionState: FolderSelectionState; + processingState: "idle" | "processing" | "failed"; onToggleSelect: (folderId: number, selectAll: boolean) => void; onToggleExpand: (folderId: number) => void; onRename: (folder: FolderDisplay, newName: string) => void; @@ -70,6 +76,9 @@ interface FolderNodeProps { disabledDropIds?: Set; contextMenuOpen?: boolean; onContextMenuOpenChange?: (open: boolean) => void; + isWatched?: boolean; + onRescan?: (folder: FolderDisplay) => void; + onStopWatching?: (folder: FolderDisplay) => void; } function getDropZone( @@ -91,8 +100,8 @@ export const FolderNode = React.memo(function FolderNode({ depth, isExpanded, isRenaming, - childCount, selectionState, + processingState, onToggleSelect, onToggleExpand, onRename, @@ -107,6 +116,9 @@ export const FolderNode = React.memo(function FolderNode({ disabledDropIds, contextMenuOpen, onContextMenuOpenChange, + isWatched, + onRescan, + onStopWatching, }: FolderNodeProps) { const [renameValue, setRenameValue] = useState(folder.name); const inputRef = useRef(null); @@ -242,7 +254,9 @@ export const FolderNode = React.memo(function FolderNode({ isOver && !canDrop && "cursor-not-allowed" )} style={{ paddingLeft: `${depth * 16 + 4}px` }} - onClick={() => 
onToggleExpand(folder.id)} + onClick={() => { + onToggleExpand(folder.id); + }} onKeyDown={(e) => { if (e.key === "Enter" || e.key === " ") { e.preventDefault(); @@ -262,14 +276,45 @@ export const FolderNode = React.memo(function FolderNode({ )} - e.stopPropagation()} - className="h-3.5 w-3.5 shrink-0" - /> + {processingState !== "idle" && selectionState === "none" ? ( + <> + + + + {processingState === "processing" ? ( + + ) : ( + + )} + + + + {processingState === "processing" + ? "Syncing folder contents" + : "Some files failed to process"} + + + e.stopPropagation()} + className="h-3.5 w-3.5 shrink-0 hidden group-hover:flex" + /> + + ) : ( + e.stopPropagation()} + className="h-3.5 w-3.5 shrink-0" + /> + )} @@ -289,12 +334,6 @@ export const FolderNode = React.memo(function FolderNode({ {folder.name} )} - {!isRenaming && childCount > 0 && ( - - {childCount} - - )} - {!isRenaming && ( @@ -308,6 +347,28 @@ export const FolderNode = React.memo(function FolderNode({ + {isWatched && onRescan && ( + { + e.stopPropagation(); + onRescan(folder); + }} + > + + Re-scan + + )} + {isWatched && onStopWatching && ( + { + e.stopPropagation(); + onStopWatching(folder); + }} + > + + Stop watching + + )} { e.stopPropagation(); @@ -353,6 +414,18 @@ export const FolderNode = React.memo(function FolderNode({ {!isRenaming && contextMenuOpen && ( + {isWatched && onRescan && ( + onRescan(folder)}> + + Re-scan + + )} + {isWatched && onStopWatching && ( + onStopWatching(folder)}> + + Stop watching + + )} onCreateSubfolder(folder.id)}> New subfolder diff --git a/surfsense_web/components/documents/FolderTreeView.tsx b/surfsense_web/components/documents/FolderTreeView.tsx index f63d5da5c..47cd17596 100644 --- a/surfsense_web/components/documents/FolderTreeView.tsx +++ b/surfsense_web/components/documents/FolderTreeView.tsx @@ -1,7 +1,7 @@ "use client"; import { useAtom } from "jotai"; -import { CirclePlus } from "lucide-react"; +import { Search } from "lucide-react"; import { useCallback, 
useMemo, useState } from "react"; import { DndProvider } from "react-dnd"; import { HTML5Backend } from "react-dnd-html5-backend"; @@ -32,6 +32,7 @@ interface FolderTreeViewProps { onDeleteDocument: (doc: DocumentNodeDoc) => void; onMoveDocument: (doc: DocumentNodeDoc) => void; onExportDocument?: (doc: DocumentNodeDoc, format: string) => void; + onVersionHistory?: (doc: DocumentNodeDoc) => void; activeTypes: DocumentTypeEnum[]; searchQuery?: string; onDropIntoFolder?: ( @@ -40,6 +41,9 @@ interface FolderTreeViewProps { targetFolderId: number | null ) => void; onReorderFolder?: (folderId: number, beforePos: string | null, afterPos: string | null) => void; + watchedFolderIds?: Set; + onRescanFolder?: (folder: FolderDisplay) => void; + onStopWatchingFolder?: (folder: FolderDisplay) => void; } function groupBy(items: T[], keyFn: (item: T) => string | number): Record { @@ -69,25 +73,19 @@ export function FolderTreeView({ onDeleteDocument, onMoveDocument, onExportDocument, + onVersionHistory, activeTypes, searchQuery, onDropIntoFolder, onReorderFolder, + watchedFolderIds, + onRescanFolder, + onStopWatchingFolder, }: FolderTreeViewProps) { const foldersByParent = useMemo(() => groupBy(folders, (f) => f.parentId ?? "root"), [folders]); const docsByFolder = useMemo(() => groupBy(documents, (d) => d.folderId ?? "root"), [documents]); - const folderChildCounts = useMemo(() => { - const counts: Record = {}; - for (const f of folders) { - const children = foldersByParent[f.id] ?? []; - const docs = docsByFolder[f.id] ?? 
[]; - counts[f.id] = children.length + docs.length; - } - return counts; - }, [folders, foldersByParent, docsByFolder]); - const [openContextMenuId, setOpenContextMenuId] = useState(null); // Single subscription for rename state — derived boolean passed to each FolderNode @@ -98,14 +96,26 @@ export function FolderTreeView({ ); const handleCancelRename = useCallback(() => setRenamingFolderId(null), [setRenamingFolderId]); + const effectiveActiveTypes = useMemo(() => { + if ( + activeTypes.includes("FILE" as DocumentTypeEnum) && + !activeTypes.includes("LOCAL_FOLDER_FILE" as DocumentTypeEnum) + ) { + return [...activeTypes, "LOCAL_FOLDER_FILE" as DocumentTypeEnum]; + } + return activeTypes; + }, [activeTypes]); + const hasDescendantMatch = useMemo(() => { - if (activeTypes.length === 0 && !searchQuery) return null; + if (effectiveActiveTypes.length === 0 && !searchQuery) return null; const match: Record = {}; function check(folderId: number): boolean { if (match[folderId] !== undefined) return match[folderId]; const childDocs = (docsByFolder[folderId] ?? []).some( - (d) => activeTypes.length === 0 || activeTypes.includes(d.document_type as DocumentTypeEnum) + (d) => + effectiveActiveTypes.length === 0 || + effectiveActiveTypes.includes(d.document_type as DocumentTypeEnum) ); if (childDocs) { match[folderId] = true; @@ -126,7 +136,7 @@ export function FolderTreeView({ check(f.id); } return match; - }, [folders, docsByFolder, foldersByParent, activeTypes, searchQuery]); + }, [folders, docsByFolder, foldersByParent, effectiveActiveTypes, searchQuery]); const folderSelectionStates = useMemo(() => { const states: Record = {}; @@ -158,6 +168,35 @@ export function FolderTreeView({ return states; }, [folders, docsByFolder, foldersByParent, mentionedDocIds]); + const folderProcessingStates = useMemo(() => { + const states: Record = {}; + + function compute(folderId: number): { hasProcessing: boolean; hasFailed: boolean } { + const directDocs = docsByFolder[folderId] ?? 
[]; + let hasProcessing = directDocs.some( + (d) => d.status?.state === "pending" || d.status?.state === "processing" + ); + let hasFailed = directDocs.some((d) => d.status?.state === "failed"); + + for (const child of foldersByParent[folderId] ?? []) { + const sub = compute(child.id); + hasProcessing = hasProcessing || sub.hasProcessing; + hasFailed = hasFailed || sub.hasFailed; + } + + if (hasProcessing) states[folderId] = "processing"; + else if (hasFailed) states[folderId] = "failed"; + else states[folderId] = "idle"; + + return { hasProcessing, hasFailed }; + } + + for (const f of folders) { + if (states[f.id] === undefined) compute(f.id); + } + return states; + }, [folders, docsByFolder, foldersByParent]); + function renderLevel(parentId: number | null, depth: number): React.ReactNode[] { const key = parentId ?? "root"; const childFolders = (foldersByParent[key] ?? []) @@ -167,7 +206,9 @@ export function FolderTreeView({ ? childFolders.filter((f) => hasDescendantMatch[f.id]) : childFolders; const childDocs = (docsByFolder[key] ?? []).filter( - (d) => activeTypes.length === 0 || activeTypes.includes(d.document_type as DocumentTypeEnum) + (d) => + effectiveActiveTypes.length === 0 || + effectiveActiveTypes.includes(d.document_type as DocumentTypeEnum) ); const nodes: React.ReactNode[] = []; @@ -189,8 +230,8 @@ export function FolderTreeView({ depth={depth} isExpanded={isExpanded} isRenaming={renamingFolderId === f.id} - childCount={folderChildCounts[f.id] ?? 0} selectionState={folderSelectionStates[f.id] ?? "none"} + processingState={folderProcessingStates[f.id] ?? "idle"} onToggleSelect={onToggleFolderSelect} onToggleExpand={onToggleExpand} onRename={onRenameFolder} @@ -204,6 +245,9 @@ export function FolderTreeView({ siblingPositions={siblingPositions} contextMenuOpen={openContextMenuId === `folder-${f.id}`} onContextMenuOpenChange={(open) => setOpenContextMenuId(open ? 
`folder-${f.id}` : null)} + isWatched={watchedFolderIds?.has(f.id)} + onRescan={onRescanFolder} + onStopWatching={onStopWatchingFolder} /> ); @@ -225,6 +269,7 @@ export function FolderTreeView({ onDelete={onDeleteDocument} onMove={onMoveDocument} onExport={onExportDocument} + onVersionHistory={onVersionHistory} contextMenuOpen={openContextMenuId === `doc-${d.id}`} onContextMenuOpenChange={(open) => setOpenContextMenuId(open ? `doc-${d.id}` : null)} /> @@ -247,11 +292,12 @@ export function FolderTreeView({ ); } - if (treeNodes.length === 0 && (activeTypes.length > 0 || searchQuery)) { + if (treeNodes.length === 0 && (effectiveActiveTypes.length > 0 || searchQuery)) { return (
- -

No matching documents

+ +

No matching documents

+

Try a different search term

); } diff --git a/surfsense_web/components/documents/version-history.tsx b/surfsense_web/components/documents/version-history.tsx new file mode 100644 index 000000000..27343dc6a --- /dev/null +++ b/surfsense_web/components/documents/version-history.tsx @@ -0,0 +1,258 @@ +"use client"; + +import { Check, ChevronRight, Clock, Copy, RotateCcw } from "lucide-react"; +import { useCallback, useEffect, useState } from "react"; +import { toast } from "sonner"; +import { Button } from "@/components/ui/button"; +import { Dialog, DialogContent, DialogTitle, DialogTrigger } from "@/components/ui/dialog"; +import { Separator } from "@/components/ui/separator"; +import { Spinner } from "@/components/ui/spinner"; +import { documentsApiService } from "@/lib/apis/documents-api.service"; +import { cn } from "@/lib/utils"; + +interface DocumentVersionSummary { + version_number: number; + title: string; + content_hash: string; + created_at: string | null; +} + +interface VersionHistoryProps { + documentId: number; + documentType: string; +} + +const VERSION_DOCUMENT_TYPES = new Set(["LOCAL_FOLDER_FILE", "OBSIDIAN_CONNECTOR"]); + +export function isVersionableType(documentType: string) { + return VERSION_DOCUMENT_TYPES.has(documentType); +} + +const DIALOG_CLASSES = + "select-none max-w-[900px] w-[95vw] md:w-[90vw] h-[90vh] md:h-[80vh] max-h-[640px] flex flex-col md:flex-row p-0 gap-0 overflow-hidden [--card:var(--background)] dark:[--card:oklch(0.205_0_0)] dark:[--background:oklch(0.205_0_0)]"; + +export function VersionHistoryButton({ documentId, documentType }: VersionHistoryProps) { + if (!isVersionableType(documentType)) return null; + + return ( + + + + + + Version History + + + + ); +} + +export function VersionHistoryDialog({ + open, + onOpenChange, + documentId, +}: { + open: boolean; + onOpenChange: (open: boolean) => void; + documentId: number; +}) { + return ( + + + Version History + {open && } + + + ); +} + +function formatRelativeTime(dateStr: string): string { + const 
now = Date.now(); + const then = new Date(dateStr).getTime(); + const diffMs = now - then; + const diffMin = Math.floor(diffMs / 60_000); + if (diffMin < 1) return "Just now"; + if (diffMin < 60) return `${diffMin} minute${diffMin !== 1 ? "s" : ""} ago`; + const diffHr = Math.floor(diffMin / 60); + if (diffHr < 24) return `${diffHr} hour${diffHr !== 1 ? "s" : ""} ago`; + return new Date(dateStr).toLocaleDateString(undefined, { + weekday: "short", + month: "short", + day: "numeric", + year: "numeric", + hour: "numeric", + minute: "2-digit", + }); +} + +function VersionHistoryPanel({ documentId }: { documentId: number }) { + const [versions, setVersions] = useState([]); + const [loading, setLoading] = useState(true); + const [selectedVersion, setSelectedVersion] = useState(null); + const [versionContent, setVersionContent] = useState(""); + const [contentLoading, setContentLoading] = useState(false); + const [restoring, setRestoring] = useState(false); + const [copied, setCopied] = useState(false); + + const loadVersions = useCallback(async () => { + setLoading(true); + try { + const data = await documentsApiService.listDocumentVersions(documentId); + setVersions(data as DocumentVersionSummary[]); + } catch { + toast.error("Failed to load version history"); + } finally { + setLoading(false); + } + }, [documentId]); + + useEffect(() => { + loadVersions(); + }, [loadVersions]); + + const handleSelectVersion = async (versionNumber: number) => { + if (selectedVersion === versionNumber) return; + setSelectedVersion(versionNumber); + setContentLoading(true); + try { + const data = (await documentsApiService.getDocumentVersion(documentId, versionNumber)) as { + source_markdown: string; + }; + setVersionContent(data.source_markdown || ""); + } catch { + toast.error("Failed to load version content"); + } finally { + setContentLoading(false); + } + }; + + const handleRestore = async (versionNumber: number) => { + setRestoring(true); + try { + await 
documentsApiService.restoreDocumentVersion(documentId, versionNumber); + toast.success(`Restored version ${versionNumber}`); + await loadVersions(); + } catch { + toast.error("Failed to restore version"); + } finally { + setRestoring(false); + } + }; + + const handleCopy = () => { + navigator.clipboard.writeText(versionContent); + setCopied(true); + setTimeout(() => setCopied(false), 2000); + }; + + if (loading) { + return ( +
+ +
+ ); + } + + if (versions.length === 0) { + return ( +
+

No version history available yet

+

Versions are created when file content changes

+
+ ); + } + + const selectedVersionData = versions.find((v) => v.version_number === selectedVersion); + + return ( + <> + {/* Left panel — version list */} + + + {/* Right panel — content preview */} +
+ {selectedVersion !== null && selectedVersionData ? ( + <> +
+

+ {selectedVersionData.title || `Version ${selectedVersion}`} +

+
+ + +
+
+ +
+ {contentLoading ? ( +
+ +
+ ) : ( +
+									{versionContent || "(empty)"}
+								
+ )} +
+ + ) : ( +
+

Select a version to preview

+
+ )} +
+ + ); +} diff --git a/surfsense_web/components/editor-panel/editor-panel.tsx b/surfsense_web/components/editor-panel/editor-panel.tsx index 3ea36f800..7c94356d8 100644 --- a/surfsense_web/components/editor-panel/editor-panel.tsx +++ b/surfsense_web/components/editor-panel/editor-panel.tsx @@ -1,28 +1,34 @@ "use client"; import { useAtomValue, useSetAtom } from "jotai"; -import { AlertCircle, XIcon } from "lucide-react"; +import { Download, FileQuestionMark, FileText, Loader2, RefreshCw, XIcon } from "lucide-react"; import dynamic from "next/dynamic"; import { useCallback, useEffect, useRef, useState } from "react"; import { toast } from "sonner"; import { closeEditorPanelAtom, editorPanelAtom } from "@/atoms/editor/editor-panel.atom"; +import { VersionHistoryButton } from "@/components/documents/version-history"; import { MarkdownViewer } from "@/components/markdown-viewer"; +import { Alert, AlertDescription } from "@/components/ui/alert"; import { Button } from "@/components/ui/button"; import { Drawer, DrawerContent, DrawerHandle, DrawerTitle } from "@/components/ui/drawer"; -import { Skeleton } from "@/components/ui/skeleton"; import { useMediaQuery } from "@/hooks/use-media-query"; import { authenticatedFetch, getBearerToken, redirectToLogin } from "@/lib/auth-utils"; const PlateEditor = dynamic( () => import("@/components/editor/plate-editor").then((m) => ({ default: m.PlateEditor })), - { ssr: false, loading: () => } + { ssr: false, loading: () => } ); +const LARGE_DOCUMENT_THRESHOLD = 2 * 1024 * 1024; // 2MB + interface EditorContent { document_id: number; title: string; document_type?: string; source_markdown: string; + content_size_bytes?: number; + chunk_count?: number; + truncated?: boolean; } const EDITABLE_DOCUMENT_TYPES = new Set(["FILE", "NOTE"]); @@ -62,6 +68,7 @@ export function EditorPanelContent({ const [isLoading, setIsLoading] = useState(true); const [error, setError] = useState(null); const [saving, setSaving] = useState(false); + const 
[downloading, setDownloading] = useState(false); const [editedMarkdown, setEditedMarkdown] = useState(null); const markdownRef = useRef(""); @@ -69,8 +76,10 @@ export function EditorPanelContent({ const changeCountRef = useRef(0); const [displayTitle, setDisplayTitle] = useState(title || "Untitled"); + const isLargeDocument = (editorDoc?.content_size_bytes ?? 0) > LARGE_DOCUMENT_THRESHOLD; + useEffect(() => { - let cancelled = false; + const controller = new AbortController(); setIsLoading(true); setError(null); setEditorDoc(null); @@ -78,7 +87,7 @@ export function EditorPanelContent({ initialLoadDone.current = false; changeCountRef.current = 0; - const fetchContent = async () => { + const doFetch = async () => { const token = getBearerToken(); if (!token) { redirectToLogin(); @@ -86,12 +95,14 @@ export function EditorPanelContent({ } try { - const response = await authenticatedFetch( - `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/documents/${documentId}/editor-content`, - { method: "GET" } + const url = new URL( + `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/documents/${documentId}/editor-content` ); + url.searchParams.set("max_length", String(LARGE_DOCUMENT_THRESHOLD)); - if (cancelled) return; + const response = await authenticatedFetch(url.toString(), { method: "GET" }); + + if (controller.signal.aborted) return; if (!response.ok) { const errorData = await response @@ -115,18 +126,16 @@ export function EditorPanelContent({ setEditorDoc(data); initialLoadDone.current = true; } catch (err) { - if (cancelled) return; + if (controller.signal.aborted) return; console.error("Error fetching document:", err); setError(err instanceof Error ? 
err.message : "Failed to fetch document"); } finally { - if (!cancelled) setIsLoading(false); + if (!controller.signal.aborted) setIsLoading(false); } }; - fetchContent(); - return () => { - cancelled = true; - }; + doFetch().catch(() => {}); + return () => controller.abort(); }, [documentId, searchSpaceId, title]); const handleMarkdownChange = useCallback((md: string) => { @@ -175,7 +184,7 @@ export function EditorPanelContent({ }, [documentId, searchSpaceId]); const isEditableType = editorDoc - ? EDITABLE_DOCUMENT_TYPES.has(editorDoc.document_type ?? "") + ? EDITABLE_DOCUMENT_TYPES.has(editorDoc.document_type ?? "") && !isLargeDocument : false; return ( @@ -187,12 +196,17 @@ export function EditorPanelContent({

Unsaved changes

)}
- {onClose && ( - - )} +
+ {editorDoc?.document_type && ( + + )} + {onClose && ( + + )} +
@@ -200,12 +214,79 @@ export function EditorPanelContent({ ) : error || !editorDoc ? (
- -
-

Failed to load document

-

{error || "An unknown error occurred"}

+ {error?.toLowerCase().includes("still being processed") ? ( +
+ +
+ ) : ( +
+ +
+ )} +
+

+ {error?.toLowerCase().includes("still being processed") + ? "Document is processing" + : "Document unavailable"} +

+

+ {error || "An unknown error occurred"} +

+ ) : isLargeDocument ? ( +
+ + + + + This document is too large for the editor ( + {Math.round((editorDoc.content_size_bytes ?? 0) / 1024 / 1024)}MB,{" "} + {editorDoc.chunk_count ?? 0} chunks). Showing a preview below. + + + + + +
) : isEditableType ? ( ({ + onSave, + hasUnsavedChanges, + isSaving, + canToggleMode, + }), + [onSave, hasUnsavedChanges, isSaving, canToggleMode] + ); + return ( - + ( src="/homepage/comments-audio.webp" alt="Audio Comment Illustration" fill + sizes="(max-width: 768px) 100vw, (max-width: 1024px) 50vw, 33vw" className="object-cover" />
diff --git a/surfsense_web/components/homepage/hero-section.tsx b/surfsense_web/components/homepage/hero-section.tsx index 299cf1032..c7744ccac 100644 --- a/surfsense_web/components/homepage/hero-section.tsx +++ b/surfsense_web/components/homepage/hero-section.tsx @@ -1,39 +1,15 @@ "use client"; +import { Download, Monitor } from "lucide-react"; import { AnimatePresence, motion } from "motion/react"; -import dynamic from "next/dynamic"; import Link from "next/link"; -import type React from "react"; -import { useEffect, useRef, useState } from "react"; +import React, { memo, useCallback, useEffect, useRef, useState } from "react"; import Balancer from "react-wrap-balancer"; +import { ExpandedMediaOverlay, useExpandedMedia } from "@/components/ui/expanded-gif-overlay"; +import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip"; import { AUTH_TYPE, BACKEND_URL } from "@/lib/env-config"; import { trackLoginAttempt } from "@/lib/posthog/events"; import { cn } from "@/lib/utils"; -const HeroCarousel = dynamic( - () => import("@/components/ui/hero-carousel").then((m) => ({ default: m.HeroCarousel })), - { - ssr: false, - loading: () => ( -
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- ), - } -); - -// Official Google "G" logo with brand colors const GoogleLogo = ({ className }: { className?: string }) => ( ( ); -function useIsDesktop(breakpoint = 1024) { - const [isDesktop, setIsDesktop] = useState(false); - useEffect(() => { - const mql = window.matchMedia(`(min-width: ${breakpoint}px)`); - setIsDesktop(mql.matches); - const handler = (e: MediaQueryListEvent) => setIsDesktop(e.matches); - mql.addEventListener("change", handler); - return () => mql.removeEventListener("change", handler); - }, [breakpoint]); - return isDesktop; -} +const TAB_ITEMS = [ + { + title: "General Assist", + description: "Launch SurfSense instantly from any application.", + src: "/homepage/hero_tutorial/general_assist.mp4", + featured: true, + }, + { + title: "Quick Assist", + description: "Select text anywhere, then ask AI to explain, rewrite, or act on it.", + src: "/homepage/hero_tutorial/quick_assist.mp4", + featured: true, + }, + { + title: "Extreme Assist", + description: "Get inline writing suggestions powered by your knowledge base as you type in any app.", + src: "/homepage/hero_tutorial/extreme_assist.mp4", + featured: true, + }, + // { + // title: "Connect & Sync", + // description: + // "Connect data sources like Notion, Drive and Gmail. 
Automatically sync to keep them updated.", + // src: "/homepage/hero_tutorial/ConnectorFlowGif.mp4", + // featured: true, + // }, + // { + // title: "Upload Documents", + // description: "Upload documents directly, from images to massive PDFs.", + // src: "/homepage/hero_tutorial/DocUploadGif.mp4", + // featured: true, + // }, + { + title: "Video & Presentations", + description: "Create short videos and editable presentations with AI-generated visuals and narration from your sources.", + src: "/homepage/hero_tutorial/video_gen_surf.mp4", + featured: false, + }, + { + title: "Search & Citation", + description: "Ask questions and get cited responses from your knowledge base.", + src: "/homepage/hero_tutorial/BSNCGif.mp4", + featured: false, + }, + { + title: "Document Q&A", + description: "Mention specific documents in chat for targeted answers.", + src: "/homepage/hero_tutorial/BQnaGif_compressed.mp4", + featured: false, + }, + { + title: "Reports", + description: "Generate reports from your sources in many formats.", + src: "/homepage/hero_tutorial/ReportGenGif_compressed.mp4", + featured: false, + }, + { + title: "Podcasts", + description: "Turn anything into a podcast in under 20 seconds.", + src: "/homepage/hero_tutorial/PodcastGenGif.mp4", + featured: false, + }, + { + title: "Image Generation", + description: "Generate high-quality images easily from your conversations.", + src: "/homepage/hero_tutorial/ImageGenGif.mp4", + featured: false, + }, + { + title: "Collaborative Chat", + description: "Collaborate on AI-powered conversations in realtime with your team.", + src: "/homepage/hero_realtime/RealTimeChatGif.mp4", + featured: false, + }, + { + title: "Comments", + description: "Add comments and tag teammates on any message.", + src: "/homepage/hero_realtime/RealTimeCommentsFlow.mp4", + featured: false, + }, +] as const; export function HeroSection() { - const containerRef = useRef(null); - const parentRef = useRef(null); - const isDesktop = useIsDesktop(); - 
return ( -
- - {isDesktop && ( - <> - - - - - - )} +
+
+

+ NotebookLM for Teams +

+
+
+

+ An open source, privacy focused alternative to NotebookLM for teams with no data + limits. +

-

-
-
- NotebookLM for Teams +
+ + +
-

-

- Connect any LLM to your internal knowledge sources and chat with it in real time alongside - your team. -

-
- - {/* */} -
-
- +
); @@ -158,256 +164,196 @@ function GetStartedButton() { if (isGoogleAuth) { return ( - - {/* Animated gradient background on hover */} - - {/* Google logo with subtle animation */} - - - - Continue with Google - + + Continue with Google + ); } return ( - - - Get Started - - + + Get Started + ); } -const BackgroundGrids = () => { +function useUserOS() { + const [os, setOs] = useState<"macOS" | "Windows" | "Linux">("macOS"); + useEffect(() => { + const ua = navigator.userAgent; + if (/Windows/i.test(ua)) setOs("Windows"); + else if (/Linux/i.test(ua)) setOs("Linux"); + else setOs("macOS"); + }, []); + return os; +} + +function DownloadButton() { + const os = useUserOS(); return ( -
-
- - -
-
- - -
-
- - -
-
- - -
-
+ + + Download for {os} + ); -}; +} -const CollisionMechanism = ({ - parentRef, - beamOptions = {}, -}: { - parentRef: React.RefObject; - beamOptions?: { - initialX?: number; - translateX?: number; - initialY?: number; - translateY?: number; - rotate?: number; - className?: string; - duration?: number; - delay?: number; - repeatDelay?: number; - }; -}) => { - const beamRef = useRef(null); - const [collision, setCollision] = useState<{ - detected: boolean; - coordinates: { x: number; y: number } | null; - }>({ detected: false, coordinates: null }); - const [beamKey, setBeamKey] = useState(0); - const [cycleCollisionDetected, setCycleCollisionDetected] = useState(false); - - useEffect(() => { - const checkCollision = () => { - if (beamRef.current && parentRef.current && !cycleCollisionDetected) { - const beamRect = beamRef.current.getBoundingClientRect(); - const parentRect = parentRef.current.getBoundingClientRect(); - const rightEdge = parentRect.right; - - if (beamRect.right >= rightEdge - 20) { - const relativeX = parentRect.width - 20; - const relativeY = beamRect.top - parentRect.top + beamRect.height / 2; - - setCollision({ - detected: true, - coordinates: { x: relativeX, y: relativeY }, - }); - setCycleCollisionDetected(true); - if (beamRef.current) { - beamRef.current.style.opacity = "0"; - } - } - } - }; - - const animationInterval = setInterval(checkCollision, 100); - - return () => clearInterval(animationInterval); - }, [cycleCollisionDetected, parentRef]); - - useEffect(() => { - if (!collision.detected || !collision.coordinates) return; - - const timer1 = setTimeout(() => { - setCollision({ detected: false, coordinates: null }); - setCycleCollisionDetected(false); - if (beamRef.current) { - beamRef.current.style.opacity = "1"; - } - }, 2000); - - const timer2 = setTimeout(() => { - setBeamKey((prevKey) => prevKey + 1); - }, 2000); - - return () => { - clearTimeout(timer1); - clearTimeout(timer2); - }; - }, [collision]); +const BrowserWindow = () => { + 
const [selectedIndex, setSelectedIndex] = useState(0); + const selectedItem = TAB_ITEMS[selectedIndex]; + const { expanded, open, close } = useExpandedMedia(); return ( <> - + +
+
+
+
+
+
+
+ {TAB_ITEMS.map((item, index) => ( + + + {index !== TAB_ITEMS.length - 1 && ( +
+ )} + + ))} +
+
+
+ + +
+
+

+ {selectedItem.title} +

+

+ {selectedItem.description} +

+
+
+ +
+
+
+ + - {collision.detected && collision.coordinates && ( - + {expanded && ( + )} ); }; -const Explosion = ({ ...props }: React.HTMLProps) => { - const spans = Array.from({ length: 20 }, (_, index) => ({ - id: index, - initialX: 0, - initialY: 0, - directionX: Math.floor(Math.random() * 80 - 40), - directionY: Math.floor(Math.random() * -50 - 10), - })); +const TabVideo = memo(function TabVideo({ src }: { src: string }) { + const videoRef = useRef(null); + const [hasLoaded, setHasLoaded] = useState(false); + + useEffect(() => { + setHasLoaded(false); + const video = videoRef.current; + if (!video) return; + video.currentTime = 0; + video.play().catch(() => {}); + }, [src]); + + const handleCanPlay = useCallback(() => { + setHasLoaded(true); + }, []); return ( -
- - {spans.map((span) => ( - - ))} +
+

{title}

diff --git a/surfsense_web/components/layout/providers/LayoutDataProvider.tsx b/surfsense_web/components/layout/providers/LayoutDataProvider.tsx index fd6b45c52..7e9c33a1a 100644 --- a/surfsense_web/components/layout/providers/LayoutDataProvider.tsx +++ b/surfsense_web/components/layout/providers/LayoutDataProvider.tsx @@ -55,7 +55,7 @@ import { useInbox } from "@/hooks/use-inbox"; import { useIsMobile } from "@/hooks/use-mobile"; import { notificationsApiService } from "@/lib/apis/notifications-api.service"; import { searchSpacesApiService } from "@/lib/apis/search-spaces-api.service"; -import { logout } from "@/lib/auth-utils"; +import { getLoginPath, logout } from "@/lib/auth-utils"; import { deleteThread, fetchThreads, updateThread } from "@/lib/chat/thread-persistence"; import { resetUser, trackLogout } from "@/lib/posthog/events"; import { cacheKeys } from "@/lib/query-client/cache-keys"; @@ -347,35 +347,38 @@ export function LayoutDataProvider({ searchSpaceId, children }: LayoutDataProvid // Navigation items const navItems: NavItem[] = useMemo( - () => [ - { - title: "Inbox", - url: "#inbox", - icon: Inbox, - isActive: isInboxSidebarOpen, - badge: totalUnreadCount > 0 ? formatInboxCount(totalUnreadCount) : undefined, - }, - { - title: "Documents", - url: "#documents", - icon: SquareLibrary, - isActive: isMobile - ? isDocumentsSidebarOpen - : isDocumentsSidebarOpen && !isRightPanelCollapsed, - }, - { - title: "Announcements", - url: "#announcements", - icon: Megaphone, - isActive: isAnnouncementsSidebarOpen, - badge: announcementUnreadCount > 0 ? formatInboxCount(announcementUnreadCount) : undefined, - }, - ], + () => + ( + [ + { + title: "Inbox", + url: "#inbox", + icon: Inbox, + isActive: isInboxSidebarOpen, + badge: totalUnreadCount > 0 ? formatInboxCount(totalUnreadCount) : undefined, + }, + isMobile + ? 
{ + title: "Documents", + url: "#documents", + icon: SquareLibrary, + isActive: isDocumentsSidebarOpen, + } + : null, + { + title: "Announcements", + url: "#announcements", + icon: Megaphone, + isActive: isAnnouncementsSidebarOpen, + badge: + announcementUnreadCount > 0 ? formatInboxCount(announcementUnreadCount) : undefined, + }, + ] as (NavItem | null)[] + ).filter((item): item is NavItem => item !== null), [ isMobile, isInboxSidebarOpen, isDocumentsSidebarOpen, - isRightPanelCollapsed, totalUnreadCount, isAnnouncementsSidebarOpen, announcementUnreadCount, @@ -600,12 +603,12 @@ export function LayoutDataProvider({ searchSpaceId, children }: LayoutDataProvid await logout(); if (typeof window !== "undefined") { - router.push("/"); + router.push(getLoginPath()); } } catch (error) { console.error("Error during logout:", error); await logout(); - router.push("/"); + router.push(getLoginPath()); } }, [router]); @@ -775,7 +778,8 @@ export function LayoutDataProvider({ searchSpaceId, children }: LayoutDataProvid {t("delete_chat")} - {t("delete_chat_confirm")} {chatToDelete?.name}?{" "} + {t("delete_chat_confirm")}{" "} + {chatToDelete?.name}?{" "} {t("action_cannot_undone")} @@ -835,9 +839,7 @@ export function LayoutDataProvider({ searchSpaceId, children }: LayoutDataProvid {tSidebar("rename") || "Rename"} - {isRenamingChat && ( - - )} + {isRenamingChat && } @@ -865,9 +867,7 @@ export function LayoutDataProvider({ searchSpaceId, children }: LayoutDataProvid className="relative bg-destructive text-destructive-foreground hover:bg-destructive/90" > {tCommon("delete")} - {isDeletingSearchSpace && ( - - )} + {isDeletingSearchSpace && } @@ -895,9 +895,7 @@ export function LayoutDataProvider({ searchSpaceId, children }: LayoutDataProvid className="relative bg-destructive text-destructive-foreground hover:bg-destructive/90" > {t("leave")} - {isLeavingSearchSpace && ( - - )} + {isLeavingSearchSpace && } diff --git 
a/surfsense_web/components/layout/ui/dialogs/CreateSearchSpaceDialog.tsx b/surfsense_web/components/layout/ui/dialogs/CreateSearchSpaceDialog.tsx index 7d0ef73d2..3f01ab24a 100644 --- a/surfsense_web/components/layout/ui/dialogs/CreateSearchSpaceDialog.tsx +++ b/surfsense_web/components/layout/ui/dialogs/CreateSearchSpaceDialog.tsx @@ -83,7 +83,7 @@ export function CreateSearchSpaceDialog({ open, onOpenChange }: CreateSearchSpac return ( - +
@@ -108,7 +108,7 @@ export function CreateSearchSpaceDialog({ open, onOpenChange }: CreateSearchSpac placeholder={t("name_placeholder")} {...field} autoFocus - className="text-sm h-9 sm:h-10" + className="text-sm h-9 sm:h-10 select-text" /> @@ -131,7 +131,7 @@ export function CreateSearchSpaceDialog({ open, onOpenChange }: CreateSearchSpac diff --git a/surfsense_web/components/layout/ui/right-panel/RightPanel.tsx b/surfsense_web/components/layout/ui/right-panel/RightPanel.tsx index ac2f65065..febae35d3 100644 --- a/surfsense_web/components/layout/ui/right-panel/RightPanel.tsx +++ b/surfsense_web/components/layout/ui/right-panel/RightPanel.tsx @@ -10,7 +10,6 @@ import { documentsSidebarOpenAtom } from "@/atoms/documents/ui.atoms"; import { closeEditorPanelAtom, editorPanelAtom } from "@/atoms/editor/editor-panel.atom"; import { rightPanelCollapsedAtom, rightPanelTabAtom } from "@/atoms/layout/right-panel.atom"; import { Button } from "@/components/ui/button"; -import { Skeleton } from "@/components/ui/skeleton"; import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip"; import { DocumentsSidebar } from "../sidebar"; @@ -19,7 +18,7 @@ const EditorPanelContent = dynamic( import("@/components/editor-panel/editor-panel").then((m) => ({ default: m.EditorPanelContent, })), - { ssr: false, loading: () => } + { ssr: false, loading: () => null } ); const HitlEditPanelContent = dynamic( @@ -27,7 +26,7 @@ const HitlEditPanelContent = dynamic( import("@/components/hitl-edit-panel/hitl-edit-panel").then((m) => ({ default: m.HitlEditPanelContent, })), - { ssr: false, loading: () => } + { ssr: false, loading: () => null } ); const ReportPanelContent = dynamic( @@ -35,7 +34,7 @@ const ReportPanelContent = dynamic( import("@/components/report-panel/report-panel").then((m) => ({ default: m.ReportPanelContent, })), - { ssr: false, loading: () => } + { ssr: false, loading: () => null } ); interface RightPanelProps { @@ -78,14 +77,14 @@ export function 
RightPanelExpandButton() { if (!collapsed || !hasContent) return null; return ( -
+
) : ( - + ) : ( - +
-
+
- {deletableSelectedIds.length > 0 && ( -
- -
- )} +
+ {deletableSelectedIds.length > 0 && ( +
+ +
+ )} - { - openEditorPanel({ - documentId: doc.id, - searchSpaceId, - title: doc.title, - }); - }} - onEditDocument={(doc) => { - openEditorPanel({ - documentId: doc.id, - searchSpaceId, - title: doc.title, - }); - }} - onDeleteDocument={(doc) => handleDeleteDocument(doc.id)} - onMoveDocument={handleMoveDocument} - onExportDocument={handleExportDocument} - activeTypes={activeTypes} - onDropIntoFolder={handleDropIntoFolder} - onReorderFolder={handleReorderFolder} - /> + { + openEditorPanel({ + documentId: doc.id, + searchSpaceId, + title: doc.title, + }); + }} + onEditDocument={(doc) => { + openEditorPanel({ + documentId: doc.id, + searchSpaceId, + title: doc.title, + }); + }} + onDeleteDocument={(doc) => handleDeleteDocument(doc.id)} + onMoveDocument={handleMoveDocument} + onExportDocument={handleExportDocument} + onVersionHistory={(doc) => setVersionDocId(doc.id)} + activeTypes={activeTypes} + onDropIntoFolder={handleDropIntoFolder} + onReorderFolder={handleReorderFolder} + watchedFolderIds={watchedFolderIds} + onRescanFolder={handleRescanFolder} + onStopWatchingFolder={handleStopWatching} + /> +
+ {versionDocId !== null && ( + { + if (!open) setVersionDocId(null); + }} + documentId={versionDocId} + /> + )} + (null); const [connectorScrollPos, setConnectorScrollPos] = useState<"top" | "middle" | "bottom">("top"); + const connectorRafRef = useRef(); const handleConnectorScroll = useCallback((e: React.UIEvent) => { const el = e.currentTarget; - const atTop = el.scrollTop <= 2; - const atBottom = el.scrollHeight - el.scrollTop - el.clientHeight <= 2; - setConnectorScrollPos(atTop ? "top" : atBottom ? "bottom" : "middle"); + if (connectorRafRef.current) return; + connectorRafRef.current = requestAnimationFrame(() => { + const atTop = el.scrollTop <= 2; + const atBottom = el.scrollHeight - el.scrollTop - el.clientHeight <= 2; + setConnectorScrollPos(atTop ? "top" : atBottom ? "bottom" : "middle"); + connectorRafRef.current = undefined; + }); }, []); + useEffect( + () => () => { + if (connectorRafRef.current) cancelAnimationFrame(connectorRafRef.current); + }, + [] + ); const [filterDrawerOpen, setFilterDrawerOpen] = useState(false); const [markingAsReadId, setMarkingAsReadId] = useState(null); @@ -289,15 +300,14 @@ export function InboxSidebarContent({ [activeFilter] ); + // Defer non-urgent list updates so the search input stays responsive. + // The deferred snapshot lags one render behind the live value intentionally. + const deferredTabItems = useDeferredValue(activeSource.items); + const deferredSearchItems = useDeferredValue(searchResponse?.items ?? []); + // Two data paths: search mode (API) or default (per-tab data source) const filteredItems = useMemo(() => { - let tabItems: InboxItem[]; - - if (isSearchMode) { - tabItems = searchResponse?.items ?? []; - } else { - tabItems = activeSource.items; - } + const tabItems: InboxItem[] = isSearchMode ? 
deferredSearchItems : deferredTabItems; let result = tabItems; if (activeFilter !== "all") { @@ -310,8 +320,8 @@ export function InboxSidebarContent({ return result; }, [ isSearchMode, - searchResponse, - activeSource.items, + deferredSearchItems, + deferredTabItems, activeTab, activeFilter, selectedSource, @@ -780,36 +790,23 @@ export function InboxSidebarContent({ )} - {isMobile ? ( - - ) : ( - - - - - - {t("mark_all_read") || "Mark all as read"} - - - )} + + + + + + {t("mark_all_read") || "Mark all as read"} + +
@@ -920,31 +917,10 @@ export function InboxSidebarContent({ "transition-colors cursor-pointer", isMarkingAsRead && "opacity-50 pointer-events-none" )} + style={{ contentVisibility: "auto", containIntrinsicSize: "0 80px" }} > - {isMobile ? ( - - ) : ( - + {activeTab === "status" ? ( + )}
diff --git a/surfsense_web/components/layout/ui/sidebar/PageUsageDisplay.tsx b/surfsense_web/components/layout/ui/sidebar/PageUsageDisplay.tsx index 73347e304..7dd46e484 100644 --- a/surfsense_web/components/layout/ui/sidebar/PageUsageDisplay.tsx +++ b/surfsense_web/components/layout/ui/sidebar/PageUsageDisplay.tsx @@ -35,7 +35,7 @@ export function PageUsageDisplay({ pagesUsed, pagesLimit }: PageUsageDisplayProp @@ -48,7 +48,7 @@ export function PageUsageDisplay({ pagesUsed, pagesLimit }: PageUsageDisplayProp {pageBuyingEnabled && ( diff --git a/surfsense_web/components/layout/ui/tabs/DocumentTabContent.tsx b/surfsense_web/components/layout/ui/tabs/DocumentTabContent.tsx index ac279cd4d..97c5b7cd9 100644 --- a/surfsense_web/components/layout/ui/tabs/DocumentTabContent.tsx +++ b/surfsense_web/components/layout/ui/tabs/DocumentTabContent.tsx @@ -1,18 +1,24 @@ "use client"; -import { AlertCircle, Pencil } from "lucide-react"; +import { Download, FileQuestionMark, FileText, Loader2, PenLine, RefreshCw } from "lucide-react"; import { useCallback, useEffect, useRef, useState } from "react"; import { toast } from "sonner"; import { PlateEditor } from "@/components/editor/plate-editor"; import { MarkdownViewer } from "@/components/markdown-viewer"; +import { Alert, AlertDescription } from "@/components/ui/alert"; import { Button } from "@/components/ui/button"; import { authenticatedFetch, getBearerToken, redirectToLogin } from "@/lib/auth-utils"; +const LARGE_DOCUMENT_THRESHOLD = 2 * 1024 * 1024; // 2MB + interface DocumentContent { document_id: number; title: string; document_type?: string; source_markdown: string; + content_size_bytes?: number; + chunk_count?: number; + truncated?: boolean; } function DocumentSkeleton() { @@ -49,13 +55,16 @@ export function DocumentTabContent({ documentId, searchSpaceId, title }: Documen const [error, setError] = useState(null); const [isEditing, setIsEditing] = useState(false); const [saving, setSaving] = useState(false); + const 
[downloading, setDownloading] = useState(false); const [editedMarkdown, setEditedMarkdown] = useState(null); const markdownRef = useRef(""); const initialLoadDone = useRef(false); const changeCountRef = useRef(0); + const isLargeDocument = (doc?.content_size_bytes ?? 0) > LARGE_DOCUMENT_THRESHOLD; + useEffect(() => { - let cancelled = false; + const controller = new AbortController(); setIsLoading(true); setError(null); setDoc(null); @@ -64,7 +73,7 @@ export function DocumentTabContent({ documentId, searchSpaceId, title }: Documen initialLoadDone.current = false; changeCountRef.current = 0; - const fetchContent = async () => { + const doFetch = async () => { const token = getBearerToken(); if (!token) { redirectToLogin(); @@ -72,12 +81,14 @@ export function DocumentTabContent({ documentId, searchSpaceId, title }: Documen } try { - const response = await authenticatedFetch( - `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/documents/${documentId}/editor-content`, - { method: "GET" } + const url = new URL( + `${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/documents/${documentId}/editor-content` ); + url.searchParams.set("max_length", String(LARGE_DOCUMENT_THRESHOLD)); - if (cancelled) return; + const response = await authenticatedFetch(url.toString(), { method: "GET" }); + + if (controller.signal.aborted) return; if (!response.ok) { const errorData = await response @@ -98,18 +109,16 @@ export function DocumentTabContent({ documentId, searchSpaceId, title }: Documen setDoc(data); initialLoadDone.current = true; } catch (err) { - if (cancelled) return; + if (controller.signal.aborted) return; console.error("Error fetching document:", err); setError(err instanceof Error ? 
err.message : "Failed to fetch document"); } finally { - if (!cancelled) setIsLoading(false); + if (!controller.signal.aborted) setIsLoading(false); } }; - fetchContent(); - return () => { - cancelled = true; - }; + doFetch().catch(() => {}); + return () => controller.abort(); }, [documentId, searchSpaceId]); const handleMarkdownChange = useCallback((md: string) => { @@ -160,22 +169,40 @@ export function DocumentTabContent({ documentId, searchSpaceId, title }: Documen if (isLoading) return ; if (error || !doc) { + const isProcessing = error?.toLowerCase().includes("still being processed"); return ( -
- -
-

Failed to load document

-

- {error || "An unknown error occurred"} -

+
+
+ {isProcessing ? ( + + ) : ( + + )}
+
+

+ {isProcessing ? "Document is processing" : "Document unavailable"} +

+

{error || "An unknown error occurred"}

+
+ {!isProcessing && ( + + )}
); } - const isEditable = EDITABLE_DOCUMENT_TYPES.has(doc.document_type ?? ""); + const isEditable = EDITABLE_DOCUMENT_TYPES.has(doc.document_type ?? "") && !isLargeDocument; - if (isEditing) { + if (isEditing && !isLargeDocument) { return (
@@ -229,14 +256,69 @@ export function DocumentTabContent({ documentId, searchSpaceId, title }: Documen onClick={() => setIsEditing(true)} className="gap-1.5" > - + Edit )}
- + {isLargeDocument ? ( + <> + + + + + This document is too large for the editor ( + {Math.round((doc.content_size_bytes ?? 0) / 1024 / 1024)}MB,{" "} + {doc.chunk_count ?? 0} chunks). Showing a preview below. + + + + + + + ) : ( + + )}
diff --git a/surfsense_web/components/layout/ui/tabs/TabBar.tsx b/surfsense_web/components/layout/ui/tabs/TabBar.tsx index 18e1ba141..8d0d986d3 100644 --- a/surfsense_web/components/layout/ui/tabs/TabBar.tsx +++ b/surfsense_web/components/layout/ui/tabs/TabBar.tsx @@ -72,7 +72,7 @@ export function TabBar({ onTabSwitch, onNewChat, rightActions, className }: TabB if (tabs.length <= 1) return null; return ( -
+
maxLength; + const displayContent = isTruncated ? content.slice(0, maxLength) : content; + const processedContent = convertLatexDelimiters(stripOuterMarkdownFence(displayContent)); const components: StreamdownProps["components"] = { p: ({ children, ...props }) => (

@@ -124,16 +129,32 @@ export function MarkdownViewer({ content, className }: MarkdownViewerProps) {

), hr: ({ ...props }) =>
, - img: ({ src, alt, width: _w, height: _h, ...props }) => ( - // eslint-disable-next-line @next/next/no-img-element - {alt - ), + img: ({ src, alt, width: _w, height: _h, ...props }) => { + const isDataOrUnknownUrl = + typeof src === "string" && (src.startsWith("data:") || !src.startsWith("http")); + + return isDataOrUnknownUrl ? ( + // eslint-disable-next-line @next/next/no-img-element + {alt + ) : ( + {alt + ); + }, table: ({ ...props }) => (
@@ -171,6 +192,12 @@ export function MarkdownViewer({ content, className }: MarkdownViewerProps) { > {processedContent} + {isTruncated && ( +

+ Content truncated ({Math.round(content.length / 1024)}KB total). Showing first{" "} + {Math.round(maxLength / 1024)}KB. +

+ )} ); } diff --git a/surfsense_web/components/new-chat/chat-header.tsx b/surfsense_web/components/new-chat/chat-header.tsx index 3263a2b07..0c5253c6c 100644 --- a/surfsense_web/components/new-chat/chat-header.tsx +++ b/surfsense_web/components/new-chat/chat-header.tsx @@ -3,11 +3,14 @@ import { useCallback, useState } from "react"; import { ImageConfigDialog } from "@/components/shared/image-config-dialog"; import { ModelConfigDialog } from "@/components/shared/model-config-dialog"; +import { VisionConfigDialog } from "@/components/shared/vision-config-dialog"; import type { GlobalImageGenConfig, GlobalNewLLMConfig, + GlobalVisionLLMConfig, ImageGenerationConfig, NewLLMConfigPublic, + VisionLLMConfig, } from "@/contracts/types/new-llm-config.types"; import { ModelSelector } from "./model-selector"; @@ -33,6 +36,14 @@ export function ChatHeader({ searchSpaceId, className }: ChatHeaderProps) { const [isImageGlobal, setIsImageGlobal] = useState(false); const [imageDialogMode, setImageDialogMode] = useState<"create" | "edit" | "view">("view"); + // Vision config dialog state + const [visionDialogOpen, setVisionDialogOpen] = useState(false); + const [selectedVisionConfig, setSelectedVisionConfig] = useState< + VisionLLMConfig | GlobalVisionLLMConfig | null + >(null); + const [isVisionGlobal, setIsVisionGlobal] = useState(false); + const [visionDialogMode, setVisionDialogMode] = useState<"create" | "edit" | "view">("view"); + // LLM handlers const handleEditLLMConfig = useCallback( (config: NewLLMConfigPublic | GlobalNewLLMConfig, global: boolean) => { @@ -79,6 +90,29 @@ export function ChatHeader({ searchSpaceId, className }: ChatHeaderProps) { if (!open) setSelectedImageConfig(null); }, []); + // Vision model handlers + const handleAddVisionModel = useCallback(() => { + setSelectedVisionConfig(null); + setIsVisionGlobal(false); + setVisionDialogMode("create"); + setVisionDialogOpen(true); + }, []); + + const handleEditVisionConfig = useCallback( + (config: 
VisionLLMConfig | GlobalVisionLLMConfig, global: boolean) => { + setSelectedVisionConfig(config); + setIsVisionGlobal(global); + setVisionDialogMode(global ? "view" : "edit"); + setVisionDialogOpen(true); + }, + [] + ); + + const handleVisionDialogClose = useCallback((open: boolean) => { + setVisionDialogOpen(open); + if (!open) setSelectedVisionConfig(null); + }, []); + return (
+
); } diff --git a/surfsense_web/components/new-chat/chat-share-button.tsx b/surfsense_web/components/new-chat/chat-share-button.tsx index 82e8c6a78..4fc35aba1 100644 --- a/surfsense_web/components/new-chat/chat-share-button.tsx +++ b/surfsense_web/components/new-chat/chat-share-button.tsx @@ -163,21 +163,16 @@ export function ChatShareButton({ thread, onVisibilityChange, className }: ChatS )} - - - - - - - Share settings - + + + >(new Map()); const scrollContainerRef = useRef(null); @@ -245,12 +249,14 @@ export const DocumentMentionPicker = forwardRef< * Client-side filtering for single character searches. * Filters cached documents locally for instant feedback without additional API calls. * Server-side search is reserved for 2+ character queries to leverage database indexing. + * Uses deferredSearch (a deferred snapshot of debouncedSearch) so this memo is treated + * as non-urgent — React can interrupt it to keep the input responsive. */ const clientFilteredDocs = useMemo(() => { if (!isSingleCharSearch) return null; - const searchLower = debouncedSearch.trim().toLowerCase(); + const searchLower = deferredSearch.trim().toLowerCase(); return accumulatedDocuments.filter((doc) => doc.title.toLowerCase().includes(searchLower)); - }, [isSingleCharSearch, debouncedSearch, accumulatedDocuments]); + }, [isSingleCharSearch, deferredSearch, accumulatedDocuments]); // Select data source based on search length: client-filtered for single char, server results for 2+ const actualDocuments = isSingleCharSearch ? (clientFilteredDocs ?? 
[]) : accumulatedDocuments; diff --git a/surfsense_web/components/new-chat/model-selector.tsx b/surfsense_web/components/new-chat/model-selector.tsx index 7a2a471ba..46b4a2c3a 100644 --- a/surfsense_web/components/new-chat/model-selector.tsx +++ b/surfsense_web/components/new-chat/model-selector.tsx @@ -1,7 +1,7 @@ "use client"; import { useAtomValue } from "jotai"; -import { Bot, Check, ChevronDown, Edit3, ImageIcon, Plus, Zap } from "lucide-react"; +import { Bot, Check, ChevronDown, Edit3, Eye, ImageIcon, Plus, Search, Zap } from "lucide-react"; import { type UIEvent, useCallback, useMemo, useState } from "react"; import { toast } from "sonner"; import { @@ -15,6 +15,10 @@ import { newLLMConfigsAtom, } from "@/atoms/new-llm-config/new-llm-config-query.atoms"; import { activeSearchSpaceIdAtom } from "@/atoms/search-spaces/search-space-query.atoms"; +import { + globalVisionLLMConfigsAtom, + visionLLMConfigsAtom, +} from "@/atoms/vision-llm-config/vision-llm-config-query.atoms"; import { Badge } from "@/components/ui/badge"; import { Button } from "@/components/ui/button"; import { @@ -32,8 +36,10 @@ import { Tabs, TabsContent, TabsList, TabsTrigger } from "@/components/ui/tabs"; import type { GlobalImageGenConfig, GlobalNewLLMConfig, + GlobalVisionLLMConfig, ImageGenerationConfig, NewLLMConfigPublic, + VisionLLMConfig, } from "@/contracts/types/new-llm-config.types"; import { getProviderIcon } from "@/lib/provider-icons"; import { cn } from "@/lib/utils"; @@ -43,6 +49,8 @@ interface ModelSelectorProps { onAddNewLLM: () => void; onEditImage?: (config: ImageGenerationConfig | GlobalImageGenConfig, isGlobal: boolean) => void; onAddNewImage?: () => void; + onEditVision?: (config: VisionLLMConfig | GlobalVisionLLMConfig, isGlobal: boolean) => void; + onAddNewVision?: () => void; className?: string; } @@ -51,14 +59,18 @@ export function ModelSelector({ onAddNewLLM, onEditImage, onAddNewImage, + onEditVision, + onAddNewVision, className, }: ModelSelectorProps) { const 
[open, setOpen] = useState(false); - const [activeTab, setActiveTab] = useState<"llm" | "image">("llm"); + const [activeTab, setActiveTab] = useState<"llm" | "image" | "vision">("llm"); const [llmSearchQuery, setLlmSearchQuery] = useState(""); const [imageSearchQuery, setImageSearchQuery] = useState(""); + const [visionSearchQuery, setVisionSearchQuery] = useState(""); const [llmScrollPos, setLlmScrollPos] = useState<"top" | "middle" | "bottom">("top"); const [imageScrollPos, setImageScrollPos] = useState<"top" | "middle" | "bottom">("top"); + const [visionScrollPos, setVisionScrollPos] = useState<"top" | "middle" | "bottom">("top"); const handleListScroll = useCallback( (setter: typeof setLlmScrollPos) => (e: UIEvent) => { const el = e.currentTarget; @@ -82,8 +94,21 @@ export function ModelSelector({ useAtomValue(globalImageGenConfigsAtom); const { data: imageUserConfigs, isLoading: imageUserLoading } = useAtomValue(imageGenConfigsAtom); + // Vision data + const { data: visionGlobalConfigs, isLoading: visionGlobalLoading } = useAtomValue( + globalVisionLLMConfigsAtom + ); + const { data: visionUserConfigs, isLoading: visionUserLoading } = + useAtomValue(visionLLMConfigsAtom); + const isLoading = - llmUserLoading || llmGlobalLoading || prefsLoading || imageGlobalLoading || imageUserLoading; + llmUserLoading || + llmGlobalLoading || + prefsLoading || + imageGlobalLoading || + imageUserLoading || + visionGlobalLoading || + visionUserLoading; // ─── LLM current config ─── const currentLLMConfig = useMemo(() => { @@ -116,6 +141,24 @@ export function ModelSelector({ ); }, [currentImageConfig]); + // ─── Vision current config ─── + const currentVisionConfig = useMemo(() => { + if (!preferences) return null; + const id = preferences.vision_llm_config_id; + if (id === null || id === undefined) return null; + const globalMatch = visionGlobalConfigs?.find((c) => c.id === id); + if (globalMatch) return globalMatch; + return visionUserConfigs?.find((c) => c.id === id) ?? 
null; + }, [preferences, visionGlobalConfigs, visionUserConfigs]); + + const isVisionAutoMode = useMemo(() => { + return ( + currentVisionConfig && + "is_auto_mode" in currentVisionConfig && + currentVisionConfig.is_auto_mode + ); + }, [currentVisionConfig]); + // ─── LLM filtering ─── const filteredLLMGlobal = useMemo(() => { if (!llmGlobalConfigs) return []; @@ -170,6 +213,33 @@ export function ModelSelector({ const totalImageModels = (imageGlobalConfigs?.length ?? 0) + (imageUserConfigs?.length ?? 0); + // ─── Vision filtering ─── + const filteredVisionGlobal = useMemo(() => { + if (!visionGlobalConfigs) return []; + if (!visionSearchQuery) return visionGlobalConfigs; + const q = visionSearchQuery.toLowerCase(); + return visionGlobalConfigs.filter( + (c) => + c.name.toLowerCase().includes(q) || + c.model_name.toLowerCase().includes(q) || + c.provider.toLowerCase().includes(q) + ); + }, [visionGlobalConfigs, visionSearchQuery]); + + const filteredVisionUser = useMemo(() => { + if (!visionUserConfigs) return []; + if (!visionSearchQuery) return visionUserConfigs; + const q = visionSearchQuery.toLowerCase(); + return visionUserConfigs.filter( + (c) => + c.name.toLowerCase().includes(q) || + c.model_name.toLowerCase().includes(q) || + c.provider.toLowerCase().includes(q) + ); + }, [visionUserConfigs, visionSearchQuery]); + + const totalVisionModels = (visionGlobalConfigs?.length ?? 0) + (visionUserConfigs?.length ?? 
0); + // ─── Handlers ─── const handleSelectLLM = useCallback( async (config: NewLLMConfigPublic | GlobalNewLLMConfig) => { @@ -229,6 +299,30 @@ export function ModelSelector({ [currentImageConfig, searchSpaceId, updatePreferences] ); + const handleSelectVision = useCallback( + async (configId: number) => { + if (currentVisionConfig?.id === configId) { + setOpen(false); + return; + } + if (!searchSpaceId) { + toast.error("No search space selected"); + return; + } + try { + await updatePreferences({ + search_space_id: Number(searchSpaceId), + data: { vision_llm_config_id: configId }, + }); + toast.success("Vision model updated"); + setOpen(false); + } catch { + toast.error("Failed to switch vision model"); + } + }, + [currentVisionConfig, searchSpaceId, updatePreferences] + ); + return ( @@ -282,6 +376,23 @@ export function ModelSelector({ ) : ( )} + + {/* Divider */} +
+ + {/* Vision section */} + {currentVisionConfig ? ( + <> + {getProviderIcon(currentVisionConfig.provider, { + isAutoMode: isVisionAutoMode ?? false, + })} + + {currentVisionConfig.name} + + + ) : ( + + )} )} @@ -295,25 +406,32 @@ export function ModelSelector({ > setActiveTab(v as "llm" | "image")} + onValueChange={(v) => setActiveTab(v as "llm" | "image" | "vision")} className="w-full" >
- + - + LLM - + Image + + + Vision +
@@ -344,7 +462,7 @@ export function ModelSelector({ >
- +

No models found

Try a different search term

@@ -498,7 +616,7 @@ export function ModelSelector({ }} > - Add LLM Model + Add Model
@@ -531,8 +649,9 @@ export function ModelSelector({ >
- +

No image models found

+

Try a different search term

@@ -675,6 +794,174 @@ export function ModelSelector({ + + {/* ─── Vision Tab ─── */} + + + {totalVisionModels > 3 && ( +
+ +
+ )} + + +
+ +

No vision models found

+

Try a different search term

+
+
+ + {filteredVisionGlobal.length > 0 && ( + +
+ Global Vision Models +
+ {filteredVisionGlobal.map((config) => { + const isSelected = currentVisionConfig?.id === config.id; + const isAuto = "is_auto_mode" in config && config.is_auto_mode; + return ( + handleSelectVision(config.id)} + className={cn( + "mx-2 rounded-lg mb-1 cursor-pointer group transition-all hover:bg-accent/50 dark:hover:bg-white/[0.06]", + isSelected && "bg-accent/80 dark:bg-white/[0.06]" + )} + > +
+
+ {getProviderIcon(config.provider, { isAutoMode: isAuto })} +
+
+
+ {config.name} + {isAuto && ( + + Recommended + + )} + {isSelected && } +
+ + {isAuto ? "Auto Mode" : config.model_name} + +
+ {onEditVision && !isAuto && ( + + )} +
+
+ ); + })} +
+ )} + + {filteredVisionUser.length > 0 && ( + <> + {filteredVisionGlobal.length > 0 && ( + + )} + +
+ Your Vision Models +
+ {filteredVisionUser.map((config) => { + const isSelected = currentVisionConfig?.id === config.id; + return ( + handleSelectVision(config.id)} + className={cn( + "mx-2 rounded-lg mb-1 cursor-pointer group transition-all hover:bg-accent/50 dark:hover:bg-white/[0.06]", + isSelected && "bg-accent/80 dark:bg-white/[0.06]" + )} + > +
+
{getProviderIcon(config.provider)}
+
+
+ {config.name} + {isSelected && ( + + )} +
+ + {config.model_name} + +
+ {onEditVision && ( + + )} +
+
+ ); + })} +
+ + )} + + {onAddNewVision && ( +
+ +
+ )} +
+
+
diff --git a/surfsense_web/components/new-chat/prompt-picker.tsx b/surfsense_web/components/new-chat/prompt-picker.tsx index 9fc435111..3e6457b8c 100644 --- a/surfsense_web/components/new-chat/prompt-picker.tsx +++ b/surfsense_web/components/new-chat/prompt-picker.tsx @@ -5,6 +5,7 @@ import { Plus, Zap } from "lucide-react"; import { forwardRef, useCallback, + useDeferredValue, useEffect, useImperativeHandle, useMemo, @@ -41,15 +42,19 @@ export const PromptPicker = forwardRef(funct const shouldScrollRef = useRef(false); const itemRefs = useRef>(new Map()); + // Defer the search value so filtering is non-urgent and the input stays responsive + const deferredSearch = useDeferredValue(externalSearch); + const filtered = useMemo(() => { const list = prompts ?? []; - if (!externalSearch) return list; - return list.filter((a) => a.name.toLowerCase().includes(externalSearch.toLowerCase())); - }, [prompts, externalSearch]); + if (!deferredSearch) return list; + return list.filter((a) => a.name.toLowerCase().includes(deferredSearch.toLowerCase())); + }, [prompts, deferredSearch]); - const prevSearchRef = useRef(externalSearch); - if (prevSearchRef.current !== externalSearch) { - prevSearchRef.current = externalSearch; + // Reset highlight when the deferred (filtered) search changes + const prevSearchRef = useRef(deferredSearch); + if (prevSearchRef.current !== deferredSearch) { + prevSearchRef.current = deferredSearch; if (highlightedIndex !== 0) { setHighlightedIndex(0); } diff --git a/surfsense_web/components/new-chat/source-detail-panel.tsx b/surfsense_web/components/new-chat/source-detail-panel.tsx index b02b2e217..ed6c7e4ac 100644 --- a/surfsense_web/components/new-chat/source-detail-panel.tsx +++ b/surfsense_web/components/new-chat/source-detail-panel.tsx @@ -1,7 +1,18 @@ "use client"; import { useQuery } from "@tanstack/react-query"; -import { BookOpen, ChevronDown, ExternalLink, FileText, Hash, Sparkles, X } from "lucide-react"; +import { + BookOpen, + ChevronDown, 
+ ChevronUp, + ExternalLink, + FileQuestionMark, + FileText, + Hash, + Loader2, + Sparkles, + X, +} from "lucide-react"; import { AnimatePresence, motion, useReducedMotion } from "motion/react"; import { useTranslations } from "next-intl"; import type React from "react"; @@ -10,7 +21,6 @@ import { createPortal } from "react-dom"; import { MarkdownViewer } from "@/components/markdown-viewer"; import { Badge } from "@/components/ui/badge"; import { Button } from "@/components/ui/button"; -import { Collapsible, CollapsibleContent, CollapsibleTrigger } from "@/components/ui/collapsible"; import { ScrollArea } from "@/components/ui/scroll-area"; import { Spinner } from "@/components/ui/spinner"; import type { @@ -48,7 +58,8 @@ const formatDocumentType = (type: string) => { // which break auto-scroll functionality interface ChunkCardProps { chunk: { id: number; content: string }; - index: number; + localIndex: number; + chunkNumber: number; totalChunks: number; isCited: boolean; isActive: boolean; @@ -56,51 +67,52 @@ interface ChunkCardProps { } const ChunkCard = memo( - forwardRef(({ chunk, index, totalChunks, isCited }, ref) => { - return ( -
- {/* Cited indicator glow effect */} - {isCited &&
} - - {/* Header */} -
-
-
- {index + 1} -
- of {totalChunks} chunks -
- {isCited && ( - - - Cited Source - + forwardRef( + ({ chunk, localIndex, chunkNumber, totalChunks, isCited }, ref) => { + return ( +
+ > + {isCited &&
} - {/* Content */} -
- +
+
+
+ {chunkNumber} +
+ + Chunk {chunkNumber} of {totalChunks} + +
+ {isCited && ( + + + Cited Source + + )} +
+ +
+ +
-
- ); - }) + ); + } + ) ); ChunkCard.displayName = "ChunkCard"; @@ -118,7 +130,6 @@ export function SourceDetailPanel({ const t = useTranslations("dashboard"); const scrollAreaRef = useRef(null); const hasScrolledRef = useRef(false); // Use ref to avoid stale closures - const [summaryOpen, setSummaryOpen] = useState(false); const [activeChunkIndex, setActiveChunkIndex] = useState(null); const [mounted, setMounted] = useState(false); const [_hasScrolledToCited, setHasScrolledToCited] = useState(false); @@ -140,20 +151,93 @@ export function SourceDetailPanel({ if (isDocsChunk) { return documentsApiService.getSurfsenseDocByChunk(chunkId); } - return documentsApiService.getDocumentByChunk({ chunk_id: chunkId }); + return documentsApiService.getDocumentByChunk({ chunk_id: chunkId, chunk_window: 5 }); }, enabled: !!chunkId && open, staleTime: 5 * 60 * 1000, }); + const totalChunks = + documentData && "total_chunks" in documentData + ? (documentData.total_chunks ?? documentData.chunks.length) + : (documentData?.chunks?.length ?? 0); + const [beforeChunks, setBeforeChunks] = useState< + Array<{ id: number; content: string; created_at: string }> + >([]); + const [afterChunks, setAfterChunks] = useState< + Array<{ id: number; content: string; created_at: string }> + >([]); + const [loadingBefore, setLoadingBefore] = useState(false); + const [loadingAfter, setLoadingAfter] = useState(false); + + useEffect(() => { + setBeforeChunks([]); + setAfterChunks([]); + }, [chunkId, open]); + + const chunkStartIndex = + documentData && "chunk_start_index" in documentData ? (documentData.chunk_start_index ?? 0) : 0; + const initialChunks = documentData?.chunks ?? 
[]; + const allChunks = [...beforeChunks, ...initialChunks, ...afterChunks]; + const absoluteStart = chunkStartIndex - beforeChunks.length; + const absoluteEnd = chunkStartIndex + initialChunks.length + afterChunks.length; + const canLoadBefore = absoluteStart > 0; + const canLoadAfter = absoluteEnd < totalChunks; + + const EXPAND_SIZE = 10; + + const loadBefore = useCallback(async () => { + if (!documentData || !("search_space_id" in documentData) || !canLoadBefore) return; + setLoadingBefore(true); + try { + const count = Math.min(EXPAND_SIZE, absoluteStart); + const result = await documentsApiService.getDocumentChunks({ + document_id: documentData.id, + page: 0, + page_size: count, + start_offset: absoluteStart - count, + }); + const existingIds = new Set(allChunks.map((c) => c.id)); + const newChunks = result.items + .filter((c) => !existingIds.has(c.id)) + .map((c) => ({ id: c.id, content: c.content, created_at: c.created_at })); + setBeforeChunks((prev) => [...newChunks, ...prev]); + } catch (err) { + console.error("Failed to load earlier chunks:", err); + } finally { + setLoadingBefore(false); + } + }, [documentData, absoluteStart, canLoadBefore, allChunks]); + + const loadAfter = useCallback(async () => { + if (!documentData || !("search_space_id" in documentData) || !canLoadAfter) return; + setLoadingAfter(true); + try { + const result = await documentsApiService.getDocumentChunks({ + document_id: documentData.id, + page: 0, + page_size: EXPAND_SIZE, + start_offset: absoluteEnd, + }); + const existingIds = new Set(allChunks.map((c) => c.id)); + const newChunks = result.items + .filter((c) => !existingIds.has(c.id)) + .map((c) => ({ id: c.id, content: c.content, created_at: c.created_at })); + setAfterChunks((prev) => [...prev, ...newChunks]); + } catch (err) { + console.error("Failed to load later chunks:", err); + } finally { + setLoadingAfter(false); + } + }, [documentData, absoluteEnd, canLoadAfter, allChunks]); + const isDirectRenderSource = sourceType 
=== "TAVILY_API" || sourceType === "LINKUP_API" || sourceType === "SEARXNG_API" || sourceType === "BAIDU_SEARCH_API"; - // Find cited chunk index - const citedChunkIndex = documentData?.chunks?.findIndex((chunk) => chunk.id === chunkId) ?? -1; + const citedChunkIndex = allChunks.findIndex((chunk) => chunk.id === chunkId); // Simple scroll function that scrolls to a chunk by index const scrollToChunkByIndex = useCallback( @@ -336,10 +420,10 @@ export function SourceDetailPanel({ {documentData && "document_type" in documentData ? formatDocumentType(documentData.document_type) : sourceType && formatDocumentType(sourceType)} - {documentData?.chunks && ( + {totalChunks > 0 && ( - • {documentData.chunks.length} chunk - {documentData.chunks.length !== 1 ? "s" : ""} + • {totalChunks} chunk{totalChunks !== 1 ? "s" : ""} + {allChunks.length < totalChunks && ` (showing ${allChunks.length})`} )}

@@ -392,13 +476,11 @@ export function SourceDetailPanel({ animate={{ opacity: 1, scale: 1 }} className="flex flex-col items-center gap-4 text-center px-6" > -
- +
+
-

- Failed to load document -

+

Document unavailable

{documentByChunkFetchingError.message || "An unexpected error occurred. Please try again."} @@ -450,7 +532,7 @@ export function SourceDetailPanel({ {!isDirectRenderSource && documentData && (

{/* Chunk Navigation Sidebar */} - {documentData.chunks.length > 1 && ( + {allChunks.length > 1 && (
- {documentData.chunks.map((chunk, idx) => { + {allChunks.map((chunk, idx) => { + const absNum = absoluteStart + idx + 1; const isCited = chunk.id === chunkId; const isActive = activeChunkIndex === idx; return ( @@ -478,9 +561,9 @@ export function SourceDetailPanel({ ? "bg-muted text-foreground" : "bg-muted/50 text-muted-foreground hover:bg-muted hover:text-foreground" )} - title={isCited ? `Chunk ${idx + 1} (Cited)` : `Chunk ${idx + 1}`} + title={isCited ? `Chunk ${absNum} (Cited)` : `Chunk ${absNum}`} > - {idx + 1} + {absNum} {isCited && ( @@ -524,44 +607,11 @@ export function SourceDetailPanel({ )} - {/* Summary Collapsible */} - {documentData.content && ( - - - - - - Document Summary - - - - - - - - - - - - - )} - {/* Chunks Header */} -
+

- Content Chunks + Chunks {absoluteStart + 1}–{absoluteEnd} of {totalChunks}

{citedChunkIndex !== -1 && ( +
+ )} + {/* Chunks */}
- {documentData.chunks.map((chunk, idx) => { + {allChunks.map((chunk, idx) => { const isCited = chunk.id === chunkId; + const chunkNumber = absoluteStart + idx + 1; return ( 30} + disableLayoutAnimation={allChunks.length > 30} /> ); })}
+ + {/* Load Later */} + {canLoadAfter && ( +
+ +
+ )}
diff --git a/surfsense_web/components/onboarding-tour.tsx b/surfsense_web/components/onboarding-tour.tsx index d35799137..178b6c97e 100644 --- a/surfsense_web/components/onboarding-tour.tsx +++ b/surfsense_web/components/onboarding-tour.tsx @@ -429,6 +429,7 @@ export function OnboardingTour() { const pathname = usePathname(); const retryCountRef = useRef(0); const retryTimerRef = useRef | null>(null); + const startCheckTimerRef = useRef | null>(null); const maxRetries = 10; // Track previous user ID to detect user changes const previousUserIdRef = useRef(null); @@ -460,6 +461,7 @@ export function OnboardingTour() { // Find and track target element with retry logic const updateTarget = useCallback(() => { + if (retryTimerRef.current) clearTimeout(retryTimerRef.current); if (!currentStep) return; const el = document.querySelector(currentStep.target); @@ -480,11 +482,13 @@ export function OnboardingTour() { } }, 200); } + }, [currentStep]); + useEffect(() => { return () => { if (retryTimerRef.current) clearTimeout(retryTimerRef.current); }; - }, [currentStep]); + }, []); // Check if tour should run: localStorage + data validation with user ID tracking useEffect(() => { @@ -573,15 +577,15 @@ export function OnboardingTour() { setPosition(calculatePosition(connectorEl, TOUR_STEPS[0].placement)); } else { // Retry after delay - setTimeout(checkAndStartTour, 200); + startCheckTimerRef.current = setTimeout(checkAndStartTour, 200); } }; // Start checking after initial delay - const timer = setTimeout(checkAndStartTour, 500); + startCheckTimerRef.current = setTimeout(checkAndStartTour, 500); return () => { cancelled = true; - clearTimeout(timer); + if (startCheckTimerRef.current) clearTimeout(startCheckTimerRef.current); }; }, [mounted, user?.id, searchSpaceId, pathname, threadsData, documentTypeCounts, connectors]); @@ -598,11 +602,11 @@ export function OnboardingTour() { }; window.addEventListener("resize", handleUpdate); - window.addEventListener("scroll", handleUpdate, 
true); + window.addEventListener("scroll", handleUpdate, { capture: true, passive: true }); return () => { window.removeEventListener("resize", handleUpdate); - window.removeEventListener("scroll", handleUpdate, true); + window.removeEventListener("scroll", handleUpdate, { capture: true }); }; }, [isActive, targetEl, currentStep?.placement]); diff --git a/surfsense_web/components/platform-gate.tsx b/surfsense_web/components/platform-gate.tsx new file mode 100644 index 000000000..6908c6d32 --- /dev/null +++ b/surfsense_web/components/platform-gate.tsx @@ -0,0 +1,16 @@ +"use client"; + +import type { ReactNode } from "react"; +import { usePlatform } from "@/hooks/use-platform"; + +export function DesktopOnly({ children }: { children: ReactNode }) { + const { isDesktop } = usePlatform(); + if (!isDesktop) return null; + return <>{children}; +} + +export function WebOnly({ children }: { children: ReactNode }) { + const { isWeb } = usePlatform(); + if (!isWeb) return null; + return <>{children}; +} diff --git a/surfsense_web/components/public-chat-snapshots/public-chat-snapshot-row.tsx b/surfsense_web/components/public-chat-snapshots/public-chat-snapshot-row.tsx index ddf4746aa..4bb49c48d 100644 --- a/surfsense_web/components/public-chat-snapshots/public-chat-snapshot-row.tsx +++ b/surfsense_web/components/public-chat-snapshots/public-chat-snapshot-row.tsx @@ -1,6 +1,6 @@ "use client"; -import { Check, Copy, ExternalLink, MessageSquare, Trash2 } from "lucide-react"; +import { Check, Copy, Dot, ExternalLink, MessageSquare, Trash2 } from "lucide-react"; import { useCallback, useRef, useState } from "react"; import { Avatar, AvatarFallback, AvatarImage } from "@/components/ui/avatar"; import { Badge } from "@/components/ui/badge"; @@ -153,7 +153,7 @@ export function PublicChatSnapshotRow({ {formattedDate} {member && ( <> - · + diff --git a/surfsense_web/components/public-chat-snapshots/public-chat-snapshots-empty-state.tsx 
b/surfsense_web/components/public-chat-snapshots/public-chat-snapshots-empty-state.tsx index 4a4a57770..4e8ec5bb6 100644 --- a/surfsense_web/components/public-chat-snapshots/public-chat-snapshots-empty-state.tsx +++ b/surfsense_web/components/public-chat-snapshots/public-chat-snapshots-empty-state.tsx @@ -11,11 +11,8 @@ export function PublicChatSnapshotsEmptyState({ }: PublicChatSnapshotsEmptyStateProps) { return (
-
- -
-

{title}

-

{description}

+

{title}

+

{description}

); } diff --git a/surfsense_web/components/public-chat/public-chat-footer.tsx b/surfsense_web/components/public-chat/public-chat-footer.tsx index 79b317ddf..e341a9a0c 100644 --- a/surfsense_web/components/public-chat/public-chat-footer.tsx +++ b/surfsense_web/components/public-chat/public-chat-footer.tsx @@ -1,7 +1,7 @@ "use client"; import { Copy } from "lucide-react"; -import { useRouter, useSearchParams } from "next/navigation"; +import { useRouter } from "next/navigation"; import { useCallback, useEffect, useRef, useState } from "react"; import { toast } from "sonner"; import { Button } from "@/components/ui/button"; @@ -15,7 +15,6 @@ interface PublicChatFooterProps { export function PublicChatFooter({ shareToken }: PublicChatFooterProps) { const router = useRouter(); - const searchParams = useSearchParams(); const [isCloning, setIsCloning] = useState(false); const hasAutoCloned = useRef(false); @@ -36,9 +35,11 @@ export function PublicChatFooter({ shareToken }: PublicChatFooterProps) { } }, [shareToken, router]); - // Auto-trigger clone if user just logged in with action=clone + // Auto-trigger clone if user just logged in with action=clone. + // Read from window.location.search on mount — no subscription needed since + // this is a one-time post-login check. 
(Vercel Best Practice: rerender-defer-reads 5.2) useEffect(() => { - const action = searchParams.get("action"); + const action = new URLSearchParams(window.location.search).get("action"); const token = getBearerToken(); // Only auto-clone once, if authenticated and action=clone is present @@ -46,7 +47,7 @@ export function PublicChatFooter({ shareToken }: PublicChatFooterProps) { hasAutoCloned.current = true; triggerClone(); } - }, [searchParams, isCloning, triggerClone]); + }, [isCloning, triggerClone]); const handleCopyAndContinue = async () => { const token = getBearerToken(); diff --git a/surfsense_web/components/settings/image-model-manager.tsx b/surfsense_web/components/settings/image-model-manager.tsx index 8f08b7db3..23162b629 100644 --- a/surfsense_web/components/settings/image-model-manager.tsx +++ b/surfsense_web/components/settings/image-model-manager.tsx @@ -1,7 +1,7 @@ "use client"; import { useAtomValue } from "jotai"; -import { AlertCircle, Edit3, Info, Plus, RefreshCw, Trash2, Wand2 } from "lucide-react"; +import { AlertCircle, Dot, Edit3, Info, RefreshCw, Trash2, Wand2 } from "lucide-react"; import { useMemo, useState } from "react"; import { deleteImageGenConfigMutationAtom } from "@/atoms/image-gen-config/image-gen-config-mutation.atoms"; import { @@ -240,27 +240,14 @@ export function ImageModelManager({ searchSpaceId }: ImageModelManagerProps) { {!isLoading && (
{(userConfigs?.length ?? 0) === 0 ? ( - + -
- -
-

No Image Models Yet

-

+

No Image Models Yet

+

{canCreate ? "Add your own image generation model (DALL-E 3, GPT Image 1, etc.)" : "No image models have been added to this space yet. Contact a space owner to add one."}

- {canCreate && ( - - )}
) : ( @@ -343,7 +330,7 @@ export function ImageModelManager({ searchSpaceId }: ImageModelManagerProps) { {member && ( <> - · + diff --git a/surfsense_web/components/settings/llm-role-manager.tsx b/surfsense_web/components/settings/llm-role-manager.tsx index 07ec492a3..995159d58 100644 --- a/surfsense_web/components/settings/llm-role-manager.tsx +++ b/surfsense_web/components/settings/llm-role-manager.tsx @@ -4,16 +4,15 @@ import { useAtomValue } from "jotai"; import { AlertCircle, Bot, - CheckCircle, + CircleCheck, CircleDashed, + Eye, FileText, ImageIcon, RefreshCw, - RotateCcw, - Save, Shuffle, } from "lucide-react"; -import { useEffect, useState } from "react"; +import { useCallback, useEffect, useRef, useState } from "react"; import { toast } from "sonner"; import { globalImageGenConfigsAtom, @@ -25,6 +24,10 @@ import { llmPreferencesAtom, newLLMConfigsAtom, } from "@/atoms/new-llm-config/new-llm-config-query.atoms"; +import { + globalVisionLLMConfigsAtom, + visionLLMConfigsAtom, +} from "@/atoms/vision-llm-config/vision-llm-config-query.atoms"; import { Alert, AlertDescription } from "@/components/ui/alert"; import { Badge } from "@/components/ui/badge"; import { Button } from "@/components/ui/button"; @@ -40,6 +43,7 @@ import { SelectValue, } from "@/components/ui/select"; import { Skeleton } from "@/components/ui/skeleton"; +import { Spinner } from "@/components/ui/spinner"; import { getProviderIcon } from "@/lib/provider-icons"; import { cn } from "@/lib/utils"; @@ -48,8 +52,8 @@ const ROLE_DESCRIPTIONS = { icon: Bot, title: "Agent LLM", description: "Primary LLM for chat interactions and agent operations", - color: "text-blue-600 dark:text-blue-400", - bgColor: "bg-blue-500/10", + color: "text-muted-foreground", + bgColor: "bg-muted", prefKey: "agent_llm_id" as const, configType: "llm" as const, }, @@ -57,8 +61,8 @@ const ROLE_DESCRIPTIONS = { icon: FileText, title: "Document Summary LLM", description: "Handles document summarization and research 
synthesis", - color: "text-purple-600 dark:text-purple-400", - bgColor: "bg-purple-500/10", + color: "text-muted-foreground", + bgColor: "bg-muted", prefKey: "document_summary_llm_id" as const, configType: "llm" as const, }, @@ -66,11 +70,20 @@ const ROLE_DESCRIPTIONS = { icon: ImageIcon, title: "Image Generation Model", description: "Model used for AI image generation (DALL-E, GPT Image, etc.)", - color: "text-teal-600 dark:text-teal-400", - bgColor: "bg-teal-500/10", + color: "text-muted-foreground", + bgColor: "bg-muted", prefKey: "image_generation_config_id" as const, configType: "image" as const, }, + vision: { + icon: Eye, + title: "Vision LLM", + description: "Vision-capable model for screenshot analysis and context extraction", + color: "text-amber-600 dark:text-amber-400", + bgColor: "bg-amber-500/10", + prefKey: "vision_llm_config_id" as const, + configType: "vision" as const, + }, }; interface LLMRoleManagerProps { @@ -103,6 +116,18 @@ export function LLMRoleManager({ searchSpaceId }: LLMRoleManagerProps) { error: globalImageConfigsError, } = useAtomValue(globalImageGenConfigsAtom); + // Vision LLM configs + const { + data: userVisionConfigs = [], + isFetching: visionConfigsLoading, + error: visionConfigsError, + } = useAtomValue(visionLLMConfigsAtom); + const { + data: globalVisionConfigs = [], + isFetching: globalVisionConfigsLoading, + error: globalVisionConfigsError, + } = useAtomValue(globalVisionLLMConfigsAtom); + // Preferences const { data: preferences = {}, @@ -116,90 +141,49 @@ export function LLMRoleManager({ searchSpaceId }: LLMRoleManagerProps) { agent_llm_id: preferences.agent_llm_id ?? "", document_summary_llm_id: preferences.document_summary_llm_id ?? "", image_generation_config_id: preferences.image_generation_config_id ?? "", + vision_llm_config_id: preferences.vision_llm_config_id ?? 
"", })); - const [hasChanges, setHasChanges] = useState(false); - const [isSaving, setIsSaving] = useState(false); + const [savingRole, setSavingRole] = useState(null); + const savingRef = useRef(false); useEffect(() => { - const newAssignments = { - agent_llm_id: preferences.agent_llm_id ?? "", - document_summary_llm_id: preferences.document_summary_llm_id ?? "", - image_generation_config_id: preferences.image_generation_config_id ?? "", - }; - setAssignments(newAssignments); - setHasChanges(false); + if (!savingRef.current) { + setAssignments({ + agent_llm_id: preferences.agent_llm_id ?? "", + document_summary_llm_id: preferences.document_summary_llm_id ?? "", + image_generation_config_id: preferences.image_generation_config_id ?? "", + vision_llm_config_id: preferences.vision_llm_config_id ?? "", + }); + } }, [ preferences?.agent_llm_id, preferences?.document_summary_llm_id, preferences?.image_generation_config_id, + preferences?.vision_llm_config_id, ]); - const handleRoleAssignment = (prefKey: string, configId: string) => { - const newAssignments = { - ...assignments, - [prefKey]: configId === "unassigned" ? "" : parseInt(configId), - }; + const handleRoleAssignment = useCallback( + async (prefKey: string, configId: string) => { + const value = configId === "unassigned" ? "" : parseInt(configId); - setAssignments(newAssignments); + setAssignments((prev) => ({ ...prev, [prefKey]: value })); + setSavingRole(prefKey); + savingRef.current = true; - const currentPrefs = { - agent_llm_id: preferences.agent_llm_id ?? "", - document_summary_llm_id: preferences.document_summary_llm_id ?? "", - image_generation_config_id: preferences.image_generation_config_id ?? 
"", - }; - - const hasChangesNow = Object.keys(newAssignments).some( - (key) => - newAssignments[key as keyof typeof newAssignments] !== - currentPrefs[key as keyof typeof currentPrefs] - ); - - setHasChanges(hasChangesNow); - }; - - const handleSave = async () => { - setIsSaving(true); - - const toNumericOrUndefined = (val: string | number) => - typeof val === "string" ? (val ? parseInt(val) : undefined) : val; - - const numericAssignments = { - agent_llm_id: toNumericOrUndefined(assignments.agent_llm_id), - document_summary_llm_id: toNumericOrUndefined(assignments.document_summary_llm_id), - image_generation_config_id: toNumericOrUndefined(assignments.image_generation_config_id), - }; - - await updatePreferences({ - search_space_id: searchSpaceId, - data: numericAssignments, - }); - - setHasChanges(false); - toast.success("Role assignments saved successfully!"); - - setIsSaving(false); - }; - - const handleReset = () => { - setAssignments({ - agent_llm_id: preferences.agent_llm_id ?? "", - document_summary_llm_id: preferences.document_summary_llm_id ?? "", - image_generation_config_id: preferences.image_generation_config_id ?? 
"", - }); - setHasChanges(false); - }; - - const isAssignmentComplete = - assignments.agent_llm_id !== "" && - assignments.agent_llm_id !== null && - assignments.agent_llm_id !== undefined && - assignments.document_summary_llm_id !== "" && - assignments.document_summary_llm_id !== null && - assignments.document_summary_llm_id !== undefined && - assignments.image_generation_config_id !== "" && - assignments.image_generation_config_id !== null && - assignments.image_generation_config_id !== undefined; + try { + await updatePreferences({ + search_space_id: searchSpaceId, + data: { [prefKey]: value || undefined }, + }); + toast.success("Role assignment updated"); + } finally { + setSavingRole(null); + savingRef.current = false; + } + }, + [updatePreferences, searchSpaceId] + ); // Combine global and custom LLM configs const allLLMConfigs = [ @@ -213,18 +197,35 @@ export function LLMRoleManager({ searchSpaceId }: LLMRoleManagerProps) { ...(userImageConfigs ?? []).filter((config) => config.id && config.id.toString().trim() !== ""), ]; + // Combine global and custom vision LLM configs + const allVisionConfigs = [ + ...globalVisionConfigs.map((config) => ({ ...config, is_global: true })), + ...(userVisionConfigs ?? 
[]).filter( + (config) => config.id && config.id.toString().trim() !== "" + ), + ]; + + const isAssignmentComplete = + allLLMConfigs.some((c) => c.id === assignments.agent_llm_id) && + allLLMConfigs.some((c) => c.id === assignments.document_summary_llm_id) && + allImageConfigs.some((c) => c.id === assignments.image_generation_config_id); + const isLoading = configsLoading || preferencesLoading || globalConfigsLoading || imageConfigsLoading || - globalImageConfigsLoading; + globalImageConfigsLoading || + visionConfigsLoading || + globalVisionConfigsLoading; const hasError = configsError || preferencesError || globalConfigsError || imageConfigsError || - globalImageConfigsError; + globalImageConfigsError || + visionConfigsError || + globalVisionConfigsError; const hasAnyConfigs = allLLMConfigs.length > 0 || allImageConfigs.length > 0; return ( @@ -242,11 +243,8 @@ export function LLMRoleManager({ searchSpaceId }: LLMRoleManagerProps) { Refresh {isAssignmentComplete && !isLoading && !hasError && ( - - + + All roles assigned )} @@ -321,21 +319,30 @@ export function LLMRoleManager({ searchSpaceId }: LLMRoleManagerProps) {
{Object.entries(ROLE_DESCRIPTIONS).map(([key, role]) => { const IconComponent = role.icon; - const isImageRole = role.configType === "image"; const currentAssignment = assignments[role.prefKey as keyof typeof assignments]; // Pick the right config lists based on role type - const roleGlobalConfigs = isImageRole ? globalImageConfigs : globalConfigs; - const roleUserConfigs = isImageRole - ? (userImageConfigs ?? []).filter((c) => c.id && c.id.toString().trim() !== "") - : newLLMConfigs.filter((c) => c.id && c.id.toString().trim() !== ""); - const roleAllConfigs = isImageRole ? allImageConfigs : allLLMConfigs; + const roleGlobalConfigs = + role.configType === "image" + ? globalImageConfigs + : role.configType === "vision" + ? globalVisionConfigs + : globalConfigs; + const roleUserConfigs = + role.configType === "image" + ? (userImageConfigs ?? []).filter((c) => c.id && c.id.toString().trim() !== "") + : role.configType === "vision" + ? (userVisionConfigs ?? []).filter((c) => c.id && c.id.toString().trim() !== "") + : newLLMConfigs.filter((c) => c.id && c.id.toString().trim() !== ""); + const roleAllConfigs = + role.configType === "image" + ? allImageConfigs + : role.configType === "vision" + ? allVisionConfigs + : allLLMConfigs; const assignedConfig = roleAllConfigs.find((config) => config.id === currentAssignment); - const isAssigned = - currentAssignment !== "" && - currentAssignment !== null && - currentAssignment !== undefined; + const isAssigned = !!assignedConfig; const isAutoMode = assignedConfig && "is_auto_mode" in assignedConfig && assignedConfig.is_auto_mode; @@ -361,8 +368,10 @@ export function LLMRoleManager({ searchSpaceId }: LLMRoleManagerProps) {

- {isAssigned ? ( - + {savingRole === role.prefKey ? ( + + ) : isAssigned ? ( + ) : ( )} @@ -374,7 +383,7 @@ export function LLMRoleManager({ searchSpaceId }: LLMRoleManagerProps) { Configuration setFormData((p) => ({ ...p, name: e.target.value }))} + /> +
+ +
+ + setFormData((p) => ({ ...p, description: e.target.value }))} + /> +
+ + + +
+ + +
+ +
+ + + + + + + + + setFormData((p) => ({ ...p, model_name: val })) + } + /> + + +
+ {formData.model_name + ? `Using: "${formData.model_name}"` + : "Type your model name"} +
+
+ {availableModels.length > 0 && ( + + {availableModels + .filter( + (model) => + !formData.model_name || + model.value + .toLowerCase() + .includes(formData.model_name.toLowerCase()) || + model.label + .toLowerCase() + .includes(formData.model_name.toLowerCase()) + ) + .slice(0, 50) + .map((model) => ( + { + setFormData((p) => ({ + ...p, + model_name: value, + })); + setModelComboboxOpen(false); + }} + className="py-2" + > + +
+
{model.label}
+ {model.contextWindow && ( +
+ Context: {model.contextWindow} +
+ )} +
+
+ ))} +
+ )} +
+
+
+
+
+ +
+ + setFormData((p) => ({ ...p, api_key: e.target.value }))} + /> +
+ +
+ + setFormData((p) => ({ ...p, api_base: e.target.value }))} + /> +
+ + {formData.provider === "AZURE_OPENAI" && ( +
+ + setFormData((p) => ({ ...p, api_version: e.target.value }))} + /> +
+ )} +
+ )} +
+ +
+ + {mode === "create" || (mode === "edit" && !isGlobal) ? ( + + ) : isGlobal && config ? ( + + ) : null} +
+ + + ); +} diff --git a/surfsense_web/components/sources/DocumentUploadTab.tsx b/surfsense_web/components/sources/DocumentUploadTab.tsx index 6817b19db..76af48c45 100644 --- a/surfsense_web/components/sources/DocumentUploadTab.tsx +++ b/surfsense_web/components/sources/DocumentUploadTab.tsx @@ -1,33 +1,42 @@ "use client"; import { useAtom } from "jotai"; -import { CheckCircle2, FileType, Info, Upload, X } from "lucide-react"; +import { ChevronDown, Dot, File as FileIcon, FolderOpen, Upload, X } from "lucide-react"; import { useTranslations } from "next-intl"; -import { useCallback, useMemo, useRef, useState } from "react"; +import { type ChangeEvent, useCallback, useEffect, useMemo, useRef, useState } from "react"; import { useDropzone } from "react-dropzone"; import { toast } from "sonner"; import { uploadDocumentMutationAtom } from "@/atoms/documents/document-mutation.atoms"; -import { SummaryConfig } from "@/components/assistant-ui/connector-popup/components/summary-config"; import { Accordion, AccordionContent, AccordionItem, AccordionTrigger, } from "@/components/ui/accordion"; -import { Alert, AlertDescription } from "@/components/ui/alert"; import { Badge } from "@/components/ui/badge"; import { Button } from "@/components/ui/button"; -import { Card, CardContent, CardDescription, CardHeader, CardTitle } from "@/components/ui/card"; +import { + DropdownMenu, + DropdownMenuContent, + DropdownMenuItem, + DropdownMenuTrigger, +} from "@/components/ui/dropdown-menu"; import { Progress } from "@/components/ui/progress"; -import { Separator } from "@/components/ui/separator"; import { Spinner } from "@/components/ui/spinner"; +import { Switch } from "@/components/ui/switch"; +import { useElectronAPI } from "@/hooks/use-platform"; +import { documentsApiService } from "@/lib/apis/documents-api.service"; import { trackDocumentUploadFailure, trackDocumentUploadStarted, trackDocumentUploadSuccess, } from "@/lib/posthog/events"; -import { GridPattern } from 
"./GridPattern"; + +interface SelectedFolder { + path: string; + name: string; +} interface DocumentUploadTabProps { searchSpaceId: string; @@ -51,6 +60,7 @@ const commonTypes = { "application/vnd.openxmlformats-officedocument.presentationml.presentation": [".pptx"], "text/html": [".html", ".htm"], "text/csv": [".csv"], + "text/tab-separated-values": [".tsv"], "image/jpeg": [".jpg", ".jpeg"], "image/png": [".png"], "image/bmp": [".bmp"], @@ -76,8 +86,6 @@ const FILE_TYPE_CONFIG: Record> = { "application/rtf": [".rtf"], "application/xml": [".xml"], "application/epub+zip": [".epub"], - "text/tab-separated-values": [".tsv"], - "text/html": [".html", ".htm", ".web"], "image/gif": [".gif"], "image/svg+xml": [".svg"], ...audioFileTypes, @@ -102,7 +110,6 @@ const FILE_TYPE_CONFIG: Record> = { "application/vnd.ms-powerpoint": [".ppt"], "text/x-rst": [".rst"], "application/rtf": [".rtf"], - "text/tab-separated-values": [".tsv"], "application/vnd.ms-excel": [".xls"], "application/xml": [".xml"], ...audioFileTypes, @@ -114,12 +121,11 @@ interface FileWithId { file: File; } -const cardClass = "border border-border bg-slate-400/5 dark:bg-white/5"; +const MAX_FILE_SIZE_MB = 500; +const MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024; -// Upload limits — files are sent in batches of 5 to avoid proxy timeouts -const MAX_FILES = 50; -const MAX_TOTAL_SIZE_MB = 200; -const MAX_TOTAL_SIZE_BYTES = MAX_TOTAL_SIZE_MB * 1024 * 1024; +const toggleRowClass = + "flex items-center justify-between rounded-lg bg-slate-400/5 dark:bg-white/5 p-3"; export function DocumentUploadTab({ searchSpaceId, @@ -134,6 +140,22 @@ export function DocumentUploadTab({ const [uploadDocumentMutation] = useAtom(uploadDocumentMutationAtom); const { mutate: uploadDocuments, isPending: isUploading } = uploadDocumentMutation; const fileInputRef = useRef(null); + const folderInputRef = useRef(null); + const progressIntervalRef = useRef | null>(null); + + useEffect(() => { + return () => { + if 
(progressIntervalRef.current) { + clearInterval(progressIntervalRef.current); + } + }; + }, []); + + const electronAPI = useElectronAPI(); + const [selectedFolder, setSelectedFolder] = useState(null); + const [watchFolder, setWatchFolder] = useState(true); + const [folderSubmitting, setFolderSubmitting] = useState(false); + const isElectron = !!electronAPI?.browseFiles; const acceptedFileTypes = useMemo(() => { const etlService = process.env.NEXT_PUBLIC_ETL_SERVICE; @@ -145,49 +167,104 @@ export function DocumentUploadTab({ [acceptedFileTypes] ); - const onDrop = useCallback( - (acceptedFiles: File[]) => { + const supportedExtensionsSet = useMemo( + () => new Set(supportedExtensions.map((ext) => ext.toLowerCase())), + [supportedExtensions] + ); + + const addFiles = useCallback( + (incoming: File[]) => { + const oversized = incoming.filter((f) => f.size > MAX_FILE_SIZE_BYTES); + if (oversized.length > 0) { + toast.error(t("file_too_large"), { + description: t("file_too_large_desc", { + name: oversized[0].name, + maxMB: MAX_FILE_SIZE_MB, + }), + }); + } + const valid = incoming.filter((f) => f.size <= MAX_FILE_SIZE_BYTES); + if (valid.length === 0) return; + setFiles((prev) => { - const newEntries = acceptedFiles.map((f) => ({ + const newEntries = valid.map((f) => ({ id: crypto.randomUUID?.() ?? 
`file-${Date.now()}-${Math.random().toString(36)}`, file: f, })); - const newFiles = [...prev, ...newEntries]; - - if (newFiles.length > MAX_FILES) { - toast.error(t("max_files_exceeded"), { - description: t("max_files_exceeded_desc", { max: MAX_FILES }), - }); - return prev; - } - - const newTotalSize = newFiles.reduce((sum, entry) => sum + entry.file.size, 0); - if (newTotalSize > MAX_TOTAL_SIZE_BYTES) { - toast.error(t("max_size_exceeded"), { - description: t("max_size_exceeded_desc", { max: MAX_TOTAL_SIZE_MB }), - }); - return prev; - } - - return newFiles; + return [...prev, ...newEntries]; }); }, [t] ); + const onDrop = useCallback( + (acceptedFiles: File[]) => { + setSelectedFolder(null); + addFiles(acceptedFiles); + }, + [addFiles] + ); + const { getRootProps, getInputProps, isDragActive } = useDropzone({ onDrop, accept: acceptedFileTypes, - maxSize: 50 * 1024 * 1024, // 50MB per file - noClick: false, - disabled: files.length >= MAX_FILES, + maxSize: MAX_FILE_SIZE_BYTES, + noClick: isElectron, }); - // Handle file input click to prevent event bubbling that might reopen dialog const handleFileInputClick = useCallback((e: React.MouseEvent) => { e.stopPropagation(); }, []); + const handleBrowseFiles = useCallback(async () => { + if (!electronAPI?.browseFiles) return; + + const paths = await electronAPI.browseFiles(); + if (!paths || paths.length === 0) return; + + setSelectedFolder(null); + const fileDataList = await electronAPI.readLocalFiles(paths); + const newFiles: FileWithId[] = fileDataList.map((fd) => ({ + id: crypto.randomUUID?.() ?? 
`file-${Date.now()}-${Math.random().toString(36)}`, + file: new File([fd.data], fd.name, { type: fd.mimeType }), + })); + setFiles((prev) => [...prev, ...newFiles]); + }, [electronAPI]); + + const handleBrowseFolder = useCallback(async () => { + if (!electronAPI?.selectFolder) return; + + const folderPath = await electronAPI.selectFolder(); + if (!folderPath) return; + + const folderName = folderPath.split("/").pop() || folderPath.split("\\").pop() || folderPath; + setFiles([]); + setSelectedFolder({ path: folderPath, name: folderName }); + setWatchFolder(true); + }, [electronAPI]); + + const handleFolderChange = useCallback( + (e: ChangeEvent) => { + const fileList = e.target.files; + if (!fileList || fileList.length === 0) return; + + const folderFiles = Array.from(fileList).filter((f) => { + const ext = f.name.includes(".") ? `.${f.name.split(".").pop()?.toLowerCase()}` : ""; + return ext !== "" && supportedExtensionsSet.has(ext); + }); + + if (folderFiles.length === 0) { + toast.error(t("no_supported_files_in_folder")); + e.target.value = ""; + return; + } + + addFiles(folderFiles); + e.target.value = ""; + }, + [addFiles, supportedExtensionsSet, t] + ); + const formatFileSize = (bytes: number) => { if (bytes === 0) return "0 Bytes"; const k = 1024; @@ -198,16 +275,8 @@ export function DocumentUploadTab({ const totalFileSize = files.reduce((total, entry) => total + entry.file.size, 0); - // Check if limits are reached - const isFileCountLimitReached = files.length >= MAX_FILES; - const isSizeLimitReached = totalFileSize >= MAX_TOTAL_SIZE_BYTES; - const remainingFiles = MAX_FILES - files.length; - const remainingSizeMB = Math.max( - 0, - (MAX_TOTAL_SIZE_BYTES - totalFileSize) / (1024 * 1024) - ).toFixed(1); + const hasContent = files.length > 0 || selectedFolder !== null; - // Track accordion state changes const handleAccordionChange = useCallback( (value: string) => { setAccordionValue(value); @@ -216,11 +285,57 @@ export function DocumentUploadTab({ 
[onAccordionStateChange] ); + const handleFolderSubmit = useCallback(async () => { + if (!selectedFolder || !electronAPI) return; + + setFolderSubmitting(true); + try { + const numericSpaceId = Number(searchSpaceId); + const result = await documentsApiService.folderIndex(numericSpaceId, { + folder_path: selectedFolder.path, + folder_name: selectedFolder.name, + search_space_id: numericSpaceId, + enable_summary: shouldSummarize, + }); + + const rootFolderId = (result as { root_folder_id?: number })?.root_folder_id ?? null; + + if (watchFolder) { + await electronAPI.addWatchedFolder({ + path: selectedFolder.path, + name: selectedFolder.name, + excludePatterns: [ + ".git", + "node_modules", + "__pycache__", + ".DS_Store", + ".obsidian", + ".trash", + ], + fileExtensions: null, + rootFolderId, + searchSpaceId: Number(searchSpaceId), + active: true, + }); + toast.success(`Watching folder: ${selectedFolder.name}`); + } else { + toast.success(`Syncing folder: ${selectedFolder.name}`); + } + + setSelectedFolder(null); + onSuccess?.(); + } catch (err) { + toast.error((err as Error)?.message || "Failed to process folder"); + } finally { + setFolderSubmitting(false); + } + }, [selectedFolder, watchFolder, searchSpaceId, shouldSummarize, onSuccess, electronAPI]); + const handleUpload = async () => { setUploadProgress(0); trackDocumentUploadStarted(Number(searchSpaceId), files.length, totalFileSize); - const progressInterval = setInterval(() => { + progressIntervalRef.current = setInterval(() => { setUploadProgress((prev) => (prev >= 90 ? 
prev : prev + Math.random() * 10)); }, 200); @@ -233,14 +348,14 @@ export function DocumentUploadTab({ }, { onSuccess: () => { - clearInterval(progressInterval); + if (progressIntervalRef.current) clearInterval(progressIntervalRef.current); setUploadProgress(100); trackDocumentUploadSuccess(Number(searchSpaceId), files.length); toast(t("upload_initiated"), { description: t("upload_initiated_desc") }); onSuccess?.(); }, onError: (error: unknown) => { - clearInterval(progressInterval); + if (progressIntervalRef.current) clearInterval(progressIntervalRef.current); setUploadProgress(0); const message = error instanceof Error ? error.message : "Upload failed"; trackDocumentUploadFailure(Number(searchSpaceId), message); @@ -252,207 +367,333 @@ export function DocumentUploadTab({ ); }; - return ( -
- - - - {t("file_size_limit")}{" "} - {t("upload_limits", { maxFiles: MAX_FILES, maxSizeMB: MAX_TOTAL_SIZE_MB })} - - + const renderBrowseButton = (options?: { compact?: boolean; fullWidth?: boolean }) => { + const { compact, fullWidth } = options ?? {}; + const sizeClass = compact ? "h-7" : "h-8"; + const widthClass = fullWidth ? "w-full" : ""; - -
- -
- -
+ e.stopPropagation()}> + + + e.stopPropagation()} > - - {isFileCountLimitReached ? ( -
- -
-

- {t("file_limit_reached")} -

-

- {t("file_limit_reached_desc", { max: MAX_FILES })} -

-
-
- ) : isDragActive ? ( -
- -

{t("drop_files")}

-
- ) : ( -
- -
-

{t("drag_drop")}

-

{t("or_browse")}

-
- {files.length > 0 && ( -

- {t("remaining_capacity", { files: remainingFiles, sizeMB: remainingSizeMB })} -

- )} -
- )} - {!isFileCountLimitReached && ( -
- -
- )} + + + Files + + + + Folder + +
+ + ); + } + + return ( + + e.stopPropagation()}> + + + e.stopPropagation()} + > + fileInputRef.current?.click()}> + + {t("browse_files")} + + folderInputRef.current?.click()}> + + {t("browse_folder")} + + + + ); + }; + + return ( +
+ {/* Hidden file input */} + + + {/* Hidden folder input for web folder browsing */} + )} + /> + + {/* MOBILE DROP ZONE */} +
+ {hasContent ? ( + !selectedFolder && + (isElectron ? ( +
{renderBrowseButton({ compact: true, fullWidth: true })}
+ ) : ( + + )) + ) : ( + + )} +
+ + {/* DESKTOP DROP ZONE */} +
+ {hasContent ? ( +
+ + + {isDragActive ? t("drop_files") : t("drag_drop_more")} + + {renderBrowseButton({ compact: true })}
- - - - {files.length > 0 && ( - - -
-
- - {t("selected_files", { count: files.length })} - - - {t("total_size")}: {formatFileSize(totalFileSize)} - -
- -
-
- -
- {files.map((entry) => ( -
-
- -
-

{entry.file.name}

-
- - {formatFileSize(entry.file.size)} - - - {entry.file.type || "Unknown type"} - -
-
-
- -
- ))} -
- - {isUploading && ( -
- -
-
- {t("uploading_files")} - {Math.round(uploadProgress)}% -
- -
+ ) : ( +
+ {isDragActive && ( +
+ +

{t("drop_files")}

)} - -
- +
+ +

{t("drag_drop")}

+

{t("file_size_limit")}

+
{renderBrowseButton()}
+
+ )} +
-
- + {/* FOLDER SELECTED (Electron only — web flattens folder contents into file list) */} + {isElectron && selectedFolder && ( +
+
+ +
+

{selectedFolder.name}

+

{selectedFolder.path}

- - + +
+ +
+
+
+

Watch folder

+

Auto-sync when files change

+
+ +
+
+
+

Enable AI Summary

+

+ Improves search quality but adds latency +

+
+ +
+
+ + +
)} + {/* FILES SELECTED */} + {files.length > 0 && ( +
+
+

+ {t("selected_files", { count: files.length })} + + {formatFileSize(totalFileSize)} +

+ +
+ +
+ {files.map((entry) => ( +
+ + {entry.file.name.split(".").pop() || "?"} + + {entry.file.name} + + {formatFileSize(entry.file.size)} + + +
+ ))} +
+ + {isUploading && ( +
+
+ {t("uploading_files")} + {Math.round(uploadProgress)}% +
+ +
+ )} + +
+
+

Enable AI Summary

+

+ Improves search quality but adds latency +

+
+ +
+ + +
+ )} + + {/* SUPPORTED FORMATS */} - - -
-
-
- {t("supported_file_types")} -
-
- {t("file_types_desc")} -
-
-
+ + + + {t("supported_file_types")} + - -
+ +
{supportedExtensions.map((ext) => ( - + {ext} ))} diff --git a/surfsense_web/components/tool-ui/citation/citation-list.tsx b/surfsense_web/components/tool-ui/citation/citation-list.tsx index 3151917b6..bbe869a09 100644 --- a/surfsense_web/components/tool-ui/citation/citation-list.tsx +++ b/surfsense_web/components/tool-ui/citation/citation-list.tsx @@ -2,6 +2,7 @@ import type { LucideIcon } from "lucide-react"; import { Code2, Database, ExternalLink, File, FileText, Globe, Newspaper } from "lucide-react"; +import NextImage from "next/image"; import * as React from "react"; import { openSafeNavigationHref, resolveSafeNavigationHref } from "../shared/media"; import { cn, Popover, PopoverContent, PopoverTrigger } from "./_adapter"; @@ -253,17 +254,17 @@ function OverflowItem({ citation, onClick }: OverflowItemProps) { className="group hover:bg-muted focus-visible:bg-muted flex w-full cursor-pointer items-center gap-2.5 rounded-md px-2 py-2 text-left transition-colors focus-visible:outline-none" > {citation.favicon ? ( - // biome-ignore lint/performance/noImgElement: external favicon from arbitrary domain — next/image requires remotePatterns config - ) : ( -