Merge commit '056fc0e7ff' into dev_mod

This commit is contained in:
DESKTOP-RTLN3BA\$punk 2026-04-07 02:56:46 -07:00
commit 82b5c7f19e
111 changed files with 4056 additions and 2219 deletions

View file

@ -225,6 +225,55 @@ class DropboxClient:
return all_items, None
async def get_latest_cursor(self, path: str = "") -> tuple[str | None, str | None]:
"""Get a cursor representing the current state of a folder.
Uses /2/files/list_folder/get_latest_cursor so we can later call
get_changes to receive only incremental updates.
"""
resp = await self._request(
"/2/files/list_folder/get_latest_cursor",
{"path": path, "recursive": False, "include_non_downloadable_files": True},
)
if resp.status_code != 200:
return None, f"Failed to get cursor: {resp.status_code} - {resp.text}"
return resp.json().get("cursor"), None
async def get_changes(
self, cursor: str
) -> tuple[list[dict[str, Any]], str | None, str | None]:
"""Fetch incremental changes since the given cursor.
Calls /2/files/list_folder/continue and handles pagination.
Returns (entries, new_cursor, error).
"""
all_entries: list[dict[str, Any]] = []
resp = await self._request("/2/files/list_folder/continue", {"cursor": cursor})
if resp.status_code == 401:
return [], None, "Dropbox authentication expired (401)"
if resp.status_code != 200:
return [], None, f"Failed to get changes: {resp.status_code} - {resp.text}"
data = resp.json()
all_entries.extend(data.get("entries", []))
while data.get("has_more"):
cursor = data["cursor"]
resp = await self._request(
"/2/files/list_folder/continue", {"cursor": cursor}
)
if resp.status_code != 200:
return (
all_entries,
data.get("cursor"),
f"Pagination failed: {resp.status_code}",
)
data = resp.json()
all_entries.extend(data.get("entries", []))
return all_entries, data.get("cursor"), None
async def get_metadata(self, path: str) -> tuple[dict[str, Any] | None, str | None]:
resp = await self._request("/2/files/get_metadata", {"path": path})
if resp.status_code != 200:

View file

@ -53,7 +53,8 @@ async def download_and_extract_content(
file_name = file.get("name", "Unknown")
file_id = file.get("id", "")
if should_skip_file(file):
skip, _unsup_ext = should_skip_file(file)
if skip:
return None, {}, "Skipping non-indexable item"
logger.info(f"Downloading file for content extraction: {file_name}")
@ -87,9 +88,13 @@ async def download_and_extract_content(
if error:
return None, metadata, error
from app.connectors.onedrive.content_extractor import _parse_file_to_markdown
from app.etl_pipeline.etl_document import EtlRequest
from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
markdown = await _parse_file_to_markdown(temp_file_path, file_name)
result = await EtlPipelineService().extract(
EtlRequest(file_path=temp_file_path, filename=file_name)
)
markdown = result.markdown_content
return markdown, metadata, None
except Exception as e:

View file

@ -1,8 +1,8 @@
"""File type handlers for Dropbox."""
PAPER_EXTENSION = ".paper"
from app.etl_pipeline.file_classifier import should_skip_for_service
SKIP_EXTENSIONS: frozenset[str] = frozenset()
PAPER_EXTENSION = ".paper"
MIME_TO_EXTENSION: dict[str, str] = {
"application/pdf": ".pdf",
@ -42,17 +42,25 @@ def is_paper_file(item: dict) -> bool:
return ext == PAPER_EXTENSION
def should_skip_file(item: dict) -> bool:
def should_skip_file(item: dict) -> tuple[bool, str | None]:
"""Skip folders and truly non-indexable files.
Paper docs are non-downloadable but exportable, so they are NOT skipped.
Returns (should_skip, unsupported_extension_or_None).
"""
if is_folder(item):
return True
return True, None
if is_paper_file(item):
return False
return False, None
if not item.get("is_downloadable", True):
return True
return True, None
from pathlib import PurePosixPath
from app.config import config as app_config
name = item.get("name", "")
ext = get_extension_from_name(name).lower()
return ext in SKIP_EXTENSIONS
if should_skip_for_service(name, app_config.ETL_SERVICE):
ext = PurePosixPath(name).suffix.lower()
return True, ext
return False, None

View file

@ -64,8 +64,10 @@ async def get_files_in_folder(
)
continue
files.extend(sub_files)
elif not should_skip_file(item):
files.append(item)
else:
skip, _unsup_ext = should_skip_file(item)
if not skip:
files.append(item)
return files, None

View file

@ -1,12 +1,9 @@
"""Content extraction for Google Drive files."""
import asyncio
import contextlib
import logging
import os
import tempfile
import threading
import time
from pathlib import Path
from typing import Any
@ -20,6 +17,7 @@ from .file_types import (
get_export_mime_type,
get_extension_from_mime,
is_google_workspace_file,
should_skip_by_extension,
should_skip_file,
)
@ -45,6 +43,11 @@ async def download_and_extract_content(
if should_skip_file(mime_type):
return None, {}, f"Skipping {mime_type}"
if not is_google_workspace_file(mime_type):
ext_skip, _unsup_ext = should_skip_by_extension(file_name)
if ext_skip:
return None, {}, f"Skipping unsupported extension: {file_name}"
logger.info(f"Downloading file for content extraction: {file_name} ({mime_type})")
drive_metadata: dict[str, Any] = {
@ -97,7 +100,10 @@ async def download_and_extract_content(
if error:
return None, drive_metadata, error
markdown = await _parse_file_to_markdown(temp_file_path, file_name)
etl_filename = (
file_name + extension if is_google_workspace_file(mime_type) else file_name
)
markdown = await _parse_file_to_markdown(temp_file_path, etl_filename)
return markdown, drive_metadata, None
except Exception as e:
@ -110,99 +116,14 @@ async def download_and_extract_content(
async def _parse_file_to_markdown(file_path: str, filename: str) -> str:
"""Parse a local file to markdown using the configured ETL service."""
lower = filename.lower()
"""Parse a local file to markdown using the unified ETL pipeline."""
from app.etl_pipeline.etl_document import EtlRequest
from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
if lower.endswith((".md", ".markdown", ".txt")):
with open(file_path, encoding="utf-8") as f:
return f.read()
if lower.endswith((".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm")):
from litellm import atranscription
from app.config import config as app_config
stt_service_type = (
"local"
if app_config.STT_SERVICE and app_config.STT_SERVICE.startswith("local/")
else "external"
)
if stt_service_type == "local":
from app.services.stt_service import stt_service
t0 = time.monotonic()
logger.info(
f"[local-stt] START file={filename} thread={threading.current_thread().name}"
)
result = await asyncio.to_thread(stt_service.transcribe_file, file_path)
logger.info(
f"[local-stt] END file={filename} elapsed={time.monotonic() - t0:.2f}s"
)
text = result.get("text", "")
else:
with open(file_path, "rb") as audio_file:
kwargs: dict[str, Any] = {
"model": app_config.STT_SERVICE,
"file": audio_file,
"api_key": app_config.STT_SERVICE_API_KEY,
}
if app_config.STT_SERVICE_API_BASE:
kwargs["api_base"] = app_config.STT_SERVICE_API_BASE
resp = await atranscription(**kwargs)
text = resp.get("text", "")
if not text:
raise ValueError("Transcription returned empty text")
return f"# Transcription of {filename}\n\n{text}"
# Document files -- use configured ETL service
from app.config import config as app_config
if app_config.ETL_SERVICE == "UNSTRUCTURED":
from langchain_unstructured import UnstructuredLoader
from app.utils.document_converters import convert_document_to_markdown
loader = UnstructuredLoader(
file_path,
mode="elements",
post_processors=[],
languages=["eng"],
include_orig_elements=False,
include_metadata=False,
strategy="auto",
)
docs = await loader.aload()
return await convert_document_to_markdown(docs)
if app_config.ETL_SERVICE == "LLAMACLOUD":
from app.tasks.document_processors.file_processors import (
parse_with_llamacloud_retry,
)
result = await parse_with_llamacloud_retry(
file_path=file_path, estimated_pages=50
)
markdown_documents = await result.aget_markdown_documents(split_by_page=False)
if not markdown_documents:
raise RuntimeError(f"LlamaCloud returned no documents for {filename}")
return markdown_documents[0].text
if app_config.ETL_SERVICE == "DOCLING":
from docling.document_converter import DocumentConverter
converter = DocumentConverter()
t0 = time.monotonic()
logger.info(
f"[docling] START file={filename} thread={threading.current_thread().name}"
)
result = await asyncio.to_thread(converter.convert, file_path)
logger.info(
f"[docling] END file={filename} elapsed={time.monotonic() - t0:.2f}s"
)
return result.document.export_to_markdown()
raise RuntimeError(f"Unknown ETL_SERVICE: {app_config.ETL_SERVICE}")
result = await EtlPipelineService().extract(
EtlRequest(file_path=file_path, filename=filename)
)
return result.markdown_content
async def download_and_process_file(
@ -236,10 +157,14 @@ async def download_and_process_file(
file_name = file.get("name", "Unknown")
mime_type = file.get("mimeType", "")
# Skip folders and shortcuts
if should_skip_file(mime_type):
return None, f"Skipping {mime_type}", None
if not is_google_workspace_file(mime_type):
ext_skip, _unsup_ext = should_skip_by_extension(file_name)
if ext_skip:
return None, f"Skipping unsupported extension: {file_name}", None
logger.info(f"Downloading file: {file_name} ({mime_type})")
temp_file_path = None
@ -310,10 +235,13 @@ async def download_and_process_file(
"."
)[-1]
etl_filename = (
file_name + extension if is_google_workspace_file(mime_type) else file_name
)
logger.info(f"Processing {file_name} with Surfsense's file processor")
await process_file_in_background(
file_path=temp_file_path,
filename=file_name,
filename=etl_filename,
search_space_id=search_space_id,
user_id=user_id,
session=session,

View file

@ -1,5 +1,7 @@
"""File type handlers for Google Drive."""
from app.etl_pipeline.file_classifier import should_skip_for_service
GOOGLE_DOC = "application/vnd.google-apps.document"
GOOGLE_SHEET = "application/vnd.google-apps.spreadsheet"
GOOGLE_SLIDE = "application/vnd.google-apps.presentation"
@ -46,6 +48,21 @@ def should_skip_file(mime_type: str) -> bool:
return mime_type in [GOOGLE_FOLDER, GOOGLE_SHORTCUT]
def should_skip_by_extension(filename: str) -> tuple[bool, str | None]:
    """Tell whether the configured ETL service cannot parse *filename*.

    Returns:
        ``(True, lowercased_suffix)`` when ``should_skip_for_service`` rejects
        the file for ``app_config.ETL_SERVICE``; ``(False, None)`` otherwise.
    """
    from pathlib import PurePosixPath

    from app.config import config as app_config

    if not should_skip_for_service(filename, app_config.ETL_SERVICE):
        return False, None

    unsupported_ext = PurePosixPath(filename).suffix.lower()
    return True, unsupported_ext
def get_export_mime_type(mime_type: str) -> str | None:
"""Get export MIME type for Google Workspace files."""
return EXPORT_FORMATS.get(mime_type)

View file

@ -1,16 +1,9 @@
"""Content extraction for OneDrive files.
"""Content extraction for OneDrive files."""
Reuses the same ETL parsing logic as Google Drive since file parsing is
extension-based, not provider-specific.
"""
import asyncio
import contextlib
import logging
import os
import tempfile
import threading
import time
from pathlib import Path
from typing import Any
@ -31,7 +24,8 @@ async def download_and_extract_content(
item_id = file.get("id")
file_name = file.get("name", "Unknown")
if should_skip_file(file):
skip, _unsup_ext = should_skip_file(file)
if skip:
return None, {}, "Skipping non-indexable item"
file_info = file.get("file", {})
@ -84,98 +78,11 @@ async def download_and_extract_content(
async def _parse_file_to_markdown(file_path: str, filename: str) -> str:
"""Parse a local file to markdown using the configured ETL service.
"""Parse a local file to markdown using the unified ETL pipeline."""
from app.etl_pipeline.etl_document import EtlRequest
from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
Same logic as Google Drive -- file parsing is extension-based.
"""
lower = filename.lower()
if lower.endswith((".md", ".markdown", ".txt")):
with open(file_path, encoding="utf-8") as f:
return f.read()
if lower.endswith((".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm")):
from litellm import atranscription
from app.config import config as app_config
stt_service_type = (
"local"
if app_config.STT_SERVICE and app_config.STT_SERVICE.startswith("local/")
else "external"
)
if stt_service_type == "local":
from app.services.stt_service import stt_service
t0 = time.monotonic()
logger.info(
f"[local-stt] START file={filename} thread={threading.current_thread().name}"
)
result = await asyncio.to_thread(stt_service.transcribe_file, file_path)
logger.info(
f"[local-stt] END file={filename} elapsed={time.monotonic() - t0:.2f}s"
)
text = result.get("text", "")
else:
with open(file_path, "rb") as audio_file:
kwargs: dict[str, Any] = {
"model": app_config.STT_SERVICE,
"file": audio_file,
"api_key": app_config.STT_SERVICE_API_KEY,
}
if app_config.STT_SERVICE_API_BASE:
kwargs["api_base"] = app_config.STT_SERVICE_API_BASE
resp = await atranscription(**kwargs)
text = resp.get("text", "")
if not text:
raise ValueError("Transcription returned empty text")
return f"# Transcription of {filename}\n\n{text}"
from app.config import config as app_config
if app_config.ETL_SERVICE == "UNSTRUCTURED":
from langchain_unstructured import UnstructuredLoader
from app.utils.document_converters import convert_document_to_markdown
loader = UnstructuredLoader(
file_path,
mode="elements",
post_processors=[],
languages=["eng"],
include_orig_elements=False,
include_metadata=False,
strategy="auto",
)
docs = await loader.aload()
return await convert_document_to_markdown(docs)
if app_config.ETL_SERVICE == "LLAMACLOUD":
from app.tasks.document_processors.file_processors import (
parse_with_llamacloud_retry,
)
result = await parse_with_llamacloud_retry(
file_path=file_path, estimated_pages=50
)
markdown_documents = await result.aget_markdown_documents(split_by_page=False)
if not markdown_documents:
raise RuntimeError(f"LlamaCloud returned no documents for {filename}")
return markdown_documents[0].text
if app_config.ETL_SERVICE == "DOCLING":
from docling.document_converter import DocumentConverter
converter = DocumentConverter()
t0 = time.monotonic()
logger.info(
f"[docling] START file={filename} thread={threading.current_thread().name}"
)
result = await asyncio.to_thread(converter.convert, file_path)
logger.info(
f"[docling] END file={filename} elapsed={time.monotonic() - t0:.2f}s"
)
return result.document.export_to_markdown()
raise RuntimeError(f"Unknown ETL_SERVICE: {app_config.ETL_SERVICE}")
result = await EtlPipelineService().extract(
EtlRequest(file_path=file_path, filename=filename)
)
return result.markdown_content

View file

@ -1,5 +1,7 @@
"""File type handlers for Microsoft OneDrive."""
from app.etl_pipeline.file_classifier import should_skip_for_service
ONEDRIVE_FOLDER_FACET = "folder"
ONENOTE_MIME = "application/msonenote"
@ -38,13 +40,28 @@ def is_folder(item: dict) -> bool:
return ONEDRIVE_FOLDER_FACET in item
def should_skip_file(item: dict) -> bool:
"""Skip folders, OneNote files, remote items (shared links), and packages."""
def should_skip_file(item: dict) -> tuple[bool, str | None]:
"""Skip folders, OneNote files, remote items, packages, and unsupported extensions.
Returns (should_skip, unsupported_extension_or_None).
The second element is only set when the skip is due to an unsupported extension.
"""
if is_folder(item):
return True
return True, None
if "remoteItem" in item:
return True
return True, None
if "package" in item:
return True
return True, None
mime = item.get("file", {}).get("mimeType", "")
return mime in SKIP_MIME_TYPES
if mime in SKIP_MIME_TYPES:
return True, None
from pathlib import PurePosixPath
from app.config import config as app_config
name = item.get("name", "")
if should_skip_for_service(name, app_config.ETL_SERVICE):
ext = PurePosixPath(name).suffix.lower()
return True, ext
return False, None

View file

@ -71,8 +71,10 @@ async def get_files_in_folder(
)
continue
files.extend(sub_files)
elif not should_skip_file(item):
files.append(item)
else:
skip, _unsup_ext = should_skip_file(item)
if not skip:
files.append(item)
return files, None

View file

@ -0,0 +1,39 @@
import ssl
import httpx
# --- LlamaCloud retry policy -------------------------------------------------
# Maximum number of parse attempts before giving up.
LLAMACLOUD_MAX_RETRIES = 5
# Delay before the first retry, in seconds.
LLAMACLOUD_BASE_DELAY = 10
# Upper bound for a single retry delay, in seconds.
LLAMACLOUD_MAX_DELAY = 120
# Exception types that are treated as transient and therefore retried.
# NOTE(review): in Python 3, ssl.SSLError, ConnectionError,
# ConnectionResetError and TimeoutError are all subclasses of OSError, so the
# final OSError entry already covers them. It also covers non-transient errors
# such as FileNotFoundError and PermissionError -- confirm that retrying those
# is intended.
LLAMACLOUD_RETRYABLE_EXCEPTIONS = (
ssl.SSLError,
httpx.ConnectError,
httpx.ConnectTimeout,
httpx.ReadError,
httpx.ReadTimeout,
httpx.WriteError,
httpx.WriteTimeout,
httpx.RemoteProtocolError,
httpx.LocalProtocolError,
ConnectionError,
ConnectionResetError,
TimeoutError,
OSError,
)
# --- Timeout sizing ----------------------------------------------------------
# Pessimistic upload throughput (100 KiB per second) used to estimate how long
# an upload may take.
UPLOAD_BYTES_PER_SECOND_SLOW = 100 * 1024
# Lower and upper clamps applied to the computed upload timeout.
MIN_UPLOAD_TIMEOUT = 120
MAX_UPLOAD_TIMEOUT = 1800
# Fixed part of the job timeout, added to both the page-based and the
# size-based estimate.
BASE_JOB_TIMEOUT = 600
# Extra job-timeout allowance per estimated page.
PER_PAGE_JOB_TIMEOUT = 60
def calculate_upload_timeout(file_size_bytes: int) -> float:
    """Return an upload timeout scaled to the file size.

    The estimate assumes ``UPLOAD_BYTES_PER_SECOND_SLOW`` throughput with a
    1.5x safety factor, then is clamped to the range
    ``[MIN_UPLOAD_TIMEOUT, MAX_UPLOAD_TIMEOUT]``.
    """
    seconds_at_slow_rate = file_size_bytes / UPLOAD_BYTES_PER_SECOND_SLOW
    padded_estimate = seconds_at_slow_rate * 1.5
    capped_estimate = min(padded_estimate, MAX_UPLOAD_TIMEOUT)
    return max(MIN_UPLOAD_TIMEOUT, capped_estimate)
def calculate_job_timeout(estimated_pages: int, file_size_bytes: int) -> float:
    """Return the larger of a page-based and a size-based job timeout.

    Both estimates start from ``BASE_JOB_TIMEOUT``. The page-based one adds
    ``PER_PAGE_JOB_TIMEOUT`` per estimated page; the size-based one adds 60
    per 10 MiB of file size.
    """
    ten_mib = 10 * 1024 * 1024
    by_pages = BASE_JOB_TIMEOUT + (estimated_pages * PER_PAGE_JOB_TIMEOUT)
    by_size = BASE_JOB_TIMEOUT + (file_size_bytes / ten_mib) * 60
    return max(by_pages, by_size)

View file

@ -0,0 +1,21 @@
from pydantic import BaseModel, field_validator
class EtlRequest(BaseModel):
    """Input for one extraction run: the file to parse and the name it goes by."""

    # Location of the file to be parsed.
    file_path: str
    # File name; validated below so it is never empty or whitespace-only.
    filename: str
    # Estimated page count; defaults to 0 when not provided.
    estimated_pages: int = 0

    @field_validator("filename")
    @classmethod
    def filename_must_not_be_empty(cls, v: str) -> str:
        """Return *v* unchanged, raising ``ValueError`` if it is blank."""
        if v.strip():
            return v
        raise ValueError("filename must not be empty")
class EtlResult(BaseModel):
    """Outcome of one extraction run."""

    # Extracted content.
    markdown_content: str
    # Label of the service or pipeline branch that produced the content.
    etl_service: str
    # Page count reported for the run; defaults to 0.
    actual_pages: int = 0
    # Category label of the processed content.
    content_type: str

View file

@ -0,0 +1,90 @@
from app.config import config as app_config
from app.etl_pipeline.etl_document import EtlRequest, EtlResult
from app.etl_pipeline.exceptions import (
EtlServiceUnavailableError,
EtlUnsupportedFileError,
)
from app.etl_pipeline.file_classifier import FileCategory, classify_file
from app.etl_pipeline.parsers.audio import transcribe_audio
from app.etl_pipeline.parsers.direct_convert import convert_file_directly
from app.etl_pipeline.parsers.plaintext import read_plaintext
class EtlPipelineService:
    """Single entry point for turning a file into markdown. All callers use this."""

    async def extract(self, request: EtlRequest) -> EtlResult:
        """Classify ``request.filename`` and run the matching extraction path.

        Raises:
            EtlUnsupportedFileError: If the file is classified as unsupported.
        """
        kind = classify_file(request.filename)

        if kind == FileCategory.UNSUPPORTED:
            raise EtlUnsupportedFileError(
                f"File type not supported for parsing: {request.filename}"
            )

        if kind == FileCategory.PLAINTEXT:
            label, content_type = "PLAINTEXT", "plaintext"
            markdown = read_plaintext(request.file_path)
        elif kind == FileCategory.DIRECT_CONVERT:
            label, content_type = "DIRECT_CONVERT", "direct_convert"
            markdown = convert_file_directly(request.file_path, request.filename)
        elif kind == FileCategory.AUDIO:
            label, content_type = "AUDIO", "audio"
            markdown = await transcribe_audio(request.file_path, request.filename)
        else:
            # Everything else is a document handled by the configured service.
            return await self._extract_document(request)

        return EtlResult(
            markdown_content=markdown,
            etl_service=label,
            content_type=content_type,
        )

    async def _extract_document(self, request: EtlRequest) -> EtlResult:
        """Parse a document with the service named by ``app_config.ETL_SERVICE``.

        Raises:
            EtlServiceUnavailableError: If ETL_SERVICE is unset or not one of
                DOCLING, UNSTRUCTURED or LLAMACLOUD.
            EtlUnsupportedFileError: If the configured service does not list
                the file's extension as supported.
        """
        from pathlib import PurePosixPath

        from app.utils.file_extensions import get_document_extensions_for_service

        service_name = app_config.ETL_SERVICE
        if not service_name:
            raise EtlServiceUnavailableError(
                "No ETL_SERVICE configured. "
                "Set ETL_SERVICE to UNSTRUCTURED, LLAMACLOUD, or DOCLING in your .env"
            )

        extension = PurePosixPath(request.filename).suffix.lower()
        if extension not in get_document_extensions_for_service(service_name):
            raise EtlUnsupportedFileError(
                f"File type {extension} is not supported by {service_name}"
            )

        # Parser modules are imported lazily, only for the selected service.
        if service_name == "LLAMACLOUD":
            from app.etl_pipeline.parsers.llamacloud import parse_with_llamacloud

            markdown = await parse_with_llamacloud(
                request.file_path, request.estimated_pages
            )
        elif service_name == "UNSTRUCTURED":
            from app.etl_pipeline.parsers.unstructured import parse_with_unstructured

            markdown = await parse_with_unstructured(request.file_path)
        elif service_name == "DOCLING":
            from app.etl_pipeline.parsers.docling import parse_with_docling

            markdown = await parse_with_docling(request.file_path, request.filename)
        else:
            raise EtlServiceUnavailableError(f"Unknown ETL_SERVICE: {service_name}")

        return EtlResult(
            markdown_content=markdown,
            etl_service=service_name,
            content_type="document",
        )

View file

@ -0,0 +1,10 @@
class EtlParseError(Exception):
    """Signals that an ETL parser could not produce any content."""
class EtlServiceUnavailableError(Exception):
    """Signals that ETL_SERVICE is either not configured or not a recognised value."""
class EtlUnsupportedFileError(Exception):
    """Signals that no ETL pipeline can parse the given file type."""

View file

@ -0,0 +1,137 @@
from enum import Enum
from pathlib import PurePosixPath
from app.utils.file_extensions import (
DOCUMENT_EXTENSIONS,
get_document_extensions_for_service,
)
# Suffixes classified as plain text, grouped by kind for readability.
# fmt: off
PLAINTEXT_EXTENSIONS = frozenset(
    (
        # Prose, notes and logs
        ".md", ".markdown", ".txt", ".text", ".rst", ".tex", ".bib", ".org",
        ".adoc", ".asciidoc", ".log",
        # Data and configuration
        ".json", ".jsonl", ".yaml", ".yml", ".toml", ".ini", ".cfg", ".conf",
        ".xml", ".env", ".gitignore", ".dockerignore", ".editorconfig",
        # Stylesheets
        ".css", ".scss", ".less", ".sass",
        # Python
        ".py", ".pyw", ".pyi", ".pyx",
        # JavaScript / TypeScript and web components
        ".js", ".jsx", ".ts", ".tsx", ".mjs", ".cjs", ".vue", ".svelte", ".astro",
        # JVM languages
        ".java", ".kt", ".kts", ".scala", ".groovy",
        # C family
        ".c", ".h", ".cpp", ".cxx", ".cc", ".hpp", ".hxx", ".m", ".mm",
        # .NET
        ".cs", ".fs", ".fsx",
        # Other languages
        ".go", ".rs", ".rb", ".php", ".pl", ".pm", ".lua", ".swift", ".r", ".jl",
        # Shell and batch scripts
        ".sh", ".bash", ".zsh", ".fish", ".bat", ".cmd", ".ps1",
        # Query and schema languages
        ".sql", ".graphql", ".gql", ".proto",
        # Build and infrastructure
        ".makefile", ".cmake", ".tf", ".hcl",
    )
)
# fmt: on

# Suffixes classified as audio.
AUDIO_EXTENSIONS = frozenset(
    (".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm")
)

# Suffixes classified as direct-convert.
DIRECT_CONVERT_EXTENSIONS = frozenset((".csv", ".tsv", ".html", ".htm", ".xhtml"))
class FileCategory(Enum):
    """Processing category assigned to a file from its extension."""

    # Matched against PLAINTEXT_EXTENSIONS.
    PLAINTEXT = "plaintext"
    # Matched against AUDIO_EXTENSIONS.
    AUDIO = "audio"
    # Matched against DIRECT_CONVERT_EXTENSIONS.
    DIRECT_CONVERT = "direct_convert"
    # No known extension set contains the suffix.
    UNSUPPORTED = "unsupported"
    # Matched against DOCUMENT_EXTENSIONS.
    DOCUMENT = "document"
def classify_file(filename: str) -> FileCategory:
    """Map *filename* to a ``FileCategory`` based on its lowercased extension.

    Extension sets are checked in this order: plaintext, audio,
    direct-convert, document. Anything that matches none of them is
    ``FileCategory.UNSUPPORTED``.

    Dotfiles such as ``.gitignore`` or ``.env`` are matched by their full
    name, because ``PurePosixPath`` reports an empty suffix for a name whose
    only dot is the leading one.
    """
    path = PurePosixPath(filename)
    suffix = path.suffix.lower()
    if not suffix and path.name.startswith("."):
        # ".gitignore".suffix == "", so without this the dotfile entries in
        # PLAINTEXT_EXTENSIONS could never match.
        suffix = path.name.lower()

    ordered_categories = (
        (PLAINTEXT_EXTENSIONS, FileCategory.PLAINTEXT),
        (AUDIO_EXTENSIONS, FileCategory.AUDIO),
        (DIRECT_CONVERT_EXTENSIONS, FileCategory.DIRECT_CONVERT),
        (DOCUMENT_EXTENSIONS, FileCategory.DOCUMENT),
    )
    for extensions, category in ordered_categories:
        if suffix in extensions:
            return category
    return FileCategory.UNSUPPORTED
def should_skip_for_service(filename: str, etl_service: str | None) -> bool:
    """Return True if *filename* cannot be processed by *etl_service*.

    Unsupported files are always skipped. Document files are skipped when
    their suffix is missing from the extension set of *etl_service*.
    Plaintext, audio and direct-convert files do not depend on the parser
    and are never skipped.
    """
    kind = classify_file(filename)
    if kind == FileCategory.DOCUMENT:
        ext = PurePosixPath(filename).suffix.lower()
        return ext not in get_document_extensions_for_service(etl_service)
    return kind == FileCategory.UNSUPPORTED

View file

@ -0,0 +1,34 @@
from litellm import atranscription
from app.config import config as app_config
async def transcribe_audio(file_path: str, filename: str) -> str:
    """Transcribe an audio file and return the text as a markdown document.

    When ``app_config.STT_SERVICE`` starts with ``"local/"`` the local
    ``stt_service`` is used; otherwise the file is sent through litellm's
    ``atranscription`` with the configured model, API key and optional API
    base.

    Args:
        file_path: Path of the audio file to transcribe.
        filename: Name used in the heading of the returned markdown.

    Returns:
        ``"# Transcription of <filename>"`` followed by the transcribed text.

    Raises:
        ValueError: If the transcription result contains no text.
    """
    import asyncio

    stt_model = app_config.STT_SERVICE
    if stt_model and stt_model.startswith("local/"):
        from app.services.stt_service import stt_service

        # transcribe_file is a blocking call; run it in a worker thread so the
        # event loop keeps serving other tasks while it runs.
        result = await asyncio.to_thread(stt_service.transcribe_file, file_path)
    else:
        with open(file_path, "rb") as audio_file:
            kwargs: dict = {
                "model": app_config.STT_SERVICE,
                "file": audio_file,
                "api_key": app_config.STT_SERVICE_API_KEY,
            }
            if app_config.STT_SERVICE_API_BASE:
                kwargs["api_base"] = app_config.STT_SERVICE_API_BASE
            # The file handle must stay open until the request completes.
            result = await atranscription(**kwargs)

    text = result.get("text", "")
    if not text:
        raise ValueError("Transcription returned empty text")
    return f"# Transcription of {filename}\n\n{text}"

View file

@ -0,0 +1,3 @@
from app.tasks.document_processors._direct_converters import convert_file_directly
__all__ = ["convert_file_directly"]

View file

@ -0,0 +1,26 @@
import warnings
from logging import ERROR, getLogger
async def parse_with_docling(file_path: str, filename: str) -> str:
    """Process a document with the Docling service and return its ``"content"``.

    While the document is processed, selected pdfminer warnings are ignored
    and the ``pdfminer`` logger is raised to ERROR. The logger's previous
    level is restored afterwards, even if processing fails.
    """
    from app.services.docling_service import create_docling_service

    service = create_docling_service()

    noisy_logger = getLogger("pdfminer")
    saved_level = noisy_logger.level

    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=UserWarning, module="pdfminer")
        for pattern in (
            ".*Cannot set gray non-stroke color.*",
            ".*invalid float value.*",
        ):
            warnings.filterwarnings("ignore", message=pattern)

        noisy_logger.setLevel(ERROR)
        try:
            parsed = await service.process_document(file_path, filename)
        finally:
            noisy_logger.setLevel(saved_level)

    return parsed["content"]

View file

@ -0,0 +1,123 @@
import asyncio
import logging
import os
import random
import httpx
from app.config import config as app_config
from app.etl_pipeline.constants import (
LLAMACLOUD_BASE_DELAY,
LLAMACLOUD_MAX_DELAY,
LLAMACLOUD_MAX_RETRIES,
LLAMACLOUD_RETRYABLE_EXCEPTIONS,
PER_PAGE_JOB_TIMEOUT,
calculate_job_timeout,
calculate_upload_timeout,
)
def _llamacloud_result_to_markdown(result) -> str:
    """Convert whatever LlamaParse returned into a single markdown string."""
    if hasattr(result, "get_markdown_documents"):
        documents = result.get_markdown_documents(split_by_page=False)
        if documents and hasattr(documents[0], "text"):
            return documents[0].text
        if hasattr(result, "pages") and result.pages:
            return "\n\n".join(
                page.md for page in result.pages if hasattr(page, "md") and page.md
            )
        return str(result)

    if isinstance(result, list):
        if result and hasattr(result[0], "text"):
            return result[0].text
        return "\n\n".join(
            item.page_content if hasattr(item, "page_content") else str(item)
            for item in result
        )

    return str(result)


async def parse_with_llamacloud(file_path: str, estimated_pages: int) -> str:
    """Parse a file with LlamaCloud and return its markdown.

    Upload and job timeouts are derived from the file size and the estimated
    page count. Errors listed in ``LLAMACLOUD_RETRYABLE_EXCEPTIONS`` are
    retried up to ``LLAMACLOUD_MAX_RETRIES`` attempts, waiting between
    attempts with an exponentially growing delay (capped at
    ``LLAMACLOUD_MAX_DELAY``) plus or minus 25% random jitter. Any other
    exception propagates immediately.

    Raises:
        The last retryable exception once all attempts have failed.
    """
    from llama_cloud_services import LlamaParse
    from llama_cloud_services.parse.utils import ResultType

    file_size_bytes = os.path.getsize(file_path)
    file_size_mb = file_size_bytes / (1024 * 1024)

    upload_timeout = calculate_upload_timeout(file_size_bytes)
    job_timeout = calculate_job_timeout(estimated_pages, file_size_bytes)

    client_timeout = httpx.Timeout(
        connect=120.0,
        read=upload_timeout,
        write=upload_timeout,
        pool=120.0,
    )

    logging.info(
        f"LlamaCloud upload configured: file_size={file_size_mb:.1f}MB, "
        f"pages={estimated_pages}, upload_timeout={upload_timeout:.0f}s, "
        f"job_timeout={job_timeout:.0f}s"
    )

    last_exception = None
    attempt_errors: list[str] = []

    for attempt in range(1, LLAMACLOUD_MAX_RETRIES + 1):
        try:
            # A fresh HTTP client is created for every attempt.
            async with httpx.AsyncClient(timeout=client_timeout) as http_client:
                parser = LlamaParse(
                    api_key=app_config.LLAMA_CLOUD_API_KEY,
                    num_workers=1,
                    verbose=True,
                    language="en",
                    result_type=ResultType.MD,
                    max_timeout=int(max(2000, job_timeout + upload_timeout)),
                    job_timeout_in_seconds=job_timeout,
                    job_timeout_extra_time_per_page_in_seconds=PER_PAGE_JOB_TIMEOUT,
                    custom_client=http_client,
                )
                result = await parser.aparse(file_path)

                if attempt > 1:
                    logging.info(
                        f"LlamaCloud upload succeeded on attempt {attempt} after "
                        f"{len(attempt_errors)} failures"
                    )

                return _llamacloud_result_to_markdown(result)

        except LLAMACLOUD_RETRYABLE_EXCEPTIONS as exc:
            last_exception = exc
            error_type = type(exc).__name__
            error_msg = str(exc)[:200]
            attempt_errors.append(f"Attempt {attempt}: {error_type} - {error_msg}")

            if attempt >= LLAMACLOUD_MAX_RETRIES:
                logging.error(
                    f"LlamaCloud upload failed after {LLAMACLOUD_MAX_RETRIES} "
                    f"attempts. File size: {file_size_mb:.1f}MB, "
                    f"Pages: {estimated_pages}. "
                    f"Errors: {'; '.join(attempt_errors)}"
                )
                continue

            base_delay = min(
                LLAMACLOUD_BASE_DELAY * (2 ** (attempt - 1)),
                LLAMACLOUD_MAX_DELAY,
            )
            jitter = base_delay * 0.25 * (2 * random.random() - 1)
            delay = base_delay + jitter
            logging.warning(
                f"LlamaCloud upload failed "
                f"(attempt {attempt}/{LLAMACLOUD_MAX_RETRIES}): "
                f"{error_type}. File: {file_size_mb:.1f}MB. "
                f"Retrying in {delay:.0f}s..."
            )
            await asyncio.sleep(delay)

    raise last_exception or RuntimeError(
        f"LlamaCloud parsing failed after {LLAMACLOUD_MAX_RETRIES} retries. "
        f"File size: {file_size_mb:.1f}MB"
    )

View file

@ -0,0 +1,8 @@
def read_plaintext(file_path: str) -> str:
    """Read a text file as UTF-8 and return its content.

    A leading UTF-8 byte-order mark is dropped, and bytes that are not valid
    UTF-8 are replaced rather than raising.

    Raises:
        ValueError: If the decoded content contains a null character, which
            suggests a binary file was opened as text.
    """
    # "utf-8-sig" decodes plain UTF-8 identically but strips a leading BOM, so
    # the returned content does not start with a stray "\ufeff".
    with open(file_path, encoding="utf-8-sig", errors="replace") as f:
        content = f.read()
    if "\x00" in content:
        raise ValueError(
            f"File contains null bytes — likely a binary file opened as text: {file_path}"
        )
    return content

View file

@ -0,0 +1,14 @@
async def parse_with_unstructured(file_path: str) -> str:
    """Load a file with ``UnstructuredLoader`` and join the pieces into one string.

    Loaded documents with empty ``page_content`` are dropped; the rest are
    joined with blank lines.
    """
    from langchain_unstructured import UnstructuredLoader

    loader_options = {
        "mode": "elements",
        "post_processors": [],
        "languages": ["eng"],
        "include_orig_elements": False,
        "include_metadata": False,
        "strategy": "auto",
    }
    loader = UnstructuredLoader(file_path, **loader_options)

    elements = await loader.aload()
    non_empty_chunks = [el.page_content for el in elements if el.page_content]
    return "\n\n".join(non_empty_chunks)

View file

@ -1,4 +1,4 @@
from fastapi import APIRouter, Depends, HTTPException
from fastapi import APIRouter, Depends
from fastapi.responses import StreamingResponse
from pydantic import BaseModel, Field
from sqlalchemy.ext.asyncio import AsyncSession
@ -31,8 +31,11 @@ async def vision_autocomplete_stream(
return StreamingResponse(
stream_vision_autocomplete(
body.screenshot, body.search_space_id, session,
app_name=body.app_name, window_title=body.window_title,
body.screenshot,
body.search_space_id,
session,
app_name=body.app_name,
window_title=body.window_title,
),
media_type="text/event-stream",
headers={

View file

@ -311,9 +311,11 @@ async def dropbox_callback(
)
existing_cursor = db_connector.config.get("cursor")
existing_folder_cursors = db_connector.config.get("folder_cursors")
db_connector.config = {
**connector_config,
"cursor": existing_cursor,
"folder_cursors": existing_folder_cursors,
"auth_expired": False,
}
flag_modified(db_connector, "config")

View file

@ -2477,6 +2477,8 @@ async def run_google_drive_indexing(
stage="fetching",
)
total_unsupported = 0
# Index each folder with indexing options
for folder in items.folders:
try:
@ -2484,6 +2486,7 @@ async def run_google_drive_indexing(
indexed_count,
skipped_count,
error_message,
unsupported_count,
) = await index_google_drive_files(
session,
connector_id,
@ -2497,6 +2500,7 @@ async def run_google_drive_indexing(
include_subfolders=indexing_options.include_subfolders,
)
total_skipped += skipped_count
total_unsupported += unsupported_count
if error_message:
errors.append(f"Folder '{folder.name}': {error_message}")
else:
@ -2572,6 +2576,7 @@ async def run_google_drive_indexing(
indexed_count=total_indexed,
error_message=error_message,
skipped_count=total_skipped,
unsupported_count=total_unsupported,
)
except Exception as e:
@ -2642,7 +2647,12 @@ async def run_onedrive_indexing(
stage="fetching",
)
total_indexed, total_skipped, error_message = await index_onedrive_files(
(
total_indexed,
total_skipped,
error_message,
total_unsupported,
) = await index_onedrive_files(
session,
connector_id,
search_space_id,
@ -2683,6 +2693,7 @@ async def run_onedrive_indexing(
indexed_count=total_indexed,
error_message=error_message,
skipped_count=total_skipped,
unsupported_count=total_unsupported,
)
except Exception as e:
@ -2750,7 +2761,12 @@ async def run_dropbox_indexing(
stage="fetching",
)
total_indexed, total_skipped, error_message = await index_dropbox_files(
(
total_indexed,
total_skipped,
error_message,
total_unsupported,
) = await index_dropbox_files(
session,
connector_id,
search_space_id,
@ -2791,6 +2807,7 @@ async def run_dropbox_indexing(
indexed_count=total_indexed,
error_message=error_message,
skipped_count=total_skipped,
unsupported_count=total_unsupported,
)
except Exception as e:

View file

@ -111,9 +111,8 @@ class DoclingService:
pipeline_options=pipeline_options, backend=PyPdfiumDocumentBackend
)
# Initialize DocumentConverter
self.converter = DocumentConverter(
format_options={InputFormat.PDF: pdf_format_option}
format_options={InputFormat.PDF: pdf_format_option},
)
acceleration_type = "GPU (WSL2)" if self.use_gpu else "CPU"

View file

@ -421,6 +421,7 @@ class ConnectorIndexingNotificationHandler(BaseNotificationHandler):
error_message: str | None = None,
is_warning: bool = False,
skipped_count: int | None = None,
unsupported_count: int | None = None,
) -> Notification:
"""
Update notification when connector indexing completes.
@ -428,10 +429,11 @@ class ConnectorIndexingNotificationHandler(BaseNotificationHandler):
Args:
session: Database session
notification: Notification to update
indexed_count: Total number of items indexed
indexed_count: Total number of files indexed
error_message: Error message if indexing failed, or warning message (optional)
is_warning: If True, treat error_message as a warning (success case) rather than an error
skipped_count: Number of items skipped (e.g., duplicates) - optional
skipped_count: Number of files skipped (e.g., unchanged) - optional
unsupported_count: Number of files skipped because the ETL parser doesn't support them
Returns:
Updated notification
@ -440,52 +442,45 @@ class ConnectorIndexingNotificationHandler(BaseNotificationHandler):
"connector_name", "Connector"
)
# Build the skipped text if there are skipped items
skipped_text = ""
if skipped_count and skipped_count > 0:
skipped_item_text = "item" if skipped_count == 1 else "items"
skipped_text = (
f" ({skipped_count} {skipped_item_text} skipped - already indexed)"
)
unsupported_text = ""
if unsupported_count and unsupported_count > 0:
file_word = "file was" if unsupported_count == 1 else "files were"
unsupported_text = f" {unsupported_count} {file_word} not supported."
# If there's an error message but items were indexed, treat it as a warning (partial success)
# If is_warning is True, treat it as success even with 0 items (e.g., duplicates found)
# Otherwise, treat it as a failure
if error_message:
if indexed_count > 0:
# Partial success with warnings (e.g., duplicate content from other connectors)
title = f"Ready: {connector_name}"
item_text = "item" if indexed_count == 1 else "items"
message = f"Now searchable! {indexed_count} {item_text} synced{skipped_text}. Note: {error_message}"
file_text = "file" if indexed_count == 1 else "files"
message = f"Now searchable! {indexed_count} {file_text} synced.{unsupported_text} Note: {error_message}"
status = "completed"
elif is_warning:
# Warning case (e.g., duplicates found) - treat as success
title = f"Ready: {connector_name}"
message = f"Sync completed{skipped_text}. {error_message}"
message = f"Sync complete.{unsupported_text} {error_message}"
status = "completed"
else:
# Complete failure
title = f"Failed: {connector_name}"
message = f"Sync failed: {error_message}"
if unsupported_text:
message += unsupported_text
status = "failed"
else:
title = f"Ready: {connector_name}"
if indexed_count == 0:
if skipped_count and skipped_count > 0:
skipped_item_text = "item" if skipped_count == 1 else "items"
message = f"Already up to date! {skipped_count} {skipped_item_text} skipped (already indexed)."
if unsupported_count and unsupported_count > 0:
message = f"Sync complete.{unsupported_text}"
else:
message = "Already up to date! No new items to sync."
message = "Already up to date!"
else:
item_text = "item" if indexed_count == 1 else "items"
message = (
f"Now searchable! {indexed_count} {item_text} synced{skipped_text}."
)
file_text = "file" if indexed_count == 1 else "files"
message = f"Now searchable! {indexed_count} {file_text} synced."
if unsupported_text:
message += unsupported_text
status = "completed"
metadata_updates = {
"indexed_count": indexed_count,
"skipped_count": skipped_count or 0,
"unsupported_count": unsupported_count or 0,
"sync_stage": "completed"
if (not error_message or is_warning or indexed_count > 0)
else "failed",

View file

@ -8,7 +8,7 @@ Optimized pipeline:
"""
import logging
from typing import AsyncGenerator
from collections.abc import AsyncGenerator
from langchain_core.messages import HumanMessage
from sqlalchemy.ext.asyncio import AsyncSession

View file

@ -51,7 +51,10 @@ async def _should_skip_file(
file_id = file.get("id", "")
file_name = file.get("name", "Unknown")
if skip_item(file):
skip, unsup_ext = skip_item(file)
if skip:
if unsup_ext:
return True, f"unsupported:{unsup_ext}"
return True, "folder/non-downloadable"
if not file_id:
return True, "missing file_id"
@ -251,6 +254,121 @@ async def _download_and_index(
return batch_indexed, download_failed + batch_failed
async def _remove_document(session: AsyncSession, file_id: str, search_space_id: int):
    """Delete the indexed document for a file that was removed from Dropbox.

    The document is looked up by its identifier hash first. When that finds
    nothing, a second lookup matches on the ``dropbox_file_id`` value stored
    in the document metadata within the same search space. If neither lookup
    finds a document, nothing is deleted.

    Args:
        session: Database session used for the lookups and the delete.
        file_id: Dropbox file id of the removed file.
        search_space_id: Search space the document belongs to.
    """
    identifier_hash = compute_identifier_hash(
        DocumentType.DROPBOX_FILE.value, file_id, search_space_id
    )
    document = await check_document_by_unique_identifier(session, identifier_hash)

    if not document:
        # Fallback: match on the file id recorded in the document metadata.
        fallback_query = select(Document).where(
            Document.search_space_id == search_space_id,
            Document.document_type == DocumentType.DROPBOX_FILE,
            cast(Document.document_metadata["dropbox_file_id"], String) == file_id,
        )
        document = (await session.execute(fallback_query)).scalar_one_or_none()

    if not document:
        return
    await session.delete(document)
async def _index_with_delta_sync(
    dropbox_client: DropboxClient,
    session: AsyncSession,
    connector_id: int,
    search_space_id: int,
    user_id: str,
    cursor: str,
    task_logger: TaskLoggingService,
    log_entry: object,
    max_files: int,
    on_heartbeat_callback: HeartbeatCallbackType | None = None,
    enable_summary: bool = True,
) -> tuple[int, int, int, str]:
    """Delta sync using Dropbox cursor-based change tracking.

    Fetches the change entries recorded since ``cursor``, removes documents
    for deleted entries, and downloads and indexes changed files that the
    skip check does not filter out.

    Args:
        dropbox_client: Client used to fetch the changes and download files.
        session: Database session.
        connector_id: Connector the indexed documents belong to.
        search_space_id: Search space the documents are indexed into.
        user_id: Owner of the indexed documents.
        cursor: Cursor saved by a previous sync.
        task_logger: Task logger that receives the progress entry.
        log_entry: Log entry the progress is attached to.
        max_files: Maximum number of change entries examined in this run.
        on_heartbeat_callback: Optional heartbeat callback passed on to the
            download/index step.
        enable_summary: Passed on to the download/index step.

    Returns:
        (indexed_count, skipped_count, unsupported_count, new_cursor).
        ``indexed_count`` includes renamed files. ``new_cursor`` falls back
        to the input ``cursor`` when the client returned no new one.

    Raises:
        Exception: If fetching the changes fails; authentication failures get
            a message asking the user to re-authenticate.
    """
    await task_logger.log_task_progress(
        log_entry,
        f"Starting delta sync from cursor: {cursor[:20]}...",
        {"stage": "delta_sync", "cursor_prefix": cursor[:20]},
    )

    entries, new_cursor, error = await dropbox_client.get_changes(cursor)
    if error:
        err_lower = error.lower()
        if "401" in error or "authentication expired" in err_lower:
            raise Exception(
                f"Dropbox authentication failed. Please re-authenticate. (Error: {error})"
            )
        raise Exception(f"Failed to fetch Dropbox changes: {error}")

    if not entries:
        logger.info("No changes detected since last sync")
        return 0, 0, 0, new_cursor or cursor

    logger.info(f"Processing {len(entries)} change entries")

    if len(entries) > max_files:
        # Only the first max_files entries are examined below, yet the cursor
        # returned at the end already points past every fetched entry, so the
        # remainder is not revisited by a later delta sync. Surface that
        # instead of dropping it silently.
        logger.warning(
            f"Delta sync fetched {len(entries)} change entries but max_files is "
            f"{max_files}; entries beyond the limit are not processed in this run"
        )

    renamed_count = 0
    skipped = 0
    unsupported_count = 0
    files_to_download: list[dict] = []
    files_processed = 0

    for entry in entries:
        if files_processed >= max_files:
            break
        # Every entry counts toward the limit, including deletions and folders.
        files_processed += 1

        tag = entry.get(".tag")

        if tag == "deleted":
            path_lower = entry.get("path_lower", "")
            name = entry.get("name", "")
            file_id = entry.get("id", "")
            if file_id:
                await _remove_document(session, file_id, search_space_id)
                logger.debug(f"Processed deletion: {name or path_lower}")
            else:
                # NOTE(review): Dropbox's DeletedMetadata appears to carry only
                # name/path fields and no "id" - confirm against the API docs.
                # If so, deletions need a path-based document lookup here.
                logger.debug(
                    f"Deletion entry without id, nothing removed: {name or path_lower}"
                )
            continue

        if tag != "file":
            # Folder entries (and any other tags) are not indexed.
            continue

        skip, msg = await _should_skip_file(session, entry, search_space_id)
        if skip:
            if msg and msg.startswith("unsupported:"):
                unsupported_count += 1
            elif msg and "renamed" in msg.lower():
                # A rename counts as indexed rather than skipped.
                renamed_count += 1
            else:
                skipped += 1
            continue

        files_to_download.append(entry)

    batch_indexed, failed = await _download_and_index(
        dropbox_client,
        session,
        files_to_download,
        connector_id=connector_id,
        search_space_id=search_space_id,
        user_id=user_id,
        enable_summary=enable_summary,
        on_heartbeat=on_heartbeat_callback,
    )

    indexed = renamed_count + batch_indexed
    logger.info(
        f"Delta sync complete: {indexed} indexed, {skipped} skipped, "
        f"{unsupported_count} unsupported, {failed} failed"
    )
    return indexed, skipped, unsupported_count, new_cursor or cursor
async def _index_full_scan(
dropbox_client: DropboxClient,
session: AsyncSession,
@ -266,8 +384,11 @@ async def _index_full_scan(
incremental_sync: bool = True,
on_heartbeat_callback: HeartbeatCallbackType | None = None,
enable_summary: bool = True,
) -> tuple[int, int]:
"""Full scan indexing of a folder."""
) -> tuple[int, int, int]:
"""Full scan indexing of a folder.
Returns (indexed, skipped, unsupported_count).
"""
await task_logger.log_task_progress(
log_entry,
f"Starting full scan of folder: {folder_name}",
@ -287,6 +408,7 @@ async def _index_full_scan(
renamed_count = 0
skipped = 0
unsupported_count = 0
files_to_download: list[dict] = []
all_files, error = await get_files_in_folder(
@ -306,14 +428,21 @@ async def _index_full_scan(
if incremental_sync:
skip, msg = await _should_skip_file(session, file, search_space_id)
if skip:
if msg and "renamed" in msg.lower():
if msg and msg.startswith("unsupported:"):
unsupported_count += 1
elif msg and "renamed" in msg.lower():
renamed_count += 1
else:
skipped += 1
continue
elif skip_item(file):
skipped += 1
continue
else:
item_skip, item_unsup = skip_item(file)
if item_skip:
if item_unsup:
unsupported_count += 1
else:
skipped += 1
continue
file_pages = PageLimitService.estimate_pages_from_metadata(
file.get("name", ""), file.get("size")
@ -352,9 +481,10 @@ async def _index_full_scan(
indexed = renamed_count + batch_indexed
logger.info(
f"Full scan complete: {indexed} indexed, {skipped} skipped, {failed} failed"
f"Full scan complete: {indexed} indexed, {skipped} skipped, "
f"{unsupported_count} unsupported, {failed} failed"
)
return indexed, skipped
return indexed, skipped, unsupported_count
async def _index_selected_files(
@ -368,7 +498,7 @@ async def _index_selected_files(
enable_summary: bool,
incremental_sync: bool = True,
on_heartbeat: HeartbeatCallbackType | None = None,
) -> tuple[int, int, list[str]]:
) -> tuple[int, int, int, list[str]]:
"""Index user-selected files using the parallel pipeline."""
page_limit_service = PageLimitService(session)
pages_used, pages_limit = await page_limit_service.get_page_usage(user_id)
@ -379,6 +509,7 @@ async def _index_selected_files(
errors: list[str] = []
renamed_count = 0
skipped = 0
unsupported_count = 0
for file_path, file_name in file_paths:
file, error = await get_file_by_path(dropbox_client, file_path)
@ -390,14 +521,21 @@ async def _index_selected_files(
if incremental_sync:
skip, msg = await _should_skip_file(session, file, search_space_id)
if skip:
if msg and "renamed" in msg.lower():
if msg and msg.startswith("unsupported:"):
unsupported_count += 1
elif msg and "renamed" in msg.lower():
renamed_count += 1
else:
skipped += 1
continue
elif skip_item(file):
skipped += 1
continue
else:
item_skip, item_unsup = skip_item(file)
if item_skip:
if item_unsup:
unsupported_count += 1
else:
skipped += 1
continue
file_pages = PageLimitService.estimate_pages_from_metadata(
file.get("name", ""), file.get("size")
@ -429,7 +567,7 @@ async def _index_selected_files(
user_id, pages_to_deduct, allow_exceed=True
)
return renamed_count + batch_indexed, skipped, errors
return renamed_count + batch_indexed, skipped, unsupported_count, errors
async def index_dropbox_files(
@ -438,7 +576,7 @@ async def index_dropbox_files(
search_space_id: int,
user_id: str,
items_dict: dict,
) -> tuple[int, int, str | None]:
) -> tuple[int, int, str | None, int]:
"""Index Dropbox files for a specific connector.
items_dict format:
@ -469,7 +607,7 @@ async def index_dropbox_files(
await task_logger.log_task_failure(
log_entry, error_msg, None, {"error_type": "ConnectorNotFound"}
)
return 0, 0, error_msg
return 0, 0, error_msg, 0
token_encrypted = connector.config.get("_token_encrypted", False)
if token_encrypted and not config.SECRET_KEY:
@ -480,7 +618,7 @@ async def index_dropbox_files(
"Missing SECRET_KEY",
{"error_type": "MissingSecretKey"},
)
return 0, 0, error_msg
return 0, 0, error_msg, 0
connector_enable_summary = getattr(connector, "enable_summary", True)
dropbox_client = DropboxClient(session, connector_id)
@ -489,9 +627,13 @@ async def index_dropbox_files(
max_files = indexing_options.get("max_files", 500)
incremental_sync = indexing_options.get("incremental_sync", True)
include_subfolders = indexing_options.get("include_subfolders", True)
use_delta_sync = indexing_options.get("use_delta_sync", True)
folder_cursors: dict = connector.config.get("folder_cursors", {})
total_indexed = 0
total_skipped = 0
total_unsupported = 0
selected_files = items_dict.get("files", [])
if selected_files:
@ -499,7 +641,7 @@ async def index_dropbox_files(
(f.get("path", f.get("path_lower", f.get("id", ""))), f.get("name"))
for f in selected_files
]
indexed, skipped, file_errors = await _index_selected_files(
indexed, skipped, unsupported, file_errors = await _index_selected_files(
dropbox_client,
session,
file_tuples,
@ -511,6 +653,7 @@ async def index_dropbox_files(
)
total_indexed += indexed
total_skipped += skipped
total_unsupported += unsupported
if file_errors:
logger.warning(
f"File indexing errors for connector {connector_id}: {file_errors}"
@ -523,25 +666,66 @@ async def index_dropbox_files(
)
folder_name = folder.get("name", "Root")
logger.info(f"Using full scan for folder {folder_name}")
indexed, skipped = await _index_full_scan(
dropbox_client,
session,
connector_id,
search_space_id,
user_id,
folder_path,
folder_name,
task_logger,
log_entry,
max_files,
include_subfolders,
incremental_sync=incremental_sync,
enable_summary=connector_enable_summary,
saved_cursor = folder_cursors.get(folder_path)
can_use_delta = (
use_delta_sync and saved_cursor and connector.last_indexed_at
)
if can_use_delta:
logger.info(f"Using delta sync for folder {folder_name}")
indexed, skipped, unsup, new_cursor = await _index_with_delta_sync(
dropbox_client,
session,
connector_id,
search_space_id,
user_id,
saved_cursor,
task_logger,
log_entry,
max_files,
enable_summary=connector_enable_summary,
)
folder_cursors[folder_path] = new_cursor
total_unsupported += unsup
else:
logger.info(f"Using full scan for folder {folder_name}")
indexed, skipped, unsup = await _index_full_scan(
dropbox_client,
session,
connector_id,
search_space_id,
user_id,
folder_path,
folder_name,
task_logger,
log_entry,
max_files,
include_subfolders,
incremental_sync=incremental_sync,
enable_summary=connector_enable_summary,
)
total_unsupported += unsup
total_indexed += indexed
total_skipped += skipped
# Persist latest cursor for this folder
try:
latest_cursor, cursor_err = await dropbox_client.get_latest_cursor(
folder_path
)
if latest_cursor and not cursor_err:
folder_cursors[folder_path] = latest_cursor
except Exception as e:
logger.warning(f"Failed to get latest cursor for {folder_path}: {e}")
# Persist folder cursors to connector config
if folders:
cfg = dict(connector.config)
cfg["folder_cursors"] = folder_cursors
connector.config = cfg
flag_modified(connector, "config")
if total_indexed > 0 or folders:
await update_connector_last_indexed(session, connector, True)
@ -550,12 +734,18 @@ async def index_dropbox_files(
await task_logger.log_task_success(
log_entry,
f"Successfully completed Dropbox indexing for connector {connector_id}",
{"files_processed": total_indexed, "files_skipped": total_skipped},
{
"files_processed": total_indexed,
"files_skipped": total_skipped,
"files_unsupported": total_unsupported,
},
)
logger.info(
f"Dropbox indexing completed: {total_indexed} indexed, {total_skipped} skipped"
f"Dropbox indexing completed: {total_indexed} indexed, "
f"{total_skipped} skipped, {total_unsupported} unsupported"
)
return total_indexed, total_skipped, None
return total_indexed, total_skipped, None, total_unsupported
except SQLAlchemyError as db_error:
await session.rollback()
@ -566,7 +756,7 @@ async def index_dropbox_files(
{"error_type": "SQLAlchemyError"},
)
logger.error(f"Database error: {db_error!s}", exc_info=True)
return 0, 0, f"Database error: {db_error!s}"
return 0, 0, f"Database error: {db_error!s}", 0
except Exception as e:
await session.rollback()
await task_logger.log_task_failure(
@ -576,4 +766,4 @@ async def index_dropbox_files(
{"error_type": type(e).__name__},
)
logger.error(f"Failed to index Dropbox files: {e!s}", exc_info=True)
return 0, 0, f"Failed to index Dropbox files: {e!s}"
return 0, 0, f"Failed to index Dropbox files: {e!s}", 0

View file

@ -25,7 +25,11 @@ from app.connectors.google_drive import (
get_files_in_folder,
get_start_page_token,
)
from app.connectors.google_drive.file_types import should_skip_file as skip_mime
from app.connectors.google_drive.file_types import (
is_google_workspace_file,
should_skip_by_extension,
should_skip_file as skip_mime,
)
from app.db import Document, DocumentStatus, DocumentType, SearchSourceConnectorType
from app.indexing_pipeline.connector_document import ConnectorDocument
from app.indexing_pipeline.document_hashing import compute_identifier_hash
@ -78,6 +82,10 @@ async def _should_skip_file(
if skip_mime(mime_type):
return True, "folder/shortcut"
if not is_google_workspace_file(mime_type):
ext_skip, unsup_ext = should_skip_by_extension(file_name)
if ext_skip:
return True, f"unsupported:{unsup_ext}"
if not file_id:
return True, "missing file_id"
@ -468,13 +476,13 @@ async def _index_selected_files(
user_id: str,
enable_summary: bool,
on_heartbeat: HeartbeatCallbackType | None = None,
) -> tuple[int, int, list[str]]:
) -> tuple[int, int, int, list[str]]:
"""Index user-selected files using the parallel pipeline.
Phase 1 (serial): fetch metadata + skip checks.
Phase 2+3 (parallel): download, ETL, index via _download_and_index.
Returns (indexed_count, skipped_count, errors).
Returns (indexed_count, skipped_count, unsupported_count, errors).
"""
page_limit_service = PageLimitService(session)
pages_used, pages_limit = await page_limit_service.get_page_usage(user_id)
@ -485,6 +493,7 @@ async def _index_selected_files(
errors: list[str] = []
renamed_count = 0
skipped = 0
unsupported_count = 0
for file_id, file_name in file_ids:
file, error = await get_file_by_id(drive_client, file_id)
@ -495,7 +504,9 @@ async def _index_selected_files(
skip, msg = await _should_skip_file(session, file, search_space_id)
if skip:
if msg and "renamed" in msg.lower():
if msg and msg.startswith("unsupported:"):
unsupported_count += 1
elif msg and "renamed" in msg.lower():
renamed_count += 1
else:
skipped += 1
@ -539,7 +550,7 @@ async def _index_selected_files(
user_id, pages_to_deduct, allow_exceed=True
)
return renamed_count + batch_indexed, skipped, errors
return renamed_count + batch_indexed, skipped, unsupported_count, errors
# ---------------------------------------------------------------------------
@ -562,8 +573,11 @@ async def _index_full_scan(
include_subfolders: bool = False,
on_heartbeat_callback: HeartbeatCallbackType | None = None,
enable_summary: bool = True,
) -> tuple[int, int]:
"""Full scan indexing of a folder."""
) -> tuple[int, int, int]:
"""Full scan indexing of a folder.
Returns (indexed, skipped, unsupported_count).
"""
await task_logger.log_task_progress(
log_entry,
f"Starting full scan of folder: {folder_name} (include_subfolders={include_subfolders})",
@ -585,6 +599,7 @@ async def _index_full_scan(
renamed_count = 0
skipped = 0
unsupported_count = 0
files_processed = 0
files_to_download: list[dict] = []
folders_to_process = [(folder_id, folder_name)]
@ -625,7 +640,9 @@ async def _index_full_scan(
skip, msg = await _should_skip_file(session, file, search_space_id)
if skip:
if msg and "renamed" in msg.lower():
if msg and msg.startswith("unsupported:"):
unsupported_count += 1
elif msg and "renamed" in msg.lower():
renamed_count += 1
else:
skipped += 1
@ -698,9 +715,10 @@ async def _index_full_scan(
indexed = renamed_count + batch_indexed
logger.info(
f"Full scan complete: {indexed} indexed, {skipped} skipped, {failed} failed"
f"Full scan complete: {indexed} indexed, {skipped} skipped, "
f"{unsupported_count} unsupported, {failed} failed"
)
return indexed, skipped
return indexed, skipped, unsupported_count
async def _index_with_delta_sync(
@ -718,8 +736,11 @@ async def _index_with_delta_sync(
include_subfolders: bool = False,
on_heartbeat_callback: HeartbeatCallbackType | None = None,
enable_summary: bool = True,
) -> tuple[int, int]:
"""Delta sync using change tracking."""
) -> tuple[int, int, int]:
"""Delta sync using change tracking.
Returns (indexed, skipped, unsupported_count).
"""
await task_logger.log_task_progress(
log_entry,
f"Starting delta sync from token: {start_page_token[:20]}...",
@ -739,7 +760,7 @@ async def _index_with_delta_sync(
if not changes:
logger.info("No changes detected since last sync")
return 0, 0
return 0, 0, 0
logger.info(f"Processing {len(changes)} changes")
@ -754,6 +775,7 @@ async def _index_with_delta_sync(
renamed_count = 0
skipped = 0
unsupported_count = 0
files_to_download: list[dict] = []
files_processed = 0
@ -775,7 +797,9 @@ async def _index_with_delta_sync(
skip, msg = await _should_skip_file(session, file, search_space_id)
if skip:
if msg and "renamed" in msg.lower():
if msg and msg.startswith("unsupported:"):
unsupported_count += 1
elif msg and "renamed" in msg.lower():
renamed_count += 1
else:
skipped += 1
@ -832,9 +856,10 @@ async def _index_with_delta_sync(
indexed = renamed_count + batch_indexed
logger.info(
f"Delta sync complete: {indexed} indexed, {skipped} skipped, {failed} failed"
f"Delta sync complete: {indexed} indexed, {skipped} skipped, "
f"{unsupported_count} unsupported, {failed} failed"
)
return indexed, skipped
return indexed, skipped, unsupported_count
# ---------------------------------------------------------------------------
@ -854,8 +879,11 @@ async def index_google_drive_files(
max_files: int = 500,
include_subfolders: bool = False,
on_heartbeat_callback: HeartbeatCallbackType | None = None,
) -> tuple[int, int, str | None]:
"""Index Google Drive files for a specific connector."""
) -> tuple[int, int, str | None, int]:
"""Index Google Drive files for a specific connector.
Returns (indexed, skipped, error_or_none, unsupported_count).
"""
task_logger = TaskLoggingService(session, search_space_id)
log_entry = await task_logger.log_task_start(
task_name="google_drive_files_indexing",
@ -881,7 +909,7 @@ async def index_google_drive_files(
await task_logger.log_task_failure(
log_entry, error_msg, None, {"error_type": "ConnectorNotFound"}
)
return 0, 0, error_msg
return 0, 0, error_msg, 0
await task_logger.log_task_progress(
log_entry,
@ -900,7 +928,7 @@ async def index_google_drive_files(
"Missing Composio account",
{"error_type": "MissingComposioAccount"},
)
return 0, 0, error_msg
return 0, 0, error_msg, 0
pre_built_credentials = build_composio_credentials(connected_account_id)
else:
token_encrypted = connector.config.get("_token_encrypted", False)
@ -915,6 +943,7 @@ async def index_google_drive_files(
0,
0,
"SECRET_KEY not configured but credentials are marked as encrypted",
0,
)
connector_enable_summary = getattr(connector, "enable_summary", True)
@ -927,7 +956,7 @@ async def index_google_drive_files(
await task_logger.log_task_failure(
log_entry, error_msg, {"error_type": "MissingParameter"}
)
return 0, 0, error_msg
return 0, 0, error_msg, 0
target_folder_id = folder_id
target_folder_name = folder_name or "Selected Folder"
@ -938,9 +967,11 @@ async def index_google_drive_files(
use_delta_sync and start_page_token and connector.last_indexed_at
)
documents_unsupported = 0
if can_use_delta:
logger.info(f"Using delta sync for connector {connector_id}")
documents_indexed, documents_skipped = await _index_with_delta_sync(
documents_indexed, documents_skipped, du = await _index_with_delta_sync(
drive_client,
session,
connector,
@ -956,8 +987,9 @@ async def index_google_drive_files(
on_heartbeat_callback,
connector_enable_summary,
)
documents_unsupported += du
logger.info("Running reconciliation scan after delta sync")
ri, rs = await _index_full_scan(
ri, rs, ru = await _index_full_scan(
drive_client,
session,
connector,
@ -975,9 +1007,14 @@ async def index_google_drive_files(
)
documents_indexed += ri
documents_skipped += rs
documents_unsupported += ru
else:
logger.info(f"Using full scan for connector {connector_id}")
documents_indexed, documents_skipped = await _index_full_scan(
(
documents_indexed,
documents_skipped,
documents_unsupported,
) = await _index_full_scan(
drive_client,
session,
connector,
@ -1012,14 +1049,17 @@ async def index_google_drive_files(
{
"files_processed": documents_indexed,
"files_skipped": documents_skipped,
"files_unsupported": documents_unsupported,
"sync_type": "delta" if can_use_delta else "full",
"folder": target_folder_name,
},
)
logger.info(
f"Google Drive indexing completed: {documents_indexed} indexed, {documents_skipped} skipped"
f"Google Drive indexing completed: {documents_indexed} indexed, "
f"{documents_skipped} skipped, {documents_unsupported} unsupported"
)
return documents_indexed, documents_skipped, None
return documents_indexed, documents_skipped, None, documents_unsupported
except SQLAlchemyError as db_error:
await session.rollback()
@ -1030,7 +1070,7 @@ async def index_google_drive_files(
{"error_type": "SQLAlchemyError"},
)
logger.error(f"Database error: {db_error!s}", exc_info=True)
return 0, 0, f"Database error: {db_error!s}"
return 0, 0, f"Database error: {db_error!s}", 0
except Exception as e:
await session.rollback()
await task_logger.log_task_failure(
@ -1040,7 +1080,7 @@ async def index_google_drive_files(
{"error_type": type(e).__name__},
)
logger.error(f"Failed to index Google Drive files: {e!s}", exc_info=True)
return 0, 0, f"Failed to index Google Drive files: {e!s}"
return 0, 0, f"Failed to index Google Drive files: {e!s}", 0
async def index_google_drive_single_file(
@ -1242,7 +1282,7 @@ async def index_google_drive_selected_files(
session, connector_id, credentials=pre_built_credentials
)
indexed, skipped, errors = await _index_selected_files(
indexed, skipped, unsupported, errors = await _index_selected_files(
drive_client,
session,
files,
@ -1253,6 +1293,11 @@ async def index_google_drive_selected_files(
on_heartbeat=on_heartbeat_callback,
)
if unsupported > 0:
file_text = "file was" if unsupported == 1 else "files were"
unsup_msg = f"{unsupported} {file_text} not supported"
errors.append(unsup_msg)
await session.commit()
if errors:
@ -1260,7 +1305,12 @@ async def index_google_drive_selected_files(
log_entry,
f"Batch file indexing completed with {len(errors)} error(s)",
"; ".join(errors),
{"indexed": indexed, "skipped": skipped, "error_count": len(errors)},
{
"indexed": indexed,
"skipped": skipped,
"unsupported": unsupported,
"error_count": len(errors),
},
)
else:
await task_logger.log_task_success(

View file

@ -23,7 +23,6 @@ from sqlalchemy import select
from sqlalchemy.exc import IntegrityError, SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession
from app.config import config
from app.db import (
Document,
DocumentStatus,
@ -44,132 +43,6 @@ from .base import (
logger,
)
PLAINTEXT_EXTENSIONS = frozenset(
{
".md",
".markdown",
".txt",
".text",
".json",
".jsonl",
".yaml",
".yml",
".toml",
".ini",
".cfg",
".conf",
".xml",
".css",
".scss",
".less",
".sass",
".py",
".pyw",
".pyi",
".pyx",
".js",
".jsx",
".ts",
".tsx",
".mjs",
".cjs",
".java",
".kt",
".kts",
".scala",
".groovy",
".c",
".h",
".cpp",
".cxx",
".cc",
".hpp",
".hxx",
".cs",
".fs",
".fsx",
".go",
".rs",
".rb",
".php",
".pl",
".pm",
".lua",
".swift",
".m",
".mm",
".r",
".R",
".jl",
".sh",
".bash",
".zsh",
".fish",
".bat",
".cmd",
".ps1",
".sql",
".graphql",
".gql",
".env",
".gitignore",
".dockerignore",
".editorconfig",
".makefile",
".cmake",
".log",
".rst",
".tex",
".bib",
".org",
".adoc",
".asciidoc",
".vue",
".svelte",
".astro",
".tf",
".hcl",
".proto",
}
)
AUDIO_EXTENSIONS = frozenset(
{
".mp3",
".mp4",
".mpeg",
".mpga",
".m4a",
".wav",
".webm",
}
)
DIRECT_CONVERT_EXTENSIONS = frozenset({".csv", ".tsv", ".html", ".htm"})
def _is_plaintext_file(filename: str) -> bool:
return Path(filename).suffix.lower() in PLAINTEXT_EXTENSIONS
def _is_audio_file(filename: str) -> bool:
return Path(filename).suffix.lower() in AUDIO_EXTENSIONS
def _is_direct_convert_file(filename: str) -> bool:
return Path(filename).suffix.lower() in DIRECT_CONVERT_EXTENSIONS
def _needs_etl(filename: str) -> bool:
"""File is not plaintext, not audio, and not direct-convert — requires ETL."""
return (
not _is_plaintext_file(filename)
and not _is_audio_file(filename)
and not _is_direct_convert_file(filename)
)
HeartbeatCallbackType = Callable[[int], Awaitable[None]]
@ -279,57 +152,19 @@ def scan_folder(
return files
def _read_plaintext_file(file_path: str) -> str:
"""Read a plaintext/text-based file as UTF-8."""
with open(file_path, encoding="utf-8", errors="replace") as f:
content = f.read()
if "\x00" in content:
raise ValueError(
f"File contains null bytes — likely a binary file opened as text: {file_path}"
)
return content
async def _read_file_content(file_path: str, filename: str) -> str:
"""Read file content, using ETL for binary formats.
"""Read file content via the unified ETL pipeline.
Plaintext files are read directly. Audio and document files (PDF, DOCX, etc.)
are routed through the configured ETL service (same as Google Drive / OneDrive).
Raises ValueError if the file cannot be parsed (e.g. no ETL service configured
for a binary file).
All file types (plaintext, audio, direct-convert, document) are handled
by ``EtlPipelineService``.
"""
if _is_plaintext_file(filename):
return _read_plaintext_file(file_path)
from app.etl_pipeline.etl_document import EtlRequest
from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
if _is_direct_convert_file(filename):
from app.tasks.document_processors._direct_converters import (
convert_file_directly,
)
return convert_file_directly(file_path, filename)
if _is_audio_file(filename):
etl_service = config.ETL_SERVICE if hasattr(config, "ETL_SERVICE") else None
stt_service_val = config.STT_SERVICE if hasattr(config, "STT_SERVICE") else None
if not stt_service_val and not etl_service:
raise ValueError(
f"No STT_SERVICE configured — cannot transcribe audio file: {filename}"
)
if _needs_etl(filename):
etl_service = getattr(config, "ETL_SERVICE", None)
if not etl_service:
raise ValueError(
f"No ETL_SERVICE configured — cannot parse binary file: {filename}. "
f"Set ETL_SERVICE to UNSTRUCTURED, LLAMACLOUD, or DOCLING in your .env"
)
from app.connectors.onedrive.content_extractor import (
_parse_file_to_markdown,
result = await EtlPipelineService().extract(
EtlRequest(file_path=file_path, filename=filename)
)
return await _parse_file_to_markdown(file_path, filename)
return result.markdown_content
def _content_hash(content: str, search_space_id: int) -> str:

View file

@ -56,7 +56,10 @@ async def _should_skip_file(
file_id = file.get("id")
file_name = file.get("name", "Unknown")
if skip_item(file):
skip, unsup_ext = skip_item(file)
if skip:
if unsup_ext:
return True, f"unsupported:{unsup_ext}"
return True, "folder/onenote/remote"
if not file_id:
return True, "missing file_id"
@ -290,7 +293,7 @@ async def _index_selected_files(
user_id: str,
enable_summary: bool,
on_heartbeat: HeartbeatCallbackType | None = None,
) -> tuple[int, int, list[str]]:
) -> tuple[int, int, int, list[str]]:
"""Index user-selected files using the parallel pipeline."""
page_limit_service = PageLimitService(session)
pages_used, pages_limit = await page_limit_service.get_page_usage(user_id)
@ -301,6 +304,7 @@ async def _index_selected_files(
errors: list[str] = []
renamed_count = 0
skipped = 0
unsupported_count = 0
for file_id, file_name in file_ids:
file, error = await get_file_by_id(onedrive_client, file_id)
@ -311,7 +315,9 @@ async def _index_selected_files(
skip, msg = await _should_skip_file(session, file, search_space_id)
if skip:
if msg and "renamed" in msg.lower():
if msg and msg.startswith("unsupported:"):
unsupported_count += 1
elif msg and "renamed" in msg.lower():
renamed_count += 1
else:
skipped += 1
@ -347,7 +353,7 @@ async def _index_selected_files(
user_id, pages_to_deduct, allow_exceed=True
)
return renamed_count + batch_indexed, skipped, errors
return renamed_count + batch_indexed, skipped, unsupported_count, errors
# ---------------------------------------------------------------------------
@ -369,8 +375,11 @@ async def _index_full_scan(
include_subfolders: bool = True,
on_heartbeat_callback: HeartbeatCallbackType | None = None,
enable_summary: bool = True,
) -> tuple[int, int]:
"""Full scan indexing of a folder."""
) -> tuple[int, int, int]:
"""Full scan indexing of a folder.
Returns (indexed, skipped, unsupported_count).
"""
await task_logger.log_task_progress(
log_entry,
f"Starting full scan of folder: {folder_name}",
@ -389,6 +398,7 @@ async def _index_full_scan(
renamed_count = 0
skipped = 0
unsupported_count = 0
files_to_download: list[dict] = []
all_files, error = await get_files_in_folder(
@ -407,7 +417,9 @@ async def _index_full_scan(
for file in all_files[:max_files]:
skip, msg = await _should_skip_file(session, file, search_space_id)
if skip:
if msg and "renamed" in msg.lower():
if msg and msg.startswith("unsupported:"):
unsupported_count += 1
elif msg and "renamed" in msg.lower():
renamed_count += 1
else:
skipped += 1
@ -450,9 +462,10 @@ async def _index_full_scan(
indexed = renamed_count + batch_indexed
logger.info(
f"Full scan complete: {indexed} indexed, {skipped} skipped, {failed} failed"
f"Full scan complete: {indexed} indexed, {skipped} skipped, "
f"{unsupported_count} unsupported, {failed} failed"
)
return indexed, skipped
return indexed, skipped, unsupported_count
async def _index_with_delta_sync(
@ -468,8 +481,11 @@ async def _index_with_delta_sync(
max_files: int,
on_heartbeat_callback: HeartbeatCallbackType | None = None,
enable_summary: bool = True,
) -> tuple[int, int, str | None]:
"""Delta sync using OneDrive change tracking. Returns (indexed, skipped, new_delta_link)."""
) -> tuple[int, int, int, str | None]:
"""Delta sync using OneDrive change tracking.
Returns (indexed, skipped, unsupported_count, new_delta_link).
"""
await task_logger.log_task_progress(
log_entry,
"Starting delta sync",
@ -489,7 +505,7 @@ async def _index_with_delta_sync(
if not changes:
logger.info("No changes detected since last sync")
return 0, 0, new_delta_link
return 0, 0, 0, new_delta_link
logger.info(f"Processing {len(changes)} delta changes")
@ -501,6 +517,7 @@ async def _index_with_delta_sync(
renamed_count = 0
skipped = 0
unsupported_count = 0
files_to_download: list[dict] = []
files_processed = 0
@ -523,7 +540,9 @@ async def _index_with_delta_sync(
skip, msg = await _should_skip_file(session, change, search_space_id)
if skip:
if msg and "renamed" in msg.lower():
if msg and msg.startswith("unsupported:"):
unsupported_count += 1
elif msg and "renamed" in msg.lower():
renamed_count += 1
else:
skipped += 1
@ -566,9 +585,10 @@ async def _index_with_delta_sync(
indexed = renamed_count + batch_indexed
logger.info(
f"Delta sync complete: {indexed} indexed, {skipped} skipped, {failed} failed"
f"Delta sync complete: {indexed} indexed, {skipped} skipped, "
f"{unsupported_count} unsupported, {failed} failed"
)
return indexed, skipped, new_delta_link
return indexed, skipped, unsupported_count, new_delta_link
# ---------------------------------------------------------------------------
@ -582,7 +602,7 @@ async def index_onedrive_files(
search_space_id: int,
user_id: str,
items_dict: dict,
) -> tuple[int, int, str | None]:
) -> tuple[int, int, str | None, int]:
"""Index OneDrive files for a specific connector.
items_dict format:
@ -609,7 +629,7 @@ async def index_onedrive_files(
await task_logger.log_task_failure(
log_entry, error_msg, None, {"error_type": "ConnectorNotFound"}
)
return 0, 0, error_msg
return 0, 0, error_msg, 0
token_encrypted = connector.config.get("_token_encrypted", False)
if token_encrypted and not config.SECRET_KEY:
@ -620,7 +640,7 @@ async def index_onedrive_files(
"Missing SECRET_KEY",
{"error_type": "MissingSecretKey"},
)
return 0, 0, error_msg
return 0, 0, error_msg, 0
connector_enable_summary = getattr(connector, "enable_summary", True)
onedrive_client = OneDriveClient(session, connector_id)
@ -632,12 +652,13 @@ async def index_onedrive_files(
total_indexed = 0
total_skipped = 0
total_unsupported = 0
# Index selected individual files
selected_files = items_dict.get("files", [])
if selected_files:
file_tuples = [(f["id"], f.get("name")) for f in selected_files]
indexed, skipped, _errors = await _index_selected_files(
indexed, skipped, unsupported, _errors = await _index_selected_files(
onedrive_client,
session,
file_tuples,
@ -648,6 +669,7 @@ async def index_onedrive_files(
)
total_indexed += indexed
total_skipped += skipped
total_unsupported += unsupported
# Index selected folders
folders = items_dict.get("folders", [])
@ -661,7 +683,7 @@ async def index_onedrive_files(
if can_use_delta:
logger.info(f"Using delta sync for folder {folder_name}")
indexed, skipped, new_delta_link = await _index_with_delta_sync(
indexed, skipped, unsup, new_delta_link = await _index_with_delta_sync(
onedrive_client,
session,
connector_id,
@ -676,6 +698,7 @@ async def index_onedrive_files(
)
total_indexed += indexed
total_skipped += skipped
total_unsupported += unsup
if new_delta_link:
await session.refresh(connector)
@ -685,7 +708,7 @@ async def index_onedrive_files(
flag_modified(connector, "config")
# Reconciliation full scan
ri, rs = await _index_full_scan(
ri, rs, ru = await _index_full_scan(
onedrive_client,
session,
connector_id,
@ -701,9 +724,10 @@ async def index_onedrive_files(
)
total_indexed += ri
total_skipped += rs
total_unsupported += ru
else:
logger.info(f"Using full scan for folder {folder_name}")
indexed, skipped = await _index_full_scan(
indexed, skipped, unsup = await _index_full_scan(
onedrive_client,
session,
connector_id,
@ -719,6 +743,7 @@ async def index_onedrive_files(
)
total_indexed += indexed
total_skipped += skipped
total_unsupported += unsup
# Store new delta link for this folder
_, new_delta_link, _ = await onedrive_client.get_delta(folder_id=folder_id)
@ -737,12 +762,18 @@ async def index_onedrive_files(
await task_logger.log_task_success(
log_entry,
f"Successfully completed OneDrive indexing for connector {connector_id}",
{"files_processed": total_indexed, "files_skipped": total_skipped},
{
"files_processed": total_indexed,
"files_skipped": total_skipped,
"files_unsupported": total_unsupported,
},
)
logger.info(
f"OneDrive indexing completed: {total_indexed} indexed, {total_skipped} skipped"
f"OneDrive indexing completed: {total_indexed} indexed, "
f"{total_skipped} skipped, {total_unsupported} unsupported"
)
return total_indexed, total_skipped, None
return total_indexed, total_skipped, None, total_unsupported
except SQLAlchemyError as db_error:
await session.rollback()
@ -753,7 +784,7 @@ async def index_onedrive_files(
{"error_type": "SQLAlchemyError"},
)
logger.error(f"Database error: {db_error!s}", exc_info=True)
return 0, 0, f"Database error: {db_error!s}"
return 0, 0, f"Database error: {db_error!s}", 0
except Exception as e:
await session.rollback()
await task_logger.log_task_failure(
@ -763,4 +794,4 @@ async def index_onedrive_files(
{"error_type": type(e).__name__},
)
logger.error(f"Failed to index OneDrive files: {e!s}", exc_info=True)
return 0, 0, f"Failed to index OneDrive files: {e!s}"
return 0, 0, f"Failed to index OneDrive files: {e!s}", 0

View file

@ -1,41 +1,17 @@
"""
Document processors module for background tasks.
This module provides a collection of document processors for different content types
and sources. Each processor is responsible for handling a specific type of document
processing task in the background.
Available processors:
- Extension processor: Handle documents from browser extension
- Markdown processor: Process markdown files
- File processors: Handle files using different ETL services (Unstructured, LlamaCloud, Docling)
- YouTube processor: Process YouTube videos and extract transcripts
Content extraction is handled by ``app.etl_pipeline.EtlPipelineService``.
This package keeps orchestration (save, notify, page-limit) and
non-ETL processors (extension, markdown, youtube).
"""
# Extension processor
# File processors (backward-compatible re-exports from _save)
from ._save import (
add_received_file_document_using_docling,
add_received_file_document_using_llamacloud,
add_received_file_document_using_unstructured,
)
from .extension_processor import add_extension_received_document
# Markdown processor
from .markdown_processor import add_received_markdown_file_document
# YouTube processor
from .youtube_processor import add_youtube_video_document
__all__ = [
# Extension processing
"add_extension_received_document",
# File processing with different ETL services
"add_received_file_document_using_docling",
"add_received_file_document_using_llamacloud",
"add_received_file_document_using_unstructured",
# Markdown file processing
"add_received_markdown_file_document",
# YouTube video processing
"add_youtube_video_document",
]

View file

@ -1,74 +0,0 @@
"""
Constants for file document processing.
Centralizes file type classification, LlamaCloud retry configuration,
and timeout calculation parameters.
"""
import ssl
from enum import Enum
import httpx
# ---------------------------------------------------------------------------
# File type classification
# ---------------------------------------------------------------------------
# Lower-case filename suffixes, grouped by the category they map to.
MARKDOWN_EXTENSIONS = (".md", ".markdown", ".txt")
AUDIO_EXTENSIONS = (".mp3", ".mp4", ".mpeg", ".mpga", ".m4a", ".wav", ".webm")
DIRECT_CONVERT_EXTENSIONS = (".csv", ".tsv", ".html", ".htm")


class FileCategory(Enum):
    """Processing category assigned to a file by ``classify_file``."""

    MARKDOWN = "markdown"
    AUDIO = "audio"
    DIRECT_CONVERT = "direct_convert"
    DOCUMENT = "document"


def classify_file(filename: str) -> FileCategory:
    """Return the processing category implied by *filename*'s suffix.

    Matching is case-insensitive. The suffix groups are checked in the order
    markdown, audio, direct-convert; any name that matches none of them is
    classified as ``FileCategory.DOCUMENT``.
    """
    normalized = filename.lower()
    suffix_table = (
        (MARKDOWN_EXTENSIONS, FileCategory.MARKDOWN),
        (AUDIO_EXTENSIONS, FileCategory.AUDIO),
        (DIRECT_CONVERT_EXTENSIONS, FileCategory.DIRECT_CONVERT),
    )
    for suffixes, category in suffix_table:
        if normalized.endswith(suffixes):
            return category
    return FileCategory.DOCUMENT
# ---------------------------------------------------------------------------
# LlamaCloud retry configuration
# ---------------------------------------------------------------------------
# Total number of parse attempts: parse_with_llamacloud_retry iterates
# attempts 1..LLAMACLOUD_MAX_RETRIES, so the first try counts toward this value.
LLAMACLOUD_MAX_RETRIES = 5
LLAMACLOUD_BASE_DELAY = 10  # seconds; delay after the first failure, doubled after each later one
LLAMACLOUD_MAX_DELAY = 120  # seconds; cap on the delay before jitter is applied (2 minutes)
# Exception types that trigger another attempt instead of propagating.
# NOTE(review): ssl.SSLError, ConnectionError, ConnectionResetError and
# TimeoutError are all subclasses of OSError, so the OSError entry already
# covers them. It also makes non-transient errors such as FileNotFoundError
# retryable; confirm that is intended.
LLAMACLOUD_RETRYABLE_EXCEPTIONS = (
    ssl.SSLError,
    httpx.ConnectError,
    httpx.ConnectTimeout,
    httpx.ReadError,
    httpx.ReadTimeout,
    httpx.WriteError,
    httpx.WriteTimeout,
    httpx.RemoteProtocolError,
    httpx.LocalProtocolError,
    ConnectionError,
    ConnectionResetError,
    TimeoutError,
    OSError,
)
# ---------------------------------------------------------------------------
# Timeout calculation constants
# ---------------------------------------------------------------------------
# Assumed upload throughput used by calculate_upload_timeout to estimate
# how long a file of a given size takes to send.
UPLOAD_BYTES_PER_SECOND_SLOW = (
    100 * 1024
)  # 100 KB/s (conservative for slow connections)
MIN_UPLOAD_TIMEOUT = 120  # seconds; floor applied to every upload timeout (2 minutes)
MAX_UPLOAD_TIMEOUT = 1800  # seconds; ceiling applied to every upload timeout (30 minutes)
BASE_JOB_TIMEOUT = 600  # seconds; fixed part of every job timeout (10 minutes)
PER_PAGE_JOB_TIMEOUT = 60  # seconds added to the job timeout per estimated page (1 minute)

View file

@ -4,8 +4,8 @@ Lossless file-to-markdown converters for text-based formats.
These converters handle file types that can be faithfully represented as
markdown without any external ETL/OCR service:
- CSV / TSV markdown table (stdlib ``csv``)
- HTML / HTM markdown (``markdownify``)
- CSV / TSV markdown table (stdlib ``csv``)
- HTML / HTM / XHTML markdown (``markdownify``)
"""
from __future__ import annotations
@ -73,6 +73,7 @@ _CONVERTER_MAP: dict[str, Callable[..., str]] = {
".tsv": tsv_to_markdown,
".html": html_to_markdown,
".htm": html_to_markdown,
".xhtml": html_to_markdown,
}

View file

@ -1,209 +0,0 @@
"""
ETL parsing strategies for different document processing services.
Provides parse functions for Unstructured, LlamaCloud, and Docling, along with
LlamaCloud retry logic and dynamic timeout calculations.
"""
import asyncio
import logging
import os
import random
import warnings
from logging import ERROR, getLogger
import httpx
from app.config import config as app_config
from app.db import Log
from app.services.task_logging_service import TaskLoggingService
from ._constants import (
LLAMACLOUD_BASE_DELAY,
LLAMACLOUD_MAX_DELAY,
LLAMACLOUD_MAX_RETRIES,
LLAMACLOUD_RETRYABLE_EXCEPTIONS,
PER_PAGE_JOB_TIMEOUT,
)
from ._helpers import calculate_job_timeout, calculate_upload_timeout
# ---------------------------------------------------------------------------
# LlamaCloud parsing with retry
# ---------------------------------------------------------------------------
async def parse_with_llamacloud_retry(
    file_path: str,
    estimated_pages: int,
    task_logger: TaskLoggingService | None = None,
    log_entry: Log | None = None,
):
    """
    Run LlamaParse on *file_path*, retrying on the configured retryable errors.

    HTTP and job timeouts are derived from the file's size on disk and from
    ``estimated_pages``. At most ``LLAMACLOUD_MAX_RETRIES`` attempts are made.
    After a failed attempt the coroutine sleeps for an exponentially growing
    delay (capped at ``LLAMACLOUD_MAX_DELAY`` before jitter) with +/-25% jitter.

    Args:
        file_path: Path of the file to parse.
        estimated_pages: Page estimate used when sizing the job timeout.
        task_logger: Optional task logger. When supplied together with
            ``log_entry``, retry notices are written through it; otherwise
            they go to ``logging.warning``.
        log_entry: Log entry handed to ``task_logger``.

    Returns:
        The result object produced by ``LlamaParse.aparse``.

    Raises:
        Exception: The last retryable error once every attempt has failed.
            Errors outside ``LLAMACLOUD_RETRYABLE_EXCEPTIONS`` propagate
            immediately without a retry.
    """
    from llama_cloud_services import LlamaParse
    from llama_cloud_services.parse.utils import ResultType

    size_in_bytes = os.path.getsize(file_path)
    file_size_mb = size_in_bytes / (1024 * 1024)

    upload_timeout = calculate_upload_timeout(size_in_bytes)
    job_timeout = calculate_job_timeout(estimated_pages, size_in_bytes)

    client_timeout = httpx.Timeout(
        connect=120.0,
        read=upload_timeout,
        write=upload_timeout,
        pool=120.0,
    )

    logging.info(
        f"LlamaCloud upload configured: file_size={file_size_mb:.1f}MB, "
        f"pages={estimated_pages}, upload_timeout={upload_timeout:.0f}s, "
        f"job_timeout={job_timeout:.0f}s"
    )

    final_error = None
    attempt_errors: list[str] = []

    for attempt in range(1, LLAMACLOUD_MAX_RETRIES + 1):
        try:
            async with httpx.AsyncClient(timeout=client_timeout) as http_client:
                parser = LlamaParse(
                    api_key=app_config.LLAMA_CLOUD_API_KEY,
                    num_workers=1,
                    verbose=True,
                    language="en",
                    result_type=ResultType.MD,
                    max_timeout=int(max(2000, job_timeout + upload_timeout)),
                    job_timeout_in_seconds=job_timeout,
                    job_timeout_extra_time_per_page_in_seconds=PER_PAGE_JOB_TIMEOUT,
                    custom_client=http_client,
                )
                parsed = await parser.aparse(file_path)

                if attempt > 1:
                    logging.info(
                        f"LlamaCloud upload succeeded on attempt {attempt} after "
                        f"{len(attempt_errors)} failures"
                    )
                return parsed

        except LLAMACLOUD_RETRYABLE_EXCEPTIONS as exc:
            final_error = exc
            error_type = type(exc).__name__
            error_msg = str(exc)[:200]
            attempt_errors.append(f"Attempt {attempt}: {error_type} - {error_msg}")

            # Out of attempts: record the full failure history and stop looping.
            if attempt >= LLAMACLOUD_MAX_RETRIES:
                logging.error(
                    f"LlamaCloud upload failed after {LLAMACLOUD_MAX_RETRIES} "
                    f"attempts. File size: {file_size_mb:.1f}MB, "
                    f"Pages: {estimated_pages}. "
                    f"Errors: {'; '.join(attempt_errors)}"
                )
                break

            # Exponential backoff with a cap, then +/-25% jitter on top.
            backoff = min(
                LLAMACLOUD_BASE_DELAY * (2 ** (attempt - 1)),
                LLAMACLOUD_MAX_DELAY,
            )
            delay = backoff + backoff * 0.25 * (2 * random.random() - 1)

            if task_logger and log_entry:
                await task_logger.log_task_progress(
                    log_entry,
                    f"LlamaCloud upload failed "
                    f"(attempt {attempt}/{LLAMACLOUD_MAX_RETRIES}), "
                    f"retrying in {delay:.0f}s",
                    {
                        "error_type": error_type,
                        "error_message": error_msg,
                        "attempt": attempt,
                        "retry_delay": delay,
                        "file_size_mb": round(file_size_mb, 1),
                        "upload_timeout": upload_timeout,
                    },
                )
            else:
                logging.warning(
                    f"LlamaCloud upload failed "
                    f"(attempt {attempt}/{LLAMACLOUD_MAX_RETRIES}): "
                    f"{error_type}. File: {file_size_mb:.1f}MB. "
                    f"Retrying in {delay:.0f}s..."
                )

            await asyncio.sleep(delay)

    raise final_error or RuntimeError(
        f"LlamaCloud parsing failed after {LLAMACLOUD_MAX_RETRIES} retries. "
        f"File size: {file_size_mb:.1f}MB"
    )
# ---------------------------------------------------------------------------
# Per-service parse functions
# ---------------------------------------------------------------------------
async def parse_with_unstructured(file_path: str):
    """
    Load *file_path* through ``UnstructuredLoader`` in "elements" mode.

    Returns:
        Whatever ``UnstructuredLoader.aload`` yields for the file
        (a list of LangChain Document elements).
    """
    from langchain_unstructured import UnstructuredLoader

    loader_options = {
        "mode": "elements",
        "post_processors": [],
        "languages": ["eng"],
        "include_orig_elements": False,
        "include_metadata": False,
        "strategy": "auto",
    }
    elements = await UnstructuredLoader(file_path, **loader_options).aload()
    return elements
async def parse_with_docling(file_path: str, filename: str) -> str:
    """
    Convert a file to markdown via the Docling service wrapper.

    While the document is processed, selected pdfminer warnings are ignored
    and the ``pdfminer`` logger is raised to ``ERROR``; the logger's previous
    level is restored afterwards, even if processing fails.

    Returns:
        The ``"content"`` entry of the service result.
    """
    from app.services.docling_service import create_docling_service

    service = create_docling_service()

    pdfminer_log = getLogger("pdfminer")
    saved_level = pdfminer_log.level

    # Keyword arguments for each "ignore" filter, installed in this order.
    ignore_rules = (
        {"category": UserWarning, "module": "pdfminer"},
        {"message": ".*Cannot set gray non-stroke color.*"},
        {"message": ".*invalid float value.*"},
    )

    with warnings.catch_warnings():
        for rule in ignore_rules:
            warnings.filterwarnings("ignore", **rule)

        pdfminer_log.setLevel(ERROR)
        try:
            outcome = await service.process_document(file_path, filename)
        finally:
            pdfminer_log.setLevel(saved_level)

    return outcome["content"]

View file

@ -11,13 +11,6 @@ from sqlalchemy.ext.asyncio import AsyncSession
from app.db import Document, DocumentStatus, DocumentType
from app.utils.document_converters import generate_unique_identifier_hash
from ._constants import (
BASE_JOB_TIMEOUT,
MAX_UPLOAD_TIMEOUT,
MIN_UPLOAD_TIMEOUT,
PER_PAGE_JOB_TIMEOUT,
UPLOAD_BYTES_PER_SECOND_SLOW,
)
from .base import (
check_document_by_unique_identifier,
check_duplicate_document,
@ -198,21 +191,3 @@ async def update_document_from_connector(
if "connector_id" in connector:
document.connector_id = connector["connector_id"]
await session.commit()
# ---------------------------------------------------------------------------
# Timeout calculations
# ---------------------------------------------------------------------------
def calculate_upload_timeout(file_size_bytes: int) -> float:
    """Return an upload timeout in seconds scaled to the file size.

    The estimate assumes ``UPLOAD_BYTES_PER_SECOND_SLOW`` throughput plus a
    50% margin, then is clamped to the
    ``MIN_UPLOAD_TIMEOUT``..``MAX_UPLOAD_TIMEOUT`` range.
    """
    padded_seconds = (file_size_bytes / UPLOAD_BYTES_PER_SECOND_SLOW) * 1.5
    capped_seconds = min(padded_seconds, MAX_UPLOAD_TIMEOUT)
    return max(MIN_UPLOAD_TIMEOUT, capped_seconds)
def calculate_job_timeout(estimated_pages: int, file_size_bytes: int) -> float:
    """Return a job-processing timeout in seconds.

    Two estimates are computed on top of ``BASE_JOB_TIMEOUT``: one adds
    ``PER_PAGE_JOB_TIMEOUT`` per estimated page, the other adds 60 seconds
    per 10 MiB of file size. The larger of the two is returned.
    """
    by_pages = BASE_JOB_TIMEOUT + (estimated_pages * PER_PAGE_JOB_TIMEOUT)
    by_size = BASE_JOB_TIMEOUT + (file_size_bytes / (10 * 1024 * 1024)) * 60
    return max(by_pages, by_size)

View file

@ -1,14 +1,9 @@
"""
Unified document save/update logic for file processors.
Replaces the three nearly-identical ``add_received_file_document_using_*``
functions with a single ``save_file_document`` function plus thin wrappers
for backward compatibility.
"""
import logging
from langchain_core.documents import Document as LangChainDocument
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.ext.asyncio import AsyncSession
@ -207,79 +202,3 @@ async def save_file_document(
raise RuntimeError(
f"Failed to process file document using {etl_service}: {e!s}"
) from e
# ---------------------------------------------------------------------------
# Backward-compatible wrapper functions
# ---------------------------------------------------------------------------
async def add_received_file_document_using_unstructured(
    session: AsyncSession,
    file_name: str,
    unstructured_processed_elements: list[LangChainDocument],
    search_space_id: int,
    user_id: str,
    connector: dict | None = None,
    enable_summary: bool = True,
) -> Document | None:
    """Convert Unstructured elements to markdown and save them as a file document.

    The elements are first turned into markdown with
    ``convert_document_to_markdown``; saving is delegated to
    ``save_file_document`` with the ETL service label ``"UNSTRUCTURED"``.

    Returns:
        The value returned by ``save_file_document``.
    """
    from app.utils.document_converters import convert_document_to_markdown

    markdown_text = await convert_document_to_markdown(
        unstructured_processed_elements
    )
    saved = await save_file_document(
        session,
        file_name,
        markdown_text,
        search_space_id,
        user_id,
        "UNSTRUCTURED",
        connector,
        enable_summary,
    )
    return saved
async def add_received_file_document_using_llamacloud(
    session: AsyncSession,
    file_name: str,
    llamacloud_markdown_document: str,
    search_space_id: int,
    user_id: str,
    connector: dict | None = None,
    enable_summary: bool = True,
) -> Document | None:
    """Save markdown produced by LlamaCloud as a file document.

    Thin wrapper that forwards its arguments to ``save_file_document`` with
    the ETL service label ``"LLAMACLOUD"``.

    Returns:
        The value returned by ``save_file_document``.
    """
    save_args = (
        session,
        file_name,
        llamacloud_markdown_document,
        search_space_id,
        user_id,
        "LLAMACLOUD",
        connector,
        enable_summary,
    )
    return await save_file_document(*save_args)
async def add_received_file_document_using_docling(
    session: AsyncSession,
    file_name: str,
    docling_markdown_document: str,
    search_space_id: int,
    user_id: str,
    connector: dict | None = None,
    enable_summary: bool = True,
) -> Document | None:
    """Save markdown produced by Docling as a file document.

    Thin wrapper that forwards its arguments to ``save_file_document`` with
    the ETL service label ``"DOCLING"``.

    Returns:
        The value returned by ``save_file_document``.
    """
    save_args = (
        session,
        file_name,
        docling_markdown_document,
        search_space_id,
        user_id,
        "DOCLING",
        connector,
        enable_summary,
    )
    return await save_file_document(*save_args)

View file

@ -1,14 +1,8 @@
"""
File document processors orchestrating content extraction and indexing.
This module is the public entry point for file processing. It delegates to
specialised sub-modules that each own a single concern:
- ``_constants`` file type classification and configuration constants
- ``_helpers`` document deduplication, migration, connector helpers
- ``_direct_converters`` lossless file-to-markdown for csv/tsv/html
- ``_etl`` ETL parsing strategies (Unstructured, LlamaCloud, Docling)
- ``_save`` unified document creation / update logic
Delegates content extraction to ``app.etl_pipeline.EtlPipelineService`` and
keeps only orchestration concerns (notifications, logging, page limits, saving).
"""
from __future__ import annotations
@ -17,38 +11,19 @@ import contextlib
import logging
import os
from dataclasses import dataclass, field
from logging import ERROR, getLogger
from fastapi import HTTPException
from sqlalchemy.ext.asyncio import AsyncSession
from app.config import config as app_config
from app.db import Document, Log, Notification
from app.services.notification_service import NotificationService
from app.services.task_logging_service import TaskLoggingService
from ._constants import FileCategory, classify_file
from ._direct_converters import convert_file_directly
from ._etl import (
parse_with_docling,
parse_with_llamacloud_retry,
parse_with_unstructured,
)
from ._helpers import update_document_from_connector
from ._save import (
add_received_file_document_using_docling,
add_received_file_document_using_llamacloud,
add_received_file_document_using_unstructured,
save_file_document,
)
from ._save import save_file_document
from .markdown_processor import add_received_markdown_file_document
# Re-export public API so existing ``from file_processors import …`` keeps working.
__all__ = [
"add_received_file_document_using_docling",
"add_received_file_document_using_llamacloud",
"add_received_file_document_using_unstructured",
"parse_with_llamacloud_retry",
"process_file_in_background",
"process_file_in_background_with_document",
"save_file_document",
@ -142,35 +117,31 @@ async def _log_page_divergence(
# ===================================================================
async def _process_markdown_upload(ctx: _ProcessingContext) -> Document | None:
"""Read a markdown / text file and create or update a document."""
await _notify(ctx, "parsing", "Reading file")
async def _process_non_document_upload(ctx: _ProcessingContext) -> Document | None:
"""Extract content from a non-document file (plaintext/direct_convert/audio) via the unified ETL pipeline."""
from app.etl_pipeline.etl_document import EtlRequest
from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
await _notify(ctx, "parsing", "Processing file")
await ctx.task_logger.log_task_progress(
ctx.log_entry,
f"Processing markdown/text file: {ctx.filename}",
{"file_type": "markdown", "processing_stage": "reading_file"},
f"Processing file: {ctx.filename}",
{"processing_stage": "extracting"},
)
with open(ctx.file_path, encoding="utf-8") as f:
markdown_content = f.read()
etl_result = await EtlPipelineService().extract(
EtlRequest(file_path=ctx.file_path, filename=ctx.filename)
)
with contextlib.suppress(Exception):
os.unlink(ctx.file_path)
await _notify(ctx, "chunking")
await ctx.task_logger.log_task_progress(
ctx.log_entry,
f"Creating document from markdown content: {ctx.filename}",
{
"processing_stage": "creating_document",
"content_length": len(markdown_content),
},
)
result = await add_received_markdown_file_document(
ctx.session,
ctx.filename,
markdown_content,
etl_result.markdown_content,
ctx.search_space_id,
ctx.user_id,
ctx.connector,
@ -181,179 +152,19 @@ async def _process_markdown_upload(ctx: _ProcessingContext) -> Document | None:
if result:
await ctx.task_logger.log_task_success(
ctx.log_entry,
f"Successfully processed markdown file: {ctx.filename}",
f"Successfully processed file: {ctx.filename}",
{
"document_id": result.id,
"content_hash": result.content_hash,
"file_type": "markdown",
"file_type": etl_result.content_type,
"etl_service": etl_result.etl_service,
},
)
else:
await ctx.task_logger.log_task_success(
ctx.log_entry,
f"Markdown file already exists (duplicate): {ctx.filename}",
{"duplicate_detected": True, "file_type": "markdown"},
)
return result
async def _process_direct_convert_upload(ctx: _ProcessingContext) -> Document | None:
    """Convert a csv/tsv/html upload to markdown locally and save it as a document.

    Steps, in order: notify "parsing", convert the file with
    ``convert_file_directly``, delete the source file (best effort), notify
    "chunking", store the markdown via ``add_received_markdown_file_document``,
    apply connector metadata when a connector is set, and log the outcome.

    Returns:
        The stored document, or the falsy value returned by the save helper
        (logged here as a duplicate).
    """
    await _notify(ctx, "parsing", "Converting file")
    await ctx.task_logger.log_task_progress(
        ctx.log_entry,
        f"Direct-converting file to markdown: {ctx.filename}",
        {"file_type": "direct_convert", "processing_stage": "converting"},
    )

    converted = convert_file_directly(ctx.file_path, ctx.filename)

    # Best-effort cleanup: a failed delete must not abort processing.
    try:
        os.unlink(ctx.file_path)
    except Exception:
        pass

    await _notify(ctx, "chunking")

    creation_details = {
        "processing_stage": "creating_document",
        "content_length": len(converted),
    }
    await ctx.task_logger.log_task_progress(
        ctx.log_entry,
        f"Creating document from converted content: {ctx.filename}",
        creation_details,
    )

    document = await add_received_markdown_file_document(
        ctx.session,
        ctx.filename,
        converted,
        ctx.search_space_id,
        ctx.user_id,
        ctx.connector,
    )
    if ctx.connector:
        await update_document_from_connector(document, ctx.connector, ctx.session)

    # A falsy result is reported as a duplicate rather than a failure.
    if not document:
        await ctx.task_logger.log_task_success(
            ctx.log_entry,
            f"Direct-converted file already exists (duplicate): {ctx.filename}",
            {"duplicate_detected": True, "file_type": "direct_convert"},
        )
        return document

    await ctx.task_logger.log_task_success(
        ctx.log_entry,
        f"Successfully direct-converted file: {ctx.filename}",
        {
            "document_id": document.id,
            "content_hash": document.content_hash,
            "file_type": "direct_convert",
        },
    )
    return document
async def _process_audio_upload(ctx: _ProcessingContext) -> Document | None:
"""Transcribe an audio file and create or update a document."""
await _notify(ctx, "parsing", "Transcribing audio")
await ctx.task_logger.log_task_progress(
ctx.log_entry,
f"Processing audio file for transcription: {ctx.filename}",
{"file_type": "audio", "processing_stage": "starting_transcription"},
)
stt_service_type = (
"local"
if app_config.STT_SERVICE and app_config.STT_SERVICE.startswith("local/")
else "external"
)
if stt_service_type == "local":
from app.services.stt_service import stt_service
try:
stt_result = stt_service.transcribe_file(ctx.file_path)
transcribed_text = stt_result.get("text", "")
if not transcribed_text:
raise ValueError("Transcription returned empty text")
transcribed_text = (
f"# Transcription of {ctx.filename}\n\n{transcribed_text}"
)
except Exception as e:
raise HTTPException(
status_code=422,
detail=f"Failed to transcribe audio file {ctx.filename}: {e!s}",
) from e
await ctx.task_logger.log_task_progress(
ctx.log_entry,
f"Local STT transcription completed: {ctx.filename}",
{
"processing_stage": "local_transcription_complete",
"language": stt_result.get("language"),
"confidence": stt_result.get("language_probability"),
"duration": stt_result.get("duration"),
},
)
else:
from litellm import atranscription
with open(ctx.file_path, "rb") as audio_file:
transcription_kwargs: dict = {
"model": app_config.STT_SERVICE,
"file": audio_file,
"api_key": app_config.STT_SERVICE_API_KEY,
}
if app_config.STT_SERVICE_API_BASE:
transcription_kwargs["api_base"] = app_config.STT_SERVICE_API_BASE
transcription_response = await atranscription(**transcription_kwargs)
transcribed_text = transcription_response.get("text", "")
if not transcribed_text:
raise ValueError("Transcription returned empty text")
transcribed_text = f"# Transcription of {ctx.filename}\n\n{transcribed_text}"
await ctx.task_logger.log_task_progress(
ctx.log_entry,
f"Transcription completed, creating document: {ctx.filename}",
{
"processing_stage": "transcription_complete",
"transcript_length": len(transcribed_text),
},
)
await _notify(ctx, "chunking")
with contextlib.suppress(Exception):
os.unlink(ctx.file_path)
result = await add_received_markdown_file_document(
ctx.session,
ctx.filename,
transcribed_text,
ctx.search_space_id,
ctx.user_id,
ctx.connector,
)
if ctx.connector:
await update_document_from_connector(result, ctx.connector, ctx.session)
if result:
await ctx.task_logger.log_task_success(
ctx.log_entry,
f"Successfully transcribed and processed audio file: {ctx.filename}",
{
"document_id": result.id,
"content_hash": result.content_hash,
"file_type": "audio",
"transcript_length": len(transcribed_text),
"stt_service": stt_service_type,
},
)
else:
await ctx.task_logger.log_task_success(
ctx.log_entry,
f"Audio file transcript already exists (duplicate): {ctx.filename}",
{"duplicate_detected": True, "file_type": "audio"},
f"File already exists (duplicate): {ctx.filename}",
{"duplicate_detected": True, "file_type": etl_result.content_type},
)
return result
@ -363,279 +174,10 @@ async def _process_audio_upload(ctx: _ProcessingContext) -> Document | None:
# ---------------------------------------------------------------------------
async def _etl_unstructured(
    ctx: _ProcessingContext,
    page_limit_service,
    estimated_pages: int,
) -> Document | None:
    """Parse a document with Unstructured, save it, and charge the page usage.

    Args:
        ctx: Processing context for the file being handled.
        page_limit_service: Service used to estimate pages from the parsed
            elements and to record page usage.
        estimated_pages: Page estimate made before parsing. The charged page
            count is the larger of this and the post-parse estimate.

    Returns:
        The stored document, or the falsy value returned by the save helper
        (logged here as a duplicate; no pages are charged in that case).
    """
    await _notify(ctx, "parsing", "Extracting content")
    await ctx.task_logger.log_task_progress(
        ctx.log_entry,
        f"Processing file with Unstructured ETL: {ctx.filename}",
        {
            "file_type": "document",
            "etl_service": "UNSTRUCTURED",
            "processing_stage": "loading",
        },
    )

    elements = await parse_with_unstructured(ctx.file_path)
    element_count = len(elements)

    await _notify(ctx, "chunking", chunks_count=element_count)
    await ctx.task_logger.log_task_progress(
        ctx.log_entry,
        f"Unstructured ETL completed, creating document: {ctx.filename}",
        {"processing_stage": "etl_complete", "elements_count": element_count},
    )

    actual_pages = page_limit_service.estimate_pages_from_elements(elements)
    final_pages = max(estimated_pages, actual_pages)
    await _log_page_divergence(
        ctx.task_logger,
        ctx.log_entry,
        ctx.filename,
        estimated_pages,
        actual_pages,
        final_pages,
    )

    # Best-effort cleanup: a failed delete must not abort processing.
    try:
        os.unlink(ctx.file_path)
    except Exception:
        pass

    document = await add_received_file_document_using_unstructured(
        ctx.session,
        ctx.filename,
        elements,
        ctx.search_space_id,
        ctx.user_id,
        ctx.connector,
        enable_summary=ctx.enable_summary,
    )
    if ctx.connector:
        await update_document_from_connector(document, ctx.connector, ctx.session)

    # A falsy result is reported as a duplicate and leaves page usage untouched.
    if not document:
        await ctx.task_logger.log_task_success(
            ctx.log_entry,
            f"Document already exists (duplicate): {ctx.filename}",
            {
                "duplicate_detected": True,
                "file_type": "document",
                "etl_service": "UNSTRUCTURED",
            },
        )
        return document

    await page_limit_service.update_page_usage(
        ctx.user_id, final_pages, allow_exceed=True
    )
    await ctx.task_logger.log_task_success(
        ctx.log_entry,
        f"Successfully processed file with Unstructured: {ctx.filename}",
        {
            "document_id": document.id,
            "content_hash": document.content_hash,
            "file_type": "document",
            "etl_service": "UNSTRUCTURED",
            "pages_processed": final_pages,
        },
    )
    return document
async def _etl_llamacloud(
    ctx: _ProcessingContext,
    page_limit_service,
    estimated_pages: int,
) -> Document | None:
    """Extract a file through LlamaCloud and store each markdown document.

    Args:
        ctx: Processing context (session, file path/name, loggers, connector).
        page_limit_service: Used to estimate pages from the markdown output
            and to record page usage.
        estimated_pages: Pre-parse page estimate, also handed to the parser.

    Returns:
        The last document that was saved, or ``None`` when nothing was saved
        (reported as a duplicate).

    Raises:
        ValueError: If LlamaCloud returns an empty document list.
    """
    await _notify(ctx, "parsing", "Extracting content")
    await ctx.task_logger.log_task_progress(
        ctx.log_entry,
        f"Processing file with LlamaCloud ETL: {ctx.filename}",
        {
            "file_type": "document",
            "etl_service": "LLAMACLOUD",
            "processing_stage": "parsing",
            "estimated_pages": estimated_pages,
        },
    )

    parse_result = await parse_with_llamacloud_retry(
        file_path=ctx.file_path,
        estimated_pages=estimated_pages,
        task_logger=ctx.task_logger,
        log_entry=ctx.log_entry,
    )

    # Best-effort removal of the input file; the parser has already read it.
    with contextlib.suppress(Exception):
        os.unlink(ctx.file_path)

    markdown_documents = await parse_result.aget_markdown_documents(
        split_by_page=False
    )
    doc_count = len(markdown_documents)

    await _notify(ctx, "chunking", chunks_count=doc_count)
    await ctx.task_logger.log_task_progress(
        ctx.log_entry,
        f"LlamaCloud parsing completed, creating documents: {ctx.filename}",
        {
            "processing_stage": "parsing_complete",
            "documents_count": doc_count,
        },
    )

    if not markdown_documents:
        await ctx.task_logger.log_task_failure(
            ctx.log_entry,
            f"LlamaCloud parsing returned no documents: {ctx.filename}",
            "ETL service returned empty document list",
            {"error_type": "EmptyDocumentList", "etl_service": "LLAMACLOUD"},
        )
        raise ValueError(f"LlamaCloud parsing returned no documents for {ctx.filename}")

    actual_pages = page_limit_service.estimate_pages_from_markdown(markdown_documents)
    final_pages = max(estimated_pages, actual_pages)
    await _log_page_divergence(
        ctx.task_logger,
        ctx.log_entry,
        ctx.filename,
        estimated_pages,
        actual_pages,
        final_pages,
    )

    # Save every markdown document; remember the most recent successful save.
    last_doc: Document | None = None
    for markdown_doc in markdown_documents:
        saved = await add_received_file_document_using_llamacloud(
            ctx.session,
            ctx.filename,
            llamacloud_markdown_document=markdown_doc.text,
            search_space_id=ctx.search_space_id,
            user_id=ctx.user_id,
            connector=ctx.connector,
            enable_summary=ctx.enable_summary,
        )
        if saved:
            last_doc = saved

    if last_doc is None:
        # Nothing was saved: every save call came back empty.
        await ctx.task_logger.log_task_success(
            ctx.log_entry,
            f"Document already exists (duplicate): {ctx.filename}",
            {
                "duplicate_detected": True,
                "file_type": "document",
                "etl_service": "LLAMACLOUD",
                "documents_count": doc_count,
            },
        )
        return None

    await page_limit_service.update_page_usage(
        ctx.user_id, final_pages, allow_exceed=True
    )
    if ctx.connector:
        await update_document_from_connector(last_doc, ctx.connector, ctx.session)
    await ctx.task_logger.log_task_success(
        ctx.log_entry,
        f"Successfully processed file with LlamaCloud: {ctx.filename}",
        {
            "document_id": last_doc.id,
            "content_hash": last_doc.content_hash,
            "file_type": "document",
            "etl_service": "LLAMACLOUD",
            "pages_processed": final_pages,
            "documents_count": doc_count,
        },
    )
    return last_doc
async def _etl_docling(
    ctx: _ProcessingContext,
    page_limit_service,
    estimated_pages: int,
) -> Document | None:
    """Extract a file with Docling and store the resulting markdown.

    Args:
        ctx: Processing context (session, file path/name, loggers, connector).
        page_limit_service: Used to estimate pages from the content length
            and to record page usage.
        estimated_pages: Page estimate computed before parsing.

    Returns:
        The saved document, or a falsy value when the save step produced
        nothing (which is logged as a duplicate).
    """
    await _notify(ctx, "parsing", "Extracting content")
    await ctx.task_logger.log_task_progress(
        ctx.log_entry,
        f"Processing file with Docling ETL: {ctx.filename}",
        {
            "file_type": "document",
            "etl_service": "DOCLING",
            "processing_stage": "parsing",
        },
    )

    markdown = await parse_with_docling(ctx.file_path, ctx.filename)

    # Best-effort removal of the input file once parsing is done.
    with contextlib.suppress(Exception):
        os.unlink(ctx.file_path)

    content_length = len(markdown)
    await ctx.task_logger.log_task_progress(
        ctx.log_entry,
        f"Docling parsing completed, creating document: {ctx.filename}",
        {"processing_stage": "parsing_complete", "content_length": content_length},
    )

    actual_pages = page_limit_service.estimate_pages_from_content_length(
        content_length
    )
    final_pages = max(estimated_pages, actual_pages)
    await _log_page_divergence(
        ctx.task_logger,
        ctx.log_entry,
        ctx.filename,
        estimated_pages,
        actual_pages,
        final_pages,
    )

    await _notify(ctx, "chunking")
    saved = await add_received_file_document_using_docling(
        ctx.session,
        ctx.filename,
        docling_markdown_document=markdown,
        search_space_id=ctx.search_space_id,
        user_id=ctx.user_id,
        connector=ctx.connector,
        enable_summary=ctx.enable_summary,
    )

    if not saved:
        await ctx.task_logger.log_task_success(
            ctx.log_entry,
            f"Document already exists (duplicate): {ctx.filename}",
            {
                "duplicate_detected": True,
                "file_type": "document",
                "etl_service": "DOCLING",
            },
        )
        return saved

    await page_limit_service.update_page_usage(
        ctx.user_id, final_pages, allow_exceed=True
    )
    if ctx.connector:
        await update_document_from_connector(saved, ctx.connector, ctx.session)
    await ctx.task_logger.log_task_success(
        ctx.log_entry,
        f"Successfully processed file with Docling: {ctx.filename}",
        {
            "document_id": saved.id,
            "content_hash": saved.content_hash,
            "file_type": "document",
            "etl_service": "DOCLING",
            "pages_processed": final_pages,
        },
    )
    return saved
async def _process_document_upload(ctx: _ProcessingContext) -> Document | None:
"""Route a document file to the configured ETL service."""
"""Route a document file to the configured ETL service via the unified pipeline."""
from app.etl_pipeline.etl_document import EtlRequest
from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
from app.services.page_limit_service import PageLimitExceededError, PageLimitService
page_limit_service = PageLimitService(ctx.session)
@ -665,16 +207,60 @@ async def _process_document_upload(ctx: _ProcessingContext) -> Document | None:
os.unlink(ctx.file_path)
raise HTTPException(status_code=403, detail=str(e)) from e
etl_dispatch = {
"UNSTRUCTURED": _etl_unstructured,
"LLAMACLOUD": _etl_llamacloud,
"DOCLING": _etl_docling,
}
handler = etl_dispatch.get(app_config.ETL_SERVICE)
if handler is None:
raise RuntimeError(f"Unknown ETL_SERVICE: {app_config.ETL_SERVICE}")
await _notify(ctx, "parsing", "Extracting content")
return await handler(ctx, page_limit_service, estimated_pages)
etl_result = await EtlPipelineService().extract(
EtlRequest(
file_path=ctx.file_path,
filename=ctx.filename,
estimated_pages=estimated_pages,
)
)
with contextlib.suppress(Exception):
os.unlink(ctx.file_path)
await _notify(ctx, "chunking")
result = await save_file_document(
ctx.session,
ctx.filename,
etl_result.markdown_content,
ctx.search_space_id,
ctx.user_id,
etl_result.etl_service,
ctx.connector,
enable_summary=ctx.enable_summary,
)
if result:
await page_limit_service.update_page_usage(
ctx.user_id, estimated_pages, allow_exceed=True
)
if ctx.connector:
await update_document_from_connector(result, ctx.connector, ctx.session)
await ctx.task_logger.log_task_success(
ctx.log_entry,
f"Successfully processed file: {ctx.filename}",
{
"document_id": result.id,
"content_hash": result.content_hash,
"file_type": "document",
"etl_service": etl_result.etl_service,
"pages_processed": estimated_pages,
},
)
else:
await ctx.task_logger.log_task_success(
ctx.log_entry,
f"Document already exists (duplicate): {ctx.filename}",
{
"duplicate_detected": True,
"file_type": "document",
"etl_service": etl_result.etl_service,
},
)
return result
# ===================================================================
@ -706,15 +292,16 @@ async def process_file_in_background(
)
try:
category = classify_file(filename)
from app.etl_pipeline.file_classifier import (
FileCategory as EtlFileCategory,
classify_file as etl_classify,
)
if category == FileCategory.MARKDOWN:
return await _process_markdown_upload(ctx)
if category == FileCategory.DIRECT_CONVERT:
return await _process_direct_convert_upload(ctx)
if category == FileCategory.AUDIO:
return await _process_audio_upload(ctx)
return await _process_document_upload(ctx)
category = etl_classify(filename)
if category == EtlFileCategory.DOCUMENT:
return await _process_document_upload(ctx)
return await _process_non_document_upload(ctx)
except Exception as e:
await session.rollback()
@ -758,201 +345,64 @@ async def _extract_file_content(
Returns:
Tuple of (markdown_content, etl_service_name).
"""
category = classify_file(filename)
if category == FileCategory.MARKDOWN:
if notification:
await NotificationService.document_processing.notify_processing_progress(
session,
notification,
stage="parsing",
stage_message="Reading file",
)
await task_logger.log_task_progress(
log_entry,
f"Processing markdown/text file: {filename}",
{"file_type": "markdown", "processing_stage": "reading_file"},
)
with open(file_path, encoding="utf-8") as f:
content = f.read()
with contextlib.suppress(Exception):
os.unlink(file_path)
return content, "MARKDOWN"
if category == FileCategory.DIRECT_CONVERT:
if notification:
await NotificationService.document_processing.notify_processing_progress(
session,
notification,
stage="parsing",
stage_message="Converting file",
)
await task_logger.log_task_progress(
log_entry,
f"Direct-converting file to markdown: {filename}",
{"file_type": "direct_convert", "processing_stage": "converting"},
)
content = convert_file_directly(file_path, filename)
with contextlib.suppress(Exception):
os.unlink(file_path)
return content, "DIRECT_CONVERT"
if category == FileCategory.AUDIO:
if notification:
await NotificationService.document_processing.notify_processing_progress(
session,
notification,
stage="parsing",
stage_message="Transcribing audio",
)
await task_logger.log_task_progress(
log_entry,
f"Processing audio file for transcription: {filename}",
{"file_type": "audio", "processing_stage": "starting_transcription"},
)
transcribed_text = await _transcribe_audio(file_path, filename)
with contextlib.suppress(Exception):
os.unlink(file_path)
return transcribed_text, "AUDIO_TRANSCRIPTION"
# Document file — use ETL service
return await _extract_document_content(
file_path,
filename,
session,
user_id,
task_logger,
log_entry,
notification,
from app.etl_pipeline.etl_document import EtlRequest
from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
from app.etl_pipeline.file_classifier import (
FileCategory,
classify_file as etl_classify,
)
async def _transcribe_audio(file_path: str, filename: str) -> str:
    """Turn an audio file into a markdown transcript.

    Uses the in-process STT service when ``STT_SERVICE`` starts with
    ``local/``; otherwise sends the file through ``litellm.atranscription``.

    Raises:
        ValueError: If the transcription text comes back empty.
    """
    configured_service = app_config.STT_SERVICE
    use_local = bool(configured_service and configured_service.startswith("local/"))

    if use_local:
        from app.services.stt_service import stt_service

        text = stt_service.transcribe_file(file_path).get("text", "")
    else:
        from litellm import atranscription

        with open(file_path, "rb") as audio_file:
            request: dict = {
                "model": app_config.STT_SERVICE,
                "file": audio_file,
                "api_key": app_config.STT_SERVICE_API_KEY,
            }
            # The API base is optional and only forwarded when configured.
            if app_config.STT_SERVICE_API_BASE:
                request["api_base"] = app_config.STT_SERVICE_API_BASE
            response = await atranscription(**request)
        text = response.get("text", "")

    if not text:
        raise ValueError("Transcription returned empty text")
    return f"# Transcription of {filename}\n\n{text}"
async def _extract_document_content(
file_path: str,
filename: str,
session: AsyncSession,
user_id: str,
task_logger: TaskLoggingService,
log_entry: Log,
notification: Notification | None,
) -> tuple[str, str]:
"""
Parse a document file via the configured ETL service.
Returns:
Tuple of (markdown_content, etl_service_name).
"""
from app.services.page_limit_service import PageLimitService
page_limit_service = PageLimitService(session)
try:
estimated_pages = page_limit_service.estimate_pages_before_processing(file_path)
except Exception:
file_size = os.path.getsize(file_path)
estimated_pages = max(1, file_size // (80 * 1024))
await page_limit_service.check_page_limit(user_id, estimated_pages)
etl_service = app_config.ETL_SERVICE
markdown_content: str | None = None
category = etl_classify(filename)
estimated_pages = 0
if notification:
stage_messages = {
FileCategory.PLAINTEXT: "Reading file",
FileCategory.DIRECT_CONVERT: "Converting file",
FileCategory.AUDIO: "Transcribing audio",
FileCategory.UNSUPPORTED: "Unsupported file type",
FileCategory.DOCUMENT: "Extracting content",
}
await NotificationService.document_processing.notify_processing_progress(
session,
notification,
stage="parsing",
stage_message="Extracting content",
stage_message=stage_messages.get(category, "Processing"),
)
if etl_service == "UNSTRUCTURED":
from app.utils.document_converters import convert_document_to_markdown
await task_logger.log_task_progress(
log_entry,
f"Processing {category.value} file: {filename}",
{"file_type": category.value, "processing_stage": "extracting"},
)
docs = await parse_with_unstructured(file_path)
markdown_content = await convert_document_to_markdown(docs)
actual_pages = page_limit_service.estimate_pages_from_elements(docs)
final_pages = max(estimated_pages, actual_pages)
await page_limit_service.update_page_usage(
user_id, final_pages, allow_exceed=True
)
if category == FileCategory.DOCUMENT:
from app.services.page_limit_service import PageLimitService
elif etl_service == "LLAMACLOUD":
raw_result = await parse_with_llamacloud_retry(
page_limit_service = PageLimitService(session)
estimated_pages = _estimate_pages_safe(page_limit_service, file_path)
await page_limit_service.check_page_limit(user_id, estimated_pages)
result = await EtlPipelineService().extract(
EtlRequest(
file_path=file_path,
filename=filename,
estimated_pages=estimated_pages,
task_logger=task_logger,
log_entry=log_entry,
)
markdown_documents = await raw_result.aget_markdown_documents(
split_by_page=False
)
if not markdown_documents:
raise RuntimeError(f"LlamaCloud parsing returned no documents: {filename}")
markdown_content = markdown_documents[0].text
)
if category == FileCategory.DOCUMENT:
await page_limit_service.update_page_usage(
user_id, estimated_pages, allow_exceed=True
)
elif etl_service == "DOCLING":
getLogger("docling.pipeline.base_pipeline").setLevel(ERROR)
getLogger("docling.document_converter").setLevel(ERROR)
getLogger("docling_core.transforms.chunker.hierarchical_chunker").setLevel(
ERROR
)
from docling.document_converter import DocumentConverter
converter = DocumentConverter()
result = converter.convert(file_path)
markdown_content = result.document.export_to_markdown()
await page_limit_service.update_page_usage(
user_id, estimated_pages, allow_exceed=True
)
else:
raise RuntimeError(f"Unknown ETL_SERVICE: {etl_service}")
with contextlib.suppress(Exception):
os.unlink(file_path)
if not markdown_content:
if not result.markdown_content:
raise RuntimeError(f"Failed to extract content from file: {filename}")
return markdown_content, etl_service
return result.markdown_content, result.etl_service
async def process_file_in_background_with_document(

View file

@ -0,0 +1,124 @@
"""Per-parser document extension sets for the ETL pipeline.
Every consumer (file_classifier, connector-level skip checks, ETL pipeline
validation) imports from here so there is a single source of truth.
Extensions already covered by PLAINTEXT_EXTENSIONS, AUDIO_EXTENSIONS, or
DIRECT_CONVERT_EXTENSIONS in file_classifier are NOT repeated here -- these
sets are exclusively for the "document" ETL path (Docling / LlamaParse /
Unstructured).
"""
from pathlib import PurePosixPath
# ---------------------------------------------------------------------------
# Per-parser document extension sets (from official documentation)
# ---------------------------------------------------------------------------
# Each set is spelled as a whitespace-separated string and split, so related
# extensions stay together on one line.

DOCLING_DOCUMENT_EXTENSIONS: frozenset[str] = frozenset(
    (
        # Documents
        ".pdf .docx .xlsx .pptx "
        # Images
        ".png .jpg .jpeg .tiff .tif .bmp .webp"
    ).split()
)

LLAMAPARSE_DOCUMENT_EXTENSIONS: frozenset[str] = frozenset(
    (
        # PDF and Microsoft Office families
        ".pdf .docx .doc .xlsx .xls .pptx .ppt "
        ".docm .dot .dotm .pptm .pot .potx .xlsm .xlsb .xlw "
        # Other text formats
        ".rtf .epub "
        # Images
        ".png .jpg .jpeg .gif .bmp .tiff .tif .webp .svg "
        # OpenDocument and Hangul
        ".odt .ods .odp .hwp .hwpx"
    ).split()
)

UNSTRUCTURED_DOCUMENT_EXTENSIONS: frozenset[str] = frozenset(
    (
        # PDF and Microsoft Office families
        ".pdf .docx .doc .xlsx .xls .pptx .ppt "
        # Images
        ".png .jpg .jpeg .bmp .tiff .tif .heic "
        # Other text formats
        ".rtf .epub .odt "
        # Email
        ".eml .msg .p7s"
    ).split()
)

# ---------------------------------------------------------------------------
# Union (used by classify_file for routing) + service lookup
# ---------------------------------------------------------------------------

DOCUMENT_EXTENSIONS: frozenset[str] = frozenset().union(
    DOCLING_DOCUMENT_EXTENSIONS,
    LLAMAPARSE_DOCUMENT_EXTENSIONS,
    UNSTRUCTURED_DOCUMENT_EXTENSIONS,
)

# ETL_SERVICE name -> extensions that parser accepts.
_SERVICE_MAP: dict[str, frozenset[str]] = dict(
    DOCLING=DOCLING_DOCUMENT_EXTENSIONS,
    LLAMACLOUD=LLAMAPARSE_DOCUMENT_EXTENSIONS,
    UNSTRUCTURED=UNSTRUCTURED_DOCUMENT_EXTENSIONS,
)
def get_document_extensions_for_service(etl_service: str | None) -> frozenset[str]:
    """Look up the document extensions that *etl_service* can parse.

    ``None``, an empty string, or a name missing from the service map all
    yield the union of every parser's extensions.
    """
    if not etl_service:
        return DOCUMENT_EXTENSIONS
    return _SERVICE_MAP.get(etl_service, DOCUMENT_EXTENSIONS)
def is_supported_document_extension(filename: str) -> bool:
    """Tell whether *filename* ends in an extension from the document union.

    The comparison is case-insensitive: the suffix is lowercased first.
    """
    return PurePosixPath(filename).suffix.lower() in DOCUMENT_EXTENSIONS

View file

@ -319,31 +319,23 @@ def _mock_etl_parsing(monkeypatch):
# -- LlamaParse mock (external API) --------------------------------
class _FakeMarkdownDoc:
def __init__(self, text: str):
self.text = text
class _FakeLlamaParseResult:
async def aget_markdown_documents(self, *, split_by_page=False):
return [_FakeMarkdownDoc(_MOCK_ETL_MARKDOWN)]
async def _fake_llamacloud_parse(**kwargs):
_reject_empty(kwargs["file_path"])
return _FakeLlamaParseResult()
async def _fake_llamacloud_parse(file_path: str, estimated_pages: int) -> str:
_reject_empty(file_path)
return _MOCK_ETL_MARKDOWN
monkeypatch.setattr(
"app.tasks.document_processors.file_processors.parse_with_llamacloud_retry",
"app.etl_pipeline.parsers.llamacloud.parse_with_llamacloud",
_fake_llamacloud_parse,
)
# -- Docling mock (heavy library boundary) -------------------------
async def _fake_docling_parse(file_path: str, filename: str):
async def _fake_docling_parse(file_path: str, filename: str) -> str:
_reject_empty(file_path)
return _MOCK_ETL_MARKDOWN
monkeypatch.setattr(
"app.tasks.document_processors.file_processors.parse_with_docling",
"app.etl_pipeline.parsers.docling.parse_with_docling",
_fake_docling_parse,
)

View file

@ -124,7 +124,7 @@ async def test_composio_connector_without_account_id_returns_error(
maker = make_session_factory(async_engine)
async with maker() as session:
count, _skipped, error = await index_google_drive_files(
count, _skipped, error, _unsupported = await index_google_drive_files(
session=session,
connector_id=data["connector_id"],
search_space_id=data["search_space_id"],

View file

@ -0,0 +1,244 @@
"""Tests that each cloud connector's download_and_extract_content correctly
produces markdown from a real file via the unified ETL pipeline.
Only the cloud client is mocked (system boundary). The ETL pipeline runs for
real so we know the full path from "cloud gives us bytes" to "we get markdown
back" actually works.
"""
from unittest.mock import AsyncMock, MagicMock
import pytest
pytestmark = pytest.mark.unit
_TXT_CONTENT = "Hello from the cloud connector test."
_CSV_CONTENT = "name,age\nAlice,30\nBob,25\n"
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
async def _write_file(dest_path: str, content: str) -> None:
"""Simulate a cloud client writing downloaded bytes to disk."""
with open(dest_path, "w", encoding="utf-8") as f:
f.write(content)
def _make_download_side_effect(content: str):
    """Build an async mock side effect that fakes a successful download.

    The returned coroutine function treats its last positional argument as
    the destination path, writes *content* there, and returns ``None``.
    """

    async def _download_stub(*call_args):
        await _write_file(call_args[-1], content)

    return _download_stub
# ===================================================================
# Google Drive
# ===================================================================
class TestGoogleDriveContentExtraction:
    """Google Drive extraction tests; only the Drive client is mocked."""

    async def test_txt_file_returns_markdown(self):
        """A text download yields its content plus Drive id/name metadata."""
        from app.connectors.google_drive.content_extractor import (
            download_and_extract_content,
        )

        drive_client = MagicMock(
            download_file_to_disk=AsyncMock(
                side_effect=_make_download_side_effect(_TXT_CONTENT),
            )
        )
        drive_file = dict(id="f1", name="notes.txt", mimeType="text/plain")

        content, meta, err = await download_and_extract_content(
            drive_client, drive_file
        )

        assert err is None
        assert _TXT_CONTENT in content
        assert meta["google_drive_file_id"] == "f1"
        assert meta["google_drive_file_name"] == "notes.txt"

    async def test_csv_file_returns_markdown_table(self):
        """A CSV download is rendered with its cells and table pipes."""
        from app.connectors.google_drive.content_extractor import (
            download_and_extract_content,
        )

        drive_client = MagicMock(
            download_file_to_disk=AsyncMock(
                side_effect=_make_download_side_effect(_CSV_CONTENT),
            )
        )
        drive_file = dict(id="f2", name="data.csv", mimeType="text/csv")

        content, _meta, err = await download_and_extract_content(
            drive_client, drive_file
        )

        assert err is None
        assert "Alice" in content
        assert "Bob" in content
        assert "|" in content

    async def test_download_error_returns_error_message(self):
        """A download error string is passed through and no markdown is made."""
        from app.connectors.google_drive.content_extractor import (
            download_and_extract_content,
        )

        drive_client = MagicMock(
            download_file_to_disk=AsyncMock(return_value="Network timeout")
        )
        drive_file = dict(id="f3", name="doc.txt", mimeType="text/plain")

        content, _meta, err = await download_and_extract_content(
            drive_client, drive_file
        )

        assert content is None
        assert err == "Network timeout"
# ===================================================================
# OneDrive
# ===================================================================
class TestOneDriveContentExtraction:
    """OneDrive extraction tests; only the OneDrive client is mocked."""

    async def test_txt_file_returns_markdown(self):
        """A text download yields its content plus OneDrive id/name metadata."""
        from app.connectors.onedrive.content_extractor import (
            download_and_extract_content,
        )

        onedrive_client = MagicMock(
            download_file_to_disk=AsyncMock(
                side_effect=_make_download_side_effect(_TXT_CONTENT),
            )
        )
        item = dict(id="od-1", name="report.txt", file={"mimeType": "text/plain"})

        content, meta, err = await download_and_extract_content(onedrive_client, item)

        assert err is None
        assert _TXT_CONTENT in content
        assert meta["onedrive_file_id"] == "od-1"
        assert meta["onedrive_file_name"] == "report.txt"

    async def test_csv_file_returns_markdown_table(self):
        """A CSV download is rendered with its cells and table pipes."""
        from app.connectors.onedrive.content_extractor import (
            download_and_extract_content,
        )

        onedrive_client = MagicMock(
            download_file_to_disk=AsyncMock(
                side_effect=_make_download_side_effect(_CSV_CONTENT),
            )
        )
        item = dict(id="od-2", name="data.csv", file={"mimeType": "text/csv"})

        content, _meta, err = await download_and_extract_content(onedrive_client, item)

        assert err is None
        assert "Alice" in content
        assert "|" in content

    async def test_download_error_returns_error_message(self):
        """A download error string is passed through and no markdown is made."""
        from app.connectors.onedrive.content_extractor import (
            download_and_extract_content,
        )

        onedrive_client = MagicMock(
            download_file_to_disk=AsyncMock(return_value="403 Forbidden")
        )
        item = dict(id="od-3", name="secret.txt", file={"mimeType": "text/plain"})

        content, _meta, err = await download_and_extract_content(onedrive_client, item)

        assert content is None
        assert err == "403 Forbidden"
# ===================================================================
# Dropbox
# ===================================================================
class TestDropboxContentExtraction:
    """Dropbox extraction tests; only the Dropbox client is mocked."""

    async def test_txt_file_returns_markdown(self):
        """A text download yields its content plus Dropbox id/name metadata."""
        from app.connectors.dropbox.content_extractor import (
            download_and_extract_content,
        )

        dropbox_client = MagicMock(
            download_file_to_disk=AsyncMock(
                side_effect=_make_download_side_effect(_TXT_CONTENT),
            )
        )
        entry = {
            "id": "dbx-1",
            "name": "memo.txt",
            ".tag": "file",
            "path_lower": "/memo.txt",
        }

        content, meta, err = await download_and_extract_content(dropbox_client, entry)

        assert err is None
        assert _TXT_CONTENT in content
        assert meta["dropbox_file_id"] == "dbx-1"
        assert meta["dropbox_file_name"] == "memo.txt"

    async def test_csv_file_returns_markdown_table(self):
        """A CSV download is rendered with its cells and table pipes."""
        from app.connectors.dropbox.content_extractor import (
            download_and_extract_content,
        )

        dropbox_client = MagicMock(
            download_file_to_disk=AsyncMock(
                side_effect=_make_download_side_effect(_CSV_CONTENT),
            )
        )
        entry = {
            "id": "dbx-2",
            "name": "data.csv",
            ".tag": "file",
            "path_lower": "/data.csv",
        }

        content, _meta, err = await download_and_extract_content(dropbox_client, entry)

        assert err is None
        assert "Alice" in content
        assert "|" in content

    async def test_download_error_returns_error_message(self):
        """A download error string is passed through and no markdown is made."""
        from app.connectors.dropbox.content_extractor import (
            download_and_extract_content,
        )

        dropbox_client = MagicMock(
            download_file_to_disk=AsyncMock(return_value="Rate limited")
        )
        entry = {
            "id": "dbx-3",
            "name": "big.txt",
            ".tag": "file",
            "path_lower": "/big.txt",
        }

        content, _meta, err = await download_and_extract_content(dropbox_client, entry)

        assert content is None
        assert err == "Rate limited"

View file

@ -8,6 +8,10 @@ import pytest
from app.db import DocumentType
from app.tasks.connector_indexers.dropbox_indexer import (
_download_files_parallel,
_index_full_scan,
_index_selected_files,
_index_with_delta_sync,
index_dropbox_files,
)
pytestmark = pytest.mark.unit
@ -234,3 +238,610 @@ async def test_heartbeat_fires_during_parallel_downloads(
assert len(docs) == 3
assert failed == 0
assert len(heartbeat_calls) >= 1, "Heartbeat should have fired at least once"
# ---------------------------------------------------------------------------
# D1-D2: _index_full_scan tests
# ---------------------------------------------------------------------------
def _folder_dict(name: str) -> dict:
return {".tag": "folder", "name": name}
@pytest.fixture
def full_scan_mocks(mock_dropbox_client, monkeypatch):
    """Provide the patched collaborators needed to run ``_index_full_scan`` alone.

    Patches ``_should_skip_file``, ``_download_and_index`` and
    ``PageLimitService`` on the indexer module, pins ``ETL_SERVICE`` to
    ``LLAMACLOUD``, and returns the mocks in a dict keyed by role.
    """
    import app.tasks.connector_indexers.dropbox_indexer as indexer

    session = AsyncMock()
    task_logger = MagicMock(log_task_progress=AsyncMock())
    log_entry = MagicMock()

    # Tests fill this in to force a skip decision for a given file id.
    skip_results: dict[str, tuple[bool, str | None]] = {}

    monkeypatch.setattr("app.config.config.ETL_SERVICE", "LLAMACLOUD")

    async def _fake_skip(session, file, search_space_id):
        from app.connectors.dropbox.file_types import should_skip_file as _skip

        item_skip, unsup_ext = _skip(file)
        if not item_skip:
            # Not skipped by file type: defer to the per-test override table.
            return skip_results.get(file.get("id", ""), (False, None))
        reason = f"unsupported:{unsup_ext}" if unsup_ext else "folder/non-downloadable"
        return True, reason

    monkeypatch.setattr(indexer, "_should_skip_file", _fake_skip)

    download_and_index_mock = AsyncMock(return_value=(0, 0))
    monkeypatch.setattr(indexer, "_download_and_index", download_and_index_mock)

    from app.services.page_limit_service import PageLimitService as _RealPLS

    page_usage_mock = AsyncMock(return_value=(0, 999_999))
    update_usage_mock = AsyncMock()

    class _MockPageLimitService:
        # Keep the real metadata-based estimator; only usage calls are mocked.
        estimate_pages_from_metadata = staticmethod(
            _RealPLS.estimate_pages_from_metadata
        )

        def __init__(self, session):
            self.get_page_usage = page_usage_mock
            self.update_page_usage = update_usage_mock

    monkeypatch.setattr(indexer, "PageLimitService", _MockPageLimitService)

    return dict(
        dropbox_client=mock_dropbox_client,
        session=session,
        task_logger=task_logger,
        log_entry=log_entry,
        skip_results=skip_results,
        download_and_index_mock=download_and_index_mock,
    )
async def _run_full_scan(mocks, monkeypatch, page_files, *, max_files=500):
    """Call ``_index_full_scan`` with the folder listing stubbed to *page_files*."""
    import app.tasks.connector_indexers.dropbox_indexer as indexer

    listing_stub = AsyncMock(return_value=(page_files, None))
    monkeypatch.setattr(indexer, "get_files_in_folder", listing_stub)

    positional_args = (
        mocks["dropbox_client"],
        mocks["session"],
        _CONNECTOR_ID,
        _SEARCH_SPACE_ID,
        _USER_ID,
        "",
        "Root",
        mocks["task_logger"],
        mocks["log_entry"],
        max_files,
    )
    return await _index_full_scan(*positional_args, enable_summary=True)
async def test_full_scan_three_phase_counts(full_scan_mocks, monkeypatch):
    """Skips are excluded, a rename counts as indexed, new files are downloaded."""
    listing = [
        _folder_dict("SubFolder"),
        _make_file_dict("skip1", "unchanged.txt"),
        _make_file_dict("rename1", "renamed.txt"),
        _make_file_dict("new1", "new1.txt"),
        _make_file_dict("new2", "new2.txt"),
    ]
    full_scan_mocks["skip_results"].update(
        {
            "skip1": (True, "unchanged"),
            "rename1": (True, "File renamed: 'old' -> 'renamed.txt'"),
        }
    )
    batch_mock = full_scan_mocks["download_and_index_mock"]
    batch_mock.return_value = (2, 0)

    indexed, skipped, _unsupported = await _run_full_scan(
        full_scan_mocks, monkeypatch, listing
    )

    assert indexed == 3  # 1 renamed + 2 from batch
    assert skipped == 2  # 1 folder + 1 unchanged

    # The third positional argument of the batch call is the file list.
    batched_files = batch_mock.call_args[0][2]
    assert len(batched_files) == 2
    assert {f["id"] for f in batched_files} == {"new1", "new2"}
async def test_full_scan_respects_max_files(full_scan_mocks, monkeypatch):
    """With ten files listed and ``max_files=3``, only three reach the batch."""
    listing = [_make_file_dict(f"f{i}", f"file{i}.txt") for i in range(10)]
    batch_mock = full_scan_mocks["download_and_index_mock"]
    batch_mock.return_value = (3, 0)

    await _run_full_scan(full_scan_mocks, monkeypatch, listing, max_files=3)

    batched_files = batch_mock.call_args[0][2]
    assert len(batched_files) == 3
# ---------------------------------------------------------------------------
# D3-D5: _index_selected_files tests
# ---------------------------------------------------------------------------
@pytest.fixture
def selected_files_mocks(mock_dropbox_client, monkeypatch):
    """Provide the patched collaborators for ``_index_selected_files`` tests.

    Patches ``get_file_by_path``, ``_should_skip_file``,
    ``_download_and_index`` and ``PageLimitService`` on the indexer module and
    returns the mocks and their lookup tables in a dict.
    """
    import app.tasks.connector_indexers.dropbox_indexer as indexer

    session = AsyncMock()

    # path -> (file dict or None, error or None); unknown paths give an error.
    get_file_results: dict[str, tuple[dict | None, str | None]] = {}

    async def _fake_get_file(client, path):
        return get_file_results.get(path, (None, f"Not configured: {path}"))

    monkeypatch.setattr(indexer, "get_file_by_path", _fake_get_file)

    # file id -> forced skip decision; unlisted ids are not skipped.
    skip_results: dict[str, tuple[bool, str | None]] = {}

    async def _fake_skip(session, file, search_space_id):
        return skip_results.get(file["id"], (False, None))

    monkeypatch.setattr(indexer, "_should_skip_file", _fake_skip)

    download_and_index_mock = AsyncMock(return_value=(0, 0))
    monkeypatch.setattr(indexer, "_download_and_index", download_and_index_mock)

    from app.services.page_limit_service import PageLimitService as _RealPLS

    page_usage_mock = AsyncMock(return_value=(0, 999_999))
    update_usage_mock = AsyncMock()

    class _MockPageLimitService:
        # Keep the real metadata-based estimator; only usage calls are mocked.
        estimate_pages_from_metadata = staticmethod(
            _RealPLS.estimate_pages_from_metadata
        )

        def __init__(self, session):
            self.get_page_usage = page_usage_mock
            self.update_page_usage = update_usage_mock

    monkeypatch.setattr(indexer, "PageLimitService", _MockPageLimitService)

    return dict(
        dropbox_client=mock_dropbox_client,
        session=session,
        get_file_results=get_file_results,
        skip_results=skip_results,
        download_and_index_mock=download_and_index_mock,
    )
async def _run_selected(mocks, file_tuples):
    """Call ``_index_selected_files`` for *file_tuples* using the fixture mocks."""
    call_kwargs = dict(
        connector_id=_CONNECTOR_ID,
        search_space_id=_SEARCH_SPACE_ID,
        user_id=_USER_ID,
        enable_summary=True,
    )
    return await _index_selected_files(
        mocks["dropbox_client"], mocks["session"], file_tuples, **call_kwargs
    )
async def test_selected_files_single_file_indexed(selected_files_mocks):
    """One resolvable file is indexed with no skips and no errors."""
    selected_files_mocks["get_file_results"].update(
        {"/report.pdf": (_make_file_dict("f1", "report.pdf"), None)}
    )
    selected_files_mocks["download_and_index_mock"].return_value = (1, 0)

    selection = [("/report.pdf", "report.pdf")]
    indexed, skipped, _unsupported, errors = await _run_selected(
        selected_files_mocks, selection
    )

    assert indexed == 1
    assert skipped == 0
    assert errors == []
async def test_selected_files_fetch_failure_isolation(selected_files_mocks):
    """Only the file whose lookup returned an error ends up in ``errors``."""
    lookups = selected_files_mocks["get_file_results"]
    lookups["/first.txt"] = (_make_file_dict("f1", "first.txt"), None)
    lookups["/mid.txt"] = (None, "HTTP 404")
    lookups["/third.txt"] = (_make_file_dict("f3", "third.txt"), None)
    selected_files_mocks["download_and_index_mock"].return_value = (2, 0)

    names = ("first.txt", "mid.txt", "third.txt")
    selection = [(f"/{name}", name) for name in names]
    indexed, skipped, _unsupported, errors = await _run_selected(
        selected_files_mocks,
        selection,
    )

    assert indexed == 2
    assert skipped == 0
    assert len(errors) == 1
    assert "mid.txt" in errors[0]
async def test_selected_files_skip_rename_counting(selected_files_mocks):
    """Renames count as indexed, unchanged files as skipped, new files are batched."""
    # path -> (file id, file name); insertion order is the selection order.
    catalogue = {
        "/unchanged.txt": ("s1", "unchanged.txt"),
        "/renamed.txt": ("r1", "renamed.txt"),
        "/new1.txt": ("n1", "new1.txt"),
        "/new2.txt": ("n2", "new2.txt"),
    }
    for path, (fid, fname) in catalogue.items():
        selected_files_mocks["get_file_results"][path] = (
            _make_file_dict(fid, fname),
            None,
        )

    skip_table = selected_files_mocks["skip_results"]
    skip_table["s1"] = (True, "unchanged")
    skip_table["r1"] = (True, "File renamed: 'old' -> 'renamed.txt'")

    download_mock = selected_files_mocks["download_and_index_mock"]
    download_mock.return_value = (2, 0)

    selection = [(path, fname) for path, (_fid, fname) in catalogue.items()]
    indexed, skipped, _unsupported, errors = await _run_selected(
        selected_files_mocks,
        selection,
    )

    assert indexed == 3  # 1 renamed + 2 batch
    assert skipped == 1
    assert errors == []

    # Third positional argument of the download call is the batch of files.
    batch = download_mock.call_args[0][2]
    assert len(batch) == 2
    assert {f["id"] for f in batch} == {"n1", "n2"}
# ---------------------------------------------------------------------------
# E1-E4: _index_with_delta_sync tests
# ---------------------------------------------------------------------------
async def test_delta_sync_deletions_call_remove_document(monkeypatch):
    """E1: deleted entries are processed via _remove_document."""
    import app.tasks.connector_indexers.dropbox_indexer as _mod

    def _deleted(name, file_id):
        # Shape of a "deleted" change entry as used throughout these tests.
        return {
            ".tag": "deleted",
            "name": name,
            "path_lower": f"/{name}",
            "id": file_id,
        }

    entries = [
        _deleted("gone.txt", "id:del1"),
        _deleted("also_gone.pdf", "id:del2"),
    ]

    client = MagicMock()
    client.get_changes = AsyncMock(return_value=(entries, "new-cursor", None))

    removed_ids: list[str] = []

    async def _fake_remove(session, file_id, search_space_id):
        removed_ids.append(file_id)

    monkeypatch.setattr(_mod, "_remove_document", _fake_remove)
    monkeypatch.setattr(_mod, "_download_and_index", AsyncMock(return_value=(0, 0)))

    task_logger = MagicMock()
    task_logger.log_task_progress = AsyncMock()

    result = await _index_with_delta_sync(
        client,
        AsyncMock(),
        _CONNECTOR_ID,
        _SEARCH_SPACE_ID,
        _USER_ID,
        "old-cursor",
        task_logger,
        MagicMock(),
        max_files=500,
        enable_summary=True,
    )
    _indexed, _skipped, _unsupported, cursor = result

    assert sorted(removed_ids) == ["id:del1", "id:del2"]
    assert cursor == "new-cursor"
async def test_delta_sync_upserts_filtered_and_downloaded(monkeypatch):
    """E2: modified/new file entries go through skip filter then download+index."""
    import app.tasks.connector_indexers.dropbox_indexer as _mod

    changed = [
        _make_file_dict(file_id, name)
        for file_id, name in (("mod1", "modified1.txt"), ("mod2", "modified2.txt"))
    ]

    client = MagicMock()
    client.get_changes = AsyncMock(return_value=(changed, "cursor-v2", None))

    never_skip = AsyncMock(return_value=(False, None))
    monkeypatch.setattr(_mod, "_should_skip_file", never_skip)
    download = AsyncMock(return_value=(2, 0))
    monkeypatch.setattr(_mod, "_download_and_index", download)

    task_logger = MagicMock()
    task_logger.log_task_progress = AsyncMock()

    indexed, skipped, _unsupported, cursor = await _index_with_delta_sync(
        client,
        AsyncMock(),
        _CONNECTOR_ID,
        _SEARCH_SPACE_ID,
        _USER_ID,
        "cursor-v1",
        task_logger,
        MagicMock(),
        max_files=500,
        enable_summary=True,
    )

    assert indexed == 2
    assert skipped == 0
    assert cursor == "cursor-v2"

    # Third positional argument of the download call is the batch of files.
    batch = download.call_args[0][2]
    assert len(batch) == 2
    assert {f["id"] for f in batch} == {"mod1", "mod2"}
async def test_delta_sync_mix_deletions_and_upserts(monkeypatch):
    """E3: deletions processed, then remaining upserts filtered and indexed."""
    import app.tasks.connector_indexers.dropbox_indexer as _mod

    def _deleted(name, file_id):
        # Shape of a "deleted" change entry as used throughout these tests.
        return {
            ".tag": "deleted",
            "name": name,
            "path_lower": f"/{name}",
            "id": file_id,
        }

    entries = [
        _deleted("removed.txt", "id:del1"),
        _deleted("trashed.pdf", "id:del2"),
        _make_file_dict("mod1", "updated.txt"),
        _make_file_dict("new1", "brandnew.docx"),
    ]

    client = MagicMock()
    client.get_changes = AsyncMock(return_value=(entries, "final-cursor", None))

    removed_ids: list[str] = []

    async def _fake_remove(session, file_id, search_space_id):
        removed_ids.append(file_id)

    monkeypatch.setattr(_mod, "_remove_document", _fake_remove)
    monkeypatch.setattr(
        _mod, "_should_skip_file", AsyncMock(return_value=(False, None))
    )
    download = AsyncMock(return_value=(2, 0))
    monkeypatch.setattr(_mod, "_download_and_index", download)

    task_logger = MagicMock()
    task_logger.log_task_progress = AsyncMock()

    indexed, skipped, _unsupported, cursor = await _index_with_delta_sync(
        client,
        AsyncMock(),
        _CONNECTOR_ID,
        _SEARCH_SPACE_ID,
        _USER_ID,
        "old-cursor",
        task_logger,
        MagicMock(),
        max_files=500,
        enable_summary=True,
    )

    assert sorted(removed_ids) == ["id:del1", "id:del2"]
    assert indexed == 2
    assert skipped == 0
    assert cursor == "final-cursor"

    batch = download.call_args[0][2]
    assert {f["id"] for f in batch} == {"mod1", "new1"}
async def test_delta_sync_returns_new_cursor(monkeypatch):
    """E4: the new cursor from the API response is returned."""
    import app.tasks.connector_indexers.dropbox_indexer as _mod

    # No change entries at all: only the cursor hand-off is under test.
    client = MagicMock()
    client.get_changes = AsyncMock(return_value=([], "brand-new-cursor-xyz", None))
    monkeypatch.setattr(_mod, "_download_and_index", AsyncMock(return_value=(0, 0)))

    task_logger = MagicMock()
    task_logger.log_task_progress = AsyncMock()

    result = await _index_with_delta_sync(
        client,
        AsyncMock(),
        _CONNECTOR_ID,
        _SEARCH_SPACE_ID,
        _USER_ID,
        "old-cursor",
        task_logger,
        MagicMock(),
        max_files=500,
        enable_summary=True,
    )
    indexed, skipped, _unsupported, cursor = result

    assert cursor == "brand-new-cursor-xyz"
    assert indexed == 0
    assert skipped == 0
# ---------------------------------------------------------------------------
# F1-F3: index_dropbox_files orchestrator tests
# ---------------------------------------------------------------------------
@pytest.fixture
def orchestrator_mocks(monkeypatch):
    """Wire up mocks for index_dropbox_files orchestrator tests.

    Replaces the indexer module's connector lookup, task logger, last-indexed
    updater, both sync strategies and the Dropbox client with mocks, and
    returns the ones individual tests need to inspect or reconfigure.
    """
    import app.tasks.connector_indexers.dropbox_indexer as _mod

    connector = MagicMock()
    connector.config = {"_token_encrypted": False}
    connector.last_indexed_at = None
    connector.enable_summary = True

    task_logger = MagicMock()
    task_logger.log_task_start = AsyncMock(return_value=MagicMock())
    for method_name in ("log_task_progress", "log_task_success", "log_task_failure"):
        setattr(task_logger, method_name, AsyncMock())

    full_scan = AsyncMock(return_value=(5, 2, 0))
    delta_sync = AsyncMock(return_value=(3, 1, 0, "delta-cursor-new"))

    client = MagicMock()
    client.get_latest_cursor = AsyncMock(return_value=("latest-cursor-abc", None))

    # Attribute name on the indexer module -> replacement object.
    replacements = {
        "get_connector_by_id": AsyncMock(return_value=connector),
        "TaskLoggingService": MagicMock(return_value=task_logger),
        "update_connector_last_indexed": AsyncMock(),
        "_index_full_scan": full_scan,
        "_index_with_delta_sync": delta_sync,
        "DropboxClient": MagicMock(return_value=client),
    }
    for attr, replacement in replacements.items():
        monkeypatch.setattr(_mod, attr, replacement)

    return {
        "connector": connector,
        "full_scan_mock": full_scan,
        "delta_sync_mock": delta_sync,
        "mock_client": client,
    }
async def test_orchestrator_uses_delta_sync_when_cursor_and_last_indexed(
    orchestrator_mocks,
):
    """F1: with cursor + last_indexed_at + use_delta_sync, calls delta sync."""
    from datetime import UTC, datetime

    connector = orchestrator_mocks["connector"]
    connector.config = {
        "_token_encrypted": False,
        "folder_cursors": {"/docs": "saved-cursor-123"},
    }
    connector.last_indexed_at = datetime(2026, 1, 1, tzinfo=UTC)

    session = AsyncMock()
    session.commit = AsyncMock()

    selection = {
        "folders": [{"path": "/docs", "name": "Docs"}],
        "files": [],
        "indexing_options": {"use_delta_sync": True},
    }
    outcome = await index_dropbox_files(
        session,
        _CONNECTOR_ID,
        _SEARCH_SPACE_ID,
        _USER_ID,
        selection,
    )
    _indexed, _skipped, error, _unsupported = outcome

    assert error is None
    orchestrator_mocks["delta_sync_mock"].assert_called_once()
    orchestrator_mocks["full_scan_mock"].assert_not_called()
async def test_orchestrator_falls_back_to_full_scan_without_cursor(
    orchestrator_mocks,
):
    """F2: without cursor, falls back to full scan."""
    connector = orchestrator_mocks["connector"]
    # No "folder_cursors" entry and never indexed before.
    connector.config = {"_token_encrypted": False}
    connector.last_indexed_at = None

    session = AsyncMock()
    session.commit = AsyncMock()

    selection = {
        "folders": [{"path": "/docs", "name": "Docs"}],
        "files": [],
        "indexing_options": {"use_delta_sync": True},
    }
    outcome = await index_dropbox_files(
        session,
        _CONNECTOR_ID,
        _SEARCH_SPACE_ID,
        _USER_ID,
        selection,
    )
    _indexed, _skipped, error, _unsupported = outcome

    assert error is None
    orchestrator_mocks["full_scan_mock"].assert_called_once()
    orchestrator_mocks["delta_sync_mock"].assert_not_called()
async def test_orchestrator_persists_cursor_after_sync(orchestrator_mocks):
    """F3: after sync, persists new cursor to connector config."""
    connector = orchestrator_mocks["connector"]
    connector.config = {"_token_encrypted": False}
    connector.last_indexed_at = None

    session = AsyncMock()
    session.commit = AsyncMock()

    selection = {
        "folders": [{"path": "/docs", "name": "Docs"}],
        "files": [],
    }
    await index_dropbox_files(
        session,
        _CONNECTOR_ID,
        _SEARCH_SPACE_ID,
        _USER_ID,
        selection,
    )

    # The value is the one the fixture's mocked get_latest_cursor returns.
    assert "folder_cursors" in connector.config
    assert connector.config["folder_cursors"]["/docs"] == "latest-cursor-abc"

View file

@ -366,7 +366,7 @@ async def test_full_scan_three_phase_counts(full_scan_mocks, monkeypatch):
full_scan_mocks["download_mock"].return_value = (mock_docs, 0)
full_scan_mocks["batch_mock"].return_value = ([], 2, 0)
indexed, skipped = await _run_full_scan(full_scan_mocks)
indexed, skipped, _unsupported = await _run_full_scan(full_scan_mocks)
assert indexed == 3 # 1 renamed + 2 from batch
assert skipped == 1 # 1 unchanged
@ -497,7 +497,7 @@ async def test_delta_sync_removals_serial_rest_parallel(monkeypatch):
mock_task_logger = MagicMock()
mock_task_logger.log_task_progress = AsyncMock()
indexed, skipped = await _index_with_delta_sync(
indexed, skipped, _unsupported = await _index_with_delta_sync(
MagicMock(),
mock_session,
MagicMock(),
@ -589,7 +589,7 @@ async def test_selected_files_single_file_indexed(selected_files_mocks):
)
selected_files_mocks["download_and_index_mock"].return_value = (1, 0)
indexed, skipped, errors = await _run_selected(
indexed, skipped, _unsup, errors = await _run_selected(
selected_files_mocks,
[("f1", "report.pdf")],
)
@ -613,7 +613,7 @@ async def test_selected_files_fetch_failure_isolation(selected_files_mocks):
)
selected_files_mocks["download_and_index_mock"].return_value = (2, 0)
indexed, skipped, errors = await _run_selected(
indexed, skipped, _unsup, errors = await _run_selected(
selected_files_mocks,
[("f1", "first.txt"), ("f2", "mid.txt"), ("f3", "third.txt")],
)
@ -647,7 +647,7 @@ async def test_selected_files_skip_rename_counting(selected_files_mocks):
selected_files_mocks["download_and_index_mock"].return_value = (2, 0)
indexed, skipped, errors = await _run_selected(
indexed, skipped, _unsup, errors = await _run_selected(
selected_files_mocks,
[
("s1", "unchanged.txt"),

View file

@ -198,7 +198,7 @@ async def test_gdrive_files_within_quota_are_downloaded(gdrive_selected_mocks):
)
m["download_and_index_mock"].return_value = (3, 0)
indexed, _skipped, errors = await _run_gdrive_selected(
indexed, _skipped, _unsup, errors = await _run_gdrive_selected(
m, [("f1", "f1.xyz"), ("f2", "f2.xyz"), ("f3", "f3.xyz")]
)
@ -219,7 +219,9 @@ async def test_gdrive_files_exceeding_quota_rejected(gdrive_selected_mocks):
None,
)
indexed, _skipped, errors = await _run_gdrive_selected(m, [("big", "huge.pdf")])
indexed, _skipped, _unsup, errors = await _run_gdrive_selected(
m, [("big", "huge.pdf")]
)
assert indexed == 0
assert len(errors) == 1
@ -239,7 +241,7 @@ async def test_gdrive_quota_mix_partial_indexing(gdrive_selected_mocks):
)
m["download_and_index_mock"].return_value = (2, 0)
indexed, _skipped, errors = await _run_gdrive_selected(
indexed, _skipped, _unsup, errors = await _run_gdrive_selected(
m, [("f1", "f1.xyz"), ("f2", "f2.xyz"), ("f3", "f3.xyz")]
)
@ -299,7 +301,7 @@ async def test_gdrive_zero_quota_rejects_all(gdrive_selected_mocks):
None,
)
indexed, _skipped, errors = await _run_gdrive_selected(
indexed, _skipped, _unsup, errors = await _run_gdrive_selected(
m, [("f1", "f1.xyz"), ("f2", "f2.xyz")]
)
@ -384,7 +386,7 @@ async def test_gdrive_full_scan_skips_over_quota(gdrive_full_scan_mocks, monkeyp
m["download_mock"].return_value = ([], 0)
m["batch_mock"].return_value = ([], 2, 0)
_indexed, skipped = await _run_gdrive_full_scan(m)
_indexed, skipped, _unsup = await _run_gdrive_full_scan(m)
call_files = m["download_mock"].call_args[0][1]
assert len(call_files) == 2
@ -459,7 +461,7 @@ async def test_gdrive_delta_sync_skips_over_quota(monkeypatch):
mock_task_logger = MagicMock()
mock_task_logger.log_task_progress = AsyncMock()
_indexed, skipped = await _mod._index_with_delta_sync(
_indexed, skipped, _unsupported = await _mod._index_with_delta_sync(
MagicMock(),
session,
MagicMock(),
@ -552,7 +554,9 @@ async def test_onedrive_over_quota_rejected(onedrive_selected_mocks):
None,
)
indexed, _skipped, errors = await _run_onedrive_selected(m, [("big", "huge.pdf")])
indexed, _skipped, _unsup, errors = await _run_onedrive_selected(
m, [("big", "huge.pdf")]
)
assert indexed == 0
assert len(errors) == 1
@ -652,7 +656,7 @@ async def test_dropbox_over_quota_rejected(dropbox_selected_mocks):
None,
)
indexed, _skipped, errors = await _run_dropbox_selected(
indexed, _skipped, _unsup, errors = await _run_dropbox_selected(
m, [("/huge.pdf", "huge.pdf")]
)

View file

@ -0,0 +1,123 @@
"""Tests for DropboxClient delta-sync methods (get_latest_cursor, get_changes)."""
from unittest.mock import AsyncMock, MagicMock
import pytest
from app.connectors.dropbox.client import DropboxClient
pytestmark = pytest.mark.unit
def _make_client() -> DropboxClient:
    """Build a DropboxClient without running ``__init__``.

    The session is a ``MagicMock`` and the connector id is fixed to 1, so no
    real database is needed.
    """
    instance = DropboxClient.__new__(DropboxClient)
    instance._session = MagicMock()
    instance._connector_id = 1
    return instance
# ---------- C1: get_latest_cursor ----------
async def test_get_latest_cursor_returns_cursor_string(monkeypatch):
    """A 200 response yields the cursor; the request carries the expected payload."""
    client = _make_client()

    response = MagicMock()
    response.status_code = 200
    response.json.return_value = {"cursor": "AAHbKxRZ9enq…"}
    request_mock = AsyncMock(return_value=response)
    monkeypatch.setattr(client, "_request", request_mock)

    cursor, error = await client.get_latest_cursor("/my-folder")

    assert cursor == "AAHbKxRZ9enq…"
    assert error is None

    expected_payload = {
        "path": "/my-folder",
        "recursive": False,
        "include_non_downloadable_files": True,
    }
    request_mock.assert_called_once_with(
        "/2/files/list_folder/get_latest_cursor",
        expected_payload,
    )
# ---------- C2: get_changes returns entries and new cursor ----------
async def test_get_changes_returns_entries_and_cursor(monkeypatch):
    """A single-page response returns its entries and cursor with no error."""
    client = _make_client()

    payload = {
        "entries": [
            {".tag": "file", "name": "new.txt", "id": "id:abc"},
            {".tag": "deleted", "name": "old.txt"},
        ],
        "cursor": "cursor-v2",
        "has_more": False,
    }
    response = MagicMock()
    response.status_code = 200
    response.json.return_value = payload
    monkeypatch.setattr(client, "_request", AsyncMock(return_value=response))

    entries, new_cursor, error = await client.get_changes("cursor-v1")

    assert error is None
    assert new_cursor == "cursor-v2"
    assert len(entries) == 2
    first, second = entries[0], entries[1]
    assert first["name"] == "new.txt"
    assert second[".tag"] == "deleted"
# ---------- C3: get_changes handles pagination ----------
async def test_get_changes_handles_pagination(monkeypatch):
    """Entries from every page are merged and the final cursor is returned.

    Also checks that the second request carries the cursor from the first
    page; without that check a client that kept re-sending the original
    cursor would still pass.
    """
    client = _make_client()

    page1 = MagicMock()
    page1.status_code = 200
    page1.json.return_value = {
        "entries": [{".tag": "file", "name": "a.txt", "id": "id:a"}],
        "cursor": "cursor-page2",
        "has_more": True,
    }

    page2 = MagicMock()
    page2.status_code = 200
    page2.json.return_value = {
        "entries": [{".tag": "file", "name": "b.txt", "id": "id:b"}],
        "cursor": "cursor-final",
        "has_more": False,
    }

    request_mock = AsyncMock(side_effect=[page1, page2])
    monkeypatch.setattr(client, "_request", request_mock)

    entries, new_cursor, error = await client.get_changes("cursor-v1")

    assert error is None
    assert new_cursor == "cursor-final"
    assert len(entries) == 2
    assert {e["name"] for e in entries} == {"a.txt", "b.txt"}
    assert request_mock.call_count == 2

    # The request body is the second positional argument of _request.
    sent_cursors = [c.args[1]["cursor"] for c in request_mock.call_args_list]
    assert sent_cursors == ["cursor-v1", "cursor-page2"]
# ---------- C4: get_changes returns an error on 401 ----------
async def test_get_changes_returns_error_on_401(monkeypatch):
    """A 401 response yields no entries, no cursor and an error mentioning 401."""
    client = _make_client()

    unauthorized = MagicMock()
    unauthorized.status_code = 401
    unauthorized.text = "Unauthorized"
    monkeypatch.setattr(client, "_request", AsyncMock(return_value=unauthorized))

    entries, new_cursor, error = await client.get_changes("old-cursor")

    assert error is not None
    assert "401" in error
    assert entries == []
    assert new_cursor is None

View file

@ -0,0 +1,173 @@
"""Tests for Dropbox file type filtering (should_skip_file)."""
import pytest
from app.connectors.dropbox.file_types import should_skip_file
pytestmark = pytest.mark.unit
# ---------------------------------------------------------------------------
# Structural skips (independent of ETL service)
# ---------------------------------------------------------------------------
def test_folder_item_is_skipped():
    """A folder entry is skipped and reports no unsupported extension."""
    folder_entry = {".tag": "folder", "name": "My Folder"}
    skipped, extension = should_skip_file(folder_entry)
    assert skipped is True
    assert extension is None
def test_paper_file_is_not_skipped():
    """A ``.paper`` file is kept even though it is flagged non-downloadable."""
    paper_entry = {".tag": "file", "name": "notes.paper", "is_downloadable": False}
    skipped, extension = should_skip_file(paper_entry)
    assert skipped is False
    assert extension is None
def test_non_downloadable_item_is_skipped():
    """A non-downloadable, non-paper file is skipped with no extension reported."""
    locked_entry = {".tag": "file", "name": "locked.gdoc", "is_downloadable": False}
    skipped, extension = should_skip_file(locked_entry)
    assert skipped is True
    assert extension is None
# ---------------------------------------------------------------------------
# Extension-based skips (require ETL service context)
# ---------------------------------------------------------------------------
@pytest.mark.parametrize(
    "filename",
    [
        "archive.zip",
        "backup.tar",
        "data.gz",
        "stuff.rar",
        "pack.7z",
        "program.exe",
        "lib.dll",
        "module.so",
        "image.dmg",
        "disk.iso",
        "movie.mov",
        "clip.avi",
        "video.mkv",
        "film.wmv",
        "stream.flv",
        "favicon.ico",
        "raw.cr2",
        "photo.nef",
        "image.arw",
        "pic.dng",
        "design.psd",
        "vector.ai",
        "mockup.sketch",
        "proto.fig",
        "font.ttf",
        "font.otf",
        "font.woff",
        "font.woff2",
        "model.stl",
        "scene.fbx",
        "mesh.blend",
        "local.db",
        "data.sqlite",
        "access.mdb",
    ],
)
def test_non_parseable_extensions_are_skipped(filename, mocker):
    """With DOCLING configured, each listed file is skipped and an extension is reported."""
    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
    entry = {".tag": "file", "name": filename}

    skipped, extension = should_skip_file(entry)

    assert skipped is True, f"{filename} should be skipped"
    assert extension is not None
@pytest.mark.parametrize(
    "filename",
    [
        "report.pdf",
        "document.docx",
        "sheet.xlsx",
        "slides.pptx",
        "readme.txt",
        "data.csv",
        "page.html",
        "notes.md",
        "config.json",
        "feed.xml",
    ],
)
def test_parseable_documents_are_not_skipped(filename, mocker):
    """Files in plaintext/direct_convert/universal document sets are never skipped."""
    etl_services = ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED")
    for service in etl_services:
        mocker.patch("app.config.config.ETL_SERVICE", service)
        entry = {".tag": "file", "name": filename}
        skipped, extension = should_skip_file(entry)
        assert skipped is False, f"{filename} should NOT be skipped with {service}"
        assert extension is None
@pytest.mark.parametrize(
    "filename",
    ["photo.jpg", "image.jpeg", "screenshot.png", "scan.bmp", "page.tiff", "doc.tif"],
)
def test_universal_images_are_not_skipped(filename, mocker):
    """Images supported by all parsers are never skipped."""
    etl_services = ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED")
    for service in etl_services:
        mocker.patch("app.config.config.ETL_SERVICE", service)
        entry = {".tag": "file", "name": filename}
        skipped, extension = should_skip_file(entry)
        assert skipped is False, f"{filename} should NOT be skipped with {service}"
        assert extension is None
@pytest.mark.parametrize(
    "filename,service,expected_skip",
    [
        ("old.doc", "DOCLING", True),
        ("old.doc", "LLAMACLOUD", False),
        ("old.doc", "UNSTRUCTURED", False),
        ("legacy.xls", "DOCLING", True),
        ("legacy.xls", "LLAMACLOUD", False),
        ("legacy.xls", "UNSTRUCTURED", False),
        ("deck.ppt", "DOCLING", True),
        ("deck.ppt", "LLAMACLOUD", False),
        ("deck.ppt", "UNSTRUCTURED", False),
        ("icon.svg", "DOCLING", True),
        ("icon.svg", "LLAMACLOUD", False),
        ("anim.gif", "DOCLING", True),
        ("anim.gif", "LLAMACLOUD", False),
        ("photo.webp", "DOCLING", False),
        ("photo.webp", "LLAMACLOUD", False),
        ("photo.webp", "UNSTRUCTURED", True),
        ("live.heic", "DOCLING", True),
        ("live.heic", "UNSTRUCTURED", False),
        ("macro.docm", "DOCLING", True),
        ("macro.docm", "LLAMACLOUD", False),
        ("mail.eml", "DOCLING", True),
        ("mail.eml", "UNSTRUCTURED", False),
    ],
)
def test_parser_specific_extensions(filename, service, expected_skip, mocker):
    """Skip decision for an extension depends on the configured ETL service.

    An extension is reported exactly when the file is skipped.
    """
    mocker.patch("app.config.config.ETL_SERVICE", service)
    entry = {".tag": "file", "name": filename}

    skipped, extension = should_skip_file(entry)

    assert skipped is expected_skip, (
        f"{filename} with {service}: expected skip={expected_skip}"
    )
    if not expected_skip:
        assert extension is None
    else:
        assert extension is not None
def test_returns_unsupported_extension(mocker):
    """When a file is skipped due to unsupported extension, the ext string is returned."""
    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
    legacy_doc = {".tag": "file", "name": "old.doc"}

    skipped, extension = should_skip_file(legacy_doc)

    assert skipped is True
    assert extension == ".doc"

View file

@ -0,0 +1,43 @@
"""Test that Dropbox re-auth preserves folder_cursors in connector config."""
import pytest
pytestmark = pytest.mark.unit
def test_reauth_preserves_folder_cursors():
    """G1: re-authentication preserves folder_cursors alongside cursor.

    NOTE(review): the merge is re-implemented inline here rather than calling
    the production re-auth code, so this only pins the intended merge shape.
    """
    previous = {
        "access_token": "old-token-enc",
        "refresh_token": "old-refresh-enc",
        "cursor": "old-cursor-abc",
        "folder_cursors": {"/docs": "cursor-docs-123", "/photos": "cursor-photos-456"},
        "_token_encrypted": True,
        "auth_expired": True,
    }
    fresh_tokens = {
        "access_token": "new-token-enc",
        "refresh_token": "new-refresh-enc",
        "token_type": "bearer",
        "expires_in": 14400,
        "expires_at": "2026-04-06T16:00:00+00:00",
        "_token_encrypted": True,
    }

    # Fresh token fields first, then the sync state carried over from before.
    merged = dict(
        fresh_tokens,
        cursor=previous.get("cursor"),
        folder_cursors=previous.get("folder_cursors"),
        auth_expired=False,
    )

    assert merged["access_token"] == "new-token-enc"
    assert merged["cursor"] == "old-cursor-abc"
    assert merged["folder_cursors"] == {
        "/docs": "cursor-docs-123",
        "/photos": "cursor-photos-456",
    }
    assert merged["auth_expired"] is False

View file

@ -0,0 +1,80 @@
"""Tests for Google Drive file type filtering."""
import pytest
from app.connectors.google_drive.file_types import should_skip_by_extension
pytestmark = pytest.mark.unit
@pytest.mark.parametrize(
    "filename",
    [
        "malware.exe",
        "archive.zip",
        "video.mov",
        "font.woff2",
        "model.blend",
    ],
)
def test_unsupported_extensions_are_skipped_regardless_of_service(filename, mocker):
    """Truly unsupported files are skipped no matter which ETL service is configured."""
    etl_services = ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED")
    for service in etl_services:
        mocker.patch("app.config.config.ETL_SERVICE", service)
        skipped, _extension = should_skip_by_extension(filename)
        assert skipped is True
@pytest.mark.parametrize(
    "filename",
    [
        "report.pdf",
        "doc.docx",
        "sheet.xlsx",
        "slides.pptx",
        "readme.txt",
        "data.csv",
        "photo.png",
        "notes.md",
    ],
)
def test_universal_extensions_are_not_skipped(filename, mocker):
    """Files supported by all parsers (or handled by plaintext/direct_convert) are never skipped."""
    etl_services = ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED")
    for service in etl_services:
        mocker.patch("app.config.config.ETL_SERVICE", service)
        skipped, extension = should_skip_by_extension(filename)
        assert skipped is False, f"{filename} should NOT be skipped with {service}"
        assert extension is None
@pytest.mark.parametrize(
    "filename,service,expected_skip",
    [
        ("macro.docm", "DOCLING", True),
        ("macro.docm", "LLAMACLOUD", False),
        ("mail.eml", "DOCLING", True),
        ("mail.eml", "UNSTRUCTURED", False),
        ("photo.gif", "DOCLING", True),
        ("photo.gif", "LLAMACLOUD", False),
        ("photo.heic", "UNSTRUCTURED", False),
        ("photo.heic", "DOCLING", True),
    ],
)
def test_parser_specific_extensions(filename, service, expected_skip, mocker):
    """Skip decision for an extension depends on the configured ETL service.

    An extension is reported exactly when the file is skipped.
    """
    mocker.patch("app.config.config.ETL_SERVICE", service)

    skipped, extension = should_skip_by_extension(filename)

    assert skipped is expected_skip, (
        f"{filename} with {service}: expected skip={expected_skip}"
    )
    if not expected_skip:
        assert extension is None
    else:
        assert extension is not None, "unsupported extension should be returned"
def test_returns_unsupported_extension(mocker):
    """When a file is skipped, the unsupported extension string is returned."""
    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")

    outcome = should_skip_by_extension("macro.docm")
    skipped, extension = outcome

    assert skipped is True
    assert extension == ".docm"

View file

@ -0,0 +1,118 @@
"""Tests for OneDrive file type filtering."""
import pytest
from app.connectors.onedrive.file_types import should_skip_file
pytestmark = pytest.mark.unit
# ---------------------------------------------------------------------------
# Structural skips (independent of ETL service)
# ---------------------------------------------------------------------------
def test_folder_is_skipped():
    """An item carrying a ``folder`` facet is skipped with no extension reported."""
    folder_item = {"folder": {}, "name": "My Folder"}
    skipped, extension = should_skip_file(folder_item)
    assert skipped is True
    assert extension is None
def test_remote_item_is_skipped():
    """An item carrying a ``remoteItem`` facet is skipped with no extension reported."""
    remote_item = {"remoteItem": {}, "name": "shared.docx"}
    skipped, extension = should_skip_file(remote_item)
    assert skipped is True
    assert extension is None
def test_package_is_skipped():
    """An item carrying a ``package`` facet is skipped with no extension reported."""
    package_item = {"package": {}, "name": "notebook"}
    skipped, extension = should_skip_file(package_item)
    assert skipped is True
    assert extension is None
def test_onenote_is_skipped():
    """A file whose MIME type is ``application/msonenote`` is skipped."""
    onenote_item = {"name": "notes", "file": {"mimeType": "application/msonenote"}}
    skipped, extension = should_skip_file(onenote_item)
    assert skipped is True
    assert extension is None
# ---------------------------------------------------------------------------
# Extension-based skips (require ETL service context)
# ---------------------------------------------------------------------------
@pytest.mark.parametrize(
    "filename",
    [
        "malware.exe",
        "archive.zip",
        "video.mov",
        "font.woff2",
        "model.blend",
    ],
)
def test_unsupported_extensions_are_skipped(filename, mocker):
    """With DOCLING configured, each listed file is skipped and an extension is reported."""
    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
    drive_item = {"name": filename, "file": {"mimeType": "application/octet-stream"}}

    skipped, extension = should_skip_file(drive_item)

    assert skipped is True, f"{filename} should be skipped"
    assert extension is not None
@pytest.mark.parametrize(
    "filename",
    [
        "report.pdf",
        "doc.docx",
        "sheet.xlsx",
        "slides.pptx",
        "readme.txt",
        "data.csv",
        "photo.png",
        "notes.md",
    ],
)
def test_universal_files_are_not_skipped(filename, mocker):
    """Each listed file is kept under every ETL service the test iterates over."""
    etl_services = ("DOCLING", "LLAMACLOUD", "UNSTRUCTURED")
    for service in etl_services:
        mocker.patch("app.config.config.ETL_SERVICE", service)
        drive_item = {
            "name": filename,
            "file": {"mimeType": "application/octet-stream"},
        }
        skipped, extension = should_skip_file(drive_item)
        assert skipped is False, f"{filename} should NOT be skipped with {service}"
        assert extension is None
@pytest.mark.parametrize(
    "filename,service,expected_skip",
    [
        ("macro.docm", "DOCLING", True),
        ("macro.docm", "LLAMACLOUD", False),
        ("mail.eml", "DOCLING", True),
        ("mail.eml", "UNSTRUCTURED", False),
        ("photo.heic", "UNSTRUCTURED", False),
        ("photo.heic", "DOCLING", True),
    ],
)
def test_parser_specific_extensions(filename, service, expected_skip, mocker):
    """Skip decision for an extension depends on the configured ETL service.

    An extension is reported exactly when the file is skipped.
    """
    mocker.patch("app.config.config.ETL_SERVICE", service)
    drive_item = {"name": filename, "file": {"mimeType": "application/octet-stream"}}

    skipped, extension = should_skip_file(drive_item)

    assert skipped is expected_skip, (
        f"{filename} with {service}: expected skip={expected_skip}"
    )
    if not expected_skip:
        assert extension is None
    else:
        assert extension is not None
def test_returns_unsupported_extension(mocker):
    """When a file is skipped due to unsupported extension, the ext string is returned."""
    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
    email_item = {"name": "mail.eml", "file": {"mimeType": "application/octet-stream"}}

    skipped, extension = should_skip_file(email_item)

    assert skipped is True
    assert extension == ".eml"

View file

@ -0,0 +1,27 @@
"""Pre-register the etl_pipeline package to avoid circular imports during unit tests."""
import sys
import types
from pathlib import Path
_BACKEND = Path(__file__).resolve().parents[3]
def _stub_package(dotted: str, fs_dir: Path) -> None:
    """Register an empty package module for ``dotted`` unless one already exists.

    The stub's ``__path__`` points at ``fs_dir``. Whether or not a stub was
    created, the module registered under ``dotted`` is also set as an
    attribute on its parent package when that parent is already in
    ``sys.modules``.
    """
    registry = sys.modules
    if dotted not in registry:
        stub = types.ModuleType(dotted)
        stub.__path__ = [str(fs_dir)]
        stub.__package__ = dotted
        registry[dotted] = stub

    segments = dotted.split(".")
    if len(segments) <= 1:
        # Top-level package: there is no parent to attach to.
        return

    parent = registry.get(".".join(segments[:-1]))
    if parent is not None:
        setattr(parent, segments[-1], registry[dotted])
# Parents are registered before children so each child can be attached to
# its already-registered parent package.
_stub_package("app", _BACKEND.joinpath("app"))
_stub_package("app.etl_pipeline", _BACKEND.joinpath("app", "etl_pipeline"))
_stub_package(
    "app.etl_pipeline.parsers",
    _BACKEND.joinpath("app", "etl_pipeline", "parsers"),
)

View file

@ -0,0 +1,461 @@
"""Tests for EtlPipelineService -- the unified ETL pipeline public interface."""
import pytest
from app.etl_pipeline.etl_document import EtlRequest
from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
pytestmark = pytest.mark.unit
async def test_extract_txt_file_returns_markdown(tmp_path):
    """Tracer bullet: a .txt file is read and returned as-is in an EtlResult."""
    source = tmp_path / "hello.txt"
    source.write_text("Hello, world!", encoding="utf-8")

    pipeline = EtlPipelineService()
    request = EtlRequest(file_path=str(source), filename="hello.txt")
    result = await pipeline.extract(request)

    assert result.markdown_content == "Hello, world!"
    assert result.etl_service == "PLAINTEXT"
    assert result.content_type == "plaintext"
async def test_extract_md_file(tmp_path):
    """A .md file is classified as PLAINTEXT and extracted."""
    body = "# Title\n\nBody text."
    source = tmp_path / "readme.md"
    source.write_text(body, encoding="utf-8")

    pipeline = EtlPipelineService()
    request = EtlRequest(file_path=str(source), filename="readme.md")
    result = await pipeline.extract(request)

    assert result.markdown_content == "# Title\n\nBody text."
    assert result.etl_service == "PLAINTEXT"
    assert result.content_type == "plaintext"
async def test_extract_markdown_file(tmp_path):
    """A .markdown file is classified as PLAINTEXT and extracted."""
    source = tmp_path / "notes.markdown"
    source.write_text("Some notes.", encoding="utf-8")

    pipeline = EtlPipelineService()
    request = EtlRequest(file_path=str(source), filename="notes.markdown")
    result = await pipeline.extract(request)

    assert result.markdown_content == "Some notes."
    assert result.etl_service == "PLAINTEXT"
async def test_extract_python_file(tmp_path):
    """A .py source code file is classified as PLAINTEXT."""
    source = tmp_path / "script.py"
    source.write_text("print('hello')", encoding="utf-8")

    pipeline = EtlPipelineService()
    request = EtlRequest(file_path=str(source), filename="script.py")
    result = await pipeline.extract(request)

    assert result.markdown_content == "print('hello')"
    assert result.etl_service == "PLAINTEXT"
    assert result.content_type == "plaintext"
async def test_extract_js_file(tmp_path):
    """A .js source code file is classified as PLAINTEXT."""
    source = tmp_path / "app.js"
    source.write_text("console.log('hi');", encoding="utf-8")

    pipeline = EtlPipelineService()
    request = EtlRequest(file_path=str(source), filename="app.js")
    result = await pipeline.extract(request)

    assert result.markdown_content == "console.log('hi');"
    assert result.etl_service == "PLAINTEXT"
async def test_extract_csv_returns_markdown_table(tmp_path):
    """Extracting a ``.csv`` file produces markdown table rows via DIRECT_CONVERT."""
    source = tmp_path / "data.csv"
    source.write_text("name,age\nAlice,30\nBob,25\n", encoding="utf-8")

    pipeline = EtlPipelineService()
    request = EtlRequest(file_path=str(source), filename="data.csv")
    outcome = await pipeline.extract(request)

    # Only the header row and the first data row are checked for table formatting.
    rendered = outcome.markdown_content
    assert "| name | age |" in rendered
    assert "| Alice | 30 |" in rendered
    assert outcome.etl_service == "DIRECT_CONVERT"
    assert outcome.content_type == "direct_convert"
async def test_extract_tsv_returns_markdown_table(tmp_path):
    """Extracting a ``.tsv`` file produces a markdown table header via DIRECT_CONVERT."""
    source = tmp_path / "data.tsv"
    source.write_text("x\ty\n1\t2\n", encoding="utf-8")

    pipeline = EtlPipelineService()
    request = EtlRequest(file_path=str(source), filename="data.tsv")
    outcome = await pipeline.extract(request)

    assert "| x | y |" in outcome.markdown_content
    assert outcome.etl_service == "DIRECT_CONVERT"
async def test_extract_html_returns_markdown(tmp_path):
    """Extracting an ``.html`` file keeps its visible text and uses DIRECT_CONVERT."""
    source = tmp_path / "page.html"
    source.write_text("<h1>Title</h1><p>Body</p>", encoding="utf-8")

    pipeline = EtlPipelineService()
    request = EtlRequest(file_path=str(source), filename="page.html")
    outcome = await pipeline.extract(request)

    # The exact markdown syntax is not pinned; only the text content must survive.
    rendered = outcome.markdown_content
    assert "Title" in rendered
    assert "Body" in rendered
    assert outcome.etl_service == "DIRECT_CONVERT"
async def test_extract_mp3_returns_transcription(tmp_path, mocker):
    """An ``.mp3`` file is handled by the AUDIO service and transcribed exactly once.

    The speech-to-text settings and ``atranscription`` are patched, so the
    transcript text comes from the mock's return value.
    """
    recording = tmp_path / "recording.mp3"
    recording.write_bytes(b"\x00" * 100)

    mocker.patch("app.config.config.STT_SERVICE", "openai/whisper-1")
    mocker.patch("app.config.config.STT_SERVICE_API_KEY", "fake-key")
    mocker.patch("app.config.config.STT_SERVICE_API_BASE", None)
    transcribe_mock = mocker.patch(
        "app.etl_pipeline.parsers.audio.atranscription",
        return_value={"text": "Hello from audio"},
    )

    pipeline = EtlPipelineService()
    request = EtlRequest(file_path=str(recording), filename="recording.mp3")
    outcome = await pipeline.extract(request)

    assert "Hello from audio" in outcome.markdown_content
    assert outcome.etl_service == "AUDIO"
    assert outcome.content_type == "audio"
    transcribe_mock.assert_called_once()
# ---------------------------------------------------------------------------
# Slice 7 - DOCLING document parsing
# ---------------------------------------------------------------------------
async def test_extract_pdf_with_docling(tmp_path, mocker):
    """With ``ETL_SERVICE`` set to DOCLING, a ``.pdf`` returns the Docling content.

    ``create_docling_service`` is patched to return an async mock whose
    ``process_document`` result supplies the markdown.
    """
    source = tmp_path / "report.pdf"
    source.write_bytes(b"%PDF-1.4 fake")

    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")

    docling_stub = mocker.AsyncMock()
    docling_stub.process_document.return_value = {"content": "# Parsed PDF"}
    mocker.patch(
        "app.services.docling_service.create_docling_service",
        return_value=docling_stub,
    )

    pipeline = EtlPipelineService()
    request = EtlRequest(file_path=str(source), filename="report.pdf")
    outcome = await pipeline.extract(request)

    assert outcome.markdown_content == "# Parsed PDF"
    assert outcome.etl_service == "DOCLING"
    assert outcome.content_type == "document"
# ---------------------------------------------------------------------------
# Slice 8 - UNSTRUCTURED document parsing
# ---------------------------------------------------------------------------
async def test_extract_pdf_with_unstructured(tmp_path, mocker):
    """With ``ETL_SERVICE`` set to UNSTRUCTURED, every loaded page appears in the output.

    ``UnstructuredLoader`` is patched to return an async mock whose ``aload``
    yields two stand-in documents exposing ``page_content``.
    """
    source = tmp_path / "report.pdf"
    source.write_bytes(b"%PDF-1.4 fake")

    mocker.patch("app.config.config.ETL_SERVICE", "UNSTRUCTURED")

    class FakeDoc:
        """Minimal stand-in exposing only the ``page_content`` attribute."""

        def __init__(self, text):
            self.page_content = text

    loader_stub = mocker.AsyncMock()
    loader_stub.aload.return_value = [
        FakeDoc("Page 1 content"),
        FakeDoc("Page 2 content"),
    ]
    mocker.patch(
        "langchain_unstructured.UnstructuredLoader",
        return_value=loader_stub,
    )

    pipeline = EtlPipelineService()
    request = EtlRequest(file_path=str(source), filename="report.pdf")
    outcome = await pipeline.extract(request)

    rendered = outcome.markdown_content
    assert "Page 1 content" in rendered
    assert "Page 2 content" in rendered
    assert outcome.etl_service == "UNSTRUCTURED"
    assert outcome.content_type == "document"
# ---------------------------------------------------------------------------
# Slice 9 - LLAMACLOUD document parsing
# ---------------------------------------------------------------------------
async def test_extract_pdf_with_llamacloud(tmp_path, mocker):
    """With ``ETL_SERVICE`` set to LLAMACLOUD, a ``.pdf`` returns the parsed markdown.

    ``LlamaParse`` and ``ResultType`` are patched; the parser's ``aparse``
    resolves to a fake job result that hands back one markdown document.
    """
    source = tmp_path / "report.pdf"
    source.write_bytes(b"%PDF-1.4 fake content " * 10)

    mocker.patch("app.config.config.ETL_SERVICE", "LLAMACLOUD")
    # create=True lets the patch succeed even if the config lacks this attribute.
    mocker.patch("app.config.config.LLAMA_CLOUD_API_KEY", "fake-key", create=True)

    class FakeDoc:
        """Stand-in markdown document exposing only ``text``."""

        text = "# LlamaCloud parsed"

    class FakeJobResult:
        """Stand-in parse result returning a single markdown document."""

        pages = []

        def get_markdown_documents(self, split_by_page=True):
            return [FakeDoc()]

    parser_stub = mocker.AsyncMock()
    parser_stub.aparse.return_value = FakeJobResult()
    mocker.patch(
        "llama_cloud_services.LlamaParse",
        return_value=parser_stub,
    )
    mocker.patch(
        "llama_cloud_services.parse.utils.ResultType",
        mocker.MagicMock(MD="md"),
    )

    pipeline = EtlPipelineService()
    request = EtlRequest(
        file_path=str(source), filename="report.pdf", estimated_pages=5
    )
    outcome = await pipeline.extract(request)

    assert outcome.markdown_content == "# LlamaCloud parsed"
    assert outcome.etl_service == "LLAMACLOUD"
    assert outcome.content_type == "document"
# ---------------------------------------------------------------------------
# Slice 10 - allowlisted document extension (.docx) routes to document ETL
# ---------------------------------------------------------------------------
async def test_unknown_extension_uses_document_etl(tmp_path, mocker):
    """A ``.docx`` file is dispatched to the document ETL path.

    With ``ETL_SERVICE`` set to DOCLING and the Docling service mocked, the
    returned content must be the mock's and the content type ``document``.
    """
    source = tmp_path / "doc.docx"
    source.write_bytes(b"PK fake docx")

    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")

    docling_stub = mocker.AsyncMock()
    docling_stub.process_document.return_value = {"content": "Docx content"}
    mocker.patch(
        "app.services.docling_service.create_docling_service",
        return_value=docling_stub,
    )

    pipeline = EtlPipelineService()
    request = EtlRequest(file_path=str(source), filename="doc.docx")
    outcome = await pipeline.extract(request)

    assert outcome.markdown_content == "Docx content"
    assert outcome.content_type == "document"
# ---------------------------------------------------------------------------
# Slice 11 - EtlRequest validation
# ---------------------------------------------------------------------------
def test_etl_request_requires_filename():
    """Constructing an EtlRequest with an empty filename raises ValueError."""
    expected_error = pytest.raises(ValueError, match="filename must not be empty")
    with expected_error:
        EtlRequest(file_path="/tmp/some.txt", filename="")
# ---------------------------------------------------------------------------
# Slice 12 - unknown ETL_SERVICE raises EtlServiceUnavailableError
# ---------------------------------------------------------------------------
async def test_unknown_etl_service_raises(tmp_path, mocker):
    """An unrecognised ``ETL_SERVICE`` value makes extraction raise EtlServiceUnavailableError."""
    from app.etl_pipeline.exceptions import EtlServiceUnavailableError

    source = tmp_path / "report.pdf"
    source.write_bytes(b"%PDF fake")

    mocker.patch("app.config.config.ETL_SERVICE", "NONEXISTENT")

    expected_error = pytest.raises(
        EtlServiceUnavailableError, match="Unknown ETL_SERVICE"
    )
    with expected_error:
        await EtlPipelineService().extract(
            EtlRequest(file_path=str(source), filename="report.pdf")
        )
# ---------------------------------------------------------------------------
# Slice 13 - unsupported file types are rejected before reaching any parser
# ---------------------------------------------------------------------------
def test_unknown_extension_classified_as_unsupported():
    """A filename with an unrecognised extension is classified as UNSUPPORTED."""
    from app.etl_pipeline.file_classifier import FileCategory, classify_file

    category = classify_file("random.xyz")
    assert category == FileCategory.UNSUPPORTED
@pytest.mark.parametrize(
    "filename",
    [
        "malware.exe",
        "archive.zip",
        "video.mov",
        "font.woff2",
        "model.blend",
        "data.parquet",
        "package.deb",
        "firmware.bin",
    ],
)
def test_unsupported_extensions_classified_correctly(filename):
    """Each listed filename must be classified as UNSUPPORTED."""
    from app.etl_pipeline.file_classifier import FileCategory, classify_file

    category = classify_file(filename)
    assert category == FileCategory.UNSUPPORTED
@pytest.mark.parametrize(
    "filename,expected",
    [
        ("report.pdf", "document"),
        ("doc.docx", "document"),
        ("slides.pptx", "document"),
        ("sheet.xlsx", "document"),
        ("photo.png", "document"),
        ("photo.jpg", "document"),
        ("book.epub", "document"),
        ("letter.odt", "document"),
        ("readme.md", "plaintext"),
        ("data.csv", "direct_convert"),
    ],
)
def test_parseable_extensions_classified_correctly(filename, expected):
    """Each listed filename maps to the expected category value, never UNSUPPORTED."""
    from app.etl_pipeline.file_classifier import FileCategory, classify_file

    category = classify_file(filename)

    assert category != FileCategory.UNSUPPORTED
    assert category.value == expected
async def test_extract_unsupported_file_raises_error(tmp_path):
    """Extracting an ``.exe`` file raises EtlUnsupportedFileError."""
    from app.etl_pipeline.exceptions import EtlUnsupportedFileError

    source = tmp_path / "program.exe"
    source.write_bytes(b"\x00" * 10)

    expected_error = pytest.raises(EtlUnsupportedFileError, match="not supported")
    with expected_error:
        await EtlPipelineService().extract(
            EtlRequest(file_path=str(source), filename="program.exe")
        )
async def test_extract_zip_raises_unsupported_error(tmp_path):
    """Extracting a ``.zip`` archive raises EtlUnsupportedFileError."""
    from app.etl_pipeline.exceptions import EtlUnsupportedFileError

    source = tmp_path / "archive.zip"
    source.write_bytes(b"PK\x03\x04")

    expected_error = pytest.raises(EtlUnsupportedFileError, match="not supported")
    with expected_error:
        await EtlPipelineService().extract(
            EtlRequest(file_path=str(source), filename="archive.zip")
        )
# ---------------------------------------------------------------------------
# Slice 14 - should_skip_for_service (per-parser document filtering)
# ---------------------------------------------------------------------------
@pytest.mark.parametrize(
    "filename,etl_service,expected_skip",
    [
        ("file.eml", "DOCLING", True),
        ("file.eml", "UNSTRUCTURED", False),
        ("file.docm", "LLAMACLOUD", False),
        ("file.docm", "DOCLING", True),
        ("file.txt", "DOCLING", False),
        ("file.csv", "LLAMACLOUD", False),
        ("file.mp3", "UNSTRUCTURED", False),
        ("file.exe", "LLAMACLOUD", True),
        ("file.pdf", "DOCLING", False),
        ("file.webp", "DOCLING", False),
        ("file.webp", "UNSTRUCTURED", True),
        ("file.gif", "LLAMACLOUD", False),
        ("file.gif", "DOCLING", True),
        ("file.heic", "UNSTRUCTURED", False),
        ("file.heic", "DOCLING", True),
        ("file.svg", "LLAMACLOUD", False),
        ("file.svg", "DOCLING", True),
        ("file.p7s", "UNSTRUCTURED", False),
        ("file.p7s", "LLAMACLOUD", True),
    ],
)
def test_should_skip_for_service(filename, etl_service, expected_skip):
    """``should_skip_for_service`` returns the expected boolean for each file/service pair."""
    from app.etl_pipeline.file_classifier import should_skip_for_service

    actual_skip = should_skip_for_service(filename, etl_service)

    # Identity comparison: the helper must return a real bool, not just a truthy value.
    assert actual_skip is expected_skip, (
        f"{filename} with {etl_service}: expected skip={expected_skip}"
    )
# ---------------------------------------------------------------------------
# Slice 14b - ETL pipeline rejects per-parser incompatible documents
# ---------------------------------------------------------------------------
async def test_extract_docm_with_docling_raises_unsupported(tmp_path, mocker):
    """With DOCLING configured, extracting a ``.docm`` file raises EtlUnsupportedFileError."""
    from app.etl_pipeline.exceptions import EtlUnsupportedFileError

    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")

    source = tmp_path / "macro.docm"
    source.write_bytes(b"\x00" * 10)

    expected_error = pytest.raises(
        EtlUnsupportedFileError, match="not supported by DOCLING"
    )
    with expected_error:
        await EtlPipelineService().extract(
            EtlRequest(file_path=str(source), filename="macro.docm")
        )
async def test_extract_eml_with_docling_raises_unsupported(tmp_path, mocker):
    """With DOCLING configured, extracting an ``.eml`` file raises EtlUnsupportedFileError."""
    from app.etl_pipeline.exceptions import EtlUnsupportedFileError

    mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")

    source = tmp_path / "mail.eml"
    source.write_bytes(b"From: test@example.com")

    expected_error = pytest.raises(
        EtlUnsupportedFileError, match="not supported by DOCLING"
    )
    with expected_error:
        await EtlPipelineService().extract(
            EtlRequest(file_path=str(source), filename="mail.eml")
        )

View file

@ -0,0 +1,70 @@
"""Test that DoclingService does NOT restrict allowed_formats, letting Docling
accept all its supported formats (PDF, DOCX, PPTX, XLSX, IMAGE, etc.)."""
from enum import Enum
from unittest.mock import MagicMock, patch
import pytest
pytestmark = pytest.mark.unit
class _FakeInputFormat(Enum):
    """Stand-in for Docling's ``InputFormat`` used when the real package is mocked."""

    # Only these five members are defined; values are lowercase format names.
    PDF = "pdf"
    IMAGE = "image"
    DOCX = "docx"
    PPTX = "pptx"
    XLSX = "xlsx"
def test_docling_service_does_not_restrict_allowed_formats():
    """DoclingService must build its DocumentConverter without ``allowed_formats``.

    Every ``docling`` module named below is replaced in ``sys.modules`` with a
    mock, the service module is reloaded so its imports resolve to those mocks,
    and the keyword arguments passed to the mocked ``DocumentConverter`` are
    then inspected.
    """
    converter_cls = MagicMock()
    pdf_backend = MagicMock()
    pipeline_options_cls = MagicMock()
    pipeline_options = MagicMock()
    pipeline_options_cls.return_value = pipeline_options
    pdf_format_option_cls = MagicMock()

    stub_modules = {
        "docling": MagicMock(),
        "docling.backend": MagicMock(),
        "docling.backend.pypdfium2_backend": MagicMock(
            PyPdfiumDocumentBackend=pdf_backend
        ),
        "docling.datamodel": MagicMock(),
        "docling.datamodel.base_models": MagicMock(InputFormat=_FakeInputFormat),
        "docling.datamodel.pipeline_options": MagicMock(
            PdfPipelineOptions=pipeline_options_cls
        ),
        "docling.document_converter": MagicMock(
            DocumentConverter=converter_cls,
            PdfFormatOption=pdf_format_option_cls,
        ),
    }

    with patch.dict("sys.modules", stub_modules):
        from importlib import reload

        import app.services.docling_service as mod

        # Reload so the module re-executes its imports against the stubs above.
        reload(mod)
        mod.DoclingService()

        recorded_call = converter_cls.call_args
        assert recorded_call is not None, "DocumentConverter was never called"

        _positional, converter_kwargs = recorded_call
        assert "allowed_formats" not in converter_kwargs, (
            f"allowed_formats should not be passed — let Docling accept all formats. "
            f"Got: {converter_kwargs.get('allowed_formats')}"
        )
        assert _FakeInputFormat.PDF in converter_kwargs.get("format_options", {}), (
            "format_options should still configure PDF pipeline options"
        )

View file

@ -0,0 +1,154 @@
"""Tests for the DOCUMENT_EXTENSIONS allowlist module."""
import pytest
pytestmark = pytest.mark.unit
def test_pdf_is_supported_document():
    """``report.pdf`` is reported as a supported document extension."""
    from app.utils.file_extensions import is_supported_document_extension

    supported = is_supported_document_extension("report.pdf")
    assert supported is True
def test_exe_is_not_supported_document():
    """``malware.exe`` is reported as an unsupported document extension."""
    from app.utils.file_extensions import is_supported_document_extension

    supported = is_supported_document_extension("malware.exe")
    assert supported is False
@pytest.mark.parametrize(
    "filename",
    [
        "report.pdf",
        "doc.docx",
        "old.doc",
        "sheet.xlsx",
        "legacy.xls",
        "slides.pptx",
        "deck.ppt",
        "macro.docm",
        "macro.xlsm",
        "macro.pptm",
        "photo.png",
        "photo.jpg",
        "photo.jpeg",
        "scan.bmp",
        "scan.tiff",
        "scan.tif",
        "photo.webp",
        "anim.gif",
        "iphone.heic",
        "manual.rtf",
        "book.epub",
        "letter.odt",
        "data.ods",
        "presentation.odp",
        "inbox.eml",
        "outlook.msg",
        "korean.hwpx",
        "korean.hwp",
        "template.dot",
        "template.dotm",
        "template.pot",
        "template.potx",
        "binary.xlsb",
        "workspace.xlw",
        "vector.svg",
        "signature.p7s",
    ],
)
def test_document_extensions_are_supported(filename):
    """Each listed filename must be accepted as a supported document extension."""
    from app.utils.file_extensions import is_supported_document_extension

    supported = is_supported_document_extension(filename)
    assert supported is True, (
        f"{filename} should be supported"
    )
@pytest.mark.parametrize(
    "filename",
    [
        "malware.exe",
        "archive.zip",
        "video.mov",
        "font.woff2",
        "model.blend",
        "random.xyz",
        "data.parquet",
        "package.deb",
    ],
)
def test_non_document_extensions_are_not_supported(filename):
    """Each listed filename must be rejected as a document extension."""
    from app.utils.file_extensions import is_supported_document_extension

    supported = is_supported_document_extension(filename)
    assert supported is False, (
        f"{filename} should NOT be supported"
    )
# ---------------------------------------------------------------------------
# Per-parser extension sets
# ---------------------------------------------------------------------------
def test_union_equals_all_three_sets():
    """``DOCUMENT_EXTENSIONS`` equals the union of the three per-parser sets."""
    from app.utils.file_extensions import (
        DOCLING_DOCUMENT_EXTENSIONS,
        DOCUMENT_EXTENSIONS,
        LLAMAPARSE_DOCUMENT_EXTENSIONS,
        UNSTRUCTURED_DOCUMENT_EXTENSIONS,
    )

    combined = DOCLING_DOCUMENT_EXTENSIONS | LLAMAPARSE_DOCUMENT_EXTENSIONS
    combined = combined | UNSTRUCTURED_DOCUMENT_EXTENSIONS

    assert combined == DOCUMENT_EXTENSIONS
def test_get_extensions_for_docling():
    """The DOCLING extension set contains and excludes the expected samples."""
    from app.utils.file_extensions import get_document_extensions_for_service

    exts = get_document_extensions_for_service("DOCLING")

    for included in (".pdf", ".webp", ".docx"):
        assert included in exts
    for excluded in (".eml", ".docm", ".gif", ".heic"):
        assert excluded not in exts
def test_get_extensions_for_llamacloud():
    """The LLAMACLOUD extension set contains and excludes the expected samples."""
    from app.utils.file_extensions import get_document_extensions_for_service

    exts = get_document_extensions_for_service("LLAMACLOUD")

    for included in (".docm", ".gif", ".svg", ".hwp"):
        assert included in exts
    for excluded in (".eml", ".heic"):
        assert excluded not in exts
def test_get_extensions_for_unstructured():
    """The UNSTRUCTURED extension set contains and excludes the expected samples."""
    from app.utils.file_extensions import get_document_extensions_for_service

    exts = get_document_extensions_for_service("UNSTRUCTURED")

    for included in (".eml", ".heic", ".p7s"):
        assert included in exts
    for excluded in (".docm", ".gif", ".svg"):
        assert excluded not in exts
def test_get_extensions_for_none_returns_union():
    """Passing ``None`` as the service returns the full ``DOCUMENT_EXTENSIONS`` set."""
    from app.utils.file_extensions import (
        DOCUMENT_EXTENSIONS,
        get_document_extensions_for_service,
    )

    exts = get_document_extensions_for_service(None)
    assert exts == DOCUMENT_EXTENSIONS

View file

@ -8,6 +8,7 @@ import { Button } from "@/components/ui/button";
import { Checkbox } from "@/components/ui/checkbox";
import { Input } from "@/components/ui/input";
import { Popover, PopoverContent, PopoverTrigger } from "@/components/ui/popover";
import { ToggleGroup, ToggleGroupItem } from "@/components/ui/toggle-group";
import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip";
import type { DocumentTypeEnum } from "@/contracts/types/document.types";
import { getDocumentTypeIcon, getDocumentTypeLabel } from "./DocumentTypeIcon";
@ -63,109 +64,113 @@ export function DocumentsFilters({
return (
<div className="flex select-none">
<div className="flex items-center gap-2 w-full">
{/* Type Filter */}
<Popover>
<PopoverTrigger asChild>
<Button
variant="outline"
size="icon"
className="h-9 w-9 shrink-0 border-dashed border-sidebar-border text-sidebar-foreground/60 hover:text-sidebar-foreground hover:border-sidebar-border bg-sidebar"
>
<ListFilter size={14} />
{activeTypes.length > 0 && (
<span className="absolute -top-1 -right-1 flex h-4 w-4 items-center justify-center rounded-full bg-primary text-[9px] font-medium text-primary-foreground">
{activeTypes.length}
</span>
)}
</Button>
</PopoverTrigger>
<PopoverContent className="w-56 md:w-52 !p-0 overflow-hidden" align="end">
<div>
{/* Search input */}
<div className="p-2">
<div className="relative">
<Search className="absolute left-0.5 top-1/2 -translate-y-1/2 h-4 w-4 text-muted-foreground" />
<Input
placeholder="Search types"
value={typeSearchQuery}
onChange={(e) => setTypeSearchQuery(e.target.value)}
className="h-6 pl-6 text-sm bg-transparent border-0 shadow-none"
/>
</div>
</div>
{/* Filter + New Folder Toggle Group */}
<ToggleGroup type="multiple" variant="outline" value={[]} className="overflow-visible">
{onCreateFolder && (
<Tooltip>
<TooltipTrigger asChild>
<ToggleGroupItem
value="folder"
className="h-9 w-9 shrink-0 border-sidebar-border text-sidebar-foreground/60 hover:text-sidebar-foreground hover:border-sidebar-border bg-sidebar"
onClick={(e) => {
e.preventDefault();
onCreateFolder();
}}
>
<FolderPlus size={14} />
</ToggleGroupItem>
</TooltipTrigger>
<TooltipContent>New folder</TooltipContent>
</Tooltip>
)}
<div
className="max-h-[300px] overflow-y-auto overflow-x-hidden py-1.5 px-1.5"
onScroll={handleScroll}
style={{
maskImage: `linear-gradient(to bottom, ${scrollPos === "top" ? "black" : "transparent"}, black 16px, black calc(100% - 16px), ${scrollPos === "bottom" ? "black" : "transparent"})`,
WebkitMaskImage: `linear-gradient(to bottom, ${scrollPos === "top" ? "black" : "transparent"}, black 16px, black calc(100% - 16px), ${scrollPos === "bottom" ? "black" : "transparent"})`,
}}
>
{filteredTypes.length === 0 ? (
<div className="py-6 text-center text-sm text-muted-foreground">
No types found
</div>
) : (
filteredTypes.map((value: DocumentTypeEnum, i) => (
<div
role="option"
aria-selected={activeTypes.includes(value)}
tabIndex={0}
key={value}
className="flex w-full items-center gap-2.5 py-2 px-3 rounded-md hover:bg-neutral-200 dark:hover:bg-neutral-700 transition-colors cursor-pointer text-left"
onClick={() => onToggleType(value, !activeTypes.includes(value))}
onKeyDown={(e) => {
if (e.key === "Enter" || e.key === " ") {
e.preventDefault();
onToggleType(value, !activeTypes.includes(value));
}
}}
>
{/* Icon */}
<div className="flex h-7 w-7 shrink-0 items-center justify-center rounded-md bg-muted/50 text-foreground/80">
{getDocumentTypeIcon(value, "h-4 w-4")}
</div>
{/* Text content */}
<div className="flex flex-col min-w-0 flex-1 gap-0.5">
<span className="text-[13px] font-medium text-foreground truncate leading-tight">
{getDocumentTypeLabel(value)}
</span>
<span className="text-[11px] text-muted-foreground leading-tight">
{typeCounts.get(value)} document
{(typeCounts.get(value) ?? 0) !== 1 ? "s" : ""}
</span>
</div>
{/* Checkbox */}
<Checkbox
id={`${id}-${i}`}
checked={activeTypes.includes(value)}
onCheckedChange={(checked: boolean) => onToggleType(value, !!checked)}
className="h-4 w-4 shrink-0 rounded border-muted-foreground/30 data-[state=checked]:bg-primary data-[state=checked]:border-primary"
/>
</div>
))
)}
</div>
{activeTypes.length > 0 && (
<div className="px-3 pt-1.5 pb-1.5 border-t border-border dark:border-neutral-700">
<Button
variant="ghost"
size="sm"
className="w-full h-7 text-[11px] text-muted-foreground hover:text-foreground hover:bg-neutral-200 dark:hover:bg-neutral-700"
onClick={() => {
activeTypes.forEach((t) => {
onToggleType(t, false);
});
}}
<Popover>
<Tooltip>
<TooltipTrigger asChild>
<PopoverTrigger asChild>
<ToggleGroupItem
value="filter"
className="relative h-9 w-9 shrink-0 border-sidebar-border text-sidebar-foreground/60 hover:text-sidebar-foreground hover:border-sidebar-border bg-sidebar overflow-visible"
>
Clear filters
</Button>
<ListFilter size={14} />
{activeTypes.length > 0 && (
<span className="absolute -top-1 -right-1 flex h-4 w-4 items-center justify-center rounded-full bg-sidebar-border text-[9px] font-medium text-sidebar-foreground">
{activeTypes.length}
</span>
)}
</ToggleGroupItem>
</PopoverTrigger>
</TooltipTrigger>
<TooltipContent>Filter by type</TooltipContent>
</Tooltip>
<PopoverContent className="w-56 md:w-52 !p-0 overflow-hidden" align="start">
<div>
<div className="p-2">
<div className="relative">
<Search className="absolute left-0.5 top-1/2 -translate-y-1/2 h-4 w-4 text-muted-foreground" />
<Input
placeholder="Search types"
value={typeSearchQuery}
onChange={(e) => setTypeSearchQuery(e.target.value)}
className="h-6 pl-6 text-sm bg-transparent border-0 shadow-none"
/>
</div>
</div>
)}
</div>
</PopoverContent>
</Popover>
<div
className="max-h-[300px] overflow-y-auto overflow-x-hidden py-1.5 px-1.5"
onScroll={handleScroll}
style={{
maskImage: `linear-gradient(to bottom, ${scrollPos === "top" ? "black" : "transparent"}, black 16px, black calc(100% - 16px), ${scrollPos === "bottom" ? "black" : "transparent"})`,
WebkitMaskImage: `linear-gradient(to bottom, ${scrollPos === "top" ? "black" : "transparent"}, black 16px, black calc(100% - 16px), ${scrollPos === "bottom" ? "black" : "transparent"})`,
}}
>
{filteredTypes.length === 0 ? (
<div className="py-6 text-center text-sm text-muted-foreground">
No types found
</div>
) : (
filteredTypes.map((value: DocumentTypeEnum, i) => (
<div
role="option"
aria-selected={activeTypes.includes(value)}
tabIndex={0}
key={value}
className="flex w-full items-center gap-2.5 py-2 px-3 rounded-md hover:bg-neutral-200 dark:hover:bg-neutral-700 transition-colors cursor-pointer text-left"
onClick={() => onToggleType(value, !activeTypes.includes(value))}
onKeyDown={(e) => {
if (e.key === "Enter" || e.key === " ") {
e.preventDefault();
onToggleType(value, !activeTypes.includes(value));
}
}}
>
<div className="flex h-7 w-7 shrink-0 items-center justify-center rounded-md bg-muted/50 text-foreground/80">
{getDocumentTypeIcon(value, "h-4 w-4")}
</div>
<div className="flex flex-col min-w-0 flex-1 gap-0.5">
<span className="text-[13px] font-medium text-foreground truncate leading-tight">
{getDocumentTypeLabel(value)}
</span>
<span className="text-[11px] text-muted-foreground leading-tight">
{typeCounts.get(value)} document
{(typeCounts.get(value) ?? 0) !== 1 ? "s" : ""}
</span>
</div>
<Checkbox
id={`${id}-${i}`}
checked={activeTypes.includes(value)}
onCheckedChange={(checked: boolean) => onToggleType(value, !!checked)}
className="h-4 w-4 shrink-0 rounded border-muted-foreground/30 data-[state=checked]:bg-primary data-[state=checked]:border-primary"
/>
</div>
))
)}
</div>
</div>
</PopoverContent>
</Popover>
</ToggleGroup>
{/* Search Input */}
<div className="relative flex-1 min-w-0">
@ -197,23 +202,6 @@ export function DocumentsFilters({
)}
</div>
{/* New Folder Button */}
{onCreateFolder && (
<Tooltip>
<TooltipTrigger asChild>
<Button
variant="outline"
size="icon"
className="h-9 w-9 shrink-0 border-dashed border-sidebar-border text-sidebar-foreground/60 hover:text-sidebar-foreground hover:border-sidebar-border bg-sidebar"
onClick={onCreateFolder}
>
<FolderPlus size={14} />
</Button>
</TooltipTrigger>
<TooltipContent>New folder</TooltipContent>
</Tooltip>
)}
{/* Upload Button */}
<Button
data-joyride="upload-button"

View file

@ -9,7 +9,6 @@ import {
} from "@/components/desktop/shortcut-recorder";
import { Card, CardContent, CardDescription, CardHeader, CardTitle } from "@/components/ui/card";
import { Label } from "@/components/ui/label";
import { Switch } from "@/components/ui/switch";
import { Spinner } from "@/components/ui/spinner";
import { useElectronAPI } from "@/hooks/use-platform";
@ -147,11 +146,7 @@ export function DesktopContent() {
Show suggestions while typing in other applications.
</p>
</div>
<Switch
id="autocomplete-toggle"
checked={enabled}
onCheckedChange={handleToggle}
/>
<Switch id="autocomplete-toggle" checked={enabled} onCheckedChange={handleToggle} />
</div>
</CardContent>
</Card>

View file

@ -1,7 +1,7 @@
"use client";
import { useEffect, useState } from "react";
import { useRouter } from "next/navigation";
import { useEffect, useState } from "react";
import { Logo } from "@/components/Logo";
import { Button } from "@/components/ui/button";
import { Spinner } from "@/components/ui/spinner";
@ -18,7 +18,8 @@ const STEPS = [
{
id: "screen-recording",
title: "Screen Recording",
description: "Lets SurfSense capture your screen to understand context and provide smart writing suggestions.",
description:
"Lets SurfSense capture your screen to understand context and provide smart writing suggestions.",
action: "requestScreenRecording",
field: "screenRecording" as const,
},
@ -98,7 +99,8 @@ export default function DesktopPermissionsPage() {
);
}
const allGranted = permissions.accessibility === "authorized" && permissions.screenRecording === "authorized";
const allGranted =
permissions.accessibility === "authorized" && permissions.screenRecording === "authorized";
const handleRequest = async (action: string) => {
if (action === "requestScreenRecording") {
@ -175,7 +177,8 @@ export default function DesktopPermissionsPage() {
</p>
)}
<p className="text-xs text-muted-foreground">
If SurfSense doesn&apos;t appear in the list, click <strong>+</strong> and select it from Applications.
If SurfSense doesn&apos;t appear in the list, click <strong>+</strong> and
select it from Applications.
</p>
</div>
)}

View file

@ -4,10 +4,6 @@ export const metadata = {
title: "SurfSense Suggestion",
};
export default function SuggestionLayout({
children,
}: {
children: React.ReactNode;
}) {
export default function SuggestionLayout({ children }: { children: React.ReactNode }) {
return <div className="suggestion-body">{children}</div>;
}

View file

@ -103,27 +103,23 @@ export default function SuggestionPage() {
return;
}
const backendUrl =
process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL || "http://localhost:8000";
const backendUrl = process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL || "http://localhost:8000";
try {
const response = await fetch(
`${backendUrl}/api/v1/autocomplete/vision/stream`,
{
method: "POST",
headers: {
Authorization: `Bearer ${token}`,
"Content-Type": "application/json",
},
body: JSON.stringify({
screenshot,
search_space_id: parseInt(searchSpaceId, 10),
app_name: appName || "",
window_title: windowTitle || "",
}),
signal: controller.signal,
const response = await fetch(`${backendUrl}/api/v1/autocomplete/vision/stream`, {
method: "POST",
headers: {
Authorization: `Bearer ${token}`,
"Content-Type": "application/json",
},
);
body: JSON.stringify({
screenshot,
search_space_id: parseInt(searchSpaceId, 10),
app_name: appName || "",
window_title: windowTitle || "",
}),
signal: controller.signal,
});
if (!response.ok) {
setError(friendlyError(response.status));
@ -174,9 +170,7 @@ export default function SuggestionPage() {
return [...prev, { id, title, status, items }];
});
}
} catch {
continue;
}
} catch {}
}
}
}
@ -187,7 +181,7 @@ export default function SuggestionPage() {
setIsLoading(false);
}
},
[],
[]
);
useEffect(() => {
@ -269,10 +263,18 @@ export default function SuggestionPage() {
<div className="suggestion-tooltip">
<p className="suggestion-text">{suggestion}</p>
<div className="suggestion-actions">
<button className="suggestion-btn suggestion-btn-accept" onClick={handleAccept}>
<button
type="button"
className="suggestion-btn suggestion-btn-accept"
onClick={handleAccept}
>
Accept
</button>
<button className="suggestion-btn suggestion-btn-dismiss" onClick={handleDismiss}>
<button
type="button"
className="suggestion-btn suggestion-btn-dismiss"
onClick={handleDismiss}
>
Dismiss
</button>
</div>

View file

@ -1,21 +1,21 @@
html:has(.suggestion-body),
body:has(.suggestion-body) {
margin: 0 !important;
padding: 0 !important;
background: transparent !important;
overflow: hidden !important;
height: auto !important;
width: 100% !important;
margin: 0 !important;
padding: 0 !important;
background: transparent !important;
overflow: hidden !important;
height: auto !important;
width: 100% !important;
}
.suggestion-body {
margin: 0;
padding: 0;
background: transparent;
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
-webkit-font-smoothing: antialiased;
user-select: none;
-webkit-app-region: no-drag;
margin: 0;
padding: 0;
background: transparent;
font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif;
-webkit-font-smoothing: antialiased;
user-select: none;
-webkit-app-region: no-drag;
}
.suggestion-tooltip {
@ -75,44 +75,46 @@ body:has(.suggestion-body) {
}
.suggestion-btn {
padding: 2px 8px;
border-radius: 3px;
border: 1px solid #3c3c3c;
font-family: inherit;
font-size: 10px;
font-weight: 500;
cursor: pointer;
line-height: 16px;
transition: background 0.15s, border-color 0.15s;
padding: 2px 8px;
border-radius: 3px;
border: 1px solid #3c3c3c;
font-family: inherit;
font-size: 10px;
font-weight: 500;
cursor: pointer;
line-height: 16px;
transition:
background 0.15s,
border-color 0.15s;
}
.suggestion-btn-accept {
background: #2563eb;
border-color: #3b82f6;
color: #fff;
background: #2563eb;
border-color: #3b82f6;
color: #fff;
}
.suggestion-btn-accept:hover {
background: #1d4ed8;
background: #1d4ed8;
}
.suggestion-btn-dismiss {
background: #2a2a2a;
color: #999;
background: #2a2a2a;
color: #999;
}
.suggestion-btn-dismiss:hover {
background: #333;
color: #ccc;
background: #333;
color: #ccc;
}
.suggestion-error {
border-color: #5c2626;
border-color: #5c2626;
}
.suggestion-error-text {
color: #f48771;
font-size: 12px;
color: #f48771;
font-size: 12px;
}
/* --- Agent activity indicator --- */

View file

@ -216,7 +216,7 @@ export const ConnectorIndicator = forwardRef<ConnectorIndicatorHandle, Connector
onPointerDownOutside={(e) => {
if (pickerOpen) e.preventDefault();
}}
className="max-w-3xl w-[95vw] sm:w-full h-[75vh] sm:h-[85vh] flex flex-col p-0 gap-0 overflow-hidden border border-border ring-0 dark:ring-0 bg-muted dark:bg-muted text-foreground [&>button]:right-4 sm:[&>button]:right-12 [&>button]:top-6 sm:[&>button]:top-10 [&>button]:opacity-80 hover:[&>button]:opacity-100 [&>button_svg]:size-5 select-none"
className="max-w-3xl w-[95vw] sm:w-full h-[75vh] sm:h-[85vh] flex flex-col p-0 gap-0 overflow-hidden border border-border ring-0 dark:ring-0 bg-muted dark:bg-muted text-foreground [&>button]:right-4 sm:[&>button]:right-12 [&>button]:top-6 sm:[&>button]:top-10 [&>button]:opacity-80 [&>button]:hover:opacity-100 [&>button]:hover:bg-foreground/10 [&>button>svg]:size-5 select-none"
>
<DialogTitle className="sr-only">Manage Connectors</DialogTitle>
{/* YouTube Crawler View - shown when adding YouTube videos */}

View file

@ -144,18 +144,14 @@ export const ConnectorConnectView: FC<ConnectorConnectViewProps> = ({
type="button"
onClick={handleFormSubmit}
disabled={isSubmitting}
className="text-xs sm:text-sm min-w-[140px] disabled:opacity-50 disabled:cursor-not-allowed disabled:pointer-events-none"
className="relative text-xs sm:text-sm min-w-[140px] disabled:opacity-50 disabled:cursor-not-allowed disabled:pointer-events-none"
>
{isSubmitting ? (
<>
<Spinner size="sm" className="mr-2" />
Connecting
</>
) : connectorType === "MCP_CONNECTOR" ? (
"Connect"
) : (
`Connect ${getConnectorTypeDisplay(connectorType)}`
)}
<span className={isSubmitting ? "opacity-0" : ""}>
{connectorType === "MCP_CONNECTOR"
? "Connect"
: `Connect ${getConnectorTypeDisplay(connectorType)}`}
</span>
{isSubmitting && <Spinner size="sm" className="absolute" />}
</Button>
</div>
</div>

View file

@ -369,16 +369,10 @@ export const ConnectorEditView: FC<ConnectorEditViewProps> = ({
size="sm"
onClick={handleDisconnectConfirm}
disabled={isDisconnecting}
className="text-xs sm:text-sm flex-1 sm:flex-initial h-10 sm:h-auto py-2 sm:py-2"
className="relative text-xs sm:text-sm flex-1 sm:flex-initial h-10 sm:h-auto py-2 sm:py-2"
>
{isDisconnecting ? (
<>
<Spinner size="sm" className="mr-2" />
Disconnecting
</>
) : (
"Confirm Disconnect"
)}
<span className={isDisconnecting ? "opacity-0" : ""}>Confirm Disconnect</span>
{isDisconnecting && <Spinner size="sm" className="absolute" />}
</Button>
<Button
variant="ghost"
@ -415,16 +409,10 @@ export const ConnectorEditView: FC<ConnectorEditViewProps> = ({
<Button
onClick={onSave}
disabled={isSaving || isDisconnecting}
className="text-xs sm:text-sm flex-1 sm:flex-initial h-12 sm:h-auto py-3 sm:py-2"
className="relative text-xs sm:text-sm flex-1 sm:flex-initial h-12 sm:h-auto py-3 sm:py-2"
>
{isSaving ? (
<>
<Spinner size="sm" className="mr-2" />
Saving
</>
) : (
"Save Changes"
)}
<span className={isSaving ? "opacity-0" : ""}>Save Changes</span>
{isSaving && <Spinner size="sm" className="absolute" />}
</Button>
)}
</div>

View file

@ -1,6 +1,6 @@
"use client";
import { Cable } from "lucide-react";
import { Search, Unplug } from "lucide-react";
import type { FC } from "react";
import { getDocumentTypeLabel } from "@/app/dashboard/[search_space_id]/documents/(manage)/components/DocumentTypeIcon";
import { Button } from "@/components/ui/button";
@ -134,9 +134,17 @@ export const ActiveConnectorsTab: FC<ActiveConnectorsTabProps> = ({
const hasActiveConnectors =
filteredOAuthConnectorTypes.length > 0 || filteredNonOAuthConnectors.length > 0;
const hasFilteredResults = hasActiveConnectors || standaloneDocuments.length > 0;
return (
<TabsContent value="active" className="m-0">
{hasSources ? (
{hasSources && !hasFilteredResults && searchQuery ? (
<div className="flex flex-col items-center justify-center py-20 text-center">
<Search className="size-8 text-muted-foreground mb-3" />
<p className="text-sm text-muted-foreground">No connectors found</p>
<p className="text-xs text-muted-foreground/60 mt-1">Try a different search term</p>
</div>
) : hasSources ? (
<div className="space-y-6">
{/* Active Connectors Section */}
{hasActiveConnectors && (
@ -302,7 +310,7 @@ export const ActiveConnectorsTab: FC<ActiveConnectorsTabProps> = ({
) : (
<div className="flex flex-col items-center justify-center py-20 text-center">
<div className="flex h-16 w-16 items-center justify-center rounded-full bg-muted mb-4">
<Cable className="size-8 text-muted-foreground" />
<Unplug className="size-8 text-muted-foreground" />
</div>
<h4 className="text-lg font-semibold">No active sources</h4>
<p className="text-sm text-muted-foreground mt-1 max-w-[280px]">

View file

@ -1,5 +1,6 @@
"use client";
import { Search } from "lucide-react";
import type { FC } from "react";
import { EnumConnectorName } from "@/contracts/enums/connector";
import type { SearchSourceConnector } from "@/contracts/types/connector.types";
@ -287,6 +288,18 @@ export const AllConnectorsTab: FC<AllConnectorsTabProps> = ({
moreIntegrationsOther.length > 0 ||
moreIntegrationsCrawlers.length > 0;
const hasAnyResults = hasDocumentFileConnectors || hasMoreIntegrations;
if (!hasAnyResults && searchQuery) {
return (
<div className="flex flex-col items-center justify-center py-20 text-center">
<Search className="size-8 text-muted-foreground mb-3" />
<p className="text-sm text-muted-foreground">No connectors found</p>
<p className="text-xs text-muted-foreground/60 mt-1">Try a different search term</p>
</div>
);
}
return (
<div className="space-y-8">
{/* Document/Files Connectors */}

View file

@ -173,9 +173,7 @@ export const ConnectorAccountsListView: FC<ConnectorAccountsListViewProps> = ({
<Plus className="size-3 text-primary" />
)}
</div>
<span className="text-xs sm:text-sm font-medium">
{isConnecting ? "Connecting" : buttonText}
</span>
<span className="text-xs sm:text-sm font-medium">{buttonText}</span>
</button>
</div>
</div>

View file

@ -335,16 +335,10 @@ export const YouTubeCrawlerView: FC<YouTubeCrawlerViewProps> = ({ searchSpaceId,
<Button
onClick={handleSubmit}
disabled={isSubmitting || isFetchingPlaylist || videoTags.length === 0}
className="text-xs sm:text-sm min-w-[140px] disabled:opacity-50 disabled:cursor-not-allowed disabled:pointer-events-none"
className="relative text-xs sm:text-sm min-w-[140px] disabled:opacity-50 disabled:cursor-not-allowed disabled:pointer-events-none"
>
{isSubmitting ? (
<>
<Spinner size="sm" className="mr-2" />
{t("processing")}
</>
) : (
t("submit")
)}
<span className={isSubmitting ? "opacity-0" : ""}>{t("submit")}</span>
{isSubmitting && <Spinner size="sm" className="absolute" />}
</Button>
</div>
</div>

View file

@ -125,18 +125,16 @@ const DocumentUploadPopupContent: FC<{
onPointerDownOutside={(e) => e.preventDefault()}
onInteractOutside={(e) => e.preventDefault()}
onEscapeKeyDown={(e) => e.preventDefault()}
className="select-none max-w-2xl w-[95vw] sm:w-[640px] h-[min(440px,75dvh)] sm:h-[min(500px,80vh)] flex flex-col p-0 gap-0 overflow-hidden border border-border ring-0 bg-muted dark:bg-muted text-foreground [&>button]:right-3 sm:[&>button]:right-6 [&>button]:top-3 sm:[&>button]:top-5 [&>button]:opacity-80 hover:[&>button]:opacity-100 [&>button]:z-[100] [&>button_svg]:size-4 sm:[&>button_svg]:size-5"
className="select-none max-w-2xl w-[95vw] sm:w-[640px] h-[min(440px,75dvh)] sm:h-[min(520px,80vh)] flex flex-col p-0 gap-0 overflow-hidden border border-border ring-0 bg-muted dark:bg-muted text-foreground [&>button]:right-3 sm:[&>button]:right-6 [&>button]:top-5 sm:[&>button]:top-8 [&>button]:opacity-80 [&>button]:hover:opacity-100 [&>button]:hover:bg-foreground/10 [&>button]:z-[100] [&>button>svg]:size-4 sm:[&>button>svg]:size-5"
>
<DialogTitle className="sr-only">Upload Document</DialogTitle>
<div className="flex-1 min-h-0 overflow-y-auto overscroll-contain">
<div className="sticky top-0 z-20 bg-muted px-4 sm:px-6 pt-4 sm:pt-5 pb-10">
<div className="sticky top-0 z-20 bg-muted px-4 sm:px-6 pt-6 sm:pt-8 pb-10">
<div className="flex items-center gap-2 mb-1 pr-8 sm:pr-0">
<h2 className="text-base sm:text-lg font-semibold tracking-tight">
Upload Documents
</h2>
<h2 className="text-xl sm:text-3xl font-semibold tracking-tight">Upload Documents</h2>
</div>
<p className="text-xs sm:text-sm text-muted-foreground line-clamp-1">
<p className="text-xs sm:text-base text-muted-foreground/80 line-clamp-1">
Upload and sync your documents to your search space
</p>
</div>

View file

@ -3,10 +3,10 @@
import type { ImageMessagePartComponent } from "@assistant-ui/react";
import { cva, type VariantProps } from "class-variance-authority";
import { ImageIcon, ImageOffIcon } from "lucide-react";
import NextImage from "next/image";
import { memo, type PropsWithChildren, useEffect, useRef, useState } from "react";
import { createPortal } from "react-dom";
import { cn } from "@/lib/utils";
import NextImage from 'next/image';
const imageVariants = cva("aui-image-root relative overflow-hidden rounded-lg", {
variants: {
@ -88,23 +88,23 @@ function ImagePreview({
<ImageOffIcon className="size-8 text-muted-foreground" />
</div>
) : isDataOrBlobUrl(src) ? (
// biome-ignore lint/performance/noImgElement: data/blob URLs need plain img
<img
ref={imgRef}
src={src}
alt={alt}
className={cn("block h-auto w-full object-contain", !loaded && "invisible", className)}
onLoad={(e) => {
if (typeof src === "string") setLoadedSrc(src);
onLoad?.(e);
}}
onError={(e) => {
if (typeof src === "string") setErrorSrc(src);
onError?.(e);
}}
{...props}
/>
) : (
// biome-ignore lint/performance/noImgElement: data/blob URLs need plain img
<img
ref={imgRef}
src={src}
alt={alt}
className={cn("block h-auto w-full object-contain", !loaded && "invisible", className)}
onLoad={(e) => {
if (typeof src === "string") setLoadedSrc(src);
onLoad?.(e);
}}
onError={(e) => {
if (typeof src === "string") setErrorSrc(src);
onError?.(e);
}}
{...props}
/>
) : (
// biome-ignore lint/performance/noImgElement: intentional for dynamic external URLs
// <img
// ref={imgRef}
@ -122,22 +122,22 @@ function ImagePreview({
// {...props}
// />
<NextImage
fill
src={src || ""}
alt={alt}
sizes="(max-width: 768px) 100vw, (max-width: 1200px) 80vw, 60vw"
className={cn("block object-contain", !loaded && "invisible", className)}
onLoad={() => {
if (typeof src === "string") setLoadedSrc(src);
onLoad?.();
}}
onError={() => {
if (typeof src === "string") setErrorSrc(src);
onError?.();
}}
unoptimized={false}
{...props}
/>
fill
src={src || ""}
alt={alt}
sizes="(max-width: 768px) 100vw, (max-width: 1200px) 80vw, 60vw"
className={cn("block object-contain", !loaded && "invisible", className)}
onLoad={() => {
if (typeof src === "string") setLoadedSrc(src);
onLoad?.();
}}
onError={() => {
if (typeof src === "string") setErrorSrc(src);
onError?.();
}}
unoptimized={false}
{...props}
/>
)}
</div>
);
@ -162,8 +162,8 @@ type ImageZoomProps = PropsWithChildren<{
alt?: string;
}>;
/**
 * Tells whether `src` is an inline/in-memory URL.
 *
 * Returns true only when `src` is a non-empty string starting with "data:" or
 * "blob:"; undefined, empty and non-string values yield false.
 *
 * NOTE(review): the two body statements appear twice; the second pair sits
 * after an unconditional return and is unreachable (looks like the old/new
 * sides of a re-indentation diff) — confirm.
 */
function isDataOrBlobUrl(src: string | undefined): boolean {
if (!src || typeof src !== "string") return false;
return src.startsWith("data:") || src.startsWith("blob:");
if (!src || typeof src !== "string") return false;
return src.startsWith("data:") || src.startsWith("blob:");
}
function ImageZoom({ src, alt = "Image preview", children }: ImageZoomProps) {
const [isMounted, setIsMounted] = useState(false);
@ -216,38 +216,38 @@ function ImageZoom({ src, alt = "Image preview", children }: ImageZoomProps) {
>
{/** biome-ignore lint/performance/noImgElement: data/blob URLs are rendered with a plain img */}
{isDataOrBlobUrl(src) ? (
// biome-ignore lint/performance/noImgElement: data/blob URLs need plain img
<img
data-slot="image-zoom-content"
src={src}
alt={alt}
className="aui-image-zoom-content fade-in zoom-in-95 max-h-[90vh] max-w-[90vw] animate-in object-contain duration-200"
onClick={(e) => {
e.stopPropagation();
handleClose();
}}
onKeyDown={(e) => {
if (e.key === "Enter") {
e.stopPropagation();
handleClose();
}
}}
/>
) : (
// biome-ignore lint/performance/noImgElement: data/blob URLs need plain img
<img
data-slot="image-zoom-content"
src={src}
alt={alt}
className="aui-image-zoom-content fade-in zoom-in-95 max-h-[90vh] max-w-[90vw] animate-in object-contain duration-200"
onClick={(e) => {
e.stopPropagation();
handleClose();
}}
onKeyDown={(e) => {
if (e.key === "Enter") {
e.stopPropagation();
handleClose();
}
}}
/>
) : (
<NextImage
data-slot="image-zoom-content"
fill
src={src}
alt={alt}
sizes="90vw"
className="aui-image-zoom-content fade-in zoom-in-95 object-contain duration-200"
onClick={(e) => {
e.stopPropagation();
handleClose();
}}
unoptimized={false}
/>
)}
data-slot="image-zoom-content"
fill
src={src}
alt={alt}
sizes="90vw"
className="aui-image-zoom-content fade-in zoom-in-95 object-contain duration-200"
onClick={(e) => {
e.stopPropagation();
handleClose();
}}
unoptimized={false}
/>
)}
</button>,
document.body
)}

View file

@ -241,9 +241,7 @@ const ThreadListItemComponent = memo(function ThreadListItemComponent({
<MessageSquareIcon className="size-4 shrink-0 text-muted-foreground" />
<div className="flex-1 min-w-0">
<p className="truncate text-sm font-medium">{thread.title || "New Chat"}</p>
<p className="truncate text-xs text-muted-foreground">
{relativeTime}
</p>
<p className="truncate text-xs text-muted-foreground">{relativeTime}</p>
</div>
<DropdownMenu>
<DropdownMenuTrigger asChild>

View file

@ -26,7 +26,8 @@ export const ToolFallback: ToolCallMessagePartComponent = ({
);
const serializedResult = useMemo(
() => (result !== undefined && typeof result !== "string" ? JSON.stringify(result, null, 2) : null),
() =>
result !== undefined && typeof result !== "string" ? JSON.stringify(result, null, 2) : null,
[result]
);

View file

@ -1,6 +1,6 @@
"use client";
import { ArrowUp, Send, X } from "lucide-react";
import { ArrowUp } from "lucide-react";
import { useCallback, useEffect, useRef, useState } from "react";
import { Button } from "@/components/ui/button";
import { Popover, PopoverAnchor, PopoverContent } from "@/components/ui/popover";
@ -307,7 +307,6 @@ export function CommentComposer({
onClick={onCancel}
disabled={isSubmitting}
>
<X className="mr-1 size-4" />
Cancel
</Button>
)}
@ -318,14 +317,7 @@ export function CommentComposer({
disabled={!canSubmit}
className={cn(!canSubmit && "opacity-50", compact && "size-8 shrink-0 rounded-full")}
>
{compact ? (
<ArrowUp className="size-4" />
) : (
<>
<Send className="mr-1 size-4" />
{submitLabel}
</>
)}
{compact ? <ArrowUp className="size-4" /> : submitLabel}
</Button>
</div>
</div>

View file

@ -1,6 +1,6 @@
"use client";
import { MoreHorizontal, Pencil, Trash2 } from "lucide-react";
import { MoreHorizontal, PenLine, Trash2 } from "lucide-react";
import { Button } from "@/components/ui/button";
import {
DropdownMenu,
@ -21,15 +21,15 @@ export function CommentActions({ canEdit, canDelete, onEdit, onDelete }: Comment
<Button
variant="ghost"
size="icon"
className="size-7 opacity-100 md:opacity-0 md:group-hover:opacity-100 transition-opacity"
className="size-7 text-muted-foreground opacity-100 md:opacity-0 md:group-hover:opacity-100 transition-opacity"
>
<MoreHorizontal className="size-4 text-muted-foreground" />
<MoreHorizontal className="size-4" />
</Button>
</DropdownMenuTrigger>
<DropdownMenuContent align="end">
{canEdit && (
<DropdownMenuItem onClick={onEdit}>
<Pencil className="mr-2 size-4" />
<PenLine className="mr-2 size-4" />
Edit
</DropdownMenuItem>
)}

View file

@ -198,7 +198,7 @@ export function CommentItem({
<CommentComposer
members={members}
membersLoading={membersLoading}
placeholder="Edit your comment..."
placeholder="Edit your comment"
submitLabel="Save"
isSubmitting={isSubmitting}
onSubmit={handleEditSubmit}

View file

@ -106,7 +106,9 @@ export const DocumentNode = React.memo(function DocumentNode({
const isProcessing = statusState === "pending" || statusState === "processing";
const [dropdownOpen, setDropdownOpen] = useState(false);
const [exporting, setExporting] = useState<string | null>(null);
const [titleTooltipOpen, setTitleTooltipOpen] = useState(false);
const rowRef = useRef<HTMLDivElement>(null);
const titleRef = useRef<HTMLSpanElement>(null);
const handleExport = useCallback(
(format: string) => {
@ -118,6 +120,14 @@ export const DocumentNode = React.memo(function DocumentNode({
[doc, onExport]
);
const handleTitleTooltipOpenChange = useCallback((open: boolean) => {
	// Closing, or no title element mounted yet: the tooltip stays closed.
	const titleEl = titleRef.current;
	if (!open || !titleEl) {
		setTitleTooltipOpen(false);
		return;
	}
	// Open only when the title's content is wider than its box,
	// i.e. the text is actually being clipped.
	setTitleTooltipOpen(titleEl.scrollWidth > titleEl.clientWidth);
}, []);
const attachRef = useCallback(
(node: HTMLDivElement | null) => {
(rowRef as React.MutableRefObject<HTMLDivElement | null>).current = node;
@ -197,7 +207,20 @@ export const DocumentNode = React.memo(function DocumentNode({
);
})()}
<span className="flex-1 min-w-0 truncate">{doc.title}</span>
<Tooltip
delayDuration={600}
open={titleTooltipOpen}
onOpenChange={handleTitleTooltipOpenChange}
>
<TooltipTrigger asChild>
<span ref={titleRef} className="flex-1 min-w-0 truncate">
{doc.title}
</span>
</TooltipTrigger>
<TooltipContent side="bottom" className="max-w-xs break-words">
{doc.title}
</TooltipContent>
</Tooltip>
{getDocumentTypeIcon(
doc.document_type as DocumentTypeEnum,
@ -259,11 +282,7 @@ export const DocumentNode = React.memo(function DocumentNode({
Versions
</DropdownMenuItem>
)}
<DropdownMenuItem
className="text-destructive focus:text-destructive"
disabled={isProcessing}
onClick={() => onDelete(doc)}
>
<DropdownMenuItem disabled={isProcessing} onClick={() => onDelete(doc)}>
<Trash2 className="mr-2 h-4 w-4" />
Delete
</DropdownMenuItem>
@ -305,11 +324,7 @@ export const DocumentNode = React.memo(function DocumentNode({
Versions
</ContextMenuItem>
)}
<ContextMenuItem
className="text-destructive focus:text-destructive"
disabled={isProcessing}
onClick={() => onDelete(doc)}
>
<ContextMenuItem disabled={isProcessing} onClick={() => onDelete(doc)}>
<Trash2 className="mr-2 h-4 w-4" />
Delete
</ContextMenuItem>

View file

@ -56,7 +56,6 @@ interface FolderNodeProps {
depth: number;
isExpanded: boolean;
isRenaming: boolean;
childCount: number;
selectionState: FolderSelectionState;
processingState: "idle" | "processing" | "failed";
onToggleSelect: (folderId: number, selectAll: boolean) => void;
@ -101,7 +100,6 @@ export const FolderNode = React.memo(function FolderNode({
depth,
isExpanded,
isRenaming,
childCount,
selectionState,
processingState,
onToggleSelect,
@ -336,12 +334,6 @@ export const FolderNode = React.memo(function FolderNode({
<span className="flex-1 min-w-0 truncate">{folder.name}</span>
)}
{!isRenaming && childCount > 0 && (
<span className="shrink-0 text-[10px] text-muted-foreground tabular-nums">
{childCount}
</span>
)}
{!isRenaming && (
<DropdownMenu>
<DropdownMenuTrigger asChild>

View file

@ -86,16 +86,6 @@ export function FolderTreeView({
const docsByFolder = useMemo(() => groupBy(documents, (d) => d.folderId ?? "root"), [documents]);
// Direct (non-recursive) child count per folder id: entries found under the
// folder's id in foldersByParent (presumably its sub-folders — confirm) plus
// the documents grouped under that folder id in docsByFolder.
const folderChildCounts = useMemo(() => {
const counts: Record<number, number> = {};
for (const f of folders) {
// A folder with no entry in either map counts as empty.
const children = foldersByParent[f.id] ?? [];
const docs = docsByFolder[f.id] ?? [];
counts[f.id] = children.length + docs.length;
}
return counts;
}, [folders, foldersByParent, docsByFolder]);
const [openContextMenuId, setOpenContextMenuId] = useState<string | null>(null);
// Single subscription for rename state — derived boolean passed to each FolderNode
@ -106,14 +96,26 @@ export function FolderTreeView({
);
const handleCancelRename = useCallback(() => setRenamingFolderId(null), [setRenamingFolderId]);
const effectiveActiveTypes = useMemo(() => {
	// Selecting "FILE" implicitly selects "LOCAL_FOLDER_FILE" as well, unless
	// the caller already listed it. In every other case the incoming array is
	// returned as-is (same reference).
	const needsLocalFolderFiles =
		activeTypes.includes("FILE" as DocumentTypeEnum) &&
		!activeTypes.includes("LOCAL_FOLDER_FILE" as DocumentTypeEnum);
	return needsLocalFolderFiles
		? [...activeTypes, "LOCAL_FOLDER_FILE" as DocumentTypeEnum]
		: activeTypes;
}, [activeTypes]);
const hasDescendantMatch = useMemo(() => {
if (activeTypes.length === 0 && !searchQuery) return null;
if (effectiveActiveTypes.length === 0 && !searchQuery) return null;
const match: Record<number, boolean> = {};
function check(folderId: number): boolean {
if (match[folderId] !== undefined) return match[folderId];
const childDocs = (docsByFolder[folderId] ?? []).some(
(d) => activeTypes.length === 0 || activeTypes.includes(d.document_type as DocumentTypeEnum)
(d) =>
effectiveActiveTypes.length === 0 ||
effectiveActiveTypes.includes(d.document_type as DocumentTypeEnum)
);
if (childDocs) {
match[folderId] = true;
@ -134,7 +136,7 @@ export function FolderTreeView({
check(f.id);
}
return match;
}, [folders, docsByFolder, foldersByParent, activeTypes, searchQuery]);
}, [folders, docsByFolder, foldersByParent, effectiveActiveTypes, searchQuery]);
const folderSelectionStates = useMemo(() => {
const states: Record<number, FolderSelectionState> = {};
@ -204,7 +206,9 @@ export function FolderTreeView({
? childFolders.filter((f) => hasDescendantMatch[f.id])
: childFolders;
const childDocs = (docsByFolder[key] ?? []).filter(
(d) => activeTypes.length === 0 || activeTypes.includes(d.document_type as DocumentTypeEnum)
(d) =>
effectiveActiveTypes.length === 0 ||
effectiveActiveTypes.includes(d.document_type as DocumentTypeEnum)
);
const nodes: React.ReactNode[] = [];
@ -226,7 +230,6 @@ export function FolderTreeView({
depth={depth}
isExpanded={isExpanded}
isRenaming={renamingFolderId === f.id}
childCount={folderChildCounts[f.id] ?? 0}
selectionState={folderSelectionStates[f.id] ?? "none"}
processingState={folderProcessingStates[f.id] ?? "idle"}
onToggleSelect={onToggleFolderSelect}
@ -289,7 +292,7 @@ export function FolderTreeView({
);
}
if (treeNodes.length === 0 && (activeTypes.length > 0 || searchQuery)) {
if (treeNodes.length === 0 && (effectiveActiveTypes.length > 0 || searchQuery)) {
return (
<div className="flex flex-1 flex-col items-center justify-center gap-3 px-4 py-12 text-muted-foreground">
<Search className="h-10 w-10" />

View file

@ -11,13 +11,12 @@ import { MarkdownViewer } from "@/components/markdown-viewer";
import { Alert, AlertDescription } from "@/components/ui/alert";
import { Button } from "@/components/ui/button";
import { Drawer, DrawerContent, DrawerHandle, DrawerTitle } from "@/components/ui/drawer";
import { Skeleton } from "@/components/ui/skeleton";
import { useMediaQuery } from "@/hooks/use-media-query";
import { authenticatedFetch, getBearerToken, redirectToLogin } from "@/lib/auth-utils";
// PlateEditor is loaded through dynamic() with server-side rendering turned
// off; the `loading` option supplies the placeholder shown until the module's
// named PlateEditor export has been resolved.
// NOTE(review): two option objects follow the loader (a Skeleton placeholder
// and an EditorPanelSkeleton placeholder); they look like the old/new sides of
// a diff — confirm which one is meant to remain.
const PlateEditor = dynamic(
() => import("@/components/editor/plate-editor").then((m) => ({ default: m.PlateEditor })),
{ ssr: false, loading: () => <Skeleton className="h-64 w-full" /> }
{ ssr: false, loading: () => <EditorPanelSkeleton /> }
);
const LARGE_DOCUMENT_THRESHOLD = 2 * 1024 * 1024; // 2MB

View file

@ -158,17 +158,18 @@ export function PlateEditor({
// When not forced read-only, the user can toggle between editing/viewing.
const canToggleMode = !readOnly;
// Value passed to EditorSaveContext.Provider. Wrapped in useMemo so a new
// object is only created when onSave, hasUnsavedChanges, isSaving or
// canToggleMode changes.
// NOTE(review): the const is declared twice with the same fields and
// dependencies, differing only in formatting (looks like the old/new sides of
// a diff) — confirm only one declaration is meant to remain.
const contextProviderValue = useMemo(()=> ({
onSave,
hasUnsavedChanges,
isSaving,
canToggleMode,
}), [onSave, hasUnsavedChanges, isSaving, canToggleMode]);
const contextProviderValue = useMemo(
() => ({
onSave,
hasUnsavedChanges,
isSaving,
canToggleMode,
}),
[onSave, hasUnsavedChanges, isSaving, canToggleMode]
);
return (
<EditorSaveContext.Provider
value={contextProviderValue}
>
<EditorSaveContext.Provider value={contextProviderValue}>
<Plate
editor={editor}
// Only pass readOnly as a controlled prop when forced (permanently read-only).

View file

@ -1,7 +1,7 @@
"use client";
import Image from 'next/image';
import { AnimatePresence, motion } from "motion/react";
import Image from "next/image";
import { ExpandedGifOverlay, useExpandedGif } from "@/components/ui/expanded-gif-overlay";
const useCases = [
@ -83,13 +83,13 @@ function UseCaseCard({
className="w-full rounded-xl object-cover transition-transform duration-500 group-hover:scale-[1.02]"
/>
<div className="relative w-full h-48">
<Image
src={src}
alt={title}
fill
className="rounded-xl object-cover transition-transform duration-500 group-hover:scale-[1.02]"
unoptimized={src.endsWith('.gif')}
/>
<Image
src={src}
alt={title}
fill
className="rounded-xl object-cover transition-transform duration-500 group-hover:scale-[1.02]"
unoptimized={src.endsWith(".gif")}
/>
</div>
</div>
<div className="px-5 py-4">

View file

@ -347,35 +347,38 @@ export function LayoutDataProvider({ searchSpaceId, children }: LayoutDataProvid
// Navigation items
const navItems: NavItem[] = useMemo(
() => [
{
title: "Inbox",
url: "#inbox",
icon: Inbox,
isActive: isInboxSidebarOpen,
badge: totalUnreadCount > 0 ? formatInboxCount(totalUnreadCount) : undefined,
},
{
title: "Documents",
url: "#documents",
icon: SquareLibrary,
isActive: isMobile
? isDocumentsSidebarOpen
: isDocumentsSidebarOpen && !isRightPanelCollapsed,
},
{
title: "Announcements",
url: "#announcements",
icon: Megaphone,
isActive: isAnnouncementsSidebarOpen,
badge: announcementUnreadCount > 0 ? formatInboxCount(announcementUnreadCount) : undefined,
},
],
() =>
(
[
{
title: "Inbox",
url: "#inbox",
icon: Inbox,
isActive: isInboxSidebarOpen,
badge: totalUnreadCount > 0 ? formatInboxCount(totalUnreadCount) : undefined,
},
isMobile
? {
title: "Documents",
url: "#documents",
icon: SquareLibrary,
isActive: isDocumentsSidebarOpen,
}
: null,
{
title: "Announcements",
url: "#announcements",
icon: Megaphone,
isActive: isAnnouncementsSidebarOpen,
badge:
announcementUnreadCount > 0 ? formatInboxCount(announcementUnreadCount) : undefined,
},
] as (NavItem | null)[]
).filter((item): item is NavItem => item !== null),
[
isMobile,
isInboxSidebarOpen,
isDocumentsSidebarOpen,
isRightPanelCollapsed,
totalUnreadCount,
isAnnouncementsSidebarOpen,
announcementUnreadCount,

View file

@ -82,7 +82,7 @@ export function CreateSearchSpaceDialog({ open, onOpenChange }: CreateSearchSpac
return (
<Dialog open={open} onOpenChange={handleOpenChange}>
<DialogContent className="max-w-[90vw] sm:max-w-sm p-4 sm:p-5 data-[state=open]:animate-none data-[state=closed]:animate-none">
<DialogContent className="max-w-[90vw] sm:max-w-sm p-4 sm:p-5 select-none data-[state=open]:animate-none data-[state=closed]:animate-none">
<DialogHeader className="space-y-2 pb-2">
<div className="flex items-center gap-2 sm:gap-3">
<div className="flex-1 min-w-0">
@ -107,7 +107,7 @@ export function CreateSearchSpaceDialog({ open, onOpenChange }: CreateSearchSpac
placeholder={t("name_placeholder")}
{...field}
autoFocus
className="text-sm h-9 sm:h-10"
className="text-sm h-9 sm:h-10 select-text"
/>
</FormControl>
<FormMessage />
@ -130,7 +130,7 @@ export function CreateSearchSpaceDialog({ open, onOpenChange }: CreateSearchSpac
<Input
placeholder={t("description_placeholder")}
{...field}
className="text-sm h-9 sm:h-10"
className="text-sm h-9 sm:h-10 select-text"
/>
</FormControl>
<FormMessage />

View file

@ -10,7 +10,6 @@ import { documentsSidebarOpenAtom } from "@/atoms/documents/ui.atoms";
import { closeEditorPanelAtom, editorPanelAtom } from "@/atoms/editor/editor-panel.atom";
import { rightPanelCollapsedAtom, rightPanelTabAtom } from "@/atoms/layout/right-panel.atom";
import { Button } from "@/components/ui/button";
import { Skeleton } from "@/components/ui/skeleton";
import { Tooltip, TooltipContent, TooltipTrigger } from "@/components/ui/tooltip";
import { DocumentsSidebar } from "../sidebar";
@ -27,7 +26,7 @@ const HitlEditPanelContent = dynamic(
import("@/components/hitl-edit-panel/hitl-edit-panel").then((m) => ({
default: m.HitlEditPanelContent,
})),
{ ssr: false, loading: () => <Skeleton className="h-96 w-full" /> }
{ ssr: false, loading: () => null }
);
const ReportPanelContent = dynamic(
@ -35,7 +34,7 @@ const ReportPanelContent = dynamic(
import("@/components/report-panel/report-panel").then((m) => ({
default: m.ReportPanelContent,
})),
{ ssr: false, loading: () => <Skeleton className="h-96 w-full" /> }
{ ssr: false, loading: () => null }
);
interface RightPanelProps {
@ -78,14 +77,14 @@ export function RightPanelExpandButton() {
if (!collapsed || !hasContent) return null;
return (
<div className="flex shrink-0 items-center px-1">
<div className="flex shrink-0 items-center px-0.5">
<Tooltip>
<TooltipTrigger asChild>
<Button
variant="ghost"
size="icon"
onClick={() => startTransition(() => setCollapsed(false))}
className="h-7 w-7 shrink-0"
className="h-8 w-8 shrink-0 -m-0.5"
>
<PanelRight className="h-4 w-4" />
<span className="sr-only">Expand panel</span>

View file

@ -376,7 +376,7 @@ export function AllPrivateChatsSidebarContent({
<span className="truncate">{thread.title || "New Chat"}</span>
</button>
) : (
<Tooltip>
<Tooltip delayDuration={600}>
<TooltipTrigger asChild>
<button
type="button"

View file

@ -375,7 +375,7 @@ export function AllSharedChatsSidebarContent({
<span className="truncate">{thread.title || "New Chat"}</span>
</button>
) : (
<Tooltip>
<Tooltip delayDuration={600}>
<TooltipTrigger asChild>
<button
type="button"

View file

@ -530,7 +530,8 @@ export function DocumentsSidebar({
// Number of tree documents per document type, recomputed when treeDocuments
// changes.
const typeCounts = useMemo(() => {
const counts: Partial<Record<string, number>> = {};
for (const d of treeDocuments) {
// NOTE(review): this line and the two after it each add 1 for the same
// document; they look like the old/new sides of a diff — confirm that only
// one of the two counting strategies is meant to run.
counts[d.document_type] = (counts[d.document_type] || 0) + 1;
// "LOCAL_FOLDER_FILE" documents are tallied under the "FILE" key.
const displayType = d.document_type === "LOCAL_FOLDER_FILE" ? "FILE" : d.document_type;
counts[displayType] = (counts[displayType] || 0) + 1;
}
return counts;
}, [treeDocuments]);
@ -745,7 +746,7 @@ export function DocumentsSidebar({
</button>
</div>
<div className="flex-1 min-h-0 overflow-x-hidden pt-0 flex flex-col">
<div className="flex-1 min-h-0 pt-0 flex flex-col">
<div className="px-4 pb-2">
<DocumentsFilters
typeCounts={typeCounts}

View file

@ -790,36 +790,23 @@ export function InboxSidebarContent({
</DropdownMenuContent>
</DropdownMenu>
)}
{isMobile ? (
<Button
variant="ghost"
size="icon"
className="h-7 w-7 rounded-full"
onClick={handleMarkAllAsRead}
disabled={totalUnreadCount === 0}
>
<CheckCheck className="h-4 w-4 text-muted-foreground" />
<span className="sr-only">{t("mark_all_read") || "Mark all as read"}</span>
</Button>
) : (
<Tooltip>
<TooltipTrigger asChild>
<Button
variant="ghost"
size="icon"
className="h-7 w-7 rounded-full"
onClick={handleMarkAllAsRead}
disabled={totalUnreadCount === 0}
>
<CheckCheck className="h-4 w-4 text-muted-foreground" />
<span className="sr-only">{t("mark_all_read") || "Mark all as read"}</span>
</Button>
</TooltipTrigger>
<TooltipContent className="z-80">
{t("mark_all_read") || "Mark all as read"}
</TooltipContent>
</Tooltip>
)}
<Tooltip>
<TooltipTrigger asChild>
<Button
variant="ghost"
size="icon"
className="h-7 w-7 rounded-full"
onClick={handleMarkAllAsRead}
disabled={totalUnreadCount === 0}
>
<CheckCheck className="h-4 w-4 text-muted-foreground" />
<span className="sr-only">{t("mark_all_read") || "Mark all as read"}</span>
</Button>
</TooltipTrigger>
<TooltipContent className="z-80">
{t("mark_all_read") || "Mark all as read"}
</TooltipContent>
</Tooltip>
</div>
</div>
@ -932,30 +919,8 @@ export function InboxSidebarContent({
)}
style={{ contentVisibility: "auto", containIntrinsicSize: "0 80px" }}
>
{isMobile ? (
<button
type="button"
onClick={() => handleItemClick(item)}
disabled={isMarkingAsRead}
className="flex items-center gap-3 flex-1 min-w-0 text-left overflow-hidden"
>
<div className="shrink-0">{getStatusIcon(item)}</div>
<div className="flex-1 min-w-0 overflow-hidden">
<p
className={cn(
"text-xs font-medium line-clamp-2",
!item.read && "font-semibold"
)}
>
{item.title}
</p>
<p className="text-[11px] text-muted-foreground line-clamp-2 mt-0.5">
{convertRenderedToDisplay(item.message)}
</p>
</div>
</button>
) : (
<Tooltip>
{activeTab === "status" ? (
<Tooltip delayDuration={600}>
<TooltipTrigger asChild>
<button
type="button"
@ -986,6 +951,28 @@ export function InboxSidebarContent({
</p>
</TooltipContent>
</Tooltip>
) : (
<button
type="button"
onClick={() => handleItemClick(item)}
disabled={isMarkingAsRead}
className="flex items-center gap-3 flex-1 min-w-0 text-left overflow-hidden"
>
<div className="shrink-0">{getStatusIcon(item)}</div>
<div className="flex-1 min-w-0 overflow-hidden">
<p
className={cn(
"text-xs font-medium line-clamp-2",
!item.read && "font-semibold"
)}
>
{item.title}
</p>
<p className="text-[11px] text-muted-foreground line-clamp-2 mt-0.5">
{convertRenderedToDisplay(item.message)}
</p>
</div>
</button>
)}
<div className="flex items-center justify-end gap-1.5 shrink-0 w-10">

View file

@ -35,7 +35,7 @@ export function PageUsageDisplay({ pagesUsed, pagesLimit }: PageUsageDisplayProp
<Progress value={usagePercentage} className="h-1.5" />
<Link
href={`/dashboard/${searchSpaceId}/more-pages`}
className="group flex w-full items-center justify-between rounded-md px-1.5 py-1 -mx-1.5 transition-colors hover:bg-accent"
className="group flex w-[calc(100%+0.75rem)] items-center justify-between rounded-md px-1.5 py-1 -mx-1.5 transition-colors hover:bg-accent"
>
<span className="flex items-center gap-1.5 text-xs text-muted-foreground group-hover:text-accent-foreground">
<Zap className="h-3 w-3 shrink-0" />
@ -48,7 +48,7 @@ export function PageUsageDisplay({ pagesUsed, pagesLimit }: PageUsageDisplayProp
{pageBuyingEnabled && (
<Link
href={`/dashboard/${searchSpaceId}/buy-pages`}
className="group flex w-full items-center justify-between rounded-md px-1.5 py-1 -mx-1.5 transition-colors hover:bg-accent"
className="group flex w-[calc(100%+0.75rem)] items-center justify-between rounded-md px-1.5 py-1 -mx-1.5 transition-colors hover:bg-accent"
>
<span className="flex items-center gap-1.5 text-xs text-muted-foreground group-hover:text-accent-foreground">
<CreditCard className="h-3 w-3 shrink-0" />

View file

@ -2,9 +2,9 @@ import { createCodePlugin } from "@streamdown/code";
import { createMathPlugin } from "@streamdown/math";
import { Streamdown, type StreamdownProps } from "streamdown";
import "katex/dist/katex.min.css";
import { cn } from "@/lib/utils";
import Image from 'next/image';
import { is } from "drizzle-orm";
import Image from "next/image";
import { cn } from "@/lib/utils";
const code = createCodePlugin({
themes: ["nord", "nord"],
@ -130,30 +130,31 @@ export function MarkdownViewer({ content, className, maxLength }: MarkdownViewer
),
hr: ({ ...props }) => <hr className="my-4 border-muted" {...props} />,
img: ({ src, alt, width: _w, height: _h, ...props }) => {
const isDataOrUnknownUrl = typeof src === "string" && (src.startsWith("data:") || !src.startsWith("http"));
const isDataOrUnknownUrl =
typeof src === "string" && (src.startsWith("data:") || !src.startsWith("http"));
return isDataOrUnknownUrl ? (
// eslint-disable-next-line @next/next/no-img-element
<img
className="max-w-full h-auto my-4 rounded"
alt={alt || "markdown image"}
src={src}
loading="lazy"
{...props}
/>
) : (
<Image
className="max-w-full h-auto my-4 rounded"
alt={alt || "markdown image"}
src={typeof src === "string" ? src : ""}
width={_w || 800}
height={_h || 600}
sizes="(max-width: 768px) 100vw, (max-width: 1200px) 75vw, 60vw"
unoptimized={isDataOrUnknownUrl}
{...props}
/>
);
},
return isDataOrUnknownUrl ? (
// eslint-disable-next-line @next/next/no-img-element
<img
className="max-w-full h-auto my-4 rounded"
alt={alt || "markdown image"}
src={src}
loading="lazy"
{...props}
/>
) : (
<Image
className="max-w-full h-auto my-4 rounded"
alt={alt || "markdown image"}
src={typeof src === "string" ? src : ""}
width={_w || 800}
height={_h || 600}
sizes="(max-width: 768px) 100vw, (max-width: 1200px) 75vw, 60vw"
unoptimized={isDataOrUnknownUrl}
{...props}
/>
);
},
table: ({ ...props }) => (
<div className="overflow-x-auto my-4 rounded-lg border border-border w-full">
<table className="w-full divide-y divide-border" {...props} />

View file

@ -163,21 +163,16 @@ export function ChatShareButton({ thread, onVisibilityChange, className }: ChatS
)}
<Popover open={open} onOpenChange={setOpen}>
<Tooltip>
<TooltipTrigger asChild>
<PopoverTrigger asChild>
<Button
variant="outline"
size="icon"
className="h-8 w-8 md:w-auto md:px-3 md:gap-2 relative bg-muted hover:bg-muted/80 border-0 select-none"
>
<CurrentIcon className="h-4 w-4" />
<span className="hidden md:inline text-sm">{buttonLabel}</span>
</Button>
</PopoverTrigger>
</TooltipTrigger>
<TooltipContent>Share settings</TooltipContent>
</Tooltip>
<PopoverTrigger asChild>
<Button
variant="outline"
size="icon"
className="h-8 w-8 md:w-auto md:px-3 md:gap-2 relative bg-muted hover:bg-muted/80 border-0 select-none"
>
<CurrentIcon className="h-4 w-4" />
<span className="hidden md:inline text-sm">{buttonLabel}</span>
</Button>
</PopoverTrigger>
<PopoverContent
className="w-[280px] md:w-[320px] p-0 rounded-lg shadow-lg border-border/60 dark:bg-neutral-900 dark:border dark:border-white/5 select-none"

View file

@ -1,7 +1,7 @@
"use client";
import { useAtomValue } from "jotai";
import { Bot, Check, ChevronDown, Edit3, ImageIcon, Plus, Zap } from "lucide-react";
import { Bot, Check, ChevronDown, Edit3, ImageIcon, Plus, Search, Zap } from "lucide-react";
import { type UIEvent, useCallback, useMemo, useState } from "react";
import { toast } from "sonner";
import {
@ -344,7 +344,7 @@ export function ModelSelector({
>
<CommandEmpty className="py-8 text-center">
<div className="flex flex-col items-center gap-2">
<Bot className="size-8 text-muted-foreground" />
<Search className="size-8 text-muted-foreground" />
<p className="text-sm text-muted-foreground">No models found</p>
<p className="text-xs text-muted-foreground/60">Try a different search term</p>
</div>
@ -531,8 +531,9 @@ export function ModelSelector({
>
<CommandEmpty className="py-8 text-center">
<div className="flex flex-col items-center gap-2">
<ImageIcon className="size-8 text-muted-foreground" />
<Search className="size-8 text-muted-foreground" />
<p className="text-sm text-muted-foreground">No image models found</p>
<p className="text-xs text-muted-foreground/60">Try a different search term</p>
</div>
</CommandEmpty>

View file

@ -6,10 +6,10 @@ import { useTranslations } from "next-intl";
import { useMemo } from "react";
import { ApiKeyContent } from "@/app/dashboard/[search_space_id]/user-settings/components/ApiKeyContent";
import { CommunityPromptsContent } from "@/app/dashboard/[search_space_id]/user-settings/components/CommunityPromptsContent";
import { DesktopContent } from "@/app/dashboard/[search_space_id]/user-settings/components/DesktopContent";
import { ProfileContent } from "@/app/dashboard/[search_space_id]/user-settings/components/ProfileContent";
import { PromptsContent } from "@/app/dashboard/[search_space_id]/user-settings/components/PromptsContent";
import { PurchaseHistoryContent } from "@/app/dashboard/[search_space_id]/user-settings/components/PurchaseHistoryContent";
import { DesktopContent } from "@/app/dashboard/[search_space_id]/user-settings/components/DesktopContent";
import { userSettingsDialogAtom } from "@/atoms/settings/settings-dialog.atoms";
import { SettingsDialog } from "@/components/settings/settings-dialog";
import { usePlatform } from "@/hooks/use-platform";

View file

@ -433,7 +433,7 @@ export function ImageConfigDialog({
className="relative text-sm h-9 min-w-[120px]"
>
<span className={isSubmitting ? "opacity-0" : ""}>
{mode === "edit" ? "Save Changes" : "Create & Use"}
{mode === "edit" ? "Save Changes" : "Add Model"}
</span>
{isSubmitting && <Spinner size="sm" className="absolute" />}
</Button>

View file

@ -312,7 +312,7 @@ export function ModelConfigDialog({
className="relative text-sm h-9 min-w-[120px]"
>
<span className={isSubmitting ? "opacity-0" : ""}>
{mode === "edit" ? "Save Changes" : "Create & Use"}
{mode === "edit" ? "Save Changes" : "Add Model"}
</span>
{isSubmitting && <Spinner size="sm" className="absolute" />}
</Button>

View file

@ -86,7 +86,6 @@ const FILE_TYPE_CONFIG: Record<string, Record<string, string[]>> = {
"application/rtf": [".rtf"],
"application/xml": [".xml"],
"application/epub+zip": [".epub"],
"text/html": [".html", ".htm", ".web"],
"image/gif": [".gif"],
"image/svg+xml": [".svg"],
...audioFileTypes,
@ -470,8 +469,9 @@ export function DocumentUploadTab({
</button>
))
) : (
<div
className="flex flex-col items-center gap-4 py-12 px-4 cursor-pointer"
<button
type="button"
className="flex flex-col items-center gap-4 py-12 px-4 cursor-pointer w-full bg-transparent border-none"
onClick={() => {
if (!isElectron) fileInputRef.current?.click();
}}
@ -483,10 +483,16 @@ export function DocumentUploadTab({
</p>
<p className="text-sm text-muted-foreground">{t("file_size_limit")}</p>
</div>
<div className="w-full mt-1" onClick={(e) => e.stopPropagation()}>
{/* biome-ignore lint/a11y/useSemanticElements: wrapper to stop click propagation to parent button */}
<div
className="w-full mt-1"
onClick={(e) => e.stopPropagation()}
onKeyDown={(e) => e.stopPropagation()}
role="group"
>
{renderBrowseButton({ fullWidth: true })}
</div>
</div>
</button>
)}
</div>
@ -681,9 +687,13 @@ export function DocumentUploadTab({
</span>
</AccordionTrigger>
<AccordionContent className="px-3 pb-3">
<div className="flex flex-wrap gap-1">
<div className="flex flex-wrap gap-1.5">
{supportedExtensions.map((ext) => (
<Badge key={ext} variant="outline" className="text-[10px] px-1.5 py-0">
<Badge
key={ext}
variant="secondary"
className="rounded border-0 bg-neutral-200/80 dark:bg-neutral-700/60 text-muted-foreground text-[10px] px-2 py-0.5 font-normal"
>
{ext}
</Badge>
))}

View file

@ -2,13 +2,12 @@
import type { LucideIcon } from "lucide-react";
import { Code2, Database, ExternalLink, File, FileText, Globe, Newspaper } from "lucide-react";
import NextImage from "next/image";
import * as React from "react";
import { openSafeNavigationHref, resolveSafeNavigationHref } from "../shared/media";
import { cn, Popover, PopoverContent, PopoverTrigger } from "./_adapter";
import { Citation } from "./citation";
import type { CitationType, CitationVariant, SerializableCitation } from "./schema";
import NextImage from 'next/image';
const TYPE_ICONS: Record<CitationType, LucideIcon> = {
webpage: Globe,
@ -264,9 +263,9 @@ function OverflowItem({ citation, onClick }: OverflowItemProps) {
className="size-4.5 rounded-full object-cover"
unoptimized={true}
/>
) : (
) : (
<TypeIcon className="text-muted-foreground size-3" aria-hidden="true" />
)}
)}
<div className="min-w-0 flex-1">
<p className="group-hover:decoration-foreground/30 truncate text-sm font-medium group-hover:underline group-hover:underline-offset-2">
{citation.title}
@ -341,18 +340,18 @@ function StackedCitations({ id, citations, className, onNavigate }: StackedCitat
style={{ zIndex: maxIcons - index }}
>
{citation.favicon ? (
<NextImage
src={citation.favicon}
alt=""
aria-hidden="true"
width={18}
height={18}
className="size-4.5 rounded-full object-cover"
unoptimized={true}
/>
) : (
<TypeIcon className="text-muted-foreground size-3" aria-hidden="true" />
)}
<NextImage
src={citation.favicon}
alt=""
aria-hidden="true"
width={18}
height={18}
className="size-4.5 rounded-full object-cover"
unoptimized={true}
/>
) : (
<TypeIcon className="text-muted-foreground size-3" aria-hidden="true" />
)}
</div>
);
})}

Some files were not shown because too many files have changed in this diff Show more