mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-04-25 00:36:31 +02:00
Merge pull request #1207 from CREDO23/feat/kb-export-and-folder-upload
Some checks are pending
Build and Push Docker Images / tag_release (push) Waiting to run
Build and Push Docker Images / build (./surfsense_backend, ./surfsense_backend/Dockerfile, backend, surfsense-backend, ubuntu-24.04-arm, linux/arm64, arm64) (push) Blocked by required conditions
Build and Push Docker Images / build (./surfsense_backend, ./surfsense_backend/Dockerfile, backend, surfsense-backend, ubuntu-latest, linux/amd64, amd64) (push) Blocked by required conditions
Build and Push Docker Images / build (./surfsense_web, ./surfsense_web/Dockerfile, web, surfsense-web, ubuntu-24.04-arm, linux/arm64, arm64) (push) Blocked by required conditions
Build and Push Docker Images / build (./surfsense_web, ./surfsense_web/Dockerfile, web, surfsense-web, ubuntu-latest, linux/amd64, amd64) (push) Blocked by required conditions
Build and Push Docker Images / create_manifest (backend, surfsense-backend) (push) Blocked by required conditions
Build and Push Docker Images / create_manifest (web, surfsense-web) (push) Blocked by required conditions
Some checks are pending
Build and Push Docker Images / tag_release (push) Waiting to run
Build and Push Docker Images / build (./surfsense_backend, ./surfsense_backend/Dockerfile, backend, surfsense-backend, ubuntu-24.04-arm, linux/arm64, arm64) (push) Blocked by required conditions
Build and Push Docker Images / build (./surfsense_backend, ./surfsense_backend/Dockerfile, backend, surfsense-backend, ubuntu-latest, linux/amd64, amd64) (push) Blocked by required conditions
Build and Push Docker Images / build (./surfsense_web, ./surfsense_web/Dockerfile, web, surfsense-web, ubuntu-24.04-arm, linux/arm64, arm64) (push) Blocked by required conditions
Build and Push Docker Images / build (./surfsense_web, ./surfsense_web/Dockerfile, web, surfsense-web, ubuntu-latest, linux/amd64, amd64) (push) Blocked by required conditions
Build and Push Docker Images / create_manifest (backend, surfsense-backend) (push) Blocked by required conditions
Build and Push Docker Images / create_manifest (web, surfsense-web) (push) Blocked by required conditions
[Feat] KB Export, Folder Upload & Vision LLM for Image Processing
This commit is contained in:
commit
61b3f0d7e3
47 changed files with 1399 additions and 107 deletions
|
|
@ -0,0 +1,45 @@
|
||||||
|
"""123_add_enable_vision_llm_to_connectors
|
||||||
|
|
||||||
|
Revision ID: 123
|
||||||
|
Revises: 122
|
||||||
|
Create Date: 2026-04-09
|
||||||
|
|
||||||
|
Adds enable_vision_llm boolean column to search_source_connectors.
|
||||||
|
Defaults to False so vision LLM image processing is opt-in.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from collections.abc import Sequence
|
||||||
|
|
||||||
|
import sqlalchemy as sa
|
||||||
|
|
||||||
|
from alembic import op
|
||||||
|
|
||||||
|
# revision identifiers, used by Alembic.
|
||||||
|
revision: str = "123"
|
||||||
|
down_revision: str | None = "122"
|
||||||
|
branch_labels: str | Sequence[str] | None = None
|
||||||
|
depends_on: str | Sequence[str] | None = None
|
||||||
|
|
||||||
|
|
||||||
|
def upgrade() -> None:
    """Add the ``enable_vision_llm`` column to ``search_source_connectors``.

    The column is NOT NULL with a server default of ``false`` so vision-LLM
    image processing stays opt-in for existing connector rows.
    """
    conn = op.get_bind()
    # Inspect the live table first so the migration is idempotent: re-running
    # it (or running it against a database that already has the column, e.g.
    # created fresh from the models) is a no-op instead of an error.
    existing_columns = [
        col["name"] for col in sa.inspect(conn).get_columns("search_source_connectors")
    ]

    if "enable_vision_llm" not in existing_columns:
        op.add_column(
            "search_source_connectors",
            sa.Column(
                "enable_vision_llm",
                sa.Boolean(),
                nullable=False,
                # server_default backfills existing rows so the NOT NULL
                # constraint can be applied in one step.
                server_default=sa.text("false"),
            ),
        )
|
||||||
|
|
||||||
|
|
||||||
|
def downgrade() -> None:
    """Drop the ``enable_vision_llm`` column added in :func:`upgrade`.

    NOTE(review): unlike ``upgrade`` this has no existence check, so it
    assumes the column is present when downgrading.
    """
    op.drop_column("search_source_connectors", "enable_vision_llm")
|
||||||
|
|
@ -44,6 +44,8 @@ async def _export_paper_content(
|
||||||
async def download_and_extract_content(
|
async def download_and_extract_content(
|
||||||
client: DropboxClient,
|
client: DropboxClient,
|
||||||
file: dict[str, Any],
|
file: dict[str, Any],
|
||||||
|
*,
|
||||||
|
vision_llm=None,
|
||||||
) -> tuple[str | None, dict[str, Any], str | None]:
|
) -> tuple[str | None, dict[str, Any], str | None]:
|
||||||
"""Download a Dropbox file and extract its content as markdown.
|
"""Download a Dropbox file and extract its content as markdown.
|
||||||
|
|
||||||
|
|
@ -91,7 +93,7 @@ async def download_and_extract_content(
|
||||||
from app.etl_pipeline.etl_document import EtlRequest
|
from app.etl_pipeline.etl_document import EtlRequest
|
||||||
from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
|
from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
|
||||||
|
|
||||||
result = await EtlPipelineService().extract(
|
result = await EtlPipelineService(vision_llm=vision_llm).extract(
|
||||||
EtlRequest(file_path=temp_file_path, filename=file_name)
|
EtlRequest(file_path=temp_file_path, filename=file_name)
|
||||||
)
|
)
|
||||||
markdown = result.markdown_content
|
markdown = result.markdown_content
|
||||||
|
|
|
||||||
|
|
@ -27,6 +27,8 @@ logger = logging.getLogger(__name__)
|
||||||
async def download_and_extract_content(
|
async def download_and_extract_content(
|
||||||
client: GoogleDriveClient,
|
client: GoogleDriveClient,
|
||||||
file: dict[str, Any],
|
file: dict[str, Any],
|
||||||
|
*,
|
||||||
|
vision_llm=None,
|
||||||
) -> tuple[str | None, dict[str, Any], str | None]:
|
) -> tuple[str | None, dict[str, Any], str | None]:
|
||||||
"""Download a Google Drive file and extract its content as markdown.
|
"""Download a Google Drive file and extract its content as markdown.
|
||||||
|
|
||||||
|
|
@ -103,7 +105,9 @@ async def download_and_extract_content(
|
||||||
etl_filename = (
|
etl_filename = (
|
||||||
file_name + extension if is_google_workspace_file(mime_type) else file_name
|
file_name + extension if is_google_workspace_file(mime_type) else file_name
|
||||||
)
|
)
|
||||||
markdown = await _parse_file_to_markdown(temp_file_path, etl_filename)
|
markdown = await _parse_file_to_markdown(
|
||||||
|
temp_file_path, etl_filename, vision_llm=vision_llm
|
||||||
|
)
|
||||||
return markdown, drive_metadata, None
|
return markdown, drive_metadata, None
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|
@ -115,12 +119,14 @@ async def download_and_extract_content(
|
||||||
os.unlink(temp_file_path)
|
os.unlink(temp_file_path)
|
||||||
|
|
||||||
|
|
||||||
async def _parse_file_to_markdown(file_path: str, filename: str) -> str:
|
async def _parse_file_to_markdown(
|
||||||
|
file_path: str, filename: str, *, vision_llm=None
|
||||||
|
) -> str:
|
||||||
"""Parse a local file to markdown using the unified ETL pipeline."""
|
"""Parse a local file to markdown using the unified ETL pipeline."""
|
||||||
from app.etl_pipeline.etl_document import EtlRequest
|
from app.etl_pipeline.etl_document import EtlRequest
|
||||||
from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
|
from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
|
||||||
|
|
||||||
result = await EtlPipelineService().extract(
|
result = await EtlPipelineService(vision_llm=vision_llm).extract(
|
||||||
EtlRequest(file_path=file_path, filename=filename)
|
EtlRequest(file_path=file_path, filename=filename)
|
||||||
)
|
)
|
||||||
return result.markdown_content
|
return result.markdown_content
|
||||||
|
|
|
||||||
|
|
@ -16,6 +16,8 @@ logger = logging.getLogger(__name__)
|
||||||
async def download_and_extract_content(
|
async def download_and_extract_content(
|
||||||
client: OneDriveClient,
|
client: OneDriveClient,
|
||||||
file: dict[str, Any],
|
file: dict[str, Any],
|
||||||
|
*,
|
||||||
|
vision_llm=None,
|
||||||
) -> tuple[str | None, dict[str, Any], str | None]:
|
) -> tuple[str | None, dict[str, Any], str | None]:
|
||||||
"""Download a OneDrive file and extract its content as markdown.
|
"""Download a OneDrive file and extract its content as markdown.
|
||||||
|
|
||||||
|
|
@ -65,7 +67,9 @@ async def download_and_extract_content(
|
||||||
if error:
|
if error:
|
||||||
return None, metadata, error
|
return None, metadata, error
|
||||||
|
|
||||||
markdown = await _parse_file_to_markdown(temp_file_path, file_name)
|
markdown = await _parse_file_to_markdown(
|
||||||
|
temp_file_path, file_name, vision_llm=vision_llm
|
||||||
|
)
|
||||||
return markdown, metadata, None
|
return markdown, metadata, None
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|
@ -77,12 +81,14 @@ async def download_and_extract_content(
|
||||||
os.unlink(temp_file_path)
|
os.unlink(temp_file_path)
|
||||||
|
|
||||||
|
|
||||||
async def _parse_file_to_markdown(file_path: str, filename: str) -> str:
|
async def _parse_file_to_markdown(
|
||||||
|
file_path: str, filename: str, *, vision_llm=None
|
||||||
|
) -> str:
|
||||||
"""Parse a local file to markdown using the unified ETL pipeline."""
|
"""Parse a local file to markdown using the unified ETL pipeline."""
|
||||||
from app.etl_pipeline.etl_document import EtlRequest
|
from app.etl_pipeline.etl_document import EtlRequest
|
||||||
from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
|
from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
|
||||||
|
|
||||||
result = await EtlPipelineService().extract(
|
result = await EtlPipelineService(vision_llm=vision_llm).extract(
|
||||||
EtlRequest(file_path=file_path, filename=filename)
|
EtlRequest(file_path=file_path, filename=filename)
|
||||||
)
|
)
|
||||||
return result.markdown_content
|
return result.markdown_content
|
||||||
|
|
|
||||||
|
|
@ -1450,6 +1450,13 @@ class SearchSourceConnector(BaseModel, TimestampMixin):
|
||||||
Boolean, nullable=False, default=False, server_default="false"
|
Boolean, nullable=False, default=False, server_default="false"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Vision LLM for image files - disabled by default to save cost/time.
|
||||||
|
# When enabled, images are described via a vision language model instead
|
||||||
|
# of falling back to the document parser.
|
||||||
|
enable_vision_llm = Column(
|
||||||
|
Boolean, nullable=False, default=False, server_default="false"
|
||||||
|
)
|
||||||
|
|
||||||
# Periodic indexing fields
|
# Periodic indexing fields
|
||||||
periodic_indexing_enabled = Column(Boolean, nullable=False, default=False)
|
periodic_indexing_enabled = Column(Boolean, nullable=False, default=False)
|
||||||
indexing_frequency_minutes = Column(Integer, nullable=True)
|
indexing_frequency_minutes = Column(Integer, nullable=True)
|
||||||
|
|
|
||||||
|
|
@ -15,6 +15,9 @@ from app.etl_pipeline.parsers.plaintext import read_plaintext
|
||||||
class EtlPipelineService:
|
class EtlPipelineService:
|
||||||
"""Single pipeline for extracting markdown from files. All callers use this."""
|
"""Single pipeline for extracting markdown from files. All callers use this."""
|
||||||
|
|
||||||
|
    def __init__(self, *, vision_llm=None):
        # Optional vision-capable chat model (LangChain-style, used via
        # `ainvoke` downstream — presumably; confirm against parsers.vision_llm).
        # When set, image files are described by this model instead of going
        # straight to the document parser. Keyword-only so call sites are
        # explicit about enabling vision processing.
        self._vision_llm = vision_llm
|
||||||
|
|
||||||
async def extract(self, request: EtlRequest) -> EtlResult:
|
async def extract(self, request: EtlRequest) -> EtlResult:
|
||||||
category = classify_file(request.filename)
|
category = classify_file(request.filename)
|
||||||
|
|
||||||
|
|
@ -47,8 +50,45 @@ class EtlPipelineService:
|
||||||
content_type="audio",
|
content_type="audio",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if category == FileCategory.IMAGE:
|
||||||
|
return await self._extract_image(request)
|
||||||
|
|
||||||
return await self._extract_document(request)
|
return await self._extract_document(request)
|
||||||
|
|
||||||
|
    async def _extract_image(self, request: EtlRequest) -> EtlResult:
        """Extract markdown from an image file.

        Strategy: if a vision LLM was configured, describe the image with it;
        on any vision failure — or when no LLM is configured — fall back to
        the regular document parser. If that fallback also rejects the file,
        raise one consolidated EtlUnsupportedFileError explaining both paths.
        """
        if self._vision_llm:
            try:
                # Imported lazily so the vision parser (and its langchain
                # dependency) is only loaded when vision is actually used.
                from app.etl_pipeline.parsers.vision_llm import parse_with_vision_llm

                content = await parse_with_vision_llm(
                    request.file_path, request.filename, self._vision_llm
                )
                return EtlResult(
                    markdown_content=content,
                    etl_service="VISION_LLM",
                    content_type="image",
                )
            except Exception:
                # Broad catch is deliberate: any vision failure (oversized
                # image, unsupported extension, model/network error, timeout)
                # degrades to the document parser instead of failing the file.
                logging.warning(
                    "Vision LLM failed for %s, falling back to document parser",
                    request.filename,
                    exc_info=True,
                )
        else:
            logging.info(
                "No vision LLM provided, falling back to document parser for %s",
                request.filename,
            )

        try:
            return await self._extract_document(request)
        except (EtlUnsupportedFileError, EtlServiceUnavailableError):
            # `from None` suppresses the fallback's own traceback; the message
            # below already describes which of the two paths failed.
            raise EtlUnsupportedFileError(
                f"Cannot process image {request.filename}: vision LLM "
                f"{'failed' if self._vision_llm else 'not configured'} and "
                f"document parser does not support this format"
            ) from None
|
||||||
|
|
||||||
async def _extract_document(self, request: EtlRequest) -> EtlResult:
|
async def _extract_document(self, request: EtlRequest) -> EtlResult:
|
||||||
from pathlib import PurePosixPath
|
from pathlib import PurePosixPath
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -3,6 +3,7 @@ from pathlib import PurePosixPath
|
||||||
|
|
||||||
from app.utils.file_extensions import (
|
from app.utils.file_extensions import (
|
||||||
DOCUMENT_EXTENSIONS,
|
DOCUMENT_EXTENSIONS,
|
||||||
|
IMAGE_EXTENSIONS,
|
||||||
get_document_extensions_for_service,
|
get_document_extensions_for_service,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -105,6 +106,7 @@ class FileCategory(Enum):
|
||||||
PLAINTEXT = "plaintext"
|
PLAINTEXT = "plaintext"
|
||||||
AUDIO = "audio"
|
AUDIO = "audio"
|
||||||
DIRECT_CONVERT = "direct_convert"
|
DIRECT_CONVERT = "direct_convert"
|
||||||
|
IMAGE = "image"
|
||||||
UNSUPPORTED = "unsupported"
|
UNSUPPORTED = "unsupported"
|
||||||
DOCUMENT = "document"
|
DOCUMENT = "document"
|
||||||
|
|
||||||
|
|
@ -117,6 +119,8 @@ def classify_file(filename: str) -> FileCategory:
|
||||||
return FileCategory.AUDIO
|
return FileCategory.AUDIO
|
||||||
if suffix in DIRECT_CONVERT_EXTENSIONS:
|
if suffix in DIRECT_CONVERT_EXTENSIONS:
|
||||||
return FileCategory.DIRECT_CONVERT
|
return FileCategory.DIRECT_CONVERT
|
||||||
|
if suffix in IMAGE_EXTENSIONS:
|
||||||
|
return FileCategory.IMAGE
|
||||||
if suffix in DOCUMENT_EXTENSIONS:
|
if suffix in DOCUMENT_EXTENSIONS:
|
||||||
return FileCategory.DOCUMENT
|
return FileCategory.DOCUMENT
|
||||||
return FileCategory.UNSUPPORTED
|
return FileCategory.UNSUPPORTED
|
||||||
|
|
def should_skip_for_service(filename: str, etl_service: str | None) -> bool:
    """Return True if *filename* cannot be processed by *etl_service*.

    Plaintext, audio, and direct-convert files are parser-agnostic and never
    skipped. Image and document files are checked against the per-parser
    extension set (images fall back to the document parser when no vision LLM
    is available, so the same service constraint applies).
    """
    category = classify_file(filename)
    if category == FileCategory.UNSUPPORTED:
        return True
    if category in (FileCategory.DOCUMENT, FileCategory.IMAGE):
        # Capability check: skip only when this service's supported-extension
        # set does not contain the file's (lower-cased) suffix.
        suffix = PurePosixPath(filename).suffix.lower()
        return suffix not in get_document_extensions_for_service(etl_service)
    return False
|
||||||
|
|
|
||||||
64
surfsense_backend/app/etl_pipeline/parsers/vision_llm.py
Normal file
64
surfsense_backend/app/etl_pipeline/parsers/vision_llm.py
Normal file
|
|
@ -0,0 +1,64 @@
|
||||||
|
import asyncio
|
||||||
|
import base64
|
||||||
|
import os
|
||||||
|
|
||||||
|
from langchain_core.messages import HumanMessage
|
||||||
|
|
||||||
|
_PROMPT = (
|
||||||
|
"Describe this image in markdown. "
|
||||||
|
"Transcribe any visible text verbatim. "
|
||||||
|
"Be concise but complete — let the image content guide the level of detail."
|
||||||
|
)
|
||||||
|
|
||||||
|
_MAX_IMAGE_BYTES = (
|
||||||
|
5 * 1024 * 1024
|
||||||
|
) # 5 MB (Anthropic Claude's limit, the most restrictive)
|
||||||
|
|
||||||
|
_INVOKE_TIMEOUT_SECONDS = 120
|
||||||
|
|
||||||
|
_EXT_TO_MIME: dict[str, str] = {
|
||||||
|
".png": "image/png",
|
||||||
|
".jpg": "image/jpeg",
|
||||||
|
".jpeg": "image/jpeg",
|
||||||
|
".gif": "image/gif",
|
||||||
|
".bmp": "image/bmp",
|
||||||
|
".tiff": "image/tiff",
|
||||||
|
".tif": "image/tiff",
|
||||||
|
".webp": "image/webp",
|
||||||
|
".svg": "image/svg+xml",
|
||||||
|
".heic": "image/heic",
|
||||||
|
".heif": "image/heif",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _image_to_data_url(file_path: str) -> str:
|
||||||
|
file_size = os.path.getsize(file_path)
|
||||||
|
if file_size > _MAX_IMAGE_BYTES:
|
||||||
|
raise ValueError(
|
||||||
|
f"Image too large for vision LLM ({file_size / (1024 * 1024):.1f} MB, "
|
||||||
|
f"limit {_MAX_IMAGE_BYTES // (1024 * 1024)} MB): {file_path}"
|
||||||
|
)
|
||||||
|
ext = os.path.splitext(file_path)[1].lower()
|
||||||
|
mime_type = _EXT_TO_MIME.get(ext)
|
||||||
|
if not mime_type:
|
||||||
|
raise ValueError(f"Unsupported image extension {ext!r}: {file_path}")
|
||||||
|
with open(file_path, "rb") as f:
|
||||||
|
encoded = base64.b64encode(f.read()).decode("ascii")
|
||||||
|
return f"data:{mime_type};base64,{encoded}"
|
||||||
|
|
||||||
|
|
||||||
|
async def parse_with_vision_llm(file_path: str, filename: str, llm) -> str:
    """Describe the image at *file_path* as markdown using *llm*.

    Args:
        file_path: Local path to the image; must pass the size/extension
            checks in ``_image_to_data_url``.
        filename: Original (display) name of the file, used in error messages.
        llm: Chat model invoked via ``ainvoke`` with a multimodal message.

    Returns:
        The model's stripped text response.

    Raises:
        ValueError: if the image is unusable or the model returns empty text.
        asyncio.TimeoutError: if the model call exceeds the timeout.
    """
    data_url = _image_to_data_url(file_path)
    message = HumanMessage(
        content=[
            {"type": "text", "text": _PROMPT},
            {"type": "image_url", "image_url": {"url": data_url}},
        ]
    )
    # Bound the model call so one stuck image cannot hang the whole pipeline.
    response = await asyncio.wait_for(
        llm.ainvoke([message]), timeout=_INVOKE_TIMEOUT_SECONDS
    )
    text = response.content if hasattr(response, "content") else str(response)
    if not text or not text.strip():
        # Fix: the message previously hard-coded "(unknown)" in an f-string
        # with no placeholder, leaving the `filename` parameter unused.
        raise ValueError(f"Vision LLM returned empty content for {filename}")
    return text.strip()
|
||||||
|
|
@ -13,6 +13,7 @@ from .discord_add_connector_route import router as discord_add_connector_router
|
||||||
from .documents_routes import router as documents_router
|
from .documents_routes import router as documents_router
|
||||||
from .dropbox_add_connector_route import router as dropbox_add_connector_router
|
from .dropbox_add_connector_route import router as dropbox_add_connector_router
|
||||||
from .editor_routes import router as editor_router
|
from .editor_routes import router as editor_router
|
||||||
|
from .export_routes import router as export_router
|
||||||
from .folders_routes import router as folders_router
|
from .folders_routes import router as folders_router
|
||||||
from .google_calendar_add_connector_route import (
|
from .google_calendar_add_connector_route import (
|
||||||
router as google_calendar_add_connector_router,
|
router as google_calendar_add_connector_router,
|
||||||
|
|
@ -58,6 +59,7 @@ router = APIRouter()
|
||||||
router.include_router(search_spaces_router)
|
router.include_router(search_spaces_router)
|
||||||
router.include_router(rbac_router) # RBAC routes for roles, members, invites
|
router.include_router(rbac_router) # RBAC routes for roles, members, invites
|
||||||
router.include_router(editor_router)
|
router.include_router(editor_router)
|
||||||
|
router.include_router(export_router)
|
||||||
router.include_router(documents_router)
|
router.include_router(documents_router)
|
||||||
router.include_router(folders_router)
|
router.include_router(folders_router)
|
||||||
router.include_router(notes_router)
|
router.include_router(notes_router)
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,7 @@
|
||||||
import asyncio
|
import asyncio
|
||||||
|
|
||||||
from fastapi import APIRouter, Depends, Form, HTTPException, Query, UploadFile
|
from fastapi import APIRouter, Depends, Form, HTTPException, Query, UploadFile
|
||||||
from pydantic import BaseModel as PydanticBaseModel
|
from pydantic import BaseModel as PydanticBaseModel, Field
|
||||||
from sqlalchemy.ext.asyncio import AsyncSession
|
from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
from sqlalchemy.future import select
|
from sqlalchemy.future import select
|
||||||
from sqlalchemy.orm import selectinload
|
from sqlalchemy.orm import selectinload
|
||||||
|
|
@ -123,6 +123,7 @@ async def create_documents_file_upload(
|
||||||
files: list[UploadFile],
|
files: list[UploadFile],
|
||||||
search_space_id: int = Form(...),
|
search_space_id: int = Form(...),
|
||||||
should_summarize: bool = Form(False),
|
should_summarize: bool = Form(False),
|
||||||
|
use_vision_llm: bool = Form(False),
|
||||||
session: AsyncSession = Depends(get_async_session),
|
session: AsyncSession = Depends(get_async_session),
|
||||||
user: User = Depends(current_active_user),
|
user: User = Depends(current_active_user),
|
||||||
dispatcher: TaskDispatcher = Depends(get_task_dispatcher),
|
dispatcher: TaskDispatcher = Depends(get_task_dispatcher),
|
||||||
|
|
@ -272,6 +273,7 @@ async def create_documents_file_upload(
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
user_id=str(user.id),
|
user_id=str(user.id),
|
||||||
should_summarize=should_summarize,
|
should_summarize=should_summarize,
|
||||||
|
use_vision_llm=use_vision_llm,
|
||||||
)
|
)
|
||||||
|
|
||||||
return {
|
return {
|
||||||
|
|
@ -1395,10 +1397,13 @@ class FolderMtimeCheckFile(PydanticBaseModel):
|
||||||
mtime: float
|
mtime: float
|
||||||
|
|
||||||
|
|
||||||
|
_MAX_MTIME_CHECK_FILES = 10_000
|
||||||
|
|
||||||
|
|
||||||
class FolderMtimeCheckRequest(PydanticBaseModel):
    """Request body for checking which watched-folder files changed on disk.

    The ``files`` list is capped via ``Field(max_length=...)`` so oversized
    payloads are rejected at validation time rather than processed.
    """

    folder_name: str
    search_space_id: int
    files: list[FolderMtimeCheckFile] = Field(max_length=_MAX_MTIME_CHECK_FILES)
|
||||||
|
|
||||||
|
|
||||||
class FolderUnlinkRequest(PydanticBaseModel):
|
class FolderUnlinkRequest(PydanticBaseModel):
|
||||||
|
|
@ -1487,6 +1492,7 @@ async def folder_upload(
|
||||||
relative_paths: str = Form(...),
|
relative_paths: str = Form(...),
|
||||||
root_folder_id: int | None = Form(None),
|
root_folder_id: int | None = Form(None),
|
||||||
enable_summary: bool = Form(False),
|
enable_summary: bool = Form(False),
|
||||||
|
use_vision_llm: bool = Form(False),
|
||||||
session: AsyncSession = Depends(get_async_session),
|
session: AsyncSession = Depends(get_async_session),
|
||||||
user: User = Depends(current_active_user),
|
user: User = Depends(current_active_user),
|
||||||
):
|
):
|
||||||
|
|
@ -1531,6 +1537,23 @@ async def folder_upload(
|
||||||
f"exceeds the {MAX_FILE_SIZE_BYTES // (1024 * 1024)} MB per-file limit.",
|
f"exceeds the {MAX_FILE_SIZE_BYTES // (1024 * 1024)} MB per-file limit.",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
from app.services.folder_service import MAX_FOLDER_DEPTH
|
||||||
|
|
||||||
|
max_subfolder_depth = max((p.count("/") for p in rel_paths if "/" in p), default=0)
|
||||||
|
if 1 + max_subfolder_depth > MAX_FOLDER_DEPTH:
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=400,
|
||||||
|
detail=f"Folder structure too deep: {1 + max_subfolder_depth} levels "
|
||||||
|
f"exceeds the maximum of {MAX_FOLDER_DEPTH}.",
|
||||||
|
)
|
||||||
|
|
||||||
|
if root_folder_id:
|
||||||
|
root_folder = await session.get(Folder, root_folder_id)
|
||||||
|
if not root_folder or root_folder.search_space_id != search_space_id:
|
||||||
|
raise HTTPException(
|
||||||
|
status_code=404, detail="Root folder not found in this search space"
|
||||||
|
)
|
||||||
|
|
||||||
if not root_folder_id:
|
if not root_folder_id:
|
||||||
watched_metadata = {
|
watched_metadata = {
|
||||||
"watched": True,
|
"watched": True,
|
||||||
|
|
@ -1565,7 +1588,8 @@ async def folder_upload(
|
||||||
|
|
||||||
async def _read_and_save(file: UploadFile, idx: int) -> dict:
|
async def _read_and_save(file: UploadFile, idx: int) -> dict:
|
||||||
content = await file.read()
|
content = await file.read()
|
||||||
filename = file.filename or rel_paths[idx].split("/")[-1]
|
raw_name = file.filename or rel_paths[idx]
|
||||||
|
filename = raw_name.split("/")[-1]
|
||||||
|
|
||||||
def _write_temp() -> str:
|
def _write_temp() -> str:
|
||||||
with tempfile.NamedTemporaryFile(
|
with tempfile.NamedTemporaryFile(
|
||||||
|
|
@ -1595,6 +1619,7 @@ async def folder_upload(
|
||||||
folder_name=folder_name,
|
folder_name=folder_name,
|
||||||
root_folder_id=root_folder_id,
|
root_folder_id=root_folder_id,
|
||||||
enable_summary=enable_summary,
|
enable_summary=enable_summary,
|
||||||
|
use_vision_llm=use_vision_llm,
|
||||||
file_mappings=list(file_mappings),
|
file_mappings=list(file_mappings),
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
||||||
61
surfsense_backend/app/routes/export_routes.py
Normal file
61
surfsense_backend/app/routes/export_routes.py
Normal file
|
|
@ -0,0 +1,61 @@
|
||||||
|
"""Routes for exporting knowledge base content as ZIP."""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
|
||||||
|
from fastapi import APIRouter, Depends, HTTPException, Query
|
||||||
|
from fastapi.responses import StreamingResponse
|
||||||
|
from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
|
|
||||||
|
from app.db import Permission, User, get_async_session
|
||||||
|
from app.services.export_service import build_export_zip
|
||||||
|
from app.users import current_active_user
|
||||||
|
from app.utils.rbac import check_permission
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
router = APIRouter()
|
||||||
|
|
||||||
|
|
||||||
|
@router.get("/search-spaces/{search_space_id}/export")
async def export_knowledge_base(
    search_space_id: int,
    folder_id: int | None = Query(None, description="Export only this folder's subtree"),
    session: AsyncSession = Depends(get_async_session),
    user: User = Depends(current_active_user),
):
    """Export documents as a ZIP of markdown files preserving folder structure.

    Requires DOCUMENTS_READ permission on the search space. The archive is
    built to a temp file by ``build_export_zip`` and streamed back in 8 KiB
    chunks; the temp file is removed once streaming ends (or aborts).
    """
    await check_permission(
        session,
        user,
        search_space_id,
        Permission.DOCUMENTS_READ.value,
        "You don't have permission to export documents in this search space",
    )

    try:
        result = await build_export_zip(session, search_space_id, folder_id)
    except ValueError as e:
        # build_export_zip raises ValueError for an unknown/foreign folder_id;
        # surface it as a 404 and drop the internal traceback chain.
        raise HTTPException(status_code=404, detail=str(e)) from None

    def stream_and_cleanup():
        # Generator so a large ZIP never needs to fit in memory; `finally`
        # guarantees the temp file is deleted even on client disconnect.
        try:
            with open(result.zip_path, "rb") as f:
                while chunk := f.read(8192):
                    yield chunk
        finally:
            os.unlink(result.zip_path)

    headers = {
        "Content-Disposition": f'attachment; filename="{result.export_name}.zip"',
        "Content-Length": str(result.zip_size),
    }

    # Let clients detect partial exports (documents with no resolvable content).
    if result.skipped_docs:
        headers["X-Skipped-Documents"] = str(len(result.skipped_docs))

    return StreamingResponse(
        stream_and_cleanup(),
        media_type="application/zip",
        headers=headers,
    )
|
||||||
|
|
@ -17,6 +17,7 @@ class SearchSourceConnectorBase(BaseModel):
|
||||||
last_indexed_at: datetime | None = None
|
last_indexed_at: datetime | None = None
|
||||||
config: dict[str, Any]
|
config: dict[str, Any]
|
||||||
enable_summary: bool = False
|
enable_summary: bool = False
|
||||||
|
enable_vision_llm: bool = False
|
||||||
periodic_indexing_enabled: bool = False
|
periodic_indexing_enabled: bool = False
|
||||||
indexing_frequency_minutes: int | None = None
|
indexing_frequency_minutes: int | None = None
|
||||||
next_scheduled_at: datetime | None = None
|
next_scheduled_at: datetime | None = None
|
||||||
|
|
@ -67,6 +68,7 @@ class SearchSourceConnectorUpdate(BaseModel):
|
||||||
last_indexed_at: datetime | None = None
|
last_indexed_at: datetime | None = None
|
||||||
config: dict[str, Any] | None = None
|
config: dict[str, Any] | None = None
|
||||||
enable_summary: bool | None = None
|
enable_summary: bool | None = None
|
||||||
|
enable_vision_llm: bool | None = None
|
||||||
periodic_indexing_enabled: bool | None = None
|
periodic_indexing_enabled: bool | None = None
|
||||||
indexing_frequency_minutes: int | None = None
|
indexing_frequency_minutes: int | None = None
|
||||||
next_scheduled_at: datetime | None = None
|
next_scheduled_at: datetime | None = None
|
||||||
|
|
|
||||||
200
surfsense_backend/app/services/export_service.py
Normal file
200
surfsense_backend/app/services/export_service.py
Normal file
|
|
@ -0,0 +1,200 @@
|
||||||
|
"""Service for exporting knowledge base content as a ZIP archive."""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import tempfile
|
||||||
|
import zipfile
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
|
||||||
|
from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
|
from sqlalchemy.future import select
|
||||||
|
|
||||||
|
from app.db import Chunk, Document, Folder
|
||||||
|
from app.services.folder_service import get_folder_subtree_ids
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def _sanitize_filename(title: str) -> str:
|
||||||
|
safe = "".join(c if c.isalnum() or c in " -_." else "_" for c in title).strip()
|
||||||
|
return safe[:80] or "document"
|
||||||
|
|
||||||
|
|
||||||
|
def _build_folder_path_map(folders: list[Folder]) -> dict[int, str]:
    """Build a mapping of folder_id -> full path string (e.g. 'Research/AI').

    Each path segment is sanitized with _sanitize_filename. A folder whose
    parent is absent from *folders* (e.g. outside an exported subtree) is
    treated as a root, so paths stay relative to the export scope.

    NOTE(review): resolve() walks the parent chain recursively with no cycle
    guard — assumes parent_id links are acyclic; a cycle would recurse
    forever. TODO confirm the folder table enforces this.
    """
    id_to_folder = {f.id: f for f in folders}
    # Memoized full paths so each ancestor chain is resolved only once.
    cache: dict[int, str] = {}

    def resolve(folder_id: int) -> str:
        if folder_id in cache:
            return cache[folder_id]
        folder = id_to_folder[folder_id]
        safe_name = _sanitize_filename(folder.name)
        if folder.parent_id is None or folder.parent_id not in id_to_folder:
            # Root (or parent outside scope): path is just this folder's name.
            cache[folder_id] = safe_name
        else:
            cache[folder_id] = f"{resolve(folder.parent_id)}/{safe_name}"
        return cache[folder_id]

    for f in folders:
        resolve(f.id)

    return cache
|
||||||
|
|
||||||
|
|
||||||
|
async def _get_document_markdown(
    session: AsyncSession, document: Document
) -> str | None:
    """Resolve markdown content using the 3-tier fallback:

    1. source_markdown 2. blocknote_document conversion 3. chunk concatenation

    Returns None when no tier yields content; callers treat such documents
    as skipped.
    """
    # Tier 1: the original markdown stored at ingestion time, if any.
    if document.source_markdown is not None:
        return document.source_markdown

    # Tier 2: convert the editor's BlockNote JSON back to markdown.
    if document.blocknote_document:
        from app.utils.blocknote_to_markdown import blocknote_to_markdown

        md = blocknote_to_markdown(document.blocknote_document)
        if md:
            return md

    # Tier 3: stitch the indexed chunks back together, ordered by chunk id
    # (presumably insertion order — confirm against the ingestion pipeline).
    chunk_result = await session.execute(
        select(Chunk.content)
        .filter(Chunk.document_id == document.id)
        .order_by(Chunk.id)
    )
    chunks = chunk_result.scalars().all()
    if chunks:
        return "\n\n".join(chunks)

    return None
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class ExportResult:
    """Outcome of a knowledge-base export build.

    The caller owns ``zip_path`` and is responsible for streaming the file
    and deleting it afterwards.
    """

    # Absolute path to the temporary ZIP file on disk.
    zip_path: str
    # Suggested base name for the download (folder name or 'knowledge-base').
    export_name: str
    # Size of the ZIP file in bytes.
    zip_size: int
    # Titles of documents skipped because they were still pending/processing.
    skipped_docs: list[str] = field(default_factory=list)
|
||||||
|
|
||||||
|
|
||||||
|
async def build_export_zip(
    session: AsyncSession,
    search_space_id: int,
    folder_id: int | None = None,
) -> ExportResult:
    """Build a ZIP archive of markdown documents preserving folder structure.

    Args:
        session: Async DB session used for all queries.
        search_space_id: Search space whose documents are exported.
        folder_id: Optional folder to restrict the export to; its whole
            subtree is included. None exports the entire search space.

    Returns:
        ExportResult with the path to the temp ZIP file. The caller is
        responsible for streaming and cleaning up the file.

    Raises:
        ValueError: If folder_id is provided but not found in this search
            space.
    """
    if folder_id is not None:
        folder = await session.get(Folder, folder_id)
        if not folder or folder.search_space_id != search_space_id:
            raise ValueError("Folder not found")
        target_folder_ids = set(await get_folder_subtree_ids(session, folder_id))
    else:
        target_folder_ids = None

    folder_query = select(Folder).where(Folder.search_space_id == search_space_id)
    if target_folder_ids is not None:
        folder_query = folder_query.where(Folder.id.in_(target_folder_ids))
    folder_result = await session.execute(folder_query)
    folders = list(folder_result.scalars().all())

    folder_path_map = _build_folder_path_map(folders)

    batch_size = 100  # documents loaded per DB round-trip

    base_doc_query = select(Document).where(Document.search_space_id == search_space_id)
    if target_folder_ids is not None:
        base_doc_query = base_doc_query.where(Document.folder_id.in_(target_folder_ids))
    base_doc_query = base_doc_query.order_by(Document.id)

    fd, tmp_path = tempfile.mkstemp(suffix=".zip")
    os.close(fd)

    # Counts how many times each archive path has been claimed, so duplicate
    # titles get unique "_2", "_3", ... suffixes.
    used_paths: dict[str, int] = {}
    skipped_docs: list[str] = []
    is_first_batch = True

    try:
        offset = 0
        while True:
            batch_result = await session.execute(
                base_doc_query.limit(batch_size).offset(offset)
            )
            documents = list(batch_result.scalars().all())
            if not documents:
                break

            entries: list[tuple[str, str]] = []

            for doc in documents:
                status = doc.status or {}
                state = (
                    status.get("state", "ready")
                    if isinstance(status, dict)
                    else "ready"
                )
                if state in ("pending", "processing"):
                    # No finished content yet; report instead of exporting.
                    skipped_docs.append(doc.title or "Untitled")
                    continue

                markdown = await _get_document_markdown(session, doc)
                if not markdown or not markdown.strip():
                    continue

                if doc.folder_id and doc.folder_id in folder_path_map:
                    dir_path = folder_path_map[doc.folder_id]
                else:
                    dir_path = ""

                base_name = _sanitize_filename(doc.title or "Untitled")
                file_path = (
                    f"{dir_path}/{base_name}.md" if dir_path else f"{base_name}.md"
                )
                file_path = _claim_unique_path(
                    used_paths, file_path, dir_path, base_name
                )

                entries.append((file_path, markdown))

            if entries:
                mode = "w" if is_first_batch else "a"
                batch_entries = entries

                def _write_batch(m: str = mode, e: list = batch_entries) -> None:
                    # Blocking zipfile I/O; runs in a worker thread.
                    with zipfile.ZipFile(tmp_path, m, zipfile.ZIP_DEFLATED) as zf:
                        for path, content in e:
                            zf.writestr(path, content)

                await asyncio.to_thread(_write_batch)
                is_first_batch = False

            offset += batch_size

        if is_first_batch:
            # Nothing was written: produce a valid (empty) ZIP archive
            # instead of returning the raw 0-byte temp file, which is not
            # a ZIP and would fail to open on the client.
            def _write_empty() -> None:
                with zipfile.ZipFile(tmp_path, "w", zipfile.ZIP_DEFLATED):
                    pass

            await asyncio.to_thread(_write_empty)

        export_name = "knowledge-base"
        if folder_id is not None and folder_id in folder_path_map:
            # The selected folder is a root of the queried subtree, so its
            # mapped path's first segment is its own sanitized name.
            export_name = _sanitize_filename(folder_path_map[folder_id].split("/")[0])

        return ExportResult(
            zip_path=tmp_path,
            export_name=export_name,
            zip_size=os.path.getsize(tmp_path),
            skipped_docs=skipped_docs,
        )

    except Exception:
        # Never leak the temp file on failure.
        if os.path.exists(tmp_path):
            os.unlink(tmp_path)
        raise


def _claim_unique_path(
    used_paths: dict[str, int],
    file_path: str,
    dir_path: str,
    base_name: str,
) -> str:
    """Reserve *file_path* in *used_paths*, suffixing '_N' on duplicates.

    Unlike a single-step rename, the suffixed candidate is itself checked
    for availability, so a real document titled e.g. 'x_2' can never
    collide with the auto-renamed second copy of 'x' (which would produce
    duplicate entry names inside the ZIP).
    """
    count = used_paths.get(file_path, 0)
    used_paths[file_path] = count + 1
    if count == 0:
        return file_path
    suffix = count + 1
    while True:
        candidate = (
            f"{dir_path}/{base_name}_{suffix}.md"
            if dir_path
            else f"{base_name}_{suffix}.md"
        )
        if candidate not in used_paths:
            used_paths[candidate] = 1
            return candidate
        suffix += 1
|
||||||
|
|
@ -19,6 +19,7 @@ class TaskDispatcher(Protocol):
|
||||||
search_space_id: int,
|
search_space_id: int,
|
||||||
user_id: str,
|
user_id: str,
|
||||||
should_summarize: bool = False,
|
should_summarize: bool = False,
|
||||||
|
use_vision_llm: bool = False,
|
||||||
) -> None: ...
|
) -> None: ...
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -34,6 +35,7 @@ class CeleryTaskDispatcher:
|
||||||
search_space_id: int,
|
search_space_id: int,
|
||||||
user_id: str,
|
user_id: str,
|
||||||
should_summarize: bool = False,
|
should_summarize: bool = False,
|
||||||
|
use_vision_llm: bool = False,
|
||||||
) -> None:
|
) -> None:
|
||||||
from app.tasks.celery_tasks.document_tasks import (
|
from app.tasks.celery_tasks.document_tasks import (
|
||||||
process_file_upload_with_document_task,
|
process_file_upload_with_document_task,
|
||||||
|
|
@ -46,6 +48,7 @@ class CeleryTaskDispatcher:
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
should_summarize=should_summarize,
|
should_summarize=should_summarize,
|
||||||
|
use_vision_llm=use_vision_llm,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -778,6 +778,7 @@ def process_file_upload_with_document_task(
|
||||||
search_space_id: int,
|
search_space_id: int,
|
||||||
user_id: str,
|
user_id: str,
|
||||||
should_summarize: bool = False,
|
should_summarize: bool = False,
|
||||||
|
use_vision_llm: bool = False,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Celery task to process uploaded file with existing pending document.
|
Celery task to process uploaded file with existing pending document.
|
||||||
|
|
@ -833,6 +834,7 @@ def process_file_upload_with_document_task(
|
||||||
search_space_id,
|
search_space_id,
|
||||||
user_id,
|
user_id,
|
||||||
should_summarize=should_summarize,
|
should_summarize=should_summarize,
|
||||||
|
use_vision_llm=use_vision_llm,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
logger.info(
|
logger.info(
|
||||||
|
|
@ -869,6 +871,7 @@ async def _process_file_with_document(
|
||||||
search_space_id: int,
|
search_space_id: int,
|
||||||
user_id: str,
|
user_id: str,
|
||||||
should_summarize: bool = False,
|
should_summarize: bool = False,
|
||||||
|
use_vision_llm: bool = False,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
Process file and update existing pending document status.
|
Process file and update existing pending document status.
|
||||||
|
|
@ -971,6 +974,7 @@ async def _process_file_with_document(
|
||||||
log_entry=log_entry,
|
log_entry=log_entry,
|
||||||
notification=notification,
|
notification=notification,
|
||||||
should_summarize=should_summarize,
|
should_summarize=should_summarize,
|
||||||
|
use_vision_llm=use_vision_llm,
|
||||||
)
|
)
|
||||||
|
|
||||||
# Update notification on success
|
# Update notification on success
|
||||||
|
|
@ -1428,6 +1432,7 @@ def index_uploaded_folder_files_task(
|
||||||
root_folder_id: int,
|
root_folder_id: int,
|
||||||
enable_summary: bool,
|
enable_summary: bool,
|
||||||
file_mappings: list[dict],
|
file_mappings: list[dict],
|
||||||
|
use_vision_llm: bool = False,
|
||||||
):
|
):
|
||||||
"""Celery task to index files uploaded from the desktop app."""
|
"""Celery task to index files uploaded from the desktop app."""
|
||||||
loop = asyncio.new_event_loop()
|
loop = asyncio.new_event_loop()
|
||||||
|
|
@ -1441,6 +1446,7 @@ def index_uploaded_folder_files_task(
|
||||||
root_folder_id=root_folder_id,
|
root_folder_id=root_folder_id,
|
||||||
enable_summary=enable_summary,
|
enable_summary=enable_summary,
|
||||||
file_mappings=file_mappings,
|
file_mappings=file_mappings,
|
||||||
|
use_vision_llm=use_vision_llm,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
finally:
|
finally:
|
||||||
|
|
@ -1454,6 +1460,7 @@ async def _index_uploaded_folder_files_async(
|
||||||
root_folder_id: int,
|
root_folder_id: int,
|
||||||
enable_summary: bool,
|
enable_summary: bool,
|
||||||
file_mappings: list[dict],
|
file_mappings: list[dict],
|
||||||
|
use_vision_llm: bool = False,
|
||||||
):
|
):
|
||||||
"""Run upload-based folder indexing with notification + heartbeat."""
|
"""Run upload-based folder indexing with notification + heartbeat."""
|
||||||
file_count = len(file_mappings)
|
file_count = len(file_mappings)
|
||||||
|
|
@ -1503,6 +1510,7 @@ async def _index_uploaded_folder_files_async(
|
||||||
enable_summary=enable_summary,
|
enable_summary=enable_summary,
|
||||||
file_mappings=file_mappings,
|
file_mappings=file_mappings,
|
||||||
on_heartbeat_callback=_heartbeat_progress,
|
on_heartbeat_callback=_heartbeat_progress,
|
||||||
|
use_vision_llm=use_vision_llm,
|
||||||
)
|
)
|
||||||
|
|
||||||
if notification:
|
if notification:
|
||||||
|
|
|
||||||
|
|
@ -164,6 +164,7 @@ async def _download_files_parallel(
|
||||||
enable_summary: bool,
|
enable_summary: bool,
|
||||||
max_concurrency: int = 3,
|
max_concurrency: int = 3,
|
||||||
on_heartbeat: HeartbeatCallbackType | None = None,
|
on_heartbeat: HeartbeatCallbackType | None = None,
|
||||||
|
vision_llm=None,
|
||||||
) -> tuple[list[ConnectorDocument], int]:
|
) -> tuple[list[ConnectorDocument], int]:
|
||||||
"""Download and ETL files in parallel. Returns (docs, failed_count)."""
|
"""Download and ETL files in parallel. Returns (docs, failed_count)."""
|
||||||
results: list[ConnectorDocument] = []
|
results: list[ConnectorDocument] = []
|
||||||
|
|
@ -176,7 +177,7 @@ async def _download_files_parallel(
|
||||||
nonlocal last_heartbeat, completed_count
|
nonlocal last_heartbeat, completed_count
|
||||||
async with sem:
|
async with sem:
|
||||||
markdown, db_metadata, error = await download_and_extract_content(
|
markdown, db_metadata, error = await download_and_extract_content(
|
||||||
dropbox_client, file
|
dropbox_client, file, vision_llm=vision_llm
|
||||||
)
|
)
|
||||||
if error or not markdown:
|
if error or not markdown:
|
||||||
file_name = file.get("name", "Unknown")
|
file_name = file.get("name", "Unknown")
|
||||||
|
|
@ -224,6 +225,7 @@ async def _download_and_index(
|
||||||
user_id: str,
|
user_id: str,
|
||||||
enable_summary: bool,
|
enable_summary: bool,
|
||||||
on_heartbeat: HeartbeatCallbackType | None = None,
|
on_heartbeat: HeartbeatCallbackType | None = None,
|
||||||
|
vision_llm=None,
|
||||||
) -> tuple[int, int]:
|
) -> tuple[int, int]:
|
||||||
"""Parallel download then parallel indexing. Returns (batch_indexed, total_failed)."""
|
"""Parallel download then parallel indexing. Returns (batch_indexed, total_failed)."""
|
||||||
connector_docs, download_failed = await _download_files_parallel(
|
connector_docs, download_failed = await _download_files_parallel(
|
||||||
|
|
@ -234,6 +236,7 @@ async def _download_and_index(
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
enable_summary=enable_summary,
|
enable_summary=enable_summary,
|
||||||
on_heartbeat=on_heartbeat,
|
on_heartbeat=on_heartbeat,
|
||||||
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
|
|
||||||
batch_indexed = 0
|
batch_indexed = 0
|
||||||
|
|
@ -287,6 +290,7 @@ async def _index_with_delta_sync(
|
||||||
max_files: int,
|
max_files: int,
|
||||||
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
||||||
enable_summary: bool = True,
|
enable_summary: bool = True,
|
||||||
|
vision_llm=None,
|
||||||
) -> tuple[int, int, int, str]:
|
) -> tuple[int, int, int, str]:
|
||||||
"""Delta sync using Dropbox cursor-based change tracking.
|
"""Delta sync using Dropbox cursor-based change tracking.
|
||||||
|
|
||||||
|
|
@ -359,6 +363,7 @@ async def _index_with_delta_sync(
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
enable_summary=enable_summary,
|
enable_summary=enable_summary,
|
||||||
on_heartbeat=on_heartbeat_callback,
|
on_heartbeat=on_heartbeat_callback,
|
||||||
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
|
|
||||||
indexed = renamed_count + batch_indexed
|
indexed = renamed_count + batch_indexed
|
||||||
|
|
@ -384,6 +389,7 @@ async def _index_full_scan(
|
||||||
incremental_sync: bool = True,
|
incremental_sync: bool = True,
|
||||||
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
||||||
enable_summary: bool = True,
|
enable_summary: bool = True,
|
||||||
|
vision_llm=None,
|
||||||
) -> tuple[int, int, int]:
|
) -> tuple[int, int, int]:
|
||||||
"""Full scan indexing of a folder.
|
"""Full scan indexing of a folder.
|
||||||
|
|
||||||
|
|
@ -469,6 +475,7 @@ async def _index_full_scan(
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
enable_summary=enable_summary,
|
enable_summary=enable_summary,
|
||||||
on_heartbeat=on_heartbeat_callback,
|
on_heartbeat=on_heartbeat_callback,
|
||||||
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
|
|
||||||
if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
|
if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
|
||||||
|
|
@ -498,6 +505,7 @@ async def _index_selected_files(
|
||||||
enable_summary: bool,
|
enable_summary: bool,
|
||||||
incremental_sync: bool = True,
|
incremental_sync: bool = True,
|
||||||
on_heartbeat: HeartbeatCallbackType | None = None,
|
on_heartbeat: HeartbeatCallbackType | None = None,
|
||||||
|
vision_llm=None,
|
||||||
) -> tuple[int, int, int, list[str]]:
|
) -> tuple[int, int, int, list[str]]:
|
||||||
"""Index user-selected files using the parallel pipeline."""
|
"""Index user-selected files using the parallel pipeline."""
|
||||||
page_limit_service = PageLimitService(session)
|
page_limit_service = PageLimitService(session)
|
||||||
|
|
@ -557,6 +565,7 @@ async def _index_selected_files(
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
enable_summary=enable_summary,
|
enable_summary=enable_summary,
|
||||||
on_heartbeat=on_heartbeat,
|
on_heartbeat=on_heartbeat,
|
||||||
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
|
|
||||||
if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
|
if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
|
||||||
|
|
@ -621,6 +630,13 @@ async def index_dropbox_files(
|
||||||
return 0, 0, error_msg, 0
|
return 0, 0, error_msg, 0
|
||||||
|
|
||||||
connector_enable_summary = getattr(connector, "enable_summary", True)
|
connector_enable_summary = getattr(connector, "enable_summary", True)
|
||||||
|
connector_enable_vision_llm = getattr(connector, "enable_vision_llm", False)
|
||||||
|
vision_llm = None
|
||||||
|
if connector_enable_vision_llm:
|
||||||
|
from app.services.llm_service import get_vision_llm
|
||||||
|
|
||||||
|
vision_llm = await get_vision_llm(session, search_space_id)
|
||||||
|
|
||||||
dropbox_client = DropboxClient(session, connector_id)
|
dropbox_client = DropboxClient(session, connector_id)
|
||||||
|
|
||||||
indexing_options = items_dict.get("indexing_options", {})
|
indexing_options = items_dict.get("indexing_options", {})
|
||||||
|
|
@ -650,6 +666,7 @@ async def index_dropbox_files(
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
enable_summary=connector_enable_summary,
|
enable_summary=connector_enable_summary,
|
||||||
incremental_sync=incremental_sync,
|
incremental_sync=incremental_sync,
|
||||||
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
total_indexed += indexed
|
total_indexed += indexed
|
||||||
total_skipped += skipped
|
total_skipped += skipped
|
||||||
|
|
@ -684,6 +701,7 @@ async def index_dropbox_files(
|
||||||
log_entry,
|
log_entry,
|
||||||
max_files,
|
max_files,
|
||||||
enable_summary=connector_enable_summary,
|
enable_summary=connector_enable_summary,
|
||||||
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
folder_cursors[folder_path] = new_cursor
|
folder_cursors[folder_path] = new_cursor
|
||||||
total_unsupported += unsup
|
total_unsupported += unsup
|
||||||
|
|
@ -703,6 +721,7 @@ async def index_dropbox_files(
|
||||||
include_subfolders,
|
include_subfolders,
|
||||||
incremental_sync=incremental_sync,
|
incremental_sync=incremental_sync,
|
||||||
enable_summary=connector_enable_summary,
|
enable_summary=connector_enable_summary,
|
||||||
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
total_unsupported += unsup
|
total_unsupported += unsup
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -261,6 +261,7 @@ async def _download_files_parallel(
|
||||||
enable_summary: bool,
|
enable_summary: bool,
|
||||||
max_concurrency: int = 3,
|
max_concurrency: int = 3,
|
||||||
on_heartbeat: HeartbeatCallbackType | None = None,
|
on_heartbeat: HeartbeatCallbackType | None = None,
|
||||||
|
vision_llm=None,
|
||||||
) -> tuple[list[ConnectorDocument], int]:
|
) -> tuple[list[ConnectorDocument], int]:
|
||||||
"""Download and ETL files in parallel, returning ConnectorDocuments.
|
"""Download and ETL files in parallel, returning ConnectorDocuments.
|
||||||
|
|
||||||
|
|
@ -276,7 +277,7 @@ async def _download_files_parallel(
|
||||||
nonlocal last_heartbeat, completed_count
|
nonlocal last_heartbeat, completed_count
|
||||||
async with sem:
|
async with sem:
|
||||||
markdown, drive_metadata, error = await download_and_extract_content(
|
markdown, drive_metadata, error = await download_and_extract_content(
|
||||||
drive_client, file
|
drive_client, file, vision_llm=vision_llm
|
||||||
)
|
)
|
||||||
if error or not markdown:
|
if error or not markdown:
|
||||||
file_name = file.get("name", "Unknown")
|
file_name = file.get("name", "Unknown")
|
||||||
|
|
@ -322,6 +323,7 @@ async def _process_single_file(
|
||||||
search_space_id: int,
|
search_space_id: int,
|
||||||
user_id: str,
|
user_id: str,
|
||||||
enable_summary: bool = True,
|
enable_summary: bool = True,
|
||||||
|
vision_llm=None,
|
||||||
) -> tuple[int, int, int]:
|
) -> tuple[int, int, int]:
|
||||||
"""Download, extract, and index a single Drive file via the pipeline.
|
"""Download, extract, and index a single Drive file via the pipeline.
|
||||||
|
|
||||||
|
|
@ -343,7 +345,7 @@ async def _process_single_file(
|
||||||
await page_limit_service.check_page_limit(user_id, estimated_pages)
|
await page_limit_service.check_page_limit(user_id, estimated_pages)
|
||||||
|
|
||||||
markdown, drive_metadata, error = await download_and_extract_content(
|
markdown, drive_metadata, error = await download_and_extract_content(
|
||||||
drive_client, file
|
drive_client, file, vision_llm=vision_llm
|
||||||
)
|
)
|
||||||
if error or not markdown:
|
if error or not markdown:
|
||||||
logger.warning(f"ETL failed for {file_name}: {error}")
|
logger.warning(f"ETL failed for {file_name}: {error}")
|
||||||
|
|
@ -433,6 +435,7 @@ async def _download_and_index(
|
||||||
user_id: str,
|
user_id: str,
|
||||||
enable_summary: bool,
|
enable_summary: bool,
|
||||||
on_heartbeat: HeartbeatCallbackType | None = None,
|
on_heartbeat: HeartbeatCallbackType | None = None,
|
||||||
|
vision_llm=None,
|
||||||
) -> tuple[int, int]:
|
) -> tuple[int, int]:
|
||||||
"""Phase 2+3: parallel download then parallel indexing.
|
"""Phase 2+3: parallel download then parallel indexing.
|
||||||
|
|
||||||
|
|
@ -446,6 +449,7 @@ async def _download_and_index(
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
enable_summary=enable_summary,
|
enable_summary=enable_summary,
|
||||||
on_heartbeat=on_heartbeat,
|
on_heartbeat=on_heartbeat,
|
||||||
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
|
|
||||||
batch_indexed = 0
|
batch_indexed = 0
|
||||||
|
|
@ -476,6 +480,7 @@ async def _index_selected_files(
|
||||||
user_id: str,
|
user_id: str,
|
||||||
enable_summary: bool,
|
enable_summary: bool,
|
||||||
on_heartbeat: HeartbeatCallbackType | None = None,
|
on_heartbeat: HeartbeatCallbackType | None = None,
|
||||||
|
vision_llm=None,
|
||||||
) -> tuple[int, int, int, list[str]]:
|
) -> tuple[int, int, int, list[str]]:
|
||||||
"""Index user-selected files using the parallel pipeline.
|
"""Index user-selected files using the parallel pipeline.
|
||||||
|
|
||||||
|
|
@ -540,6 +545,7 @@ async def _index_selected_files(
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
enable_summary=enable_summary,
|
enable_summary=enable_summary,
|
||||||
on_heartbeat=on_heartbeat,
|
on_heartbeat=on_heartbeat,
|
||||||
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
|
|
||||||
if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
|
if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
|
||||||
|
|
@ -573,6 +579,7 @@ async def _index_full_scan(
|
||||||
include_subfolders: bool = False,
|
include_subfolders: bool = False,
|
||||||
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
||||||
enable_summary: bool = True,
|
enable_summary: bool = True,
|
||||||
|
vision_llm=None,
|
||||||
) -> tuple[int, int, int]:
|
) -> tuple[int, int, int]:
|
||||||
"""Full scan indexing of a folder.
|
"""Full scan indexing of a folder.
|
||||||
|
|
||||||
|
|
@ -703,6 +710,7 @@ async def _index_full_scan(
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
enable_summary=enable_summary,
|
enable_summary=enable_summary,
|
||||||
on_heartbeat=on_heartbeat_callback,
|
on_heartbeat=on_heartbeat_callback,
|
||||||
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
|
|
||||||
if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
|
if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
|
||||||
|
|
@ -736,6 +744,7 @@ async def _index_with_delta_sync(
|
||||||
include_subfolders: bool = False,
|
include_subfolders: bool = False,
|
||||||
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
||||||
enable_summary: bool = True,
|
enable_summary: bool = True,
|
||||||
|
vision_llm=None,
|
||||||
) -> tuple[int, int, int]:
|
) -> tuple[int, int, int]:
|
||||||
"""Delta sync using change tracking.
|
"""Delta sync using change tracking.
|
||||||
|
|
||||||
|
|
@ -844,6 +853,7 @@ async def _index_with_delta_sync(
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
enable_summary=enable_summary,
|
enable_summary=enable_summary,
|
||||||
on_heartbeat=on_heartbeat_callback,
|
on_heartbeat=on_heartbeat_callback,
|
||||||
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
|
|
||||||
if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
|
if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
|
||||||
|
|
@ -947,6 +957,11 @@ async def index_google_drive_files(
|
||||||
)
|
)
|
||||||
|
|
||||||
connector_enable_summary = getattr(connector, "enable_summary", True)
|
connector_enable_summary = getattr(connector, "enable_summary", True)
|
||||||
|
connector_enable_vision_llm = getattr(connector, "enable_vision_llm", False)
|
||||||
|
vision_llm = None
|
||||||
|
if connector_enable_vision_llm:
|
||||||
|
from app.services.llm_service import get_vision_llm
|
||||||
|
vision_llm = await get_vision_llm(session, search_space_id)
|
||||||
drive_client = GoogleDriveClient(
|
drive_client = GoogleDriveClient(
|
||||||
session, connector_id, credentials=pre_built_credentials
|
session, connector_id, credentials=pre_built_credentials
|
||||||
)
|
)
|
||||||
|
|
@ -986,6 +1001,7 @@ async def index_google_drive_files(
|
||||||
include_subfolders,
|
include_subfolders,
|
||||||
on_heartbeat_callback,
|
on_heartbeat_callback,
|
||||||
connector_enable_summary,
|
connector_enable_summary,
|
||||||
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
documents_unsupported += du
|
documents_unsupported += du
|
||||||
logger.info("Running reconciliation scan after delta sync")
|
logger.info("Running reconciliation scan after delta sync")
|
||||||
|
|
@ -1004,6 +1020,7 @@ async def index_google_drive_files(
|
||||||
include_subfolders,
|
include_subfolders,
|
||||||
on_heartbeat_callback,
|
on_heartbeat_callback,
|
||||||
connector_enable_summary,
|
connector_enable_summary,
|
||||||
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
documents_indexed += ri
|
documents_indexed += ri
|
||||||
documents_skipped += rs
|
documents_skipped += rs
|
||||||
|
|
@ -1029,6 +1046,7 @@ async def index_google_drive_files(
|
||||||
include_subfolders,
|
include_subfolders,
|
||||||
on_heartbeat_callback,
|
on_heartbeat_callback,
|
||||||
connector_enable_summary,
|
connector_enable_summary,
|
||||||
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
|
|
||||||
if documents_indexed > 0 or can_use_delta:
|
if documents_indexed > 0 or can_use_delta:
|
||||||
|
|
@ -1146,6 +1164,11 @@ async def index_google_drive_single_file(
|
||||||
)
|
)
|
||||||
|
|
||||||
connector_enable_summary = getattr(connector, "enable_summary", True)
|
connector_enable_summary = getattr(connector, "enable_summary", True)
|
||||||
|
connector_enable_vision_llm = getattr(connector, "enable_vision_llm", False)
|
||||||
|
vision_llm = None
|
||||||
|
if connector_enable_vision_llm:
|
||||||
|
from app.services.llm_service import get_vision_llm
|
||||||
|
vision_llm = await get_vision_llm(session, search_space_id)
|
||||||
drive_client = GoogleDriveClient(
|
drive_client = GoogleDriveClient(
|
||||||
session, connector_id, credentials=pre_built_credentials
|
session, connector_id, credentials=pre_built_credentials
|
||||||
)
|
)
|
||||||
|
|
@ -1168,6 +1191,7 @@ async def index_google_drive_single_file(
|
||||||
search_space_id,
|
search_space_id,
|
||||||
user_id,
|
user_id,
|
||||||
connector_enable_summary,
|
connector_enable_summary,
|
||||||
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
await session.commit()
|
await session.commit()
|
||||||
|
|
||||||
|
|
@ -1278,6 +1302,11 @@ async def index_google_drive_selected_files(
|
||||||
return 0, 0, [error_msg]
|
return 0, 0, [error_msg]
|
||||||
|
|
||||||
connector_enable_summary = getattr(connector, "enable_summary", True)
|
connector_enable_summary = getattr(connector, "enable_summary", True)
|
||||||
|
connector_enable_vision_llm = getattr(connector, "enable_vision_llm", False)
|
||||||
|
vision_llm = None
|
||||||
|
if connector_enable_vision_llm:
|
||||||
|
from app.services.llm_service import get_vision_llm
|
||||||
|
vision_llm = await get_vision_llm(session, search_space_id)
|
||||||
drive_client = GoogleDriveClient(
|
drive_client = GoogleDriveClient(
|
||||||
session, connector_id, credentials=pre_built_credentials
|
session, connector_id, credentials=pre_built_credentials
|
||||||
)
|
)
|
||||||
|
|
@ -1291,6 +1320,7 @@ async def index_google_drive_selected_files(
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
enable_summary=connector_enable_summary,
|
enable_summary=connector_enable_summary,
|
||||||
on_heartbeat=on_heartbeat_callback,
|
on_heartbeat=on_heartbeat_callback,
|
||||||
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
|
|
||||||
if unsupported > 0:
|
if unsupported > 0:
|
||||||
|
|
|
||||||
|
|
@ -153,16 +153,16 @@ def scan_folder(
|
||||||
return files
|
return files
|
||||||
|
|
||||||
|
|
||||||
async def _read_file_content(file_path: str, filename: str) -> str:
|
async def _read_file_content(file_path: str, filename: str, *, vision_llm=None) -> str:
|
||||||
"""Read file content via the unified ETL pipeline.
|
"""Read file content via the unified ETL pipeline.
|
||||||
|
|
||||||
All file types (plaintext, audio, direct-convert, document) are handled
|
All file types (plaintext, audio, direct-convert, document, image) are
|
||||||
by ``EtlPipelineService``.
|
handled by ``EtlPipelineService``.
|
||||||
"""
|
"""
|
||||||
from app.etl_pipeline.etl_document import EtlRequest
|
from app.etl_pipeline.etl_document import EtlRequest
|
||||||
from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
|
from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
|
||||||
|
|
||||||
result = await EtlPipelineService().extract(
|
result = await EtlPipelineService(vision_llm=vision_llm).extract(
|
||||||
EtlRequest(file_path=file_path, filename=filename)
|
EtlRequest(file_path=file_path, filename=filename)
|
||||||
)
|
)
|
||||||
return result.markdown_content
|
return result.markdown_content
|
||||||
|
|
@ -199,12 +199,14 @@ async def _compute_file_content_hash(
|
||||||
file_path: str,
|
file_path: str,
|
||||||
filename: str,
|
filename: str,
|
||||||
search_space_id: int,
|
search_space_id: int,
|
||||||
|
*,
|
||||||
|
vision_llm=None,
|
||||||
) -> tuple[str, str]:
|
) -> tuple[str, str]:
|
||||||
"""Read a file (via ETL if needed) and compute its content hash.
|
"""Read a file (via ETL if needed) and compute its content hash.
|
||||||
|
|
||||||
Returns (content_text, content_hash).
|
Returns (content_text, content_hash).
|
||||||
"""
|
"""
|
||||||
content = await _read_file_content(file_path, filename)
|
content = await _read_file_content(file_path, filename, vision_llm=vision_llm)
|
||||||
return content, _content_hash(content, search_space_id)
|
return content, _content_hash(content, search_space_id)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -704,7 +706,9 @@ async def index_local_folder(
|
||||||
|
|
||||||
try:
|
try:
|
||||||
content, content_hash = await _compute_file_content_hash(
|
content, content_hash = await _compute_file_content_hash(
|
||||||
file_path_abs, file_info["relative_path"], search_space_id
|
file_path_abs,
|
||||||
|
file_info["relative_path"],
|
||||||
|
search_space_id,
|
||||||
)
|
)
|
||||||
except Exception as read_err:
|
except Exception as read_err:
|
||||||
logger.warning(f"Could not read {file_path_abs}: {read_err}")
|
logger.warning(f"Could not read {file_path_abs}: {read_err}")
|
||||||
|
|
@ -738,7 +742,9 @@ async def index_local_folder(
|
||||||
|
|
||||||
try:
|
try:
|
||||||
content, content_hash = await _compute_file_content_hash(
|
content, content_hash = await _compute_file_content_hash(
|
||||||
file_path_abs, file_info["relative_path"], search_space_id
|
file_path_abs,
|
||||||
|
file_info["relative_path"],
|
||||||
|
search_space_id,
|
||||||
)
|
)
|
||||||
except Exception as read_err:
|
except Exception as read_err:
|
||||||
logger.warning(f"Could not read {file_path_abs}: {read_err}")
|
logger.warning(f"Could not read {file_path_abs}: {read_err}")
|
||||||
|
|
@ -1264,6 +1270,7 @@ async def index_uploaded_files(
|
||||||
enable_summary: bool,
|
enable_summary: bool,
|
||||||
file_mappings: list[dict],
|
file_mappings: list[dict],
|
||||||
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
||||||
|
use_vision_llm: bool = False,
|
||||||
) -> tuple[int, int, str | None]:
|
) -> tuple[int, int, str | None]:
|
||||||
"""Index files uploaded from the desktop app via temp paths.
|
"""Index files uploaded from the desktop app via temp paths.
|
||||||
|
|
||||||
|
|
@ -1300,6 +1307,12 @@ async def index_uploaded_files(
|
||||||
pipeline = IndexingPipelineService(session)
|
pipeline = IndexingPipelineService(session)
|
||||||
llm = await get_user_long_context_llm(session, user_id, search_space_id)
|
llm = await get_user_long_context_llm(session, user_id, search_space_id)
|
||||||
|
|
||||||
|
vision_llm_instance = None
|
||||||
|
if use_vision_llm:
|
||||||
|
from app.services.llm_service import get_vision_llm
|
||||||
|
|
||||||
|
vision_llm_instance = await get_vision_llm(session, search_space_id)
|
||||||
|
|
||||||
indexed_count = 0
|
indexed_count = 0
|
||||||
failed_count = 0
|
failed_count = 0
|
||||||
errors: list[str] = []
|
errors: list[str] = []
|
||||||
|
|
@ -1347,7 +1360,8 @@ async def index_uploaded_files(
|
||||||
|
|
||||||
try:
|
try:
|
||||||
content, content_hash = await _compute_file_content_hash(
|
content, content_hash = await _compute_file_content_hash(
|
||||||
temp_path, filename, search_space_id
|
temp_path, filename, search_space_id,
|
||||||
|
vision_llm=vision_llm_instance,
|
||||||
)
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f"Could not read {relative_path}: {e}")
|
logger.warning(f"Could not read {relative_path}: {e}")
|
||||||
|
|
|
||||||
|
|
@ -171,6 +171,7 @@ async def _download_files_parallel(
|
||||||
enable_summary: bool,
|
enable_summary: bool,
|
||||||
max_concurrency: int = 3,
|
max_concurrency: int = 3,
|
||||||
on_heartbeat: HeartbeatCallbackType | None = None,
|
on_heartbeat: HeartbeatCallbackType | None = None,
|
||||||
|
vision_llm=None,
|
||||||
) -> tuple[list[ConnectorDocument], int]:
|
) -> tuple[list[ConnectorDocument], int]:
|
||||||
"""Download and ETL files in parallel. Returns (docs, failed_count)."""
|
"""Download and ETL files in parallel. Returns (docs, failed_count)."""
|
||||||
results: list[ConnectorDocument] = []
|
results: list[ConnectorDocument] = []
|
||||||
|
|
@ -183,7 +184,7 @@ async def _download_files_parallel(
|
||||||
nonlocal last_heartbeat, completed_count
|
nonlocal last_heartbeat, completed_count
|
||||||
async with sem:
|
async with sem:
|
||||||
markdown, od_metadata, error = await download_and_extract_content(
|
markdown, od_metadata, error = await download_and_extract_content(
|
||||||
onedrive_client, file
|
onedrive_client, file, vision_llm=vision_llm
|
||||||
)
|
)
|
||||||
if error or not markdown:
|
if error or not markdown:
|
||||||
file_name = file.get("name", "Unknown")
|
file_name = file.get("name", "Unknown")
|
||||||
|
|
@ -231,6 +232,7 @@ async def _download_and_index(
|
||||||
user_id: str,
|
user_id: str,
|
||||||
enable_summary: bool,
|
enable_summary: bool,
|
||||||
on_heartbeat: HeartbeatCallbackType | None = None,
|
on_heartbeat: HeartbeatCallbackType | None = None,
|
||||||
|
vision_llm=None,
|
||||||
) -> tuple[int, int]:
|
) -> tuple[int, int]:
|
||||||
"""Parallel download then parallel indexing. Returns (batch_indexed, total_failed)."""
|
"""Parallel download then parallel indexing. Returns (batch_indexed, total_failed)."""
|
||||||
connector_docs, download_failed = await _download_files_parallel(
|
connector_docs, download_failed = await _download_files_parallel(
|
||||||
|
|
@ -241,6 +243,7 @@ async def _download_and_index(
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
enable_summary=enable_summary,
|
enable_summary=enable_summary,
|
||||||
on_heartbeat=on_heartbeat,
|
on_heartbeat=on_heartbeat,
|
||||||
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
|
|
||||||
batch_indexed = 0
|
batch_indexed = 0
|
||||||
|
|
@ -293,6 +296,7 @@ async def _index_selected_files(
|
||||||
user_id: str,
|
user_id: str,
|
||||||
enable_summary: bool,
|
enable_summary: bool,
|
||||||
on_heartbeat: HeartbeatCallbackType | None = None,
|
on_heartbeat: HeartbeatCallbackType | None = None,
|
||||||
|
vision_llm=None,
|
||||||
) -> tuple[int, int, int, list[str]]:
|
) -> tuple[int, int, int, list[str]]:
|
||||||
"""Index user-selected files using the parallel pipeline."""
|
"""Index user-selected files using the parallel pipeline."""
|
||||||
page_limit_service = PageLimitService(session)
|
page_limit_service = PageLimitService(session)
|
||||||
|
|
@ -343,6 +347,7 @@ async def _index_selected_files(
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
enable_summary=enable_summary,
|
enable_summary=enable_summary,
|
||||||
on_heartbeat=on_heartbeat,
|
on_heartbeat=on_heartbeat,
|
||||||
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
|
|
||||||
if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
|
if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
|
||||||
|
|
@ -375,6 +380,7 @@ async def _index_full_scan(
|
||||||
include_subfolders: bool = True,
|
include_subfolders: bool = True,
|
||||||
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
||||||
enable_summary: bool = True,
|
enable_summary: bool = True,
|
||||||
|
vision_llm=None,
|
||||||
) -> tuple[int, int, int]:
|
) -> tuple[int, int, int]:
|
||||||
"""Full scan indexing of a folder.
|
"""Full scan indexing of a folder.
|
||||||
|
|
||||||
|
|
@ -450,6 +456,7 @@ async def _index_full_scan(
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
enable_summary=enable_summary,
|
enable_summary=enable_summary,
|
||||||
on_heartbeat=on_heartbeat_callback,
|
on_heartbeat=on_heartbeat_callback,
|
||||||
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
|
|
||||||
if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
|
if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
|
||||||
|
|
@ -481,6 +488,7 @@ async def _index_with_delta_sync(
|
||||||
max_files: int,
|
max_files: int,
|
||||||
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
on_heartbeat_callback: HeartbeatCallbackType | None = None,
|
||||||
enable_summary: bool = True,
|
enable_summary: bool = True,
|
||||||
|
vision_llm=None,
|
||||||
) -> tuple[int, int, int, str | None]:
|
) -> tuple[int, int, int, str | None]:
|
||||||
"""Delta sync using OneDrive change tracking.
|
"""Delta sync using OneDrive change tracking.
|
||||||
|
|
||||||
|
|
@ -573,6 +581,7 @@ async def _index_with_delta_sync(
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
enable_summary=enable_summary,
|
enable_summary=enable_summary,
|
||||||
on_heartbeat=on_heartbeat_callback,
|
on_heartbeat=on_heartbeat_callback,
|
||||||
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
|
|
||||||
if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
|
if batch_indexed > 0 and files_to_download and batch_estimated_pages > 0:
|
||||||
|
|
@ -643,6 +652,12 @@ async def index_onedrive_files(
|
||||||
return 0, 0, error_msg, 0
|
return 0, 0, error_msg, 0
|
||||||
|
|
||||||
connector_enable_summary = getattr(connector, "enable_summary", True)
|
connector_enable_summary = getattr(connector, "enable_summary", True)
|
||||||
|
connector_enable_vision_llm = getattr(connector, "enable_vision_llm", False)
|
||||||
|
vision_llm = None
|
||||||
|
if connector_enable_vision_llm:
|
||||||
|
from app.services.llm_service import get_vision_llm
|
||||||
|
vision_llm = await get_vision_llm(session, search_space_id)
|
||||||
|
|
||||||
onedrive_client = OneDriveClient(session, connector_id)
|
onedrive_client = OneDriveClient(session, connector_id)
|
||||||
|
|
||||||
indexing_options = items_dict.get("indexing_options", {})
|
indexing_options = items_dict.get("indexing_options", {})
|
||||||
|
|
@ -666,6 +681,7 @@ async def index_onedrive_files(
|
||||||
search_space_id=search_space_id,
|
search_space_id=search_space_id,
|
||||||
user_id=user_id,
|
user_id=user_id,
|
||||||
enable_summary=connector_enable_summary,
|
enable_summary=connector_enable_summary,
|
||||||
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
total_indexed += indexed
|
total_indexed += indexed
|
||||||
total_skipped += skipped
|
total_skipped += skipped
|
||||||
|
|
@ -695,6 +711,7 @@ async def index_onedrive_files(
|
||||||
log_entry,
|
log_entry,
|
||||||
max_files,
|
max_files,
|
||||||
enable_summary=connector_enable_summary,
|
enable_summary=connector_enable_summary,
|
||||||
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
total_indexed += indexed
|
total_indexed += indexed
|
||||||
total_skipped += skipped
|
total_skipped += skipped
|
||||||
|
|
@ -721,6 +738,7 @@ async def index_onedrive_files(
|
||||||
max_files,
|
max_files,
|
||||||
include_subfolders,
|
include_subfolders,
|
||||||
enable_summary=connector_enable_summary,
|
enable_summary=connector_enable_summary,
|
||||||
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
total_indexed += ri
|
total_indexed += ri
|
||||||
total_skipped += rs
|
total_skipped += rs
|
||||||
|
|
@ -740,6 +758,7 @@ async def index_onedrive_files(
|
||||||
max_files,
|
max_files,
|
||||||
include_subfolders,
|
include_subfolders,
|
||||||
enable_summary=connector_enable_summary,
|
enable_summary=connector_enable_summary,
|
||||||
|
vision_llm=vision_llm,
|
||||||
)
|
)
|
||||||
total_indexed += indexed
|
total_indexed += indexed
|
||||||
total_skipped += skipped
|
total_skipped += skipped
|
||||||
|
|
|
||||||
|
|
@ -46,6 +46,7 @@ class _ProcessingContext:
|
||||||
log_entry: Log
|
log_entry: Log
|
||||||
connector: dict | None = None
|
connector: dict | None = None
|
||||||
notification: Notification | None = None
|
notification: Notification | None = None
|
||||||
|
use_vision_llm: bool = False
|
||||||
enable_summary: bool = field(init=False)
|
enable_summary: bool = field(init=False)
|
||||||
|
|
||||||
def __post_init__(self) -> None:
|
def __post_init__(self) -> None:
|
||||||
|
|
@ -118,9 +119,13 @@ async def _log_page_divergence(
|
||||||
|
|
||||||
|
|
||||||
async def _process_non_document_upload(ctx: _ProcessingContext) -> Document | None:
|
async def _process_non_document_upload(ctx: _ProcessingContext) -> Document | None:
|
||||||
"""Extract content from a non-document file (plaintext/direct_convert/audio) via the unified ETL pipeline."""
|
"""Extract content from a non-document file (plaintext/direct_convert/audio/image) via the unified ETL pipeline."""
|
||||||
from app.etl_pipeline.etl_document import EtlRequest
|
from app.etl_pipeline.etl_document import EtlRequest
|
||||||
from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
|
from app.etl_pipeline.etl_pipeline_service import EtlPipelineService
|
||||||
|
from app.etl_pipeline.file_classifier import (
|
||||||
|
FileCategory,
|
||||||
|
classify_file as etl_classify,
|
||||||
|
)
|
||||||
|
|
||||||
await _notify(ctx, "parsing", "Processing file")
|
await _notify(ctx, "parsing", "Processing file")
|
||||||
await ctx.task_logger.log_task_progress(
|
await ctx.task_logger.log_task_progress(
|
||||||
|
|
@ -129,7 +134,13 @@ async def _process_non_document_upload(ctx: _ProcessingContext) -> Document | No
|
||||||
{"processing_stage": "extracting"},
|
{"processing_stage": "extracting"},
|
||||||
)
|
)
|
||||||
|
|
||||||
etl_result = await EtlPipelineService().extract(
|
vision_llm = None
|
||||||
|
if ctx.use_vision_llm and etl_classify(ctx.filename) == FileCategory.IMAGE:
|
||||||
|
from app.services.llm_service import get_vision_llm
|
||||||
|
|
||||||
|
vision_llm = await get_vision_llm(ctx.session, ctx.search_space_id)
|
||||||
|
|
||||||
|
etl_result = await EtlPipelineService(vision_llm=vision_llm).extract(
|
||||||
EtlRequest(file_path=ctx.file_path, filename=ctx.filename)
|
EtlRequest(file_path=ctx.file_path, filename=ctx.filename)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -278,6 +289,7 @@ async def process_file_in_background(
|
||||||
log_entry: Log,
|
log_entry: Log,
|
||||||
connector: dict | None = None,
|
connector: dict | None = None,
|
||||||
notification: Notification | None = None,
|
notification: Notification | None = None,
|
||||||
|
use_vision_llm: bool = False,
|
||||||
) -> Document | None:
|
) -> Document | None:
|
||||||
ctx = _ProcessingContext(
|
ctx = _ProcessingContext(
|
||||||
session=session,
|
session=session,
|
||||||
|
|
@ -289,6 +301,7 @@ async def process_file_in_background(
|
||||||
log_entry=log_entry,
|
log_entry=log_entry,
|
||||||
connector=connector,
|
connector=connector,
|
||||||
notification=notification,
|
notification=notification,
|
||||||
|
use_vision_llm=use_vision_llm,
|
||||||
)
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
|
@ -333,11 +346,13 @@ async def process_file_in_background(
|
||||||
async def _extract_file_content(
|
async def _extract_file_content(
|
||||||
file_path: str,
|
file_path: str,
|
||||||
filename: str,
|
filename: str,
|
||||||
|
search_space_id: int,
|
||||||
session: AsyncSession,
|
session: AsyncSession,
|
||||||
user_id: str,
|
user_id: str,
|
||||||
task_logger: TaskLoggingService,
|
task_logger: TaskLoggingService,
|
||||||
log_entry: Log,
|
log_entry: Log,
|
||||||
notification: Notification | None,
|
notification: Notification | None,
|
||||||
|
use_vision_llm: bool = False,
|
||||||
) -> tuple[str, str]:
|
) -> tuple[str, str]:
|
||||||
"""
|
"""
|
||||||
Extract markdown content from a file regardless of type.
|
Extract markdown content from a file regardless of type.
|
||||||
|
|
@ -360,6 +375,7 @@ async def _extract_file_content(
|
||||||
FileCategory.PLAINTEXT: "Reading file",
|
FileCategory.PLAINTEXT: "Reading file",
|
||||||
FileCategory.DIRECT_CONVERT: "Converting file",
|
FileCategory.DIRECT_CONVERT: "Converting file",
|
||||||
FileCategory.AUDIO: "Transcribing audio",
|
FileCategory.AUDIO: "Transcribing audio",
|
||||||
|
FileCategory.IMAGE: "Analyzing image",
|
||||||
FileCategory.UNSUPPORTED: "Unsupported file type",
|
FileCategory.UNSUPPORTED: "Unsupported file type",
|
||||||
FileCategory.DOCUMENT: "Extracting content",
|
FileCategory.DOCUMENT: "Extracting content",
|
||||||
}
|
}
|
||||||
|
|
@ -383,7 +399,13 @@ async def _extract_file_content(
|
||||||
estimated_pages = _estimate_pages_safe(page_limit_service, file_path)
|
estimated_pages = _estimate_pages_safe(page_limit_service, file_path)
|
||||||
await page_limit_service.check_page_limit(user_id, estimated_pages)
|
await page_limit_service.check_page_limit(user_id, estimated_pages)
|
||||||
|
|
||||||
result = await EtlPipelineService().extract(
|
vision_llm = None
|
||||||
|
if use_vision_llm and category == FileCategory.IMAGE:
|
||||||
|
from app.services.llm_service import get_vision_llm
|
||||||
|
|
||||||
|
vision_llm = await get_vision_llm(session, search_space_id)
|
||||||
|
|
||||||
|
result = await EtlPipelineService(vision_llm=vision_llm).extract(
|
||||||
EtlRequest(
|
EtlRequest(
|
||||||
file_path=file_path,
|
file_path=file_path,
|
||||||
filename=filename,
|
filename=filename,
|
||||||
|
|
@ -417,6 +439,7 @@ async def process_file_in_background_with_document(
|
||||||
connector: dict | None = None,
|
connector: dict | None = None,
|
||||||
notification: Notification | None = None,
|
notification: Notification | None = None,
|
||||||
should_summarize: bool = False,
|
should_summarize: bool = False,
|
||||||
|
use_vision_llm: bool = False,
|
||||||
) -> Document | None:
|
) -> Document | None:
|
||||||
"""
|
"""
|
||||||
Process file and update existing pending document (2-phase pattern).
|
Process file and update existing pending document (2-phase pattern).
|
||||||
|
|
@ -439,11 +462,13 @@ async def process_file_in_background_with_document(
|
||||||
markdown_content, etl_service = await _extract_file_content(
|
markdown_content, etl_service = await _extract_file_content(
|
||||||
file_path,
|
file_path,
|
||||||
filename,
|
filename,
|
||||||
|
search_space_id,
|
||||||
session,
|
session,
|
||||||
user_id,
|
user_id,
|
||||||
task_logger,
|
task_logger,
|
||||||
log_entry,
|
log_entry,
|
||||||
notification,
|
notification,
|
||||||
|
use_vision_llm=use_vision_llm,
|
||||||
)
|
)
|
||||||
|
|
||||||
if not markdown_content:
|
if not markdown_content:
|
||||||
|
|
|
||||||
|
|
@ -7,10 +7,33 @@ Extensions already covered by PLAINTEXT_EXTENSIONS, AUDIO_EXTENSIONS, or
|
||||||
DIRECT_CONVERT_EXTENSIONS in file_classifier are NOT repeated here -- these
|
DIRECT_CONVERT_EXTENSIONS in file_classifier are NOT repeated here -- these
|
||||||
sets are exclusively for the "document" ETL path (Docling / LlamaParse /
|
sets are exclusively for the "document" ETL path (Docling / LlamaParse /
|
||||||
Unstructured).
|
Unstructured).
|
||||||
|
|
||||||
|
Image extensions intentionally remain in the per-parser sets for fallback
|
||||||
|
compatibility. IMAGE_EXTENSIONS is used only for routing classification.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from pathlib import PurePosixPath
|
from pathlib import PurePosixPath
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Image extensions (used by file_classifier for routing to vision LLM)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
IMAGE_EXTENSIONS: frozenset[str] = frozenset(
|
||||||
|
{
|
||||||
|
".png",
|
||||||
|
".jpg",
|
||||||
|
".jpeg",
|
||||||
|
".gif",
|
||||||
|
".bmp",
|
||||||
|
".tiff",
|
||||||
|
".tif",
|
||||||
|
".webp",
|
||||||
|
".svg",
|
||||||
|
".heic",
|
||||||
|
".heif",
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Per-parser document extension sets (from official documentation)
|
# Per-parser document extension sets (from official documentation)
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
|
||||||
|
|
@ -69,6 +69,7 @@ class InlineTaskDispatcher:
|
||||||
search_space_id: int,
|
search_space_id: int,
|
||||||
user_id: str,
|
user_id: str,
|
||||||
should_summarize: bool = False,
|
should_summarize: bool = False,
|
||||||
|
use_vision_llm: bool = False,
|
||||||
) -> None:
|
) -> None:
|
||||||
from app.tasks.celery_tasks.document_tasks import (
|
from app.tasks.celery_tasks.document_tasks import (
|
||||||
_process_file_with_document,
|
_process_file_with_document,
|
||||||
|
|
@ -82,6 +83,7 @@ class InlineTaskDispatcher:
|
||||||
search_space_id,
|
search_space_id,
|
||||||
user_id,
|
user_id,
|
||||||
should_summarize=should_summarize,
|
should_summarize=should_summarize,
|
||||||
|
use_vision_llm=use_vision_llm,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -168,7 +168,7 @@ async def test_concurrency_bounded_by_semaphore(
|
||||||
active = 0
|
active = 0
|
||||||
peak = 0
|
peak = 0
|
||||||
|
|
||||||
async def _slow_extract(client, file):
|
async def _slow_extract(client, file, **kwargs):
|
||||||
nonlocal active, peak
|
nonlocal active, peak
|
||||||
async with lock:
|
async with lock:
|
||||||
active += 1
|
active += 1
|
||||||
|
|
@ -209,7 +209,7 @@ async def test_heartbeat_fires_during_parallel_downloads(
|
||||||
|
|
||||||
monkeypatch.setattr(_mod, "HEARTBEAT_INTERVAL_SECONDS", 0)
|
monkeypatch.setattr(_mod, "HEARTBEAT_INTERVAL_SECONDS", 0)
|
||||||
|
|
||||||
async def _slow_extract(client, file):
|
async def _slow_extract(client, file, **kwargs):
|
||||||
await asyncio.sleep(0.05)
|
await asyncio.sleep(0.05)
|
||||||
return _mock_extract_ok(file["id"], file["name"])
|
return _mock_extract_ok(file["id"], file["name"])
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -162,7 +162,7 @@ async def test_concurrency_bounded_by_semaphore(
|
||||||
active = 0
|
active = 0
|
||||||
peak = 0
|
peak = 0
|
||||||
|
|
||||||
async def _slow_extract(client, file):
|
async def _slow_extract(client, file, **kwargs):
|
||||||
nonlocal active, peak
|
nonlocal active, peak
|
||||||
async with lock:
|
async with lock:
|
||||||
active += 1
|
active += 1
|
||||||
|
|
@ -204,7 +204,7 @@ async def test_heartbeat_fires_during_parallel_downloads(
|
||||||
|
|
||||||
monkeypatch.setattr(_mod, "HEARTBEAT_INTERVAL_SECONDS", 0)
|
monkeypatch.setattr(_mod, "HEARTBEAT_INTERVAL_SECONDS", 0)
|
||||||
|
|
||||||
async def _slow_extract(client, file):
|
async def _slow_extract(client, file, **kwargs):
|
||||||
await asyncio.sleep(0.05)
|
await asyncio.sleep(0.05)
|
||||||
return _mock_extract_ok(file["id"], file["name"])
|
return _mock_extract_ok(file["id"], file["name"])
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -162,7 +162,7 @@ async def test_concurrency_bounded_by_semaphore(
|
||||||
active = 0
|
active = 0
|
||||||
peak = 0
|
peak = 0
|
||||||
|
|
||||||
async def _slow_extract(client, file):
|
async def _slow_extract(client, file, **kwargs):
|
||||||
nonlocal active, peak
|
nonlocal active, peak
|
||||||
async with lock:
|
async with lock:
|
||||||
active += 1
|
active += 1
|
||||||
|
|
@ -203,7 +203,7 @@ async def test_heartbeat_fires_during_parallel_downloads(
|
||||||
|
|
||||||
monkeypatch.setattr(_mod, "HEARTBEAT_INTERVAL_SECONDS", 0)
|
monkeypatch.setattr(_mod, "HEARTBEAT_INTERVAL_SECONDS", 0)
|
||||||
|
|
||||||
async def _slow_extract(client, file):
|
async def _slow_extract(client, file, **kwargs):
|
||||||
await asyncio.sleep(0.05)
|
await asyncio.sleep(0.05)
|
||||||
return _mock_extract_ok(file["id"], file["name"])
|
return _mock_extract_ok(file["id"], file["name"])
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -431,7 +431,7 @@ async def test_llamacloud_heif_accepted_only_with_azure_di(tmp_path, mocker):
|
||||||
mocker.patch("app.config.config.AZURE_DI_ENDPOINT", None, create=True)
|
mocker.patch("app.config.config.AZURE_DI_ENDPOINT", None, create=True)
|
||||||
mocker.patch("app.config.config.AZURE_DI_KEY", None, create=True)
|
mocker.patch("app.config.config.AZURE_DI_KEY", None, create=True)
|
||||||
|
|
||||||
with pytest.raises(EtlUnsupportedFileError, match="not supported by LLAMACLOUD"):
|
with pytest.raises(EtlUnsupportedFileError, match="document parser does not support this format"):
|
||||||
await EtlPipelineService().extract(
|
await EtlPipelineService().extract(
|
||||||
EtlRequest(file_path=str(heif_file), filename="photo.heif")
|
EtlRequest(file_path=str(heif_file), filename="photo.heif")
|
||||||
)
|
)
|
||||||
|
|
@ -549,8 +549,11 @@ def test_unsupported_extensions_classified_correctly(filename):
|
||||||
("doc.docx", "document"),
|
("doc.docx", "document"),
|
||||||
("slides.pptx", "document"),
|
("slides.pptx", "document"),
|
||||||
("sheet.xlsx", "document"),
|
("sheet.xlsx", "document"),
|
||||||
("photo.png", "document"),
|
("photo.png", "image"),
|
||||||
("photo.jpg", "document"),
|
("photo.jpg", "image"),
|
||||||
|
("photo.webp", "image"),
|
||||||
|
("photo.gif", "image"),
|
||||||
|
("photo.heic", "image"),
|
||||||
("book.epub", "document"),
|
("book.epub", "document"),
|
||||||
("letter.odt", "document"),
|
("letter.odt", "document"),
|
||||||
("readme.md", "plaintext"),
|
("readme.md", "plaintext"),
|
||||||
|
|
@ -680,3 +683,57 @@ async def test_extract_eml_with_docling_raises_unsupported(tmp_path, mocker):
|
||||||
await EtlPipelineService().extract(
|
await EtlPipelineService().extract(
|
||||||
EtlRequest(file_path=str(eml_file), filename="mail.eml")
|
EtlRequest(file_path=str(eml_file), filename="mail.eml")
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Image extraction via vision LLM
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
async def test_extract_image_with_vision_llm(tmp_path):
|
||||||
|
"""An image file is analyzed by the vision LLM when provided."""
|
||||||
|
from unittest.mock import AsyncMock, MagicMock
|
||||||
|
|
||||||
|
img_file = tmp_path / "photo.png"
|
||||||
|
img_file.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 50)
|
||||||
|
|
||||||
|
fake_response = MagicMock()
|
||||||
|
fake_response.content = "# A photo of a sunset over the ocean"
|
||||||
|
fake_llm = AsyncMock()
|
||||||
|
fake_llm.ainvoke.return_value = fake_response
|
||||||
|
|
||||||
|
service = EtlPipelineService(vision_llm=fake_llm)
|
||||||
|
result = await service.extract(
|
||||||
|
EtlRequest(file_path=str(img_file), filename="photo.png")
|
||||||
|
)
|
||||||
|
|
||||||
|
assert result.markdown_content == "# A photo of a sunset over the ocean"
|
||||||
|
assert result.etl_service == "VISION_LLM"
|
||||||
|
assert result.content_type == "image"
|
||||||
|
fake_llm.ainvoke.assert_called_once()
|
||||||
|
|
||||||
|
|
||||||
|
async def test_extract_image_falls_back_to_document_without_vision_llm(
|
||||||
|
tmp_path, mocker
|
||||||
|
):
|
||||||
|
"""Without a vision LLM, image files fall back to the document parser."""
|
||||||
|
mocker.patch("app.config.config.ETL_SERVICE", "DOCLING")
|
||||||
|
|
||||||
|
fake_docling = mocker.AsyncMock()
|
||||||
|
fake_docling.process_document.return_value = {"content": "# OCR text from image"}
|
||||||
|
mocker.patch(
|
||||||
|
"app.services.docling_service.create_docling_service",
|
||||||
|
return_value=fake_docling,
|
||||||
|
)
|
||||||
|
|
||||||
|
img_file = tmp_path / "scan.png"
|
||||||
|
img_file.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 50)
|
||||||
|
|
||||||
|
service = EtlPipelineService()
|
||||||
|
result = await service.extract(
|
||||||
|
EtlRequest(file_path=str(img_file), filename="scan.png")
|
||||||
|
)
|
||||||
|
|
||||||
|
assert result.markdown_content == "# OCR text from image"
|
||||||
|
assert result.etl_service == "DOCLING"
|
||||||
|
assert result.content_type == "document"
|
||||||
|
|
|
||||||
|
|
@ -154,3 +154,40 @@ def test_get_extensions_for_none_returns_union():
|
||||||
)
|
)
|
||||||
|
|
||||||
assert get_document_extensions_for_service(None) == DOCUMENT_EXTENSIONS
|
assert get_document_extensions_for_service(None) == DOCUMENT_EXTENSIONS
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# IMAGE_EXTENSIONS
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.parametrize(
|
||||||
|
"ext",
|
||||||
|
[
|
||||||
|
".png",
|
||||||
|
".jpg",
|
||||||
|
".jpeg",
|
||||||
|
".gif",
|
||||||
|
".bmp",
|
||||||
|
".tiff",
|
||||||
|
".tif",
|
||||||
|
".webp",
|
||||||
|
".svg",
|
||||||
|
".heic",
|
||||||
|
".heif",
|
||||||
|
],
|
||||||
|
)
|
||||||
|
def test_image_extensions_contains_expected(ext):
|
||||||
|
from app.utils.file_extensions import IMAGE_EXTENSIONS
|
||||||
|
|
||||||
|
assert ext in IMAGE_EXTENSIONS
|
||||||
|
|
||||||
|
|
||||||
|
def test_image_extensions_are_subset_of_document_extensions():
|
||||||
|
"""Image extensions used for routing should also be in DOCUMENT_EXTENSIONS for fallback."""
|
||||||
|
from app.utils.file_extensions import DOCUMENT_EXTENSIONS, IMAGE_EXTENSIONS
|
||||||
|
|
||||||
|
missing = IMAGE_EXTENSIONS - DOCUMENT_EXTENSIONS
|
||||||
|
assert not missing, (
|
||||||
|
f"Image extensions missing from document sets (breaks fallback): {missing}"
|
||||||
|
)
|
||||||
|
|
|
||||||
10
surfsense_desktop/.env.example
Normal file
10
surfsense_desktop/.env.example
Normal file
|
|
@ -0,0 +1,10 @@
|
||||||
|
# Electron-specific build-time configuration.
|
||||||
|
# Set before running pnpm dist:mac / dist:win / dist:linux.
|
||||||
|
|
||||||
|
# The hosted web frontend URL. Used to intercept OAuth redirects and keep them
|
||||||
|
# inside the desktop app. Set to your production frontend domain.
|
||||||
|
HOSTED_FRONTEND_URL=https://surfsense.net
|
||||||
|
|
||||||
|
# PostHog analytics (leave empty to disable)
|
||||||
|
POSTHOG_KEY=
|
||||||
|
POSTHOG_HOST=https://assets.surfsense.com
|
||||||
1
surfsense_desktop/.gitignore
vendored
1
surfsense_desktop/.gitignore
vendored
|
|
@ -1,3 +1,4 @@
|
||||||
node_modules/
|
node_modules/
|
||||||
dist/
|
dist/
|
||||||
release/
|
release/
|
||||||
|
.env
|
||||||
|
|
@ -98,6 +98,7 @@ export const ConnectorIndicator = forwardRef<ConnectorIndicatorHandle, Connector
|
||||||
periodicEnabled,
|
periodicEnabled,
|
||||||
frequencyMinutes,
|
frequencyMinutes,
|
||||||
enableSummary,
|
enableSummary,
|
||||||
|
enableVisionLlm,
|
||||||
allConnectors,
|
allConnectors,
|
||||||
viewingAccountsType,
|
viewingAccountsType,
|
||||||
viewingMCPList,
|
viewingMCPList,
|
||||||
|
|
@ -109,6 +110,7 @@ export const ConnectorIndicator = forwardRef<ConnectorIndicatorHandle, Connector
|
||||||
setPeriodicEnabled,
|
setPeriodicEnabled,
|
||||||
setFrequencyMinutes,
|
setFrequencyMinutes,
|
||||||
setEnableSummary,
|
setEnableSummary,
|
||||||
|
setEnableVisionLlm,
|
||||||
handleOpenChange,
|
handleOpenChange,
|
||||||
handleTabChange,
|
handleTabChange,
|
||||||
handleScroll,
|
handleScroll,
|
||||||
|
|
@ -279,6 +281,7 @@ export const ConnectorIndicator = forwardRef<ConnectorIndicatorHandle, Connector
|
||||||
periodicEnabled={periodicEnabled}
|
periodicEnabled={periodicEnabled}
|
||||||
frequencyMinutes={frequencyMinutes}
|
frequencyMinutes={frequencyMinutes}
|
||||||
enableSummary={enableSummary}
|
enableSummary={enableSummary}
|
||||||
|
enableVisionLlm={enableVisionLlm}
|
||||||
isSaving={isSaving}
|
isSaving={isSaving}
|
||||||
isDisconnecting={isDisconnecting}
|
isDisconnecting={isDisconnecting}
|
||||||
isIndexing={indexingConnectorIds.has(editingConnector.id)}
|
isIndexing={indexingConnectorIds.has(editingConnector.id)}
|
||||||
|
|
@ -288,6 +291,7 @@ export const ConnectorIndicator = forwardRef<ConnectorIndicatorHandle, Connector
|
||||||
onPeriodicEnabledChange={setPeriodicEnabled}
|
onPeriodicEnabledChange={setPeriodicEnabled}
|
||||||
onFrequencyChange={setFrequencyMinutes}
|
onFrequencyChange={setFrequencyMinutes}
|
||||||
onEnableSummaryChange={setEnableSummary}
|
onEnableSummaryChange={setEnableSummary}
|
||||||
|
onEnableVisionLlmChange={setEnableVisionLlm}
|
||||||
onSave={() => {
|
onSave={() => {
|
||||||
startIndexing(editingConnector.id);
|
startIndexing(editingConnector.id);
|
||||||
handleSaveConnector(() => refreshConnectors());
|
handleSaveConnector(() => refreshConnectors());
|
||||||
|
|
@ -336,6 +340,7 @@ export const ConnectorIndicator = forwardRef<ConnectorIndicatorHandle, Connector
|
||||||
periodicEnabled={periodicEnabled}
|
periodicEnabled={periodicEnabled}
|
||||||
frequencyMinutes={frequencyMinutes}
|
frequencyMinutes={frequencyMinutes}
|
||||||
enableSummary={enableSummary}
|
enableSummary={enableSummary}
|
||||||
|
enableVisionLlm={enableVisionLlm}
|
||||||
isStartingIndexing={isStartingIndexing}
|
isStartingIndexing={isStartingIndexing}
|
||||||
isFromOAuth={isFromOAuth}
|
isFromOAuth={isFromOAuth}
|
||||||
onStartDateChange={setStartDate}
|
onStartDateChange={setStartDate}
|
||||||
|
|
@ -343,6 +348,7 @@ export const ConnectorIndicator = forwardRef<ConnectorIndicatorHandle, Connector
|
||||||
onPeriodicEnabledChange={setPeriodicEnabled}
|
onPeriodicEnabledChange={setPeriodicEnabled}
|
||||||
onFrequencyChange={setFrequencyMinutes}
|
onFrequencyChange={setFrequencyMinutes}
|
||||||
onEnableSummaryChange={setEnableSummary}
|
onEnableSummaryChange={setEnableSummary}
|
||||||
|
onEnableVisionLlmChange={setEnableVisionLlm}
|
||||||
onConfigChange={setIndexingConnectorConfig}
|
onConfigChange={setIndexingConnectorConfig}
|
||||||
onStartIndexing={() => {
|
onStartIndexing={() => {
|
||||||
if (indexingConfig.connectorId) {
|
if (indexingConfig.connectorId) {
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,25 @@
|
||||||
|
"use client";
|
||||||
|
|
||||||
|
import type { FC } from "react";
|
||||||
|
import { Switch } from "@/components/ui/switch";
|
||||||
|
|
||||||
|
interface VisionLLMConfigProps {
|
||||||
|
enabled: boolean;
|
||||||
|
onEnabledChange: (enabled: boolean) => void;
|
||||||
|
}
|
||||||
|
|
||||||
|
export const VisionLLMConfig: FC<VisionLLMConfigProps> = ({ enabled, onEnabledChange }) => {
|
||||||
|
return (
|
||||||
|
<div className="rounded-xl bg-slate-400/5 dark:bg-white/5 p-3 sm:p-6">
|
||||||
|
<div className="flex items-center justify-between">
|
||||||
|
<div className="space-y-1">
|
||||||
|
<h3 className="font-medium text-sm sm:text-base">Enable Vision LLM</h3>
|
||||||
|
<p className="text-xs sm:text-sm text-muted-foreground">
|
||||||
|
Describes images using AI vision (costly, slower)
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
<Switch checked={enabled} onCheckedChange={onEnabledChange} />
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
);
|
||||||
|
};
|
||||||
|
|
@ -15,6 +15,7 @@ import { cn } from "@/lib/utils";
|
||||||
import { DateRangeSelector } from "../../components/date-range-selector";
|
import { DateRangeSelector } from "../../components/date-range-selector";
|
||||||
import { PeriodicSyncConfig } from "../../components/periodic-sync-config";
|
import { PeriodicSyncConfig } from "../../components/periodic-sync-config";
|
||||||
import { SummaryConfig } from "../../components/summary-config";
|
import { SummaryConfig } from "../../components/summary-config";
|
||||||
|
import { VisionLLMConfig } from "../../components/vision-llm-config";
|
||||||
import { getConnectorDisplayName } from "../../tabs/all-connectors-tab";
|
import { getConnectorDisplayName } from "../../tabs/all-connectors-tab";
|
||||||
import { getConnectorConfigComponent } from "../index";
|
import { getConnectorConfigComponent } from "../index";
|
||||||
|
|
||||||
|
|
@ -38,6 +39,7 @@ interface ConnectorEditViewProps {
|
||||||
periodicEnabled: boolean;
|
periodicEnabled: boolean;
|
||||||
frequencyMinutes: string;
|
frequencyMinutes: string;
|
||||||
enableSummary: boolean;
|
enableSummary: boolean;
|
||||||
|
enableVisionLlm: boolean;
|
||||||
isSaving: boolean;
|
isSaving: boolean;
|
||||||
isDisconnecting: boolean;
|
isDisconnecting: boolean;
|
||||||
isIndexing?: boolean;
|
isIndexing?: boolean;
|
||||||
|
|
@ -47,6 +49,7 @@ interface ConnectorEditViewProps {
|
||||||
onPeriodicEnabledChange: (enabled: boolean) => void;
|
onPeriodicEnabledChange: (enabled: boolean) => void;
|
||||||
onFrequencyChange: (frequency: string) => void;
|
onFrequencyChange: (frequency: string) => void;
|
||||||
onEnableSummaryChange: (enabled: boolean) => void;
|
onEnableSummaryChange: (enabled: boolean) => void;
|
||||||
|
onEnableVisionLlmChange: (enabled: boolean) => void;
|
||||||
onSave: () => void;
|
onSave: () => void;
|
||||||
onDisconnect: () => void;
|
onDisconnect: () => void;
|
||||||
onBack: () => void;
|
onBack: () => void;
|
||||||
|
|
@ -62,6 +65,7 @@ export const ConnectorEditView: FC<ConnectorEditViewProps> = ({
|
||||||
periodicEnabled,
|
periodicEnabled,
|
||||||
frequencyMinutes,
|
frequencyMinutes,
|
||||||
enableSummary,
|
enableSummary,
|
||||||
|
enableVisionLlm,
|
||||||
isSaving,
|
isSaving,
|
||||||
isDisconnecting,
|
isDisconnecting,
|
||||||
isIndexing = false,
|
isIndexing = false,
|
||||||
|
|
@ -71,6 +75,7 @@ export const ConnectorEditView: FC<ConnectorEditViewProps> = ({
|
||||||
onPeriodicEnabledChange,
|
onPeriodicEnabledChange,
|
||||||
onFrequencyChange,
|
onFrequencyChange,
|
||||||
onEnableSummaryChange,
|
onEnableSummaryChange,
|
||||||
|
onEnableVisionLlmChange,
|
||||||
onSave,
|
onSave,
|
||||||
onDisconnect,
|
onDisconnect,
|
||||||
onBack,
|
onBack,
|
||||||
|
|
@ -272,6 +277,14 @@ export const ConnectorEditView: FC<ConnectorEditViewProps> = ({
|
||||||
{/* AI Summary toggle */}
|
{/* AI Summary toggle */}
|
||||||
<SummaryConfig enabled={enableSummary} onEnabledChange={onEnableSummaryChange} />
|
<SummaryConfig enabled={enableSummary} onEnabledChange={onEnableSummaryChange} />
|
||||||
|
|
||||||
|
{/* Vision LLM toggle - only for file-based connectors */}
|
||||||
|
{(connector.connector_type === "GOOGLE_DRIVE_CONNECTOR" ||
|
||||||
|
connector.connector_type === "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" ||
|
||||||
|
connector.connector_type === "DROPBOX_CONNECTOR" ||
|
||||||
|
connector.connector_type === "ONEDRIVE_CONNECTOR") && (
|
||||||
|
<VisionLLMConfig enabled={enableVisionLlm} onEnabledChange={onEnableVisionLlmChange} />
|
||||||
|
)}
|
||||||
|
|
||||||
{/* Date range selector - not shown for file-based connectors (Drive, Dropbox, OneDrive), Webcrawler, GitHub, or Local Folder */}
|
{/* Date range selector - not shown for file-based connectors (Drive, Dropbox, OneDrive), Webcrawler, GitHub, or Local Folder */}
|
||||||
{connector.connector_type !== "GOOGLE_DRIVE_CONNECTOR" &&
|
{connector.connector_type !== "GOOGLE_DRIVE_CONNECTOR" &&
|
||||||
connector.connector_type !== "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" &&
|
connector.connector_type !== "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" &&
|
||||||
|
|
|
||||||
|
|
@ -10,6 +10,7 @@ import { cn } from "@/lib/utils";
|
||||||
import { DateRangeSelector } from "../../components/date-range-selector";
|
import { DateRangeSelector } from "../../components/date-range-selector";
|
||||||
import { PeriodicSyncConfig } from "../../components/periodic-sync-config";
|
import { PeriodicSyncConfig } from "../../components/periodic-sync-config";
|
||||||
import { SummaryConfig } from "../../components/summary-config";
|
import { SummaryConfig } from "../../components/summary-config";
|
||||||
|
import { VisionLLMConfig } from "../../components/vision-llm-config";
|
||||||
import type { IndexingConfigState } from "../../constants/connector-constants";
|
import type { IndexingConfigState } from "../../constants/connector-constants";
|
||||||
import { getConnectorDisplayName } from "../../tabs/all-connectors-tab";
|
import { getConnectorDisplayName } from "../../tabs/all-connectors-tab";
|
||||||
import { getConnectorConfigComponent } from "../index";
|
import { getConnectorConfigComponent } from "../index";
|
||||||
|
|
@ -22,6 +23,7 @@ interface IndexingConfigurationViewProps {
|
||||||
periodicEnabled: boolean;
|
periodicEnabled: boolean;
|
||||||
frequencyMinutes: string;
|
frequencyMinutes: string;
|
||||||
enableSummary: boolean;
|
enableSummary: boolean;
|
||||||
|
enableVisionLlm: boolean;
|
||||||
isStartingIndexing: boolean;
|
isStartingIndexing: boolean;
|
||||||
isFromOAuth?: boolean;
|
isFromOAuth?: boolean;
|
||||||
onStartDateChange: (date: Date | undefined) => void;
|
onStartDateChange: (date: Date | undefined) => void;
|
||||||
|
|
@ -29,6 +31,7 @@ interface IndexingConfigurationViewProps {
|
||||||
onPeriodicEnabledChange: (enabled: boolean) => void;
|
onPeriodicEnabledChange: (enabled: boolean) => void;
|
||||||
onFrequencyChange: (frequency: string) => void;
|
onFrequencyChange: (frequency: string) => void;
|
||||||
onEnableSummaryChange: (enabled: boolean) => void;
|
onEnableSummaryChange: (enabled: boolean) => void;
|
||||||
|
onEnableVisionLlmChange: (enabled: boolean) => void;
|
||||||
onConfigChange?: (config: Record<string, unknown>) => void;
|
onConfigChange?: (config: Record<string, unknown>) => void;
|
||||||
onStartIndexing: () => void;
|
onStartIndexing: () => void;
|
||||||
onSkip: () => void;
|
onSkip: () => void;
|
||||||
|
|
@ -42,6 +45,7 @@ export const IndexingConfigurationView: FC<IndexingConfigurationViewProps> = ({
|
||||||
periodicEnabled,
|
periodicEnabled,
|
||||||
frequencyMinutes,
|
frequencyMinutes,
|
||||||
enableSummary,
|
enableSummary,
|
||||||
|
enableVisionLlm,
|
||||||
isStartingIndexing,
|
isStartingIndexing,
|
||||||
isFromOAuth = false,
|
isFromOAuth = false,
|
||||||
onStartDateChange,
|
onStartDateChange,
|
||||||
|
|
@ -49,6 +53,7 @@ export const IndexingConfigurationView: FC<IndexingConfigurationViewProps> = ({
|
||||||
onPeriodicEnabledChange,
|
onPeriodicEnabledChange,
|
||||||
onFrequencyChange,
|
onFrequencyChange,
|
||||||
onEnableSummaryChange,
|
onEnableSummaryChange,
|
||||||
|
onEnableVisionLlmChange,
|
||||||
onConfigChange,
|
onConfigChange,
|
||||||
onStartIndexing,
|
onStartIndexing,
|
||||||
onSkip,
|
onSkip,
|
||||||
|
|
@ -158,6 +163,14 @@ export const IndexingConfigurationView: FC<IndexingConfigurationViewProps> = ({
|
||||||
{/* AI Summary toggle */}
|
{/* AI Summary toggle */}
|
||||||
<SummaryConfig enabled={enableSummary} onEnabledChange={onEnableSummaryChange} />
|
<SummaryConfig enabled={enableSummary} onEnabledChange={onEnableSummaryChange} />
|
||||||
|
|
||||||
|
{/* Vision LLM toggle - only for file-based connectors */}
|
||||||
|
{(config.connectorType === "GOOGLE_DRIVE_CONNECTOR" ||
|
||||||
|
config.connectorType === "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" ||
|
||||||
|
config.connectorType === "DROPBOX_CONNECTOR" ||
|
||||||
|
config.connectorType === "ONEDRIVE_CONNECTOR") && (
|
||||||
|
<VisionLLMConfig enabled={enableVisionLlm} onEnabledChange={onEnableVisionLlmChange} />
|
||||||
|
)}
|
||||||
|
|
||||||
{/* Date range selector - not shown for file-based connectors (Drive, Dropbox, OneDrive), Webcrawler, GitHub, or Local Folder */}
|
{/* Date range selector - not shown for file-based connectors (Drive, Dropbox, OneDrive), Webcrawler, GitHub, or Local Folder */}
|
||||||
{config.connectorType !== "GOOGLE_DRIVE_CONNECTOR" &&
|
{config.connectorType !== "GOOGLE_DRIVE_CONNECTOR" &&
|
||||||
config.connectorType !== "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" &&
|
config.connectorType !== "COMPOSIO_GOOGLE_DRIVE_CONNECTOR" &&
|
||||||
|
|
|
||||||
|
|
@ -80,6 +80,7 @@ export const useConnectorDialog = () => {
|
||||||
const [periodicEnabled, setPeriodicEnabled] = useState(false);
|
const [periodicEnabled, setPeriodicEnabled] = useState(false);
|
||||||
const [frequencyMinutes, setFrequencyMinutes] = useState("1440");
|
const [frequencyMinutes, setFrequencyMinutes] = useState("1440");
|
||||||
const [enableSummary, setEnableSummary] = useState(false);
|
const [enableSummary, setEnableSummary] = useState(false);
|
||||||
|
const [enableVisionLlm, setEnableVisionLlm] = useState(false);
|
||||||
|
|
||||||
// Edit mode state
|
// Edit mode state
|
||||||
const [editingConnector, setEditingConnector] = useState<SearchSourceConnector | null>(null);
|
const [editingConnector, setEditingConnector] = useState<SearchSourceConnector | null>(null);
|
||||||
|
|
@ -621,6 +622,7 @@ export const useConnectorDialog = () => {
|
||||||
setPeriodicEnabled(false);
|
setPeriodicEnabled(false);
|
||||||
setFrequencyMinutes("1440");
|
setFrequencyMinutes("1440");
|
||||||
setEnableSummary(connector.enable_summary ?? false);
|
setEnableSummary(connector.enable_summary ?? false);
|
||||||
|
setEnableVisionLlm(connector.enable_vision_llm ?? false);
|
||||||
setStartDate(undefined);
|
setStartDate(undefined);
|
||||||
setEndDate(undefined);
|
setEndDate(undefined);
|
||||||
|
|
||||||
|
|
@ -763,12 +765,13 @@ export const useConnectorDialog = () => {
|
||||||
const endDateStr = endDate ? format(endDate, "yyyy-MM-dd") : undefined;
|
const endDateStr = endDate ? format(endDate, "yyyy-MM-dd") : undefined;
|
||||||
|
|
||||||
// Update connector with summary, periodic sync settings, and config changes
|
// Update connector with summary, periodic sync settings, and config changes
|
||||||
if (enableSummary || periodicEnabled || indexingConnectorConfig) {
|
if (enableSummary || enableVisionLlm || periodicEnabled || indexingConnectorConfig) {
|
||||||
const frequency = periodicEnabled ? parseInt(frequencyMinutes, 10) : undefined;
|
const frequency = periodicEnabled ? parseInt(frequencyMinutes, 10) : undefined;
|
||||||
await updateConnector({
|
await updateConnector({
|
||||||
id: indexingConfig.connectorId,
|
id: indexingConfig.connectorId,
|
||||||
data: {
|
data: {
|
||||||
enable_summary: enableSummary,
|
enable_summary: enableSummary,
|
||||||
|
enable_vision_llm: enableVisionLlm,
|
||||||
...(periodicEnabled && {
|
...(periodicEnabled && {
|
||||||
periodic_indexing_enabled: true,
|
periodic_indexing_enabled: true,
|
||||||
indexing_frequency_minutes: frequency,
|
indexing_frequency_minutes: frequency,
|
||||||
|
|
@ -896,6 +899,7 @@ export const useConnectorDialog = () => {
|
||||||
periodicEnabled,
|
periodicEnabled,
|
||||||
frequencyMinutes,
|
frequencyMinutes,
|
||||||
enableSummary,
|
enableSummary,
|
||||||
|
enableVisionLlm,
|
||||||
indexingConnectorConfig,
|
indexingConnectorConfig,
|
||||||
setIsOpen,
|
setIsOpen,
|
||||||
]
|
]
|
||||||
|
|
@ -960,6 +964,7 @@ export const useConnectorDialog = () => {
|
||||||
setPeriodicEnabled(!connector.is_indexable ? false : connector.periodic_indexing_enabled);
|
setPeriodicEnabled(!connector.is_indexable ? false : connector.periodic_indexing_enabled);
|
||||||
setFrequencyMinutes(connector.indexing_frequency_minutes?.toString() || "1440");
|
setFrequencyMinutes(connector.indexing_frequency_minutes?.toString() || "1440");
|
||||||
setEnableSummary(connector.enable_summary ?? false);
|
setEnableSummary(connector.enable_summary ?? false);
|
||||||
|
setEnableVisionLlm(connector.enable_vision_llm ?? false);
|
||||||
setStartDate(undefined);
|
setStartDate(undefined);
|
||||||
setEndDate(undefined);
|
setEndDate(undefined);
|
||||||
},
|
},
|
||||||
|
|
@ -1038,6 +1043,7 @@ export const useConnectorDialog = () => {
|
||||||
data: {
|
data: {
|
||||||
name: connectorName || editingConnector.name,
|
name: connectorName || editingConnector.name,
|
||||||
enable_summary: enableSummary,
|
enable_summary: enableSummary,
|
||||||
|
enable_vision_llm: enableVisionLlm,
|
||||||
periodic_indexing_enabled: !editingConnector.is_indexable ? false : periodicEnabled,
|
periodic_indexing_enabled: !editingConnector.is_indexable ? false : periodicEnabled,
|
||||||
indexing_frequency_minutes: !editingConnector.is_indexable ? null : frequency,
|
indexing_frequency_minutes: !editingConnector.is_indexable ? null : frequency,
|
||||||
config: connectorConfig || editingConnector.config,
|
config: connectorConfig || editingConnector.config,
|
||||||
|
|
@ -1172,6 +1178,7 @@ export const useConnectorDialog = () => {
|
||||||
periodicEnabled,
|
periodicEnabled,
|
||||||
frequencyMinutes,
|
frequencyMinutes,
|
||||||
enableSummary,
|
enableSummary,
|
||||||
|
enableVisionLlm,
|
||||||
getFrequencyLabel,
|
getFrequencyLabel,
|
||||||
connectorConfig,
|
connectorConfig,
|
||||||
connectorName,
|
connectorName,
|
||||||
|
|
@ -1332,6 +1339,7 @@ export const useConnectorDialog = () => {
|
||||||
setPeriodicEnabled(false);
|
setPeriodicEnabled(false);
|
||||||
setFrequencyMinutes("1440");
|
setFrequencyMinutes("1440");
|
||||||
setEnableSummary(false);
|
setEnableSummary(false);
|
||||||
|
setEnableVisionLlm(false);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
|
|
@ -1368,6 +1376,7 @@ export const useConnectorDialog = () => {
|
||||||
periodicEnabled,
|
periodicEnabled,
|
||||||
frequencyMinutes,
|
frequencyMinutes,
|
||||||
enableSummary,
|
enableSummary,
|
||||||
|
enableVisionLlm,
|
||||||
searchSpaceId,
|
searchSpaceId,
|
||||||
allConnectors,
|
allConnectors,
|
||||||
viewingAccountsType,
|
viewingAccountsType,
|
||||||
|
|
@ -1382,6 +1391,7 @@ export const useConnectorDialog = () => {
|
||||||
setPeriodicEnabled,
|
setPeriodicEnabled,
|
||||||
setFrequencyMinutes,
|
setFrequencyMinutes,
|
||||||
setEnableSummary,
|
setEnableSummary,
|
||||||
|
setEnableVisionLlm,
|
||||||
setConnectorName,
|
setConnectorName,
|
||||||
|
|
||||||
// Handlers
|
// Handlers
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,6 @@
|
||||||
"use client";
|
"use client";
|
||||||
|
|
||||||
import { FolderPlus, ListFilter, Search, Upload, X } from "lucide-react";
|
import { Download, FolderPlus, ListFilter, Loader2, Search, Upload, X } from "lucide-react";
|
||||||
import { useTranslations } from "next-intl";
|
import { useTranslations } from "next-intl";
|
||||||
import React, { useCallback, useMemo, useRef, useState } from "react";
|
import React, { useCallback, useMemo, useRef, useState } from "react";
|
||||||
import { useDocumentUploadDialog } from "@/components/assistant-ui/document-upload-popup";
|
import { useDocumentUploadDialog } from "@/components/assistant-ui/document-upload-popup";
|
||||||
|
|
@ -20,6 +20,8 @@ export function DocumentsFilters({
|
||||||
onToggleType,
|
onToggleType,
|
||||||
activeTypes,
|
activeTypes,
|
||||||
onCreateFolder,
|
onCreateFolder,
|
||||||
|
onExportKB,
|
||||||
|
isExporting,
|
||||||
}: {
|
}: {
|
||||||
typeCounts: Partial<Record<DocumentTypeEnum, number>>;
|
typeCounts: Partial<Record<DocumentTypeEnum, number>>;
|
||||||
onSearch: (v: string) => void;
|
onSearch: (v: string) => void;
|
||||||
|
|
@ -27,6 +29,8 @@ export function DocumentsFilters({
|
||||||
onToggleType: (type: DocumentTypeEnum, checked: boolean) => void;
|
onToggleType: (type: DocumentTypeEnum, checked: boolean) => void;
|
||||||
activeTypes: DocumentTypeEnum[];
|
activeTypes: DocumentTypeEnum[];
|
||||||
onCreateFolder?: () => void;
|
onCreateFolder?: () => void;
|
||||||
|
onExportKB?: () => void;
|
||||||
|
isExporting?: boolean;
|
||||||
}) {
|
}) {
|
||||||
const t = useTranslations("documents");
|
const t = useTranslations("documents");
|
||||||
const id = React.useId();
|
const id = React.useId();
|
||||||
|
|
@ -84,6 +88,31 @@ export function DocumentsFilters({
|
||||||
</Tooltip>
|
</Tooltip>
|
||||||
)}
|
)}
|
||||||
|
|
||||||
|
{onExportKB && (
|
||||||
|
<Tooltip>
|
||||||
|
<TooltipTrigger asChild>
|
||||||
|
<ToggleGroupItem
|
||||||
|
value="export"
|
||||||
|
disabled={isExporting}
|
||||||
|
className="h-9 w-9 shrink-0 border-sidebar-border text-sidebar-foreground/60 hover:text-sidebar-foreground hover:border-sidebar-border bg-sidebar"
|
||||||
|
onClick={(e) => {
|
||||||
|
e.preventDefault();
|
||||||
|
onExportKB();
|
||||||
|
}}
|
||||||
|
>
|
||||||
|
{isExporting ? (
|
||||||
|
<Loader2 size={14} className="animate-spin" />
|
||||||
|
) : (
|
||||||
|
<Download size={14} />
|
||||||
|
)}
|
||||||
|
</ToggleGroupItem>
|
||||||
|
</TooltipTrigger>
|
||||||
|
<TooltipContent>
|
||||||
|
{isExporting ? "Exporting…" : "Export knowledge base"}
|
||||||
|
</TooltipContent>
|
||||||
|
</Tooltip>
|
||||||
|
)}
|
||||||
|
|
||||||
<Popover>
|
<Popover>
|
||||||
<Tooltip>
|
<Tooltip>
|
||||||
<TooltipTrigger asChild>
|
<TooltipTrigger asChild>
|
||||||
|
|
|
||||||
|
|
@ -4,6 +4,7 @@ import {
|
||||||
AlertCircle,
|
AlertCircle,
|
||||||
ChevronDown,
|
ChevronDown,
|
||||||
ChevronRight,
|
ChevronRight,
|
||||||
|
Download,
|
||||||
Eye,
|
Eye,
|
||||||
EyeOff,
|
EyeOff,
|
||||||
Folder,
|
Folder,
|
||||||
|
|
@ -80,6 +81,7 @@ interface FolderNodeProps {
|
||||||
isWatched?: boolean;
|
isWatched?: boolean;
|
||||||
onRescan?: (folder: FolderDisplay) => void | Promise<void>;
|
onRescan?: (folder: FolderDisplay) => void | Promise<void>;
|
||||||
onStopWatching?: (folder: FolderDisplay) => void;
|
onStopWatching?: (folder: FolderDisplay) => void;
|
||||||
|
onExportFolder?: (folder: FolderDisplay) => void;
|
||||||
}
|
}
|
||||||
|
|
||||||
function getDropZone(
|
function getDropZone(
|
||||||
|
|
@ -120,6 +122,7 @@ export const FolderNode = React.memo(function FolderNode({
|
||||||
isWatched,
|
isWatched,
|
||||||
onRescan,
|
onRescan,
|
||||||
onStopWatching,
|
onStopWatching,
|
||||||
|
onExportFolder,
|
||||||
}: FolderNodeProps) {
|
}: FolderNodeProps) {
|
||||||
const [renameValue, setRenameValue] = useState(folder.name);
|
const [renameValue, setRenameValue] = useState(folder.name);
|
||||||
const inputRef = useRef<HTMLInputElement>(null);
|
const inputRef = useRef<HTMLInputElement>(null);
|
||||||
|
|
@ -408,6 +411,17 @@ export const FolderNode = React.memo(function FolderNode({
|
||||||
<Move className="mr-2 h-4 w-4" />
|
<Move className="mr-2 h-4 w-4" />
|
||||||
Move to...
|
Move to...
|
||||||
</DropdownMenuItem>
|
</DropdownMenuItem>
|
||||||
|
{onExportFolder && (
|
||||||
|
<DropdownMenuItem
|
||||||
|
onClick={(e) => {
|
||||||
|
e.stopPropagation();
|
||||||
|
onExportFolder(folder);
|
||||||
|
}}
|
||||||
|
>
|
||||||
|
<Download className="mr-2 h-4 w-4" />
|
||||||
|
Export folder
|
||||||
|
</DropdownMenuItem>
|
||||||
|
)}
|
||||||
<DropdownMenuItem
|
<DropdownMenuItem
|
||||||
onClick={(e) => {
|
onClick={(e) => {
|
||||||
e.stopPropagation();
|
e.stopPropagation();
|
||||||
|
|
@ -449,6 +463,12 @@ export const FolderNode = React.memo(function FolderNode({
|
||||||
<Move className="mr-2 h-4 w-4" />
|
<Move className="mr-2 h-4 w-4" />
|
||||||
Move to...
|
Move to...
|
||||||
</ContextMenuItem>
|
</ContextMenuItem>
|
||||||
|
{onExportFolder && (
|
||||||
|
<ContextMenuItem onClick={() => onExportFolder(folder)}>
|
||||||
|
<Download className="mr-2 h-4 w-4" />
|
||||||
|
Export folder
|
||||||
|
</ContextMenuItem>
|
||||||
|
)}
|
||||||
<ContextMenuItem onClick={() => onDelete(folder)}>
|
<ContextMenuItem onClick={() => onDelete(folder)}>
|
||||||
<Trash2 className="mr-2 h-4 w-4" />
|
<Trash2 className="mr-2 h-4 w-4" />
|
||||||
Delete
|
Delete
|
||||||
|
|
|
||||||
|
|
@ -44,6 +44,7 @@ interface FolderTreeViewProps {
|
||||||
watchedFolderIds?: Set<number>;
|
watchedFolderIds?: Set<number>;
|
||||||
onRescanFolder?: (folder: FolderDisplay) => void;
|
onRescanFolder?: (folder: FolderDisplay) => void;
|
||||||
onStopWatchingFolder?: (folder: FolderDisplay) => void;
|
onStopWatchingFolder?: (folder: FolderDisplay) => void;
|
||||||
|
onExportFolder?: (folder: FolderDisplay) => void;
|
||||||
}
|
}
|
||||||
|
|
||||||
function groupBy<T>(items: T[], keyFn: (item: T) => string | number): Record<string | number, T[]> {
|
function groupBy<T>(items: T[], keyFn: (item: T) => string | number): Record<string | number, T[]> {
|
||||||
|
|
@ -81,6 +82,7 @@ export function FolderTreeView({
|
||||||
watchedFolderIds,
|
watchedFolderIds,
|
||||||
onRescanFolder,
|
onRescanFolder,
|
||||||
onStopWatchingFolder,
|
onStopWatchingFolder,
|
||||||
|
onExportFolder,
|
||||||
}: FolderTreeViewProps) {
|
}: FolderTreeViewProps) {
|
||||||
const foldersByParent = useMemo(() => groupBy(folders, (f) => f.parentId ?? "root"), [folders]);
|
const foldersByParent = useMemo(() => groupBy(folders, (f) => f.parentId ?? "root"), [folders]);
|
||||||
|
|
||||||
|
|
@ -259,6 +261,7 @@ export function FolderTreeView({
|
||||||
isWatched={watchedFolderIds?.has(f.id)}
|
isWatched={watchedFolderIds?.has(f.id)}
|
||||||
onRescan={onRescanFolder}
|
onRescan={onRescanFolder}
|
||||||
onStopWatching={onStopWatchingFolder}
|
onStopWatching={onStopWatchingFolder}
|
||||||
|
onExportFolder={onExportFolder}
|
||||||
/>
|
/>
|
||||||
);
|
);
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -406,6 +406,160 @@ export function DocumentsSidebar({
|
||||||
setFolderPickerOpen(true);
|
setFolderPickerOpen(true);
|
||||||
}, []);
|
}, []);
|
||||||
|
|
||||||
|
const [isExportingKB, setIsExportingKB] = useState(false);
|
||||||
|
const [exportWarningOpen, setExportWarningOpen] = useState(false);
|
||||||
|
const [exportWarningContext, setExportWarningContext] = useState<{
|
||||||
|
type: "kb" | "folder";
|
||||||
|
folder?: FolderDisplay;
|
||||||
|
pendingCount: number;
|
||||||
|
} | null>(null);
|
||||||
|
|
||||||
|
const pendingDocuments = useMemo(
|
||||||
|
() =>
|
||||||
|
treeDocuments.filter(
|
||||||
|
(d) => d.status?.state === "pending" || d.status?.state === "processing"
|
||||||
|
),
|
||||||
|
[treeDocuments]
|
||||||
|
);
|
||||||
|
|
||||||
|
const doExport = useCallback(async (url: string, downloadName: string) => {
|
||||||
|
const response = await authenticatedFetch(url, { method: "GET" });
|
||||||
|
if (!response.ok) {
|
||||||
|
const errorData = await response.json().catch(() => ({ detail: "Export failed" }));
|
||||||
|
throw new Error(errorData.detail || "Export failed");
|
||||||
|
}
|
||||||
|
|
||||||
|
const blob = await response.blob();
|
||||||
|
const blobUrl = URL.createObjectURL(blob);
|
||||||
|
const a = document.createElement("a");
|
||||||
|
a.href = blobUrl;
|
||||||
|
a.download = downloadName;
|
||||||
|
document.body.appendChild(a);
|
||||||
|
a.click();
|
||||||
|
document.body.removeChild(a);
|
||||||
|
URL.revokeObjectURL(blobUrl);
|
||||||
|
}, []);
|
||||||
|
|
||||||
|
const handleExportKB = useCallback(async () => {
|
||||||
|
if (isExportingKB) return;
|
||||||
|
|
||||||
|
if (pendingDocuments.length > 0) {
|
||||||
|
setExportWarningContext({ type: "kb", pendingCount: pendingDocuments.length });
|
||||||
|
setExportWarningOpen(true);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
setIsExportingKB(true);
|
||||||
|
try {
|
||||||
|
await doExport(
|
||||||
|
`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/export`,
|
||||||
|
"knowledge-base.zip"
|
||||||
|
);
|
||||||
|
toast.success("Knowledge base exported");
|
||||||
|
} catch (err) {
|
||||||
|
console.error("KB export failed:", err);
|
||||||
|
toast.error(err instanceof Error ? err.message : "Export failed");
|
||||||
|
} finally {
|
||||||
|
setIsExportingKB(false);
|
||||||
|
}
|
||||||
|
}, [searchSpaceId, isExportingKB, pendingDocuments.length, doExport]);
|
||||||
|
|
||||||
|
const handleExportWarningConfirm = useCallback(async () => {
|
||||||
|
setExportWarningOpen(false);
|
||||||
|
const ctx = exportWarningContext;
|
||||||
|
if (!ctx) return;
|
||||||
|
|
||||||
|
if (ctx.type === "kb") {
|
||||||
|
setIsExportingKB(true);
|
||||||
|
try {
|
||||||
|
await doExport(
|
||||||
|
`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/export`,
|
||||||
|
"knowledge-base.zip"
|
||||||
|
);
|
||||||
|
toast.success("Knowledge base exported");
|
||||||
|
} catch (err) {
|
||||||
|
console.error("KB export failed:", err);
|
||||||
|
toast.error(err instanceof Error ? err.message : "Export failed");
|
||||||
|
} finally {
|
||||||
|
setIsExportingKB(false);
|
||||||
|
}
|
||||||
|
} else if (ctx.type === "folder" && ctx.folder) {
|
||||||
|
setIsExportingKB(true);
|
||||||
|
try {
|
||||||
|
const safeName =
|
||||||
|
ctx.folder.name
|
||||||
|
.replace(/[^a-zA-Z0-9 _-]/g, "_")
|
||||||
|
.trim()
|
||||||
|
.slice(0, 80) || "folder";
|
||||||
|
await doExport(
|
||||||
|
`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/export?folder_id=${ctx.folder.id}`,
|
||||||
|
`${safeName}.zip`
|
||||||
|
);
|
||||||
|
toast.success(`Folder "${ctx.folder.name}" exported`);
|
||||||
|
} catch (err) {
|
||||||
|
console.error("Folder export failed:", err);
|
||||||
|
toast.error(err instanceof Error ? err.message : "Export failed");
|
||||||
|
} finally {
|
||||||
|
setIsExportingKB(false);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
setExportWarningContext(null);
|
||||||
|
}, [exportWarningContext, searchSpaceId, doExport]);
|
||||||
|
|
||||||
|
const getPendingCountInSubtree = useCallback(
|
||||||
|
(folderId: number): number => {
|
||||||
|
const subtreeIds = new Set<number>();
|
||||||
|
function collect(id: number) {
|
||||||
|
subtreeIds.add(id);
|
||||||
|
for (const child of foldersByParent[String(id)] ?? []) {
|
||||||
|
collect(child.id);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
collect(folderId);
|
||||||
|
return treeDocuments.filter(
|
||||||
|
(d) =>
|
||||||
|
subtreeIds.has(d.folderId ?? -1) &&
|
||||||
|
(d.status?.state === "pending" || d.status?.state === "processing")
|
||||||
|
).length;
|
||||||
|
},
|
||||||
|
[foldersByParent, treeDocuments]
|
||||||
|
);
|
||||||
|
|
||||||
|
const handleExportFolder = useCallback(
|
||||||
|
async (folder: FolderDisplay) => {
|
||||||
|
const folderPendingCount = getPendingCountInSubtree(folder.id);
|
||||||
|
if (folderPendingCount > 0) {
|
||||||
|
setExportWarningContext({
|
||||||
|
type: "folder",
|
||||||
|
folder,
|
||||||
|
pendingCount: folderPendingCount,
|
||||||
|
});
|
||||||
|
setExportWarningOpen(true);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
setIsExportingKB(true);
|
||||||
|
try {
|
||||||
|
const safeName =
|
||||||
|
folder.name
|
||||||
|
.replace(/[^a-zA-Z0-9 _-]/g, "_")
|
||||||
|
.trim()
|
||||||
|
.slice(0, 80) || "folder";
|
||||||
|
await doExport(
|
||||||
|
`${process.env.NEXT_PUBLIC_FASTAPI_BACKEND_URL}/api/v1/search-spaces/${searchSpaceId}/export?folder_id=${folder.id}`,
|
||||||
|
`${safeName}.zip`
|
||||||
|
);
|
||||||
|
toast.success(`Folder "${folder.name}" exported`);
|
||||||
|
} catch (err) {
|
||||||
|
console.error("Folder export failed:", err);
|
||||||
|
toast.error(err instanceof Error ? err.message : "Export failed");
|
||||||
|
} finally {
|
||||||
|
setIsExportingKB(false);
|
||||||
|
}
|
||||||
|
},
|
||||||
|
[searchSpaceId, getPendingCountInSubtree, doExport]
|
||||||
|
);
|
||||||
|
|
||||||
const handleExportDocument = useCallback(
|
const handleExportDocument = useCallback(
|
||||||
async (doc: DocumentNodeDoc, format: string) => {
|
async (doc: DocumentNodeDoc, format: string) => {
|
||||||
const safeTitle =
|
const safeTitle =
|
||||||
|
|
@ -800,6 +954,8 @@ export function DocumentsSidebar({
|
||||||
onToggleType={onToggleType}
|
onToggleType={onToggleType}
|
||||||
activeTypes={activeTypes}
|
activeTypes={activeTypes}
|
||||||
onCreateFolder={() => handleCreateFolder(null)}
|
onCreateFolder={() => handleCreateFolder(null)}
|
||||||
|
onExportKB={handleExportKB}
|
||||||
|
isExporting={isExportingKB}
|
||||||
/>
|
/>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
|
@ -855,6 +1011,7 @@ export function DocumentsSidebar({
|
||||||
watchedFolderIds={watchedFolderIds}
|
watchedFolderIds={watchedFolderIds}
|
||||||
onRescanFolder={handleRescanFolder}
|
onRescanFolder={handleRescanFolder}
|
||||||
onStopWatchingFolder={handleStopWatching}
|
onStopWatchingFolder={handleStopWatching}
|
||||||
|
onExportFolder={handleExportFolder}
|
||||||
/>
|
/>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
@ -933,6 +1090,33 @@ export function DocumentsSidebar({
|
||||||
</AlertDialogFooter>
|
</AlertDialogFooter>
|
||||||
</AlertDialogContent>
|
</AlertDialogContent>
|
||||||
</AlertDialog>
|
</AlertDialog>
|
||||||
|
|
||||||
|
<AlertDialog
|
||||||
|
open={exportWarningOpen}
|
||||||
|
onOpenChange={(open) => {
|
||||||
|
if (!open) {
|
||||||
|
setExportWarningOpen(false);
|
||||||
|
setExportWarningContext(null);
|
||||||
|
}
|
||||||
|
}}
|
||||||
|
>
|
||||||
|
<AlertDialogContent>
|
||||||
|
<AlertDialogHeader>
|
||||||
|
<AlertDialogTitle>Some documents are still processing</AlertDialogTitle>
|
||||||
|
<AlertDialogDescription>
|
||||||
|
{exportWarningContext?.pendingCount} document
|
||||||
|
{exportWarningContext?.pendingCount !== 1 ? "s are" : " is"} currently being processed
|
||||||
|
and will be excluded from the export. Do you want to continue?
|
||||||
|
</AlertDialogDescription>
|
||||||
|
</AlertDialogHeader>
|
||||||
|
<AlertDialogFooter>
|
||||||
|
<AlertDialogCancel>Cancel</AlertDialogCancel>
|
||||||
|
<AlertDialogAction onClick={handleExportWarningConfirm}>
|
||||||
|
Export anyway
|
||||||
|
</AlertDialogAction>
|
||||||
|
</AlertDialogFooter>
|
||||||
|
</AlertDialogContent>
|
||||||
|
</AlertDialog>
|
||||||
</>
|
</>
|
||||||
);
|
);
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -26,6 +26,7 @@ import { Progress } from "@/components/ui/progress";
|
||||||
import { Spinner } from "@/components/ui/spinner";
|
import { Spinner } from "@/components/ui/spinner";
|
||||||
import { Switch } from "@/components/ui/switch";
|
import { Switch } from "@/components/ui/switch";
|
||||||
import { useElectronAPI } from "@/hooks/use-platform";
|
import { useElectronAPI } from "@/hooks/use-platform";
|
||||||
|
import { documentsApiService } from "@/lib/apis/documents-api.service";
|
||||||
import {
|
import {
|
||||||
trackDocumentUploadFailure,
|
trackDocumentUploadFailure,
|
||||||
trackDocumentUploadStarted,
|
trackDocumentUploadStarted,
|
||||||
|
|
@ -48,6 +49,77 @@ interface FileWithId {
|
||||||
file: File;
|
file: File;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
interface FolderEntry {
|
||||||
|
id: string;
|
||||||
|
file: File;
|
||||||
|
relativePath: string;
|
||||||
|
}
|
||||||
|
|
||||||
|
interface FolderUploadData {
|
||||||
|
folderName: string;
|
||||||
|
entries: FolderEntry[];
|
||||||
|
}
|
||||||
|
|
||||||
|
interface FolderTreeNode {
|
||||||
|
name: string;
|
||||||
|
isFolder: boolean;
|
||||||
|
size?: number;
|
||||||
|
children: FolderTreeNode[];
|
||||||
|
}
|
||||||
|
|
||||||
|
function buildFolderTree(entries: FolderEntry[]): FolderTreeNode[] {
|
||||||
|
const root: FolderTreeNode = { name: "", isFolder: true, children: [] };
|
||||||
|
|
||||||
|
for (const entry of entries) {
|
||||||
|
const parts = entry.relativePath.split("/");
|
||||||
|
let current = root;
|
||||||
|
|
||||||
|
for (let i = 0; i < parts.length - 1; i++) {
|
||||||
|
let child = current.children.find((c) => c.name === parts[i] && c.isFolder);
|
||||||
|
if (!child) {
|
||||||
|
child = { name: parts[i], isFolder: true, children: [] };
|
||||||
|
current.children.push(child);
|
||||||
|
}
|
||||||
|
current = child;
|
||||||
|
}
|
||||||
|
|
||||||
|
current.children.push({
|
||||||
|
name: parts[parts.length - 1],
|
||||||
|
isFolder: false,
|
||||||
|
size: entry.file.size,
|
||||||
|
children: [],
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
function sortNodes(node: FolderTreeNode) {
|
||||||
|
node.children.sort((a, b) => {
|
||||||
|
if (a.isFolder !== b.isFolder) return a.isFolder ? -1 : 1;
|
||||||
|
return a.name.localeCompare(b.name);
|
||||||
|
});
|
||||||
|
for (const child of node.children) sortNodes(child);
|
||||||
|
}
|
||||||
|
sortNodes(root);
|
||||||
|
|
||||||
|
return root.children;
|
||||||
|
}
|
||||||
|
|
||||||
|
function flattenTree(
|
||||||
|
nodes: FolderTreeNode[],
|
||||||
|
depth = 0
|
||||||
|
): { name: string; isFolder: boolean; depth: number; size?: number }[] {
|
||||||
|
const items: { name: string; isFolder: boolean; depth: number; size?: number }[] = [];
|
||||||
|
for (const node of nodes) {
|
||||||
|
items.push({ name: node.name, isFolder: node.isFolder, depth, size: node.size });
|
||||||
|
if (node.isFolder && node.children.length > 0) {
|
||||||
|
items.push(...flattenTree(node.children, depth + 1));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return items;
|
||||||
|
}
|
||||||
|
|
||||||
|
const FOLDER_BATCH_SIZE_BYTES = 20 * 1024 * 1024;
|
||||||
|
const FOLDER_BATCH_MAX_FILES = 10;
|
||||||
|
|
||||||
const MAX_FILE_SIZE_MB = 500;
|
const MAX_FILE_SIZE_MB = 500;
|
||||||
const MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024;
|
const MAX_FILE_SIZE_BYTES = MAX_FILE_SIZE_MB * 1024 * 1024;
|
||||||
|
|
||||||
|
|
@ -64,11 +136,14 @@ export function DocumentUploadTab({
|
||||||
const [uploadProgress, setUploadProgress] = useState(0);
|
const [uploadProgress, setUploadProgress] = useState(0);
|
||||||
const [accordionValue, setAccordionValue] = useState<string>("");
|
const [accordionValue, setAccordionValue] = useState<string>("");
|
||||||
const [shouldSummarize, setShouldSummarize] = useState(false);
|
const [shouldSummarize, setShouldSummarize] = useState(false);
|
||||||
|
const [useVisionLlm, setUseVisionLlm] = useState(false);
|
||||||
const [uploadDocumentMutation] = useAtom(uploadDocumentMutationAtom);
|
const [uploadDocumentMutation] = useAtom(uploadDocumentMutationAtom);
|
||||||
const { mutate: uploadDocuments, isPending: isUploading } = uploadDocumentMutation;
|
const { mutate: uploadDocuments, isPending: isUploading } = uploadDocumentMutation;
|
||||||
const fileInputRef = useRef<HTMLInputElement>(null);
|
const fileInputRef = useRef<HTMLInputElement>(null);
|
||||||
const folderInputRef = useRef<HTMLInputElement>(null);
|
const folderInputRef = useRef<HTMLInputElement>(null);
|
||||||
const progressIntervalRef = useRef<ReturnType<typeof setInterval> | null>(null);
|
const progressIntervalRef = useRef<ReturnType<typeof setInterval> | null>(null);
|
||||||
|
const [folderUpload, setFolderUpload] = useState<FolderUploadData | null>(null);
|
||||||
|
const [isFolderUploading, setIsFolderUploading] = useState(false);
|
||||||
|
|
||||||
useEffect(() => {
|
useEffect(() => {
|
||||||
return () => {
|
return () => {
|
||||||
|
|
@ -105,6 +180,7 @@ export function DocumentUploadTab({
|
||||||
const valid = incoming.filter((f) => f.size <= MAX_FILE_SIZE_BYTES);
|
const valid = incoming.filter((f) => f.size <= MAX_FILE_SIZE_BYTES);
|
||||||
if (valid.length === 0) return;
|
if (valid.length === 0) return;
|
||||||
|
|
||||||
|
setFolderUpload(null);
|
||||||
setFiles((prev) => {
|
setFiles((prev) => {
|
||||||
const newEntries = valid.map((f) => ({
|
const newEntries = valid.map((f) => ({
|
||||||
id: crypto.randomUUID?.() ?? `file-${Date.now()}-${Math.random().toString(36)}`,
|
id: crypto.randomUUID?.() ?? `file-${Date.now()}-${Math.random().toString(36)}`,
|
||||||
|
|
@ -159,6 +235,7 @@ export function DocumentUploadTab({
|
||||||
file: new File([fd.data], fd.name, { type: fd.mimeType }),
|
file: new File([fd.data], fd.name, { type: fd.mimeType }),
|
||||||
})
|
})
|
||||||
);
|
);
|
||||||
|
setFolderUpload(null);
|
||||||
setFiles((prev) => [...prev, ...newFiles]);
|
setFiles((prev) => [...prev, ...newFiles]);
|
||||||
}, [electronAPI, supportedExtensionsSet, t]);
|
}, [electronAPI, supportedExtensionsSet, t]);
|
||||||
|
|
||||||
|
|
@ -167,18 +244,35 @@ export function DocumentUploadTab({
|
||||||
const fileList = e.target.files;
|
const fileList = e.target.files;
|
||||||
if (!fileList || fileList.length === 0) return;
|
if (!fileList || fileList.length === 0) return;
|
||||||
|
|
||||||
const folderFiles = Array.from(fileList).filter((f) => {
|
const allFiles = Array.from(fileList);
|
||||||
const ext = f.name.includes(".") ? `.${f.name.split(".").pop()?.toLowerCase()}` : "";
|
const firstPath = allFiles[0]?.webkitRelativePath || "";
|
||||||
return ext !== "" && supportedExtensionsSet.has(ext);
|
const folderName = firstPath.split("/")[0];
|
||||||
});
|
|
||||||
|
|
||||||
if (folderFiles.length === 0) {
|
if (!folderName) {
|
||||||
|
addFiles(allFiles);
|
||||||
|
e.target.value = "";
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
const entries: FolderEntry[] = allFiles
|
||||||
|
.filter((f) => {
|
||||||
|
const ext = f.name.includes(".") ? `.${f.name.split(".").pop()?.toLowerCase()}` : "";
|
||||||
|
return ext !== "" && supportedExtensionsSet.has(ext);
|
||||||
|
})
|
||||||
|
.map((f) => ({
|
||||||
|
id: crypto.randomUUID?.() ?? `file-${Date.now()}-${Math.random().toString(36)}`,
|
||||||
|
file: f,
|
||||||
|
relativePath: f.webkitRelativePath.substring(folderName.length + 1),
|
||||||
|
}));
|
||||||
|
|
||||||
|
if (entries.length === 0) {
|
||||||
toast.error(t("no_supported_files_in_folder"));
|
toast.error(t("no_supported_files_in_folder"));
|
||||||
e.target.value = "";
|
e.target.value = "";
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
addFiles(folderFiles);
|
setFiles([]);
|
||||||
|
setFolderUpload({ folderName, entries });
|
||||||
e.target.value = "";
|
e.target.value = "";
|
||||||
},
|
},
|
||||||
[addFiles, supportedExtensionsSet, t]
|
[addFiles, supportedExtensionsSet, t]
|
||||||
|
|
@ -192,9 +286,18 @@ export function DocumentUploadTab({
|
||||||
return `${parseFloat((bytes / k ** i).toFixed(2))} ${sizes[i]}`;
|
return `${parseFloat((bytes / k ** i).toFixed(2))} ${sizes[i]}`;
|
||||||
};
|
};
|
||||||
|
|
||||||
const totalFileSize = files.reduce((total, entry) => total + entry.file.size, 0);
|
const totalFileSize = folderUpload
|
||||||
|
? folderUpload.entries.reduce((total, entry) => total + entry.file.size, 0)
|
||||||
|
: files.reduce((total, entry) => total + entry.file.size, 0);
|
||||||
|
|
||||||
const hasContent = files.length > 0;
|
const fileCount = folderUpload ? folderUpload.entries.length : files.length;
|
||||||
|
const hasContent = files.length > 0 || folderUpload !== null;
|
||||||
|
const isAnyUploading = isUploading || isFolderUploading;
|
||||||
|
|
||||||
|
const folderTreeItems = useMemo(() => {
|
||||||
|
if (!folderUpload) return [];
|
||||||
|
return flattenTree(buildFolderTree(folderUpload.entries));
|
||||||
|
}, [folderUpload]);
|
||||||
|
|
||||||
const handleAccordionChange = useCallback(
|
const handleAccordionChange = useCallback(
|
||||||
(value: string) => {
|
(value: string) => {
|
||||||
|
|
@ -204,7 +307,95 @@ export function DocumentUploadTab({
|
||||||
[onAccordionStateChange]
|
[onAccordionStateChange]
|
||||||
);
|
);
|
||||||
|
|
||||||
|
const handleFolderUpload = async () => {
|
||||||
|
if (!folderUpload) return;
|
||||||
|
|
||||||
|
setUploadProgress(0);
|
||||||
|
setIsFolderUploading(true);
|
||||||
|
const total = folderUpload.entries.length;
|
||||||
|
trackDocumentUploadStarted(Number(searchSpaceId), total, totalFileSize);
|
||||||
|
|
||||||
|
try {
|
||||||
|
const batches: FolderEntry[][] = [];
|
||||||
|
let currentBatch: FolderEntry[] = [];
|
||||||
|
let currentSize = 0;
|
||||||
|
|
||||||
|
for (const entry of folderUpload.entries) {
|
||||||
|
const size = entry.file.size;
|
||||||
|
|
||||||
|
if (size >= FOLDER_BATCH_SIZE_BYTES) {
|
||||||
|
if (currentBatch.length > 0) {
|
||||||
|
batches.push(currentBatch);
|
||||||
|
currentBatch = [];
|
||||||
|
currentSize = 0;
|
||||||
|
}
|
||||||
|
batches.push([entry]);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (
|
||||||
|
currentBatch.length >= FOLDER_BATCH_MAX_FILES ||
|
||||||
|
currentSize + size > FOLDER_BATCH_SIZE_BYTES
|
||||||
|
) {
|
||||||
|
batches.push(currentBatch);
|
||||||
|
currentBatch = [];
|
||||||
|
currentSize = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
currentBatch.push(entry);
|
||||||
|
currentSize += size;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (currentBatch.length > 0) {
|
||||||
|
batches.push(currentBatch);
|
||||||
|
}
|
||||||
|
|
||||||
|
let rootFolderId: number | null = null;
|
||||||
|
let uploaded = 0;
|
||||||
|
|
||||||
|
for (const batch of batches) {
|
||||||
|
const result = await documentsApiService.folderUploadFiles(
|
||||||
|
batch.map((e) => e.file),
|
||||||
|
{
|
||||||
|
folder_name: folderUpload.folderName,
|
||||||
|
search_space_id: Number(searchSpaceId),
|
||||||
|
relative_paths: batch.map((e) => e.relativePath),
|
||||||
|
root_folder_id: rootFolderId,
|
||||||
|
enable_summary: shouldSummarize,
|
||||||
|
use_vision_llm: useVisionLlm,
|
||||||
|
}
|
||||||
|
);
|
||||||
|
|
||||||
|
if (result.root_folder_id && !rootFolderId) {
|
||||||
|
rootFolderId = result.root_folder_id;
|
||||||
|
}
|
||||||
|
|
||||||
|
uploaded += batch.length;
|
||||||
|
setUploadProgress(Math.round((uploaded / total) * 100));
|
||||||
|
}
|
||||||
|
|
||||||
|
trackDocumentUploadSuccess(Number(searchSpaceId), total);
|
||||||
|
toast(t("upload_initiated"), { description: t("upload_initiated_desc") });
|
||||||
|
setFolderUpload(null);
|
||||||
|
onSuccess?.();
|
||||||
|
} catch (error) {
|
||||||
|
const message = error instanceof Error ? error.message : "Upload failed";
|
||||||
|
trackDocumentUploadFailure(Number(searchSpaceId), message);
|
||||||
|
toast(t("upload_error"), {
|
||||||
|
description: `${t("upload_error_desc")}: ${message}`,
|
||||||
|
});
|
||||||
|
} finally {
|
||||||
|
setIsFolderUploading(false);
|
||||||
|
setUploadProgress(0);
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
const handleUpload = async () => {
|
const handleUpload = async () => {
|
||||||
|
if (folderUpload) {
|
||||||
|
await handleFolderUpload();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
setUploadProgress(0);
|
setUploadProgress(0);
|
||||||
trackDocumentUploadStarted(Number(searchSpaceId), files.length, totalFileSize);
|
trackDocumentUploadStarted(Number(searchSpaceId), files.length, totalFileSize);
|
||||||
|
|
||||||
|
|
@ -218,6 +409,7 @@ export function DocumentUploadTab({
|
||||||
files: rawFiles,
|
files: rawFiles,
|
||||||
search_space_id: Number(searchSpaceId),
|
search_space_id: Number(searchSpaceId),
|
||||||
should_summarize: shouldSummarize,
|
should_summarize: shouldSummarize,
|
||||||
|
use_vision_llm: useVisionLlm,
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
onSuccess: () => {
|
onSuccess: () => {
|
||||||
|
|
@ -341,28 +533,35 @@ export function DocumentUploadTab({
|
||||||
</button>
|
</button>
|
||||||
)
|
)
|
||||||
) : (
|
) : (
|
||||||
<button
|
<div
|
||||||
type="button"
|
role="button"
|
||||||
className="flex flex-col items-center gap-4 py-12 px-4 cursor-pointer w-full bg-transparent border-none"
|
tabIndex={0}
|
||||||
onClick={() => {
|
className="flex flex-col items-center gap-4 py-12 px-4 cursor-pointer w-full bg-transparent outline-none select-none"
|
||||||
|
onClick={() => {
|
||||||
|
if (!isElectron) fileInputRef.current?.click();
|
||||||
|
}}
|
||||||
|
onKeyDown={(e) => {
|
||||||
|
if (e.key === "Enter" || e.key === " ") {
|
||||||
|
e.preventDefault();
|
||||||
if (!isElectron) fileInputRef.current?.click();
|
if (!isElectron) fileInputRef.current?.click();
|
||||||
}}
|
}
|
||||||
|
}}
|
||||||
|
>
|
||||||
|
<Upload className="h-10 w-10 text-muted-foreground" />
|
||||||
|
<div className="text-center space-y-1.5">
|
||||||
|
<p className="text-base font-medium">
|
||||||
|
{isElectron ? t("select_files_or_folder") : t("tap_select_files_or_folder")}
|
||||||
|
</p>
|
||||||
|
<p className="text-sm text-muted-foreground">{t("file_size_limit")}</p>
|
||||||
|
</div>
|
||||||
|
<fieldset
|
||||||
|
className="w-full mt-1 border-none p-0 m-0"
|
||||||
|
onClick={(e) => e.stopPropagation()}
|
||||||
|
onKeyDown={(e) => e.stopPropagation()}
|
||||||
>
|
>
|
||||||
<Upload className="h-10 w-10 text-muted-foreground" />
|
{renderBrowseButton({ fullWidth: true })}
|
||||||
<div className="text-center space-y-1.5">
|
</fieldset>
|
||||||
<p className="text-base font-medium">
|
</div>
|
||||||
{isElectron ? "Select files or folder" : "Tap to select files or folder"}
|
|
||||||
</p>
|
|
||||||
<p className="text-sm text-muted-foreground">{t("file_size_limit")}</p>
|
|
||||||
</div>
|
|
||||||
<fieldset
|
|
||||||
className="w-full mt-1 border-none p-0 m-0"
|
|
||||||
onClick={(e) => e.stopPropagation()}
|
|
||||||
onKeyDown={(e) => e.stopPropagation()}
|
|
||||||
>
|
|
||||||
{renderBrowseButton({ fullWidth: true })}
|
|
||||||
</fieldset>
|
|
||||||
</button>
|
|
||||||
)}
|
)}
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
|
@ -398,55 +597,92 @@ export function DocumentUploadTab({
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
{/* FILES SELECTED */}
|
{/* FILES SELECTED */}
|
||||||
{files.length > 0 && (
|
{hasContent && (
|
||||||
<div className="rounded-lg border border-border p-3 space-y-2">
|
<div className="rounded-lg border border-border p-3 space-y-2">
|
||||||
<div className="flex items-center justify-between">
|
<div className="flex items-center justify-between">
|
||||||
<p className="text-sm font-medium">
|
<p className="text-sm font-medium">
|
||||||
{t("selected_files", { count: files.length })}
|
{folderUpload ? (
|
||||||
<Dot className="inline h-4 w-4" />
|
<>
|
||||||
{formatFileSize(totalFileSize)}
|
<FolderOpen className="inline h-4 w-4 mr-1 -mt-0.5" />
|
||||||
|
{folderUpload.folderName}
|
||||||
|
<Dot className="inline h-4 w-4" />
|
||||||
|
{folderUpload.entries.length}{" "}
|
||||||
|
{folderUpload.entries.length === 1 ? "file" : "files"}
|
||||||
|
<Dot className="inline h-4 w-4" />
|
||||||
|
{formatFileSize(totalFileSize)}
|
||||||
|
</>
|
||||||
|
) : (
|
||||||
|
<>
|
||||||
|
{t("selected_files", { count: files.length })}
|
||||||
|
<Dot className="inline h-4 w-4" />
|
||||||
|
{formatFileSize(totalFileSize)}
|
||||||
|
</>
|
||||||
|
)}
|
||||||
</p>
|
</p>
|
||||||
<Button
|
<Button
|
||||||
variant="ghost"
|
variant="ghost"
|
||||||
size="sm"
|
size="sm"
|
||||||
className="h-7 text-xs text-muted-foreground hover:text-foreground"
|
className="h-7 text-xs text-muted-foreground hover:text-foreground"
|
||||||
onClick={() => setFiles([])}
|
onClick={() => {
|
||||||
disabled={isUploading}
|
setFiles([]);
|
||||||
|
setFolderUpload(null);
|
||||||
|
}}
|
||||||
|
disabled={isAnyUploading}
|
||||||
>
|
>
|
||||||
{t("clear_all")}
|
{t("clear_all")}
|
||||||
</Button>
|
</Button>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<div className="max-h-[160px] sm:max-h-[200px] overflow-y-auto -mx-1">
|
<div className="max-h-[160px] sm:max-h-[200px] overflow-y-auto -mx-1">
|
||||||
{files.map((entry) => (
|
{folderUpload
|
||||||
<div
|
? folderTreeItems.map((item, i) => (
|
||||||
key={entry.id}
|
<div
|
||||||
className="flex items-center gap-2 py-1.5 px-2 rounded-md hover:bg-slate-400/5 dark:hover:bg-white/5 group"
|
key={`${item.depth}-${i}-${item.name}`}
|
||||||
>
|
className="flex items-center gap-1.5 py-0.5 px-2"
|
||||||
<span className="text-[10px] font-medium uppercase leading-none bg-muted px-1.5 py-0.5 rounded text-muted-foreground shrink-0">
|
style={{ paddingLeft: `${item.depth * 16 + 8}px` }}
|
||||||
{entry.file.name.split(".").pop() || "?"}
|
>
|
||||||
</span>
|
{item.isFolder ? (
|
||||||
<span className="text-sm truncate flex-1 min-w-0">{entry.file.name}</span>
|
<FolderOpen className="h-3.5 w-3.5 text-blue-400 shrink-0" />
|
||||||
<span className="text-xs text-muted-foreground shrink-0">
|
) : (
|
||||||
{formatFileSize(entry.file.size)}
|
<FileIcon className="h-3.5 w-3.5 text-muted-foreground shrink-0" />
|
||||||
</span>
|
)}
|
||||||
<Button
|
<span className="text-sm truncate flex-1 min-w-0">{item.name}</span>
|
||||||
variant="ghost"
|
{!item.isFolder && item.size != null && (
|
||||||
size="icon"
|
<span className="text-xs text-muted-foreground shrink-0">
|
||||||
className="h-6 w-6 shrink-0"
|
{formatFileSize(item.size)}
|
||||||
onClick={() => setFiles((prev) => prev.filter((e) => e.id !== entry.id))}
|
</span>
|
||||||
disabled={isUploading}
|
)}
|
||||||
>
|
</div>
|
||||||
<X className="h-3 w-3" />
|
))
|
||||||
</Button>
|
: files.map((entry) => (
|
||||||
</div>
|
<div
|
||||||
))}
|
key={entry.id}
|
||||||
|
className="flex items-center gap-2 py-1.5 px-2 rounded-md hover:bg-slate-400/5 dark:hover:bg-white/5 group"
|
||||||
|
>
|
||||||
|
<span className="text-[10px] font-medium uppercase leading-none bg-muted px-1.5 py-0.5 rounded text-muted-foreground shrink-0">
|
||||||
|
{entry.file.name.split(".").pop() || "?"}
|
||||||
|
</span>
|
||||||
|
<span className="text-sm truncate flex-1 min-w-0">{entry.file.name}</span>
|
||||||
|
<span className="text-xs text-muted-foreground shrink-0">
|
||||||
|
{formatFileSize(entry.file.size)}
|
||||||
|
</span>
|
||||||
|
<Button
|
||||||
|
variant="ghost"
|
||||||
|
size="icon"
|
||||||
|
className="h-6 w-6 shrink-0"
|
||||||
|
onClick={() => setFiles((prev) => prev.filter((e) => e.id !== entry.id))}
|
||||||
|
disabled={isAnyUploading}
|
||||||
|
>
|
||||||
|
<X className="h-3 w-3" />
|
||||||
|
</Button>
|
||||||
|
</div>
|
||||||
|
))}
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
{isUploading && (
|
{isAnyUploading && (
|
||||||
<div className="space-y-1">
|
<div className="space-y-1">
|
||||||
<div className="flex items-center justify-between text-xs">
|
<div className="flex items-center justify-between text-xs">
|
||||||
<span>{t("uploading_files")}</span>
|
<span>{folderUpload ? t("uploading_folder") : t("uploading_files")}</span>
|
||||||
<span>{Math.round(uploadProgress)}%</span>
|
<span>{Math.round(uploadProgress)}%</span>
|
||||||
</div>
|
</div>
|
||||||
<Progress value={uploadProgress} className="h-1.5" />
|
<Progress value={uploadProgress} className="h-1.5" />
|
||||||
|
|
@ -463,19 +699,31 @@ export function DocumentUploadTab({
|
||||||
<Switch checked={shouldSummarize} onCheckedChange={setShouldSummarize} />
|
<Switch checked={shouldSummarize} onCheckedChange={setShouldSummarize} />
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
<div className={toggleRowClass}>
|
||||||
|
<div className="space-y-0.5">
|
||||||
|
<p className="font-medium text-sm">Enable Vision LLM</p>
|
||||||
|
<p className="text-xs text-muted-foreground">
|
||||||
|
Describes images using AI vision (costly, slower)
|
||||||
|
</p>
|
||||||
|
</div>
|
||||||
|
<Switch checked={useVisionLlm} onCheckedChange={setUseVisionLlm} />
|
||||||
|
</div>
|
||||||
|
|
||||||
<Button
|
<Button
|
||||||
className="w-full"
|
className="w-full"
|
||||||
onClick={handleUpload}
|
onClick={handleUpload}
|
||||||
disabled={isUploading || files.length === 0}
|
disabled={isAnyUploading || fileCount === 0}
|
||||||
>
|
>
|
||||||
{isUploading ? (
|
{isAnyUploading ? (
|
||||||
<span className="flex items-center gap-2">
|
<span className="flex items-center gap-2">
|
||||||
<Spinner size="sm" />
|
<Spinner size="sm" />
|
||||||
{t("uploading")}
|
{t("uploading")}
|
||||||
</span>
|
</span>
|
||||||
) : (
|
) : (
|
||||||
<span className="flex items-center gap-2">
|
<span className="flex items-center gap-2">
|
||||||
{t("upload_button", { count: files.length })}
|
{folderUpload
|
||||||
|
? t("upload_folder_button", { count: fileCount })
|
||||||
|
: t("upload_button", { count: fileCount })}
|
||||||
</span>
|
</span>
|
||||||
)}
|
)}
|
||||||
</Button>
|
</Button>
|
||||||
|
|
|
||||||
|
|
@ -44,6 +44,7 @@ export const searchSourceConnector = z.object({
|
||||||
last_indexed_at: z.string().nullable(),
|
last_indexed_at: z.string().nullable(),
|
||||||
config: z.record(z.string(), z.any()),
|
config: z.record(z.string(), z.any()),
|
||||||
enable_summary: z.boolean().default(false),
|
enable_summary: z.boolean().default(false),
|
||||||
|
enable_vision_llm: z.boolean().default(false),
|
||||||
periodic_indexing_enabled: z.boolean(),
|
periodic_indexing_enabled: z.boolean(),
|
||||||
indexing_frequency_minutes: z.number().nullable(),
|
indexing_frequency_minutes: z.number().nullable(),
|
||||||
next_scheduled_at: z.string().nullable(),
|
next_scheduled_at: z.string().nullable(),
|
||||||
|
|
@ -98,6 +99,7 @@ export const createConnectorRequest = z.object({
|
||||||
last_indexed_at: true,
|
last_indexed_at: true,
|
||||||
config: true,
|
config: true,
|
||||||
enable_summary: true,
|
enable_summary: true,
|
||||||
|
enable_vision_llm: true,
|
||||||
periodic_indexing_enabled: true,
|
periodic_indexing_enabled: true,
|
||||||
indexing_frequency_minutes: true,
|
indexing_frequency_minutes: true,
|
||||||
next_scheduled_at: true,
|
next_scheduled_at: true,
|
||||||
|
|
@ -123,6 +125,7 @@ export const updateConnectorRequest = z.object({
|
||||||
last_indexed_at: true,
|
last_indexed_at: true,
|
||||||
config: true,
|
config: true,
|
||||||
enable_summary: true,
|
enable_summary: true,
|
||||||
|
enable_vision_llm: true,
|
||||||
periodic_indexing_enabled: true,
|
periodic_indexing_enabled: true,
|
||||||
indexing_frequency_minutes: true,
|
indexing_frequency_minutes: true,
|
||||||
next_scheduled_at: true,
|
next_scheduled_at: true,
|
||||||
|
|
|
||||||
|
|
@ -148,6 +148,7 @@ export const uploadDocumentRequest = z.object({
|
||||||
files: z.array(z.instanceof(File)),
|
files: z.array(z.instanceof(File)),
|
||||||
search_space_id: z.number(),
|
search_space_id: z.number(),
|
||||||
should_summarize: z.boolean().default(false),
|
should_summarize: z.boolean().default(false),
|
||||||
|
use_vision_llm: z.boolean().default(false),
|
||||||
});
|
});
|
||||||
|
|
||||||
export const uploadDocumentResponse = z.object({
|
export const uploadDocumentResponse = z.object({
|
||||||
|
|
|
||||||
|
|
@ -127,7 +127,7 @@ class DocumentsApiService {
|
||||||
throw new ValidationError(`Invalid request: ${errorMessage}`);
|
throw new ValidationError(`Invalid request: ${errorMessage}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
const { files, search_space_id, should_summarize } = parsedRequest.data;
|
const { files, search_space_id, should_summarize, use_vision_llm } = parsedRequest.data;
|
||||||
const UPLOAD_BATCH_SIZE = 5;
|
const UPLOAD_BATCH_SIZE = 5;
|
||||||
|
|
||||||
const batches: File[][] = [];
|
const batches: File[][] = [];
|
||||||
|
|
@ -146,6 +146,7 @@ class DocumentsApiService {
|
||||||
for (const file of batch) formData.append("files", file);
|
for (const file of batch) formData.append("files", file);
|
||||||
formData.append("search_space_id", String(search_space_id));
|
formData.append("search_space_id", String(search_space_id));
|
||||||
formData.append("should_summarize", String(should_summarize));
|
formData.append("should_summarize", String(should_summarize));
|
||||||
|
formData.append("use_vision_llm", String(use_vision_llm));
|
||||||
|
|
||||||
const controller = new AbortController();
|
const controller = new AbortController();
|
||||||
const timeoutId = setTimeout(() => controller.abort(), 120_000);
|
const timeoutId = setTimeout(() => controller.abort(), 120_000);
|
||||||
|
|
@ -442,6 +443,7 @@ class DocumentsApiService {
|
||||||
relative_paths: string[];
|
relative_paths: string[];
|
||||||
root_folder_id?: number | null;
|
root_folder_id?: number | null;
|
||||||
enable_summary?: boolean;
|
enable_summary?: boolean;
|
||||||
|
use_vision_llm?: boolean;
|
||||||
},
|
},
|
||||||
signal?: AbortSignal
|
signal?: AbortSignal
|
||||||
): Promise<{ message: string; status: string; root_folder_id: number; file_count: number }> => {
|
): Promise<{ message: string; status: string; root_folder_id: number; file_count: number }> => {
|
||||||
|
|
@ -456,6 +458,7 @@ class DocumentsApiService {
|
||||||
formData.append("root_folder_id", String(metadata.root_folder_id));
|
formData.append("root_folder_id", String(metadata.root_folder_id));
|
||||||
}
|
}
|
||||||
formData.append("enable_summary", String(metadata.enable_summary ?? false));
|
formData.append("enable_summary", String(metadata.enable_summary ?? false));
|
||||||
|
formData.append("use_vision_llm", String(metadata.use_vision_llm ?? false));
|
||||||
|
|
||||||
const totalSize = files.reduce((acc, f) => acc + f.size, 0);
|
const totalSize = files.reduce((acc, f) => acc + f.size, 0);
|
||||||
const timeoutMs = Math.min(Math.max((totalSize / (1024 * 1024)) * 5000, 30_000), 600_000);
|
const timeoutMs = Math.min(Math.max((totalSize / (1024 * 1024)) * 5000, 30_000), 600_000);
|
||||||
|
|
|
||||||
|
|
@ -396,7 +396,11 @@
|
||||||
"supported_file_types": "Supported File Types",
|
"supported_file_types": "Supported File Types",
|
||||||
"file_too_large": "File Too Large",
|
"file_too_large": "File Too Large",
|
||||||
"file_too_large_desc": "\"{name}\" exceeds the {maxMB}MB per-file limit.",
|
"file_too_large_desc": "\"{name}\" exceeds the {maxMB}MB per-file limit.",
|
||||||
"no_supported_files_in_folder": "No supported file types found in the selected folder."
|
"no_supported_files_in_folder": "No supported file types found in the selected folder.",
|
||||||
|
"uploading_folder": "Uploading folder…",
|
||||||
|
"upload_folder_button": "Upload Folder ({count} {count, plural, one {file} other {files}})",
|
||||||
|
"select_files_or_folder": "Select files or folder",
|
||||||
|
"tap_select_files_or_folder": "Tap to select files or folder"
|
||||||
},
|
},
|
||||||
"add_webpage": {
|
"add_webpage": {
|
||||||
"title": "Add Webpages for Crawling",
|
"title": "Add Webpages for Crawling",
|
||||||
|
|
|
||||||
|
|
@ -396,7 +396,11 @@
|
||||||
"supported_file_types": "Tipos de archivo soportados",
|
"supported_file_types": "Tipos de archivo soportados",
|
||||||
"file_too_large": "Archivo demasiado grande",
|
"file_too_large": "Archivo demasiado grande",
|
||||||
"file_too_large_desc": "\"{name}\" excede el límite de {maxMB} MB por archivo.",
|
"file_too_large_desc": "\"{name}\" excede el límite de {maxMB} MB por archivo.",
|
||||||
"no_supported_files_in_folder": "No se encontraron tipos de archivo compatibles en la carpeta seleccionada."
|
"no_supported_files_in_folder": "No se encontraron tipos de archivo compatibles en la carpeta seleccionada.",
|
||||||
|
"uploading_folder": "Subiendo carpeta…",
|
||||||
|
"upload_folder_button": "Subir carpeta ({count} {count, plural, one {archivo} other {archivos}})",
|
||||||
|
"select_files_or_folder": "Seleccionar archivos o carpeta",
|
||||||
|
"tap_select_files_or_folder": "Toca para seleccionar archivos o carpeta"
|
||||||
},
|
},
|
||||||
"add_webpage": {
|
"add_webpage": {
|
||||||
"title": "Agregar páginas web para rastreo",
|
"title": "Agregar páginas web para rastreo",
|
||||||
|
|
|
||||||
|
|
@ -396,7 +396,11 @@
|
||||||
"supported_file_types": "समर्थित फ़ाइल प्रकार",
|
"supported_file_types": "समर्थित फ़ाइल प्रकार",
|
||||||
"file_too_large": "फ़ाइल बहुत बड़ी है",
|
"file_too_large": "फ़ाइल बहुत बड़ी है",
|
||||||
"file_too_large_desc": "\"{name}\" प्रति फ़ाइल {maxMB}MB की सीमा से अधिक है।",
|
"file_too_large_desc": "\"{name}\" प्रति फ़ाइल {maxMB}MB की सीमा से अधिक है।",
|
||||||
"no_supported_files_in_folder": "चयनित फ़ोल्डर में कोई समर्थित फ़ाइल प्रकार नहीं मिला।"
|
"no_supported_files_in_folder": "चयनित फ़ोल्डर में कोई समर्थित फ़ाइल प्रकार नहीं मिला।",
|
||||||
|
"uploading_folder": "फ़ोल्डर अपलोड हो रहा है…",
|
||||||
|
"upload_folder_button": "फ़ोल्डर अपलोड करें ({count} {count, plural, one {फ़ाइल} other {फ़ाइलें}})",
|
||||||
|
"select_files_or_folder": "फ़ाइलें या फ़ोल्डर चुनें",
|
||||||
|
"tap_select_files_or_folder": "फ़ाइलें या फ़ोल्डर चुनने के लिए टैप करें"
|
||||||
},
|
},
|
||||||
"add_webpage": {
|
"add_webpage": {
|
||||||
"title": "क्रॉलिंग के लिए वेबपेज जोड़ें",
|
"title": "क्रॉलिंग के लिए वेबपेज जोड़ें",
|
||||||
|
|
|
||||||
|
|
@ -396,7 +396,11 @@
|
||||||
"supported_file_types": "Tipos de arquivo suportados",
|
"supported_file_types": "Tipos de arquivo suportados",
|
||||||
"file_too_large": "Arquivo muito grande",
|
"file_too_large": "Arquivo muito grande",
|
||||||
"file_too_large_desc": "\"{name}\" excede o limite de {maxMB} MB por arquivo.",
|
"file_too_large_desc": "\"{name}\" excede o limite de {maxMB} MB por arquivo.",
|
||||||
"no_supported_files_in_folder": "Nenhum tipo de arquivo suportado encontrado na pasta selecionada."
|
"no_supported_files_in_folder": "Nenhum tipo de arquivo suportado encontrado na pasta selecionada.",
|
||||||
|
"uploading_folder": "Enviando pasta…",
|
||||||
|
"upload_folder_button": "Enviar pasta ({count} {count, plural, one {arquivo} other {arquivos}})",
|
||||||
|
"select_files_or_folder": "Selecionar arquivos ou pasta",
|
||||||
|
"tap_select_files_or_folder": "Toque para selecionar arquivos ou pasta"
|
||||||
},
|
},
|
||||||
"add_webpage": {
|
"add_webpage": {
|
||||||
"title": "Adicionar páginas web para rastreamento",
|
"title": "Adicionar páginas web para rastreamento",
|
||||||
|
|
|
||||||
|
|
@ -380,7 +380,11 @@
|
||||||
"supported_file_types": "支持的文件类型",
|
"supported_file_types": "支持的文件类型",
|
||||||
"file_too_large": "文件过大",
|
"file_too_large": "文件过大",
|
||||||
"file_too_large_desc": "\"{name}\" 超过了每个文件 {maxMB}MB 的限制。",
|
"file_too_large_desc": "\"{name}\" 超过了每个文件 {maxMB}MB 的限制。",
|
||||||
"no_supported_files_in_folder": "所选文件夹中没有找到支持的文件类型。"
|
"no_supported_files_in_folder": "所选文件夹中没有找到支持的文件类型。",
|
||||||
|
"uploading_folder": "正在上传文件夹…",
|
||||||
|
"upload_folder_button": "上传文件夹({count}个文件)",
|
||||||
|
"select_files_or_folder": "选择文件或文件夹",
|
||||||
|
"tap_select_files_or_folder": "点击选择文件或文件夹"
|
||||||
},
|
},
|
||||||
"add_webpage": {
|
"add_webpage": {
|
||||||
"title": "添加网页爬取",
|
"title": "添加网页爬取",
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue